from util import read_hdfs_csv, init_spark


def total_score(x):
    # x = (number of test takers, mean reading, mean math, mean writing).
    # Weight each mean by the number of test takers so reduceByKey can sum
    # totals across schools; the trailing 1 counts schools per zip code.
    n_students = x[0]
    return (n_students, x[1] * n_students, x[2] * n_students,
            x[3] * n_students, 1)


def avg(x):
    n_students = x[0]
    if n_students == 0:
        return (0, 0, 0)
    return (x[1] / n_students, x[2] / n_students, x[3] / n_students)


if __name__ == '__main__':
    sc, sqlContext = init_spark(verbose_logging=True)
    sat = read_hdfs_csv(sqlContext, '/user/zz1409/sat_scores2012.csv')
    loc = read_hdfs_csv(sqlContext, '/user/zz1409/loc_clean.csv')
    # Key both tables on the lower-cased school name, then join.
    sat = sat.map(lambda x: (x[1].lower(), (x[2], x[3], x[4], x[5])))
    loc = loc.select('school_name', 'zip').map(lambda x: (x[0], x[1]))
    df = sat.join(loc).map(lambda x: (x[1][1], total_score(x[1][0])))
    # Sum the per-school tuples element-wise within each zip code.
    df = df.reduceByKey(lambda x, y: [a + b for a, b in zip(x, y)])
    df = df.map(lambda x: (x[0], avg(x[1])))
    df = df.map(lambda x: (x[0], x[1][0], x[1][1], x[1][2]))
    df = df.toDF(['zip', 'reading', 'math', 'writing'])
    df.toPandas().to_csv('mycsv.csv')
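# Sanity check of the aggregation above on toy numbers (illustrative only,
# not part of the job): two schools in one zip with 10 and 30 test takers
# and reading means 400 and 500. total_score yields (10, 4000, ...) and
# (30, 15000, ...); reduceByKey sums them to (40, 19000, ...); avg then
# gives 19000 / 40 = 475 -- the enrollment-weighted mean, not the naive 450.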
import re
from pyspark.sql import Row
from pyspark.sql.functions import lit
from util import read_hdfs_csv, init_spark
import datetime
import numpy as np
# --packages com.databricks:spark-csv_2.11:1.5.0

if __name__ == '__main__':
    sc, sqlContext = init_spark(verbose_logging=True)
    rows = read_hdfs_csv(sqlContext, '/user/qg323/rows.csv')
    borough_names = ['MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'BROOKLYN', 'BRONX']
    for borough in borough_names:
        # Count complaints per day within the borough, then keep the daily
        # counts (dropping days with 2 or fewer complaints) as a numpy array.
        df = rows.filter('BORO_NM = "' + borough + '"')
        df = df.select('CMPLNT_FR_DT').map(
            lambda x: (x['CMPLNT_FR_DT'], 1)).reduceByKey(lambda x, y: x + y)
        freq = np.array([x[1] for x in df.collect() if x[1] > 2])
        np.save(borough + '.npy', freq)
    # days = rows.select('CMPLNT_FR_DT', 'BORO_NM').map(
    #     lambda x: (x['CMPLNT_FR_DT'], x['BORO_NM'], 1))
    # np.save('daily_frequency.npy', np.array(days))
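# Illustrative follow-up (a sketch, not part of the job above): reload one of
# the saved per-borough arrays for offline inspection. Assumes the loop above
# already ran and wrote MANHATTAN.npy.
import numpy as np

freq = np.load('MANHATTAN.npy')
print 'days recorded:', len(freq)
print 'mean daily complaints:', freq.mean()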
    # TODO: I'm not sure whether KY_CD and PD_CD should be INT or TEXT.
    # Technically they should be TEXT, but they can also be parsed as INT.
    for i, c in enumerate(cols):
        s = 'DATETIME' if isdatetime[i] else dtypes_dict[dtypes[c]]
        df = df.withColumn(c + '_dtype', lit(s))
    return df


def check_date_consistency(r):
    # TODO: flag rows whose TO_ date/time precedes the FR_ date/time
    # (see the sketch at the end of this fragment).
    return False


if __name__ == '__main__':
    sc, sqlContext = init_spark(verbose_logging=True)
    rows = read_hdfs_csv(sqlContext, 'rows.csv')  # Change to your filename
    # (1) Assign a data type to each column
    rows = assign_type(rows)
    # Inconsistency checks:
    # (a) Make sure the IDs are unique:
    if rows.select('CMPLNT_NUM').distinct().count() != rows.count():
        # In practice we should print out which IDs are not unique, but here
        # we have a very friendly dataset and this block never gets run.
        print 'The IDs are not unique'
    # (b) Make sure the TO_ date/time is after the FR_ date/time
    inconsistent_date = rows.filter(check_date_consistency)
    inconsistent_date_count = inconsistent_date.count()
    if inconsistent_date_count > 0:
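# A possible implementation of the check_date_consistency TODO above. This is
# a sketch under assumptions: the NYPD dataset stores dates as MM/DD/YYYY and
# times as HH:MM:SS, and rows missing either date are treated as consistent.
# The name check_date_consistency_sketch is hypothetical.
import datetime

def check_date_consistency_sketch(r):
    # True means the row is inconsistent: its TO_ timestamp precedes FR_.
    if not r['CMPLNT_FR_DT'] or not r['CMPLNT_TO_DT']:
        return False
    fmt = '%m/%d/%Y %H:%M:%S'
    # Missing times default to midnight.
    fr = datetime.datetime.strptime(
        r['CMPLNT_FR_DT'] + ' ' + (r['CMPLNT_FR_TM'] or '00:00:00'), fmt)
    to = datetime.datetime.strptime(
        r['CMPLNT_TO_DT'] + ' ' + (r['CMPLNT_TO_TM'] or '00:00:00'), fmt)
    return to < fr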
def parse_ints(s):
    # Parse a field-selection string such as '1,3-5' into the list of field
    # numbers it denotes; dashes are inclusive ranges (usage example below).
    l = s.split(',')
    r = []
    for x in l:
        if x.find('-') != -1:
            a, b = x.split('-')
            r.extend(range(int(a), int(b) + 1))
        else:
            r.append(int(x))
    return r


if __name__ == '__main__':
    sc, sqlContext = init_spark()
    sc.addPyFile('util.py')
    rows = read_hdfs_csv(sqlContext, dbname)
    rows_infer = read_hdfs_csv(sqlContext, dbname, inferschema=True)
    id_ = rows.select('CMPLNT_NUM')
    fields = parse_ints(sys.argv[1])

    # CMPLNT_NUM
    if 1 in fields:
        df = handle_cmplnt_num(rows, rows_infer)
        summarize(df, 'CMPLNT_NUM')
        dump_info(df, 'CMPLNT_NUM')
        id_ = id_.intersect(
            df.where(df.CMPLNT_NUM_valid == 'VALID').select('CMPLNT_NUM'))

    # CMPLNT_FR_DT - CMPLNT_TO_TM
    if 2 in fields:
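# Usage example for parse_ints (illustrative): with sys.argv[1] == '1,3-5',
#
#     parse_ints('1,3-5')  ->  [1, 3, 4, 5]
#
# so the CMPLNT_NUM block (field 1) runs while the CMPLNT_FR_DT-CMPLNT_TO_TM
# block (field 2) is skipped.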