Example #1
from util import read_hdfs_csv, init_spark  # implied by __main__ below; the top of this excerpt is truncated

def total_score(x):
    # Reconstructed header (the call site below passes one school's
    # (num test takers, reading avg, math avg, writing avg) tuple):
    # weight each average by the number of test takers so the per-zip
    # sums can be re-averaged later, and append a 1 to count schools.
    n_students = x[0]
    return (n_students, x[1] * n_students, x[2] * n_students,
            x[3] * n_students, 1)

def avg(x):
    # x is the per-zip sum: (total test takers, reading sum, math sum,
    # writing sum, school count); divide the weighted sums back into
    # per-student averages.
    n_students = x[0]

    if n_students == 0:
        return (0, 0, 0)

    return (x[1] / n_students, x[2] / n_students, x[3] / n_students)
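
# Quick sanity check of the two helpers with made-up numbers: a school with
# 10 test takers averaging 400/500/450 contributes a weighted tuple, and
# avg() recovers the averages after the per-zip sum.
assert total_score((10, 400, 500, 450)) == (10, 4000, 5000, 4500, 1)
assert avg((10, 4000, 5000, 4500, 1)) == (400, 500, 450)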

if __name__ == '__main__':
        
    sc, sqlContext = init_spark(verbose_logging=True)

    sat = read_hdfs_csv(sqlContext, '/user/zz1409/sat_scores2012.csv')
    loc = read_hdfs_csv(sqlContext, '/user/zz1409/loc_clean.csv')

    # Key each SAT row by lower-cased school name, keeping the
    # (test takers, reading, math, writing) columns as the value.
    sat = sat.map(lambda x: (x[1].lower(), (x[2], x[3], x[4], x[5])))

    loc = loc.select('school_name', 'zip').map(lambda x: (x[0], x[1]))

    # Join SAT scores to locations on school name, re-key by zip, sum the
    # weighted score tuples element-wise per zip, then convert back to
    # per-student averages.
    df = sat.join(loc).map(lambda x: (x[1][1], total_score(x[1][0])))
    df = df.reduceByKey(lambda x, y: [a + b for a, b in zip(x, y)])
    df = df.map(lambda x: (x[0], avg(x[1])))
    df = df.map(lambda x: (x[0], x[1][0], x[1][1], x[1][2]))

    df = df.toDF(['zip', 'reading','math','writing'])
    df.toPandas().to_csv('mycsv.csv')
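    # mycsv.csv then holds one row per zip plus pandas' index column, e.g.
    # (made-up values):
    #   ,zip,reading,math,writing
    #   0,10001,410,425,400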

Example #2
import re
from pyspark.sql import Row
from pyspark.sql.functions import lit
from util import read_hdfs_csv, init_spark
import datetime
import numpy as np

# Submit with: spark-submit --packages com.databricks:spark-csv_2.11:1.5.0

if __name__ == '__main__':

    sc, sqlContext = init_spark(verbose_logging=True)

    rows = read_hdfs_csv(sqlContext, '/user/qg323/rows.csv')

    borough_names = [
        'MANHATTAN', 'QUEENS', 'STATEN ISLAND', 'BROOKLYN', 'BRONX'
    ]

    for borough in borough_names:
        # Count complaints per start date within this borough, then dump
        # the daily frequencies (keeping only days with more than 2
        # complaints) to a per-borough .npy file.
        df = rows.filter('BORO_NM = "' + borough + '"')
        df = df.select('CMPLNT_FR_DT').map(
            lambda x: (x['CMPLNT_FR_DT'], 1)).reduceByKey(lambda x, y: x + y)
        freq = np.array([x[1] for x in df.collect() if x[1] > 2])
        np.save(borough + '.npy', freq)

    # days = rows.select('CMPLNT_FR_DT', 'BORO_NM').map(
    #     lambda x: (x['CMPLNT_FR_DT'], x['BORO_NM'], 1))
    # np.save('daily_frequency.npy', np.array(days.collect()))
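    # A single-pass variant of the per-borough loop above (a sketch, not the
    # author's final code): count (borough, date) pairs in one job instead of
    # filtering once per borough. Lazy as written; add an action such as
    # .collect() to materialize it.
    pair_counts = (rows.select('BORO_NM', 'CMPLNT_FR_DT')
                   .map(lambda x: ((x['BORO_NM'], x['CMPLNT_FR_DT']), 1))
                   .reduceByKey(lambda a, b: a + b))
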
Example #3

def assign_type(df):
    # Reconstructed header (the top of this excerpt is truncated): the setup
    # defining cols, isdatetime, dtypes, dtypes_dict and the lit import is
    # not shown here.
    # TODO: I'm not sure if I should set KY_CD and PD_CD as INT or TEXT.
    # Technically it should be TEXT, but it can also be INT.
    # Tag each column with a companion <name>_dtype literal column:
    for i, c in enumerate(cols):
        s = 'DATETIME' if isdatetime[i] else dtypes_dict[dtypes[c]]
        df = df.withColumn(c + '_dtype', lit(s))

    return df

def check_date_consistency(r):
    # TODO: should return True for rows whose TO_ date/time precedes their
    # FR_ date/time (a sketch follows below); for now the stub flags nothing.
    return False
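
def check_date_consistency_sketch(r):
    # A minimal sketch of the comparison the stub above is meant to perform,
    # assuming CMPLNT_FR_DT/CMPLNT_TO_DT hold MM/DD/YYYY strings (an
    # assumption about this dataset, not the author's final check). Returns
    # True to flag rows whose TO_ date precedes their FR_ date.
    from datetime import datetime  # local import; this excerpt's header is truncated
    if not r['CMPLNT_FR_DT'] or not r['CMPLNT_TO_DT']:
        return False  # a missing date cannot be flagged as inconsistent
    fr = datetime.strptime(r['CMPLNT_FR_DT'], '%m/%d/%Y')
    to = datetime.strptime(r['CMPLNT_TO_DT'], '%m/%d/%Y')
    return to < fr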

if __name__ == '__main__':
    sc, sqlContext = init_spark(verbose_logging=True)

    rows = read_hdfs_csv(sqlContext, 'rows.csv')    # Change to your filename

    # (1) Assign data type for each row
    rows = assign_type(rows)

    # Inconsistency checks:
    # (a) Make sure the IDs are unique:
    if rows.select('CMPLNT_NUM').distinct().count() != rows.count():
        # In practice we should print out which ID is not unique, but here we
        # have a very friendly dataset and this block never gets run.
        print 'The IDs are not unique'

    # (b) Make sure the TO_ date/time is after FR_ date/time
    inconsistent_date = rows.filter(check_date_consistency)
    inconsistent_date_count = inconsistent_date.count()
    if inconsistent_date_count > 0:
        # The rest of this excerpt is cut off; presumably it reports the
        # offending rows. A minimal placeholder:
        print '%d rows have a TO_ date/time before their FR_ date/time' \
            % inconsistent_date_count
Example #4

import sys  # implied by the sys.argv call below; this excerpt's top is truncated

def parse_ints(s):
    # Reconstructed header (name and argument follow from the call in
    # __main__ below). Expand a field spec such as '1,3-5' into the list
    # of integers it covers.
    l = s.split(',')
    r = []
    for x in l:
        if x.find('-') != -1:
            a, b = x.split('-')
            r.extend(range(int(a), int(b) + 1))
        else:
            r.append(int(x))
    return r
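
# Quick self-check with a made-up field spec:
assert parse_ints('1,3-5,8') == [1, 3, 4, 5, 8]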


if __name__ == '__main__':
    sc, sqlContext = init_spark()
    sc.addPyFile('util.py')

    rows = read_hdfs_csv(sqlContext, dbname)    # dbname is set in the truncated header of this excerpt
    rows_infer = read_hdfs_csv(sqlContext, dbname, inferschema=True)
    id_ = rows.select('CMPLNT_NUM')

    fields = parse_ints(sys.argv[1])

    # CMPLNT_NUM
    if 1 in fields:
        df = handle_cmplnt_num(rows, rows_infer)
        summarize(df, 'CMPLNT_NUM')
        dump_info(df, 'CMPLNT_NUM')
        # Keep only the complaint IDs whose CMPLNT_NUM passed validation.
        id_ = id_.intersect(
            df.where(df.CMPLNT_NUM_valid == 'VALID').select('CMPLNT_NUM'))

    # CMPLNT_FR_DT - CMPLNT_TO_TM
    if 2 in fields: