# Imports assumed by the functions below (they may already appear earlier in
# the module).
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import IntegerType

import parse_line


def write_data(path):
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    # Read the raw weather records from S3 into 100 partitions and parse each
    # line into a dict of named fields.
    rdd = sc.textFile('s3a://{path}'.format(path=path), 100)\
            .map(lambda x: parse_line.parse_line(x))
    df = rdd.toDF()
    # Add an integer copy of the air temperature so it can be used as a
    # partition column.
    df_with_int_temp = df.withColumn(
        "int_temp",
        df["air_temperature_observation_air_temperature"].cast(IntegerType()))
    # Write an unpartitioned copy.
    df_with_int_temp.write.mode('Overwrite').parquet(
        's3a://paulhtremblay/parquet_test/no_partition')
    #df.printSchema()
    #sqlContext.registerDataFrameAsTable(df_with_int_temp, 'weather')
    #new_df = sqlContext.sql("SELECT max(int_temp) FROM weather")
    #new_df.show()
    #air_temperature_observation_air_temperature
    #fixed_weather_station_ncei_wban_identifier
    #print(dir(df_with_int_temp.write))
    # Write the same data partitioned by the integer temperature.
    df_with_int_temp.write.mode('Overwrite').partitionBy("int_temp").parquet(
        "s3a://paulhtremblay/parquet_test/partition")
    #df_with_int_temp.printSchema()

"""
def write_to_parquet(path):
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    rdd = sc.textFile('s3a://{path}'.format(path=path), 100)\
            .map(lambda x: parse_line.parse_line(x))
    df = rdd.toDF()
    df.write.mode('Overwrite').parquet(
        's3a://paulhtremblay/parquet_test/simple')
"""
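# For reference, the Parquet output written by write_data can be read straight
# back into a DataFrame; for the partitioned copy, Spark discovers the
# int_temp partition directories on its own. A minimal sketch (the helper name
# below is not part of the original code):
def read_parquet_back(sqlContext, partitioned=True):
    if partitioned:
        # Partition discovery adds int_temp back as a column.
        return sqlContext.read.parquet(
            "s3a://paulhtremblay/parquet_test/partition")
    return sqlContext.read.parquet(
        "s3a://paulhtremblay/parquet_test/no_partition")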
def to_df(path):
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    # Parse the raw records and keep only those with an air temperature of
    # exactly -7.8.
    rdd = sc.textFile('s3a://{path}'.format(path=path))\
            .map(lambda x: parse_line.parse_line(x))\
            .filter(lambda x: x.get('air_temperature_observation_air_temperature') == -7.8)
    df = rdd.toDF()
    df.registerTempTable('my_table')
    # Sample query against the temp table; note that df2 is never returned,
    # only the full filtered DataFrame.
    df2 = sqlContext.sql("select * from my_table limit 1")
    return df
def count_us_ws(path):
    sc = SparkContext('local')
    # Count records whose coordinates fall inside a rough bounding box for the
    # continental US (longitude -125 to -67, latitude 25 to 48), matching the
    # bounds used in to_df below.
    num_records = sc.textFile('s3a://{path}'.format(path=path), 100)\
        .map(lambda x: parse_line.parse_line(x))\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') < -67)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') > -125)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') < 48)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') > 25)\
        .count()
    #pp.pprint(rdd.take(1))
    #print(rdd.getNumPartitions())
    print(num_records)
def to_df(path):
    # Note: this redefines the to_df above; the later definition is the one
    # that takes effect when the module is loaded.
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    # Keep only records inside a rough bounding box for the continental US
    # (longitude -125 to -67, latitude 25 to 48).
    rdd = sc.textFile('s3a://{path}'.format(path=path))\
        .map(lambda x: parse_line.parse_line(x))\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') < -67)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') > -125)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') < 48)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') > 25)
    df = rdd.toDF()
    df.registerTempTable('my_table')
    # Return the distinct USAF station identifiers found inside the box.
    df2 = sqlContext.sql("""
        select distinct fixed_weather_station_usaf_master_station_catalog_identifier
        from my_table
        """)
    return df2
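# Usage sketch for the helpers above (the bucket/prefix is a placeholder, not
# the real data location). Note that each helper constructs its own local
# SparkContext, so only one of them can be run per Python process:
#
#   stations = to_df('my-bucket/noaa/raw/2017/*')
#   stations.show()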
def map_to_fields(path):
    # read_from_s3 is defined elsewhere in the project; it returns an RDD of
    # raw text lines, which are then parsed into field dicts.
    return read_from_s3(path)\
        .map(lambda x: parse_line.parse_line(x))
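# The test methods below reference a module-level `lines` sequence of raw
# weather records and a pretty-printer `pp`, neither of which appears in this
# section. A minimal sketch of that setup, assuming the sample records live in
# a local text file (the file name here is hypothetical):
#
#   import pprint
#   pp = pprint.PrettyPrinter(indent=2)
#   with open('sample_weather_records.txt') as f:
#       lines = [l for l in f.read().split('\n') if l.strip()]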
def test_parse_line_does_all_lines_does_not_raise_error(self):
    for line in lines:
        parse_line.parse_line(line)
def test_parse_line_does_not_raise_error(self):
    pp.pprint(parse_line.parse_line(lines[0]))