import unittest

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import IntegerType

import parse_line


def write_data(path):
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    rdd = sc.textFile('s3a://{path}'.format(path=path), 100)\
        .map(lambda x: parse_line.parse_line(x))

    df = rdd.toDF()
    df_with_int_temp = df.withColumn(
        "int_temp",
        df["air_temperature_observation_air_temperature"].cast(IntegerType()))
    df_with_int_temp.write.mode('overwrite').parquet(
        's3a://paulhtremblay/parquet_test/no_partition')
    #df.printSchema()
    #sqlContext.registerDataFrameAsTable(df_with_int_temp, 'weather')
    #new_df = sqlContext.sql("SELECT max(int_temp) FROM weather")
    #new_df.show()

    #air_temperature_observation_air_temperature
    #fixed_weather_station_ncei_wban_identifier
    #print(dir(df_with_int_temp.write))
    df_with_int_temp.write.mode('overwrite').partitionBy("int_temp").parquet(
        "s3a://paulhtremblay/parquet_test/partition")
    #df_with_int_temp.printSchema()
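
# Not part of the original example: a minimal sketch of reading the
# partitioned output back and filtering on the partition column, so Spark
# only scans the matching int_temp directories. The function name is
# hypothetical; the path reuses the key written above. Assumes a fresh
# process (a second SparkContext('local') cannot be created in the same one).
def read_partitioned_sample(path='s3a://paulhtremblay/parquet_test/partition'):
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    df = sqlContext.read.parquet(path)
    # Partition pruning: only the int_temp=<value> directories that satisfy
    # the predicate are read.
    df.filter(df['int_temp'] > 30).show()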
    """
Beispiel #2
0
def write_to_parquet(path):
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    rdd = sc.textFile('s3a://{path}'.format(path = path), 100)\
       .map(lambda x: parse_line.parse_line(x))
    df = rdd.toDF()
    df.write.mode('overwrite').parquet(
        's3a://paulhtremblay/parquet_test/simple')
    """
Beispiel #3
0
def to_df(path):
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    rdd = sc.textFile('s3a://{path}'.format(path = path))\
       .map(lambda x: parse_line.parse_line(x))\
       .filter(lambda x: x.get('air_temperature_observation_air_temperature') == -7.8)
    df = rdd.toDF()
    df.registerTempTable('my_table')
    df2 = sqlContext.sql("select * from my_table limit 1")
    return df

def count_us_ws(path):
    # Count records whose coordinates fall inside a rough bounding box for
    # the contiguous United States (longitude -117..-68, latitude 22..48).
    sc = SparkContext('local')
    num_records = sc.textFile('s3a://{path}'.format(path=path), 100)\
        .map(lambda x: parse_line.parse_line(x))\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') < -68)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') > -117)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') < 48)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') > 22)\
        .count()
    #pp.pprint(rdd.take(1))
    #print(rdd.getNumPartitions())
    print(num_records)
def to_df(path):
    sc = SparkContext('local')
    sqlContext = SQLContext(sc)
    rdd = sc.textFile('s3a://{path}'.format(path=path))\
        .map(lambda x: parse_line.parse_line(x))\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') < -67)\
        .filter(lambda x: x.get('geophysical_point_observation_longitude_coordinate') > -125)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') is not None)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') < 48)\
        .filter(lambda x: x.get('geophysical_point_observation_latitude_coordinate') > 25)
    df = rdd.toDF()
    df.registerTempTable('my_table')
    df2 = sqlContext.sql("""
        SELECT DISTINCT
            fixed_weather_station_usaf_master_station_catalog_identifier
        FROM my_table
    """)
    return df2
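
# Not part of the original example: a hypothetical caller for to_df above,
# showing how the distinct US station identifiers might be inspected.
def show_us_station_ids(path):
    station_ids = to_df(path)
    station_ids.show(20)
    print(station_ids.count())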


# Example #6
def map_to_fields(path):
    # read_from_s3 is not shown in this snippet; it presumably returns an RDD
    # of raw text lines (e.g. from sc.textFile).
    return read_from_s3(path)\
        .map(lambda x: parse_line.parse_line(x))

# The enclosing unittest.TestCase class is not shown in the original snippet;
# its name is assumed here, and `lines` (raw records) and `pp` (a
# pprint.PrettyPrinter) are assumed module-level fixtures.
class TestParseLine(unittest.TestCase):

    def test_parse_line_does_all_lines_does_not_raise_error(self):
        for line in lines:
            parse_line.parse_line(line)

    def test_parse_line_does_not_raise_error(self):
        pp.pprint(parse_line.parse_line(lines[0]))
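
# Hypothetical entry point (not in the original snippets): run the tests above.
if __name__ == '__main__':
    unittest.main()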