Example #1
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes, ):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date, ):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
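A minimal usage sketch for the mapping above, assuming the imports the snippet relies on (numpy as np, datetime, and pyspark.sql.types as types) are in scope; the expected results follow from the branches shown.

print(as_spark_type(str))      # StringType()
print(as_spark_type(bytes))    # BinaryType()
print(as_spark_type("short"))  # ShortType()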
Example #2
def main():
    df = spark \
        .read \
        .format('kafka') \
        .option('kafka.bootstrap.servers', BOOTSTRAP_SERVERS) \
        .option('subscribe', TOPIC_NAME) \
        .option('group.id', GROUP_ID) \
        .option('startingOffsets', utils.get_starting_offsets(TOPIC_NAME)) \
        .load() \
        .cache()

    ads_data = df.select(
        F.from_json(F.col('value').cast('string'), kafka_schema) \
            .alias('json')
    ) \
        .select('json.*') \
        .withColumn('announcementid', F.col('announcementid').cast('long')) \
        .withColumn('floorNumber', F.col('floorNumber').cast('int')) \
        .withColumn('floorsCount', F.col('floorsCount').cast('int')) \
        .withColumn('roomsCount', F.col('roomsCount').cast('int')) \
        .withColumn('ptn_dadd', F.col('dateInserted').cast(T.DateType()))

    ads_data \
        .write \
        .format('orc') \
        .mode('append') \
        .partitionBy('ptn_dadd') \
        .saveAsTable(RESULT_TABLE)

    partition_offsets_mapping = {
        str(partition): offset + 1
        for partition, offset in df.groupBy('partition').agg({'offset': 'max'}).collect()
    }

    utils.dump_offsets(TOPIC_NAME, partition_offsets_mapping)
def main(inputs,output):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])

    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    sqlTrans = SQLTransformer(statement = 'select *,dayofyear(date) as day FROM __THIS__')
 
    sqlTrans1 = SQLTransformer(statement = 'SELECT today.station,today.date,today.latitude,today.longitude,today.elevation,today.tmax, yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station')
    assemble_features = VectorAssembler(inputCols = ['latitude','longitude','elevation','day','yesterday_tmax'], outputCol = 'features')

    gbt = GBTRegressor(featuresCol = 'features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans1,sqlTrans,assemble_features,gbt])
    weather_model = pipeline.fit(train)

    predictions = weather_model.transform(validation)
    #predictions.show()
    evaluator = RegressionEvaluator(labelCol = 'tmax', predictionCol = 'prediction', metricName = 'rmse')
    score = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % score)

    weather_model.write().overwrite().save(output)
 def expand_date(df):
     df = df.withColumn('Date', df.Date.cast(T.DateType()))
     return df \
         .withColumn('Year', F.year(df.Date)) \
         .withColumn('Month', F.month(df.Date)) \
         .withColumn('Week', F.weekofyear(df.Date)) \
         .withColumn('Day', F.dayofmonth(df.Date))
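A small usage sketch for expand_date, assuming an active SparkSession named spark and the pyspark.sql.functions / pyspark.sql.types imports (F, T) the helper relies on; the sales DataFrame is purely illustrative.

sales = spark.createDataFrame([('2019-10-17',), ('2020-01-03',)], ['Date'])  # hypothetical input
expanded = expand_date(sales)
expanded.show()  # columns: Date, Year, Month, Week, Day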
Example #5
def main(input_dir,output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])

    headlines_df = spark.read.json(input_dir,encoding='utf-8',schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'],1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'],2)
    ).cache()

    for year_int in range(2008,2020):
        print('Plotting for '+str(year_int))
        headlines_year = split_sentiment_df.where(
            functions.year(split_sentiment_df['created_utc_iso']) == year_int
        ).withColumn('year',functions.year(split_sentiment_df['created_utc_iso']))

        headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
            functions.collect_set(headlines_year['title_clean']).alias('titles_group')
        )
        headlines_joined = headlines_grouped.select( functions.array_join(headlines_grouped['titles_group'],' ').alias('joined') )
        string_to_plot = headlines_joined.collect()[0]['joined'] #only one row remaining of concatenated headlines

        wordcloud = WordCloud(background_color='white', stopwords=stopwords, width=1000, height=500).generate(string_to_plot)
        wordcloud.to_file(output_dir + '/'+str(year_int)+'_words.png')
def main(inputs, output):
    '''define the schema'''
    tweets_schema = types.StructType([
        types.StructField('username', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('retweets', types.StringType()),
        types.StructField('favorites', types.StringType()),
        types.StructField('text', types.StringType()),
        types.StructField('geo', types.StringType()),
        types.StructField('mentions', types.StringType()),
        types.StructField('hashtags', types.StringType()),
        types.StructField('id', types.StringType()),
        types.StructField('permalink', types.StringType())
    ])
    '''
    Pass the schema when reading the input file so that Spark does not infer the schema directly from the input.
    '''
    df = spark.read.format("csv").option('header','true').option('delimiter','\u0001').schema(tweets_schema).load(inputs)
    df = df.select('date', 'text', 'hashtags')
    ''' start preprocessing '''
    df = df.filter(df['text'].isNotNull())
    df = df.filter(df['hashtags'].isNotNull())
    df = df.withColumn('hashtags', lower(df['hashtags']))
    df = process_tweet_text(df)    
    df = get_sentiment(df)
    df = get_party(df)
    df.show()
Example #7
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
def main(inputs, output, start_year, end_year):

    #Specifying the schema for the dataframe
    amazon_schema = types.StructType([
        types.StructField('marketplace', types.StringType()),
        types.StructField('customer_id', types.IntegerType()),
        types.StructField('review_id', types.StringType()),
        types.StructField('product_id', types.StringType()),
        types.StructField('product_parent', types.LongType()),
        types.StructField('product_title', types.StringType()),
        types.StructField('product_category', types.StringType()),
        types.StructField('star_rating', types.IntegerType()),
        types.StructField('helpful_votes', types.IntegerType()),
        types.StructField('total_votes', types.IntegerType()),
        types.StructField('vine', types.StringType()),
        types.StructField('verified_purchase', types.StringType()),
        types.StructField('review_headline', types.StringType()),
        types.StructField('review_body', types.StringType()),
        types.StructField('review_date', types.DateType())
    ])

    #Loading the data into dataframe
    raw_dataset = spark.read.option('sep', '\t').csv(inputs,
                                                     schema=amazon_schema,
                                                     header='true')
    raw_dataset = raw_dataset.repartition(96)
    #print("No of rows in raw_dataset:",raw_dataset.count())

    #Keeping only those rows which are verified purchases
    verified_purchases_df = raw_dataset.filter(
        col('verified_purchase') == "Y").cache()
    #print("No of rows in verified_purchases_df:",verified_purchases_df.count())

    #10-core products only - Keeping only the products which have more than 10 reviews
    product_count = verified_purchases_df.groupby('product_id').count().filter(
        col('count') > 10)
    ten_core_dataset = verified_purchases_df.join(broadcast(
        product_count.select('product_id')),
                                                  on='product_id')
    ten_core_dataset.registerTempTable('ten_core_dataset')
    #print("No of rows in ten_core_dataset:",ten_core_dataset.count())
    #Selecting data in the given time range
    sliced_data = spark.sql(
        "SELECT * from ten_core_dataset WHERE year(review_date) BETWEEN " +
        start_year + " AND " + end_year)
    #sliced_data = spark.sql("SELECT * from ten_core_dataset WHERE year(review_date) BETWEEN "+str(2010)+" AND "+str(2015))
    #sliced_data.registerTempTable("sliced_data")
    print("No of rows in sliced_dataset:", sliced_data.count())

    #splitting the datasets year-wise
    # years_year=[2010,2011,2012,2013,2014,2015]
    # for i in years_year:
    #     split_to_years = spark.sql("SELECT * from sliced_data WHERE year(review_date)="+str(i))
    #     print("No of rows in",i,split_to_years.count())
    #     split_to_years.write.partitionBy('product_category').parquet(output+"_"+str(i))

    #Storing the data partitioned on product categories for easy access later on
    sliced_data.write.partitionBy('product_category').parquet(output)
def import_data(request):
    if request.method == 'POST':
        new_file = next(iter(request.FILES.values()))
        # new_file = request.FILES['myfile']
        path = 'data/' + str(new_file.name)

        # import pdb
        # pdb.set_trace()
        project_id = request.user.project.id
        company_name = request.user.project.company

        data = Spark.sc.textFile(path)
        header = data.first()

        # fields2 = [(typ.StructField(h, typ.DateType(), True)) for h in header.split(',') if ('Date' in str(h) else (typ.StructField(h, typ.StringType(), True))
        #           for h in header.split(',')]

        # fields1 = [typ.StructField(h, typ.StringType(), True)
        #           for h in header.split(',')]
        fields = []
        date_column = None
        drop_column = None
        for index, field_name in enumerate(header.split(',')):
            if ('Date' in str(field_name)):
                date_column = index
                drop_column = field_name
                fields.append(typ.StructField(field_name, typ.DateType(),
                                              True))
            else:
                fields.append(
                    typ.StructField(field_name, typ.StringType(), True))
        schema = typ.StructType(fields)

        data = data.filter(lambda row: row != header) \
            .map(lambda row: [dt.strptime(elem, '%d/%m/%Y') if (index==date_column) else str(elem) for index, elem in enumerate(row.split(','))])

        data_df = Spark.sqlContext.createDataFrame(data, schema)
        table_name = str(company_name) + '_Test'
        if drop_column:
            # drop() returns a new DataFrame; reassign instead of discarding it
            data_df = data_df.drop(drop_column)
        # import pdb
        # pdb.set_trace()
        if date_column is not None:
            try:
                unique_value = CustomFields.objects.get(project_id=project_id)
                unique_value.date_column = date_column
                unique_value.save()
            except:
                pass

        data_df.write.format('jdbc').options(
            url='jdbc:mysql://localhost:3306/disease',
            dbtable=table_name,
            user='******',
            password='******').mode('append').save()

    return render(request, "import_data.html")
Example #10
def main(inputs, keyspace, table):
    if table == "yelp_business":
        business_schema = types.StructType([
            types.StructField('business_id', types.StringType(), True),
            types.StructField('name', types.StringType(), True),
            types.StructField('neighborhood', types.StringType(), True),
            types.StructField('address', types.StringType(), True),
            types.StructField('city', types.StringType(), True),
            types.StructField('state', types.StringType(), True),
            types.StructField('postal_code', types.StringType(), True),
            types.StructField('latitude', types.FloatType(), True),
            types.StructField('longitude', types.FloatType(), True),
            types.StructField('stars', types.FloatType(), True),
            types.StructField('review_count', types.LongType(), True),
            types.StructField('is_open', types.IntegerType(), True)
        ])
        business = spark.read.json(inputs, schema=business_schema)
        df = business.drop('neighborhood').filter(business.is_open == 1)
        df.cache()
        business_data = sc.textFile(inputs).map(json_key_value_1).map(
            lambda x: Row(x[0], x[1], x[2], x[3]))
        df_1 = business_data.toDF()
        df_2 = df_1.withColumnRenamed("_1", "bus_id").withColumnRenamed(
            "_2", "attributes").withColumnRenamed(
                "_3", "categories").withColumnRenamed("_4", "hours")
        df_2.cache()
        result = df.join(df_2, df.business_id == df_2.bus_id,
                         how='inner').drop(df_2.bus_id)

    elif table == "yelp_checkin":

        checkin_data = sc.textFile(inputs).map(json_key_value_2).map(
            lambda x: Row(str(uuid.uuid1()), x[0], x[1]))
        df = checkin_data.toDF().cache()
        df_1 = df.withColumnRenamed("_1", "id").withColumnRenamed(
            "_2", "time").withColumnRenamed("_3", "business_id")
        result = df_1

    if table == "yelp_review":
        reviews_schema = types.StructType([
            types.StructField('business_id', types.StringType(), True),
            types.StructField('cool', types.LongType(), True),
            types.StructField('date', types.DateType(), True),
            types.StructField('funny', types.LongType(), True),
            types.StructField('review_id', types.StringType(), True),
            types.StructField('stars', types.LongType(), True),
            types.StructField('text', types.StringType(), True),
            types.StructField('useful', types.LongType(), True),
            types.StructField('user_id', types.StringType(), True)
        ])

        reviews = spark.read.json(inputs, schema=reviews_schema)
        uuidUdf = udf(lambda: str(uuid.uuid1()), types.StringType())
        result = reviews.withColumn("id", uuidUdf())
    result.repartition(300).write.format(
        "org.apache.spark.sql.cassandra").options(table=table,
                                                  keyspace=keyspace).save()
Example #11
def schema():
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    return tmax_schema
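A brief usage sketch for schema(), assuming an active SparkSession named spark; the input path is a hypothetical placeholder.

tmax = spark.read.csv('tmax-data/', schema=schema())  # 'tmax-data/' is illustrative only
tmax.printSchema()  # station, date, latitude, longitude, elevation, tmax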
Example #12
 def __get_basic_schema(self):
     return TableSchema(
         [
             t.StructField("name", t.StringType(), False),
             t.StructField("date", t.DateType(), False),
             t.StructField("visits", t.IntegerType(), True),
         ],
         primary_key=["name", "date"],
         partition_by=["name"],
     )
Example #13
def test_repartition_by_druid_segment_size(spark):
    add_dataframe_druid_extension()

    schema = t.StructType([
        t.StructField('date', t.DateType()),
        t.StructField('country', t.StringType()),
        t.StructField('dau', t.IntegerType()),
        t.StructField('revenue', t.DoubleType()),
    ])

    rows = [
        row(date=to_date("2019-10-17"), country="US", dau=50, revenue=100.0),
        row(date=to_date("2019-10-17"), country="GB", dau=20, revenue=20.0),
        row(date=to_date("2019-10-17"), country="DE", dau=20, revenue=20.0),
        row(date=to_date("2019-10-16"), country="US", dau=50, revenue=100.0),
        row(date=to_date("2019-10-16"), country="FI", dau=20, revenue=20.0),
        row(date=to_date("2019-10-16"), country="GB", dau=20, revenue=20.0),
        row(date=to_date("2019-10-16"), country="DE", dau=20, revenue=20.0)
    ]

    df: DataFrame = get_df(spark, rows, schema)

    # note how we can call .repartitionByDruidSegmentSize directly on Dataset[Row];
    # the nice thing is that this allows continuous method chaining on Dataset without breaking the chain
    df = df.repartition_by_druid_segment_size('date',
                                              segment_granularity='DAY',
                                              rows_per_segment=2)

    # group & count
    # because we can't know which exact rows end up in each partition within the same date
    # however we know how many partitions there should be for each date
    df = df.groupBy('__PARTITION_TIME__', '__PARTITION_NUM__').count()

    expected: DataFrame = get_df(
        spark, [
            row(__PARTITION_TIME__=to_date("2019-10-17"),
                __PARTITION_NUM__=0,
                count=2),
            row(__PARTITION_TIME__=to_date("2019-10-16"),
                __PARTITION_NUM__=1,
                count=2),
            row(__PARTITION_TIME__=to_date("2019-10-17"),
                __PARTITION_NUM__=1,
                count=1),
            row(__PARTITION_TIME__=to_date("2019-10-16"),
                __PARTITION_NUM__=0,
                count=2),
        ],
        t.StructType([
            t.StructField('__PARTITION_TIME__', t.TimestampType()),
            t.StructField('__PARTITION_NUM__', t.IntegerType()),
            t.StructField('count', t.LongType()),
        ]))

    assert_df(df, expected)
Example #14
    def invoice_dataframe(self, invoice_source):
        """
            Fact Invoice records and attributes from dataA Sources
        """
        ri = (
            self.read_source(source=invoice_source)
            .where("business_unit_id == '10'")
            .where("sale_type in ('I', 'E')")
            .where("system_id not in ('SA', '30')")
            # TODO Add year/period filter into config
            .where("concat(year,period) > '201609'"))
        ri = ri.withColumn('iptmeta_source_system', F.lit('dataA'))
        ri = ri.withColumn(
            'ship1_material_id_int',
            F.when(ri.ship_mat1_id.rlike('[^0-9]+'), F.lit(None)).otherwise(
                ri.ship_mat1_id.cast(T.IntegerType())))

        lstrip_0_udf = lstrip_0()
        ri = ri.withColumn('sold_customer_id_lstrip_0',
                           lstrip_0_udf(ri.sold_customer_id))
        ri = ri.withColumn('ship_customer_id_lstrip_0',
                           lstrip_0_udf(ri.ship_customer_id))

        # Strip leading zeros from numeric material_id's
        ri = ri.withColumn(
            'mmf_material',
            F.concat(ri.system_id, F.lit('/'),
                     F.coalesce(ri.ship1_material_id_int, ri.ship_mat1_id)))
        ri = ri.withColumn(
            'commercial_print_customer_key',
            F.concat(ri.system_id, F.lit('/'), ri.sold_customer_id_lstrip_0,
                     ri.system_id, F.lit('/'), ri.ship_customer_id_lstrip_0))
        ri = ri.withColumn("inv_date", F.col('inv_date').cast(T.DateType()))
        ri = ri.withColumn("inv_month", F.month(F.col('inv_date')))
        ri = ri.withColumn("inv_year", F.year(F.col('inv_date')))
        ri = ri.withColumn('invoice_volume',
                           F.coalesce(ri.line_qty, F.lit(MISSING_NUMBER)))
        ri = ri.withColumn(
            'invoice_uom_id',
            F.coalesce(ri.invoice_uom_id, F.lit(MISSING_STRING_ID)))
        ri = ri.withColumnRenamed('sales_rep_id', 'ri_sales_rep_id')
        # some lines have multiple quality class values so if any are prime we treat the whole line as GOOD
        ri = ri.withColumn(
            'prime_flag',
            F.max(
                F.when(F.isnull(ri.quality_class),
                       1).when(ri.quality_class == 'GOOD',
                               1).otherwise(0)).over(
                                   W.partitionBy(ri.system_id, ri.invoice_id,
                                                 ri.line_number)))
        ri = ri.withColumn(
            'quality_class',
            F.when(ri.prime_flag == 1,
                   F.lit('GOOD')).otherwise(ri.quality_class))
        return ri
def process_user_json(input_json_user):
    user_schema = types.StructType([
        types.StructField('user_id', types.StringType(), True),
        types.StructField('average_stars', types.DoubleType(), True),
        types.StructField('review_count', types.LongType(), True),
        types.StructField('yelping_since', types.DateType(), True)
    ])
    users_df = spark.read.json(input_json_user,
                               schema=user_schema).repartition(100)

    write_to_cassandra(users_df, TABLE_USER)
def myschema():

    comments_schema = types.StructType([
        types.StructField('index', types.IntegerType()),
        types.StructField('listing_id', types.IntegerType()),
        types.StructField('id', types.IntegerType()),
        types.StructField('date', types.DateType()),
        types.StructField('reviewer_id', types.IntegerType()),
        types.StructField('reviewer_name', types.StringType()),
        types.StructField('comments', types.StringType())
    ])
    return comments_schema
Example #17
def process_etl_immig(spark):
    """
    Function to load data from dataset, process it and write to parquet files
    """
    # Reading Immigration data
    immig_df = spark.read.format('com.github.saurfang.sas.spark').load(Inpath + 'i94_apr16_sub.sas7bdat')
    #immig_df = spark.read.options(header='True', inferSchema='True', delimiter=',').csv(Inpath + "immigration_data_apr16.csv")

    # Reading supporting documents for immigration data
    i94res_df = spark.read.options(header='True', inferSchema='True', delimiter=',').csv(Inpath + "i94res_country_codes_immigration.csv")
    i94res_df = i94res_df.drop("_c3")
    i94res_df = i94res_df.dropna(how='any',subset = ['i94res','Country']).drop_duplicates()


    i94port_df = spark.read.options(header='True', inferSchema='True', delimiter=';').csv(Inpath + "i94port_city_codes_immigration.csv")
    i94port_df = i94port_df.drop("_c3")
    i94port_df = i94port_df.dropna(how='any',subset = ['i94port','City','State_CD']).drop_duplicates()
    
    # Cleaning the data
    immig_df = immig_df.dropna(how='any',subset=['cicid','i94res','i94port','arrdate','i94addr','i94bir','gender','visatype'])
    immig_df = immig_df.drop_duplicates()
    
    get_date_sas = udf(lambda x: (datetime(1960, 1, 1) + timedelta(days=int(x))), T.DateType())
    
    immig_df = immig_df.withColumn("arrival_date",get_date_sas(immig_df.arrdate))
    
    # create view of immigration and supporting data to extract using SQL queries
    immig_df.createOrReplaceTempView("immigration_data")
    
    i94res_df.createOrReplaceTempView("country_data")
    
    i94port_df.createOrReplaceTempView("port_data")
    
    # extract columns to create staging immigration table
    stg_immig = spark.sql("""SELECT DISTINCT 
                                CAST(id.cicid AS INT) AS ID,
                                INITCAP(cd.Country) AS origin_country,
                                INITCAP(pd.City) AS city,
                                id.i94addr AS state_cd,
                                id.arrival_date,
                                CAST(id.i94bir AS INT) AS age,
                                id.gender AS gender,
                                id.visatype AS visa_type
                          FROM immigration_data id 
                          JOIN country_data cd ON id.i94res = cd.i94res
                          JOIN port_data pd ON id.i94port = pd.i94port AND id.i94addr = pd.State_CD
                          WHERE city IS NOT NULL
                             """)
    
#     stg_immig.show(25)
    
    stg_immig.write.parquet(Outpath + "/immigration",mode = 'overwrite')
Example #18
def main(inputs, keyspace, table):
    schema = types.StructType([
        types.StructField('id', types.StringType(), True),
        types.StructField('host', types.StringType(), True),
        types.StructField('datetime', types.DateType(), True),
        types.StructField('path', types.StringType(), True),
        types.StructField('bytes', types.IntegerType(), True)
    ])

    fields = spark.sparkContext.textFile(inputs).flatMap(getFields)
    data = spark.createDataFrame(fields, schema)
    data.write.format("org.apache.spark.sql.cassandra").options(
        table=table, keyspace=keyspace).save()
Example #19
def transform_tealium_event_dataframe_for_date(sql_context, target_date,
                                               df_events):
    parse_yyyymmdd = F.udf(lambda x: datetime.datetime.strptime(x, '%Y%m%d'),
                           T.DateType())
    df_events = (
        df_events
        .withColumn('date', parse_yyyymmdd(F.col('data.udo.yyyymmdd')))
        .select(
            F.col('post_time').alias('microtime'),
            F.col('data.firstparty_tealium_cookies.device_id').alias('device_id'),
            F.col('data.udo.user_agent').alias('user_agent'),
            'date',
            F.col('data.dom.referrer').alias('referrer'))
        .filter(F.col('date') == target_date)
        .distinct())
    return df_events
Example #20
def get_dtypes_spark(type):
    switcher = {
        'int32': st.IntegerType(),
        'int64': st.LongType(),
        'float32': st.FloatType(),
        'float64': st.DoubleType(),
        'date64': st.DateType(),  #TimestampType
        'str': st.StringType(),
        'boolean': st.BooleanType()
    }

    func = switcher.get(type, "nothing")
    # Execute the function
    return func
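A minimal usage sketch for get_dtypes_spark, assuming pyspark.sql.types is imported as st, as in the snippet above.

print(get_dtypes_spark('date64'))   # DateType()
print(get_dtypes_spark('float32'))  # FloatType()
print(get_dtypes_spark('unknown'))  # falls back to the string 'nothing'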
Example #21
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
Example #22
def main(inputs, output):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(inputs, schema=tmax_schema)

    query = "SELECT t.station AS station, t.date AS date, t.day AS day, t.latitude AS latitude, t.longitude AS longitude, t.elevation AS elevation, t.tmax AS tmax, y.tmax AS tmax_yesterday FROM (SELECT station, date, latitude, longitude, elevation, tmax, DAYOFYEAR(date) AS day, date_sub(date,1) AS date_yesterday FROM __THIS__) t, (SELECT station, date, latitude, longitude, elevation, tmax, DAYOFYEAR(date) AS day, date_sub(date,1) AS date_yesterday FROM __THIS__) y WHERE t.date = y.date_yesterday AND t.station = y.station"
    sqlTrans = SQLTransformer(statement=query)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    # train.show()
    validation = validation.cache()
    assembler = VectorAssembler(inputCols=[
        "latitude", "longitude", "elevation", "day", "tmax_yesterday"
    ],
                                outputCol="features")
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans, assembler, classifier])
    model = pipeline.fit(train)
    predictions = model.transform(validation)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)

    print("R-square for the validation data is: " + str(r2))
    model.write().overwrite().save(output)

    r2 = r2_evaluator.evaluate(model.transform(train))
    print("R-square for the training data is: " + str(r2))

    print(model.stages[-1].featureImportances)

    sfu_predict = [("sfu", datetime.date(2018, 11,
                                         12), 49.2771, -122.9146, 330.0, 12.0),
                   ("sfu", datetime.date(2018, 11,
                                         13), 49.2771, -122.9146, 330.0, 12.0)]
    sfu_predict_df = spark.createDataFrame(sfu_predict, schema=tmax_schema)
    sfu_predict_df.show()
    sfu_predictions = model.transform(sfu_predict_df).select(
        'station', 'date', 'prediction')
    sfu_predictions.show()
 def invoice_dataframe(self, invoice_source):
     inv = self.read_source(source=invoice_source)
     convert_date = F.udf(
         lambda xdate: datetime.datetime.strptime(xdate, '%Y%m%d')
         if len(xdate) == 8 else datetime.datetime(1, 1, 1, 0, 0),
         T.DateType())
     inv = inv.withColumnRenamed('invoice_date', 'invoice_date_original')
     # convert_date expects an 8-character yyyymmdd string; anything else is
     # mapped to the sentinel date 0001-01-01 by the udf above.
     inv = inv.withColumn(
         'invoice_date',
         convert_date(inv.invoice_date_original.cast(T.StringType())))
     # Filter the invoice dataframe to simplify later processing
     inv = self.dataB_filter_plantvals(inv)
     inv = self.dataB_filter_report_date(inv)
     return inv
 def load_to_df(self, json_file):
     temp_df = self.sqlContext.read.json(json_file)
     df = temp_df.select(
         col('id').cast(types.IntegerType()),
         col('name'),
         col('has_test'),
         col('published_at').cast(types.DateType()),
         col('created_at').cast(types.DateType()),
         col('url'),
         col('area.name').alias('area_name'),
         col('salary.from').alias('salary_from'),
         col('salary.to').alias('salary_to'),
         col('salary.currency').alias('salary_currency'),
         col('address.street').alias('address_street'),
         col('address.building').alias('address_building'),
         col('address.raw').alias('address_raw'),
         col('address.metro.station_name').alias('metro_name'),
         col('employer.id').alias('employer_id').cast(types.IntegerType()),
         col('employer.name').alias('employer_name'),
         col('snippet.requirement').alias('snippet_requirement'),
         col('snippet.responsibility').alias('snippet_responsibility'),
         col('contacts.name').alias('contacts_name'),
         col('contacts.email').alias('contacts_email'))
     return df
def process_immig_data(spark, input_data, output_data):
    """
    Summary : Procedure to process log data from S3 song files and extract and write User, time and songdata parquet files back to S3
    
    Parameters
    spark - The spark session creted in the main function
    input_data - The location of the root folder on S3 under which all the json files are stored. 
    output_data - The location of the root folder on S3 under which all the processed parquet files will be stored.
    
    Python functions needed to convert epoch time in logs to datetimestamp to extract all time relevant information.
    
    """
    
    # get filepath to log data file
    log_data = input_data + 'log_data/*'

    # read log data file
    immg_data = spark.read.format('csv').options(header='true', delimiter=',' ) \
                    .load('immigration_data_sample.csv')
    
    def convert_datetime(x):
        try:
            start = datetime(1960, 1, 1)
            return start + timedelta(days=int(x))
        except:
            return None
    
    # cleanup
    udf_datetime_from_sas = udf(lambda x: convert_datetime(x), T.DateType())
    immg_data = immg_data \
        .withColumn("i94yr", col("i94yr").cast("integer")) \
        .withColumn("i94mon", col("i94mon").cast("integer")) \
        .withColumn("i94cit", col("i94cit").cast("integer")) \
        .withColumn("i94res", col("i94res").cast("integer")) \
        .withColumn("i94visa", col("i94visa").cast("integer")) \
        .withColumn("biryear", col("biryear").cast("integer")) \
        .withColumn("admnum", col("admnum").cast("integer")) \
        .withColumn("arrival_date", udf_datetime_from_sas(col("arrdate").cast("integer"))) \
        .withColumn("departure_date", udf_datetime_from_sas(col("depdate").cast("integer"))) 

    #drop duplicates
    immg_data = immg_data.distinct()

    immg_data.createOrReplaceTempView("immg_data")    
    
    # write time table to parquet files partitioned by year and month
    immg_data.write.partitionBy("i94yr","i94mon").mode("overwrite")
        .parquet(os.path.join(output_data,'immg_data.parquet'))
def main(input_dir, keyspace, table):

    data = spark.sparkContext.textFile(input_dir)
    request = data.map(parse)
    df_schema = types.StructType([
        types.StructField('id', types.StringType(), True),
        types.StructField('host', types.StringType(), True),
        types.StructField('datetime', types.DateType(), True),
        types.StructField('path', types.StringType(), True),
        types.StructField('bytes', types.IntegerType(), True)
    ])

    df = spark.createDataFrame(request, df_schema).dropna().repartition('host')

    df.write.format("org.apache.spark.sql.cassandra").mode('overwrite').option('confirm.truncate', True) \
    .options(table=table, keyspace=keyspace).save()
def main(input, model_file):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25], seed=123)
    train = train.cache()
    validation = validation.cache()
    y_tmax = SQLTransformer(
        statement=
        "SELECT today.station,today.latitude,today.longitude,today.elevation,today.date,today.tmax,yesterday.tmax AS yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"
    )
    getvalues = SQLTransformer(
        statement=
        "SELECT station,latitude,longitude,elevation,dayofyear(date) AS dayofyear,tmax,yesterday_tmax from __THIS__"
    )

    assemble_features = VectorAssembler(inputCols=[
        'latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'
    ],
                                        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(
        stages=[y_tmax, getvalues, assemble_features, classifier])

    model = pipeline.fit(train)
    predictions = model.transform(validation)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print('-----------------------------------')
    print('r2: %g' % (r2, ))
    print('-----------------------------------')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('rmse: %g' % (rmse, ))
    model.write().overwrite().save(model_file)
def main(inputs, model_file):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    # data_process=SQLTransformer(statement="SELECT *, dayofyear(date) AS day_of_year FROM __THIS__ ")
    data_process = SQLTransformer(
        statement=
        "SELECT today.latitude,today.longitude,today.tmax AS tmax, today.elevation,  \
                                  dayofyear(today.date) AS day_of_year,yesterday.tmax AS yesterday_tmax\
                                  FROM __THIS__ as today \
                                  INNER JOIN __THIS__ as yesterday \
                                  ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"
    )
    # assemble_features = VectorAssembler(inputCols=['latitude','longitude','elevation', 'day_of_year'], outputCol='features')
    assemble_features = VectorAssembler(inputCols=[
        'latitude', 'longitude', 'elevation', 'day_of_year', 'yesterday_tmax'
    ],
                                        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    # classifier = GeneralizedLinearRegression(featuresCol='features', labelCol='tmax',family='gaussian', link='identity')
    pipeline = Pipeline(stages=[data_process, assemble_features, classifier])
    model = pipeline.fit(train)
    predictions = model.transform(validation)
    #    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='tmax',
                                       metricName='r2')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='tmax',
                                         metricName='rmse')
    r2 = r2_evaluator.evaluate(predictions)
    rmse = rmse_evaluator.evaluate(predictions)
    print('r-square for GBT model: %g' % (r2, ))
    print('root mean square error for GBT model: %g' % (rmse, ))
    model.write().overwrite().save(model_file)
Example #29
 def test_prepending_a_mapping_with_duplicated_columns(
         self, input_columns, mapped_df):
     """Output schema is correct for newly prepended mapping with columns
     that are also included in the input schema"""
     new_mapping = [
         ("created_date", "meta.created_at_sec", "DateType"),
         ("birthday", "birthday", "DateType"),
     ]
     new_columns = [name for (name, path, data_type) in new_mapping]
     new_columns_deduplicated = [
         x for x in new_columns if x not in input_columns
     ]
     new_mapped_df = Mapper(
         mapping=new_mapping, mode="prepend",
         ignore_missing_columns=True).transform(mapped_df)
     assert new_columns_deduplicated + input_columns == new_mapped_df.columns
     assert mapped_df.schema["birthday"].dataType == T.TimestampType()
     assert new_mapped_df.schema["birthday"].dataType == T.DateType()
def get_spark_type(field, required_type):
    if isinstance(required_type, type(db_types.DATE())):
        return spk_types.StructField(field, spk_types.DateType(), True)
    elif isinstance(required_type, type(db_types.DATETIME())):
        return spk_types.StructField(field, spk_types.TimestampType(), True)
    elif isinstance(required_type, type(db_types.VARCHAR())):
        return spk_types.StructField(field, spk_types.StringType(), True)
    elif isinstance(required_type, type(db_types.INT())):
        # The DB type is enforced before the Spark type, so the Spark type has to
        # be the less restrictive of the two; hence LongType instead of IntegerType.
        return spk_types.StructField(field, spk_types.LongType(), True)
    elif isinstance(required_type, type(db_types.FLOAT())):
        return spk_types.StructField(field, spk_types.FloatType(), True)
    elif isinstance(required_type, type(db_types.BOOLEAN())):
        return spk_types.StructField(field, spk_types.BooleanType(), True)
    else:
        raise Exception(
            "Type not recognized, field={}, required_type={}".format(
                field, required_type))
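A hedged usage sketch for get_spark_type: db_types is assumed to be sqlalchemy.types (the DATE()/VARCHAR()/BOOLEAN() constructors suggest as much) and spk_types to be pyspark.sql.types; if the project binds these names differently, adjust accordingly.

import sqlalchemy.types as db_types  # assumption; the snippet only shows the alias

print(get_spark_type('created_on', db_types.DATE()))
# e.g. StructField('created_on', DateType(), True)
print(get_spark_type('visits', db_types.INT()))
# e.g. StructField('visits', LongType(), True) -- LongType chosen deliberately (see comment above)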