Example 1
from pyspark.sql.functions import col, year, month, dayofmonth, quarter


def getCalendar(df_immigration):
    """ Generate the Calendar dimension table from the Immigration data.

    Args:
      df_immigration : dataframe containing immigration information

    Returns:
      df_calendar    : dataframe containing calendar information

    """
    # build the date dimension by combining arrival and departure dates
    df_temp = df_immigration.select(
        col("arrival_date").alias("date_deparr")).distinct()
    df_temp2 = df_immigration.select(
        col("departure_date").alias("date_deparr")).distinct()
    df_calendar = df_temp.union(df_temp2)
    df_calendar = df_calendar.distinct()

    # derive year, month, day, and quarter from the date
    df_calendar = df_calendar.withColumn("year_deparr", year(col("date_deparr"))) \
                             .withColumn("month_deparr", month(col("date_deparr"))) \
                             .withColumn("day_deparr", dayofmonth(col("date_deparr"))) \
                             .withColumn("quarter_deparr", quarter(col("date_deparr")))

    # drop rows with a missing date
    df_calendar = df_calendar.na.drop(subset=["date_deparr"])

    return df_calendar
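
A brief usage sketch (hypothetical call; df_immigration is assumed to expose arrival_date and departure_date columns):

df_calendar = getCalendar(df_immigration)
df_calendar.select("date_deparr", "year_deparr", "quarter_deparr").show(5)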
Example 2
def createDimDate(df):
    '''
    Creates a date dimension table from the DateTime field in the upstream dataframe
    :param df: upstream dataframe containing rawKey and DateTime columns
    :return: (date dimension dataframe, fact-helper dataframe with rawKey, dateKey and dateMinute)
    '''
    df = df.withColumn("rawKey", F.col('rawKey'))
    df = df.withColumn("year", F.year(F.col('DateTime')))
    df = df.withColumn("month", F.month(F.col('DateTime')))
    df = df.withColumn("dayofmonth", F.dayofmonth(F.col('DateTime')))
    df = df.withColumn("dayofweek", F.dayofweek(F.col('DateTime')))
    df = df.withColumn("dayofyear", F.dayofyear(F.col('DateTime')))
    df = df.withColumn("hour", F.hour(F.col('DateTime')))
    df = df.withColumn("minute", F.minute(F.col('DateTime')))
    df = df.withColumn("dateMinute", F.date_format(F.col("DateTime"), "yyyyMMddHHmm"))
    df = df.withColumn("quarter", F.quarter(F.col('DateTime')))
    df = df.withColumn("date", F.to_date(F.col('DateTime')))
    df.createOrReplaceTempView('tempDimDateTable')
    dimDateDF = spark.sql(" SELECT * FROM \
                    (select rawKey,dateMinute,dateTime, date,year, month,dayofmonth,dayofweek,dayofyear,hour, minute,quarter \
                    from tempDimDateTable \
                    group by rawKey,dateMinute,dateTime, date,year, month,dayofmonth,dayofweek,dayofyear,hour, minute,quarter \
                    order by dateMinute ASC) \
                    ")

    # Generating dateKey field
    dimDateDF = dimDateDF.withColumn('dateKey', F.monotonically_increasing_id() + 1)
    # Create a helper dataframe (rawKey, dateKey, dateMinute) used later to build the Fact table
    factHelperDateDF = dimDateDF.select(F.col('rawKey'), F.col('dateKey'), F.col('dateMinute'))
    # Dropping unnecessary rawKey field
    dimDateDF = dimDateDF.drop(F.col('rawKey'))
    return dimDateDF, factHelperDateDF
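
A minimal usage sketch (events_df is a hypothetical dataframe assumed to carry rawKey and DateTime columns); note the function returns both the date dimension and the fact-helper dataframe:

dimDateDF, factHelperDateDF = createDimDate(events_df)
dimDateDF.select("dateKey", "dateMinute", "year", "quarter").show(5)
factHelperDateDF.show(5)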
Example 3
    def _extract_dim_date(self, imgDF: DataFrame, output_path: str):
        print('Extracting dim_date from immigration data...')

        dateCols = [col for col in imgDF.columns if col.endswith('_date')]
        dates: DataFrame = None
        for dateCol in dateCols:
            dt = imgDF \
                .where(F.col(dateCol).isNotNull()) \
                .select(
                    F.col(dateCol).alias('date'),
                    F.year(F.col(dateCol)).cast('smallint').alias('year'),
                    F.quarter(F.col(dateCol)).cast('smallint').alias('quarter'),
                    F.month(F.col(dateCol)).cast('smallint').alias('month'),
                    F.dayofweek(F.col(dateCol)).cast('smallint').alias('day_of_week'),
                    F.dayofmonth(F.col(dateCol)).cast('smallint').alias('day_of_month'),
                    F.dayofyear(F.col(dateCol)).cast('smallint').alias('day_of_year'),
                    F.weekofyear(F.col(dateCol)).cast('smallint').alias('week_of_year')
            )
            if dates is not None:
                dates = dates.union(dt).distinct()
            else:
                dates = dt

        print('Saving dim_date')
        self._write_data(dates, output_path, 'date')
        print('Finished saving dim_date')

        print('Finished extracting dim_date from immigration data.')
def transform(dataframe):
    # Add date partitions and cast user_ids
    cleaned = (
        dataframe.withColumn("year", F.year("timestamp"))
        .withColumn("quarter", F.quarter("timestamp"))
        .withColumn("user_id", F.col("user_id").cast("int"))
        .orderBy("timestamp")
    )
    return cleaned
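
A quick usage sketch for the transform above (hypothetical data; the function assumes the dataframe has timestamp and user_id columns):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("42", "2021-05-03 10:15:00"), ("7", "2021-11-20 08:00:00")],
    ["user_id", "timestamp"],
).withColumn("timestamp", F.to_timestamp("timestamp"))
transform(events).show()  # adds year/quarter columns and casts user_id to int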
Example 5
def glean_3(invoice_df):
    invoices = invoice_df.withColumn('invoice_year',
                                     funcs.year('invoice_date'))
    invoices = invoices.withColumn('invoice_month',
                                   funcs.month('invoice_date'))
    invoices = invoices.withColumn('invoice_quarter',
                                   funcs.quarter('invoice_date'))
    invoices = invoices.groupBy([
        'canonical_vendor_id', 'invoice_year', 'invoice_month',
        'invoice_quarter'
    ]).agg({
        'total_amount': 'sum',
        'invoice_date': 'max'
    }).sort('canonical_vendor_id', 'invoice_year', 'invoice_month')
    days = lambda i: i * 86400

    w = (Window.partitionBy('canonical_vendor_id').orderBy(
        funcs.col('max(invoice_date)').cast('long')).rangeBetween(
            -days(365), 0))

    invoices = invoices.withColumn('rolling_average_12m',
                                   funcs.avg("sum(total_amount)").over(w))

    glean3 = invoices[
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] * 6)
         & (invoices['sum(total_amount)'] < 1000) &
         (invoices['sum(total_amount)'] >= 100)) |
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] * 3)
         & (invoices['sum(total_amount)'] < 10000) &
         (invoices['sum(total_amount)'] >= 1000)) |
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] *
          1.5) & (invoices['sum(total_amount)'] >= 10000))]

    glean3 = glean3.withColumn(
        'dollar_dif',
        funcs.col('sum(total_amount)') - funcs.col('rolling_average_12m'))
    glean3 = glean3.withColumn(
        'percent_dif',
        funcs.round(100 * funcs.col('dollar_dif') /
                    funcs.col('rolling_average_12m')))
    glean3 = glean3.withColumn('glean_location', funcs.lit('vendor'))
    glean3 = glean3.withColumn("glean_type",
                               funcs.lit('large_month_increase_mtd'))
    glean3 = glean3.withColumn(
        "glean_text",
        funcs.concat(funcs.lit('Monthly spend with '),
                     funcs.col('canonical_vendor_id'), funcs.lit(' is $'),
                     funcs.col('dollar_dif'), funcs.lit(' ('),
                     funcs.col('percent_dif'),
                     funcs.lit('%) higher than average')))

    glean3 = glean3.withColumn("invoice_id", funcs.lit('n/a'))
    glean3 = glean3.withColumn('glean_date', funcs.col('max(invoice_date)'))
    return glean3
def extract_date(df):
    #df = df.na.fill('NORMAL',subset=['types'])
    df = df.dropna(how='all')
    df = (df
    .withColumn('Yearday', F.dayofyear(F.col("SalDate")))
    .withColumn('Month', F.month(F.col('SalDate')))
    .withColumn('DayofWeek', F.dayofweek(F.col('SalDate')))
    .withColumn('Year', F.year(F.col('SalDate')))
    .withColumn('Quarter', F.quarter(F.col('SalDate')))
    .withColumn('WeekOfYear', F.weekofyear(F.col('SalDate')))
    # bucket the day of month (derived from SalDate) into four parts
    .withColumn('MonthQuarter',
                F.when(F.dayofmonth(F.col('SalDate')) <= 8, 0)
                 .when(F.dayofmonth(F.col('SalDate')) <= 16, 1)
                 .when(F.dayofmonth(F.col('SalDate')) <= 24, 2)
                 .otherwise(3))
    )
    df = df.cache()
    return df
def transform(dataframe, limit=None):
    # prepare the revision column
    revision_columns = [
        "article_id",
        "rev_id",
        "article_title",
        "timestamp",
        "username",
        "user_id",
    ]
    revision_query = [
        F.col("_revision").getItem(i).alias(name)
        for i, name in enumerate(revision_columns)
    ]
    dataframe = (
        dataframe.withColumn("_revision", F.split("revision", " "))
        .select(revision_query + dataframe.columns[1:])
        .drop("_revision")
    )

    # add in the proper typing
    typemap = dict(
        [
            ("article_id", "int"),
            ("rev_id", "int"),
            ("timestamp", "timestamp"),
            ("minor", "boolean"),
            ("textdata", "int"),
        ]
    )

    def cast(name):
        if name in typemap:
            return F.col(name).cast(typemap[name])
        return F.col(name)

    cast_query = map(cast, dataframe.columns)
    dataframe = dataframe.select(cast_query)

    # include limits
    if limit:
        dataframe = dataframe.limit(limit)

    return dataframe.withColumn("year", F.year("timestamp")).withColumn(
        "quarter", F.quarter("timestamp")
    )
Example 8
 def _transform(self, df):
     self.check_input_type(df.schema)
     return df.withColumn(self.outputCol, F.quarter(df[self.inputCol]))
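
The method above is only a fragment of a custom pyspark.ml Transformer (check_input_type is a project-specific validation not shown). A minimal self-contained sketch of such a transformer, with assumed class and column names, could look like:

from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql import functions as F


class QuarterExtractor(Transformer, HasInputCol, HasOutputCol):
    """Hypothetical example: adds a quarter column derived from a date/timestamp column."""

    def __init__(self, inputCol="date", outputCol="quarter"):
        super().__init__()
        self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, df):
        return df.withColumn(self.getOutputCol(), F.quarter(df[self.getInputCol()]))


# usage: quarters = QuarterExtractor(inputCol="sale_date", outputCol="sale_quarter").transform(sales_df)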
Example 9
df2 = df.filter(df.Timestamp.isNotNull())
df2 = df2.limit(500)
# convert timestamp to the right format
timeStampPreCleaning = udf(
    lambda x: str(x) + " 2020" if len(x) < 8 else x.replace(",", ""), StringType()
)
df2 = df2.withColumn("Timestamp", timeStampPreCleaning("Timestamp"))

# String to DateType
df3 = df2.withColumn("TimeStampDateType", F.to_date(F.col("Timestamp"), "MMM dd yyyy"))

# drop rows whose Timestamp is not in the standard format (to_date yields null for them)
df3 = df3.filter(df3.TimeStampDateType.isNotNull())
df3 = df3.withColumn("Year", F.year(df3.TimeStampDateType))
df3 = df3.withColumn("Month", F.month(df3.TimeStampDateType))
df3 = df3.withColumn("Quarter", F.quarter(df3.TimeStampDateType))
# fill nulls with "0" and convert the values to proper numbers
cols = ["Comments", "Likes", "Retweets"]

df3 = df3.fillna("0", subset=cols)

# apply the transform_number udf
transformNumber = udf(lambda z: transform_number(z), T.IntegerType())
df3 = df3.withColumn("Comments", transformNumber("Comments"))
df3 = df3.withColumn("Likes", transformNumber("Likes"))
df3 = df3.withColumn("Retweets", transformNumber("Retweets"))
### check
logNormal = udf(lambda x: int(round(np.log2(x + 1)))+1, T.IntegerType())
df3 = df3.withColumn("Likes_log", logNormal("Likes"))
df3 = df3.withColumn("Retweets_log", logNormal("Retweets"))
# df3 = df3.filter(df3.Likes_log.isNotNull())
Example 10
        then b.UnitPrice*b.Quantity*(1-b.Discount) else 0 end), 0) "Qtr 4" 
from Products a 
inner join Order_Details b on a.ProductID = b.ProductID
inner join Orders c on c.OrderID = b.OrderID
inner join Customers d on d.CustomerID = c.CustomerID 
where c.OrderDate between date('1997-01-01') and date('1997-12-31')
group by a.ProductName, 
    d.CompanyName, 
    year(OrderDate)
order by a.ProductName, d.CompanyName;
"""

ordersDF = getTable("david_test.orders")
customersDF = getTable("david_test.Customers")

quarterlyOrdersByProductsDF = productsDF.alias("a").join(orderDetailsDF.alias("b"), productsDF.product_id == orderDetailsDF.product_id, how="inner") \
                                        .join(ordersDF.alias("c"), orderDetailsDF.order_id == ordersDF.order_id, how="inner") \
                                        .join(customersDF.alias("d"), ordersDF.customer_id == customersDF.customer_id, how="inner") \
                                        .filter(col("c.order_date").between("1997-01-01", "1997-12-31")) \
                                        .select("a.product_name", "d.company_name", year("c.order_date").alias("year_order_date"), "c.order_date", "b.unit_price", "b.quantity", "b.discount") \
                                        .groupBy("a.product_name", "d.company_name", "year_order_date") \
                                        .agg(sum(when(quarter("c.order_date") == 1, col("b.unit_price") * col("b.quantity") * (1 - col("b.discount"))).otherwise(0)).cast(IntegerType()).alias("qtr 1"),
                                             sum(when(quarter("c.order_date") == 2, col("b.unit_price") * col("b.quantity") * (1 - col("b.discount"))).otherwise(0)).cast(IntegerType()).alias("qtr 2"),
                                             sum(when(quarter("c.order_date") == 3, col("b.unit_price") * col("b.quantity") * (1 - col("b.discount"))).otherwise(0)).cast(IntegerType()).alias("qtr 3"),
                                             sum(when(quarter("c.order_date") == 4, col("b.unit_price") * col("b.quantity") * (1 - col("b.discount"))).otherwise(0)).cast(IntegerType()).alias("qtr 4")) \
                                        .select("a.product_name", "d.company_name", "year_order_date", "qtr 1", "qtr 2", "qtr 3", "qtr 4") \
                                        .orderBy("a.product_name", "d.company_name")

# quarterlyOrdersByProductsDF.show()

quarterlyOrdersByProductsDF.explain()
def insert_time_dim(start_date_id, end_date_id):
    time_begin = datetime.strptime(str(start_date_id), "%Y%m%d").date()
    time_end = datetime.strptime(str(end_date_id), "%Y%m%d").date()

    print('time_begin')
    print(time_begin)

    print('time_end')
    print(time_end)

    # create a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # cast the data types and rename the fields
    df = df.select(
        df.minDate.cast(DateType()).alias("minDate"),
        df.maxDate.cast(DateType()).alias("maxDate"))

    # generate every day between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # derive the day/month/year attributes from the date
    df = df.withColumn('id', f.date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', f.from_unixtime(f.unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # convert data
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang',
                                                'cast:int')])

    # select the fields and data types to load into the database
    applymapping1 = ApplyMapping.apply(
        frame=data_time,
        mappings=[("id", "string", "id", "bigint"),
                  ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                  ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                  ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                  ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                  ("thang", "int", "thang", "int"),
                  ("quy", "int", "quy", "int"), ("nam", "int", "nam", "int"),
                  ("date", "date", "ngay", "timestamp")])

    resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # write the data to the database
    preactions = 'delete student.time_dim where id >= ' + str(start_date_id)
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields3,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": preactions,
            "dbtable": "student.time_dim",
            "database": "student_native_report"
        },
        redshift_tmp_dir=
        "s3n://dts-odin/temp/tu-student_native_report/student/time_dim",
        transformation_ctx="datasink4")
def main():

    # args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])

    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)

    # create time_begin as the current date + 1 day
    # time_begin = datetime.date.today() + datetime.timedelta(days=1)
    #20190908
    time_begin = datetime.date(2019, 9, 9)

    # create time_end as the current date + 1 month - 1 day
    # time_end = time_begin + relativedelta(months=1) - datetime.timedelta(days=1)

    time_end = datetime.date(2019, 10, 1)

    # create a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # cast the data types and rename the fields
    df = df.select(
        df.minDate.cast(DateType()).alias("minDate"),
        df.maxDate.cast(DateType()).alias("maxDate"))

    # generate every day between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # derive the day/month/year attributes from the date
    df = df.withColumn('id', f.date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', f.from_unixtime(f.unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # convert data
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang',
                                                'cast:int')])

    # select the fields and data types to load into the database
    applymapping1 = ApplyMapping.apply(
        frame=data_time,
        mappings=[("id", "string", "id", "bigint"),
                  ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                  ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                  ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                  ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                  ("thang", "int", "thang", "int"),
                  ("quy", "int", "quy", "int"), ("nam", "int", "nam", "int"),
                  ("date", "date", "ngay", "timestamp")])

    resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # write the data to the database
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields3,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "dim_time",
            "database": "dts_odin"
        },
        redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/fdfdf",
        transformation_ctx="datasink4")
Example 13
        Fld("title", String()),
        Fld("video", Boolean()),
        Fld("vote_average", Float()),
        Fld("vote_count", Int())
    ])


    movies_df = spark.read.option("header", "true") \
                           .csv("s3a://{}/{}/movies_metadata.csv".format(s3_bucket, s3_key),
                                schema=movies_schema)

    movies_df = movies_df.na.drop()

    # extract columns to create time table
    date_table = movies_df.select(
        col('release_date'),
        dayofmonth("release_date").alias('day'),
        weekofyear("release_date").alias('week'),
        month("release_date").alias('month'),
        quarter("release_date").alias('quarter'),
        year("release_date").alias('year')).dropDuplicates()

    date_table.write \
              .format("jdbc")  \
              .option("url", redshift_conn_string) \
              .option("dbtable", "movies.stage_date") \
              .option("user", sys.argv[6]) \
              .option("password", sys.argv[7]) \
              .option("driver", "com.amazon.redshift.jdbc42.Driver") \
              .mode("append") \
              .save()
def generate_dim_date(spark, start_year=1901, number_years_out_from_start=300):
    """Create `dim_date` table containing various date feature columns.

    Args:
        spark (SparkSession): Instantiated SparkSession
        start_year (int): starting year for dim_date table.
        number_years_out_from_start (int): number of years after `start_year` to generate.

    Returns:
        Spark DataFrame.
    """
    years = [start_year + i for i in range(number_years_out_from_start + 1)]
    months = [i for i in range(1, 13)]
    days = [i for i in range(1, 32)]

    years_df = spark.createDataFrame(
        pd.DataFrame({
            'year': years,
            'temp_join_key': '1'
        }))
    months_df = spark.createDataFrame(
        pd.DataFrame({
            'month': months,
            'temp_join_key': '1'
        }))
    days_df = spark.createDataFrame(
        pd.DataFrame({
            'day_of_month': days,
            'temp_join_key': '1'
        }))

    years_months_df = (years_df.join(months_df, ['temp_join_key'],
                                     how='inner'))

    years_month_days_df = (years_months_df.join(days_df, ['temp_join_key'],
                                                how='inner'))

    date_keys = (
        years_month_days_df.withColumn(
            'date',
            to_date(
                concat(col('year'), lpad(col('month'), 2, '0'),
                       lpad(col('day_of_month'), 2, '0')), 'yyyyMMdd'))
        # remove invalid dates
        .filter("date IS NOT NULL").withColumn(
            'date_key',
            regexp_replace(col('date').cast('string'), '-',
                           '').cast('integer')))

    date_features = (date_keys
                     # get `week` and `quarter`
                     .withColumn('week', weekofyear(col('date')))
                     .withColumn('quarter', quarter(col('date')))
                     # get `day_name` and `month_name`
                     .withColumn('day_name', date_format(col('date'), 'EEEE'))
                     .withColumn('month_name', date_format(col('date'), 'MMMM'))
                     # get `date_year`, `date_quarter`, `date_month`, `date_week`
                     .withColumn('date_week', expr("MIN(date) OVER(PARTITION BY week, year)"))
                     .withColumn('date_month', date_format(col('date'), 'yyyy-MM-01'))
                     .withColumn('date_quarter', expr("MIN(date) OVER(PARTITION BY quarter, year)"))
                     .withColumn('date_year', date_format(col('date'), 'yyyy-01-01'))
                     # get `day_of_week`, `day_of_quarter`, `day_of_year`
                     .withColumn('day_of_week', dayofweek(col('date')))
                     .withColumn('day_of_quarter', datediff(col('date'), col('date_quarter')) + lit(1))
                     .withColumn('day_of_year', dayofyear(col('date')))
                     # get `weekend_flag`, `us_holiday_flag`, `business_day_flag`, `leap_year_flag`,
                     # `month_start_flag`, `month_end_flag`
                     .withColumn('weekend_flag', when(col('day_of_week').isin([7, 1]), 'Y').otherwise('N'))
                     .withColumn('us_holiday_flag', pd_is_holiday_usa(col('date').cast('timestamp')))
                     .withColumn('us_biz_day_flag', when((col('weekend_flag') == lit('Y')) |
                                                         (col('us_holiday_flag') == lit('Y')), 'N').otherwise('Y'))
                     .withColumn('leap_year_flag',
                                 when(dayofmonth(last_day(concat(col('year'), lit('-02-01')).cast('date'))) == 29, 'Y')
                                 .otherwise('N'))
                     .withColumn('month_start_flag', when(col('day_of_month') == lit(1), 'Y').otherwise('N'))
                     .withColumn('month_end_flag', when(col('date') == last_day(col('date')), 'Y').otherwise('N'))
                     # get `pct_into_month`, `pct_into_quarter`, `pct_into_year`
                     .withColumn('pct_into_month',
                                 (col('day_of_month') / dayofmonth(last_day(col('date')))).cast('decimal(7, 6)'))
                     .withColumn('date_quarter_end',
                                 when(col('quarter') == lit(1), concat(col('year'), lit('-03-31')))
                                 .when(col('quarter') == lit(2), concat(col('year'), lit('-06-30')))
                                 .when(col('quarter') == lit(3), concat(col('year'), lit('-09-30')))
                                 .when(col('quarter') == lit(4), concat(col('year'), lit('-12-31')))
                                 .otherwise(None)
                                 .cast('date'))
                     .withColumn('days_in_quarter', datediff(col('date_quarter_end'), col('date_quarter')) + lit(1))
                     .withColumn('pct_into_quarter',
                                 (col('day_of_quarter') / col('days_in_quarter')).cast('decimal(7, 6)'))
                     .withColumn('pct_into_year',
                                 (col('day_of_year') / when(col('leap_year_flag') == lit('Y'), 366.0).otherwise(365.0))
                                 .cast('decimal(7, 6)'))
                     # get seasons
                     .withColumn('season_northern',
                                 when(col('month').isin(12, 1, 2), 'Winter')
                                 .when(col('month').isin(3, 4, 5), 'Spring')
                                 .when(col('month').isin(6, 7, 8), 'Summer')
                                 .when(col('month').isin(9, 10, 11), 'Fall')
                                 .otherwise('UNKNOWN'))
                     .withColumn('season_southern',
                                 when(col('month').isin(6, 7, 8), 'Winter')
                                 .when(col('month').isin(9, 10, 11), 'Spring')
                                 .when(col('month').isin(12, 1, 2), 'Summer')
                                 .when(col('month').isin(3, 4, 5), 'Fall')
                                 .otherwise('UNKNOWN')))

    dim_date = (date_features.sort('date').select([
        'date_key', 'date', 'date_week', 'date_month', 'date_quarter',
        'date_year', 'day_of_week', 'day_of_month', 'day_of_quarter',
        'day_of_year', 'week', 'month', 'quarter', 'year', 'days_in_quarter',
        'day_name', 'month_name', 'season_northern', 'season_southern',
        'weekend_flag', 'us_holiday_flag', 'us_biz_day_flag',
        'month_start_flag', 'month_end_flag', 'leap_year_flag',
        'pct_into_month', 'pct_into_quarter', 'pct_into_year'
    ]))
    return dim_date
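
A short usage sketch (note that pd_is_holiday_usa referenced above is a user-defined holiday UDF that must already be in scope):

# hypothetical parameters: a three-year date dimension starting in 2020
dim_date = generate_dim_date(spark, start_year=2020, number_years_out_from_start=2)
dim_date.filter("month_end_flag = 'Y'").select("date_key", "date", "quarter", "days_in_quarter").show(5)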
    df = spark.sql("""SELECT * FROM stag_immigration""")

    # get date range
    df = df.select("arrival_date").distinct().orderBy("arrival_date")

    # generate columns
    df = df.withColumn("year", F.date_format("arrival_date", "y")) \
        .withColumn("month", F.date_format("arrival_date", "M")) \
        .withColumn("day", F.date_format("arrival_date", "d")) \
        .withColumn("month_string", F.date_format("arrival_date", "MMM")) \
        .withColumn("day_string", F.date_format("arrival_date", "E")) \
        .withColumn("week", F.date_format("arrival_date", "w")) \
        .withColumn("day_of_year", F.dayofyear("arrival_date")) \
        .withColumn("day_of_week", F.dayofweek("arrival_date")) \
        .withColumn("quarter", F.quarter("arrival_date"))

    # create unique identifier
    df = df.withColumn("id", F.monotonically_increasing_id() + 1)

    # select relevant columns
    df = df.select("id", "arrival_date", "year", "month", "day",
                   "month_string", "day_string", "week", "day_of_year",
                   "day_of_week", "quarter")

    # show final table
    df.show()
    df.printSchema()

    # write to parquet on s3
    df.write.mode("overwrite").option("path", OUTPUT_PATH) \
Example 16
def load_datasources(spark):
    # Load JDBC data sources (tables: claims, drugproduct, drugproductfamily, indication):
    jdbc_url = 'jdbc:postgresql://localhost:5432/bigdata_source'
    jdbc_table_claim = 'public.claim'
    jdbc_table_drugproduct = 'public.drugproduct'
    jdbc_table_drugproductfamily = 'public.drugproductfamily'
    jdbc_table_indication = 'public.indication'
    jdbc_credentials = {'user': '******', 'password': '******'}
    data_filepath_base = '/home/janusz/bigdata/projekt/data/'

    src_claims = spark.read \
        .jdbc(jdbc_url, jdbc_table_claim,
              properties=jdbc_credentials)

    src_drugproduct = spark.read \
        .jdbc(jdbc_url, jdbc_table_drugproduct,
              properties=jdbc_credentials)

    src_drugproductfamily = spark.read \
        .jdbc(jdbc_url, jdbc_table_drugproductfamily,
              properties=jdbc_credentials)

    src_indication = spark.read \
        .jdbc(jdbc_url, jdbc_table_indication,
              properties=jdbc_credentials)

    # Load file data sources:
    src_countries = spark.read.load(data_filepath_base + 'countries_data.csv', format='csv', header='true')

    src_factories1 = spark.read.load(data_filepath_base + 'factories.tsv', format='csv', delimiter='\t')
    src_factories2 = spark.read.load(data_filepath_base + 'factories_2.tsv', format='csv', delimiter='\t')
    src_factories = src_factories1.unionAll(src_factories2).distinct()
    src_factories = src_factories.toDF('code', 'name', 'country')

    src_responses1 = spark.read.load(data_filepath_base + 'responses_data.csv', format='csv', header='true')
    src_responses2 = spark.read.load(data_filepath_base + 'responses_data_2.csv', format='csv', header='true')
    src_responses = src_responses1.unionAll(src_responses2).distinct() \
        .toDF('claim_number', 'response_date', 'response')

    # Build Facts and Dimensions data frames
    def format_bk(bk):
        return bk.strip().upper()

    udf_format_bk = udf(format_bk)
    spark.udf.register('format_bk', format_bk)

    dim_countries = src_countries \
        .toDF('country_name', 'alpha_2_code', 'alpha_3_code', 'numeric_code', 'link') \
        .select('alpha_2_code', 'country_name') \
        .toDF('country_bk', 'country_name') \
        .withColumn('country_id', monotonically_increasing_id())

    dim_factories = src_factories \
        .withColumn('code', udf_format_bk(src_factories.code)) \
        .toDF('factory_bk', 'factory_name', 'factory_country') \
        .withColumn('factory_id', monotonically_increasing_id())

    dim_indications = src_indication \
        .select('indicationid', 'indicationdescription') \
        .toDF('indication_bk', 'indication_description') \
        .withColumn('indication_id', monotonically_increasing_id())

    dic_response_code = spark.read.load(data_filepath_base + 'dic_response_code.tsv', format='csv', delimiter='\t') \
        .toDF('from', 'to')
    src_responses.createOrReplaceTempView('response_src')
    dic_response_code.createOrReplaceTempView('dic_response_code')
    dim_responses = spark.sql("""
        SELECT
        monotonically_increasing_id() as response_id,
        format_bk(claim_number) as response_bk,
        response_date,
        dic.to as response
        FROM response_src
        INNER JOIN dic_response_code as dic ON response_src.response = dic.from
        """)

    src_drugproduct.createOrReplaceTempView('src_drugproduct')
    src_drugproductfamily.createOrReplaceTempView('src_drugproductfamily')

    dim_products = spark.sql("""
    SELECT monotonically_increasing_id() as product_id,
    format_bk(drugproductid) as product_bk,
    drugproductname as product_name,
    drugproductfamilyname as product_family_name
    FROM src_drugproduct
    LEFT OUTER JOIN src_drugproductfamily ON src_drugproduct.drugproductfamilyid = src_drugproductfamily.drugproductfamilyid""")

    format_quarter = udf(lambda num_quarter: 'Q' + str(num_quarter))

    calendar_start = date(year=1900, month=1, day=1)
    calendar_end = date(year=2100, month=1, day=1)
    calendar_rdd = spark.sparkContext.parallelize(
        [dt.date() for dt in rrule(freq=DAILY, dtstart=calendar_start, until=calendar_end)]) \
        .map(lambda dt: Row(dt=dt))

    dim_calendar = spark.createDataFrame(calendar_rdd)
    dim_calendar = dim_calendar \
        .withColumn('date_id', monotonically_increasing_id()) \
        .withColumn('date', date_format(dim_calendar.dt, 'yyyy-MM-dd')) \
        .withColumn('year', year(dim_calendar.dt)) \
        .withColumn('month', month(dim_calendar.dt)) \
        .withColumn('day', dayofmonth(dim_calendar.dt)) \
        .withColumn('quarter', quarter(dim_calendar.dt))

    dim_calendar = dim_calendar \
        .withColumn('quarter', format_quarter(dim_calendar.quarter))

    src_claims.cache()
    dim_responses.cache()
    dim_calendar.cache()
    dim_countries.cache()
    dim_factories.cache()
    dim_indications.cache()
    dim_products.cache()

    src_claims.createOrReplaceTempView('claim')
    dim_responses.createOrReplaceTempView('response')
    dim_calendar.createOrReplaceTempView('calendar')
    dim_countries.createOrReplaceTempView('country')
    dim_factories.createOrReplaceTempView('factory')
    dim_indications.createOrReplaceTempView('indication')
    dim_products.createOrReplaceTempView('product')

    fact_registration_claim = spark.sql("""
      SELECT
      format_bk(claimnumber) as claimnumber,
      product.product_id as product_id,
      indication.indication_id as indication_id,
      factory_api.factory_id as factory_api_id,
      factory_bulk.factory_id as factory_bulk_id,
      factory_package.factory_id as factory_package_id,
      calendar_response.date_id as response_date_id,
      calendar_expectedresponse.date_id as expectedresponse_date_id,
      calendar_submission.date_id as submission_date_id,
      country.country_id as country_id,
      response.response_id as response_id
      FROM claim
      LEFT OUTER JOIN indication on indication.indication_bk = claim.indicationid
      LEFT OUTER JOIN product on product.product_bk = claim.productcodeid
      LEFT OUTER JOIN response on response.response_bk = claim.claimnumber
      LEFT OUTER JOIN calendar as calendar_response on calendar_response.date = response.response_date
      LEFT OUTER JOIN calendar as calendar_expectedresponse on calendar_expectedresponse.date = claim.expectedresponsedate
      LEFT OUTER JOIN calendar as calendar_submission on calendar_submission.date = claim.submissiondate
      LEFT OUTER JOIN country on country.country_bk = claim.countrycode
      LEFT OUTER JOIN factory as factory_api on factory_api.factory_bk = claim.factoryapi
      LEFT OUTER JOIN factory as factory_bulk on factory_bulk.factory_bk = claim.factorybulk
      LEFT OUTER JOIN factory as factory_package on factory_package.factory_bk = claim.factorypackage
      WHERE claim.claimtype = 'REGISTER'
      """)

    fact_reimbursement_claim = spark.sql("""
      SELECT
      format_bk(claimnumber) as claimnumber,
      product.product_id as product_id,
      reimbursementprct as reimbursement_claimed_prc,
      reimbursementprct as reimbursement_app_prc,
      indication.indication_id as indication_id,
      calendar_response.date_id as response_date_id,
      calendar_expectedresponse.date_id as expectedresponse_date_id,
      calendar_submission.date_id as submission_date_id,
      country.country_id as country_id,
      response.response_id as response_id,
      CAST(TRIM(SUBSTRING(claim.drugprice, 1, LENGTH(claim.drugprice)-3)) AS INT) AS drug_price,
      SUBSTRING(claim.drugprice, -3, 3) AS drug_price_currency
      FROM claim
      LEFT OUTER JOIN indication on indication.indication_bk = claim.indicationid
      LEFT OUTER JOIN product on product.product_bk = claim.productcodeid
      LEFT OUTER JOIN response on response.response_bk = claim.claimnumber
      LEFT OUTER JOIN calendar as calendar_response on calendar_response.date = response.response_date
      LEFT OUTER JOIN calendar as calendar_expectedresponse on calendar_expectedresponse.date = claim.expectedresponsedate
      LEFT OUTER JOIN calendar as calendar_submission on calendar_submission.date = claim.submissiondate
      LEFT OUTER JOIN country on country.country_bk = claim.countrycode
      WHERE claim.claimtype = 'REFUND'
      AND claim.drugprice IS NOT NULL AND claim.drugprice != 'null'
      """)

    fact_registration_claim.cache()
    fact_reimbursement_claim.cache()
    fact_registration_claim.createOrReplaceTempView('registration_claim')
    fact_reimbursement_claim.createOrReplaceTempView('reimbursement_claim')

    print('1) Number of registration claims per year')
    # question_1 = spark.sql("""
    #         SELECT
    #         count(claimnumber) as number_of_claims, year
    #         FROM reimbursement_claim
    #         INNER JOIN calendar on submission_date_id = calendar.date_id
    #         group by year
    #         order by year
    #     """)

    question_1 = fact_registration_claim.alias('claim') \
        .join(dim_calendar.alias('calendar'), col('claim.submission_date_id') == col('calendar.date_id')) \
        .groupBy(col('calendar.year')) \
        .count() \
        .orderBy(col('calendar.year'))
    question_1.show()

    # print('2.a) Share of reimbursement claims approved within the expected deadline')
    # question_2a = spark.sql("""
    #     select
    #       (SELECT count(claimnumber) as pozytywne
    #       FROM reimbursement_claim
    #       INNER JOIN response on reimbursement_claim.response_id = response.response_id
    #       INNER JOIN calendar expected_cal on expectedresponse_date_id = expected_cal.date_id
    #       INNER JOIN calendar response_cal on response_date_id = response_cal.date_id
    #       where response.response = 'Approved'
    #       and response_cal.date <= expected_cal.date)
    #     /
    #     CAST(count(claimnumber) as FLOAT) AS ODSETEK_REFUND
    #     FROM reimbursement_claim
    #     """)
    # question_2a.show()

    question_2a = fact_reimbursement_claim.alias('claim') \
                      .join(dim_responses.alias('response'), col('claim.response_id') == col('response.response_id')) \
                      .join(dim_calendar.alias('expected_cal'), col('claim.expectedresponse_date_id') == col('expected_cal.date_id')) \
                      .join(dim_calendar.alias('response_cal'), col('claim.response_date_id') == col('response_cal.date_id')) \
                      .where(
        (col('response.response') == 'Approved') &
        (col('response_cal.date') <= col('expected_cal.date'))
    ).count() / float(fact_reimbursement_claim.count())

    print('Share approved on time: {}'.format(question_2a))

    print('2.b) Share of registration claims approved within the expected deadline')
    # question_2b = spark.sql("""
    #     select
    #       (SELECT count(claimnumber) as pozytywne
    #       FROM registration_claim
    #       INNER JOIN response on registration_claim.response_id = response.response_id
    #       INNER JOIN calendar expected_cal on expectedresponse_date_id = expected_cal.date_id
    #       INNER JOIN calendar response_cal on response_date_id = response_cal.date_id
    #       where response.response = 'Approved'
    #       and response_cal.date <= expected_cal.date)
    #     /
    #     CAST(count(claimnumber) as FLOAT) AS ODSETEK_REFUND
    #     FROM registration_claim
    #     """)
    # question_2b.show()
    question_2b = fact_registration_claim.alias('claim') \
                      .join(dim_responses.alias('response'), col('claim.response_id') == col('response.response_id')) \
                      .join(dim_calendar.alias('expected_cal'), col('claim.expectedresponse_date_id') == col('expected_cal.date_id')) \
                      .join(dim_calendar.alias('response_cal'), col('claim.response_date_id') == col('response_cal.date_id')) \
                      .where(
        (col('response.response') == 'Approved') &
        (col('response_cal.date') <= col('expected_cal.date'))
    ).count() / float(fact_registration_claim.count())

    print('Share approved on time: {}'.format(question_2b))

    print('3) The most expensive drug in Scandinavia')
    # question_3 = spark.sql("""
    #         SELECT DISTINCT product.product_bk, product.product_name
    #           FROM reimbursement_claim
    #           INNER JOIN response on reimbursement_claim.response_id = response.response_id
    #           INNER JOIN country on reimbursement_claim.country_id = country.country_id
    #           INNER JOIN product on reimbursement_claim.product_id = product.product_id
    #           INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
    #         where
    #         reimbursement_claim.drug_price = (
    #         select
    #          MAX(reimbursement_claim.drug_price)
    #           FROM reimbursement_claim
    #           INNER JOIN response on reimbursement_claim.response_id = response.response_id
    #           INNER JOIN country on reimbursement_claim.country_id = country.country_id
    #           INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
    #           WHERE response.response = 'Approved'
    #           AND country.country_bk in ('DK', 'SE', 'NO', 'FI')
    #           AND calendar.year = 2011
    #           AND calendar.quarter = 'Q4'
    #           )
    #           AND response.response = 'Approved'
    #           AND country.country_bk in ('DK', 'SE', 'NO', 'FI')
    #           AND calendar.year = 2011
    #           AND calendar.quarter = 'Q4'
    #     """)

    max_price = fact_reimbursement_claim.alias('claim') \
               .join(dim_responses.alias('response'), col('claim.response_id') == col('response.response_id')) \
               .join(dim_countries.alias('country'), col('claim.country_id') == col('country.country_id')) \
               .join(dim_calendar.alias('calendar'), col('claim.response_date_id') == col('calendar.date_id')) \
               .where(
                    (col('response.response') == 'Approved') &
                    (col('country.country_bk').isin({'DK', 'SE', 'NO', 'FI'})) &
                    (col('calendar.year') == 2011) &
                    (col('calendar.quarter') == 'Q4')
                ).groupBy().max('drug_price').collect()[0]['max(drug_price)']

    question_3 = fact_reimbursement_claim.alias('claim') \
        .join(dim_responses.alias('response'), col('claim.response_id') == col('response.response_id')) \
        .join(dim_countries.alias('country'), col('claim.country_id') == col('country.country_id')) \
        .join(dim_products.alias('product'), col('claim.product_id') == col('product.product_id')) \
        .join(dim_calendar.alias('calendar'), col('claim.response_date_id') == col('calendar.date_id')) \
        .where((col('claim.drug_price') == max_price) &
               (col('response.response') == 'Approved') &
               (col('country.country_bk').isin({'DK', 'SE', 'NO', 'FI'})) &
               (col('calendar.year') == 2011) &
               (col('calendar.quarter') == 'Q4')
        ).select(col('product.product_bk'), col('product.product_name')).distinct()

    question_3.show()

    print('4.a) Which drug was granted the highest reimbursement amount on the Polish market in 2005-2010?')
    question_4a = spark.sql("""
    	select  product.product_bk, product_name, reimbursement_claimed_prc, drug_price
    	  FROM reimbursement_claim
    	  INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
    	  INNER JOIN country on reimbursement_claim.country_id = country.country_id
    	  INNER JOIN product on reimbursement_claim.product_id = product.product_id
    	  WHERE calendar.date >= '2005-01-01' AND calendar.date <= '2010-12-31'
    	  AND country.country_bk  = 'PL'
    	  AND reimbursement_claim.drug_price / reimbursement_claimed_prc = (
              SELECT MAX(reimbursement_claim.drug_price / reimbursement_claimed_prc)
              FROM reimbursement_claim
              INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
              INNER JOIN country on reimbursement_claim.country_id = country.country_id
              WHERE calendar.date >= '2005-01-01' AND calendar.date <= '2010-12-31'
              AND country.country_bk  = 'PL'
    	  )
        """)
    question_4a.show()

    print('4.b) Which drug was granted the highest reimbursement percentage on the Polish market in 2005-2010?')
    question_4b = spark.sql("""
        	select  product.product_bk, product_name, reimbursement_claimed_prc, drug_price
        	  FROM reimbursement_claim
        	  INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
        	  INNER JOIN country on reimbursement_claim.country_id = country.country_id
        	  INNER JOIN product on reimbursement_claim.product_id = product.product_id
        	  WHERE calendar.date >= '2005-01-01' AND calendar.date <= '2010-12-31'
        	  AND country.country_bk  = 'PL'
        	  AND reimbursement_claimed_prc = (
                  SELECT MAX(reimbursement_claimed_prc)
                  FROM reimbursement_claim
                  INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
                  INNER JOIN country on reimbursement_claim.country_id = country.country_id
                  WHERE calendar.date >= '2005-01-01' AND calendar.date <= '2010-12-31'
                  AND country.country_bk  = 'PL'
        	  )
            """)
    question_4b.show()
Example 17
######CLEAN
df = sqlContext.read.csv("/datasets/crimes.csv", header='true')
#Define date derivatives
df = (df.withColumn(
    'Timestamps', F.to_timestamp("Date", 'MM/dd/yyyy hh:mm:ss a')).withColumn(
        'Day', F.to_date("Date", 'MM/dd/yyyy hh:mm:ss a')).withColumn(
            "Month", F.month("Day")).withColumn(
                "Hour", F.hour("Timestamps")).withColumn(
                    "Minute", F.minute("Timestamps")).withColumn(
                        "DayOfMonth", F.dayofmonth("Day")).withColumn(
                            "DayOfYear", F.dayofyear("Day")).withColumn(
                                "DayOfWeek", F.dayofweek("Day")).withColumn(
                                    'WeekOfYear',
                                    F.weekofyear("Day")).withColumn(
                                        'Quarter', F.quarter("Timestamps")))

cols = [
    "Day", "Year", "Month", "Hour", "Minute", "DayOfMonth", "DayOfYear",
    "DayOfWeek", "WeekOfYear", "Quarter", "District", "Primary Type"
]

df = df.select(*cols)


# Rename Primary Types with less than a 1% share to OTHER CRIMES
def least_frequent_columns(df, threshold=0.01):
    res = df.groupBy("Primary Type").count()\
                            .withColumn('tot',F.lit(df.count()))\
                            .withColumn('frac',F.expr('count/tot'))\
                            .filter('frac<'+str(threshold))\
Example 18
def getPreprocessingDataframe(df, nlp, version='dev'):
    if version == 'dev':
        df = df.orderBy(rand())
        df2 = df.filter(df.Timestamp.isNotNull())
        df2 = df2.limit(500)
    else:
        df = df.orderBy(rand())
        df2 = df.filter(df.Timestamp.isNotNull())

    # convert timestamp to the right format
    timeStampPreCleaning = udf(
        lambda x: str(x) + " 2020" if len(x) < 8 else x.replace(",", ""),
        StringType())
    df2 = df2.withColumn("Timestamp", timeStampPreCleaning("Timestamp"))

    # String to DateType
    df3 = df2.withColumn("TimeStampDateType",
                         F.to_date(F.col("Timestamp"), "MMM dd yyyy"))

    # drop rows whose Timestamp is not in the standard format (to_date yields null for them)
    df3 = df3.filter(df3.Text.isNotNull())
    df3 = df3.filter(df3.TimeStampDateType.isNotNull())
    df3 = df3.withColumn("Year", F.year(df3.TimeStampDateType))
    df3 = df3.withColumn("Month", F.month(df3.TimeStampDateType))
    df3 = df3.withColumn("Quarter", F.quarter(df3.TimeStampDateType))
    # fill nulls with "0" and convert the values to proper numbers
    cols = ["Comments", "Likes", "Retweets"]

    df3 = df3.fillna("0", subset=cols)

    # apply the transform_number udf
    transformNumber = udf(lambda z: transform_number(z), T.IntegerType())
    df3 = df3.withColumn("Comments", transformNumber("Comments"))
    df3 = df3.withColumn("Likes", transformNumber("Likes"))
    df3 = df3.withColumn("Retweets", transformNumber("Retweets"))
    ### check
    logNormal = udf(lambda x: int(round(np.log2(x + 1))) + 1, T.IntegerType())
    df3 = df3.withColumn("Likes_log", logNormal("Likes"))
    df3 = df3.withColumn("Retweets_log", logNormal("Retweets"))
    # df3 = df3.filter(df3.Likes_log.isNotNull())
    df3 = df3.filter(df3.Retweets_log.isNotNull())

    extractKeywordFromQueries = udf(lambda x: extractkeyword(x))
    df3 = df3.filter(df3.Page_URL.isNotNull())
    df3 = df3.withColumn("Keyword", extractKeywordFromQueries("Page_URL"))
    df3 = df3.filter(df3.Keyword.isNotNull())

    keywordToCategory2 = udf(lambda x: getCategory2(x), StringType())
    df3 = df3.withColumn("Category2", keywordToCategory2("Keyword"))

    # NER Model
    # could be empty list,
    nerExtraction = udf(lambda z: ner_extraction(z, nlp),
                        T.ArrayType(StringType()))

    df3 = df3.withColumn("All_phrases", nerExtraction("Text"))
    df3 = df3.filter(df3.All_phrases.isNotNull())

    checkEmpty = udf(lambda x: checkempty(x), T.IntegerType())

    df3 = df3.withColumn('CheckEmpty', checkEmpty('All_phrases'))
    df3 = df3.filter(df3.CheckEmpty.isNotNull())

    df3 = df3.filter(df3.CheckEmpty != int(1))

    sentiment = VaderSentiment()
    vader_sentiment = udf(sentiment.score, T.FloatType())
    df3 = df3.withColumn("Sentiment", vader_sentiment('Text'))

    weighted_phrases_calculate = udf(lambda x, y: y * (int(x) + 1),
                                     T.ArrayType(StringType()))

    df3 = df3.withColumn(
        "Weighted_phrases",
        weighted_phrases_calculate("Retweets_log", "All_phrases"))

    # cols = ['Sentiment','All_phrases','Retweets_log','Weighted_phrases','Year','Month','Keyword']
    cols = ["Weighted_phrases", "Year", "Month", "Keyword", "Category2"]

    weighted_phrases_calculate = udf(lambda x, y: y * (int(x) + 1),
                                     T.FloatType())

    # get the weighted sentiments for each tweets.
    df3 = df3.withColumn("Weighted_Sentiment",
                         weighted_phrases_calculate("Likes_log", "Sentiment"))
    return df3
df2.show()

# Drop all rows that contain null in the value column (dropna is not in-place, so reassign)
sparkDF = sparkDF.dropna(subset=['value'])
sparkDF.show(20, False)
print('Total rowcount after removing Nulls: ', sparkDF.count())

# def mqy(value):              # First creating a python function
#     if 'Month' in value:
#         return list('M')
#     elif 'Year' in value:
#         return list('Y')
#     else:
#         return list('Q')
# # Converting python function to pyspark UDF
# udf_mqy = fn.udf(lambda v: mqy(v))

# Now creating new column called Absolute_Period in sparkDF
condition1 = fn.col('Desc').like('%Month%')
condition2 = fn.col('Desc').like('%Quarter%')

sparkDF = sparkDF.withColumn(
    'Absolute_Period',
    fn.concat(
        fn.year('Date'),
        fn.when(condition1, 'M').when(condition2, 'Q').otherwise('Y'),
        fn.when(condition1, fn.format_string('%02d', fn.month('Date'))).when(
            condition2, fn.format_string('%02d',
                                         fn.quarter('Date'))).otherwise('')))
sparkDF.show(20, False)
Example 20
    format='csv')

import pyspark.sql.functions as F


@F.udf
def fix_date(dt):
    if dt is None:
        return dt
    parts = dt.split('/')
    return "{}-{}-{}".format(parts[2], parts[0], parts[1])


curated = quotes.withColumn('dt', fix_date(F.col('quotedate')))
curated = curated.withColumn('year', F.year(F.col('dt')))
curated = curated.withColumn('quarter', F.quarter(F.col('dt')))

curated.show()
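
# Alternative sketch (assumption: quotedate is formatted as MM/dd/yyyy, as fix_date implies):
# the built-in to_date parser could replace the Python UDF and avoid its overhead, e.g.
#   curated = quotes.withColumn('dt', F.to_date(F.col('quotedate'), 'MM/dd/yyyy'))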

joined = curated.alias('alpha').join(
    curated.alias('beta'), ['year', 'quarter', 'dt']).select(
        F.col('alpha.year').alias('year'),
        F.col('alpha.quarter').alias('quarter'),
        F.col('alpha.dt').alias('dt'),
        F.col('alpha.symbol').alias('alpha_symbol'),
        F.col('alpha.dt').alias('alpha_dt'),
        F.col('alpha.open').alias('alpha_open'),
        F.col('alpha.high').alias('alpha_high'),
        F.col('alpha.low').alias('alpha_low'),
        F.col('alpha.close').alias('alpha_close'),
        F.col('alpha.volume').alias('alpha_volume'),
Example 21
                                   kyl_tdt_sku.sku_num_sku_r3 == kyl_edv.sku_num_sku_r3,
                                   kyl_tdt_sku.tdt_num_line == kyl_edv.tdt_num_line]) \
    .select(kyl_tdt_sku.the_transaction_id, kyl_tdt_sku.tdt_num_line, kyl_tdt_sku.tdt_date_transaction,
            kyl_tdt_sku.but_num_business_unit, kyl_tdt_sku.but_name_business_unit, kyl_tdt_sku.sku_idr_sku,
            kyl_tdt_sku.f_qty_item, kyl_tdt_sku.sku_num_sku, kyl_tdt_sku.sku_num_sku_r3, kyl_tdt_sku.mdl_num_model_r3,
            kyl_org.purch_org, kyl_org.sales_org, kyl_org.sales_org_text, kyl_org.purch_org_text,
            kyl_org.distrib_channel, kyl_mdg.ege_basique, kyl_mdg.ege_security_product, kyl_mpd.iam_item_and_model_code,
            kyl_mpd.pty_process_type_pty, kyl_mpd.pty_process_type_label, kyl_edv.eta_num_etape_eta).drop_duplicates()

cond = [qf.material_id == ft.sku_num_sku_r3, ft.purch_org == qf.purch_org]
ft = ft.select(ft.the_transaction_id, ft.tdt_num_line, ft.tdt_date_transaction, ft.but_num_business_unit,
               ft.but_name_business_unit, ft.sku_num_sku_r3, ft.mdl_num_model_r3, ft.purch_org, ft.sales_org,
               ft.sales_org_text, ft.purch_org_text, ft.distrib_channel, ft.ege_basique, ft.ege_security_product,
               ft.eta_num_etape_eta, ft.f_qty_item) \
    .withColumn("week_tdt_date_transaction", F.weekofyear(ft["tdt_date_transaction"])) \
    .withColumn("quarter_tdt_date_transaction", F.quarter(ft["tdt_date_transaction"]))
ft = ft.join(qf, how='left', on=cond) \
    .select(ft.the_transaction_id, ft.week_tdt_date_transaction, ft.quarter_tdt_date_transaction,
            ft.tdt_date_transaction, ft.but_num_business_unit, ft.but_name_business_unit, ft.sku_num_sku_r3,
            ft.mdl_num_model_r3, ft.purch_org, ft.sales_org, ft.sales_org_text, ft.purch_org_text, ft.distrib_channel,
            ft.ege_basique, ft.ege_security_product, ft.eta_num_etape_eta, ft.f_qty_item, qf.vendor_id,
            qf.subcontractor_id)
ft = ft.withColumn("year_transaction", F.year(ft.tdt_date_transaction).cast(T.StringType())) \
    .withColumn("month_transaction", F.month(ft.tdt_date_transaction).cast(T.StringType())) \
    .withColumn("tdt_date_transaction", F.dayofmonth(ft.tdt_date_transaction))
ft.repartition(80).write.option("compression", "snappy").mode("overwrite")\
    .insertInto("kylin_usb_mqb.kyl_mqb_vte")

# Stopping Session
spark.sparkContext.stop()
Example 22
                            , (123,1, "2021-02-10")
                            , (123,0, "2021-02-28")
                            , (777,1, "2021-01-01")
                            , (777,1, "2021-01-01")
			], ["id", "flag", "date"])


# change string to date type
df = df.withColumn("date", F.to_date(F.col("date"), "yyyy-MM-dd"))


# time related features
from pyspark.sql.functions import col, year, month, quarter, weekofyear, dayofweek, date_format, to_date
df = df.withColumn("year", year(df.date))
df = df.withColumn("month", month(df.date))
df = df.withColumn("quarter", quarter(df.date))
df = df.withColumn("day_of_month", date_format(col("date"), "d"))
df = df.withColumn("day_of_year", date_format(col("date"), "D"))
df = df.withColumn("week_of_year", weekofyear(df.date)) # deprecated in spark3
df = df.withColumn("day_of_week", dayofweek(df.date))
		   
# window functions: rolling + forward
# sketch (assumed intent): a rolling window over the previous 3 rows per id, ordered by date
from pyspark.sql.window import Window
w = Window.partitionBy("id").orderBy("date").rowsBetween(-3, 0)
		   
# Read from file
# Read from DB
# Query from loaded file

# drop 
df_query=df_query.drop("cust_id", "inst_id", "account_num","seq_num", "currency")