def getCalendar(df_immigration):
    """
    Generate the Calendar dimension table from the immigration data.

    Args:
        df_immigration: dataframe containing immigration information

    Returns:
        df_calendar: dataframe containing calendar information
    """
    # create the date dimension by combining arrival and departure dates
    df_temp = df_immigration.select(
        col("arrival_date").alias("date_deparr")).distinct()
    df_temp2 = df_immigration.select(
        col("departure_date").alias("date_deparr")).distinct()
    df_calendar = df_temp.union(df_temp2)
    df_calendar = df_calendar.distinct()

    # derive year, month, day, and quarter from the date
    df_calendar = df_calendar.withColumn("year_deparr", year(col("date_deparr"))) \
        .withColumn("month_deparr", month(col("date_deparr"))) \
        .withColumn("day_deparr", dayofmonth(col("date_deparr"))) \
        .withColumn("quarter_deparr", quarter(col("date_deparr")))

    # remove rows with a missing date
    df_calendar = df_calendar.na.drop(subset=["date_deparr"])

    return df_calendar
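# Usage sketch (not from the original source): assumes a SparkSession named
# `spark` and that the functions used above are imported from
# pyspark.sql.functions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, quarter
import datetime

spark = SparkSession.builder.appName("calendar-demo").getOrCreate()
df_immigration_demo = spark.createDataFrame(
    [(datetime.date(2016, 4, 1), datetime.date(2016, 4, 15)),
     (datetime.date(2016, 7, 2), None)],
    ["arrival_date", "departure_date"])
getCalendar(df_immigration_demo).show()  # one row per distinct non-null date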
def createDimDate(df):
    '''
    Creates the date dimension table from the DateTime field in the upstream dataframe.

    :param df: upstream dataframe with `rawKey` and `DateTime` columns
    :return: date dimension dataframe and a fact-helper dataframe (rawKey -> dateKey)
    '''
    df = df.withColumn("rawKey", F.col('rawKey'))  # pass-through; keeps rawKey in the projection
    df = df.withColumn("year", F.year(F.col('DateTime')))
    df = df.withColumn("month", F.month(F.col('DateTime')))
    df = df.withColumn("dayofmonth", F.dayofmonth(F.col('DateTime')))
    df = df.withColumn("dayofweek", F.dayofweek(F.col('DateTime')))
    df = df.withColumn("dayofyear", F.dayofyear(F.col('DateTime')))
    df = df.withColumn("hour", F.hour(F.col('DateTime')))
    df = df.withColumn("minute", F.minute(F.col('DateTime')))
    df = df.withColumn("dateMinute", F.date_format(F.col("DateTime"), "yyyyMMddHHmm"))
    df = df.withColumn("quarter", F.quarter(F.col('DateTime')))
    df = df.withColumn("date", F.to_date(F.col('DateTime')))

    df.createOrReplaceTempView('tempDimDateTable')
    # GROUP BY deduplicates rows that share the same minute-level timestamp
    dimDateDF = spark.sql("""
        SELECT rawKey, dateMinute, dateTime, date, year, month, dayofmonth,
               dayofweek, dayofyear, hour, minute, quarter
        FROM tempDimDateTable
        GROUP BY rawKey, dateMinute, dateTime, date, year, month, dayofmonth,
                 dayofweek, dayofyear, hour, minute, quarter
        ORDER BY dateMinute ASC
        """)

    # Generating dateKey field
    dimDateDF = dimDateDF.withColumn('dateKey', F.monotonically_increasing_id() + 1)

    # Dataframe with the date fields needed to generate the Fact table
    factHelperDateDF = dimDateDF.select(F.col('rawKey'), F.col('dateKey'), F.col('dateMinute'))

    # Dropping the rawKey field, no longer needed in the dimension
    dimDateDF = dimDateDF.drop(F.col('rawKey'))
    return dimDateDF, factHelperDateDF
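# Usage sketch (not from the original source): assumes a SparkSession named
# `spark` (also referenced inside createDimDate) and
# `import pyspark.sql.functions as F`.
import datetime

events = spark.createDataFrame(
    [(1, datetime.datetime(2021, 3, 5, 14, 30)),
     (2, datetime.datetime(2021, 6, 1, 9, 15))],
    ["rawKey", "DateTime"])

dimDateDF, factHelperDateDF = createDimDate(events)
dimDateDF.show()         # one row per distinct minute, keyed by dateKey
factHelperDateDF.show()  # rawKey -> dateKey mapping for building the fact table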
def _extract_dim_date(self, imgDF: DataFrame, output_path: str):
    print('Extracting dim_date from immigration data...')
    dateCols = [col for col in imgDF.columns if col.endswith('_date')]
    dates: DataFrame = None
    for dateCol in dateCols:
        dt = imgDF \
            .where(F.col(dateCol).isNotNull()) \
            .select(
                F.col(dateCol).alias('date'),
                F.year(F.col(dateCol)).cast('smallint').alias('year'),
                F.quarter(F.col(dateCol)).cast('smallint').alias('quarter'),
                F.month(F.col(dateCol)).cast('smallint').alias('month'),
                F.dayofweek(F.col(dateCol)).cast('smallint').alias('day_of_week'),
                F.dayofmonth(F.col(dateCol)).cast('smallint').alias('day_of_month'),
                F.dayofyear(F.col(dateCol)).cast('smallint').alias('day_of_year'),
                F.weekofyear(F.col(dateCol)).cast('smallint').alias('week_of_year')
            )
        if dates is not None:
            dates = dates.union(dt).distinct()
        else:
            dates = dt
    print('Saving dim_date')
    self._write_data(dates, output_path, 'date')
    print('Finished saving dim_date')
    print('Finished extracting dim_date from immigration data.')
def transform(dataframe):
    # Add date partitions and cast user_ids
    cleaned = (
        dataframe.withColumn("year", F.year("timestamp"))
        .withColumn("quarter", F.quarter("timestamp"))
        .withColumn("user_id", F.col("user_id").cast("int"))
        .orderBy("timestamp")
    )
    return cleaned
def glean_3(invoice_df):
    invoices = invoice_df.withColumn('invoice_year', funcs.year('invoice_date'))
    invoices = invoices.withColumn('invoice_month', funcs.month('invoice_date'))
    invoices = invoices.withColumn('invoice_quarter', funcs.quarter('invoice_date'))
    invoices = invoices.groupBy([
        'canonical_vendor_id', 'invoice_year', 'invoice_month', 'invoice_quarter'
    ]).agg({
        'total_amount': 'sum',
        'invoice_date': 'max'
    }).sort('canonical_vendor_id', 'invoice_year', 'invoice_month')

    # trailing 12-month window, expressed in seconds over the epoch-cast timestamp
    days = lambda i: i * 86400
    w = (Window.partitionBy('canonical_vendor_id')
         .orderBy(funcs.col('max(invoice_date)').cast('long'))
         .rangeBetween(-days(365), 0))
    invoices = invoices.withColumn('rolling_average_12m',
                                   funcs.avg("sum(total_amount)").over(w))

    # flag months whose spend jumps well above the rolling average,
    # with multipliers that scale down as spend size grows
    glean3 = invoices[
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] * 6) &
         (invoices['sum(total_amount)'] < 1000) &
         (invoices['sum(total_amount)'] >= 100)) |
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] * 3) &
         (invoices['sum(total_amount)'] < 10000) &
         (invoices['sum(total_amount)'] >= 1000)) |
        ((invoices['sum(total_amount)'] >= invoices['rolling_average_12m'] * 1.5) &
         (invoices['sum(total_amount)'] >= 10000))]

    glean3 = glean3.withColumn(
        'dollar_dif',
        funcs.col('sum(total_amount)') - funcs.col('rolling_average_12m'))
    glean3 = glean3.withColumn(
        'percent_dif',
        funcs.round(100 * funcs.col('dollar_dif') / funcs.col('rolling_average_12m')))
    glean3 = glean3.withColumn('glean_location', funcs.lit('vendor'))
    glean3 = glean3.withColumn("glean_type", funcs.lit('large_month_increase_mtd'))
    glean3 = glean3.withColumn(
        "glean_text",
        funcs.concat(funcs.lit('Monthly spend with '),
                     funcs.col('canonical_vendor_id'),
                     funcs.lit(' is $'), funcs.col('dollar_dif'),
                     funcs.lit(' ('), funcs.col('percent_dif'),
                     funcs.lit('%) higher than average')))
    glean3 = glean3.withColumn("invoice_id", funcs.lit('n/a'))
    glean3 = glean3.withColumn('glean_date', funcs.col('max(invoice_date)'))
    return glean3
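# Note on the window in glean_3: rangeBetween frames are defined on the numeric
# value of the ordering expression, so max(invoice_date) is cast to a long
# (epoch seconds) and the trailing 12 months become -days(365)..0 seconds.
# This assumes the column is a timestamp; a DateType column does not cast to
# long this way, and day-based bounds (e.g. ordering by a datediff from a
# fixed epoch date) would be needed instead.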
def extract_date(df):
    # df = df.na.fill('NORMAL', subset=['types'])
    df = df.dropna(how='all')
    df = (df
          .withColumn('Yearday', F.dayofyear(F.col("SalDate")))
          .withColumn('Month', F.month(F.col('SalDate')))
          .withColumn('DayofWeek', F.dayofweek(F.col('SalDate')))
          .withColumn('Year', F.year(F.col('SalDate')))
          .withColumn('Quarter', F.quarter(F.col('SalDate')))
          .withColumn('WeekOfYear', F.weekofyear(F.col('SalDate')))
          # bucket the day of month into four roughly week-long buckets (0-3)
          .withColumn('MonthQuarter',
                      F.when(df['DayofMonth'] <= 8, 0)
                       .when(df['DayofMonth'] <= 16, 1)
                       .when(df['DayofMonth'] <= 24, 2)
                       .otherwise(3)))
    df = df.cache()
    return df
def transform(dataframe, limit=None):
    # prepare the revision column
    revision_columns = [
        "article_id",
        "rev_id",
        "article_title",
        "timestamp",
        "username",
        "user_id",
    ]
    revision_query = [
        F.col("_revision").getItem(i).alias(name)
        for i, name in enumerate(revision_columns)
    ]
    dataframe = (
        dataframe.withColumn("_revision", F.split("revision", " "))
        .select(revision_query + dataframe.columns[1:])
        .drop("_revision")
    )

    # add in the proper typing
    typemap = dict(
        [
            ("article_id", "int"),
            ("rev_id", "int"),
            ("timestamp", "timestamp"),
            ("minor", "boolean"),
            ("textdata", "int"),
        ]
    )

    def cast(name):
        if name in typemap:
            return F.col(name).cast(typemap[name])
        return F.col(name)

    # build the cast list eagerly (DataFrame.select accepts a list of columns,
    # not a lazy map object)
    cast_query = [cast(name) for name in dataframe.columns]
    dataframe = dataframe.select(cast_query)

    # include limits
    if limit:
        dataframe = dataframe.limit(limit)
    return dataframe.withColumn("year", F.year("timestamp")).withColumn(
        "quarter", F.quarter("timestamp")
    )
def _transform(self, df):
    self.check_input_type(df.schema)
    return df.withColumn(self.outputCol, F.quarter(df[self.inputCol]))
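# Context sketch (not from the original source): one way the _transform above
# could sit inside a custom pyspark.ml Transformer. The class name, the
# constructor, and check_input_type are assumptions; the original may differ.
from pyspark.ml import Transformer
from pyspark.sql import functions as F
from pyspark.sql.types import DateType, TimestampType


class QuarterExtractor(Transformer):
    def __init__(self, inputCol, outputCol):
        super().__init__()
        self.inputCol = inputCol    # plain string attributes, as used in _transform
        self.outputCol = outputCol

    def check_input_type(self, schema):
        # quarter() needs a date or timestamp input
        dtype = schema[self.inputCol].dataType
        if not isinstance(dtype, (DateType, TimestampType)):
            raise TypeError("inputCol must be a date or timestamp column")

    def _transform(self, df):
        self.check_input_type(df.schema)
        return df.withColumn(self.outputCol, F.quarter(df[self.inputCol]))

# usage: QuarterExtractor("order_date", "order_quarter").transform(orders_df)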
df2 = df.filter(df.Timestamp.isNotNull())
df2 = df2.limit(500)

# convert timestamp to the right format
timeStampPreCleaning = udf(
    lambda x: str(x) + " 2020" if len(x) < 8 else x.replace(",", ""), StringType()
)
df2 = df2.withColumn("Timestamp", timeStampPreCleaning("Timestamp"))

# String to DateType
df3 = df2.withColumn("TimeStampDateType", F.to_date(F.col("Timestamp"), "MMM dd yyyy"))

# drop rows whose Timestamp column is not in the standard format
df3 = df3.filter(df3.TimeStampDateType.isNotNull())
df3 = df3.withColumn("Year", F.year(df3.TimeStampDateType))
df3 = df3.withColumn("Month", F.month(df3.TimeStampDateType))
df3 = df3.withColumn("Quarter", F.quarter(df3.TimeStampDateType))

# fill null with "0" and convert units to the right numbers
cols = ["Comments", "Likes", "Retweets"]
df3 = df3.fillna("0", subset=cols)

# apply the transform_number udf
transformNumber = udf(lambda z: transform_number(z), T.IntegerType())
df3 = df3.withColumn("Comments", transformNumber("Comments"))
df3 = df3.withColumn("Likes", transformNumber("Likes"))
df3 = df3.withColumn("Retweets", transformNumber("Retweets"))

# log-scale the engagement counts (requires numpy as np)
logNormal = udf(lambda x: int(round(np.log2(x + 1))) + 1, T.IntegerType())
df3 = df3.withColumn("Likes_log", logNormal("Likes"))
df3 = df3.withColumn("Retweets_log", logNormal("Retweets"))
# df3 = df3.filter(df3.Likes_log.isNotNull())
            then b.UnitPrice*b.Quantity*(1-b.Discount) else 0 end), 0) "Qtr 4"
    from Products a
    inner join Order_Details b on a.ProductID = b.ProductID
    inner join Orders c on c.OrderID = b.OrderID
    inner join Customers d on d.CustomerID = c.CustomerID
    where c.OrderDate between date('1997-01-01') and date('1997-12-31')
    group by a.ProductName, d.CompanyName, year(OrderDate)
    order by a.ProductName, d.CompanyName;
"""

ordersDF = getTable("david_test.orders")
customersDF = getTable("david_test.Customers")

quarterlyOrdersByProductsDF = productsDF.alias("a") \
    .join(orderDetailsDF.alias("b"), productsDF.product_id == orderDetailsDF.product_id, how="inner") \
    .join(ordersDF.alias("c"), orderDetailsDF.order_id == ordersDF.order_id, how="inner") \
    .join(customersDF.alias("d"), ordersDF.customer_id == customersDF.customer_id, how="inner") \
    .filter(col("c.order_date").between("1997-01-01", "1997-12-31")) \
    .select("a.product_name", "d.company_name",
            year("c.order_date").alias("year_order_date"),
            "c.order_date", "b.unit_price", "b.quantity", "b.discount") \
    .groupBy("a.product_name", "d.company_name", "year_order_date") \
    .agg(sum(when(quarter("c.order_date") == 1, col("b.unit_price") * col("b.quantity") * (1 - col("b.discount"))).otherwise(0)).cast(IntegerType()).alias("qtr 1"),
         sum(when(quarter("c.order_date") == 2, col("b.unit_price") * col("b.quantity") * (1 - col("b.discount"))).otherwise(0)).cast(IntegerType()).alias("qtr 2"),
         sum(when(quarter("c.order_date") == 3, col("b.unit_price") * col("b.quantity") * (1 - col("b.discount"))).otherwise(0)).cast(IntegerType()).alias("qtr 3"),
         sum(when(quarter("c.order_date") == 4, col("b.unit_price") * col("b.quantity") * (1 - col("b.discount"))).otherwise(0)).cast(IntegerType()).alias("qtr 4")) \
    .select("a.product_name", "d.company_name", "year_order_date",
            "qtr 1", "qtr 2", "qtr 3", "qtr 4") \
    .orderBy("a.product_name", "d.company_name")

# quarterlyOrdersByProductsDF.show()
quarterlyOrdersByProductsDF.explain()
def insert_time_dim(start_date_id, end_date_id):
    time_begin = datetime.strptime(str(start_date_id), "%Y%m%d").date()
    time_end = datetime.strptime(str(end_date_id), "%Y%m%d").date()
    print('time_begin')
    print(time_begin)
    print('time_end')
    print(time_end)

    # create a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # convert data types and field names
    df = df.select(
        df.minDate.cast(DateType()).alias("minDate"),
        df.maxDate.cast(DateType()).alias("maxDate"))

    # expand to every day between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # derive the day/month/year attributes from the date
    # (Vietnamese column names: ngay_trong_thang = day of month,
    #  ngay_trong_tuan = day of week, tuan_trong_nam = week of year,
    #  thang = month, quy = quarter, nam = year)
    df = df.withColumn('id', f.date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', f.from_unixtime(f.unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)  # week of month

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # cast week-of-month to int
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang', 'cast:int')])

    # choose the fields and data types to load into the db
    applymapping1 = ApplyMapping.apply(
        frame=data_time,
        mappings=[("id", "string", "id", "bigint"),
                  ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                  ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                  ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                  ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                  ("thang", "int", "thang", "int"),
                  ("quy", "int", "quy", "int"),
                  ("nam", "int", "nam", "int"),
                  ("date", "date", "ngay", "timestamp")])
    resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # write the data to the db, deleting any overlapping date range first
    preactions = 'delete from student.time_dim where id >= ' + str(start_date_id)
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields3,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": preactions,
            "dbtable": "student.time_dim",
            "database": "student_native_report"
        },
        redshift_tmp_dir="s3n://dts-odin/temp/tu-student_native_report/student/time_dim",
        transformation_ctx="datasink4")
def main():
    # args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME'])
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)

    # set time_begin to the current day + 1 day
    # time_begin = datetime.date.today() + datetime.timedelta(days=1)  # 20190908
    time_begin = datetime.date(2019, 9, 9)

    # set time_end to the current day + 1 month - 1 day
    # time_end = time_begin + relativedelta(months=1) - datetime.timedelta(days=1)
    time_end = datetime.date(2019, 10, 1)

    # create a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # convert data types and field names
    df = df.select(
        df.minDate.cast(DateType()).alias("minDate"),
        df.maxDate.cast(DateType()).alias("maxDate"))

    # expand to every day between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # derive the day/month/year attributes from the date
    df = df.withColumn('id', f.date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', f.from_unixtime(f.unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)  # week of month

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # cast week-of-month to int
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang', 'cast:int')])

    # choose the fields and data types to load into the db
    applymapping1 = ApplyMapping.apply(
        frame=data_time,
        mappings=[("id", "string", "id", "bigint"),
                  ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                  ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                  ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                  ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                  ("thang", "int", "thang", "int"),
                  ("quy", "int", "quy", "int"),
                  ("nam", "int", "nam", "int"),
                  ("date", "date", "ngay", "timestamp")])
    resolvechoice2 = ResolveChoice.apply(frame=applymapping1, choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # write the data to the db
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields3,
        catalog_connection="glue_redshift",
        connection_options={
            "dbtable": "dim_time",
            "database": "dts_odin"
        },
        redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/fdfdf",
        transformation_ctx="datasink4")
Fld("title", String()), Fld("video", Boolean()), Fld("vote_average", Float()), Fld("vote_count", Int()) ]) movies_df = spark.read.option("header", "true") \ .csv("s3a://{}/{}/movies_metadata.csv".format(s3_bucket, s3_key), schema=movies_schema) movies_df = movies_df.na.drop() # extract columns to create time table date_table = movies_df.select( col('release_date'), dayofmonth("release_date").alias('day'), weekofyear("release_date").alias('week'), month("release_date").alias('month'), quarter("release_date").alias('quarter'), year("release_date").alias('year')).dropDuplicates() date_table.write \ .format("jdbc") \ .option("url", redshift_conn_string) \ .option("dbtable", "movies.stage_date") \ .option("user", sys.argv[6]) \ .option("password", sys.argv[7]) \ .option("driver", "com.amazon.redshift.jdbc42.Driver") \ .mode("append") \ .save()
def generate_dim_date(spark, start_year=1901, number_years_out_from_start=300):
    """Create `dim_date` table containing various date feature columns.

    Args:
        spark (SparkSession): Instantiated SparkSession
        start_year (int): starting year for the dim_date table.
        number_years_out_from_start (int): number of years out from `start_year` to generate.

    Returns:
        Spark DataFrame.
    """
    years = [start_year + i for i in range(number_years_out_from_start + 1)]
    months = [i for i in range(1, 13)]
    days = [i for i in range(1, 32)]

    years_df = spark.createDataFrame(
        pd.DataFrame({'year': years, 'temp_join_key': '1'}))
    months_df = spark.createDataFrame(
        pd.DataFrame({'month': months, 'temp_join_key': '1'}))
    days_df = spark.createDataFrame(
        pd.DataFrame({'day_of_month': days, 'temp_join_key': '1'}))

    # cross-join years x months x days via the constant join key
    years_months_df = years_df.join(months_df, ['temp_join_key'], how='inner')
    years_month_days_df = years_months_df.join(days_df, ['temp_join_key'], how='inner')

    date_keys = (
        years_month_days_df.withColumn(
            'date',
            to_date(
                concat(col('year'),
                       lpad(col('month'), 2, '0'),
                       lpad(col('day_of_month'), 2, '0')),
                'yyyyMMdd'))
        # remove invalid dates
        .filter("date IS NOT NULL")
        .withColumn(
            'date_key',
            regexp_replace(col('date').cast('string'), '-', '').cast('integer')))

    date_features = (
        date_keys
        # get `week` and `quarter`
        .withColumn('week', weekofyear(col('date')))
        .withColumn('quarter', quarter(col('date')))
        # get `day_name` and `month_name`
        .withColumn('day_name', date_format(col('date'), 'EEEE'))
        .withColumn('month_name', date_format(col('date'), 'MMMM'))
        # get `date_year`, `date_quarter`, `date_month`, `date_week`
        .withColumn('date_week', expr("MIN(date) OVER(PARTITION BY week, year)"))
        .withColumn('date_month', date_format(col('date'), 'yyyy-MM-01'))
        .withColumn('date_quarter', expr("MIN(date) OVER(PARTITION BY quarter, year)"))
        .withColumn('date_year', date_format(col('date'), 'yyyy-01-01'))
        # get `day_of_week`, `day_of_quarter`, `day_of_year`
        .withColumn('day_of_week', dayofweek(col('date')))
        .withColumn('day_of_quarter', datediff(col('date'), col('date_quarter')) + lit(1))
        .withColumn('day_of_year', dayofyear(col('date')))
        # get `weekend_flag`, `us_holiday_flag`, `us_biz_day_flag`, `leap_year_flag`,
        # `month_start_flag`, `month_end_flag`
        .withColumn('weekend_flag', when(col('day_of_week').isin([7, 1]), 'Y').otherwise('N'))
        .withColumn('us_holiday_flag', pd_is_holiday_usa(col('date').cast('timestamp')))
        # a business day is one that is neither a weekend nor a US holiday
        .withColumn('us_biz_day_flag',
                    when((col('weekend_flag') == lit('Y')) |
                         (col('us_holiday_flag') == lit('Y')), 'N').otherwise('Y'))
        .withColumn('leap_year_flag',
                    when(dayofmonth(last_day(concat(col('year'), lit('-02-01')).cast('date'))) == 29, 'Y')
                    .otherwise('N'))
        .withColumn('month_start_flag', when(col('day_of_month') == lit(1), 'Y').otherwise('N'))
        .withColumn('month_end_flag', when(col('date') == last_day(col('date')), 'Y').otherwise('N'))
        # get `pct_into_month`, `pct_into_quarter`, `pct_into_year`
        .withColumn('pct_into_month',
                    (col('day_of_month') / dayofmonth(last_day(col('date')))).cast('decimal(7, 6)'))
        .withColumn('date_quarter_end',
                    when(col('quarter') == lit(1), concat(col('year'), lit('-03-31')))
                    .when(col('quarter') == lit(2), concat(col('year'), lit('-06-30')))
                    .when(col('quarter') == lit(3), concat(col('year'), lit('-09-30')))
                    .when(col('quarter') == lit(4), concat(col('year'), lit('-12-31')))
                    .otherwise(None)
                    .cast('date'))
        .withColumn('days_in_quarter', datediff(col('date_quarter_end'), col('date_quarter')) + lit(1))
        .withColumn('pct_into_quarter',
                    (col('day_of_quarter') / col('days_in_quarter')).cast('decimal(7, 6)'))
        .withColumn('pct_into_year',
                    (col('day_of_year') /
                     when(col('leap_year_flag') == lit('Y'), 366.0).otherwise(365.0))
                    .cast('decimal(7, 6)'))
        # get seasons
        .withColumn('season_northern',
                    when(col('month').isin(12, 1, 2), 'Winter')
                    .when(col('month').isin(3, 4, 5), 'Spring')
                    .when(col('month').isin(6, 7, 8), 'Summer')
                    .when(col('month').isin(9, 10, 11), 'Fall')
                    .otherwise('UNKNOWN'))
        .withColumn('season_southern',
                    when(col('month').isin(6, 7, 8), 'Winter')
                    .when(col('month').isin(9, 10, 11), 'Spring')
                    .when(col('month').isin(12, 1, 2), 'Summer')
                    .when(col('month').isin(3, 4, 5), 'Fall')
                    .otherwise('UNKNOWN')))

    dim_date = date_features.sort('date').select([
        'date_key', 'date', 'date_week', 'date_month', 'date_quarter', 'date_year',
        'day_of_week', 'day_of_month', 'day_of_quarter', 'day_of_year',
        'week', 'month', 'quarter', 'year', 'days_in_quarter',
        'day_name', 'month_name', 'season_northern', 'season_southern',
        'weekend_flag', 'us_holiday_flag', 'us_biz_day_flag',
        'month_start_flag', 'month_end_flag', 'leap_year_flag',
        'pct_into_month', 'pct_into_quarter', 'pct_into_year'
    ])
    return dim_date
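# Usage sketch (not from the original source): assumes a SparkSession named
# `spark`, `import pandas as pd`, the pyspark.sql.functions used above, and the
# pd_is_holiday_usa pandas UDF being defined.
dim_date = generate_dim_date(spark, start_year=2020, number_years_out_from_start=1)
dim_date.filter(col('date_key') == 20200229).show()  # leap day -> leap_year_flag == 'Y'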
df = spark.sql("""SELECT * FROM stag_immigration""")

# get date range
df = df.select("arrival_date").distinct().orderBy("arrival_date")

# generate columns
df = df.withColumn("year", F.date_format("arrival_date", "y")) \
    .withColumn("month", F.date_format("arrival_date", "M")) \
    .withColumn("day", F.date_format("arrival_date", "d")) \
    .withColumn("month_string", F.date_format("arrival_date", "MMM")) \
    .withColumn("day_string", F.date_format("arrival_date", "E")) \
    .withColumn("week", F.date_format("arrival_date", "w")) \
    .withColumn("day_of_year", F.dayofyear("arrival_date")) \
    .withColumn("day_of_week", F.dayofweek("arrival_date")) \
    .withColumn("quarter", F.quarter("arrival_date"))

# create unique identifier
df = df.withColumn("id", F.monotonically_increasing_id() + 1)

# select relevant columns
df = df.select("id", "arrival_date", "year", "month", "day", "month_string",
               "day_string", "week", "day_of_year", "day_of_week", "quarter")

# show final table
df.show()
df.printSchema()

# write to parquet on s3 (the format and .save() call are an assumed completion)
df.write.mode("overwrite").option("path", OUTPUT_PATH) \
    .format("parquet").save()
def load_datasources(spark):
    # Load JDBC data sources (tables: claims, drugproduct, drugproductfamily, indication):
    jdbc_url = 'jdbc:postgresql://localhost:5432/bigdata_source'
    jdbc_table_claim = 'public.claim'
    jdbc_table_drugproduct = 'public.drugproduct'
    jdbc_table_drugproductfamily = 'public.drugproductfamily'
    jdbc_table_indication = 'public.indication'
    jdbc_credentials = {'user': '******', 'password': '******'}
    data_filepath_base = '/home/janusz/bigdata/projekt/data/'

    src_claims = spark.read \
        .jdbc(jdbc_url, jdbc_table_claim, properties=jdbc_credentials)
    src_drugproduct = spark.read \
        .jdbc(jdbc_url, jdbc_table_drugproduct, properties=jdbc_credentials)
    src_drugproductfamily = spark.read \
        .jdbc(jdbc_url, jdbc_table_drugproductfamily, properties=jdbc_credentials)
    src_indication = spark.read \
        .jdbc(jdbc_url, jdbc_table_indication, properties=jdbc_credentials)

    # Load file data sources:
    src_countries = spark.read.load(data_filepath_base + 'countries_data.csv',
                                    format='csv', header='true')
    src_factories1 = spark.read.load(data_filepath_base + 'factories.tsv',
                                     format='csv', delimiter='\t')
    src_factories2 = spark.read.load(data_filepath_base + 'factories_2.tsv',
                                     format='csv', delimiter='\t')
    src_factories = src_factories1.unionAll(src_factories2).distinct()
    src_factories = src_factories.toDF('code', 'name', 'country')
    src_responses1 = spark.read.load(data_filepath_base + 'responses_data.csv',
                                     format='csv', header='true')
    src_responses2 = spark.read.load(data_filepath_base + 'responses_data_2.csv',
                                     format='csv', header='true')
    src_responses = src_responses1.unionAll(src_responses2).distinct() \
        .toDF('claim_number', 'response_date', 'response')

    # Build Facts and Dimensions data frames
    def format_bk(bk):
        return bk.strip().upper()

    udf_format_bk = udf(format_bk)
    spark.udf.register('format_bk', format_bk)

    dim_countries = src_countries \
        .toDF('country_name', 'alpha_2_code', 'alpha_3_code', 'numeric_code', 'link') \
        .select('alpha_2_code', 'country_name') \
        .toDF('country_bk', 'country_name') \
        .withColumn('country_id', monotonically_increasing_id())
    dim_factories = src_factories \
        .withColumn('code', udf_format_bk(src_factories.code)) \
        .toDF('factory_bk', 'factory_name', 'factory_country') \
        .withColumn('factory_id', monotonically_increasing_id())
    dim_indications = src_indication \
        .select('indicationid', 'indicationdescription') \
        .toDF('indication_bk', 'indication_description') \
        .withColumn('indication_id', monotonically_increasing_id())

    dic_response_code = spark.read.load(data_filepath_base + 'dic_response_code.tsv',
                                        format='csv', delimiter='\t') \
        .toDF('from', 'to')
    src_responses.createOrReplaceTempView('response_src')
    dic_response_code.createOrReplaceTempView('dic_response_code')
    dim_responses = spark.sql("""
        SELECT monotonically_increasing_id() as response_id,
               format_bk(claim_number) as response_bk,
               response_date,
               dic.to as response
        FROM response_src
        INNER JOIN dic_response_code as dic ON response_src.response = dic.from
        """)

    src_drugproduct.createOrReplaceTempView('src_drugproduct')
    src_drugproductfamily.createOrReplaceTempView('src_drugproductfamily')
    dim_products = spark.sql("""
        SELECT monotonically_increasing_id() as product_id,
               format_bk(drugproductid) as product_bk,
               drugproductname as product_name,
               drugproductfamilyname as product_family_name
        FROM src_drugproduct
        LEFT OUTER JOIN src_drugproductfamily
            ON src_drugproduct.drugproductfamilyid = src_drugproductfamily.drugproductfamilyid""")

    format_quarter = udf(lambda num_quarter: 'Q' + str(num_quarter))

    # Build the calendar dimension: one row per day from 1900-01-01 to 2100-01-01
    calendar_start = date(year=1900, month=1, day=1)
    calendar_end = date(year=2100, month=1, day=1)
    calendar_rdd = spark.sparkContext.parallelize(
        [dt.date() for dt in rrule(freq=DAILY, dtstart=calendar_start, until=calendar_end)]) \
        .map(lambda dt: Row(dt=dt))
    dim_calendar = spark.createDataFrame(calendar_rdd)
    dim_calendar = dim_calendar \
        .withColumn('date_id', monotonically_increasing_id()) \
        .withColumn('date', date_format(dim_calendar.dt, 'yyyy-MM-dd')) \
        .withColumn('year', year(dim_calendar.dt)) \
        .withColumn('month', month(dim_calendar.dt)) \
        .withColumn('day', dayofmonth(dim_calendar.dt)) \
        .withColumn('quarter', quarter(dim_calendar.dt))
    # note: the format is 'yyyy' (calendar year), not 'YYYY' (week-based year),
    # which would shift dates around year boundaries
    dim_calendar = dim_calendar \
        .withColumn('quarter', format_quarter(dim_calendar.quarter))

    src_claims.cache()
    dim_responses.cache()
    dim_calendar.cache()
    dim_countries.cache()
    dim_factories.cache()
    dim_indications.cache()
    dim_products.cache()

    src_claims.createOrReplaceTempView('claim')
    dim_responses.createOrReplaceTempView('response')
    dim_calendar.createOrReplaceTempView('calendar')
    dim_countries.createOrReplaceTempView('country')
    dim_factories.createOrReplaceTempView('factory')
    dim_indications.createOrReplaceTempView('indication')
    dim_products.createOrReplaceTempView('product')

    fact_registration_claim = spark.sql("""
        SELECT format_bk(claimnumber) as claimnumber,
               product.product_id as product_id,
               indication.indication_id as indication_id,
               factory_api.factory_id as factory_api_id,
               factory_bulk.factory_id as factory_bulk_id,
               factory_package.factory_id as factory_package_id,
               calendar_response.date_id as response_date_id,
               calendar_expectedresponse.date_id as expectedresponse_date_id,
               calendar_submission.date_id as submission_date_id,
               country.country_id as country_id,
               response.response_id as response_id
        FROM claim
        LEFT OUTER JOIN indication on indication.indication_bk = claim.indicationid
        LEFT OUTER JOIN product on product.product_bk = claim.productcodeid
        LEFT OUTER JOIN response on response.response_bk = claim.claimnumber
        LEFT OUTER JOIN calendar as calendar_response on calendar_response.date = response.response_date
        LEFT OUTER JOIN calendar as calendar_expectedresponse on calendar_expectedresponse.date = claim.expectedresponsedate
        LEFT OUTER JOIN calendar as calendar_submission on calendar_submission.date = claim.submissiondate
        LEFT OUTER JOIN country on country.country_bk = claim.countrycode
        LEFT OUTER JOIN factory as factory_api on factory_api.factory_bk = claim.factoryapi
        LEFT OUTER JOIN factory as factory_bulk on factory_bulk.factory_bk = claim.factorybulk
        LEFT OUTER JOIN factory as factory_package on factory_package.factory_bk = claim.factorypackage
        WHERE claim.claimtype = 'REGISTER'
        """)

    fact_reimbursement_claim = spark.sql("""
        SELECT format_bk(claimnumber) as claimnumber,
               product.product_id as product_id,
               reimbursementprct as reimbursement_claimed_prc,
               reimbursementprct as reimbursement_app_prc,
               indication.indication_id as indication_id,
               calendar_response.date_id as response_date_id,
               calendar_expectedresponse.date_id as expectedresponse_date_id,
               calendar_submission.date_id as submission_date_id,
               country.country_id as country_id,
               response.response_id as response_id,
               CAST(TRIM(SUBSTRING(claim.drugprice, 1, LENGTH(claim.drugprice)-3)) AS INT) AS drug_price,
               SUBSTRING(claim.drugprice, -3, 3) AS drug_price_currency
        FROM claim
        LEFT OUTER JOIN indication on indication.indication_bk = claim.indicationid
        LEFT OUTER JOIN product on product.product_bk = claim.productcodeid
        LEFT OUTER JOIN response on response.response_bk = claim.claimnumber
        LEFT OUTER JOIN calendar as calendar_response on calendar_response.date = response.response_date
        LEFT OUTER JOIN calendar as calendar_expectedresponse on calendar_expectedresponse.date = claim.expectedresponsedate
        LEFT OUTER JOIN calendar as calendar_submission on calendar_submission.date = claim.submissiondate
        LEFT OUTER JOIN country on country.country_bk = claim.countrycode
        WHERE claim.claimtype = 'REFUND'
          AND claim.drugprice IS NOT NULL
          AND claim.drugprice != 'null'
        """)

    fact_registration_claim.cache()
    fact_reimbursement_claim.cache()
    fact_registration_claim.createOrReplaceTempView('registration_claim')
    fact_reimbursement_claim.createOrReplaceTempView('reimbursement_claim')

    print('1) Number of registration claims per year')
    # question_1 = spark.sql("""
    #     SELECT count(claimnumber) as number_of_claims, year
    #     FROM reimbursement_claim
    #     INNER JOIN calendar on submission_date_id = calendar.date_id
    #     group by year
    #     order by year
    #     """)
    question_1 = fact_registration_claim.alias('claim') \
        .join(dim_calendar.alias('calendar'),
              col('claim.submission_date_id') == col('calendar.date_id')) \
        .groupBy(col('calendar.year')) \
        .count() \
        .orderBy(col('calendar.year'))
    question_1.show()

    print('2.a) Share of reimbursement claims approved on time')
    # question_2a = spark.sql("""
    #     select
    #     (SELECT count(claimnumber) as approved
    #      FROM reimbursement_claim
    #      INNER JOIN response on reimbursement_claim.response_id = response.response_id
    #      INNER JOIN calendar expected_cal on expectedresponse_date_id = expected_cal.date_id
    #      INNER JOIN calendar response_cal on response_date_id = response_cal.date_id
    #      where response.response = 'Approved'
    #      and response_cal.date <= expected_cal.date)
    #     /
    #     CAST(count(claimnumber) as FLOAT) AS on_time_share
    #     FROM reimbursement_claim
    #     """)
    # question_2a.show()
    question_2a = fact_reimbursement_claim.alias('claim') \
        .join(dim_responses.alias('response'), col('claim.response_id') == col('response.response_id')) \
        .join(dim_calendar.alias('expected_cal'), col('claim.expectedresponse_date_id') == col('expected_cal.date_id')) \
        .join(dim_calendar.alias('response_cal'), col('claim.response_date_id') == col('response_cal.date_id')) \
        .where(
            (col('response.response') == 'Approved') &
            (col('response_cal.date') <= col('expected_cal.date'))
        ).count() / float(fact_reimbursement_claim.count())
    print('Approved-on-time share (reimbursement): {}'.format(question_2a))

    print('2.b) Share of registration claims approved on time')
    # question_2b = spark.sql("""
    #     select
    #     (SELECT count(claimnumber) as approved
    #      FROM registration_claim
    #      INNER JOIN response on registration_claim.response_id = response.response_id
    #      INNER JOIN calendar expected_cal on expectedresponse_date_id = expected_cal.date_id
    #      INNER JOIN calendar response_cal on response_date_id = response_cal.date_id
    #      where response.response = 'Approved'
    #      and response_cal.date <= expected_cal.date)
    #     /
    #     CAST(count(claimnumber) as FLOAT) AS on_time_share
    #     FROM registration_claim
    #     """)
    # question_2b.show()
    question_2b = fact_registration_claim.alias('claim') \
        .join(dim_responses.alias('response'), col('claim.response_id') == col('response.response_id')) \
        .join(dim_calendar.alias('expected_cal'), col('claim.expectedresponse_date_id') == col('expected_cal.date_id')) \
        .join(dim_calendar.alias('response_cal'), col('claim.response_date_id') == col('response_cal.date_id')) \
        .where(
            (col('response.response') == 'Approved') &
            (col('response_cal.date') <= col('expected_cal.date'))
        ).count() / float(fact_registration_claim.count())  # denominator: all registration claims
    print('Approved-on-time share (registration): {}'.format(question_2b))

    print('3) Most expensive drug in Scandinavia')
    # question_3 = spark.sql("""
    #     SELECT DISTINCT product.product_bk, product.product_name
    #     FROM reimbursement_claim
    #     INNER JOIN response on reimbursement_claim.response_id = response.response_id
    #     INNER JOIN country on reimbursement_claim.country_id = country.country_id
    #     INNER JOIN product on reimbursement_claim.product_id = product.product_id
    #     INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
    #     where reimbursement_claim.drug_price = (
    #         select MAX(reimbursement_claim.drug_price)
    #         FROM reimbursement_claim
    #         INNER JOIN response on reimbursement_claim.response_id = response.response_id
    #         INNER JOIN country on reimbursement_claim.country_id = country.country_id
    #         INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
    #         WHERE response.response = 'Approved'
    #         AND country.country_bk in ('DK', 'SE', 'NO', 'FI')
    #         AND calendar.year = 2011
    #         AND calendar.quarter = 'Q4')
    #     AND response.response = 'Approved'
    #     AND country.country_bk in ('DK', 'SE', 'NO', 'FI')
    #     AND calendar.year = 2011
    #     AND calendar.quarter = 'Q4'
    #     """)
    max_price = fact_reimbursement_claim.alias('claim') \
        .join(dim_responses.alias('response'), col('claim.response_id') == col('response.response_id')) \
        .join(dim_countries.alias('country'), col('claim.country_id') == col('country.country_id')) \
        .join(dim_calendar.alias('calendar'), col('claim.response_date_id') == col('calendar.date_id')) \
        .where(
            (col('response.response') == 'Approved') &
            (col('country.country_bk').isin({'DK', 'SE', 'NO', 'FI'})) &
            (col('calendar.year') == 2011) &
            (col('calendar.quarter') == 'Q4')
        ).groupBy().max('drug_price').collect()[0]['max(drug_price)']

    question_3 = fact_reimbursement_claim.alias('claim') \
        .join(dim_responses.alias('response'), col('claim.response_id') == col('response.response_id')) \
        .join(dim_countries.alias('country'), col('claim.country_id') == col('country.country_id')) \
        .join(dim_products.alias('product'), col('claim.product_id') == col('product.product_id')) \
        .join(dim_calendar.alias('calendar'), col('claim.response_date_id') == col('calendar.date_id')) \
        .where((col('claim.drug_price') == max_price) &
               (col('response.response') == 'Approved') &
               (col('country.country_bk').isin({'DK', 'SE', 'NO', 'FI'})) &
               (col('calendar.year') == 2011) &
               (col('calendar.quarter') == 'Q4')
               ).select(col('product.product_bk'), col('product.product_name')).distinct()
    question_3.show()

    print('4.a) Which drug received the highest reimbursement by amount on the Polish market in 2005-2010?')
    question_4a = spark.sql("""
        select product.product_bk, product_name, reimbursement_claimed_prc, drug_price
        FROM reimbursement_claim
        INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
        INNER JOIN country on reimbursement_claim.country_id = country.country_id
        INNER JOIN product on reimbursement_claim.product_id = product.product_id
        WHERE calendar.date >= '2005-01-01'
          AND calendar.date <= '2010-12-31'
          AND country.country_bk = 'PL'
          AND reimbursement_claim.drug_price / reimbursement_claimed_prc = (
              SELECT MAX(reimbursement_claim.drug_price / reimbursement_claimed_prc)
              FROM reimbursement_claim
              INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
              INNER JOIN country on reimbursement_claim.country_id = country.country_id
              WHERE calendar.date >= '2005-01-01'
                AND calendar.date <= '2010-12-31'
                AND country.country_bk = 'PL'
          )
        """)
    question_4a.show()

    print('4.b) Which drug received the highest reimbursement by percentage on the Polish market in 2005-2010?')
    question_4b = spark.sql("""
        select product.product_bk, product_name, reimbursement_claimed_prc, drug_price
        FROM reimbursement_claim
        INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
        INNER JOIN country on reimbursement_claim.country_id = country.country_id
        INNER JOIN product on reimbursement_claim.product_id = product.product_id
        WHERE calendar.date >= '2005-01-01'
          AND calendar.date <= '2010-12-31'
          AND country.country_bk = 'PL'
          AND reimbursement_claimed_prc = (
              SELECT MAX(reimbursement_claimed_prc)
              FROM reimbursement_claim
              INNER JOIN calendar on reimbursement_claim.response_date_id = calendar.date_id
              INNER JOIN country on reimbursement_claim.country_id = country.country_id
              WHERE calendar.date >= '2005-01-01'
                AND calendar.date <= '2010-12-31'
                AND country.country_bk = 'PL'
          )
        """)
    question_4b.show()
###### CLEAN
df = sqlContext.read.csv("/datasets/crimes.csv", header='true')

# Define date derivatives
df = (df
      .withColumn('Timestamps', F.to_timestamp("Date", 'MM/dd/yyyy hh:mm:ss a'))
      .withColumn('Day', F.to_date("Date", 'MM/dd/yyyy hh:mm:ss a'))
      .withColumn("Month", F.month("Day"))
      .withColumn("Hour", F.hour("Timestamps"))
      .withColumn("Minute", F.minute("Timestamps"))
      .withColumn("DayOfMonth", F.dayofmonth("Day"))
      .withColumn("DayOfYear", F.dayofyear("Day"))
      .withColumn("DayOfWeek", F.dayofweek("Day"))
      .withColumn('WeekOfYear', F.weekofyear("Day"))
      .withColumn('Quarter', F.quarter("Timestamps")))

cols = [
    "Day", "Year", "Month", "Hour", "Minute", "DayOfMonth", "DayOfYear",
    "DayOfWeek", "WeekOfYear", "Quarter", "District", "Primary Type"
]
df = df.select(*cols)

# Rename Primary Types with less than 1% share to OTHER CRIMES
def least_frequent_columns(df, threshold=0.01):
    res = df.groupBy("Primary Type").count() \
        .withColumn('tot', F.lit(df.count())) \
        .withColumn('frac', F.expr('count/tot')) \
        .filter('frac < ' + str(threshold))
    # assumed completion: return the infrequent category labels as a list
    return [row['Primary Type'] for row in res.collect()]
def getPreprocessingDataframe(df, nlp, version='dev'):
    if version == 'dev':
        df = df.orderBy(rand())
        df2 = df.filter(df.Timestamp.isNotNull())
        df2 = df2.limit(500)
    else:
        df = df.orderBy(rand())
        df2 = df.filter(df.Timestamp.isNotNull())

    # convert timestamp to the right format
    timeStampPreCleaning = udf(
        lambda x: str(x) + " 2020" if len(x) < 8 else x.replace(",", ""),
        StringType())
    df2 = df2.withColumn("Timestamp", timeStampPreCleaning("Timestamp"))

    # String to DateType
    df3 = df2.withColumn("TimeStampDateType", F.to_date(F.col("Timestamp"), "MMM dd yyyy"))

    # drop rows whose Timestamp column is not in the standard format
    df3 = df3.filter(df3.Text.isNotNull())
    df3 = df3.filter(df3.TimeStampDateType.isNotNull())
    df3 = df3.withColumn("Year", F.year(df3.TimeStampDateType))
    df3 = df3.withColumn("Month", F.month(df3.TimeStampDateType))
    df3 = df3.withColumn("Quarter", F.quarter(df3.TimeStampDateType))

    # fill null with "0" and convert units to the right numbers
    cols = ["Comments", "Likes", "Retweets"]
    df3 = df3.fillna("0", subset=cols)

    # apply the transform_number udf
    transformNumber = udf(lambda z: transform_number(z), T.IntegerType())
    df3 = df3.withColumn("Comments", transformNumber("Comments"))
    df3 = df3.withColumn("Likes", transformNumber("Likes"))
    df3 = df3.withColumn("Retweets", transformNumber("Retweets"))

    # log-scale the engagement counts
    logNormal = udf(lambda x: int(round(np.log2(x + 1))) + 1, T.IntegerType())
    df3 = df3.withColumn("Likes_log", logNormal("Likes"))
    df3 = df3.withColumn("Retweets_log", logNormal("Retweets"))
    # df3 = df3.filter(df3.Likes_log.isNotNull())
    df3 = df3.filter(df3.Retweets_log.isNotNull())

    extractKeywordFromQueries = udf(lambda x: extractkeyword(x))
    df3 = df3.filter(df3.Page_URL.isNotNull())
    df3 = df3.withColumn("Keyword", extractKeywordFromQueries("Page_URL"))
    df3 = df3.filter(df3.Keyword.isNotNull())

    keywordToCategory2 = udf(lambda x: getCategory2(x), StringType())
    df3 = df3.withColumn("Category2", keywordToCategory2("Keyword"))

    # NER model; may return an empty list
    nerExtraction = udf(lambda z: ner_extraction(z, nlp), T.ArrayType(StringType()))
    df3 = df3.withColumn("All_phrases", nerExtraction("Text"))
    df3 = df3.filter(df3.All_phrases.isNotNull())

    checkEmpty = udf(lambda x: checkempty(x), T.IntegerType())
    df3 = df3.withColumn('CheckEmpty', checkEmpty('All_phrases'))
    df3 = df3.filter(df3.CheckEmpty.isNotNull())
    df3 = df3.filter(df3.CheckEmpty != int(1))

    sentiment = VaderSentiment()
    vader_sentiment = udf(sentiment.score, T.FloatType())
    df3 = df3.withColumn("Sentiment", vader_sentiment('Text'))

    # weight the extracted phrases by log-scaled retweets
    weighted_phrases_calculate = udf(lambda x, y: y * (int(x) + 1),
                                     T.ArrayType(StringType()))
    df3 = df3.withColumn(
        "Weighted_phrases",
        weighted_phrases_calculate("Retweets_log", "All_phrases"))
    # cols = ['Sentiment','All_phrases','Retweets_log','Weighted_phrases','Year','Month','Keyword']
    cols = ["Weighted_phrases", "Year", "Month", "Keyword", "Category2"]

    # weight the sentiment by log-scaled likes (distinct name so it does not
    # shadow the array-typed udf above)
    weighted_sentiment_calculate = udf(lambda x, y: y * (int(x) + 1), T.FloatType())
    df3 = df3.withColumn("Weighted_Sentiment",
                         weighted_sentiment_calculate("Likes_log", "Sentiment"))
    return df3
df2.show()

# Drop all the rows that contain Null in the value column
sparkDF = sparkDF.dropna(subset=['value'])  # assign the result; dropna does not mutate in place
sparkDF.show(20, False)
print('Total rowcount after removing Nulls: ', sparkDF.count())

# def mqy(value):  # First creating a python function
#     if 'Month' in value:
#         return list('M')
#     elif 'Year' in value:
#         return list('Y')
#     else:
#         return list('Q')
# # Converting python function to pyspark UDF
# udf_mqy = fn.udf(lambda v: mqy(v))

# Now creating new column called Absolute_Period in sparkDF
condition1 = fn.col('Desc').like('%Month%')
condition2 = fn.col('Desc').like('%Quarter%')
sparkDF = sparkDF.withColumn(
    'Absolute_Period',
    fn.concat(
        fn.year('Date'),
        fn.when(condition1, 'M').when(condition2, 'Q').otherwise('Y'),
        fn.when(condition1, fn.format_string('%02d', fn.month('Date'))).when(
            condition2, fn.format_string('%02d', fn.quarter('Date'))).otherwise('')))
sparkDF.show(20, False)
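# For example, with Date = 2021-03-15 the Absolute_Period value is "2021M03"
# when Desc contains "Month", "2021Q01" when it contains "Quarter" (the
# quarter number is also zero-padded by %02d), and "2021Y" otherwise.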
                              format='csv')

import pyspark.sql.functions as F


@F.udf
def fix_date(dt):
    if dt is None:
        return dt
    parts = dt.split('/')
    return "{}-{}-{}".format(parts[2], parts[0], parts[1])


curated = quotes.withColumn('dt', fix_date(F.col('quotedate')))
curated = curated.withColumn('year', F.year(F.col('dt')))
curated = curated.withColumn('quarter', F.quarter(F.col('dt')))
curated.show()

joined = curated.alias('alpha').join(
    curated.alias('beta'), ['year', 'quarter', 'dt']).select(
        F.col('alpha.year').alias('year'),
        F.col('alpha.quarter').alias('quarter'),
        F.col('alpha.dt').alias('dt'),
        F.col('alpha.symbol').alias('alpha_symbol'),
        F.col('alpha.dt').alias('alpha_dt'),
        F.col('alpha.open').alias('alpha_open'),
        F.col('alpha.high').alias('alpha_high'),
        F.col('alpha.low').alias('alpha_low'),
        F.col('alpha.close').alias('alpha_close'),
        F.col('alpha.volume').alias('alpha_volume'),
                     kyl_tdt_sku.sku_num_sku_r3 == kyl_edv.sku_num_sku_r3,
                     kyl_tdt_sku.tdt_num_line == kyl_edv.tdt_num_line]) \
    .select(kyl_tdt_sku.the_transaction_id, kyl_tdt_sku.tdt_num_line,
            kyl_tdt_sku.tdt_date_transaction, kyl_tdt_sku.but_num_business_unit,
            kyl_tdt_sku.but_name_business_unit, kyl_tdt_sku.sku_idr_sku,
            kyl_tdt_sku.f_qty_item, kyl_tdt_sku.sku_num_sku,
            kyl_tdt_sku.sku_num_sku_r3, kyl_tdt_sku.mdl_num_model_r3,
            kyl_org.purch_org, kyl_org.sales_org, kyl_org.sales_org_text,
            kyl_org.purch_org_text, kyl_org.distrib_channel,
            kyl_mdg.ege_basique, kyl_mdg.ege_security_product,
            kyl_mpd.iam_item_and_model_code, kyl_mpd.pty_process_type_pty,
            kyl_mpd.pty_process_type_label, kyl_edv.eta_num_etape_eta).drop_duplicates()

cond = [qf.material_id == ft.sku_num_sku_r3, ft.purch_org == qf.purch_org]

ft = ft.select(ft.the_transaction_id, ft.tdt_num_line, ft.tdt_date_transaction,
               ft.but_num_business_unit, ft.but_name_business_unit,
               ft.sku_num_sku_r3, ft.mdl_num_model_r3, ft.purch_org,
               ft.sales_org, ft.sales_org_text, ft.purch_org_text,
               ft.distrib_channel, ft.ege_basique, ft.ege_security_product,
               ft.eta_num_etape_eta, ft.f_qty_item) \
    .withColumn("week_tdt_date_transaction", F.weekofyear(ft["tdt_date_transaction"])) \
    .withColumn("quarter_tdt_date_transaction", F.quarter(ft["tdt_date_transaction"]))

ft = ft.join(qf, how='left', on=cond) \
    .select(ft.the_transaction_id, ft.week_tdt_date_transaction,
            ft.quarter_tdt_date_transaction, ft.tdt_date_transaction,
            ft.but_num_business_unit, ft.but_name_business_unit,
            ft.sku_num_sku_r3, ft.mdl_num_model_r3, ft.purch_org, ft.sales_org,
            ft.sales_org_text, ft.purch_org_text, ft.distrib_channel,
            ft.ege_basique, ft.ege_security_product, ft.eta_num_etape_eta,
            ft.f_qty_item, qf.vendor_id, qf.subcontractor_id)

ft = ft.withColumn("year_transaction", F.year(ft.tdt_date_transaction).cast(T.StringType())) \
    .withColumn("month_transaction", F.month(ft.tdt_date_transaction).cast(T.StringType())) \
    .withColumn("tdt_date_transaction", F.dayofmonth(ft.tdt_date_transaction))

ft.repartition(80).write.option("compression", "snappy").mode("overwrite") \
    .insertInto("kylin_usb_mqb.kyl_mqb_vte")

# Stopping Session
spark.sparkContext.stop()
        , (123, 1, "2021-02-10")
        , (123, 0, "2021-02-28")
        , (777, 1, "2021-01-01")
        , (777, 1, "2021-01-01")
    ], ["id", "flag", "date"])

# change string to date type
df = df.withColumn("date", F.to_date(F.col("date"), "yyyy-MM-dd"))

# time related features
from pyspark.sql.functions import col, year, month, quarter, weekofyear, dayofweek, date_format, to_date

df = df.withColumn("year", year(df.date))
df = df.withColumn("month", month(df.date))
df = df.withColumn("quarter", quarter(df.date))
df = df.withColumn("day_of_month", date_format(col("date"), "d"))
df = df.withColumn("day_of_year", date_format(col("date"), "D"))
df = df.withColumn("week_of_year", weekofyear(df.date))  # deprecated in spark3
df = df.withColumn("day_of_week", dayofweek(df.date))

# window functions: rolling + forward (see the sketch below)

# Read from file
# Read from DB
# Query from loaded file

# drop
df_query = df_query.drop("cust_id", "inst_id", "account_num", "seq_num", "currency")
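# A minimal sketch of the "rolling + forward" window features the placeholder
# above alludes to, assuming the same df (id, flag, date). The window names and
# the 30-day frames are illustrative assumptions, not from the original source.
from pyspark.sql import Window
import pyspark.sql.functions as F

days = lambda n: n * 86400  # rangeBetween frames use the units of the ordering expression (here: epoch seconds)

order_expr = F.col("date").cast("timestamp").cast("long")
rolling_30d = Window.partitionBy("id").orderBy(order_expr).rangeBetween(-days(30), 0)
forward_30d = Window.partitionBy("id").orderBy(order_expr).rangeBetween(0, days(30))

df = (df.withColumn("flags_last_30d", F.sum("flag").over(rolling_30d))
        .withColumn("flags_next_30d", F.sum("flag").over(forward_30d)))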