def gen_summary(df, output_prefix=""):
    summary = {}
    string_cols = []
    boolean_cols = []
    numeric_cols = []
    other_cols = []

    for field in df.schema.fields:
        if isinstance(field.dataType, T.StringType):
            string_cols.append(field.name)
        elif isinstance(field.dataType, T.BooleanType):
            boolean_cols.append(field.name)
        elif isnumeric(field.dataType):
            numeric_cols.append(field.name)
        else:
            other_cols.append(field.name)

    counts = cardinalities(df, string_cols)
    uniques = likely_unique(counts)
    categoricals = unique_values(df, likely_categoricals(counts))

    for span in [2, 3, 4, 6, 12]:
        thecube = df.cube(
            "Churn",
            F.ceil(df.tenure / span).alias("%d_month_spans" % span),
            "gender", "Partner", "SeniorCitizen", "Contract",
            "PaperlessBilling", "PaymentMethod",
            F.ceil(F.log2(F.col("MonthlyCharges")) * 10).alias("log_charges")).count()

        therollup = df.rollup(
            "Churn",
            F.ceil(df.tenure / span).alias("%d_month_spans" % span),
            "SeniorCitizen", "Contract", "PaperlessBilling", "PaymentMethod",
            F.ceil(F.log2(F.col("MonthlyCharges")) * 10).alias("log_charges")).agg(
                F.sum(F.col("TotalCharges")).alias("sum_charges"))

        thecube.write.mode("overwrite").parquet("%scube-%d.parquet" % (output_prefix, span))
        therollup.write.mode("overwrite").parquet("%srollup-%d.parquet" % (output_prefix, span))

    encoding_struct = {
        "categorical": categoricals,
        "numeric": numeric_cols + boolean_cols,
        "unique": uniques
    }

    summary["schema"] = df.schema.jsonValue()
    summary["ecdfs"] = approx_ecdf(df, numeric_cols)
    summary["true_percentage"] = percent_true(df, boolean_cols)
    summary["encoding"] = encoding_struct
    summary["distinct_customers"] = df.select(df.customerID).distinct().count()

    return summary
def casting_data(df):
    """Cast the columns to double, timestamp, unix time, or long."""
    df = df.withColumn("X", df["X"].cast("double"))
    df = df.withColumn("Y", df["Y"].cast("double"))
    df = df.withColumn("Z", df["Z"].cast("double"))
    df = df.withColumn("TremorGA", df["TremorGA"].cast("double"))
    df = df.withColumn("BradykinesiaGA", df["BradykinesiaGA"].cast("double"))
    df = df.withColumn("DyskinesiaGA", df["DyskinesiaGA"].cast("double"))
    df = df.withColumn("TSStart", df["TSStart"].cast("timestamp"))
    df = df.withColumn("TSEnd", df["TSEnd"].cast("timestamp"))
    df = df.withColumn("interval_start",
                       (ceil(unix_timestamp(df["TSStart"]).cast("long")) % 10**8))
    df = df.withColumn("interval_end",
                       (ceil(unix_timestamp(df["TSEnd"]).cast("long")) % 10**8))
    df = df.withColumn("temp", utils_function_spark.find_milisec_udf('TS'))
    df = df.withColumn("interval", unix_timestamp(df["TS"]).cast("long"))
    df = df.withColumn("interval",
                       utils_function_spark.merge_integers_udf('interval', 'temp'))
    df = df.withColumn("key",
                       utils_function_spark.give_my_key_udf("interval_start",
                                                            "interval_end",
                                                            'SubjectId'))
    df = df.withColumn("key", df["key"].cast("double"))
    return df
def generate_reports(self, from_time, to_time):
    ecg_data = self.get_monitoring_data(from_time, to_time)
    findspark.init()
    spark = SparkSession.builder.appName("ECGLearning").master(
        "local[*]").getOrCreate()
    spark.conf.set('spark.sql.session.timeZone', 'Asia/Kolkata')
    os.makedirs(os.path.join(self.write_path, 'public'), exist_ok=True)

    # Create the data frame
    ecg_data_rdd = spark.sparkContext.parallelize(ecg_data)
    schema = StructType([
        StructField('time', IntegerType(), True),
        StructField('tps', StringType(), True)
    ])
    tps_df = spark.createDataFrame(ecg_data_rdd, schema)
    tps_df = tps_df.withColumn("tps", tps_df["tps"].cast("float"))
    tps_df = tps_df.withColumn("tps", F.ceil(tps_df["tps"]))
    tps_df = tps_df.withColumn(
        "time", F.from_unixtime(tps_df["time"], "yyyy/MM/dd HH:mm:ss"))

    # Download the current file from the blob container
    get_data_from_blob(
        Path(self.write_path).joinpath('public', self.csv_file_name))
    current_blob_df = spark.read.csv(
        os.path.join(self.write_path, 'public', self.csv_file_name),
        header=True)
    current_blob_df = current_blob_df.withColumn(
        "tps", current_blob_df["tps"].cast("int"))
    current_blob_df = current_blob_df.union(tps_df)
    current_blob_df = current_blob_df.dropDuplicates(["time"])
    current_blob_df = current_blob_df.sort("time")

    # Remove the first day's data from the 7 days of data
    current_blob_df = self.remove_last_day(current_blob_df)
    os.makedirs(os.path.join(self.write_path, 'public'), exist_ok=True)
    current_blob_df.toPandas().to_csv(
        os.path.join(self.write_path, 'public', self.csv_file_name),
        index=False)
    create_json(
        os.path.join(self.write_path, 'public', self.csv_file_name), True)

    # Upload the updated data to the Azure blob container
    write_data_to_blob(self.write_path,
                       os.path.join('public', self.csv_file_name))
    write_data_to_blob(self.write_path,
                       os.path.join('public', self.json_file_name))
    spark.stop()
def round_up_cents(df: DataFrame, column: str, precision: int = 2) -> DataFrame:
    """
    Rounds up a single column to a given precision and returns a DataFrame.

    Parameters:
        df (DataFrame): A PySpark DataFrame
        column (str): The column that the transformation should be applied to
        precision (int): Digits after the decimal point the mapping will round up to (default: 2)
    """
    return df.withColumn(column, ceil(df[column] * 10**precision) / 10**precision)
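# A minimal usage sketch (not from the original source): it assumes an active
# SparkSession named `spark` and illustrates that the ceil-based mapping always
# rounds *up* at the requested precision.
from pyspark.sql import SparkSession
from pyspark.sql.functions import ceil

spark = SparkSession.builder.getOrCreate()
prices = spark.createDataFrame([(3.141,), (2.005,)], ["amount"])
round_up_cents(prices, "amount", precision=2).show()
# amount becomes 3.15 and 2.01 respectively (always rounded up, never down)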
def add_index_eventTs(date_1, df, index_hrs_day):
    # date to timestamp at midnight: timegm treats the date as UTC by default
    date_1_ts = calendar.timegm(date_1.timetuple())

    # time difference from midnight 00:00:00, in seconds
    df = df.withColumn("time_diff_mid", df.eventTs - F.lit(date_1_ts))

    # adjustment (for data dump inaccuracies):
    # filter out pings whose second-of-day is not in 1-86400; between() includes both ends
    df = df.filter(F.col("time_diff_mid").between(1, 86400))
    # df = df.repartition(200)  # kept off to reduce shuffles

    # fix the index granularity, e.g. 4 hours -> 3600 * 4 seconds
    index_sec = 3600 * index_hrs_day

    # create the index column: 1 to 6
    df = df.withColumn("index", F.ceil(df.time_diff_mid / F.lit(index_sec)))
    df = df.drop("time_diff_mid")

    return df
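# A minimal sketch of how add_index_eventTs might be called (assumed, not part of
# the original source): `spark` and the epoch-second eventTs values are illustrative.
import datetime

events = spark.createDataFrame(
    [(1592179200 + 2 * 3600,), (1592179200 + 15 * 3600,)],  # 02:00 and 15:00 UTC on 2020-06-15
    ["eventTs"])
# With index_hrs_day=4 the day splits into 6 buckets: 02:00 -> index 1, 15:00 -> index 4
indexed = add_index_eventTs(datetime.date(2020, 6, 15), events, index_hrs_day=4)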
def grouped_map_pandas_udf(spark):
    @pandas_udf(returnType="id long, v double",
                functionType=PandasUDFType.GROUPED_MAP)
    # functionType: an enum value in pyspark.sql.functions.PandasUDFType; the default is SCALAR
    def subtract_mean(pdf):
        v = pdf.v  # pdf is a pandas.DataFrame
        return pdf.assign(v=v - v.mean())  # assign() adds a new column or overwrites an existing one

    @pandas_udf("id long, v double", PandasUDFType.GROUPED_MAP)  # "id" and "v" are the output column names
    def mean_udf(key, pdf):
        # key is a tuple of one numpy.int64, which is the value of 'id' for the current group
        return pd.DataFrame([key + (pdf['v'].mean(), )])

    @pandas_udf("id long, `ceil(v / 2)` long, v double", PandasUDFType.GROUPED_MAP)
    def sum_udf(key, pdf):
        # key is a tuple of two numpy.int64s, which are the values of 'id' and 'ceil(df.v / 2)' for the current group
        return pd.DataFrame([key + (pdf['v'].sum(), )])

    df = spark.createDataFrame([(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
                               ("id", "v"))

    df.groupBy("id").apply(subtract_mean).show()
    # +---+----+
    # | id|   v|
    # +---+----+
    # |  1|-0.5|
    # |  1| 0.5|
    # |  2|-3.0|
    # |  2|-1.0|
    # |  2| 4.0|
    # +---+----+

    df.groupBy('id').apply(mean_udf).show()
    # +---+---+
    # | id|  v|
    # +---+---+
    # |  1|1.5|
    # |  2|6.0|
    # +---+---+

    df.groupBy('id', ceil(df['v'] / 2)).apply(sum_udf).show()
    # ceil returns the smallest integer greater than or equal to the given expression
def get_filtered_by_week(data: DataFrame) -> DataFrame:
    """
    Transforms "start_date - end_date" periods into the year and week number of the year.

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|      year|  week_num|
    +---+----------+----------+
    |  5|      2018|         1|
    |  5|      2018|         2|
    +---+----------+----------+
    """
    max_week_number = 53
    transformed_data = data \
        .withColumn('start_week', F.weekofyear('start_date')) \
        .withColumn('weeks_diff',
                    F.ceil(F.datediff(F.col('end_date'), F.col('start_date')) / 7)) \
        .withColumn("year", F.year("start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', weeks_diff), ',')")) \
        .select("*", F.posexplode("repeat").alias("week_add", "val")) \
        .withColumn('total_week_num', F.col('start_week') + F.col('week_add')) \
        .withColumn('add_year',
                    (F.col('total_week_num') / max_week_number).cast(IntegerType())) \
        .withColumn('total_week_num',
                    F.col('total_week_num') - (max_week_number * F.col('add_year'))) \
        .withColumn('week_num',
                    F.when(F.col('total_week_num') == 0, 1)
                     .otherwise(F.col('total_week_num'))) \
        .withColumn('year', F.col('year') + F.col('add_year')) \
        .drop('start_date', 'end_date', 'week_add', 'repeat', 'val', 'date',
              'add_year', 'weeks_diff', 'total_week_num') \
        .dropDuplicates()
    return transformed_data
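# A minimal usage sketch (assumed, not from the original source): it builds the
# one-row example shown in the docstring and applies get_filtered_by_week to it.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
periods = spark.createDataFrame(
    [(5, "2018-01-01", "2018-01-09")],
    ["key", "start_date", "end_date"]) \
    .withColumn("start_date", F.to_date("start_date")) \
    .withColumn("end_date", F.to_date("end_date"))

get_filtered_by_week(periods).show()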
def _estimate_workload(sketch_l, sketch_r):
    for i in range(len(hash_udfs)):
        sketch_l = sketch_l.withColumnRenamed(str(i), 'l' + str(i))
        sketch_r = sketch_r.withColumnRenamed(str(i), 'r' + str(i))

    sketch = sketch_l.join(sketch_r, 'hash_val')
    sketch.cache()

    sum_cols = [
        F.sum(F.col('l' + str(i)) * F.col('r' + str(i)))
        for i in range(len(hash_udfs))
    ]
    total_load = sketch.select(sum_cols).first()

    workloads = [l for l in enumerate(total_load)]
    workloads.sort(key=lambda x: x[1])
    median_idx = workloads[int(len(hash_udfs) / 2)][0]
    median_load_total = workloads[int(len(hash_udfs) / 2)][1]
    avg_load = int(median_load_total / n_workers)

    median_load = sketch.select(
        'hash_val',
        (F.col('l' + str(median_idx)) * F.col('r' + str(median_idx))).alias('load')) \
        .withColumn('avg_load', F.lit(avg_load)) \
        .withColumn('n_part', F.ceil(F.col('load') / F.col('avg_load'))) \
        .select('hash_val', 'n_part')

    return hash_udfs[median_idx], median_load
def run_dm_dc_order(output_str, info_str, stock_date, run_date, start_date,
                    end_date, log_file, sqlc):
    print_output(
        f"Load DM items and DC for DM that starts between {start_date} and {end_date}",
        log_file)

    dm_item_dc_sql = \
        """
        SELECT distinct ndt.dm_theme_id, ndt.theme_start_date, ndt.theme_end_date,
            del.npp, del.ppp, del.ppp_start_date, del.ppp_end_date, del.dept_code,
            dcid.holding_code, dcid.risk_item_unilever,
            dcid.primary_ds_supplier as ds_supplier_code,
            cast(dcid.qty_per_unit as int) as pcb,
            dcid.rotation, dcid.qty_per_unit,
            icis.item_id, icis.sub_id, icis.item_code, icis.sub_code,
            icis.date_key AS run_date,
            fdo.first_order_date AS past_result
        FROM vartefact.forecast_nsa_dm_extract_log del
        JOIN ods.nsa_dm_theme ndt ON del.dm_theme_id = ndt.dm_theme_id
        JOIN ods.p4md_stogld ps ON del.city_code = ps.stocity
        JOIN vartefact.forecast_item_code_id_stock icis ON icis.date_key = '{0}'
            AND del.item_code = CONCAT ( icis.dept_code, icis.item_code )
            AND del.sub_code = icis.sub_code
            AND del.dept_code = icis.dept_code
        JOIN vartefact.forecast_dc_item_details dcid ON dcid.item_code = icis.item_code
            AND dcid.sub_code = icis.sub_code
            AND dcid.dept_code = icis.dept_code
            AND dcid.rotation != 'X'
            AND dcid.dc_status != 'Stop'
            AND dcid.item_type not in ('New','Company Purchase','Seasonal')
        JOIN vartefact.forecast_store_item_details id ON ps.stostocd = id.store_code
            AND dcid.dept_code = id.dept_code
            AND dcid.item_code = id.item_code
            AND dcid.sub_code = id.sub_code
        LEFT JOIN vartefact.forecast_dm_dc_orders fdo ON ndt.dm_theme_id = fdo.dm_theme_id
            AND icis.dept_code = fdo.dept_code
            AND icis.item_code = fdo.item_code
            AND icis.sub_code = fdo.sub_code
        WHERE del.extract_order >= 40
            AND del.date_key = '{1}'
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') >= to_timestamp('{2}', 'yyyyMMdd')
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') < to_timestamp('{3}', 'yyyyMMdd')
        """.replace("\n", " ")

    dm_item_dc_sql = dm_item_dc_sql.format(stock_date.strftime("%Y%m%d"),
                                           run_date.strftime("%Y%m%d"),
                                           start_date.strftime("%Y%m%d"),
                                           end_date.strftime("%Y%m%d"))

    dm_item_dc_df = sqlc.sql(dm_item_dc_sql)

    # Exclude the DM that already have orders
    print_output("Exclude the DM that already have orders", log_file)
    dm_item_dc_df = dm_item_dc_df.filter("past_result is null")

    output_line = f"After filtering already calculated DM {dm_item_dc_df.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    # Only consider the nearest DM
    first_dc_dm = dm_item_dc_df. \
        groupBy(['item_id', 'sub_id']). \
        agg(F.min("theme_start_date").alias("theme_start_date"))

    dm_item_dc_df = dm_item_dc_df.join(
        first_dc_dm, ['item_id', 'sub_id', 'theme_start_date'])

    dm_item_dc_cnt = dm_item_dc_df.count()
    print_output(f"After getting only first DM in DC {dm_item_dc_cnt}", log_file)
    output_str = output_str + f"After getting only first DM in DC {dm_item_dc_cnt}," + ","

    if dm_item_dc_cnt == 0:
        run_date_str = run_date.strftime("%Y%m%d")
        print_output(
            f"skip date {run_date_str} cause no active order opportunity for today",
            log_file)
        info_str = info_str + f"Job Finish:{get_current_time()},"
        info_str = info_str + f"skip date {run_date_str} cause no active order opportunity for today"
        return

    output_line = f"Number of item that will have DM order in DC {dm_item_dc_df.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    dm_item_dc_df.cache()
    dm_item_dc_df.createOrReplaceTempView("dm_item_dc")

    dc_order_sql = \
        """
        SELECT distinct dis.item_id, dis.sub_id,
            ord.date_key AS first_order_date,
            dev.date_key AS first_delivery_date
        FROM dm_item_dc dis
        JOIN vartefact.forecast_dc_order_delivery_mapping dodm
            ON dis.holding_code = dodm.con_holding
            AND dis.risk_item_unilever = dodm.risk_item_unilever
        JOIN vartefact.forecast_calendar ord ON ord.date_key = dodm.order_date
        JOIN vartefact.forecast_calendar dev ON dev.weekday_short = dodm.delivery_weekday
            and dev.week_index = ord.week_index + dodm.week_shift
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND dev.date_key <= '{0}'
            AND dis.rotation != 'X'
        """.replace("\n", " ")

    dc_order_sql = dc_order_sql.format(end_date.strftime("%Y%m%d"))

    dc_order_deliver_df = sqlc.sql(dc_order_sql)

    dc_first_order_df = dc_order_deliver_df.groupBy(['item_id', 'sub_id']). \
        agg(F.min("first_order_date").alias("first_order_date"))

    dc_first_order_deliver_df = dc_order_deliver_df \
        .select(['item_id', 'sub_id', 'first_order_date', 'first_delivery_date']) \
        .join(dc_first_order_df, ['item_id', 'sub_id', 'first_order_date'])

    dm_item_dc_order_df = dm_item_dc_df \
        .join(dc_first_order_deliver_df, ['item_id', 'sub_id'])

    dm_item_dc_order_df.createOrReplaceTempView("dm_item_dc_order")

    dm_store_to_dc_sql = \
        """
        select dm.item_id, dm.sub_id, dm.holding_code,
            dm.theme_start_date, dm.theme_end_date,
            dm.npp, dm.ppp, dm.ppp_start_date, dm.ppp_end_date,
            dm.dept_code, dm.item_code, dm.sub_code, dm.pcb,
            dm.ds_supplier_code, dm.rotation, dm.run_date,
            dm.first_order_date, dm.first_delivery_date,
            sum(sod.regular_sales_before_dm) as regular_sales_before_dm,
            sum(sod.four_weeks_after_dm) as four_weeks_after_dm,
            sum(sod.dm_sales) as dm_sales,
            sum(sod.order_qty) as dm_order_qty_without_pcb,
            dm.dm_theme_id
        FROM vartefact.forecast_dm_orders sod
        JOIN dm_item_dc_order dm on sod.item_id = dm.item_id
            and sod.sub_id = dm.sub_id
            and sod.dm_theme_id = dm.dm_theme_id
        GROUP BY dm.dm_theme_id, dm.item_id, dm.sub_id, dm.holding_code,
            dm.theme_start_date, dm.theme_end_date,
            dm.npp, dm.ppp, dm.ppp_start_date, dm.ppp_end_date,
            dm.dept_code, dm.item_code, dm.sub_code, dm.pcb,
            dm.ds_supplier_code, dm.rotation, dm.run_date,
            dm.first_order_date, dm.first_delivery_date
        """.replace("\n", " ")

    dm_dc_order = sqlc.sql(dm_store_to_dc_sql)

    dm_dc_pcb = dm_dc_order \
        .withColumn("dm_order_qty",
                    F.when(dm_dc_order.dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_dc_order.dm_order_qty_without_pcb / dm_dc_order.pcb) * dm_dc_order.pcb)
                    .otherwise(int(0)))

    dm_dc_pcb.createOrReplaceTempView("dm_dc_final")

    output_line = f"Number of DM DC orders {dm_dc_pcb.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line

    print_output("Write DC order to datalake", log_file)

    dm_dc_sql = \
        """
        INSERT INTO vartefact.forecast_dm_dc_orders PARTITION (dm_theme_id)
        SELECT item_id, sub_id, holding_code,
            theme_start_date, theme_end_date,
            npp, ppp, ppp_start_date, ppp_end_date,
            dept_code, item_code, sub_code, pcb,
            ds_supplier_code, rotation, run_date,
            first_order_date, first_delivery_date,
            regular_sales_before_dm, four_weeks_after_dm, dm_sales,
            dm_order_qty, dm_order_qty_without_pcb,
            dm_theme_id
        FROM dm_dc_final
        """.replace("\n", " ")

    sqlc.sql(dm_dc_sql)
    sqlc.sql("refresh table vartefact.forecast_dm_dc_orders")

    info_str = info_str + f"Job Finish:{get_current_time()}"
def run_dm_store_order(output_str, info_str, stock_date, run_date, start_date,
                       end_date, log_file, sqlc):
    print_output(
        f"Load DM items and stores for DM that starts between {start_date} and {end_date}",
        log_file)

    dm_item_store_sql = \
        """
        SELECT distinct ndt.dm_theme_id, ndt.theme_start_date, ndt.theme_end_date,
            del.npp, del.ppp, del.ppp_start_date, del.ppp_end_date,
            del.city_code, id.store_code, del.dept_code,
            id.con_holding, id.risk_item_unilever,
            cast(id.qty_per_unit as int) as pcb,
            id.dc_supplier_code, id.ds_supplier_code, id.rotation,
            icis.item_id, icis.sub_id, icis.item_code, icis.sub_code,
            icis.date_key AS run_date,
            fdo.first_order_date AS past_result
        FROM vartefact.forecast_nsa_dm_extract_log del
        JOIN ods.nsa_dm_theme ndt ON del.dm_theme_id = ndt.dm_theme_id
        JOIN ods.p4md_stogld ps ON del.city_code = ps.stocity
        JOIN vartefact.forecast_store_item_details id ON ps.stostocd = id.store_code
            AND del.item_code = CONCAT ( id.dept_code, id.item_code )
            AND del.sub_code = id.sub_code
            AND del.dept_code = id.dept_code
            AND id.store_status != 'Stop'
            AND id.item_type not in ('New','Company Purchase','Seasonal')
        JOIN vartefact.forecast_item_code_id_stock icis ON icis.date_key = '{0}'
            AND id.item_code = icis.item_code
            AND id.sub_code = icis.sub_code
            AND id.dept_code = icis.dept_code
            AND id.store_code = icis.store_code
        LEFT JOIN vartefact.forecast_dm_orders fdo ON ndt.dm_theme_id = fdo.dm_theme_id
            AND icis.dept_code = fdo.dept_code
            AND icis.item_code = fdo.item_code
            AND icis.sub_code = fdo.sub_code
            AND icis.store_code = fdo.store_code
        WHERE del.extract_order >= 40
            AND del.date_key = '{1}'
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') >= to_timestamp('{2}', 'yyyyMMdd')
            AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') < to_timestamp('{3}', 'yyyyMMdd')
        """.replace("\n", " ")

    dm_item_store_sql = dm_item_store_sql.format(stock_date.strftime("%Y%m%d"),
                                                 run_date.strftime("%Y%m%d"),
                                                 start_date.strftime("%Y%m%d"),
                                                 end_date.strftime("%Y%m%d"))

    # Exclude the DM that already have orders
    dm_item_store_df = sqlc.sql(dm_item_store_sql)

    print_output(
        f"Number of DM item stores in date range {dm_item_store_df.count()}",
        log_file)

    print_output("Exclude the DM that already have orders", log_file)
    dm_item_store_df = dm_item_store_df.filter("past_result is null")

    output_line = f"After filtering already calculated DM {dm_item_store_df.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    # Only consider the nearest DM
    first_dm = dm_item_store_df. \
        groupBy(['item_id', 'sub_id', 'store_code']). \
        agg(F.min("theme_start_date").alias("theme_start_date"))

    dm_item_store_df = dm_item_store_df.join(
        first_dm, ['item_id', 'sub_id', 'store_code', 'theme_start_date'])

    dm_item_store_cnt = dm_item_store_df.count()
    print_output(f"After getting only first DM {dm_item_store_cnt}", log_file)
    output_str = output_str + f"After getting only first DM {dm_item_store_cnt}," + ","

    if dm_item_store_cnt == 0:
        run_date_str = run_date.strftime("%Y%m%d")
        print_output(
            f"skip date {run_date_str} cause no active order opportunity for today",
            log_file)
        info_str = info_str + f"Job Finish:{get_current_time()},"
        info_str = info_str + f"skip date {run_date_str} cause no active order opportunity for today"
        return

    dm_item_store_df.write.mode("overwrite").format("parquet").saveAsTable(
        "vartefact.tmp_dm_item_store")

    dm_item_store_df.createOrReplaceTempView("dm_item_store")

    # The first order day within the PPP period
    print_output("Get first order day within PPP period", log_file)

    onstock_order_sql = \
        """
        SELECT dis.item_id, dis.sub_id, dis.store_code,
            ord.date_key AS first_order_date,
            dev.date_key AS first_delivery_date
        FROM dm_item_store dis
        JOIN vartefact.forecast_onstock_order_delivery_mapping mp ON dis.dept_code = mp.dept_code
            AND dis.rotation = mp.rotation
            AND dis.store_code = mp.store_code
        JOIN vartefact.forecast_calendar ord ON ord.iso_weekday = mp.order_iso_weekday
        JOIN vartefact.forecast_calendar dev ON dev.iso_weekday = mp.delivery_iso_weekday
            AND dev.week_index = ord.week_index + mp.week_shift
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND to_timestamp(dev.date_key, 'yyyyMMdd') >= date_add(to_timestamp(dis.theme_start_date, 'yyyy-MM-dd'), -7)
            AND dev.date_key <= '{0}'
        """.replace("\n", " ")

    onstock_order_sql = onstock_order_sql.format(end_date.strftime("%Y%m%d"))
    onstock_order_deliver_df = sqlc.sql(onstock_order_sql)

    xdock_order_sql = \
        """
        SELECT dis.item_id, dis.sub_id, dis.store_code,
            ord.date_key AS first_order_date,
            date_format(
                date_add(to_timestamp(dodm.delivery_date, 'yyyyMMdd'), xo.dc_to_store_time),
                'yyyyMMdd'
            ) AS first_delivery_date
        FROM dm_item_store dis
        JOIN vartefact.forecast_xdock_order_mapping xo ON dis.item_code = xo.item_code
            AND dis.sub_code = xo.sub_code
            AND dis.dept_code = xo.dept_code
            AND dis.store_code = xo.store_code
        JOIN vartefact.forecast_calendar ord ON ord.iso_weekday = xo.order_iso_weekday
        JOIN vartefact.forecast_dc_order_delivery_mapping dodm ON dodm.con_holding = dis.con_holding
            AND dodm.order_date = ord.date_key
            AND dis.risk_item_unilever = dodm.risk_item_unilever
        WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd')
            AND date_add(to_timestamp(dodm.delivery_date, 'yyyyMMdd'), xo.dc_to_store_time) <= to_timestamp('{0}', 'yyyyMMdd')
        """.replace("\n", " ")

    xdock_order_sql = xdock_order_sql.format(end_date.strftime("%Y%m%d"))
    xdock_order_deliver_df = sqlc.sql(xdock_order_sql)

    order_deliver_df = onstock_order_deliver_df.union(xdock_order_deliver_df)
    order_deliver_df.cache()

    first_order_df = order_deliver_df.groupBy(['item_id', 'sub_id', 'store_code']). \
        agg(F.min("first_order_date").alias("first_order_date"))

    first_order_df.cache()

    first_order_deliver_df = order_deliver_df \
        .join(first_order_df, ['item_id', 'sub_id', 'store_code', 'first_order_date'])

    dm_item_store_order_df = dm_item_store_df \
        .join(first_order_deliver_df, ['item_id', 'sub_id', 'store_code'])

    dm_item_store_order_df.createOrReplaceTempView("dm_item_store_order")

    output_line = f"Number of item stores that will have DM {dm_item_store_order_df.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    # Get DM sales prediction
    dm_sales_predict_sql = \
        """
        select dm.*,
            cast(coalesce(pred.sales_prediction, '0', pred.sales_prediction) as double) as dm_sales,
            coalesce(pred.sales_prediction, 'no', 'yes') as having_dm_prediction
        from dm_item_store_order dm
        left join vartefact.forecast_weekly_dm_view pred
            on cast(pred.item_id as int) = dm.item_id
            and cast(pred.sub_id as int) = dm.sub_id
            and cast(pred.current_dm_theme_id as int) = dm.dm_theme_id
            and pred.store_code = dm.store_code
        """.replace("\n", " ")

    dm_prediction = sqlc.sql(dm_sales_predict_sql)
    dm_prediction.createOrReplaceTempView("dm_prediction")

    dm_prediction.filter("having_dm_prediction = 'no' ") \
        .write.mode("overwrite").format("parquet") \
        .saveAsTable("vartefact.forecast_no_dm_prediction")

    output_line = f"Number of DM sales prediction {dm_prediction.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    print_output("Regular sales before DM", log_file)

    # Regular sales from the first order day to the DM start day
    dm_regular_sales_sql = \
        """
        SELECT dp.item_id, dp.sub_id, dp.store_code, dp.dm_theme_id,
            case when fcst.daily_sales_prediction_original < 0.2 and dp.rotation != 'A' then 0
                 when fcst.daily_sales_prediction_original < 0 then 0
                 else fcst.daily_sales_prediction_original
            end AS sales_prediction
        FROM vartefact.t_forecast_daily_sales_prediction fcst
        JOIN dm_prediction dp ON fcst.item_id = dp.item_id
            AND fcst.sub_id = dp.sub_id
            AND fcst.store_code = dp.store_code
            AND fcst.date_key > dp.first_delivery_date
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') < to_timestamp(dp.theme_start_date, 'yyyy-MM-dd')
        """.replace("\n", " ")

    dm_regular_sales = sqlc.sql(dm_regular_sales_sql)

    agg_dm_regular_sales = dm_regular_sales.groupBy(['item_id', 'sub_id', 'store_code', 'dm_theme_id']). \
        agg(F.sum("sales_prediction").alias("regular_sales_before_dm"))

    dm_with_regular = dm_prediction.join(
        agg_dm_regular_sales,
        ['item_id', 'sub_id', 'store_code', 'dm_theme_id'], "left")

    # For ppp <= 90% npp, get 4 weeks of after-DM sales for rotation A items
    print_output("DM PPP logic", log_file)

    after_fourweek_sql = \
        """
        SELECT dp.item_id, dp.sub_id, dp.store_code, dp.dm_theme_id,
            case when fcst.daily_sales_prediction_original < 0.2 and dp.rotation != 'A' then 0
                 when fcst.daily_sales_prediction_original < 0 then 0
                 else fcst.daily_sales_prediction_original
            end AS sales_prediction
        FROM dm_prediction dp
        JOIN vartefact.t_forecast_daily_sales_prediction fcst ON fcst.item_id = dp.item_id
            AND fcst.sub_id = dp.sub_id
            AND fcst.store_code = dp.store_code
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') > to_timestamp(dp.theme_end_date, 'yyyy-MM-dd')
            AND to_timestamp(fcst.date_key, 'yyyyMMdd') < date_add(to_timestamp(dp.theme_end_date, 'yyyy-MM-dd'), 28)
        WHERE dp.rotation = 'A'
            AND dp.ppp <= dp.npp * 0.9
        """.replace("\n", " ")

    after_fourweek_sales = sqlc.sql(
        after_fourweek_sql.format(run_date.strftime("%Y%m%d")))

    agg_after_fourweek_sales = after_fourweek_sales.groupBy(['item_id', 'sub_id', 'store_code', 'dm_theme_id']). \
        agg(F.sum("sales_prediction").alias("four_weeks_after_dm"))

    output_line = f"Number of DM having PPP {agg_after_fourweek_sales.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line + ","

    dm_with_fourweek = dm_with_regular.join(
        agg_after_fourweek_sales,
        ['item_id', 'sub_id', 'store_code', 'dm_theme_id'], "left")

    # Fill NA
    dm_with_fourweek = dm_with_fourweek.na.fill(0)
    dm_with_fourweek.cache()

    output_line = f"Number of DM store orders {dm_with_fourweek.count()}"
    print_output(output_line, log_file)
    output_str = output_str + output_line

    # Final calculation
    print_output("Calculate order quantity", log_file)

    dm_final = dm_with_fourweek.withColumn(
        "dm_order_qty_without_pcb",
        dm_with_fourweek.regular_sales_before_dm +
        dm_with_fourweek.four_weeks_after_dm + dm_with_fourweek.dm_sales)

    dm_final = dm_final \
        .withColumn("first_dm_order_qty_without_pcb",
                    F.when(dm_final.rotation != 'X',
                           0.75 * dm_final.dm_order_qty_without_pcb)
                    .otherwise(dm_final.dm_order_qty_without_pcb))

    dm_final = dm_final \
        .withColumn("first_dm_order_qty",
                    F.when(dm_final.first_dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_final.first_dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb)
                    .otherwise(int(0)))

    dm_final_pcb = dm_final \
        .withColumn("dm_order_qty",
                    F.when(dm_final.dm_order_qty_without_pcb > 0.0,
                           F.ceil(dm_final.dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb)
                    .otherwise(int(0)))

    dm_final_pcb.createOrReplaceTempView("dm_final_pcb")

    print_output("Write store order to datalake", log_file)

    dm_sql = \
        """
        INSERT INTO vartefact.forecast_dm_orders PARTITION (dm_theme_id)
        SELECT item_id, sub_id, store_code, con_holding,
            theme_start_date, theme_end_date,
            npp, ppp, ppp_start_date, ppp_end_date,
            city_code, dept_code, item_code, sub_code, pcb,
            dc_supplier_code, ds_supplier_code, rotation, run_date,
            first_order_date, first_delivery_date,
            regular_sales_before_dm, four_weeks_after_dm, dm_sales,
            dm_order_qty, first_dm_order_qty, dm_order_qty_without_pcb,
            dm_theme_id
        FROM dm_final_pcb
        """.replace("\n", " ")

    if dm_item_store_cnt > 0:
        sqlc.sql(dm_sql)
        sqlc.sql("refresh table vartefact.forecast_dm_orders")

    print_output("Finish writing store order to datalake", log_file)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, avg, ceil
import string

spark = SparkSession.builder.master("local[2]").appName("Ratings").getOrCreate()

df = spark.read.csv("ratings.csv")
dfsel = df.select(col("_c1").alias("id"), col("_c2").alias("rating"))
df_avg = dfsel.groupBy("id").agg(avg(col("rating")))
df_final = df_avg.select(
    ceil(col("avg(rating)")).alias("RatingRange"),
    col("id")).sort("RatingRange", ascending=True).rdd
rdd = df_final.map(lambda x: ("Range " + str(x["RatingRange"]), x["id"]))
rdd.saveAsTextFile("output4.txt")
def compile_ceil(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.ceil(src_column)
def process_data(self, study_dt):
    ##############################################################################
    # DECLARE VARIABLES
    ##############################################################################
    dt_range = self.study_dates(study_dt)
    dt = dt_range

    s1_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
    s1_initial_bucket_depth = 'cuebiq/daily-feed/US/'
    s1_bucket_output = 'cuebiq/daily-feed-reduced/US/'
    s2_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata'
    s2_initial_bucket_depth = 'cuebiq/daily-feed-reduced/US/'
    s2_bucket_output = 'cuebiq/processed-data/US/micro-clusters/'
    anchor_dist = 430
    time_thresh = 28800
    part_num = 9

    gps_schema = StructType([
        StructField("utc_timestamp", IntegerType(), True),
        StructField("device_id", StringType(), True),
        StructField("os", IntegerType(), True),
        StructField("latitude", FloatType(), True),
        StructField("longitude", FloatType(), True),
        StructField("accuracy", IntegerType(), True),
        StructField("tz_offset", IntegerType(), True)
    ])

    s2_gps_schema = StructType([
        StructField("utc_timestamp", IntegerType(), True),
        StructField("device_id", StringType(), True),
        StructField("os", IntegerType(), True),
        StructField("latitude", FloatType(), True),
        StructField("longitude", FloatType(), True),
        StructField("accuracy", IntegerType(), True),
        StructField("tz_offset", IntegerType(), True),
        StructField("row_number", IntegerType(), True)
    ])

    ##############################################################################
    # WINDOWS
    ##############################################################################
    w = Window().partitionBy('device_id').orderBy('utc_timestamp')
    l = Window().partitionBy('device_id', 'lin_grp').orderBy('utc_timestamp')
    w2 = Window().partitionBy('device_id').orderBy('row_number')

    ##############################################################################
    # BEGIN DAILY ITERATION
    ##############################################################################
    print("Reading in files for {}".format(str(dt['study_dt'])[:10]))
    print("s3://{}/{}[{}|{}|{}]/*.gz".format(s1_bucket_name, s1_initial_bucket_depth,
                                             dt['s3_before'], dt['s3_study_dt'],
                                             dt['s3_after']))
    print("")

    #################################################################################################
    # START STEP 1
    #################################################################################################
    df1 = dataFrameReader \
        .options(header='false', delimiter='\t', codec='gzip') \
        .schema(gps_schema) \
        .format("csv") \
        .load("/opt/spark/sample_data/daily-feed/US/2020729*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_before'] + "/*.gz")  # the day before

    df2 = dataFrameReader \
        .options(header='false', delimiter='\t', codec='gzip') \
        .schema(gps_schema) \
        .format("csv") \
        .load("/opt/spark/sample_data/daily-feed/US/2020730*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_study_dt'] + "/*.gz")  # actual study date

    df3 = dataFrameReader \
        .options(header='false', delimiter='\t', codec='gzip') \
        .schema(gps_schema) \
        .format("csv") \
        .load("/opt/spark/sample_data/daily-feed/US/2020731*/*.csv.gz")
        #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_after'] + "/*.gz")  # the day after

    # Union data from the three inputs into one dataframe
    df = df1.union(df2).union(df3) \
        .repartition(part_num, 'device_id')

    del df1
    del df2
    del df3

    ##############################################################################
    # FILTER INITIAL JUNK RECORDS
    # Removes duplicated records (based on time and id), poor accuracy,
    # bad coordinates, and timestamps outside of the study range
    ##############################################################################
    df = df.na.drop(subset=['latitude', 'longitude', 'tz_offset', 'accuracy']) \
        .filter(((df['accuracy'] >= 5) & (df['accuracy'] <= 65))
                & ((~(df['latitude'] == 0)) | ~(df['longitude'] == 0))
                & (df['utc_timestamp'] + df['tz_offset'])
                .between(dt['utc_study_dt'], dt['utc_after'])) \
        .dropDuplicates(['utc_timestamp', 'device_id'])

    ##############################################################################
    # EXCESSIVE SPEED REMOVAL
    ##############################################################################
    df = df.withColumn('dist_to', distance(df['latitude'], df['longitude'],
                                           lead(df['latitude'], 1).over(w),
                                           lead(df['longitude'], 1).over(w))) \
        .withColumn('sec_to', (lead(df['utc_timestamp'], 1).over(w) - df['utc_timestamp'])) \
        .withColumn('speed_to', rate_of_speed(col('dist_to'), col('sec_to'), 'hour')) \
        .withColumn('dist_from', lag(col('dist_to'), 1).over(w)) \
        .withColumn('sec_from', lag(col('sec_to'), 1).over(w)) \
        .withColumn('speed_from', lag(col('speed_to'), 1).over(w)) \
        .filter(((col('dist_to').isNull()) | (col('dist_from').isNull()))
                | ((((col('speed_from') + col('speed_to')) / 2) <= 90)
                   | ((col('dist_to') >= 150) | (col('dist_from') >= 150)))
                & ((col('speed_from') < 600) & (col('speed_to') < 600))
                & ((col('speed_from') < 20) | (col('speed_to') < 20))) \
        .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

    ##############################################################################
    # LINEAR TRAVEL PING REMOVAL
    # Break pings out into groups of 4 to measure the linear distance
    ##############################################################################
    # Assign a record number, linear grouping, and lead distance
    df = df.withColumn('RecordNum', row_number().over(w)) \
        .withColumn('lin_grp', py.ceil(row_number().over(w) / 4)) \
        .withColumn('dist_to', distance(df['latitude'], df['longitude'],
                                        lead(df['latitude'], 1).over(l),
                                        lead(df['longitude'], 1).over(l), 'meters'))

    # Create aggregated table for linear groupings
    expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),
            py.max(col('utc_timestamp')).alias('max_utc_timestamp'),
            py.count(col('utc_timestamp')).alias('cnt'),
            py.sum(col('dist_to')).alias('sum_dist'),
            py.min(col('dist_to')).alias('min_dist')]

    dfl_grp = df.groupBy('device_id', 'lin_grp').agg(*expr)

    dfl_grp.createOrReplaceTempView('dfl_grp')
    df.createOrReplaceTempView('dfl')

    # Grab just the first and last records in each linear grouping and append aggregated info
    dfls = spark.sql(
        "SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
        a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
        FROM dfl as a INNER JOIN dfl_grp as b \
        ON a.device_id = b.device_id \
        AND a.lin_grp = b.lin_grp \
        AND a.utc_timestamp = b.min_utc_timestamp \
        UNION ALL \
        SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \
        a.lin_grp, b.sum_dist, b.min_dist, b.cnt \
        FROM dfl as a INNER JOIN dfl_grp as b \
        ON a.device_id = b.device_id \
        AND a.lin_grp = b.lin_grp \
        AND a.utc_timestamp = b.max_utc_timestamp")

    # Measure the distance between the first and last ping in each linear grouping and
    # compare it to the summed distance of all points.
    # Only keep groups that meet the criteria for being a straight line.
    df_j = dfls.withColumn('strt_dist', distance(dfls['latitude'], dfls['longitude'],
                                                 lead(dfls['latitude'], 1).over(l),
                                                 lead(dfls['longitude'], 1).over(l), 'meters')) \
        .withColumn('lin', col('strt_dist') / dfls['sum_dist']) \
        .na.drop(subset=['strt_dist']) \
        .filter((dfls['min_dist'] > 0)
                & (col('strt_dist').between(150, 2000))
                & (dfls['cnt'] == 4)
                & (col('lin') >= .99825)) \
        .select('device_id', 'lin_grp', 'lin')

    # Outer join main dataframe to linear groups to filter non-linear pings
    df = df.join(df_j, ['device_id', 'lin_grp'], how='left_outer') \
        .filter(col('lin').isNull()) \
        .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset')

    del dfl_grp
    del dfls
    del df_j

    #######################################
    # CHAIN
    # Calculate the dynamic chain threshold to find proximate ping relationships
    #######################################
    df = df.withColumn('chain_dist', (((df['accuracy'] + lead(df['accuracy'], 1).over(w)) - 10) * (230 / 120) + 200)) \
        .withColumn('chain', when((distance(df['latitude'], df['longitude'],
                                            lead(df['latitude'], 1).over(w),
                                            lead(df['longitude'], 1).over(w), 'feet')) <= col('chain_dist'), 1)
                    .when((distance(df['latitude'], df['longitude'],
                                    lag(df['latitude'], 1).over(w),
                                    lag(df['longitude'], 1).over(w), 'feet')) <= lag(col('chain_dist'), 1).over(w), 1)) \
        .filter(col('chain') == 1) \
        .withColumn('row_number', row_number().over(w)) \
        .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset', 'row_number') \
        .persist()

    df \
        .repartition(100, 'device_id').sortWithinPartitions('device_id', 'row_number') \
        .write \
        .csv(path="/opt/spark/sample_data/daily-feed-reduced/" + dt['s3_study_dt'],
             mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s1_bucket_name + '/' + s1_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

    ##############################################################################################
    # START STEP 2
    ##############################################################################################
    print('Begin micro-clustering')

    # INITIALIZE ANCHOR TABLE - Create initial anchor start points based on row number = 1 and distance threshold
    self.df_dist = df.withColumn('tz_timestamp', df['utc_timestamp'] + df['tz_offset']) \
        .withColumn('anchor', when(df['row_number'] == 1, col('tz_timestamp'))
                    .when(distance(df['latitude'], df['longitude'],
                                   lag(df['latitude'], 1).over(w2),
                                   lag(df['longitude'], 1).over(w2), 'feet') >= anchor_dist,
                          col('tz_timestamp'))
                    .when(col('tz_timestamp') - lag(col('tz_timestamp'), 1).over(w2) >= time_thresh,
                          col('tz_timestamp'))) \
        .select('tz_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'row_number', 'anchor') \
        .repartition(part_num, 'device_id') \
        .persist()

    # Materialize the table for caching
    print('df_dist starting count = {}'.format(self.df_dist.count()))

    df.unpersist()
    del df

    #####################################################################################################
    # ITERATE THROUGH DATAFRAME ANCHOR PROCESS - iterations are broken out to speed up checkpointing
    # Checkpointing is used to chop off the physical plans of the dataframes that grow with each iteration
    ######################################################################################################
    df_anchor1 = self.anchor_func(3, 3)
    df_anchor2 = self.anchor_func(5, 5)
    df_anchor3 = self.anchor_func(12, 6)
    df_anchor4 = self.anchor_func(20, 5)
    df_anchor5 = self.anchor_func(30, 5)
    df_anchor6 = self.anchor_func(50, 5)
    df_anchor7 = self.anchor_func(80, 5, 1000000)
    df_anchor8 = self.anchor_func(1000, 5, 1000000)

    ##################################################################################################
    # Collect remaining pings to the driver for Python analysis
    print('collect remaining pings')
    anchor_list = self.df_dist.rdd.map(
        lambda row: {'timestamp': row[0], 'device_id': row[1],
                     'latitude': row[3], 'longitude': row[4],
                     'anchor': row[7]}).collect()

    # Sort elements in the list by device_id and timestamp
    anchor_list.sort(key=operator.itemgetter('device_id', 'timestamp'))

    # Python analysis on the driver of the final remaining pings
    print('iterate through remaining pings on driver')
    anchor_dr = []
    for r in anchor_list:
        if r['anchor'] is not None:
            anchor_dr.append(r)
        else:
            if anchor_dr[-1]['device_id'] == r['device_id']:
                if distance_dr(r['latitude'], r['longitude'],
                               anchor_dr[-1]['latitude'],
                               anchor_dr[-1]['longitude'], 'feet') <= anchor_dist \
                        and r['timestamp'] - anchor_dr[-1]['timestamp'] < time_thresh:
                    anchor_dr.append({'timestamp': r['timestamp'],
                                      'device_id': r['device_id'],
                                      'latitude': anchor_dr[-1]['latitude'],
                                      'longitude': anchor_dr[-1]['longitude'],
                                      'anchor': anchor_dr[-1]['anchor']})
                else:
                    r['anchor'] = r['timestamp']
                    anchor_dr.append(r)

    # Condense the result table for dataframe distribution
    print('generate driver anchor table')
    new_anchor = []
    for r in anchor_dr:
        new_anchor.append([r['timestamp'], r['device_id'], r['anchor']])

    # Bring driver results back into a distributed dataframe and join the results
    print('disperse driver anchor table back to cluster')
    new_anchor_schema = StructType([
        StructField('tz_timestamp', IntegerType(), True),
        StructField('device_id', StringType(), True),
        StructField('anchor', IntegerType(), True)
    ])

    df_anchor_dr = spark.createDataFrame(new_anchor, new_anchor_schema) \
        .repartition(part_num, 'device_id')

    # Join remaining anchors to the main analysis table
    self.df_dist = self.df_dist.select('tz_timestamp', 'device_id', 'os', 'latitude',
                                       'longitude', 'accuracy', 'row_number') \
        .join(df_anchor_dr, ['tz_timestamp', 'device_id'])

    # Union all anchor tables together and sort
    print('finalizing anchor results into central table')
    df_anchors_fnl = df_anchor1.union(df_anchor2).union(df_anchor3).union(df_anchor4).union(df_anchor5) \
        .union(df_anchor6).union(df_anchor7).union(df_anchor8).union(self.df_dist) \
        .repartition(part_num, 'device_id') \
        .persist()

    self.df_dist.unpersist()

    #######################################################################################
    # Calculate centroids
    #######################################################################################
    print('start calculating centroids')

    # Get the max accuracy value for each micro-cluster and filter clusters with fewer than 2 pings
    df_anchor_grp = df_anchors_fnl.groupBy('device_id', 'anchor') \
        .agg(*[py.max(col('accuracy')).alias('max_accuracy'),
               py.count(col('tz_timestamp')).alias('cnt')]) \
        .withColumn('max_acc_1', col('max_accuracy') + 1) \
        .filter(col('cnt') > 1) \
        .select('device_id', 'anchor', 'max_acc_1', 'cnt')

    # Calculate the nominator for each micro-cluster
    df_anchors_fnl = df_anchors_fnl.join(df_anchor_grp, ['device_id', 'anchor']) \
        .withColumn('nom', col('max_acc_1') - col('accuracy'))

    df_denom = df_anchors_fnl.groupBy('device_id', 'anchor') \
        .agg(*[py.sum(col('nom')).alias('denom')])

    df_anchors_fnl = df_anchors_fnl.join(df_denom, ['device_id', 'anchor']) \
        .withColumn('weight', df_anchors_fnl['nom'] / df_denom['denom']) \
        .withColumn('lat', df_anchors_fnl['latitude'] * col('weight')) \
        .withColumn('lon', df_anchors_fnl['longitude'] * col('weight'))

    expr = [py.sum(col('lat')).alias('new_latitude'),
            py.sum(col('lon')).alias('new_longitude'),
            py.avg(col('latitude')).alias('avg_latitude'),
            py.avg(col('longitude')).alias('avg_longitude'),
            py.count(col('tz_timestamp')).alias('cluster_png_cnt'),
            py.first(col('os')).alias('os'),
            py.min(col('tz_timestamp')).alias('start_timestamp'),
            py.max(col('tz_timestamp')).alias('end_timestamp'),
            py.avg(col('accuracy')).alias('avg_accuracy')]

    df_micro = df_anchors_fnl.groupBy('device_id', 'anchor').agg(*expr) \
        .withColumn('fnl_lat', (col('new_latitude') * (3 / 4)) + (col('avg_latitude') * (1 / 4))) \
        .withColumn('fnl_lon', (col('new_longitude') * (3 / 4)) + (col('avg_longitude') * (1 / 4))) \
        .withColumn('geohash9', geohash_udf_9(col('fnl_lat'), col('fnl_lon'))) \
        .withColumn('dwell_seconds', col('end_timestamp') - col('start_timestamp')) \
        .withColumn('start_tm', py.from_unixtime(col('start_timestamp'))) \
        .withColumn('end_tm', py.from_unixtime(col('end_timestamp'))) \
        .filter(col('dwell_seconds') > 1) \
        .select('device_id', 'os', 'start_tm', 'end_tm', 'dwell_seconds', 'cluster_png_cnt',
                col('fnl_lat').alias('latitude'), col('fnl_lon').alias('longitude'),
                'geohash9', 'avg_accuracy')

    df_micro \
        .repartition(100, 'device_id').sortWithinPartitions('device_id', 'start_tm') \
        .write \
        .csv(path="/opt/spark/sample_data/processed-data/" + dt['s3_study_dt'],
             mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s2_bucket_name + '/' + s2_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

    df_anchors_fnl.unpersist()

    return
    .addGrid(gbt.maxIter, [20, 50, 100])\
    .addGrid(gbt.stepSize, [0.1, 0.2])\
    .build()

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=5)
model = cv.fit(addingColTraining)
bestFitness = max(model.avgMetrics)
print('best fitness = ', bestFitness)
bestModel = model.bestModel
# bestModel.save('trainning_model_version3')
# model = PipelineModel.load('trainning_model2')
print("type model = ", bestModel)
print(bestModel.stages[2].explainParam('maxDepth'))

filePath = 'test.csv'
customSchema = StructType([StructField('PassengerId', IntegerType(), False),
                           StructField('PClass', StringType(), True),
                           StructField('Name', StringType(), False),
                           StructField('Sex', StringType(), True),
                           StructField('Age', FloatType(), True),
                           StructField('SibSb', StringType(), True),
                           StructField('Parch', StringType(), True),
                           StructField('Ticket', StringType(), True),
                           StructField('Fare', FloatType(), True),
                           StructField('Cabin', StringType(), True),
                           StructField('Embarked', StringType(), True)])

rawTesting = spark.read.csv(filePath, header=True, schema=customSchema)
selectedTesting = rawTesting.select('PassengerId', 'PClass', 'Sex', 'Age', 'Fare')
addingColTesting = selectedTesting \
    .withColumn('Missing_Age', selectedTesting['Age'].isNull()) \
    .withColumn('Missing_Fare', selectedTesting['Fare'].isNull())

result = model.transform(addingColTesting) \
    .select('PassengerId', ceil(col('prediction')).alias('Survived'))
result.write.csv('output_version_rf.csv', header=True, mode='overwrite')
def get_faces(annotate_host_probability=True, annotate_in_commerical=True):
    global _faces_cached
    if annotate_host_probability and annotate_in_commerical:
        if _faces_cached is not None:
            return _faces_cached

    faces = spark.load('query_face').alias('faces')
    videos = get_videos()
    frames = get_frames()
    haircolors = get_hair_colors()
    hairlengths = get_hair_lengths()
    clothing = get_clothing()

    faces = faces.join(
        frames, faces.frame_id == frames.id
    ).join(
        videos, frames.video_id == videos.id
    ).where(
        (videos.corrupted == False) & (videos.duplicate == False)
    ).join(
        haircolors.where(haircolors.labeler_id == Labeler.objects.get(name='haotian-hairstyle').id),
        faces.id == haircolors.face_id, 'left_outer'
    ).join(
        hairlengths.where(hairlengths.labeler_id == Labeler.objects.get(name='haotian-hairstyle').id),
        faces.id == hairlengths.face_id, 'left_outer'
    ).join(
        clothing.where(clothing.labeler_id == Labeler.objects.get(name='haotian-clothing').id),
        faces.id == clothing.face_id, 'left_outer'
    ).select(
        'faces.*',
        videos.show_id,
        videos.canonical_show_id,
        videos.channel_id,
        videos.time,
        videos.fps,
        videos.week_day,
        videos.threeyears_dataset,
        frames.video_id,
        frames.number,
        haircolors.color_id.alias('haircolor_id'),
        clothing.clothing_id.alias('clothing_id'),
        hairlengths.length_id.alias('hairlength_id')
    ).where(
        ((videos.threeyears_dataset == True) & (frames.number % func.floor(videos.fps * 3) == 0)) |
        ((videos.threeyears_dataset == False) & (frames.number % func.ceil(videos.fps * 3) == 0))
    )

    faces = faces.withColumn('height', faces.bbox_y2 - faces.bbox_y1)
    faces = faces.withColumn('width', faces.bbox_x2 - faces.bbox_x1)
    faces = faces.withColumn('area', faces.height * faces.width)
    faces = faces.withColumn('duration', func.lit(3))
    faces = faces.withColumn('min_frame', faces.number)
    faces = faces.withColumn('max_frame', faces.number + func.floor(faces.fps * 3) - 1)
    faces = _annotate_hour(faces)

    if annotate_in_commerical:
        faces = _annotate_in_commercial(faces)

    if annotate_host_probability:
        host_probs = get_host_probs()
        faces = faces.join(
            host_probs, faces.id == host_probs.face_id, 'left_outer'
        ).select(*faces.columns, host_probs.host_probability)
        faces = faces.na.fill({'host_probability': 0.})

    if annotate_host_probability and annotate_in_commerical:
        _faces_cached = faces

    return faces
kernal = data_kernal_group.join(data_kernal_mean,
                                data_kernal_group.date == data_kernal_mean.date) \
    .drop(data_kernal_mean.date)
kernal = kernal.withColumn('dates', F.date_format('date', 'yyyy-MM-dd')) \
    .withColumn('hours', F.date_format('date', 'HH'))

# Aggregate time into one-minute intervals and join all the features

# assign a time scale in order to aggregate data into it
# time_interval = 60
# start_timestep = 1435708800 - 7200  # 2015-07-01 00:00:00, 2 hours difference
# data = (data
#         .withColumn('timestep', F.ceil((F.unix_timestamp('dt') - sc._jsc.startTime()) / time_interval))
#         .drop('radar_id')
#         )

# assign each location to its cell index
track_grid_x = track_grid.withColumn(
    'x_categories', F.ceil((F.col('position_x') - min_lon) / interval_lon))
data = track_grid_x.withColumn(
    'y_categories', F.ceil((F.col('position_y') - min_lat) / interval_lat))
data = data.fillna(0).drop('radar_id')
data = data.withColumn("location_index",
                       F.concat(data.y_categories, data.x_categories)) \
    .drop('x_categories').drop('y_categories')

# join the attribute features
data_count = data.groupBy('location_index', 'dt').count()
attribute = data.groupBy('location_index', 'dt').mean(
    'position_x', 'position_y', 'velocity', 'airspeed', 'heading',
    'heading_vertical', 'peak_mass', 'mass', 'mass_correction')
cond = [data_count.location_index == attribute.location_index,
        data_count.dt == attribute.dt]
data_grid = (attribute.join(data_count, cond, 'inner')
             .drop(attribute.dt)
             .drop(attribute.location_index))
def linear_filter(self):
    print("\n_______________________________________________\nLINEAR MOVEMENT FILTER\n\n")

    init_cnt = self.df.count()

    # Create various partitions and sortings for downstream window functions
    w = Window().partitionBy('device_id', 'study_dt').orderBy('utc_timestamp')
    l = Window().partitionBy('device_id', 'study_dt', 'lin_grp').orderBy('utc_timestamp')

    # Number of pings to analyze in a group to determine linearity
    lgrp = 4

    self.df = self.df.withColumn('RecordNum', row_number().over(w)) \
        .withColumn('lin_grp', py.ceil(row_number().over(w) / lgrp)) \
        .withColumn('dist_to', distance(self.df['latitude'], self.df['longitude'],
                                        lead(self.df['latitude'], 1).over(l),
                                        lead(self.df['longitude'], 1).over(l), 'meters')) \
        .withColumn('sequence', row_number().over(l))

    # Create aggregated table for linear groupings
    expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),
            py.max(col('utc_timestamp')).alias('max_utc_timestamp'),
            py.count(col('utc_timestamp')).alias('cnt'),
            py.sum(col('dist_to')).alias('sum_dist'),
            py.min(col('dist_to')).alias('min_dist')]

    # Measure the distance between the first and last ping in each linear grouping
    # and compare it to the summed distance of all points
    df_grp = self.df.groupBy('device_id', 'study_dt', 'lin_grp').agg(*expr)
    df_l = self.df.filter(self.df['sequence'].isin([1, lgrp])).join(
        df_grp, ['device_id', 'study_dt', 'lin_grp'])

    # Only keep groups that meet the criteria for being a straight line
    df_j = df_l.withColumn('strt_dist', distance(df_l['latitude'], df_l['longitude'],
                                                 lead(df_l['latitude'], 1).over(l),
                                                 lead(df_l['longitude'], 1).over(l), 'meters')) \
        .withColumn('lin', col('strt_dist') / df_l['sum_dist']) \
        .na.drop(subset=['strt_dist']) \
        .filter((df_l['min_dist'] > 0)
                & (col('strt_dist').between(150, 2000))
                & (df_l['cnt'] == 4)
                & (col('lin') >= .99825)) \
        .select('device_id', 'lin_grp', 'lin')

    # Outer join main dataframe to linear groups to filter non-linear pings
    self.df = self.df.join(df_j, ['device_id', 'lin_grp'], how='left_outer') \
        .filter(col('lin').isNull()) \
        .drop('lin_grp', 'RecordNum', 'dist_to', 'sequence', 'lin')

    # lin_cnt = self.df.cache().count()
    lin_cnt = self.df.count()

    tbl_data = [['Initial count', init_cnt, 0, 0,
                 'Count of pings before applying linear movement filter'],
                ['Final count', lin_cnt, init_cnt - lin_cnt,
                 ((init_cnt - lin_cnt) / float(init_cnt)) * 100,
                 'Count of pings after applying linear movement filter']]

    # Display filter table
    print(tabulate(tbl_data, floatfmt=".2f",
                   headers=['Phase', 'Ping Count', 'Removed Pings',
                            'Percent Reduction', 'Description']))
for key in function_dict.keys():
    f = F.udf(function_dict[key][0], check_type(function_dict[key][0]))
    df = df.withColumn('%s' % key, f(*[F.col(x) for x in function_dict[key][1]]))
# df.show()
print(df.dtypes)

# statistics
agg_interval = 900000000  # microseconds, so 15 mins
# agg_interval = 604800000000  # 1 week in microseconds
ts_col = F.col('timestamp')
columns = df.columns[1:]
df = df.withColumn('floor', (F.floor(ts_col / agg_interval) * agg_interval)) \
       .withColumn('ceiling', (F.ceil(ts_col / agg_interval) * agg_interval)) \
       .orderBy(F.col('floor'))
# df.show()
# print(df.dtypes)

column = 'add100_sa'
print(columns)

# mean, median, std,
agg_df = df.groupBy('floor').agg(F.sum(column), F.min(column), F.max(column))
# , F.count(column), F.kurtosis(column), F.mean(column), F.skewness(column),
# F.stddev(column), F.variance(column))

# Flag NaN values per column
dropcols = agg_df.select(
    [F.when(F.isnan(F.col(c)), c).alias(c) for c in agg_df.columns])
dropcols.show()

agg_df = agg_df.drop('ceiling').dropna(how='all').drop_duplicates()
from pyspark import SparkConf, SparkContext
import pyspark.sql.functions as sf
from pyspark.sql import SparkSession

conf = SparkConf().setMaster('local[1]').setAppName('movie')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

text_file = spark.read.csv("ratings.csv", header=True)

avg_df = text_file.groupBy("movieId").agg(sf.avg("rating").alias("avg_rating"))
range_df = avg_df.groupBy(sf.ceil("avg_rating").alias("Range")) \
    .agg(sf.collect_list("movieId").alias("list_of_movieId"))
changedTypedf = range_df.withColumn(
    "list_of_movieId", range_df["list_of_movieId"].cast("string"))
changedTypedf.repartition(1).write.option("header", True).csv("output_q24")
def create_covid_time_series(spark, input_df, column_offset, total_column_name,
                             delta_column_name, include_state):
    '''
    Create and return a time series of Covid-19 data by county

    Parameters:
        spark (SparkContext): Spark context to run operations on
        input_df (DataFrame): Source Covid-19 data, either from a previous cleaning step, or loaded from disc
        total_column_name (String): Column name for the total value (case or death) in each row
        delta_column_name (String): Column name for the delta value (case or death) in each row compared to the previous day
        include_state (Boolean): Do we want to include the state column in this dataframe?

    Returns:
        output_df (DataFrame): Spark dataframe containing a time series of Covid-19 data over time by county
    '''
    print(f"Started creating Covid-19 time series data for '{total_column_name}' and '{delta_column_name}'")

    unix_time = pd.Timestamp("1970-01-01")
    second = pd.Timedelta('1s')
    date_list = [(pd.to_datetime(c) - unix_time) // second
                 for c in input_df.columns[column_offset:]]

    time_data_columns = input_df.columns[column_offset:]
    time_data_columns.insert(0, 'fips')
    if include_state:
        time_data_columns.insert(1, 'state')

    time_series = []

    def extract_county_data_including_state(row):
        fips = time_data_columns[0]
        state = time_data_columns[1]
        for i in range(2, len(time_data_columns)):
            time_series.append((row[fips], row[state], date_list[i - 2],
                                row[time_data_columns[i]]))

    def extract_county_data_excluding_state(row):
        fips = time_data_columns[0]
        for i in range(1, len(time_data_columns)):
            time_series.append((row[fips], date_list[i - 1],
                                row[time_data_columns[i]]))

    if include_state:
        for row in input_df.collect():
            extract_county_data_including_state(row)
    else:
        for row in input_df.collect():
            extract_county_data_excluding_state(row)

    time_series_columns = ["fips", "timestamp", total_column_name]
    if include_state:
        time_series_columns.insert(1, 'state')

    output_df = spark.createDataFrame(time_series, time_series_columns)

    windowSpec = Window \
        .partitionBy(output_df['fips']) \
        .orderBy(output_df['timestamp'].asc())

    output_df = output_df.withColumn('lag', F.lag(output_df[total_column_name], 1).over(windowSpec))
    output_df = output_df.withColumn('lead', F.lead(output_df[total_column_name], 1).over(windowSpec))

    # Populate deltas
    output_df = output_df.withColumn(
        delta_column_name,
        F.when(output_df['lag'].isNull(), 0)
         .otherwise(output_df[total_column_name] - output_df['lag']))

    output_df = output_df.withColumn('next_delta', F.lead(output_df[delta_column_name], 1).over(windowSpec))

    # Fix overreporting
    output_df = output_df.withColumn(
        total_column_name,
        F.when((output_df['next_delta'] >= 0) | (output_df['lag'].isNull() | (output_df['lead'].isNull())),
               output_df[total_column_name])
         .otherwise(F.ceil((output_df['lead'] + output_df['lag']) / 2)))

    # Recalculate deltas
    output_df = output_df.withColumn('lag', F.lag(output_df[total_column_name], 1).over(windowSpec))
    output_df = output_df.withColumn(
        delta_column_name,
        F.when(output_df['lag'].isNull(), 0)
         .otherwise(output_df[total_column_name] - output_df['lag']))

    output_df = output_df.drop('lag').drop('lead').drop('next_delta')

    print(f"Finished creating Covid-19 time series data for '{total_column_name}' and '{delta_column_name}'")

    return output_df
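# A minimal usage sketch (assumed, not from the original source): one county with
# totals 1, 5, 3 over three days. The overreported middle day gets smoothed to
# ceil((1 + 3) / 2) = 2, and the recomputed deltas become 0, 1, 1.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
wide_df = spark.createDataFrame(
    [("01001", "Alabama", 1, 5, 3)],
    ["fips", "state", "1/22/20", "1/23/20", "1/24/20"])

ts = create_covid_time_series(spark, wide_df, column_offset=2,
                              total_column_name="cases",
                              delta_column_name="new_cases",
                              include_state=True)
ts.orderBy("timestamp").show()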
# try:
#     median = np.median(values_list)  # get the median of the values in the list in each row
#     return round(float(median), 2)
# except Exception:
#     return None  # if there is anything wrong with the given values
def find_median(values_list):
    median = np.median(values_list)  # get the median of the values in the list in each row
    return round(float(median), 2)


median_finder = f.udf(find_median, FloatType())

df_complete = df_complete.withColumn("Xbar", f.ceil(median_finder("Xbar")))
df_complete = df_complete.withColumn("MRbar", f.ceil(median_finder("MRbar")))

df_complete = df_complete.withColumn(
    "UCL_Individual", df_complete.Xbar + (f.lit(2.66) * df_complete.MRbar))
df_complete = df_complete.withColumn(
    "LCL_Individual", df_complete.Xbar - (f.lit(2.66) * df_complete.MRbar))


def _is_outlier(cumsum, ucl, lcl):
    if lcl <= cumsum <= ucl:
        return 0
    return 1


outlier_udf = f.udf(_is_outlier)
def parquet_to_pcd(spark, day_parquet, day_store_dir, day_base_timestamp, min_bucket=1, max_bucket=1441): def max_rows(rs): maxes = [] for feat in renamed[3:]: maxes.append(max([r[feat] for r in rs])) return maxes def create_rows(z, y, x, zc, yc, xc, zi, yi, xi, zcw, ycw, xcw, ziw, yiw, xiw, zcl, ycl, xcl, zil, yil, xil): # Choose correct maximum values, including when lead is null xcf = max(xc, xcw) if xcl is None else max(xc, xcl) ycf = max(yc, ycw) if ycl is None else max(yc, ycl) zcf = max(zc, zcw) if zcl is None else max(zc, zcl) xif = max(xi, xiw) if xil is None else max(xi, xil) yif = max(yi, yiw) if yil is None else max(yi, yil) zif = max(zi, ziw) if zil is None else max(zi, zil) # Create points fs = '{} {} {} {}\n{} {} {} {}\n{} {} {} {}\n' xf, yf, zf = float(x), float(y), float(z) xrgb = stall_to_float(xcf, xif) yrgb = stall_to_float(ycf, yif) zrgb = stall_to_float(zcf, zif) s = fs.format(xf + 0.5, yf, zf, xrgb, xf, yf + 0.5, zf, yrgb, xf, yf, zf + 0.5, zrgb) return s def with_cols(df, names, cols): for n, c in zip(names, cols): df = df.withColumn(n, c) return df def rename_cols(df, old, new): for o, n in zip(old, new): df = df.withColumnRenamed(o, n) return df def create_filestring_tup(r): l = r[1][1] * 3 g = header_fmt.format(l, l) return (r[0], g + r[1][0]) try: os.mkdir(day_store_dir) except OSError: pass # Add bucket df = spark.read.parquet(day_parquet) df = df.withColumn('bucket', F.ceil((F.col('#Time') - day_base_timestamp + 30) / 60)) # Filter df = df.where((F.col('bucket') >= min_bucket) &\ (F.col('bucket') < max_bucket)) # Max between the 2 compids and any extra readings df = df.select('bucket', *fix_stats) df = df.na.fill({k: 0 for k in fix_stats[3:]}) # fill nulls with 0 df = rename_cols(df, fix_stats, renamed) rdd = df.rdd rdd = rdd.map(lambda r: ((r['bucket'], r['Z'], r['Y'], r['X']), [r]))\ .reduceByKey(lambda a, b: a + b)\ .map(lambda kv: list(map(int, list(kv[0]) + max_rows(kv[1])))) df = spark.createDataFrame(rdd, ['bucket'] + renamed) # Add corresponding minus directions xw = Window.partitionBy('bucket', 'Z', 'Y').orderBy('X') yw = Window.partitionBy('bucket', 'Z', 'X').orderBy('Y') zw = Window.partitionBy('bucket', 'Y', 'X').orderBy('Z') wrap_names = [ 'ZC_wrap', 'YC_wrap', 'XC_wrap', 'ZI_wrap', 'YI_wrap', 'XI_wrap' ] wrap_cols = [ F.first('ZC-').over(zw), F.first('YC-').over(yw), F.first('XC-').over(xw), F.first('ZI-').over(zw), F.first('YI-').over(yw), F.first('XI-').over(xw) ] lead_names = [ 'ZC_lead', 'YC_lead', 'XC_lead', 'ZI_lead', 'YI_lead', 'XI_lead' ] lead_cols = [ F.lead('ZC-').over(zw), F.lead('YC-').over(yw), F.lead('XC-').over(xw), F.lead('ZI-').over(zw), F.lead('YI-').over(yw), F.lead('XI-').over(xw) ] df = with_cols(df, wrap_names, wrap_cols) df = with_cols(df, lead_names, lead_cols) df = df.drop(*renamed[9:]) # Calculate string str_args = renamed[:9] + wrap_names + lead_names udf_create_rows = F.udf(create_rows, StringType()) df = df.withColumn('pt_string', udf_create_rows(*str_args))\ .drop(*str_args) # Count and add headers rdd = df.rdd rdd = rdd.map(lambda r: (r['bucket'], [r['pt_string'], 1]))\ .reduceByKey(lambda s1, s2: [s1[0] + s2[0], s1[1] + s2[1]])\ .map(create_filestring_tup) file_contents = rdd.collect() for b, contents in file_contents: with open(get_bucketfile(day_store_dir, b), 'w+') as f: f.write(contents)
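# A minimal hedged sketch of the minute bucketing used above:
# bucket = ceil((#Time - day_base_timestamp + 30) / 60), i.e. readings are
# grouped into one-minute buckets offset by 30 seconds. Assumes a SparkSession
# named `spark` is available; the timestamps are illustrative only.
from pyspark.sql import functions as F

base = 1600000000  # stand-in for day_base_timestamp
toy = spark.createDataFrame([(base + 10,), (base + 45,), (base + 95,)], ["#Time"])
toy = toy.withColumn("bucket", F.ceil((toy["#Time"] - base + 30) / 60))
toy.show()  # offsets 10 and 45 fall in buckets 1 and 2, offset 95 in bucket 3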
import shutil
from pyspark.sql.functions import avg, ceil, collect_list

df = spark.read.csv('ratings.csv', header=True)

# Remove the directory where results will be stored.
shutil.rmtree('output4', ignore_errors=True, onerror=None)

# Select the two columns needed in the exercise.
(df.select('movieId', 'rating')
 # Change rating type to float.
 .withColumn('rating', df['rating'].cast('float'))
 # Group by movie id, calculating the average of the ratings.
 .groupBy('movieId').agg(avg('rating'))
 # Create a new column with the range corresponding to each film, which is the ceiling of
 # its average rating.
 .withColumn('Range', ceil('avg(rating)'))
 # Select only the range and the movie id.
 .select('Range', 'movieId')
 # Group films by range, collecting all movie ids in each range into a list.
 .groupBy('Range').agg(collect_list('movieId').alias('ids'))
 # Sort by range (done only to make the results easier to read).
 .sort('Range')
 # Create an RDD from the DataFrame and reduce the number of partitions to one so the
 # results are stored in a single file (otherwise Spark creates 199 output files).
 .rdd.coalesce(1).saveAsTextFile('output4'))
#bestModel.save('trainning_model_version3')
#model = PipelineModel.load('trainning_model2')

print("type model = ", bestModel)
print(bestModel.stages[2].explainParam('maxIter'))
print(bestModel.stages[2].explainParam('regParam'))
print(bestModel.stages[2].explainParam('elasticNetParam'))

filePath = 'test.csv'
customSchema = StructType([
    StructField('PassengerId', IntegerType(), False),
    StructField('PClass', StringType(), True),
    StructField('Name', StringType(), False),
    StructField('Sex', StringType(), True),
    StructField('Age', FloatType(), True),
    StructField('SibSp', StringType(), True),
    StructField('Parch', StringType(), True),
    StructField('Ticket', StringType(), True),
    StructField('Fare', FloatType(), True),
    StructField('Cabin', StringType(), True),
    StructField('Embarked', StringType(), True)
])

rawTesting = spark.read.csv(filePath, header=True, schema=customSchema)
selectedTesting = rawTesting.select('PassengerId', 'PClass', 'Sex', 'Age', 'Fare')
addingColTesting = selectedTesting.withColumn(
    'Missing_Age', selectedTesting['Age'].isNull()).withColumn(
    'Missing_Fare', selectedTesting['Fare'].isNull())

# Use the tuned model from cross validation (the PipelineModel load above is commented out).
result = bestModel.transform(addingColTesting).select(
    'PassengerId', ceil(col('prediction')).alias('Survived'))
result.write.csv('output_version3.csv', header=True, mode='overwrite')
def usage(transform_context, record_store_df):
    """Component which groups record store records by the provided group-by
    column list, sorts within each group by the event timestamp field,
    applies the group stats udf and returns the latest quantity as an
    instance usage dataframe.

    This component groups records by event_type (a.k.a. metric name) and
    expects two kinds of records in the record_store data:

    total quantity records - the total available quantity,
                             e.g. cpu.total_logical_cores
    idle perc records - the percentage that is idle, e.g. cpu.idle_perc

    To calculate the utilized quantity this component uses the following
    formula:

    utilized quantity = ceil((100 - idle_perc) * total_quantity / 100)
    """
    sql_context = SQLContext.getOrCreate(record_store_df.rdd.context)
    transform_spec_df = transform_context.transform_spec_df_info

    # get rollup operation (sum, max, avg, min)
    agg_params = transform_spec_df.select(
        "aggregation_params_map.usage_fetch_operation"). \
        collect()[0].asDict()
    usage_fetch_operation = agg_params["usage_fetch_operation"]

    # check if operation is valid
    if not FetchQuantityUtil. \
            _is_valid_fetch_quantity_util_operation(usage_fetch_operation):
        raise FetchQuantityUtilException(
            "Operation %s is not supported" % usage_fetch_operation)

    # get the quantities for idle perc and quantity
    instance_usage_df = FetchQuantity().usage(
        transform_context, record_store_df)

    # get aggregation period for instance usage dataframe
    agg_params = transform_spec_df.select(
        "aggregation_params_map.aggregation_period").collect()[0].asDict()
    aggregation_period = agg_params["aggregation_period"]
    group_by_period_list = ComponentUtils.\
        _get_instance_group_by_period_list(aggregation_period)

    # get what we want to group by
    agg_params = transform_spec_df.select(
        "aggregation_params_map.aggregation_group_by_list").\
        collect()[0].asDict()
    aggregation_group_by_list = agg_params["aggregation_group_by_list"]

    # group by columns list
    group_by_columns_list = group_by_period_list + \
        aggregation_group_by_list

    # get quantity event type
    agg_params = transform_spec_df.select(
        "aggregation_params_map.usage_fetch_util_quantity_event_type").\
        collect()[0].asDict()
    usage_fetch_util_quantity_event_type = \
        agg_params["usage_fetch_util_quantity_event_type"]

    # check if driver parameter is provided
    if usage_fetch_util_quantity_event_type is None or \
            usage_fetch_util_quantity_event_type == "":
        raise FetchQuantityUtilException(
            "Driver parameter '%s' is missing" %
            "usage_fetch_util_quantity_event_type")

    # get idle perc event type
    agg_params = transform_spec_df.select(
        "aggregation_params_map.usage_fetch_util_idle_perc_event_type").\
        collect()[0].asDict()
    usage_fetch_util_idle_perc_event_type = \
        agg_params["usage_fetch_util_idle_perc_event_type"]

    # check if driver parameter is provided
    if usage_fetch_util_idle_perc_event_type is None or \
            usage_fetch_util_idle_perc_event_type == "":
        raise FetchQuantityUtilException(
            "Driver parameter '%s' is missing" %
            "usage_fetch_util_idle_perc_event_type")

    # get quantity records dataframe
    event_type_quantity_clause = "processing_meta.event_type='%s'" \
        % usage_fetch_util_quantity_event_type
    quantity_df = instance_usage_df.select('*').where(
        event_type_quantity_clause).alias("quantity_df_alias")

    # get idle perc records dataframe
    event_type_idle_perc_clause = "processing_meta.event_type='%s'" \
        % usage_fetch_util_idle_perc_event_type
    idle_perc_df = instance_usage_df.select('*').where(
        event_type_idle_perc_clause).alias("idle_perc_df_alias")

    # join quantity records with idle perc records
    # create a join condition without the event_type
    cond = [item for item in group_by_columns_list if item != 'event_type']
    quant_idle_perc_df = quantity_df.join(idle_perc_df, cond, 'left')

    #
    # Find utilized quantity based on idle percentage
    #
    # utilized quantity = (100 - idle_perc) * total_quantity / 100
    #
    quant_idle_perc_calc_df = quant_idle_perc_df.select(
        col("quantity_df_alias.*"),
        when(col("idle_perc_df_alias.quantity") != 0.0,
             ceil(((100.0 - col("idle_perc_df_alias.quantity"))) *
                  col("quantity_df_alias.quantity") / 100.0))
        .otherwise(col("quantity_df_alias.quantity"))
        .alias("utilized_quantity"),
        col("quantity_df_alias.quantity").alias("total_quantity"),
        col("idle_perc_df_alias.quantity").alias("idle_perc"))

    instance_usage_json_rdd = \
        quant_idle_perc_calc_df.rdd.map(
            FetchQuantityUtil._format_quantity_util)

    instance_usage_df = \
        InstanceUsageUtils.create_df_from_json_rdd(sql_context,
                                                   instance_usage_json_rdd)

    return instance_usage_df
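# A hedged worked example of the utilization formula implemented above,
# utilized_quantity = ceil((100 - idle_perc) * total_quantity / 100),
# using made-up numbers: 16 logical cores that are 30% idle.
import math

total_quantity = 16
idle_perc = 30.0
utilized = math.ceil((100.0 - idle_perc) * total_quantity / 100.0)
print(utilized)  # ceil(11.2) -> 12 utilized cores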
def transform_df(df): df = df.withColumn("Home Team", name_changer_udf(col("Home Team"))) \ .withColumn("Away Team", name_changer_udf(col("Away Team"))) \ .withColumn("day_of_week", date_format('date', 'E')) \ .filter(col("Play Off Game?") != "Y") \ .withColumn("night_game", when(hour(col("Kick-off (local)")) >= 17, 1).otherwise(0)) \ .withColumn("game_index", monotonically_increasing_id()) \ .withColumn("extra_time", when(col("Over Time?") == "Y", lit(1)).otherwise(lit(0))) home_games = df.select(df["Home Team"].alias("team"), df["Home Score"].alias("score"), df["Home Odds"].alias("odds"), df["Away Team"].alias("opp_team"), df["Away Score"].alias("opp_score"), df["Away Odds"].alias("opp_odds"), df["Date"].alias("date"), df["day_of_week"], df["night_game"], df["game_index"], df["extra_time"], df["Kick-off (local)"].alias("local_time")) \ .withColumn("type", lit("home")) away_games = df.select(df["Away Team"].alias("team"), df["Away Score"].alias("score"), df["Away Odds"].alias("odds"), df["Home Team"].alias("opp_team"), df["Home Score"].alias("opp_score"), df["Home Odds"].alias("opp_odds"), df["Date"].alias("date"), df["day_of_week"], df["night_game"], df["game_index"], df["extra_time"], df["Kick-off (local)"].alias("local_time")) \ .withColumn("type", lit("away")) games = home_games.union(away_games) get_record_udf = udf(get_record, IntegerType()) get_time_between_udf = udf(get_time_between, FloatType()) games = games \ .withColumn("year", year(col("date"))) \ .withColumn("points_awarded", when(col("score") > col("opp_score"), lit(2)) \ .otherwise(when(col("score") < col("opp_score"), lit(0)) \ .otherwise(lit(1)))) \ .sort(col("date"), col("local_time")) \ .withColumn("rest", datediff(col("date"), lag(col("date"), 1) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date"))))) \ .withColumn("result", when(col("score") > col("opp_score"), lit(1)) \ .otherwise(when(col("score") < col("opp_score"), lit(0)) \ .otherwise(np.nan))) \ .withColumn("win", when(col("score") > col("opp_score"), lit(1)) \ .otherwise(lit(0))) \ .withColumn("loss", when(col("score") < col("opp_score"), lit(1)) \ .otherwise(lit(0))) \ .withColumn("draw", when(col("score") == col("opp_score"), lit(1)) \ .otherwise(lit(0))) \ .withColumn("record", collect_list(col("result")) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date")))) \ .withColumn("record_date", collect_list(col("date")) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date")))) \ .withColumn("record_extra_time", collect_list(col("extra_time")) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date")))) \ .withColumn("total_points", sum(col("points_awarded")) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date"))) - col("points_awarded")) \ .withColumn("total_points_after_game", sum(col("points_awarded")) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date")))) \ .withColumn("total_for", sum(col("score")) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date"))) - col("score")) \ .withColumn("total_for_after_game", sum(col("score")) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date")))) \ .withColumn("total_against", sum(col("opp_score")) \ .over(Window. \ partitionBy(col("team"), col("year")) \ .orderBy(col("date"))) - col("opp_score")) \ .withColumn("total_against_after_game", sum(col("opp_score")) \ .over(Window. 
\ partitionBy(col("team"), col("year")) \ .orderBy(col("date")))) \ .withColumn("total_for_per_game", col("total_for") / ((count("*") .over(Window. partitionBy(col("team"), col("year")) .orderBy(col("date")))) - 1)) \ .withColumn("total_against_per_game", col("total_against") / ((count("*") .over(Window. partitionBy(col("team"), col("year")) .orderBy(col("date")))) - 1)) \ .withColumn("total_diff_per_game", col("total_for_per_game") - col("total_against_per_game")) \ .withColumn("total_diff", col("total_for") - col("total_against")) \ .withColumn("total_diff_after_game", col("total_for_after_game") - col("total_against_after_game")) \ .withColumn("time_from_last_win", get_time_between_udf(col("record"), col("record_date"), col("date"), lit("win"))) \ .withColumn("time_from_last_extra_time_game", get_time_between_udf(col("record_extra_time"), col("record_date"), col("date"), lit("win"))) \ .withColumn("time_from_last_loss", get_time_between_udf(col("record"), col("record_date"), col("date"), lit("loss"))) \ .withColumn("time_from_last_draw", get_time_between_udf(col("record"), col("record_date"), col("date"))) \ .withColumn("wins_in_a_row", get_record_udf(col("record"), lit(True))) \ .withColumn("losses_in_a_row", get_record_udf(col("record"), lit(False))) pgames = games.toPandas().sort_values( ['game_index'], ascending=False).reset_index(drop=True) for index, row in pgames.iterrows(): if (index - 16) < 0 or row['date'].year != pgames.loc[(index - 16), 'date'].year: pgames.loc[index, 'position'] = np.nan else: table = [] for reindex in range(index - 1, -1, -1): if row['date'].year != pgames.loc[reindex, 'date'].year or len( table) == 16: break elif row['game_index'] == pgames.loc[ reindex, 'game_index'] or pgames.loc[reindex, 'team'] in [ i['team'] for i in table ]: continue else: team_pos = {} team_pos['team'] = pgames.loc[reindex, 'team'] team_pos['points'] = pgames.loc[reindex, 'total_points_after_game'] team_pos['diff'] = pgames.loc[reindex, 'total_diff_after_game'] table.append(team_pos) table = sorted(table, key=itemgetter('points', 'diff'), reverse=True) info_by_team = build_dict(table, key='team') pgames.loc[index, 'position'] = info_by_team[row['team']]['index'] + 1 for index, row in pgames.iterrows(): if (index - 16) < 0 or row['date'].year != pgames.loc[(index - 16), 'date'].year: pgames.loc[index, 'position'] = np.nan else: table = [] for reindex in range(index, -1, -1): if row['date'].year != pgames.loc[reindex, 'date'].year or len( table) == 16: break elif pgames.loc[reindex, 'team'] in [i['team'] for i in table]: continue else: team_pos = {} team_pos['team'] = pgames.loc[reindex, 'team'] team_pos['points'] = pgames.loc[reindex, 'total_points_after_game'] team_pos['diff'] = pgames.loc[reindex, 'total_diff_after_game'] table.append(team_pos) table = sorted(table, key=itemgetter('points', 'diff'), reverse=True) info_by_team = build_dict(table, key='team') pgames.loc[ index, 'position_after_game'] = info_by_team[row['team']]['index'] + 1 games = op.create \ .df(pdf=pgames) \ .withColumn("ranking_quantile", when(col("position") != np.nan, ceil(col("position") / 4)) \ .otherwise(np.nan)) \ .withColumn("opp_position", sum(col("position")) \ .over(Window.partitionBy("game_index")) - col("position")) \ .withColumn("opp_ranking_quantile", when(col("opp_position") != np.nan, ceil(col("opp_position") / 4)) \ .otherwise(np.nan)) \ .withColumn("previous_opp_position", lag(col("opp_position"), 1) \ .over(Window.partitionBy("team").orderBy(col("game_index")))) \ 
.withColumn("previous_opp_ranking_quantile", when(col("previous_opp_position") != np.nan, ceil(col("previous_opp_position") / 4)) \ .otherwise(np.nan)) \ .withColumn("previous_result", lag(col("points_awarded"), 1) \ .over(Window.partitionBy("team").orderBy(col("game_index")))) \ .withColumn("previous_result_ranking", col("previous_result") * col("previous_opp_ranking_quantile")) home_games = games \ .filter(col("type") == "home") \ .select(col("team").alias("home_team"), col("odds").alias("home_odds"), col("opp_team").alias("opp_away_team"), col("score").alias("home_score"), col("date"), col("local_time"), col("day_of_week"), col("night_game"), col("game_index"), col("wins_in_a_row").alias("home_wins_in_a_row"), col("losses_in_a_row").alias("home_losses_in_a_row"), col("position").alias("home_position"), col("ranking_quantile").alias("home_ranking_quantile"), col("total_points").alias("home_points"), col("total_for_per_game").alias("home_for_per_game"), col("total_against_per_game").alias("home_against_per_game"), col("previous_result_ranking").alias("home_previous_result"), col("time_from_last_win").alias("home_time_from_last_win"), col("time_from_last_extra_time_game").alias("home_time_from_last_extra_time_game"), col("time_from_last_loss").alias("home_time_from_last_loss"), col("time_from_last_draw").alias("home_time_from_last_draw"), col("rest").alias("home_rest")) away_games = games \ .filter(col("type") == "away") \ .select(col("team").alias("away_team"), col("odds").alias("away_odds"), col("opp_team").alias("opp_home_team"), col("score").alias("away_score"), col("position").alias("away_position"), col("total_points").alias("away_points"), col("wins_in_a_row").alias("away_wins_in_a_row"), col("losses_in_a_row").alias("away_losses_in_a_row"), col("ranking_quantile").alias("away_ranking_quantile"), col("total_for_per_game").alias("away_for_per_game"), col("total_against_per_game").alias("away_against_per_game"), col("rest").alias("away_rest"), col("previous_result_ranking").alias("away_previous_result"), col("time_from_last_win").alias("away_time_from_last_win"), col("time_from_last_extra_time_game").alias("away_time_from_last_extra_time_game"), col("time_from_last_loss").alias("away_time_from_last_loss"), col("time_from_last_draw").alias("away_time_from_last_draw"), col("date").alias("away_date")) df = home_games.join(away_games, (home_games['home_team'] == away_games['opp_home_team']) & \ (home_games['opp_away_team'] == away_games['away_team']) & \ (home_games['date'] == away_games['away_date'])) \ .drop("opp_away_team", "opp_home_team", "away_date") \ .withColumn("rest_spread", col("home_rest") - col("away_rest")) \ .withColumn("game_id", monotonically_increasing_id()) \ .withColumn("home_win", when(col("home_score") > col("away_score"), lit(1)) \ .otherwise(when(col("home_score") < col("away_score"), lit(0)) \ .otherwise(np.nan))) \ .withColumn("winner", when(col("home_score") > col("away_score"), lit("home")) \ .otherwise(when(col("home_score") < col("away_score"), lit("away")) \ .otherwise("draw"))) \ .withColumn("margin", col("home_score") - col("away_score")) \ .withColumn("year", year(col("date"))) \ .withColumn("hour", hour(col("local_time"))) \ .withColumn("game_id_season", row_number().over(Window.partitionBy(col("year")).orderBy(col("date")))) \ .withColumn("first_round", when(col("game_id_season") <= 8, lit(1)).otherwise(lit(0))) \ .withColumn("second_round", when((col("game_id_season") <= 16) & (col("game_id_season") > 8), lit(1)).otherwise(lit(0))) \ 
.drop("local_time") pdf = df.toPandas() return pdf
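# A small hedged illustration of the ranking-quantile mapping used above:
# in a 16-team ladder, ceil(position / 4) buckets positions 1-4 into
# quantile 1, 5-8 into 2, 9-12 into 3 and 13-16 into 4.
import math

print([math.ceil(p / 4) for p in range(1, 17)])
# [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4]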
# # Final calculation dm_final = dm_with_fourweek.withColumn( "dm_order_qty_without_pcb", dm_with_fourweek.regular_sales_before_dm + dm_with_fourweek.four_weeks_after_dm + dm_with_fourweek.dm_sales) dm_final = dm_final \ .withColumn("first_dm_order_qty_without_pcb", F.when(dm_final.rotation != 'X', 0.75 * dm_final.dm_order_qty_without_pcb) .otherwise(dm_final.dm_order_qty_without_pcb)) dm_final = dm_final \ .withColumn("first_dm_order_qty", F.when(dm_final.first_dm_order_qty_without_pcb > 0.0, F.ceil(dm_final.first_dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb) .otherwise(0)) dm_final_pcb = dm_final \ .withColumn("dm_order_qty", F.when(dm_final.dm_order_qty_without_pcb > 0.0, F.ceil(dm_final.dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb) .otherwise(0)) # + dm_final_pcb = dm_final_pcb.withColumn( "first_dm_order_qty", dm_final_pcb["first_dm_order_qty"].cast("Int")) dm_final_pcb = dm_final_pcb.withColumn( "dm_order_qty", dm_final_pcb["dm_order_qty"].cast("Int")) # -
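# A minimal hedged sketch of the pack-size rounding used above: a raw order
# quantity is rounded up to the nearest multiple of the pack size (pcb) with
# ceil(qty / pcb) * pcb, and non-positive quantities become 0. Assumes a
# SparkSession named `spark` is available; the quantities are illustrative.
from pyspark.sql import functions as F

toy = spark.createDataFrame([(10.4, 6), (0.0, 6), (12.0, 6)],
                            ["qty_without_pcb", "pcb"])
toy = toy.withColumn(
    "order_qty",
    F.when(F.col("qty_without_pcb") > 0.0,
           F.ceil(F.col("qty_without_pcb") / F.col("pcb")) * F.col("pcb"))
     .otherwise(0))
toy.show()  # 10.4 -> 12, 0.0 -> 0, 12.0 -> 12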
def dm_order_simulation(date_str): warehouse_location = abspath('spark-warehouse') print_output( f'\n Forecast simulation process for DM start with input date {date_str} \n' ) # for logging output_str = "" info_str = f"Job start:{get_current_time()}, " spark = SparkSession.builder \ .appName("Forecast process for DM") \ .config("spark.sql.warehouse.dir", warehouse_location) \ .config("spark.driver.memory", '6g') \ .config("spark.executor.memory", '6g') \ .config("spark.num.executors", '14') \ .config("hive.exec.compress.output", 'false') \ .config("spark.sql.broadcastTimeout", 7200) \ .config("spark.sql.autoBroadcastJoinThreshold", -1) \ .enableHiveSupport() \ .getOrCreate() sc = spark.sparkContext sqlc = SQLContext(sc) print_output('Spark environment loaded') run_date = datetime.datetime.strptime(date_str, '%Y%m%d').date() # starting day of the DM calculation period start_date = run_date + timedelta(weeks=4) # end day of the DM calculation period end_date = run_date + timedelta(weeks=5) stock_date = run_date + timedelta(days=-1) parameter = "Run date:" + run_date.strftime("%Y%m%d") \ + ", DM start date:" + start_date.strftime("%Y%m%d") \ + ", DM end date:" + end_date.strftime("%Y%m%d") print_output( f"Load DM items and stores for DM that starts between {start_date} and {end_date}" ) dm_item_store_sql = \ """ SELECT distinct ndt.dm_theme_id, ndt.theme_start_date, ndt.theme_end_date, del.npp, del.ppp, del.ppp_start_date, del.ppp_end_date, del.city_code, id.store_code, del.dept_code, id.con_holding, id.risk_item_unilever, cast(id.qty_per_unit as int) as pcb, id.dc_supplier_code, id.ds_supplier_code, id.rotation, icis.item_id, icis.sub_id, icis.item_code, icis.sub_code, icis.date_key AS run_date, fdo.first_order_date AS past_result FROM vartefact.forecast_nsa_dm_extract_log del JOIN ods.nsa_dm_theme ndt ON del.dm_theme_id = ndt.dm_theme_id JOIN ods.p4md_stogld ps ON del.city_code = ps.stocity JOIN vartefact.forecast_store_item_details id ON ps.stostocd = id.store_code AND del.item_code = CONCAT ( id.dept_code, id.item_code ) AND del.sub_code = id.sub_code AND del.dept_code = id.dept_code AND id.store_status != 'Stop' AND id.item_type not in ('New','Company Purchase','Seasonal') JOIN vartefact.forecast_item_code_id_stock icis ON icis.date_key = '{0}' AND id.item_code = icis.item_code AND id.sub_code = icis.sub_code AND id.dept_code = icis.dept_code AND id.store_code = icis.store_code LEFT JOIN vartefact.forecast_simulation_dm_orders fdo ON ndt.dm_theme_id = fdo.dm_theme_id AND icis.dept_code = fdo.dept_code AND icis.item_code = fdo.item_code AND icis.sub_code = fdo.sub_code AND icis.store_code = fdo.store_code WHERE del.extract_order >= 40 AND del.date_key = '{1}' AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') >= to_timestamp('{2}', 'yyyyMMdd') AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') < to_timestamp('{3}', 'yyyyMMdd') """.replace("\n", " ") dm_item_store_sql = dm_item_store_sql.format(stock_date.strftime("%Y%m%d"), run_date.strftime("%Y%m%d"), start_date.strftime("%Y%m%d"), end_date.strftime("%Y%m%d")) # # Exclude the DM that already have orders dm_item_store_df = sqlc.sql(dm_item_store_sql) print_output( f"Number of DM item stores in date range {dm_item_store_df.count()}") print_output("Exclude the DM that already have orders") dm_item_store_df = dm_item_store_df.filter("past_result is null") output_line = f"After filtering already calculated DM {dm_item_store_df.count()}" print_output(output_line) output_str = output_str + output_line + "," # # Only consider the nearest DM 
first_dm = dm_item_store_df. \ groupBy(['item_id', 'sub_id', 'store_code']). \ agg(F.min("theme_start_date").alias("theme_start_date")) dm_item_store_df = dm_item_store_df.join( first_dm, ['item_id', 'sub_id', 'store_code', 'theme_start_date']) dm_item_store_cnt = dm_item_store_df.count() print_output(f"After getting only first DM {dm_item_store_cnt}") output_str = output_str + f"After getting only first DM {dm_item_store_cnt}," + "," if dm_item_store_cnt == 0: print_output( f"skip date {date_str} cause no active order opportunity for today" ) info_str = info_str + f"Job Finish:{get_current_time()}," info_str = info_str + f"skip date {date_str} cause no active order opportunity for today" insert_script_run(date_str, "Success", parameter, output_str, info_str, "", sqlc) return dm_item_store_df.createOrReplaceTempView("dm_item_store") # # The first order day within PPP period print_output("Get first order day within PPP period") onstock_order_sql = \ """ SELECT dis.item_id, dis.sub_id, dis.store_code, ord.date_key AS first_order_date, dev.date_key AS first_delivery_date FROM dm_item_store dis JOIN vartefact.forecast_onstock_order_delivery_mapping mp ON dis.dept_code = mp.dept_code AND dis.rotation = mp.rotation AND dis.store_code = mp.store_code JOIN vartefact.forecast_calendar ord ON ord.iso_weekday = mp.order_iso_weekday JOIN vartefact.forecast_calendar dev ON dev.iso_weekday = mp.delivery_iso_weekday AND dev.week_index = ord.week_index + mp.week_shift WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd') AND to_timestamp(dev.date_key, 'yyyyMMdd') >= date_add(to_timestamp(dis.theme_start_date, 'yyyy-MM-dd'), -7) AND dev.date_key <= '{0}' """.replace("\n", " ") onstock_order_sql = onstock_order_sql.format(end_date.strftime("%Y%m%d")) onstock_order_deliver_df = sqlc.sql(onstock_order_sql) xdock_order_sql = \ """ SELECT dis.item_id, dis.sub_id, dis.store_code, ord.date_key AS first_order_date, date_format( date_add( to_timestamp(dodm.delivery_date, 'yyyyMMdd'), xo.dc_to_store_time ), 'yyyyMMdd' ) AS first_delivery_date FROM dm_item_store dis JOIN vartefact.forecast_xdock_order_mapping xo ON dis.item_code = xo.item_code AND dis.sub_code = xo.sub_code AND dis.dept_code = xo.dept_code AND dis.store_code = xo.store_code JOIN vartefact.forecast_calendar ord ON ord.iso_weekday = xo.order_iso_weekday JOIN vartefact.forecast_dc_order_delivery_mapping dodm ON dodm.con_holding = dis.con_holding AND dodm.order_date = ord.date_key AND dis.risk_item_unilever = dodm.risk_item_unilever WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd') AND date_add(to_timestamp(dodm.delivery_date, 'yyyyMMdd'), xo.dc_to_store_time) <= to_timestamp('{0}', 'yyyyMMdd') """.replace("\n", " ") xdock_order_sql = xdock_order_sql.format(end_date.strftime("%Y%m%d")) xdock_order_deliver_df = sqlc.sql(xdock_order_sql) order_deliver_df = onstock_order_deliver_df.union(xdock_order_deliver_df) first_order_df = order_deliver_df.groupBy(['item_id', 'sub_id', 'store_code']). 
\ agg(F.min("first_order_date").alias("first_order_date")) first_order_deliver_df = order_deliver_df \ .select(['item_id', 'sub_id', 'store_code', 'first_order_date', 'first_delivery_date']) \ .join(first_order_df, ['item_id', 'sub_id', 'store_code', 'first_order_date']) dm_item_store_order_df = dm_item_store_df \ .join(first_order_deliver_df, \ ['item_id', 'sub_id', 'store_code']) dm_item_store_order_df.createOrReplaceTempView("dm_item_store_order") output_line = f"Number of item stores that will have DM {dm_item_store_order_df.count()}" print_output(output_line) output_str = output_str + output_line + "," # # Get DM sales prediction dm_sales_predict_sql = \ """ select dm.*, cast(coalesce(pred.sales_prediction, '0', pred.sales_prediction) as double) as dm_sales, coalesce(pred.sales_prediction, 'no', 'yes') as having_dm_prediction from dm_item_store_order dm left join temp.v_forecast_simulation_dm_sales_prediction pred on cast(pred.item_id as int) = dm.item_id and cast(pred.sub_id as int) = dm.sub_id and cast(pred.current_dm_theme_id as int) = dm.dm_theme_id and pred.store_code = dm.store_code """.replace("\n", " ") dm_prediction = sqlc.sql(dm_sales_predict_sql) dm_prediction.filter("having_dm_prediction = 'no' ") \ .write.mode("overwrite").format("parquet") \ .saveAsTable("vartefact.forecast_no_dm_prediction") dm_prediction.createOrReplaceTempView("dm_prediction") output_line = f"Number of DM sales prediction {dm_prediction.count()}" print_output(output_line) output_str = output_str + output_line + "," # # Regular sales from first order day to DM start day print_output("Regular sales before DM") dm_regular_sales_sql = \ """ SELECT dp.item_id, dp.sub_id, dp.store_code, dp.dm_theme_id, case when fcst.daily_sales_prediction_original < 0.2 and dp.rotation != 'A' then 0 when fcst.daily_sales_prediction_original < 0 then 0 else fcst.daily_sales_prediction_original end AS sales_prediction FROM temp.t_forecast_simulation_daily_sales_prediction fcst JOIN dm_prediction dp ON fcst.item_id = dp.item_id AND fcst.sub_id = dp.sub_id AND fcst.store_code = dp.store_code AND fcst.date_key > dp.first_delivery_date AND to_timestamp(fcst.date_key, 'yyyyMMdd') < to_timestamp(dp.theme_start_date, 'yyyy-MM-dd') """.replace("\n", " ") dm_regular_sales = sqlc.sql(dm_regular_sales_sql) agg_dm_regular_sales = dm_regular_sales.groupBy(['item_id', 'sub_id', 'store_code', 'dm_theme_id']). 
\ agg(F.sum("sales_prediction").alias("regular_sales_before_dm")) dm_with_regular = dm_prediction.join( agg_dm_regular_sales, ['item_id', 'sub_id', 'store_code', 'dm_theme_id'], "left") # # For ppp <= 90% npp, get 4 weeks after sales for ROTATION A items print_output("DM PPP logic") after_fourweek_sql = \ """ SELECT dp.item_id, dp.sub_id, dp.store_code, dp.dm_theme_id, case when fcst.daily_sales_prediction_original < 0.2 and dp.rotation != 'A' then 0 when fcst.daily_sales_prediction_original < 0 then 0 else fcst.daily_sales_prediction_original end AS sales_prediction FROM dm_prediction dp JOIN temp.t_forecast_simulation_daily_sales_prediction fcst ON fcst.item_id = dp.item_id AND fcst.sub_id = dp.sub_id AND fcst.store_code = dp.store_code AND to_timestamp(fcst.date_key, 'yyyyMMdd') > to_timestamp(dp.theme_end_date, 'yyyy-MM-dd') AND to_timestamp(fcst.date_key, 'yyyyMMdd') < date_add(to_timestamp(dp.theme_end_date, 'yyyy-MM-dd'), 28) WHERE dp.rotation = 'A' AND dp.ppp <= dp.npp * 0.9 """.replace("\n", " ") after_fourweek_sales = sqlc.sql( after_fourweek_sql.format(run_date.strftime("%Y%m%d"))) agg_after_fourweek_sales = after_fourweek_sales.groupBy(['item_id', 'sub_id', 'store_code', 'dm_theme_id']). \ agg(F.sum("sales_prediction").alias("four_weeks_after_dm")) output_line = f"Number of DM having PPP {agg_after_fourweek_sales.count()}" print_output(output_line) output_str = output_str + output_line + "," dm_with_fourweek = dm_with_regular.join( agg_after_fourweek_sales, ['item_id', 'sub_id', 'store_code', 'dm_theme_id'], "left") # # Fill NA dm_with_fourweek = dm_with_fourweek.na.fill(0) dm_with_fourweek.cache() output_line = f"Number of DM store orders {dm_with_fourweek.count()}" print_output(output_line) output_str = output_str + output_line # # Final calculation print_output("Calculate order quantity") dm_final = dm_with_fourweek.withColumn( "dm_order_qty_without_pcb", dm_with_fourweek.regular_sales_before_dm + dm_with_fourweek.four_weeks_after_dm + dm_with_fourweek.dm_sales) dm_final = dm_final \ .withColumn("first_dm_order_qty_without_pcb", F.when(dm_final.rotation != 'X', 0.75 * dm_final.dm_order_qty_without_pcb) .otherwise(dm_final.dm_order_qty_without_pcb)) dm_final = dm_final \ .withColumn("first_dm_order_qty", F.when(dm_final.first_dm_order_qty_without_pcb > 0.0, F.ceil(dm_final.first_dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb) .otherwise(int(0))) dm_final_pcb = dm_final \ .withColumn("dm_order_qty", F.when(dm_final.dm_order_qty_without_pcb > 0.0, F.ceil(dm_final.dm_order_qty_without_pcb / dm_final.pcb) * dm_final.pcb) .otherwise(int(0))) dm_final_pcb.createOrReplaceTempView("dm_final_pcb") print_output("Write store order to datalake") dm_sql = \ """ INSERT INTO vartefact.forecast_simulation_dm_orders PARTITION (dm_theme_id) SELECT item_id, sub_id, store_code, con_holding, theme_start_date, theme_end_date, npp, ppp, ppp_start_date, ppp_end_date, city_code, dept_code, item_code, sub_code, pcb, dc_supplier_code, ds_supplier_code, rotation, run_date, first_order_date, first_delivery_date, regular_sales_before_dm, four_weeks_after_dm, dm_sales, dm_order_qty, first_dm_order_qty, dm_order_qty_without_pcb, dm_theme_id FROM dm_final_pcb """.replace("\n", " ") sqlc.sql(dm_sql) sqlc.sql("refresh table vartefact.forecast_simulation_dm_orders") print_output("Finish writing store order to datalake") print_output("Start generating DC orders") dm_item_dc_sql = \ """ SELECT distinct ndt.dm_theme_id, ndt.theme_start_date, ndt.theme_end_date, del.npp, del.ppp, del.ppp_start_date, 
del.ppp_end_date, del.dept_code, dcid.holding_code, dcid.risk_item_unilever, dcid.primary_ds_supplier as ds_supplier_code, cast(dcid.qty_per_unit as int) as pcb, dcid.rotation, dcid.qty_per_unit, icis.item_id, icis.sub_id, icis.item_code, icis.sub_code, icis.date_key AS run_date FROM vartefact.forecast_nsa_dm_extract_log del JOIN ods.nsa_dm_theme ndt ON del.dm_theme_id = ndt.dm_theme_id JOIN ods.p4md_stogld ps ON del.city_code = ps.stocity JOIN vartefact.forecast_item_code_id_stock icis ON icis.date_key = '{0}' AND del.item_code = CONCAT ( icis.dept_code, icis.item_code ) AND del.sub_code = icis.sub_code AND del.dept_code = icis.dept_code JOIN vartefact.forecast_dc_item_details dcid ON dcid.item_code =icis.item_code AND dcid.sub_code = icis.sub_code AND dcid.dept_code = icis.dept_code AND dcid.rotation != 'X' AND dcid.dc_status != 'Stop' AND dcid.seasonal = 'No' AND dcid.item_type not in ('New','Company Purchase','Seasonal') JOIN vartefact.forecast_store_item_details id ON ps.stostocd = id.store_code AND dcid.dept_code = id.dept_code AND dcid.item_code = id.item_code AND dcid.sub_code = id.sub_code WHERE del.extract_order >= 40 AND del.date_key = '{1}' AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') >= to_timestamp('{2}', 'yyyyMMdd') AND to_timestamp(ndt.theme_start_date, 'yyyy-MM-dd') < to_timestamp('{3}', 'yyyyMMdd') """.replace("\n", " ") dm_item_dc_sql = dm_item_dc_sql.format(stock_date.strftime("%Y%m%d"), run_date.strftime("%Y%m%d"), start_date.strftime("%Y%m%d"), end_date.strftime("%Y%m%d")) dm_item_dc_df = sqlc.sql(dm_item_dc_sql) first_dc_dm = dm_item_dc_df. \ groupBy(['item_id', 'sub_id']). \ agg(F.min("theme_start_date").alias("theme_start_date")) dm_item_dc_df = dm_item_dc_df.join( first_dc_dm, ['item_id', 'sub_id', 'theme_start_date']) output_line = f"Number of item that will have DM order in DC {dm_item_dc_df.count()}" print_output(output_line) output_str = output_str + output_line + "," dm_item_dc_df.cache() dm_item_dc_df.createOrReplaceTempView("dm_item_dc") # + dc_order_sql = \ """ SELECT distinct dis.item_id, dis.sub_id, ord.date_key AS first_order_date, dev.date_key AS first_delivery_date FROM dm_item_dc dis JOIN vartefact.forecast_dc_order_delivery_mapping dodm ON dis.holding_code = dodm.con_holding AND dis.risk_item_unilever = dodm.risk_item_unilever JOIN vartefact.forecast_calendar ord ON ord.date_key = dodm.order_date JOIN vartefact.forecast_calendar dev ON dev.weekday_short = dodm.delivery_weekday and dev.week_index = ord.week_index + dodm.week_shift WHERE to_timestamp(ord.date_key, 'yyyyMMdd') >= to_timestamp(dis.ppp_start_date, 'yyyy-MM-dd') AND dev.date_key <= '{0}' AND dis.rotation != 'X' """.replace("\n", " ") dc_order_sql = dc_order_sql.format(end_date.strftime("%Y%m%d")) # + dc_order_deliver_df = sqlc.sql(dc_order_sql) dc_first_order_df = dc_order_deliver_df.groupBy(['item_id', 'sub_id']). 
\ agg(F.min("first_order_date").alias("first_order_date")) dc_first_order_deliver_df = dc_order_deliver_df \ .select(['item_id', 'sub_id', 'first_order_date', 'first_delivery_date']) \ .join(dc_first_order_df, ['item_id', 'sub_id', 'first_order_date']) # - dm_item_dc_order_df = dm_item_dc_df \ .join(dc_first_order_deliver_df, \ ['item_id', 'sub_id']) dm_item_dc_order_df.createOrReplaceTempView("dm_item_dc_order") dm_store_to_dc_sql = \ """ select dm.item_id, dm.sub_id, dm.holding_code, dm.theme_start_date, dm.theme_end_date, dm.npp, dm.ppp, dm.ppp_start_date, dm.ppp_end_date, dm.dept_code, dm.item_code, dm.sub_code, dm.pcb, dm.ds_supplier_code, dm.rotation, dm.run_date, dm.first_order_date, dm.first_delivery_date, sum(sod.regular_sales_before_dm) as regular_sales_before_dm, sum(sod.four_weeks_after_dm) as four_weeks_after_dm, sum(sod.dm_sales) as dm_sales, sum(sod.order_qty) as dm_order_qty_without_pcb, dm.dm_theme_id FROM vartefact.forecast_simulation_dm_orders sod JOIN dm_item_dc_order dm on sod.item_id = dm.item_id and sod.sub_id = dm.sub_id and sod.dm_theme_id = dm.dm_theme_id GROUP BY dm.dm_theme_id, dm.item_id, dm.sub_id, dm.holding_code, dm.theme_start_date, dm.theme_end_date, dm.npp, dm.ppp, dm.ppp_start_date, dm.ppp_end_date, dm.dept_code, dm.item_code, dm.sub_code, dm.pcb, dm.ds_supplier_code, dm.rotation, dm.run_date, dm.first_order_date, dm.first_delivery_date """.replace("\n", " ") dm_dc_order = sqlc.sql(dm_store_to_dc_sql) dm_dc_pcb = dm_dc_order \ .withColumn("dm_order_qty", F.when(dm_dc_order.dm_order_qty_without_pcb > 0.0, F.ceil(dm_dc_order.dm_order_qty_without_pcb / dm_dc_order.pcb) * dm_dc_order.pcb) .otherwise(int(0))) dm_dc_pcb.createOrReplaceTempView("dm_dc_final") output_line = f"Number of DM DC orders {dm_dc_pcb.count()}" print_output(output_line) output_str = output_str + output_line print_output("Write DC order to datalake") dm_dc_sql = \ """ INSERT INTO vartefact.forecast_simulation_dm_dc_orders PARTITION (dm_theme_id) SELECT item_id, sub_id, holding_code, theme_start_date, theme_end_date, npp, ppp, ppp_start_date, ppp_end_date, dept_code, item_code, sub_code, pcb, ds_supplier_code, rotation, run_date, first_order_date, first_delivery_date, regular_sales_before_dm, four_weeks_after_dm, dm_sales, dm_order_qty, dm_order_qty_without_pcb, dm_theme_id FROM dm_dc_final """.replace("\n", " ") # + sqlc.sql(dm_dc_sql) sqlc.sql("refresh table vartefact.forecast_simulation_dm_dc_orders") info_str = info_str + f"Job Finish:{get_current_time()}" insert_script_run(date_str, "Success", parameter, output_str, info_str, "", sqlc) sc.stop() print_output("Job finish")
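# A minimal hedged sketch of the "keep only the earliest row per group"
# pattern used repeatedly above (groupBy + min, then join back on the
# minimum value). Assumes a SparkSession named `spark`; data is illustrative.
from pyspark.sql import functions as F

orders = spark.createDataFrame(
    [(1, "S01", "20190301"), (1, "S01", "20190305"), (2, "S01", "20190302")],
    ["item_id", "store_code", "first_order_date"])
earliest = orders.groupBy("item_id", "store_code") \
    .agg(F.min("first_order_date").alias("first_order_date"))
orders.join(earliest, ["item_id", "store_code", "first_order_date"]).show()
# only the 20190301 and 20190302 rows survive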
def fillspark(hist, df): import pyspark.sql.functions as fcns indexes = [] for axis in hist._group + hist._fixed: exprcol = tocolumns(df, histbook.instr.totree(axis._parsed)) if isinstance(axis, histbook.axis.groupby): indexes.append(exprcol) elif isinstance(axis, histbook.axis.groupbin): scaled = (exprcol - float(axis.origin)) * (1.0 / float(axis.binwidth)) if axis.closedlow: discretized = fcns.floor(scaled) else: discretized = fcns.ceil(scaled) - 1 indexes.append( fcns.nanvl( discretized * float(axis.binwidth) + float(axis.origin), fcns.lit("NaN"))) elif isinstance(axis, histbook.axis.bin): scaled = (exprcol - float(axis.low)) * (int(axis.numbins) / (float(axis.high) - float(axis.low))) if axis.closedlow: discretized = fcns.floor(scaled) + 1 else: discretized = fcns.ceil(scaled) indexes.append( fcns.when( fcns.isnull(exprcol) | fcns.isnan(exprcol), int(axis.numbins) + 2).otherwise( fcns.greatest( fcns.lit(0), fcns.least(fcns.lit(int(axis.numbins) + 1), discretized)))) elif isinstance(axis, histbook.axis.intbin): indexes.append( fcns.greatest( fcns.lit(0), fcns.least(fcns.lit(int(axis.max) - int(axis.min) + 1), fcns.round(exprcol - int(axis.min) + 1)))) elif isinstance(axis, histbook.axis.split): def build(x, i): if i < len(axis.edges): if axis.closedlow: return build(x.when(exprcol < float(axis.edges[i]), i), i + 1) else: return build( x.when(exprcol <= float(axis.edges[i]), i), i + 1) else: return x.otherwise(i) indexes.append( build( fcns.when( fcns.isnull(exprcol) | fcns.isnan(exprcol), len(axis.edges) + 1), 0)) elif isinstance(axis, histbook.axis.cut): indexes.append(fcns.when(exprcol, 0).otherwise(1)) else: raise AssertionError(axis) aliasnum = [-1] def alias(x): aliasnum[0] += 1 return x.alias("@" + str(aliasnum[0])) index = alias(fcns.struct(*indexes)) selectcols = [index] if hist._weightoriginal is not None: weightcol = tocolumns(df, histbook.instr.totree(hist._weightparsed)) for axis in hist._profile: exprcol = tocolumns(df, histbook.instr.totree(axis._parsed)) if hist._weightoriginal is None: selectcols.append(alias(exprcol)) selectcols.append(alias(exprcol * exprcol)) else: selectcols.append(alias(exprcol * weightcol)) selectcols.append(alias(exprcol * exprcol * weightcol)) if hist._weightoriginal is None: df2 = df.select(*selectcols) else: selectcols.append(alias(weightcol)) selectcols.append(alias(weightcol * weightcol)) df2 = df.select(*selectcols) aggs = [fcns.sum(df2[n]) for n in df2.columns[1:]] if hist._weightoriginal is None: aggs.append(fcns.count(df2[df2.columns[0]])) def getornew(content, key, nextaxis): if key in content: return content[key] elif isinstance(nextaxis, histbook.axis.GroupAxis): return {} else: return numpy.zeros(hist._shape, dtype=histbook.hist.COUNTTYPE) def recurse(index, columns, axis, content): if len(axis) == 0: content += columns elif isinstance(axis[0], (histbook.axis.groupby, histbook.axis.groupbin)): content[index[0]] = recurse( index[1:], columns, axis[1:], getornew(content, index[0], axis[1] if len(axis) > 1 else None)) if isinstance(axis[0], histbook.axis.groupbin) and None in content: content["NaN"] = content[None] del content[None] elif isinstance( axis[0], (histbook.axis.bin, histbook.axis.intbin, histbook.axis.split)): i = index[0] - (1 if not axis[0].underflow else 0) if int(i) < axis[0].totbins: recurse(index[1:], columns, axis[1:], content[int(i)]) elif isinstance(axis[0], histbook.axis.cut): recurse(index[1:], columns, axis[1:], content[0 if index[0] else 1]) else: raise AssertionError(axis[0]) return content query = 
df2.groupBy(df2[df2.columns[0]]).agg(*aggs) def wait(): for row in query.collect(): recurse(row[0], row[1:], hist._group + hist._fixed, hist._content) return wait
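# A hedged pure-Python sketch of the bin-index arithmetic used in fillspark
# for a fixed-width `bin` axis that is closed on its low edge: values are
# scaled into bin units, floored, shifted by one so index 0 is underflow,
# clamped so numbins + 1 is overflow, and NaN maps to numbins + 2.
# The helper name and the values below are illustrative only.
import math

def bin_index(x, low, high, numbins):
    if math.isnan(x):
        return numbins + 2  # NaN bucket
    scaled = (x - low) * (numbins / (high - low))
    idx = math.floor(scaled) + 1  # closedlow: [edge, next_edge)
    return max(0, min(numbins + 1, idx))  # clamp underflow/overflow

print(bin_index(0.25, 0.0, 1.0, 4))          # 2 -> second in-range bin
print(bin_index(-0.1, 0.0, 1.0, 4))          # 0 -> underflow
print(bin_index(1.5, 0.0, 1.0, 4))           # 5 -> overflow
print(bin_index(float("nan"), 0.0, 1.0, 4))  # 6 -> NaN bucket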
from functools import partial

spark = SparkSession.builder.appName("some_testing2").master("local").getOrCreate()

df = spark.read.format('com.databricks.spark.csv').option("header", "True").option("delimiter", ",")\
    .load('C:/Users/awagner/Desktop/For_Tom/'+'AllLabData.csv')

df = df.withColumn("X", df["X"].cast("double"))
df = df.withColumn("Y", df["Y"].cast("double"))
df = df.withColumn("Z", df["Z"].cast("double"))
df = df.withColumn("TremorGA", df["TremorGA"].cast("double"))
df = df.withColumn("BradykinesiaGA", df["BradykinesiaGA"].cast("double"))
df = df.withColumn("DyskinesiaGA", df["DyskinesiaGA"].cast("double"))
df = df.withColumn("TSStart", df["TSStart"].cast("timestamp"))
df = df.withColumn("TSEnd", df["TSEnd"].cast("timestamp"))
# % 10**8 keeps only the last eight digits of the epoch seconds, mirroring the casting step elsewhere in this document
df = df.withColumn("interval_start", ((ceil(unix_timestamp(df["TSStart"]).cast("long"))) % 10**8))
df = df.withColumn("interval_end", ((ceil(unix_timestamp(df["TSEnd"]).cast("long"))) % 10**8))

schema = ArrayType(FloatType(), False)
parse2 = udf(lambda s: eval(str(s)), schema)
find_milisec = udf(lambda raw: (raw)[(raw.find('.')+1):(raw.find('.')+3)])
merge_integers = udf(lambda raw1, raw2: int(str(raw1) + str(raw2)))

df = df.withColumn("temp", find_milisec('TS'))
df = df.withColumn("interval", (((unix_timestamp(df["TS"]).cast("long")))))
df = df.withColumn("interval", merge_integers('interval', 'temp'))

def give_my_key(*args):
    key = 0
def tocolumns(df, expr): import pyspark.sql.functions as fcns if isinstance(expr, histbook.expr.Const): return fcns.lit(expr.value) elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)): return df[expr.value] elif isinstance(expr, histbook.expr.Call): if expr.fcn == "abs" or expr.fcn == "fabs": return fcns.abs(tocolumns(df, expr.args[0])) elif expr.fcn == "max" or expr.fcn == "fmax": return fcns.greatest(*[tocolumns(df, x) for x in expr.args]) elif expr.fcn == "min" or expr.fcn == "fmin": return fcns.least(*[tocolumns(df, x) for x in expr.args]) elif expr.fcn == "arccos": return fcns.acos(tocolumns(df, expr.args[0])) elif expr.fcn == "arccosh": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "arcsin": return fcns.asin(tocolumns(df, expr.args[0])) elif expr.fcn == "arcsinh": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "arctan2": return fcns.atan2(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])) elif expr.fcn == "arctan": return fcns.atan(tocolumns(df, expr.args[0])) elif expr.fcn == "arctanh": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "ceil": return fcns.ceil(tocolumns(df, expr.args[0])) elif expr.fcn == "copysign": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "cos": return fcns.cos(tocolumns(df, expr.args[0])) elif expr.fcn == "cosh": return fcns.cosh(tocolumns(df, expr.args[0])) elif expr.fcn == "rad2deg": return tocolumns(df, expr.args[0]) * (180.0 / math.pi) elif expr.fcn == "erfc": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "erf": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "exp": return fcns.exp(tocolumns(df, expr.args[0])) elif expr.fcn == "expm1": return fcns.expm1(tocolumns(df, expr.args[0])) elif expr.fcn == "factorial": return fcns.factorial(tocolumns(df, expr.args[0])) elif expr.fcn == "floor": return fcns.floor(tocolumns(df, expr.args[0])) elif expr.fcn == "fmod": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "gamma": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "hypot": return fcns.hypot(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])) elif expr.fcn == "isinf": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "isnan": return fcns.isnan(tocolumns(df, expr.args[0])) elif expr.fcn == "lgamma": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "log10": return fcns.log10(tocolumns(df, expr.args[0])) elif expr.fcn == "log1p": return fcns.log1p(tocolumns(df, expr.args[0])) elif expr.fcn == "log": return fcns.log(tocolumns(df, expr.args[0])) elif expr.fcn == "pow": return fcns.pow(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])) elif expr.fcn == "deg2rad": return tocolumns(df, expr.args[0]) * (math.pi / 180.0) elif expr.fcn == "sinh": return fcns.sinh(tocolumns(df, expr.args[0])) elif expr.fcn == "sin": return fcns.sin(tocolumns(df, expr.args[0])) elif expr.fcn == "sqrt": return fcns.sqrt(tocolumns(df, expr.args[0])) elif expr.fcn == "tanh": return fcns.tanh(tocolumns(df, expr.args[0])) elif expr.fcn == "tan": return fcns.tan(tocolumns(df, expr.args[0])) elif expr.fcn == "trunc": raise NotImplementedError( expr.fcn) # FIXME (fcns.trunc is for dates) elif expr.fcn == "xor": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "conjugate": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "exp2": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "heaviside": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "isfinite": raise NotImplementedError(expr.fcn) 
# FIXME elif expr.fcn == "left_shift" and isinstance(expr.args[1], histbook.expr.Const): return fcns.shiftLeft(tocolumns(df, expr.args[0]), expr.args[1].value) elif expr.fcn == "log2": return fcns.log2(tocolumns(df, expr.args[0])) elif expr.fcn == "logaddexp2": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "logaddexp": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "mod" or expr.fcn == "fmod": return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1]) elif expr.fcn == "right_shift" and isinstance(expr.args[1], histbook.expr.Const): return fcns.shiftRight(tocolumns(df, expr.args[0]), expr.args[1].value) elif expr.fcn == "rint": return fcns.rint(tocolumns(df, expr.args[0])) elif expr.fcn == "sign": raise NotImplementedError(expr.fcn) # FIXME elif expr.fcn == "where": return fcns.when(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])).otherwise( tocolumns(df, expr.args[2])) elif expr.fcn == "numpy.equal": return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.not_equal": return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.less": return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.less_equal": return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.isin": return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.logical_not": return ~tocolumns(df, expr.args[0]) elif expr.fcn == "numpy.add": return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.subtract": return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.multiply": return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.true_divide": return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.logical_or": return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1]) elif expr.fcn == "numpy.logical_and": return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1]) else: raise NotImplementedError(expr.fcn) else: raise AssertionError(expr)