def test_window_functions_cumulative_sum(self):
    df = self.spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])
    from pyspark.sql import functions as F

    # Test cumulative sum
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding, 0)))
    rs = sorted(sel.collect())
    expected = [("one", 1), ("two", 3)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])

    # Test boundary values less than JVM's Long.MinValue and make sure we don't overflow
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.unboundedPreceding - 1, 0)))
    rs = sorted(sel.collect())
    expected = [("one", 1), ("two", 3)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])

    # Test boundary values greater than JVM's Long.MaxValue and make sure we don't overflow
    frame_end = Window.unboundedFollowing + 1
    sel = df.select(
        df.key,
        F.sum(df.value).over(Window.rowsBetween(Window.currentRow, frame_end)))
    rs = sorted(sel.collect())
    expected = [("one", 3), ("two", 2)]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
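# The two boundary tests above rely on PySpark clamping frame bounds that fall
# outside the JVM's Long range. A minimal standalone sketch of the same
# behaviour (the local SparkSession and toy data are assumptions):
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("one", 1), ("two", 2)], ["key", "value"])

# Window.unboundedPreceding - 1 underflows Long.MinValue; PySpark clamps it, so
# this frame behaves exactly like (unboundedPreceding, currentRow).
w = Window.rowsBetween(Window.unboundedPreceding - 1, Window.currentRow)
df.select(df.key, F.sum(df.value).over(w).alias("cum_sum")).show()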
def test_window_functions(self):
    df = self.sqlCtx.createDataFrame(
        [(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.partitionBy("value").orderBy("key")
    from pyspark.sql import functions as F

    sel = df.select(
        df.value, df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        # Window.unboundedPreceding/Following replace the former float("-inf")/float("inf") bounds
        F.count("key").over(w.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)),
        F.row_number().over(w),   # rowNumber() was renamed row_number() in Spark 2.x
        F.rank().over(w),
        F.dense_rank().over(w),   # denseRank() was renamed dense_rank() in Spark 2.x
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
def train(self, df, featureCols):
    ntiles = []
    for col in featureCols:
        w = Window.partitionBy().orderBy(col)
        aux = df.select(F.ntile(self._n).over(w).alias('ntile'), col)
        ntiles.append(list(aux.groupby('ntile').max(col).collect()))
    self.ntiles_ = np.array(ntiles)
    self.columns_ = list(map(str, featureCols))  # materialize: map() is a lazy iterator in Python 3
    self._is_trained = True
def compute(day):
    # We want days day-30 through day-1
    sums = wikipediadata.where(
        (wikipediadata.day >= day - 30) & (wikipediadata.day <= day - 1))

    # Test subset
    # sums = sums.where((sums.page == 'Cadillac_Brougham') |
    #                   ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))

    # Sum the hours of each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for later
    sums.cache()

    # Define a window := previous day
    window_spec = Window.partitionBy(sums.projectcode, sums.page) \
        .orderBy(sums.day.asc()).rowsBetween(-1, -1)

    # Compute the difference views(d) - views(d-1)
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views)
                            .over(window_spec))

    # Compute the coefficient to apply to each day
    coefs = pd.DataFrame({'day': range(day - 30, day)})
    coefs['coef'] = 1. / (day - coefs.day)
    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')

    # Compute each day's score
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)

    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # Normalize by the square root of the sum of views
    totals = totals.withColumn('score',
                               totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
        .orderBy(F.desc('score')) \
        .withColumnRenamed('SUM(views)', 'total_views') \
        .limit(10)

    views = sums.select('projectcode', 'page', 'day', 'views') \
        .join(totals.select('projectcode', 'page', 'total_views', 'score'),
              (totals.projectcode == sums.projectcode) & (totals.page == sums.page),
              'right_outer')

    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'],
                          index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)

    # Clear the cache
    hc.clearCache()
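# The (-1, -1) rows frame above sums exactly one row -- the previous day -- so
# the same difference can be written with F.lag. A sketch under the same
# assumptions (the sums DataFrame and its partitioning), not the author's code:
window_lag = Window.partitionBy(sums.projectcode, sums.page).orderBy(sums.day.asc())
diffs = sums.withColumn('diff', sums.views - F.lag(sums.views, 1).over(window_lag))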
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # PySpark 2.1.0 does not support desc_nulls_first, desc_nulls_last,
    # asc_nulls_first, asc_nulls_last

    # split, length (in PySpark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount,
              functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
        where B.flight in (select flight from (select if(isnull(flight), icao, if(flight="00000000", icao, \
        flight)) as flight, lon, lat, altitude from adsb) as A where (lat > ' + minLat + ' and lat < ' + \
        maxLat + ' and lon > ' + minLon + ' and lon < ' + maxLon + ' and altitude < ' + altitude + ') group \
        by flight) order by UTC')

    adsbTable = adsbTable.dropDuplicates(['flight', 'lon', 'lat'])

    # the next set of lines handles the scenario where the same flight number from one airport
    # arrives at another airport, then departs for yet another airport. We do not want to treat
    # this as one flight, but rather as multiple flights: for example, if flight ACA108 comes in
    # from Montreal to Toronto and then leaves for Winnipeg, we will have ACA108-0 and ACA108-1.
    #
    # if there was just one flight number, then only -0 is appended

    # partition the flight by UTC
    window_flightNum_UTC = Window.partitionBy("flight").orderBy("UTC")

    # calculate the deltaTime between the previous and current coordinate for each flight
    adsbDf = adsbTable.withColumn(
        "deltaTime",
        psf.unix_timestamp('UTC') - psf.lag(psf.unix_timestamp('UTC')).over(window_flightNum_UTC))

    # create a trigger when the time elapsed from the previous coordinate to the next is more
    # than 1200 seconds (20 minutes). Note that this isn't foolproof: if there are gaps of 20
    # minutes or longer in your coverage, this will break one flight into two separate flights;
    # adjust the 1200 seconds below to suit your requirements
    adsbDf1 = adsbDf.withColumn(
        "trigger", psf.when(adsbDf.deltaTime > 1200, 1).otherwise(0))

    # create the suffix for the flight by using the cumulative sum of the triggers for each flight
    adsbDf2 = adsbDf1.withColumn(
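# The statement above is cut off mid-expression; a minimal sketch of how the
# cumulative-sum suffix is typically completed (the 'suffix' column name and the
# final concatenation are assumptions, not from the source):
window_cumsum = Window.partitionBy("flight").orderBy("UTC") \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

adsbDf2 = adsbDf1.withColumn(
    "suffix", psf.sum(psf.col("trigger")).over(window_cumsum))

# append the suffix, e.g. ACA108-0, ACA108-1
adsbDf3 = adsbDf2.withColumn(
    "flight", psf.concat_ws("-", psf.col("flight"), psf.col("suffix").cast("string")))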
def main(sentiment_input, user_input, review_input, model_input, output_folder):
    # read input files
    df_sentiment = spark.read.csv(sentiment_input, header=True)
    df_user = spark.read.parquet(user_input)
    df_review = spark.read.parquet(review_input)

    # get 50 users
    df_50_users = df_user.limit(50)

    # cross join user and business
    df_usr_bus_all = df_50_users \
        .crossJoin(df_sentiment) \
        .where(df_sentiment['ZipCode'].isNotNull()) \
        .select(
            df_sentiment['BusinessID'],
            df_user['UserID'],
            df_user['UserName'],
            df_user['ReviewCount'].alias('UserReviewCount'),
            df_user['AverageStars'].alias('UserAverageStars'),
            functions.lit(0).alias('ReviewStars'),
            functions.dayofyear(functions.current_date()).alias('ReviewDayOfYear'),
            df_sentiment['Name'].alias('BusinessName'),
            df_sentiment['ZipCode'].alias('BusinessPostalCode'),
            df_sentiment['ZipCode'].substr(1, 3).alias('BusinessNeighborhood'),
            df_sentiment['Latitude'].cast(types.FloatType()),
            df_sentiment['Longitude'].cast(types.FloatType()),
            df_sentiment['avg_neg'].cast(types.FloatType()).alias('AverageNegative'),
            df_sentiment['avg_neu'].cast(types.FloatType()).alias('AverageNeutral'),
            df_sentiment['avg_pos'].cast(types.FloatType()).alias('AveragePositive'),
            df_sentiment['avg_composite_score'].cast(types.FloatType()).alias('AverageComposite'))

    # left join with reviews
    df_joined = df_usr_bus_all.join(df_review, ['BusinessID', 'UserID'], 'left_outer') \
        .select(df_review['ReviewID'],
                df_usr_bus_all['BusinessID'],
                df_usr_bus_all['UserID'],
                df_usr_bus_all['UserName'],
                df_usr_bus_all['UserReviewCount'],
                df_usr_bus_all['UserAverageStars'],
                df_usr_bus_all['ReviewStars'],
                df_usr_bus_all['ReviewDayOfYear'],
                df_usr_bus_all['BusinessName'],
                df_usr_bus_all['BusinessPostalCode'],
                df_usr_bus_all['BusinessNeighborhood'],
                df_usr_bus_all['Latitude'],
                df_usr_bus_all['Longitude'],
                df_usr_bus_all['AverageNegative'],
                df_usr_bus_all['AverageNeutral'],
                df_usr_bus_all['AveragePositive'],
                df_usr_bus_all['AverageComposite'])

    # get restaurants that the user has not visited
    df_not_visited_rests = df_joined.where(df_joined['ReviewID'].isNull())

    # load the model
    loaded_model = PipelineModel.load(model_input)

    # use the model to make predictions
    predictions = loaded_model.transform(df_not_visited_rests)
    predictions_init = predictions.select(predictions['BusinessID'],
                                          predictions['BusinessName'],
                                          predictions['BusinessPostalCode'],
                                          predictions['BusinessNeighborhood'],
                                          predictions['UserID'],
                                          predictions['UserName'],
                                          predictions['UserReviewCount'],
                                          predictions['UserAverageStars'],
                                          predictions['ReviewDayOfYear'],
                                          predictions['prediction'].alias('PredictedReviewStar'),
                                          predictions['Latitude'],
                                          predictions['Longitude'],
                                          predictions['AverageNegative'],
                                          predictions['AverageNeutral'],
                                          predictions['AveragePositive'],
                                          predictions['AverageComposite'])

    # clamp scores: > 5 becomes 5 and < 0 becomes 0
    predictions_final = predictions_init.withColumn(
        'FinalStar',
        functions.when(predictions_init["PredictedReviewStar"] >= 5, 5)
        .otherwise(functions.when(predictions_init["PredictedReviewStar"] <= 0, 0)
                   .otherwise(predictions_init['PredictedReviewStar'])))

    # partition by user
    window = Window.partitionBy(predictions_final['UserID']).orderBy(
        predictions_final['FinalStar'].desc())

    # get the top 10 scores for each user based on the partition
    prediction_to_save = predictions_final.select(
        '*', functions.row_number().over(window).alias('rank')).filter(col('rank') <= 10)

    # save predictions to output
    prediction_to_save.coalesce(1).write.csv(output_folder + '/TestModel', header=True)
def growing_range_window(self):
    return Window.partitionBy('id').orderBy('v') \
        .rangeBetween(Window.unboundedPreceding, 4)
def sliding_range_window(self):
    return Window.partitionBy('id').orderBy('v').rangeBetween(-2, 4)
def _generate_invoice(sc, sqlContext, **kwargs):
    from transform._invoice_latest import _get_invoice_data
    from transform._prediction_list import _get_models_list, _get_prediction_list
    from properties import FINAL_PREDICTION_LOCATION, _PREDICTION_LOCATION, TESTING, REPARTITION_VAL
    from transform._remainder_addition import _get_remainder

    # Get visit list as an argument
    _visit_list = kwargs.get('visit_list')

    # Get order date as an argument -- used for simulation purposes for now.
    # Should ideally be changed to the current system date.
    order_date = kwargs.get('order_date')

    # Filter the visit list for the provided order_date
    vl_df = _get_visit_list(sc=sc, sqlContext=sqlContext, order_date=order_date, visit_list=_visit_list)

    # # Used to get the authorised material list for a particular 'vkbur'
    # aglu_df = _get_aglu_list(sc=sc, sqlContext=sqlContext)

    vl_df.registerTempTable("vl_sql")

    # # Query to join the visit list with the authorised material list to obtain a complete
    # # set of products for all scheduled visits.
    # # TODO: Clarify if there is any method to isolate the materials for a particular visit
    # aglu_df.select(col('MATNR').alias('mat_no'), col('scl_auth_matlst')).registerTempTable("aglu_sql")
    # q = """
    # select e.*, f.mat_no mat_no
    # from
    # (
    #     select b.customernumber customernumber, b.order_date order_date,
    #            d.scl_auth_matlst scl_auth_matlst, d.vkbur vkbur, IF(d.vsbed == '01', 2, 1) dlvry_lag
    #     from
    #     (
    #         select a.customernumber customernumber, a.order_date order_date
    #         from vl_sql a
    #     ) b
    #     join
    #     (
    #         select c.kunnr customernumber, c.scl_auth_matlst scl_auth_matlst, c.vkbur vkbur, c.vsbed vsbed
    #         from mdm.customer c
    #     ) d
    #     on d.customernumber = b.customernumber
    # ) e
    # cross join
    # (
    #     select g.mat_no mat_no, g.SCL_AUTH_MATLST scl_auth_matlst
    #     from aglu_sql g
    # ) f
    # where e.scl_auth_matlst = f.scl_auth_matlst
    # """

    # Query to join the visit list to the customer master table to obtain the delivery lag for a
    # customer, giving a complete set of products for all scheduled visits.
q = """ select e.* from ( select b.customernumber customernumber, b.mat_no mat_no, b.order_date order_date, d.vkbur vkbur, IF(d.vsbed == '01', 2, 1) dlvry_lag from ( select a.customernumber customernumber, a.mat_no mat_no, a.order_date order_date from vl_sql a ) b join ( select c.kunnr customernumber, c.vkbur vkbur, c.vsbed vsbed from mdm.customer c ) d on d.customernumber = b.customernumber ) e """ # # Obtaining delivery_date from given order_date provided the delivery_lag visit_list_final = sqlContext.sql(q) \ .repartition(REPARTITION_VAL) \ .withColumn('delivery_date', udf(_get_delivery_date, StringType())(col('order_date'), col('dlvry_lag'))) \ .drop(col('dlvry_lag')) print("Visit List Final") # visit_list_final.show() # invoice_raw = _get_invoice_data(sqlContext=sqlContext, CRITERIA_DATE=get_criteria_date(order_date=order_date)) \ .repartition(REPARTITION_VAL) print("Invoice_raw") # invoice_raw.show() visit_invoice_condition = [ visit_list_final.customernumber == invoice_raw.customernumber, visit_list_final.mat_no == invoice_raw.mat_no ] visit_list_final_join_invoice_raw = visit_list_final \ .join(invoice_raw, on=visit_invoice_condition, how='inner') \ .repartition(REPARTITION_VAL) \ .select(visit_list_final.customernumber, visit_list_final.mat_no, visit_list_final.order_date, visit_list_final.vkbur, visit_list_final.delivery_date, invoice_raw.last_delivery_date) \ .withColumn('mod_last_delivery_date', udf(_get_tweaked_last_del_dt, StringType())(col('last_delivery_date'))) \ .drop(col('last_delivery_date')) print("Visit list final join invoice raw") # visit_list_final_join_invoice_raw.show() # print(visit_list_final_join_invoice_raw.count()) # visit_list_final_join_invoice_raw.printSchema() models_data_raw = _get_models_list(sc=sc, sqlContext=sqlContext, CRITERIA_DATE=order_date, testing=TESTING) \ .repartition(REPARTITION_VAL) print("Prediction Data") # models_data_raw.show() # print(models_data_raw.count()) # models_data_raw.printSchema() visit_pred_condition = [ visit_list_final_join_invoice_raw.customernumber == models_data_raw.customernumber, visit_list_final_join_invoice_raw.mat_no == models_data_raw.mat_no ] _final_df_stage = visit_list_final_join_invoice_raw \ .join(models_data_raw, on=visit_pred_condition, how='inner') \ .repartition(REPARTITION_VAL) \ .select(visit_list_final_join_invoice_raw.customernumber, visit_list_final_join_invoice_raw.mat_no, visit_list_final_join_invoice_raw.order_date, visit_list_final_join_invoice_raw.vkbur, visit_list_final_join_invoice_raw.delivery_date, visit_list_final_join_invoice_raw.mod_last_delivery_date, models_data_raw.mdl_bld_dt, models_data_raw.cutoff_date ) print("Final_df_stage") # _final_df_stage.show() ######################################## from pyspark.sql.window import Window import sys ######################################## cust_pdt_mdl_bld_dt_window = Window \ .partitionBy(_final_df_stage.customernumber, _final_df_stage.mat_no) \ .orderBy(_final_df_stage.mdl_bld_dt) \ .rangeBetween(-sys.maxsize, sys.maxsize) # valid_model_flag = (udf(_is_model_valid, BooleanType())(col('delivery_date'), col('mod_last_delivery_date'), col('mdl_bld_dt'), max(col('mdl_bld_dt').filter(lambda _date: _date < col('mod_last_delivery_date'))).alias('min_mdl_bld_dt'))) # # TODO: Update 1 here to account for cutoff_date modification as discussed. 
    _final_df = _final_df_stage \
        .repartition(REPARTITION_VAL) \
        .select(col('customernumber'), col('mat_no'), col('order_date'), col('vkbur'),
                col('delivery_date'), col('mod_last_delivery_date'), col('mdl_bld_dt'),
                col('cutoff_date'),
                (max(udf(_get_max_date_spprt_func_1, DateType())(col('mdl_bld_dt'),
                                                                 col('mod_last_delivery_date'))))
                .over(window=cust_pdt_mdl_bld_dt_window)
                .cast(StringType()).alias('min_mdl_bld_dt')) \
        .withColumn('mdl_validity',
                    udf(_is_model_valid, BooleanType())(col('delivery_date'),
                                                        col('mod_last_delivery_date'),
                                                        col('mdl_bld_dt'),
                                                        col('min_mdl_bld_dt'))) \
        .filter(col('mdl_validity') == True) \
        .select(col('customernumber'), col('mat_no'), col('order_date'), col('vkbur'),
                col('delivery_date'), col('mod_last_delivery_date'), col('mdl_bld_dt'),
                col('cutoff_date'),
                (max(udf(string_to_gregorian, DateType())(col('cutoff_date'))))
                .over(window=Window.partitionBy("customernumber", "mat_no", "mdl_bld_dt")
                      .orderBy("cutoff_date").rangeBetween(-sys.maxsize, sys.maxsize))
                .cast(StringType()).alias('latest_cutoff_date')) \
        .filter(col('cutoff_date') == col('latest_cutoff_date'))

    print("Final_df")
    # _final_df.show()

    _prediction_df_raw = _get_prediction_list(sqlContext=sqlContext, testing=TESTING) \
        .repartition(REPARTITION_VAL)

    print("prediction_df_raw")
    # _prediction_df_raw.show()

    _final_df_prediction_df_raw_condition = [
        _final_df.customernumber == _prediction_df_raw.customernumber,
        _final_df.mat_no == _prediction_df_raw.mat_no,
        _final_df.mdl_bld_dt == _prediction_df_raw.mdl_bld_dt,
        _final_df.cutoff_date == _prediction_df_raw.cutoff_date
    ]

    _temp_df = _prediction_df_raw \
        .join(_final_df, on=_final_df_prediction_df_raw_condition, how='inner') \
        .repartition(REPARTITION_VAL) \
        .select(_prediction_df_raw.customernumber,
                _prediction_df_raw.mat_no,
                _prediction_df_raw.pred_val,
                _prediction_df_raw.pdt_cat,
                _final_df.order_date.cast(StringType()),
                _final_df.delivery_date,
                _final_df.mod_last_delivery_date,
                _final_df.mdl_bld_dt,
                _final_df.cutoff_date) \
        .withColumn('scale_denom',
                    when(col('pdt_cat').isin('I', 'II', 'III', 'VII'), 7).otherwise(31))

    print("_temp_df")
    # _temp_df.show()
    # _temp_df.printSchema()

    _temp_df_rdd_mapped = _temp_df.flatMap(lambda _row: map_pred_val_to_week_month_year(_row))
    # _temp_df_rdd_mapped = _temp_df.map(lambda _row: _row)
    # print _temp_df_rdd_mapped.take(1)

    # cust_pdt_week_month_window = Window \
    #     .partitionBy(_final_df_stage.customernumber, _final_df_stage.mat_no) \
    #     .orderBy(_final_df_stage.mdl_bld_dt) \
    #     .rangeBetween(-sys.maxsize, sys.maxsize)

    # TODO: Update 2 here to account for cutoff_date modification as discussed.
    _temp_df_flat = sqlContext.createDataFrame(_temp_df_rdd_mapped,
                                               schema=_pred_val_to_week_month_year_schema()) \
        .repartition(REPARTITION_VAL) \
        .select(col('customernumber'), col('mat_no'), col('week_month_key'), col('pred_val'),
                col('pdt_cat'), col('delivery_date'),
                col('mod_last_delivery_date').alias('last_delivery_date'),
                col('mdl_bld_dt'), col('cutoff_date'), col('scale_denom'),
                (max(udf(string_to_gregorian, DateType())(col('mdl_bld_dt'))))
                .over(window=Window.partitionBy("customernumber", "mat_no", "week_month_key")
                      .orderBy("mdl_bld_dt").rangeBetween(-sys.maxsize, sys.maxsize))
                .cast(StringType()).alias('updt_mdl_bld_dt')) \
        .filter(col('mdl_bld_dt') == col('updt_mdl_bld_dt')) \
        .withColumn('_partial_diff',
                    udf(_get_partial_date_diff, IntegerType())(col('week_month_key'),
                                                               col('pdt_cat'),
                                                               col('delivery_date'),
                                                               col('last_delivery_date'))) \
        .withColumn('_partial_quantity_temp',
                    (col('pred_val') / col('scale_denom')) * col('_partial_diff')) \
        .withColumn('_partial_quantity', round(col('_partial_quantity_temp'), 2)) \
        .withColumn('identification_params',
                    udf(_get_string_from_cols, StringType())(col('week_month_key'),
                                                             col('pred_val'),
                                                             col('pdt_cat'),
                                                             col('mdl_bld_dt'),
                                                             col('cutoff_date'),
                                                             col('_partial_diff'),
                                                             col('_partial_quantity'))) \
        .drop(col('_partial_quantity_temp')) \
        .drop(col('mdl_bld_dt'))

    print("temp_df_flat")
    # _temp_df_flat.show()

    _remainder_df = _get_remainder(sqlContext=sqlContext, testing=TESTING) \
        .repartition(REPARTITION_VAL)

    print("remainder_df")
    # _remainder_df.show()

    print("result_df_stage")
    result_df_stage = _temp_df_flat \
        .repartition(REPARTITION_VAL) \
        .select(col('customernumber'), col('mat_no'), col('delivery_date'),
                col('_partial_quantity'), col('identification_params')) \
        .withColumn('order_date', lit(order_date)) \
        .groupBy(col('customernumber'), col('mat_no'), col('order_date'), col('delivery_date')) \
        .agg(round(sum(col('_partial_quantity')), 2).alias('quantity_stg'),
             collect_list(col('identification_params')).alias('identification_param_set'))

    # result_df_stage.show(5)
    # result_df_stage.printSchema()

    print("Writing raw invoice to HDFS")
    result_df_stage \
        .write.mode('append') \
        .format('orc') \
        .option("header", "false") \
        .save(_PREDICTION_LOCATION)

    result_df_stage_remainder_df_cond = [
        result_df_stage.customernumber == _remainder_df.customernumber,
        result_df_stage.mat_no == _remainder_df.mat_no
    ]

    result_df = result_df_stage \
        .join(_remainder_df, on=result_df_stage_remainder_df_cond, how='left_outer') \
        .repartition(REPARTITION_VAL) \
        .select(result_df_stage.customernumber,
                result_df_stage.mat_no,
                result_df_stage.order_date,
                result_df_stage.delivery_date,
                result_df_stage.quantity_stg,
                _remainder_df.remainder.alias('carryover')) \
        .na.fill(0.0) \
        .withColumn('quantity_temp',
                    round((col('quantity_stg') + col('carryover')), 2).cast(FloatType())) \
        .withColumn('quantity', floor(col('quantity_temp')).cast(FloatType())) \
        .withColumn('remainder',
                    round((col('quantity_temp') - col('quantity')), 2).cast(FloatType())) \
        .select(col('customernumber'), col('mat_no'), col('order_date'), col('delivery_date'),
                col('quantity'), col('remainder'))
    # .withColumn('carryover', when(col('carryover_stg') == None, 0.0)
    #             .otherwise(col('carryover_stg')).cast(FloatType())) \
    # .drop(col('carryover_stg')) \

    print("result_df")
    # result_df.cache()
    # result_df.show()
    # print(result_df.count())

    print("Writing invoice to HDFS")
    result_df \
        .write.mode('append') \
        .format('orc') \
        .option("header", "false") \
        .save(FINAL_PREDICTION_LOCATION)
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv') \
            .options(treatEmptyValuesAsNulls='true', nullValue='null') \
            .load(opts.fname, schema=schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)
        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')
                        .load(file_path, schema=schema_def)
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parse additional data (adding to the given data: group name, node kind,
    # acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
        .withColumn("node_kind", nodef(pdf.node_id)) \
        .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
        .withColumn("acquisition_era",
                    when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",
                         lit("null"))
                    .otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
        .withColumn("data_tier",
                    when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",
                         lit("null"))
                    .otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None, None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)
        result = results[0]

        # 1 for all dates generate the interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())
        interval_end = udf(lambda x: boundic[x][1], StringType())

        # 2 group data by block, node, interval and the last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group) \
            .orderBy(idf.now.desc())
        idf = idf.withColumn("row_number", row_number().over(win))  # rowNumber() in Spark 1.x
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0)) \
            .withColumn(result, when(idf.now == interval_end(idf.interval_group),
                                     getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        # 3 create intervals that do not exist but have a negative delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) &
                         (adf.interval_group != max_interval)) \
            .withColumn("interval_group", adf.interval_group + 1) \
            .withColumn(result, lit(0)) \
            .drop(adf.interval_group_aft)

        # 4 join data frames
        idf = rdf.unionAll(hdf)

        # 5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta",
                             getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        # 6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
            .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group) \
            .agg(sum(ddf.delta_plus).alias("delta_plus"),
                 sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name,
                               interval_end(aggres.interval_group).alias("date"),
                               aggres.delta_plus,
                               aggres.delta_minus)
    else:
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv') \
                .options(header='true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
def ordered_window(self):
    return Window.partitionBy('id').orderBy('v')
def unpartitioned_window(self):
    return Window.partitionBy()
def unbounded_window(self):
    return Window.partitionBy('id') \
        .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
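# These window factories read like test fixtures; a minimal sketch of how such
# a spec is typically consumed (the toy DataFrame is an assumption):
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ["id", "v"])

# Over an unbounded window every row of a partition sees the same aggregate.
w = Window.partitionBy('id').rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing)
df.withColumn('mean_v', F.mean('v').over(w)).show()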
def main(inputFile, outputFile):
    def getItemName(payload):
        payload = json.loads(payload)
        return payload['item_name'].split('.')[0]

    item_udf = udf(lambda payload: getItemName(payload))

    df = spark.read.parquet(inputFile + '/*')

    # DF for Item Play Started
    start = df.filter(col('event').isin(['ITEM_PLAY_STARTED']))
    start = start.withColumn('Content Name', item_udf(start['payload']))  # get content name
    # turn the Time Start string into an easy-to-use timestamp object
    start = start.withColumn('Time Start', df['time'].cast("timestamp"))
    # select just the columns we need
    cols = ["device_id", "Content Name", "Time Start"]
    start = start.select(*cols)

    def getEndOfStream(payload):
        payload = json.loads(payload)
        if "did_reach_end_of_stream" in payload:
            return payload['did_reach_end_of_stream']
        else:
            return "false"

    stream_end_UDF = udf(lambda x: getEndOfStream(x))

    # DF for Item Play Finished
    finished = df.filter(col('event').isin(['ITEM_PLAY_FINISHED']))
    finished = finished.withColumn('Content Name', item_udf(finished['payload']))  # get content name
    finished = finished.withColumn('reach_end_of_stream',
                                   stream_end_UDF(finished['payload']))  # get did_reach_end_of_stream
    finished = finished.withColumn("reach_end_of_stream",
                                   F.trim(col("reach_end_of_stream")))  # get rid of whitespace

    # convert "true"/"false" strings to actual boolean values
    finished = finished.withColumn(
        'reach_end_of_stream',
        F.when(F.col("reach_end_of_stream") == "true", True).otherwise(False))

    # turn the Time End string into an easy-to-use timestamp object
    finished = finished.withColumn('Time End', df['time'].cast("timestamp"))

    # select just the columns we need
    cols = ["device_id", "Content Name", "Time End", "reach_end_of_stream"]
    finished = finished.select(*cols)

    # combine the two dataframes for our transformed schema
    transformed = start.join(finished, on=["device_id", "Content Name"], how='left_outer')

    # make sure Time Start is before Time End
    transformed = transformed.where(col("Time Start") <= col("Time End"))

    # convert timestamps to unix
    # transformed = transformed.withColumn('Time Start', F.unix_timestamp('Time Start'))
    # transformed = transformed.withColumn('Time End', F.unix_timestamp('Time End'))

    # get the correct Time End: the first item of the ordered list is the smallest datetime
    def getEndTime(end_time_list):
        return end_time_list[0]

    end_time_udf = udf(getEndTime, T.TimestampType())

    df = transformed.withColumn(
        "end_time_list",
        F.collect_list("Time End").over(
            Window.partitionBy("device_id", 'Content Name', 'Time Start', "reach_end_of_stream")
            .orderBy('Time End')))
    df = df.groupBy('device_id', 'Time Start', 'Content Name', "reach_end_of_stream") \
        .agg(F.max('end_time_list').alias('end_time_list'))
    # still gets laggy here running the udf that takes the first item of the list
    # (a.k.a. the smallest datetime)
    df = df.withColumn('Time End', end_time_udf("end_time_list"))
    df = df.drop('end_time_list')

    # rename columns + reorder
    df = df.withColumnRenamed("Time Start", "start") \
        .withColumnRenamed("Time End", "end") \
        .withColumnRenamed("Content Name", "item_name")
    df = df.select("device_id", "item_name", "start", "end", "reach_end_of_stream")

    df.write.parquet(outputFile)  # write onto output Parquet
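# The comment above flags the collect_list + UDF step as slow; a sketch of an
# alternative (same assumed column names, not the author's code) that takes the
# earliest matching 'Time End' with a plain aggregate instead:
df = transformed.groupBy(
    'device_id', 'Time Start', 'Content Name', 'reach_end_of_stream'
).agg(F.min('Time End').alias('Time End'))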
def predict_using_cosine_similarity(self, input_df, predict_course_df=None):
    # preprocess input data
    # print("begin predict using cosine similarity")
    encoded_df = self.indexer_model.transform(input_df)
    normalize_rating_udf = udf(lambda p: 0.0 if p > 10 else p, DoubleType())
    encoded_df = encoded_df.withColumn(
        self.rating_col_name,
        normalize_rating_udf(encoded_df[self.rating_col_name]))

    # get predict course df (remaining courses)
    if predict_course_df is None:
        predict_course_df_predict = encoded_df.join(
            self.item_index_df,
            encoded_df[self.item_col_name_index] != self.item_index_df[self.item_col_name_index]) \
            .select(self.user_col_name, self.item_col_name_index)
    else:
        predict_course_df = self.indexer_model.transform(predict_course_df)
        predict_course_df_predict = predict_course_df.drop(self.rating_col_name)

    # get all values that can participate in evaluating the final score
    similarity_score_df = encoded_df.join(
        self.item_similarity,
        encoded_df[self.item_col_name_index] == self.item_similarity['id1']) \
        .select(self.user_col_name, self.rating_col_name, 'id1', 'id2', 'similarity')
    # .withColumnRenamed(self.user_col_name, "user_name_similarity")
    # encoded_df[self.item_col_name_index] == self.item_similarity['id2'])
    # can delete this part if duplicate id1, id2 pairs are allowed
    # def predict(student, course, similarity_score_df):
    #     # get the first 5 courses the student already attended that are the most
    #     # relevant to the current course
    #     relevant_df = similarity_score_df.filter(
    #         similarity_score_df[self.user_col_name] == student and
    #         similarity_score_df['id2'] == course) \
    #         .orderBy('similarity', ascending=False) \
    #         .head(5)
    #     relevant_df = relevant_df.withColumn(
    #         'score', relevant_df[self.rating_col_name] * relevant_df['similarity'])
    #     return relevant_df.select(
    #         spark_func.avg(relevant_df['score']).alias('avg')).collect()[0]['avg']
    #     # need to check again if avg is enough

    def predict(list_score, list_similarity):
        sum_simi = sum(list_similarity)
        if sum_simi == 0:
            return 0.0
        return sum([list_score[i] * list_similarity[i]
                    for i in range(len(list_score))]) / sum_simi

    predict_udf = udf(predict, DoubleType())

    window = Window.partitionBy([
        spark_func.col(self.user_col_name),
        spark_func.col(self.item_col_name_index)
    ]).orderBy(spark_func.col('similarity').desc())

    predict_course_df_predict = predict_course_df_predict.join(
        similarity_score_df.withColumnRenamed("id2", self.item_col_name_index),
        [self.item_col_name_index, self.user_col_name]) \
        .select("*", spark_func.rank().over(window).alias("rank")) \
        .filter(spark_func.col("rank") <= 7) \
        .groupby(self.user_col_name, self.item_col_name_index) \
        .agg(spark_func.collect_list(self.rating_col_name).alias("list_score"),
             spark_func.collect_list("similarity").alias("list_similarity"))

    predict_course_df_predict = predict_course_df_predict.withColumn(
        "prediction",
        predict_udf(spark_func.col("list_score"), spark_func.col("list_similarity")))

    if predict_course_df is not None and self.rating_col_name in predict_course_df.columns:
        predict_course_df_predict = predict_course_df_predict.join(
            predict_course_df, [self.user_col_name, self.item_col_name_index])

    return predict_course_df_predict
# Retrieve the types matrix as a dataframe
# df_types = dtf.typesToDf()
# Spark DataFrame
# df_types = spark.createDataFrame(df_types)

# Retrieve the similarity matrix as a dataframe
# df_similarities = dtf.similaritiesToDf()
# Spark DataFrame
# df_similarities = spark.createDataFrame(df_similarities)

# Retrieve the placeTypes matrix as a dataframe
# df_placeTypes = dtf.placeTypesToDf()
# Spark DataFrame
# df_placeTypes = spark.createDataFrame(df_placeTypes)

# Retrieve the placesSimilarity matrix as a dataframe
# Spark DataFrame
# placesSimilarities = spark.read.format('csv').option('header', 'true').load('../data/placesSimilarities.csv')

windowSpec = Window.orderBy("ind")

df_cities = dtf.citiesToDf()
df_types = dtf.typesToDf()
df_placeTypes = dtf.placeTypesToDf()
df_similarities = dtf.similaritiesToDf()
df_places = dtf.placesToDf()
sc_places = spark.createDataFrame(df_places)

user_tags = []
d = {'Visits': []}
df_visits = pd.DataFrame(data=d)
d = {'City_id': []}
df_city = pd.DataFrame(data=d)
df_types = dtf.typesToDf()

# n = len(tab_tags)
n_pT = len(df_placeTypes)

# Add city_id to the matrix
def preprocess(spark, file_path, percent=None):
    '''
    Split data into train, val and test sets.
    file_path: the directory from which the preprocessed parquet file is read and where
        the split parquet files are stored, e.g., hdfs:/user/ym1970/.
    percent: the downsampling ratio, such as 1, 5 or 25, as suggested in the instructions.
    '''
    df = spark.read.parquet(file_path + 'interactions_preprocessed_repa.parquet')

    print('Start downsampling...')
    if percent:
        df_downsample_id, _ = [
            i for i in df.select('user').distinct().randomSplit(
                [percent / 100, 1 - percent / 100], 123)
        ]
        df = df.join(df_downsample_id, 'user', 'left_semi')

    print('Start splitting...')
    df_train_id, df_val_id, df_test_id = [
        i for i in df.select('user').distinct().randomSplit([0.6, 0.2, 0.2], 123)
    ]

    print('Select records based on user id...')
    df_train = df.join(df_train_id, 'user', 'left_semi')
    df_val = df.join(df_val_id, 'user', 'left_semi')
    df_test = df.join(df_test_id, 'user', 'left_semi')

    print('Sample user id to be moved to train...')
    window = Window.partitionBy('user').orderBy('item')
    df_val_window = (df_val.select(
        'user', 'item', 'rating',
        F.row_number().over(window).alias('row_number')))
    df_test_window = (df_test.select(
        'user', 'item', 'rating',
        F.row_number().over(window).alias('row_number')))

    print('Move to train...')
    df_val_to_train = df_val_window.filter(
        df_val_window.row_number % 2 == 1).select('user', 'item', 'rating')
    df_test_to_train = df_test_window.filter(
        df_test_window.row_number % 2 == 1).select('user', 'item', 'rating')
    df_train = df_train.union(df_val_to_train).union(df_test_to_train)
    df_val = df_val.subtract(df_val_to_train)
    df_test = df_test.subtract(df_test_to_train)

    print('Delete based on book id...')
    df_val = df_val.join(df_train, 'item', 'left_semi')
    df_test = df_test.join(df_train, 'item', 'left_semi')

    print('Write to parquet...')
    df_train.write.parquet(file_path + 'interactions_train_' + str(percent) + '.parquet')
    df_val.write.parquet(file_path + 'interactions_val_' + str(percent) + '.parquet')
    df_test.write.parquet(file_path + 'interactions_test_' + str(percent) + '.parquet')
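# The row_number-modulo trick above moves the odd-numbered interactions of each
# user into train; the same pattern in isolation (toy data and column names are
# assumptions):
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 'a', 5.0), (1, 'b', 3.0), (1, 'c', 4.0)], ['user', 'item', 'rating'])

w = Window.partitionBy('user').orderBy('item')
numbered = df.withColumn('row_number', F.row_number().over(w))
to_train = numbered.filter(numbered.row_number % 2 == 1).select('user', 'item', 'rating')
held_out = df.subtract(to_train)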
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    is_dev = True
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    # ------------------------------------------------------------------------------------------------------------------#
    # ------------------------------------------------------------------------------------------------------------------#
    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = f.udf(concaText, StringType())

    def convertStudentIdToLong(student_id):
        try:
            return long(student_id)
        except:
            return 0L

    convertStudentIdToLong = f.udf(convertStudentIdToLong, LongType())

    # ------------------------------------------------------------------------------------------------------------------#

    ##################################################
    # Fetch technical-test data from the student_technical_test table
    dyf_datasourceTech = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa",
        table_name="student_technical_test_odin")

    # print('dyf_datasourceTech')
    # dyf_datasourceTech.printSchema()

    # Select the required fields
    dyf_datasourceTech = dyf_datasourceTech.select_fields([
        '_key', 'thoigianhenktkt', 'ketluan', 'emailhocvien', 'dauthoigian',
        'emailadvisor', 'nguoitiepnhan', 'trinhdohocvien'
    ])

    dyf_datasourceTech = dyf_datasourceTech.resolveChoice(specs=[('_key', 'cast:long')])

    if (dyf_datasourceTech.count() > 0):
        dyf_datasourceTech = Filter.apply(
            frame=dyf_datasourceTech,
            f=lambda x: x["emailhocvien"] is not None and x["emailhocvien"] != ''
                        and x["thoigianhenktkt"] is not None and x["thoigianhenktkt"] != ''
                        and x["ketluan"] == 'Pass')

    dyf_datasourceTech_numeber = dyf_datasourceTech.count()
    print("Count data 2: ", dyf_datasourceTech_numeber)
    if dyf_datasourceTech_numeber < 1:
        return

    dy_datasourceTech = dyf_datasourceTech.toDF()
    dy_datasourceTech = dy_datasourceTech.limit(100)

    print('dy_datasourceTech')
    dy_datasourceTech.printSchema()

    dy_datasourceTech = dy_datasourceTech.withColumn(
        'thoigianhenktkt_id',
        f.unix_timestamp('thoigianhenktkt', 'yyyy-MM-dd HH:mm:ss').cast('long'))

    print('dy_datasourceTech__2')
    dy_datasourceTech.printSchema()

    # get the first activation time
    w2 = Window.partitionBy("emailhocvien").orderBy(f.col("thoigianhenktkt_id").desc())
    dy_datasourceTech = dy_datasourceTech.withColumn("row", f.row_number().over(w2)) \
        .where(f.col('row') <= 1)

    print('dy_datasourceTech__3')
    dy_datasourceTech.printSchema()

    # --------------------------------------------------------------------------------------------------------------#
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor",
        table_name="student_contact")

    # select the fields
    dyf_student_contact = dyf_student_contact.select_fields(
        ['_key', 'contact_id', 'student_id', 'user_name'])

    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != ''
                    and x["student_id"] is not None and x["student_id"] != ''
                    and x["user_name"] is not None and x["user_name"] != '')

    dyf_student_contact_number = dyf_student_contact.count()
    print('dyf_student_contact_number::number: ', dyf_student_contact_number)
    if dyf_student_contact_number < 1:
        return

    dy_student_contact = dyf_student_contact.toDF()
    dy_student_contact.dropDuplicates(['student_id'])

    dy_join_teach_concat = dy_datasourceTech.join(
        dy_student_contact,
        dy_datasourceTech.emailhocvien == dy_student_contact.user_name)

    print('dyf_join_teach_concat::schema')
    dy_join_teach_concat.printSchema()

    join_teach_concat_number = dy_join_teach_concat.count()
    print('join_teach_concat_number::number: ', join_teach_concat_number)
    if join_teach_concat_number < 1:
        return

    # --------------------------------------------------------------------------------------------------------------#
    dyf_student_package_status = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior",
        table_name="student_status")

    dyf_student_package_status = dyf_student_package_status \
        .select_fields(['contact_id', 'status_code', 'start_date', 'end_date']) \
        .rename_field('contact_id', 'contact_id_ps')

    print('dyf_student_package_status::drop_duplicates')
    df_student_package_status = dyf_student_package_status.toDF()
    print('dyf_student_package_status::drop_duplicates::before: ', df_student_package_status.count())
    df_student_package_status = df_student_package_status.drop_duplicates()
    print('dyf_student_package_status::drop_duplicates::after: ', df_student_package_status.count())

    print('dy_student_package_status')
    df_student_package_status.printSchema()

    # --------------------------------------------------------------------------------------------------------------#
    dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior",
        table_name="student_package")

    print('dyf_student_package__0')
    dyf_student_package.printSchema()

    dyf_student_package = dyf_student_package \
        .select_fields(['student_id', 'package_code', 'start_time', 'end_time']) \
        .rename_field('student_id', 'student_id_pk')

    # --------------------------------------------------------------------------------------------------------------#
    print('dyf_student_package__1')
    dyf_student_package.printSchema()

    dyf_student_package = dyf_student_package.resolveChoice(
        specs=[('start_time', 'cast:long'), ('end_time', 'cast:long')])

    print('dyf_student_package__2')
    dyf_student_package.printSchema()

    df_student_package = dyf_student_package.toDF()
    print('df_student_package::drop_duplicates::before: ', df_student_package.count())
    df_student_package = df_student_package.drop_duplicates()
    print('df_student_package::drop_duplicates::after: ', df_student_package.count())

    print('df_student_package')
    df_student_package.printSchema()
    df_student_package.show(3)

    df_student_package_number = df_student_package.count()
    print('df_student_package_number: ', df_student_package_number)

    # --------------------------------------------------------------------------------------------------------------#
    # --------------------------------------------------------------------------------------------------------------#
    dy_join_teach_concat_number = dy_join_teach_concat.count()
    print('dy_join_teach_concat_number: ', dy_join_teach_concat_number)

    join_result = dy_join_teach_concat \
        .join(df_student_package_status,
              (dy_join_teach_concat.contact_id == df_student_package_status.contact_id_ps)
              & (dy_join_teach_concat.thoigianhenktkt_id >= df_student_package_status.start_date)
              & (dy_join_teach_concat.thoigianhenktkt_id < df_student_package_status.end_date),
              'left') \
        .join(df_student_package,
              (dy_join_teach_concat.student_id == df_student_package.student_id_pk)
              & (dy_join_teach_concat.thoigianhenktkt_id >= df_student_package.start_time)
              & (dy_join_teach_concat.thoigianhenktkt_id < df_student_package.end_time),
              'left')

    print('join_result')
    join_result.printSchema()

    join_result_number = join_result.count()
    print('join_result_number: ', join_result_number)
    if join_result_number < 1:
        return
    join_result.show(3)

    student_id_unavailable = 0L
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    package_code_unavailable = 'UNAVAILABLE'
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    measure1_unavailable = 0
    measure2_unavailable = 0
    measure3_unavailable = 0
    measure4_unavailable = float(0.0)

    # join_result = join_result.withColumnRenamed('student_id', 'student_id_a')

    join_result = join_result.select(
        join_result.thoigianhenktkt_id.alias('student_behavior_date'),
        f.lit(5L).alias('behavior_id'),
        'student_id',
        join_result.contact_id.alias('contact_id'),
        join_result.package_code.alias('package_code'),
        join_result.end_time.cast('long').alias('package_endtime'),
        join_result.start_time.cast('long').alias('package_starttime'),
        join_result.trinhdohocvien.cast('string').alias('student_level_code'),
        join_result.status_code.cast('string').alias('student_status_code'),
        f.lit(today_second).cast('long').alias('transformed_at'),
    )

    join_result = join_result.na.fill({
        'package_code': package_code_unavailable,
        'package_endtime': package_endtime_unavailable,
        'package_starttime': package_starttime_unavailable,
        'student_level_code': student_level_code_unavailable,
        'student_status_code': student_status_code_unavailable
    })

    print('join_result--1')
    join_result.printSchema()
    join_result.show(1)

    join_result = join_result.withColumn(
        'student_behavior_id',
        f.md5(concaText(
            join_result.student_behavior_date,
            join_result.behavior_id,
            join_result.student_id,
            join_result.contact_id,
            join_result.package_code,
            join_result.package_endtime,
            join_result.package_starttime,
            join_result.student_level_code,
            join_result.student_status_code,
            join_result.transformed_at)))

    print('join_result--2')
    join_result.printSchema()
    join_result.show(5)

    dyf_join_result = DynamicFrame.fromDF(join_result, glueContext, 'dyf_join_result')

    dyf_join_result = Filter.apply(
        frame=dyf_join_result,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

    apply_ouput = ApplyMapping.apply(
        frame=dyf_join_result,
        mappings=[("student_behavior_id", "string", "student_behavior_id", "string"),
                  ("student_behavior_date", "long", "student_behavior_date", "long"),
                  ("behavior_id", "long", "behavior_id", "long"),
                  ("student_id", "string", "student_id", "long"),
                  ("contact_id", "string", "contact_id", "string"),
                  ("package_code", "long", "package_code", "string"),
                  ("package_endtime", "long", "package_endtime", "long"),
                  ("package_starttime", "long", "package_starttime", "long"),
                  ("student_level_code", "string", "student_level_code", "string"),
                  ("student_status_code", "string", "student_status_code", "string"),
                  ("transformed_at", "long", "transformed_at", "long")])

    dfy_output = ResolveChoice.apply(frame=apply_ouput, choice="make_cols",
                                     transformation_ctx="resolvechoice2")

    glueContext.write_dynamic_frame.from_options(
        frame=dfy_output,
        connection_type="s3",
        connection_options={
            "path": "s3://dtsodin/student_behavior/student_behavior",
            "partitionKeys": ["behavior_id"]
        },
        format="parquet")
def create_test_data():
    df_train = spark.read.parquet(os.path.join("datasets", "train.parquet"))
    df_train.createOrReplaceTempView("data")
    max_id = spark.sql("""
        SELECT max(uid) as m FROM data
    """).first().m
    print("max_id", max_id)

    df_result = pd.read_csv(os.path.join("datasets", "sample_submission.csv"))
    files = list(df_result["seg_id"].values)
    schema = StructType([StructField("x", DoubleType(), True)])

    # print a progress ruler: one mark per 20 segments, '|' every 200
    seg = 0
    for file in files:
        sep = "."
        if seg % 200 == 0:
            sep = "|"
        if seg % 20 == 0:
            print(sep, end="", flush=True)
        seg += 1
    print("", end="\n", flush=True)

    seg = 0
    df_test = None
    for file in files:
        # print(file)
        if seg % 20 == 0:
            print("|", end="", flush=True)
        w1 = Window.orderBy("uid")
        w2 = Window.partitionBy("seg").orderBy("uid")
        df_temp = spark.read.csv(
            os.path.join("datasets", "test", file + ".csv"),
            header=True, schema=schema) \
            .withColumn("y", lit(None).cast(DoubleType())) \
            .withColumn("uid", lit(max_id + 1) + monotonically_increasing_id()) \
            .withColumn("idx", row_number().over(w1).cast(IntegerType())) \
            .withColumn("seg", lit(seg).cast(IntegerType())) \
            .withColumn("no", row_number().over(w2).cast(IntegerType())) \
            .withColumn("name", (lit(file.split(".")[0])).cast(StringType())) \
            .withColumn("set", lit(1))
        df_temp.createOrReplaceTempView("data")
        df_temp_f = spark.sql("""
            SELECT uid, set, seg, no, name, x, y
            FROM data
            ORDER BY set, seg, no, uid
        """)
        max_id = spark.sql("""
            SELECT max(uid) as m FROM data
        """).first().m
        seg += 1
        if df_test is None:
            df_test = df_temp_f
        else:
            df_test = df_test.union(df_temp_f)
        # create 1 file per 20 segments = I had issues when processing all in one go
        if seg % 20 == 0:
            file_name = "test_1_{:04}.parquet".format(seg)
            df_test = df_test.repartition(1)
            df_test.write.parquet(os.path.join("datasets", file_name))
            df_test = None
            # if seg == 4: break
            print("(", end="", flush=True)

    # anything left under a batch of 20
    if df_test is not None:
        file_name = "test_1_{:04}.parquet".format(seg)
        df_test = df_test.repartition(1)
        df_test.write.parquet(os.path.join("datasets", file_name))
        df_test = None
    print("x)", end="\n", flush=True)
    print("max_id", max_id)

    df_result = pd.read_csv(os.path.join("datasets", "sample_submission.csv"))
    files = list(df_result["seg_id"].values)

    seg = 0
    for file in files:
        sep = "."
        if seg % 200 == 0:
            sep = "|"
        if seg % 20 == 0:
            print(sep, end="", flush=True)
        seg += 1
    print("", end="\n", flush=True)

    seg = 0
    mode = "overwrite"
    for file in files:
        if seg % 20 == 0:
            print("|", end="", flush=True)
        seg += 1
        if seg % 20 == 0:
            file_name = "test_1_{:04}.parquet".format(seg)
            df_test = spark.read.parquet(os.path.join("datasets", file_name))
            df_test.write.mode(mode).parquet(os.path.join("datasets", "test.parquet"))
            mode = "append"
            print("(", end="", flush=True)

    # anything left under a batch of 20
    if seg % 20 != 0:
        file_name = "test_1_{:04}.parquet".format(seg)
        df_test = spark.read.parquet(os.path.join("datasets", file_name))
        df_test.write.mode(mode).parquet(os.path.join("datasets", "test.parquet"))
        mode = "append"
        print("x", end="", flush=True)
    print(")", end="\n", flush=True)

    seg = 0
    for file in files:
        sep = "."
        if seg % 200 == 0:
            sep = "|"
        if seg % 20 == 0:
            print(sep, end="", flush=True)
        seg += 1
    print("", end="\n", flush=True)

    seg = 0
    for file in files:
        if seg % 20 == 0:
            print("|", end="", flush=True)
        seg += 1
        if seg % 20 == 0:
            file_name = "test_1_{:04}.parquet".format(seg)
            shutil.rmtree(os.path.join("datasets", file_name))
            print("(", end="", flush=True)

    # anything left under a batch of 20
    if seg % 20 != 0:
        file_name = "test_1_{:04}.parquet".format(seg)
        shutil.rmtree(os.path.join("datasets", file_name))
        print("x", end="", flush=True)
    print(")", end="\n", flush=True)
def with_window_column(df):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import percent_rank
    windowSpec = Window.partitionBy(df['id']).orderBy(df['forecast'])
    return df.withColumn("r", percent_rank().over(windowSpec))
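# A minimal usage sketch for the helper above (the toy data and SparkSession
# setup are assumptions):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 10.0), (1, 20.0), (1, 30.0), (2, 5.0)], ["id", "forecast"])

# percent_rank() assigns 0.0 to the first row of each partition and 1.0 to the
# last, interpolating in between.
with_window_column(df).show()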
    return (x1 + x2) / 2

multiple_col_avg_udf = udf(multiple_col_avg, DoubleType())

# using df
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(studentMarksData, ["id", "year", "score1", "score2"])
df.show()

avg_score_by_id_year = df.withColumn(
    "avg_marks",
    multiple_col_avg_udf(col("score1").cast(IntegerType()),
                         col("score2").cast(IntegerType())))
avg_score_by_id_year.show(truncate=False)

# Calculate the top students using the rank function
# (use case where we just need to calculate over the whole DF)
print("Calculate Top student using rank function")
windowSpec = Window.orderBy(col("avg_marks"))
students_with_rank = avg_score_by_id_year.withColumn("rank", rank().over(windowSpec))
students_with_rank.show()

sec_year_students_with_rank = avg_score_by_id_year.filter(col("year") == "year2") \
    .withColumn("rank", rank().over(windowSpec))
sec_year_students_with_rank.filter(col("rank") <= 3).show()

sec_year_students_with_row_number = avg_score_by_id_year.filter(col("year") == "year2") \
    .withColumn("row_number", row_number().over(windowSpec))
sec_year_students_with_row_number.show()

# Calculate the top students using the rank function (use case where we need to
# divide/partition the data on some column, for example by year, or any other condition)
print("Window By Partition ")
windowSpec = Window.partitionBy(col("year")).orderBy(col("avg_marks").desc())
students_by_yearly_rank = avg_score_by_id_year.withColumn("rank", rank().over(windowSpec))
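# The snippet stops after building students_by_yearly_rank; a plausible
# follow-up (an assumption, not from the source) showing the top three
# students per year:
students_by_yearly_rank.filter(col("rank") <= 3).show()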
def create_dataset(self, sqlContext):
    for table in self.tables_dict:
        # if (self.omop_ver == "6.0" and table == "person"):
        #     self.tables_dict[table].append("death_datetime")
        self.tables_to_join[table] = self.loaded_tables[table].select(
            self.tables_dict[table])

    self.tables_to_join["history_aids"] = (
        self.tables_to_join["note_nlp"]
        .where((col("lexical_variant").rlike("(?i)history"))
               & ((col("lexical_variant").rlike("(?i)aids"))
                  | (col("lexical_variant").rlike("(?i)immunodeficiency"))
                  | (col("lexical_variant").rlike("(?i)acquired immune deficiency syndrome")))
               & (~col("lexical_variant").rlike("(?i)family")))
        .select(col("note_id")).drop_duplicates()
        .withColumn("history_aids", lit(1))
        .join(self.tables_to_join["note"], how="left", on="note_id")
        .select(col("person_id"), col("history_aids"), col("note_date"))
        .groupby(col("person_id"), col("history_aids"))
        .agg(F.min("note_date").alias("history_aids_min_date")))

    self.tables_to_join["history_metastases"] = (
        self.tables_to_join["note_nlp"]
        .where((col("lexical_variant").rlike("(?i)history"))
               & ((col("lexical_variant").rlike("(?i)metastases"))
                  | (col("lexical_variant").rlike("(?i)metastatic"))
                  | (col("lexical_variant").rlike("(?i)metastasis"))
                  | (col("lexical_variant").rlike("(?i)secondary tumor"))
                  | (col("lexical_variant").rlike("(?i)secondary cancer")))
               & (~col("lexical_variant").rlike("(?i)family")))
        .select(col("note_id")).drop_duplicates()
        .withColumn("history_metastases", lit(1))
        .join(self.tables_to_join["note"], how="left", on="note_id")
        .select(col("person_id"), col("history_metastases"), col("note_date"))
        .groupby(col("person_id"), col("history_metastases"))
        .agg(F.min("note_date").alias("history_metastases_min_date")))

    self.tables_to_join["history_hemato"] = (
        self.tables_to_join["note_nlp"]
        .where((col("lexical_variant").rlike("(?i)history"))
               & ((col("lexical_variant").rlike("(?i)myeloma"))
                  | (col("lexical_variant").rlike("(?i)leukemia"))
                  | (col("lexical_variant").rlike("(?i)lymphoma")))
               & (~col("lexical_variant").rlike("(?i)family")))
        .select(col("note_id")).drop_duplicates()
        .withColumn("history_hemato", lit(1))
        .join(self.tables_to_join["note"], how="left", on="note_id")
        .select(col("person_id"), col("history_hemato"), col("note_date"))
        .groupby(col("person_id"), col("history_hemato"))
        .agg(F.min("note_date").alias("history_hemato_min_date")))

    self.tables_to_join["preceding_surgery"] = (
        self.tables_to_join["visit_detail"]
        .where(col("visit_detail_concept_id") == 4149152)
        .select(col("visit_occurrence_id"))
        .withColumn("from_surgery", lit(1))
        .drop_duplicates())

    if self.database == "mimicomop":
        self.tables_to_join["icu_stays"] = (
            self.tables_to_join["visit_occurrence"]
            .join(self.tables_to_join["visit_detail"], how="left", on="visit_occurrence_id")
            .where((col("visit_detail_concept_id") == 32037)
                   & (col("visit_type_concept_id") == 2000000006))
            .select(col("person_id"), col("visit_occurrence_id"),
                    col("visit_detail_id"), col("visit_detail_concept_id"),
                    col("visit_start_date"), col("visit_end_date"),
                    col("discharge_to_concept_id")))
    else:
        self.tables_to_join["icu_stays"] = (
            self.tables_to_join["visit_occurrence"]
            .join(self.tables_to_join["visit_detail"], how="left", on="visit_occurrence_id")
            .join(self.tables_to_join["care_site"]
                  .where(F.lower(F.col("care_site_name")).like("% rea %")
                         | F.lower(F.col("care_site_name")).like("%reanimation%")),
                  how="inner", on="care_site_id")
            .select(col("person_id"), col("visit_occurrence_id"),
                    col("visit_detail_id"), col("visit_detail_concept_id"),
                    col("visit_start_date"), col("visit_end_date"),
                    col("discharge_to_concept_id")))

    window_asc = (Window.partitionBy("person_id", "visit_occurrence_id",
                                     "measurement_concept_id")
                  .orderBy(asc("measurement_datetime")))
    window_desc = (Window.partitionBy("person_id", "visit_occurrence_id",
                                      "measurement_concept_id")
                   .orderBy(desc("measurement_datetime")))

    self.pivot_measures_asc = (
        self.tables_to_join["icu_stays"]
        .join(self.tables_to_join["measurement"], how="left", on="visit_occurrence_id")
        .where((col("measurement_datetime") >= col("visit_start_date"))
               & (col("measurement_datetime") <= col("visit_end_date")))
        .select('*', rank().over(window_asc).alias('rank'))
        .filter(col('rank') == 1)
        .groupBy("visit_occurrence_id")
        .pivot("measurement_name", list(self.measurements.keys()))
        .agg(first("value_as_number", ignorenulls=True)))
    self.pivot_measures_asc = self.rename_columns(
        self.pivot_measures_asc,
        {str(k): str("first_" + k) for k in self.measurements.keys()})

    self.pivot_measures_desc = (
        self.tables_to_join["icu_stays"]
        .join(self.tables_to_join["measurement"], how="left", on="visit_occurrence_id")
        .where((col("measurement_datetime") >= col("visit_start_date"))
               & (col("measurement_datetime") <= col("visit_end_date")))
        .select('*', rank().over(window_desc).alias('rank'))
        .filter(col('rank') == 1)
        .groupBy("visit_occurrence_id")
        .pivot("measurement_name", list(self.measurements.keys()))
        .agg(first("value_as_number", ignorenulls=True)))
    self.pivot_measures_desc = self.rename_columns(
        self.pivot_measures_desc,
        {str(k): str("last_" + k) for k in self.measurements.keys()})

    self.dataset = (
        self.tables_to_join["icu_stays"]
        .join(self.tables_to_join["person"], how="left", on="person_id")
        .join(self.tables_to_join["preceding_surgery"], how="left", on="visit_occurrence_id")
        .withColumn("from_surgery", coalesce("from_surgery", lit(0)))
        .join(self.tables_to_join["history_aids"], how="left", on="person_id")
        .withColumn("history_aids", coalesce("history_aids", lit(0)))
        .join(self.tables_to_join["history_metastases"], how="left", on="person_id")
        .withColumn("history_metastases", coalesce("history_metastases", lit(0)))
        .join(self.tables_to_join["history_hemato"], how="left", on="person_id")
        .withColumn("history_hemato", coalesce("history_hemato", lit(0)))
        .join(self.pivot_measures_asc, how="left", on="visit_occurrence_id")
        .join(self.pivot_measures_desc, how="left", on="visit_occurrence_id"))

    self.add_deathdate(sqlContext)
# Reconstructed query head (assumption): visitor_id and hit_time_gmt must be
# part of the select list because they are used for deduplication below.
events = spark.sql("SELECT visitor_id, \
    hit_time_gmt, \
    file_date, \
    month(file_date) as month, \
    session_id, \
    pageview_event_cnt, \
    video_start_cnt, \
    video_time_spent_secs, \
    browser_typ_dsc, \
    device_type_dsc as device \
    FROM user_business_defined_dataset.cnn_adobe_bdd_web \
    WHERE file_date like %s \
    AND visitor_id != '*UN' AND visitor_id != '-100'" % first_month)
# (both sentinel visitor ids are excluded; an OR of the two inequalities
# would match every row)

events = events.dropDuplicates(["visitor_id", "hit_time_gmt"])
events = events.fillna('desktop', subset=['device'])
events = events.withColumn('device', rename_device(F.col("device")))
events = events.filter(F.col('device') != 'other')

# With the window ordered by hit_time_gmt descending, lag(1) is the *next*
# event in the session, so the difference is the time spent on this event
window = Window.partitionBy(["visitor_id", "session_id"]).orderBy(F.col("hit_time_gmt").desc())
events = events.withColumn(
    "time_on_event",
    (F.lag(events.hit_time_gmt, 1).over(window) - events.hit_time_gmt))
events = events.fillna(0.0, subset=['time_on_event'])

#---------------------------------------------------------------------------------------------------------------------------------------------------
# Get traffic features
visitors_1 = events.groupby(["visitor_id", "device"]).agg(
    F.countDistinct("session_id").alias("visits"),
    F.sum("pageview_event_cnt").alias("pageviews"),
    F.sum("video_start_cnt").alias("videostarts"),
    F.sum("time_on_event").alias("time_on_site"),
    F.sum("video_time_spent_secs").alias("video_playtime"),
    F.first("browser_typ_dsc").alias("browser_typ_dsc"),
    F.countDistinct("file_date").alias("days_visited")).fillna(0.0)

#---------------------------------------------------------------------------------------------------------------------------------------------------
# Split visitors by monthly dropouts vs non-dropouts
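# A minimal toy sketch (not part of the original pipeline) illustrating the
# time_on_event pattern above: with the window ordered by hit_time_gmt
# descending, lag(1) returns the *next* event's timestamp, so the difference
# is the seconds spent on the current event. All names here are illustrative.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("v1", "s1", 100), ("v1", "s1", 130), ("v1", "s1", 190)],
    ["visitor_id", "session_id", "hit_time_gmt"])
w = Window.partitionBy("visitor_id", "session_id").orderBy(F.col("hit_time_gmt").desc())
toy.withColumn("time_on_event",
               F.lag("hit_time_gmt", 1).over(w) - F.col("hit_time_gmt")).show()
# Yields 30 and 60 for the first two hits, and null (filled with 0 in the
# pipeline above) for the session's last hit.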
# #### 5.3 - Dataset (parquet) Preview

# In[27]:

get_ipython().run_line_magic('spark.pyspark', '')

df_pqt.show()

# #### 5.4 - Remove duplicated values

# In[29]:

get_ipython().run_line_magic('spark.pyspark', '')

from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# row_number (rather than rank) guarantees exactly one row per id, even when
# two records share the same update_date
window = Window.partitionBy('id').orderBy(df_pqt['update_date'].desc())
df_pqt = df_pqt.withColumn("last_update", row_number().over(window))
df_final = df_pqt.filter("last_update=1")
df_final.show()

# #### 6 - Final dataset storage

# In[31]:

get_ipython().run_line_magic('spark.pyspark', '')

file_pqt = 'final.parquet'
df_pqt_path = local + out + file_pqt
df_final.write.parquet(df_pqt_path)
from pyspark.sql.window import Window #df_experience = spark.read.format("com.databricks.spark.avro").load("/mnt/S3_final_hour_1/edw.fact_experience_booking/2017/06/10/*/*/", schema=schema_experience) #df_flight = df_flight.select("booking_id").orderBy('booking_id') #display(df_flight) # COMMAND ---------- display(df_flight.select("booking_id","booking_status","modified_at").filter(df_flight.booking_id == '162290951')) # COMMAND ---------- #use window function row_number to dedup booking. import pyspark.sql.functions as func spec = Window.partitionBy(df_flight.booking_id).orderBy(df_flight.modified_at.desc()) row_number = func.row_number().over(spec) df_flight2 = df_flight.select("booking_id",row_number.alias("row_number")) df_flight2 = df_flight2.filter(df_flight2.row_number == 1) display(df_flight2) # COMMAND ---------- df_experience.groupby(df_experience.booking_status).agg({'booking_id':'count'}).show() # COMMAND ---------- df_survey.show(2,truncate = True)
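# COMMAND ----------

# Hedged sketch (assuming the same df_flight, Window and func names as above):
# the complete dedup pattern keeps *every* column of the most recent record
# per booking_id and then drops the helper column.
spec = Window.partitionBy("booking_id").orderBy(func.col("modified_at").desc())
df_flight_dedup = (df_flight
                   .withColumn("row_number", func.row_number().over(spec))
                   .filter(func.col("row_number") == 1)
                   .drop("row_number"))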
def growing_row_window(self): return Window.partitionBy('id').orderBy('v').rowsBetween( Window.unboundedPreceding, 3)
def shrinking_row_window(self): return Window.partitionBy('id').orderBy('v').rowsBetween( -2, Window.unboundedFollowing)
def shrinking_range_window(self): return Window.partitionBy('id').orderBy('v') \ .rangeBetween(-3, Window.unboundedFollowing)
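# A minimal usage sketch (toy data, not from the original source) showing how
# a frame like the growing row window above behaves when an aggregate is
# applied over it; the same pattern works for the shrinking and range frames.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (1, 3.0), (2, 5.0), (2, 7.0)], ["id", "v"])
growing = Window.partitionBy('id').orderBy('v').rowsBetween(
    Window.unboundedPreceding, 3)
# For each row, sums every earlier value plus up to three following rows
df.withColumn("growing_sum", F.sum("v").over(growing)).show()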
from pyspark.sql.functions import udf, lag, lead, col, when
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window


def finite_difference(sparkdf, xaxis, yaxes, window_size, partitionAxis=None,
                      monotonically_increasing=False):
    """
    Calculate the finite difference dY/dX for 1 or more Y=f(X) axes with respect to a single X axis

    :param sparkdf: Input Spark dataframe.
    :param xaxis: Column name for X axis.
    :param yaxes: List of column names for Y axes.
    :param window_size: Width of window over which to calculate the finite difference (in number of points).
    :param partitionAxis: Categorical axis to partition by.
    :param monotonically_increasing: Whether Y axes should be monotonically increasing (e.g. counters).
           If set, negative finite differences in Y will be set to zero.
    :return: A Spark dataframe with one new column per Y axis calculated as dY/dX.
    """
    original_columns = sparkdf.schema.fieldNames()

    # Get the first value, so we can use it to define the initial "previous value"
    first_row = sparkdf.first()
    first_x = first_row[xaxis]

    # Slide over this window
    window = Window.orderBy(xaxis)
    if partitionAxis is not None:
        window = window.partitionBy(partitionAxis)

    # Create functions to calculate the difference and ratio of two columns
    delta_fn = udf(lambda col1, col2: col1 - col2, DoubleType())
    div_fn = udf(lambda col1, col2: col1 / col2 if col2 > 0 else float('nan'),
                 DoubleType())

    # Get delta X
    xaxis_lag = xaxis + '_lag_' + str(window_size)
    xaxis_delta = xaxis + '_delta_' + str(window_size)
    df = (sparkdf
          .withColumn(xaxis, sparkdf[xaxis].cast(DoubleType()))
          .withColumn(xaxis_lag, lag(xaxis, window_size, first_x).over(window))
          .withColumn(xaxis_delta, delta_fn(col(xaxis), col(xaxis_lag)))
          .drop(xaxis_lag))

    # Get delta Y and dY/dX for each Y
    for yaxis in yaxes:
        yaxis_lag = yaxis + '_lag_' + str(window_size)
        yaxis_delta = yaxis + '_delta_' + str(window_size)
        rate = 'rate_' + yaxis + '_over_' + xaxis
        rate_lag = rate + '_lag'
        rate_lead = rate + '_lead'
        first_y = first_row[yaxis]
        df = (df
              .withColumn(yaxis, sparkdf[yaxis].cast(DoubleType()))
              .withColumn(yaxis_lag, lag(yaxis, window_size, first_y).over(window))
              .withColumn(yaxis_delta, delta_fn(col(yaxis), col(yaxis_lag))))
        if monotonically_increasing:
            # Clamp negative deltas to zero; Spark columns cannot be assigned
            # or mapped in place, so use withColumn + when
            df = df.withColumn(
                yaxis_delta,
                when(col(yaxis_delta) < 0, 0.0).otherwise(col(yaxis_delta)))
        df = (df
              .withColumn(rate, div_fn(col(yaxis_delta), col(xaxis_delta)))
              .drop(yaxis_lag).drop(yaxis_delta)
              # Determine when the delta changes (lead and lag by 1)
              .withColumn(rate_lag, lag(rate, 1, 1).over(window))
              .withColumn(rate_lead, lead(rate, 1, 1).over(window))
              # Only keep values where the rate differs from before or after
              .filter(rate + '!=' + rate_lag + ' OR ' + rate + '!=' + rate_lead)
              .drop(rate_lag).drop(rate_lead))

    return df.drop(xaxis_delta)
final = sqlContext.sql("SELECT analytic_id" + query + " FROM cust_trans_final GROUP BY analytic_id ")
final = final.drop('7B20SMS10MMS1D')
final.registerTempTable("final")

# Filter packages to those present in the customer transactions and build an
# index over the package list
df_index = parsed_master_final.where(
    parsed_master_final.New_Package_Name.isin(column_seq)).drop_duplicates(
        subset=['New_Package_Name']).withColumn(
            "id", monotonically_increasing_id())
index_pp = df_index.select(
    "id", "Package_Duration_XS", "Package_Duration_S", "Package_Duration_M",
    "Package_Duration_L", "Package_Duration_XL", "Price_XS", "Price_S",
    "Price_M", "Price_L", "Price_XL", "Package_Size_1Mbps",
    "Package_Size_256Kbps", "Package_Size_2Mbps", "Package_Size_384Kbps",
    "Package_Size_4Mbps", "Package_Size_512Kbps", "Package_Size_64Kbps",
    "Package_Size_6Mbps", "Package_Size_Entertain", "Package_Size_Full speed",
    "Package_Size_Game", "Package_Size_Social", "Package_Size_Time")

# Replace the monotonically-increasing ids with consecutive row numbers
windowSpec = W.orderBy("id")
ontop_rowid = df_index.withColumn("id", row_number().over(windowSpec))
#df_index.select("New_Package_Name","id").withColumn("id", row_number().over(windowSpec)).show()
ontop_rowid2 = ontop_rowid.withColumnRenamed("Package_Size_Full speed",
                                             "Package_Size_Full_speed")
ontop_rowid2.registerTempTable("ontop_rowid2")
index_pp2 = sqlContext.sql(
    "SELECT id-1 as id,Package_Duration_XS,Package_Duration_S,Package_Duration_M,Package_Duration_L,Package_Duration_XL,Price_XS,Price_S,Price_M,Price_L,Price_XL,Package_Size_1Mbps,Package_Size_256Kbps,Package_Size_2Mbps,Package_Size_384Kbps,Package_Size_4Mbps,Package_Size_512Kbps,Package_Size_64Kbps,Package_Size_6Mbps,Package_Size_Entertain,Package_Size_Full_speed,Package_Size_Game,Package_Size_Social,Package_Size_Time FROM ontop_rowid2"
)
package_sort = ontop_rowid.select('New_Package_Name').rdd.map(
    lambda r: r[0]).collect()
query = ''
for c in package_sort:
    query = query + ", " + c
final_selected = sqlContext.sql("SELECT analytic_id" + query + " FROM final")
import sys

# sum and max here are the Spark column functions, shadowing the builtins
from pyspark.sql.functions import col, to_date, unix_timestamp, sum, max, row_number
from pyspark.sql.window import Window

data = spark.read.format('csv') \
    .options(header='true', inferschema='false') \
    .load(sys.argv[1])
data = data.select('Symbol', 'Name', 'Date', 'Log Return') \
    .filter(col('Log Return') != '')
data = data.withColumn(
    'Date',
    to_date(unix_timestamp(col('Date'), 'yyyy-MM-dd').cast('timestamp')))
data = data.withColumn('Log Return', col('Log Return').cast('float'))

# cumulative log return sum
cum_window = Window.partitionBy(data['Symbol']).orderBy(
    data['Date']).rangeBetween(Window.unboundedPreceding, 0)
cum_sum = data.withColumn('CumulativeSum', sum('Log Return').over(cum_window))
cum_sum = cum_sum.orderBy(col('Symbol'), col('Date'))

# total log return sum: take the cumulative sum on each symbol's last date
sum_window = Window.partitionBy('Symbol')
sp_sum = cum_sum.withColumn('MaxDate', max('Date').over(sum_window)) \
    .where(col('Date') == col('MaxDate')) \
    .drop('MaxDate', 'Log Return', 'Date') \
    .withColumnRenamed('CumulativeSum', 'ReturnSum')
sp_sum = sp_sum.select('Symbol', 'Name', 'ReturnSum')

# top 5 drops
top_drop_window = Window.partitionBy(data['Symbol']).orderBy(
    data['Log Return'].asc())
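# The snippet above defines top_drop_window but stops short; a hedged sketch
# of how the top 5 drops per symbol could be extracted (most negative log
# returns rank first, keep the first five rows per symbol). top_drops is an
# illustrative name, not from the original source.
top_drops = data.withColumn('DropRank', row_number().over(top_drop_window)) \
    .filter(col('DropRank') <= 5) \
    .select('Symbol', 'Name', 'Date', 'Log Return')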
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window as W

# Entry point for creating a Spark session
spark = SparkSession.builder.appName("PySpark Test").getOrCreate()

# ==> Reading from a (local) CSV file without needing to define a schema
# ==> df = spark.read.csv(r'./resources/data.csv', header=True)

# Creating a DataFrame from the array values and the defined schema
df = spark.createDataFrame(data=data, schema=schema)

# Creating a new DataFrame that adds a new column converting the string
# field to a date, and drops the original (string) date column
df2 = df.withColumn(
    'Atualizacao',
    F.to_date(
        F.unix_timestamp(
            F.col('Data_Atualizacao'), 'yyyy-MM-dd').cast("timestamp"))).drop('Data_Atualizacao')

# Creating the final DataFrame by adding a new column that ranks rows by
# update date, partitioned by municipality and ordered
df_final = df2.withColumn(
    "Ordem Transacao",
    F.dense_rank().over(
        W.partitionBy("Municipio").orderBy("Atualizacao"))).orderBy(
            "Municipio", "Ordem Transacao")

# Showing the result
df_final.show(truncate=False)
def sliding_row_window(self): return Window.partitionBy('id').orderBy('v').rowsBetween(-2, 1)
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('Window Function').getOrCreate()

data = [("sales", 1, 5000), ("personnel", 2, 3900), ("sales", 3, 4800),
        ("sales", 4, 4800), ("personnel", 5, 3500), ("develop", 7, 4200),
        ("develop", 8, 6000), ("develop", 9, 4500), ("develop", 10, 5200),
        ("develop", 11, 5200)]
columns = ['dept', 'empno', 'salary']
emp_df = spark.createDataFrame(data=data, schema=columns)
emp_df.show()

# A running (cumulative) salary total per department: with an orderBy, the
# default frame runs from the partition start up to the current row
window_spec = Window.partitionBy("dept").orderBy("salary")
cum_salary_by_dept = emp_df \
    .withColumn("cum_salary", sum("salary").over(window_spec))
cum_salary_by_dept.show()

emp_rank_by_dept = emp_df \
    .withColumn("rank", rank().over(Window.partitionBy("dept").orderBy("salary"))) \
    .withColumn("dense_rank", dense_rank().over(Window.partitionBy("dept").orderBy("salary"))) \
    .withColumn("row_number", row_number().over(Window.partitionBy("dept").orderBy("salary"))) \
    .withColumn("max_salary", max("salary").over(Window.partitionBy("dept"))) \
    .withColumn("min_salary", min("salary").over(Window.partitionBy("dept")))
emp_rank_by_dept.show()

emp_rank_by_dept.select("dept", "min_salary", "max_salary").dropDuplicates().show()
from pyspark.sql import functions as func
from pyspark.sql.window import Window


def cal_mat_window(sc, sqlContext, dfSC, window):
    # Trailing moving average of the close price over the previous
    # `window` days up to (and including) the current row's date
    windowSpec = Window.partitionBy("symbol").orderBy("date") \
        .rangeBetween(-(window - 1), 0)
    mat = func.avg("close").over(windowSpec).alias("mat")
    dfSC = dfSC.select(dfSC.symbol, dfSC.date, dfSC.close, mat)
    print(dfSC.collect())
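# Note (illustrative, not from the original source): rangeBetween defines the
# frame in terms of the orderBy *values*, so it only behaves as "last N days"
# when the date column is numeric (e.g. days since an epoch); rowsBetween
# counts physical rows regardless of gaps. A row-based variant of the helper
# above, as a sketch under the same `func`/`Window` imports:
def cal_mat_window_rows(dfSC, window):
    windowSpec = Window.partitionBy("symbol").orderBy("date") \
        .rowsBetween(-(window - 1), 0)
    return dfSC.select(dfSC.symbol, dfSC.date, dfSC.close,
                       func.avg("close").over(windowSpec).alias("mat"))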
def churners(self, spark, mysqldetails, InsightName, memberDF, tagsDict):
    getset = GetSet()
    pw = getset.get_predictorwindow()
    mydate = getset.get_churnmaxdate()
    #churn_data_semifinal1=pd.read_csv('final_data.csv',low_memory=False)
    query = "(SELECT txnmappedmobile as Mobile,modifiedbillno,uniqueitemcode,itemqty FROM sku_report_loyalty where ModifiedTxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(pw) + " DAY)) as df"
    jdbcDFsku = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                         mysqldetails[1], query,
                                         mysqldetails[2], mysqldetails[3])
    jdbcDFsku = jdbcDFsku.groupBy('Mobile', 'modifiedbillno').agg(
        countDistinct('uniqueitemcode').alias('uniqueitemcode'),
        sum('itemqty').alias('itemqty'))
    jdbcDFsku = jdbcDFsku.groupBy('Mobile').agg(
        sum('uniqueitemcode').alias('uniqueitems'),
        sum('itemqty').alias('qty'))
    query = "(SELECT Mobile, tier, gender,EnrolledStoreCode FROM member_report) as df"
    jdbcDFmem = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                         mysqldetails[1], query,
                                         mysqldetails[2], mysqldetails[3])
    query = "(select StoreCode,state,city from store_master) as df"
    jdbcDFstr = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                         mysqldetails[1], query,
                                         mysqldetails[2], mysqldetails[3])
    jdbcDFmem = jdbcDFmem.alias('a').join(
        jdbcDFstr.alias('b'),
        col('a.EnrolledStoreCode') == col('b.StoreCode'), 'left')
    query = "(SELECT TxnMappedMobile as Mobile,modifiedbillno,departmentcode FROM (SELECT a.uniqueitemcode,a.departmentcode,b.modifiedbillno,b.TxnMappedMobile FROM item_master a inner join sku_report_loyalty b on a.uniqueitemcode=b.uniqueitemcode where b.ModifiedTxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(pw) + " DAY)) t ) as df"
    jdbcDFdep = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                         mysqldetails[1], query,
                                         mysqldetails[2], mysqldetails[3])
    jdbcDFdep = jdbcDFdep.groupBy('Mobile', 'modifiedbillno').agg(
        countDistinct('departmentcode').alias('departmentcode'))
    jdbcDFdep = jdbcDFdep.groupBy('Mobile').agg(
        sum('departmentcode').alias('ndept'))
    #query = "(select Mobile,ModifiedBillNo,TxnDate,amount,StoreCode,PointsCollected,LapsedPoints,AvailablePoints from txn_report_accrual_redemption where TxnDate<DATE_SUB('2019-04-01',INTERVAL "+cw+" DAY) and TxnDate>DATE_SUB('2019-04-01',INTERVAL "+lw+" DAY))) as df"
    query = "(select Mobile,ModifiedBillNo,TxnDate,amount,StoreCode,PointsCollected,LapsedPoints,AvailablePoints from txn_report_accrual_redemption where TxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(pw) + " DAY)) as df"
    jdbcDF3 = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                       mysqldetails[1], query,
                                       mysqldetails[2], mysqldetails[3])
    jdbcDF3.show(5)
    #jdbcDF3c = jdbcDF3.filter(col('TxnDate')>date_sub(lit(datetime(2019,4,1).date()),lit(cw)))
    #mlist = jdbcDF3c.rdd.map(lambda r: r.Mobile).collect()
    #jdbcDF3 = jdbcDF3.withColumn('labels',when(jdbcDF3.Mobile.isin(mlist),0).otherwise(1))
    # Days between consecutive transactions per customer (current minus
    # previous transaction date, so gaps are positive)
    window = Window.partitionBy('Mobile').orderBy("TxnDate")
    jdbcDF3 = jdbcDF3.withColumn(
        "diff", datediff('TxnDate', lag('TxnDate', 1).over(window)))
    #jdbcDF3=jdbcDF3.groupby('Mobile').agg(sum('amount').alias('total_amount'),countDistinct('ModifiedBillNo').alias('total_txn'),countDistinct('TxnDate').alias('total_visits'),\
    #max('TxnDate').alias('MaxDate'),min('TxnDate').alias('MinDate'),countDistinct('StoreCode').alias('nstores'),max('diff').alias('max_latency'),max('labels').alias('labels'))
    jdbcDF3 = jdbcDF3.groupby('Mobile').agg(
        sum('amount').alias('total_amount'),
        countDistinct('ModifiedBillNo').alias('total_txn'),
        countDistinct('TxnDate').alias('total_visits'),
        sum('LapsedPoints').alias('lapsedpoints'),
        sum('AvailablePoints').alias('availablepoints'),
        max('TxnDate').alias('MaxDate'),
        min('TxnDate').alias('MinDate'),
        countDistinct('StoreCode').alias('nstores'),
        max('diff').alias('max_latency'),
        sum('PointsCollected').alias('pointscollected'))
    # withColumn needs a Column, not a Python datetime, so build the
    # reference date with lit(...).cast('date')
    jdbcDF3 = jdbcDF3.withColumn('current_date', lit(mydate).cast('date'))
    jdbcDF3 = jdbcDF3.withColumn(
        "recency", datediff(col("MaxDate"), col("current_date")))
    jdbcDF3 = jdbcDF3.withColumn(
        "Tenure", datediff(col("MinDate"), col("current_date")))
    jdbcDF3 = jdbcDF3.withColumn(
        "latency",
        datediff(col("MinDate"), col("MaxDate")) / (col('total_visits') - 1))
    jdbcDF3 = jdbcDF3.join(jdbcDFsku, 'Mobile', 'left')
    jdbcDF3 = jdbcDF3.join(jdbcDFdep, 'Mobile', 'left')
    jdbcDF3 = jdbcDF3.join(jdbcDFmem, 'Mobile', 'left')
    jdbcDF3 = jdbcDF3.toDF(*[c.lower() for c in jdbcDF3.columns])
    jdbcDF3 = jdbcDF3.select('mobile', 'gender', 'tier', 'state', 'city',
                             'total_txn', 'total_visits', 'total_amount',
                             'pointscollected', 'lapsedpoints',
                             'availablepoints', 'max_latency', 'recency',
                             'tenure', 'latency')
    jdbcDF3.write.option("header", "true").mode("overwrite").csv(
        './' + mysqldetails[1] + '/churn_data_semifinal1.csv')
    '''
    myvars = jdbcDF3.columns
    modelvars = ['mobile', 'gender', 'tier', 'state', 'city', 'total_txn',
                 'total_visits', 'total_amount', 'pointscollected',
                 'lapsedpoints', 'availablepoints', 'max_latency', 'recency',
                 'tenure', 'latency', 'labels']
    columns = list(set(myvars).intersection(set(modelvars)))
    '''
    columns = jdbcDF3.columns
    data_predict = probchurn.data_process(self, spark, columns)
    #probchurn.classify(self,mysqldetails,'logistic_regression',churn_data_semifinal1)
    # Restrict scoring to the members present in memberDF
    mlist = memberDF.rdd.map(lambda r: r.mobile).collect()
    data_predict = data_predict.filter(col('mobile').isin(mlist))
    method = 'lightgbm'
    memberDF = probchurn.predict(self, mysqldetails, method, data_predict)
    return memberDF
def modelfit(self, spark, mysqldetails):
    getset = GetSet()
    pw = getset.get_predictorwindow()
    cw = getset.get_churnwindow()
    lw = int(pw) + int(cw)
    mydate = getset.get_churnmaxdate()
    query = "(SELECT TxnMappedMobile as Mobile,modifiedbillno,uniqueitemcode,itemqty FROM sku_report_loyalty where TxnDate<DATE_SUB('" + mydate + "',INTERVAL " + str(cw) + " DAY) and TxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(lw) + " DAY)) as df"
    jdbcDFsku = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                         mysqldetails[1], query,
                                         mysqldetails[2], mysqldetails[3])
    jdbcDFsku = jdbcDFsku.groupBy('Mobile', 'modifiedbillno').agg(
        countDistinct('uniqueitemcode').alias('uniqueitemcode'),
        sum('itemqty').alias('itemqty'))
    jdbcDFsku = jdbcDFsku.groupBy('Mobile').agg(
        sum('uniqueitemcode').alias('uniqueitems'),
        sum('itemqty').alias('qty'))
    query = "(SELECT Mobile, tier, gender,EnrolledStoreCode FROM member_report) as df"
    jdbcDFmem = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                         mysqldetails[1], query,
                                         mysqldetails[2], mysqldetails[3])
    query = "(select StoreCode,state,city from store_master) as df"
    jdbcDFstr = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                         mysqldetails[1], query,
                                         mysqldetails[2], mysqldetails[3])
    jdbcDFmem = jdbcDFmem.alias('a').join(
        jdbcDFstr.alias('b'),
        col('a.EnrolledStoreCode') == col('b.StoreCode'), 'left')
    query = "(SELECT TxnMappedMobile as Mobile,modifiedbillno,departmentcode FROM (SELECT a.uniqueitemcode,a.departmentcode,b.modifiedbillno,b.TxnMappedMobile FROM item_master a inner join sku_report_loyalty b on a.uniqueitemcode=b.uniqueitemcode where b.TxnDate<DATE_SUB('" + mydate + "',INTERVAL " + str(cw) + " DAY) and b.TxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(lw) + " DAY)) t ) as df"
    jdbcDFdep = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                         mysqldetails[1], query,
                                         mysqldetails[2], mysqldetails[3])
    jdbcDFdep = jdbcDFdep.groupBy('Mobile', 'modifiedbillno').agg(
        countDistinct('departmentcode').alias('departmentcode'))
    jdbcDFdep = jdbcDFdep.groupBy('Mobile').agg(
        sum('departmentcode').alias('ndept'))
    #query = "(select Mobile,ModifiedBillNo,TxnDate,amount,StoreCode,PointsCollected,LapsedPoints,AvailablePoints from txn_report_accrual_redemption where TxnDate<DATE_SUB('2019-04-01',INTERVAL "+cw+" DAY) and TxnDate>DATE_SUB('2019-04-01',INTERVAL "+lw+" DAY))) as df"
    query = "(select Mobile,ModifiedBillNo,TxnDate,amount,StoreCode,PointsCollected,LapsedPoints,AvailablePoints from txn_report_accrual_redemption where TxnDate>DATE_SUB('" + mydate + "',INTERVAL " + str(lw) + " DAY)) as df"
    jdbcDF3 = MySqlConnector.mysqlRead(self, spark, mysqldetails[0],
                                       mysqldetails[1], query,
                                       mysqldetails[2], mysqldetails[3])
    #jdbcDF3.show(5)
    # Customers with a transaction inside the churn window are labelled 0
    # (active); everyone else is labelled 1 (churned)
    jdbcDF3c = jdbcDF3.filter(
        col('TxnDate') > datetime.strptime(mydate, "%Y-%m-%d") -
        timedelta(days=int(cw)))
    mlist = jdbcDF3c.rdd.map(lambda r: r.Mobile).collect()
    print("now", datetime.strptime(mydate, "%Y-%m-%d"))
    print("churn window start",
          datetime.strptime(mydate, "%Y-%m-%d") - timedelta(days=int(cw)))
    jdbcDF3 = jdbcDF3.filter(
        col('TxnDate') < datetime.strptime(mydate, "%Y-%m-%d") -
        timedelta(days=int(cw)))
    jdbcDF3 = jdbcDF3.withColumn(
        'labels', when(jdbcDF3.Mobile.isin(mlist), 0).otherwise(1))
    # Days between consecutive transactions per customer (current minus
    # previous transaction date, so gaps are positive)
    window = Window.partitionBy('Mobile').orderBy("TxnDate")
    jdbcDF3 = jdbcDF3.withColumn(
        "diff", datediff('TxnDate', lag('TxnDate', 1).over(window)))
    jdbcDF3.show(5)
    jdbcDF3 = jdbcDF3.groupby('Mobile').agg(
        sum('amount').alias('total_amount'),
        countDistinct('ModifiedBillNo').alias('total_txn'),
        countDistinct('TxnDate').alias('total_visits'),
        sum('LapsedPoints').alias('lapsedpoints'),
        sum('AvailablePoints').alias('availablepoints'),
        max('TxnDate').alias('MaxDate'),
        min('TxnDate').alias('MinDate'),
        countDistinct('StoreCode').alias('nstores'),
        max('diff').alias('max_latency'),
        sum('PointsCollected').alias('pointscollected'),
        max('labels').alias('labels'))
    #jdbcDF3=jdbcDF3.groupby('Mobile').agg(sum('amount').alias('total_amount'),countDistinct('ModifiedBillNo').alias('total_txn'),countDistinct('TxnDate').alias('total_visits'),\
    #max('TxnDate').alias('MaxDate'),min('TxnDate').alias('MinDate'),countDistinct('StoreCode').alias('nstores'),max('diff').alias('max_latency'))
    jdbcDF3 = jdbcDF3.withColumn('current_date', date_sub('MaxDate', int(cw)))
    jdbcDF3 = jdbcDF3.withColumn(
        "recency", datediff(col("MaxDate"), col("current_date")))
    jdbcDF3 = jdbcDF3.withColumn(
        "Tenure", datediff(col("MinDate"), col("current_date")))
    jdbcDF3 = jdbcDF3.withColumn(
        "latency",
        datediff(col("MinDate"), col("MaxDate")) / (col('total_visits') - 1))
    jdbcDF3 = jdbcDF3.join(jdbcDFsku, 'Mobile', 'left')
    jdbcDF3 = jdbcDF3.join(jdbcDFdep, 'Mobile', 'left')
    jdbcDF3 = jdbcDF3.join(jdbcDFmem, 'Mobile', 'left')
    jdbcDF3 = jdbcDF3.toDF(*[c.lower() for c in jdbcDF3.columns])
    #row = jdbcDF3.limit(1).collect()[0].asDict()
    #myvars = [i for i,j in row.items()]
    '''
    myvars = jdbcDF3.columns
    modelvars = ['mobile', 'gender', 'tier', 'state', 'city', 'total_txn',
                 'total_visits', 'total_amount', 'pointscollected',
                 'lapsedpoints', 'availablepoints', 'max_latency', 'recency',
                 'tenure', 'latency', 'labels']
    columns = list(set(myvars).intersection(set(modelvars)))
    '''
    columns = jdbcDF3.columns
    jdbcDF3 = jdbcDF3.select(*columns)
    os.system('mkdir -p ' + mysqldetails[1])
    jdbcDF3.write.option("header", "true").mode("overwrite").csv(
        './' + mysqldetails[1] + '/churn_data_semifinal1.csv')
    #jdbcDF3.write.format('com.databricks.spark.csv').save('churn_data_semifinal1.csv',header= 'true')
    data = probchurn.data_process(self, spark, columns)
    probchurn.classify(self, mysqldetails, 'lightgbm', data)
    print("churn modelling done!")
############################################################
### Temporal split
############################################################

## create train and test dictionaries
train = {}
test = {}

# import pyspark.sql functions
from pyspark.sql.functions import lit, col, row_number
from pyspark.sql.window import Window

# order by date so the split is chronological (train = earlier half,
# test = later half)
w = Window.orderBy('clean_date')

## 50/50 split
for i in data_array:
    # halfway point of this dataset's row count
    hlfwy_mrk = datasets[i].count() / 2
    # add an index column over the date-sorted data
    datasets[i] = datasets[i].withColumn('index', row_number().over(w))
    train[i] = datasets[i].filter(datasets[i]['index'] <= lit(hlfwy_mrk))
    test[i] = datasets[i].filter(datasets[i]['index'] > lit(hlfwy_mrk))
    numer = test[i].count()
    denom = numer + train[i].count()
    frac = numer / denom
    print(f'{i}: {frac}, from {numer} test and {denom} total')
import pandas as pd
from pyspark.sql.functions import col, lag, log
from pyspark.sql.window import Window

# Subtract 870 (the session's opening minute of the day) so the market open
# maps to minute 0, giving a minute-of-session indicator in [0, 390]
trade_df = trade_df.withColumn(
    'MinuteIndicator',
    trade_df['Hour'] * 60 + trade_df['Minute'] - 870)

# Keep data during trading hours with positive volume, and drop rows with a
# missing trade price
trade_df = trade_df \
    .filter((trade_df['MinuteIndicator'] >= 0)
            & (trade_df['MinuteIndicator'] <= 390)
            & (trade_df['Volume'] > 0)) \
    .dropna(subset=('Price'))

# Delete rows whose ticker contains '![/': these are minute-by-minute
# summaries rather than real trades
trade_df = trade_df.filter(~trade_df['Ticker'].contains('![/'))

# Keep only the last trade (max SeqNo) per stock per minute
trade_4_computing_ret = trade_df.groupBy(['Ticker', 'MinuteIndicator']).max('SeqNo') \
    .withColumnRenamed('max(SeqNo)', 'SeqNo') \
    .join(trade_df, ['Ticker', 'SeqNo', 'MinuteIndicator']) \
    .orderBy(['Ticker', 'MinuteIndicator']) \
    .select('Ticker', 'MinuteIndicator', 'Price')

# Minute-over-minute log return per ticker
windowSpec = Window.partitionBy('Ticker').orderBy('MinuteIndicator')
trade_4_computing_ret = trade_4_computing_ret.withColumn(
    'Lag Price', lag(trade_4_computing_ret['Price']).over(windowSpec))
trade_4_computing_ret = trade_4_computing_ret \
    .withColumn('Ret', log(trade_4_computing_ret['Price'] / trade_4_computing_ret['Lag Price'])) \
    .dropna(subset=('Ret')) \
    .orderBy('Ticker', 'MinuteIndicator')

final_df = pd.DataFrame(
    trade_4_computing_ret.collect(),
    columns=['Ticker', 'MinuteIndicator', 'Price', 'Lag Price', 'Ret'])

# Build a full (Ticker, MinuteIndicator) grid for minutes 1..390 and
# left-join the returns onto it, filling minutes without trades with 0
new = pd.DataFrame(final_df.Ticker.unique(), [range(1, 391)] * len(final_df.Ticker.unique())) \
    .reset_index() \
    .explode('index').rename(columns={'index': 'MinuteIndicator', 0: 'Ticker'})
df_for_plot = new.merge(final_df[['Ticker', 'MinuteIndicator', 'Ret']],
                        how='left', on=['Ticker', 'MinuteIndicator']) \
    .fillna(0)
from pyspark.sql.functions import cume_dist, row_number
from pyspark.sql.window import Window

# dataset_id, cursor, conn and DEBUG_SQL are assumed to be module-level
# globals defined elsewhere in this file.


def collect_numeric_metric(metric, df, population):
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    # integer sampling step; guard against a step of zero for small datasets
    num_partitions = max(total_count // 500, 1)
    ws = Window.orderBy('bucket')
    cdf = cdf.select(
        cdf['bucket'],
        cume_dist().over(ws).alias('c'),
        row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets, keeping the last row of each run.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            prev = d
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for i, d in enumerate(collapsed_data):
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0, 'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0, 'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0, 'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0, 'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print(sql, params)
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print(sql, params)
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()