def run(spark, args): """Write features w/ churn status in first column""" users = join.spark_job.run(spark, args) features = list(users.columns) # churn must be the first column features.remove('churned') features = ['churned'] + features users.select(features).write.csv(dt_path(args['output_path'], args['dt']), compression='gzip', mode='overwrite')
def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    # Activity features are computed over the trailing 28 days of sessions
    sessions = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(query_dt=query_dt,
                                                          dt_start=dt_start))
    sessions.createOrReplaceTempView('sessions')
    user_activity = spark.sql(query.activity_sql)
    user_activity.write.parquet(utils.dt_path(args['output_path'], query_dt),
                                mode='overwrite')
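utils.dt_start is used throughout but not shown. A minimal sketch, assuming it takes a 'YYYY-MM-DD' string and a days_back offset and returns the start of the lookback window in the same format, could look like this; the actual implementation may differ.

# Sketch of utils.dt_start, inferred from its call sites: the date
# `days_back` days before `dt`, formatted like the dt partition values.
from datetime import datetime, timedelta


def dt_start(dt, days_back):
    start = datetime.strptime(dt, '%Y-%m-%d') - timedelta(days=days_back)
    return start.strftime('%Y-%m-%d')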
def run(spark, args):
    query_dt = args['dt']
    # Conviva CSV columns arrive as strings; cast them before aggregating
    data = spark.read.option('header', 'true').csv(args['input_path'])
    data.createOrReplaceTempView('conviva_strings')
    data_recast = spark.sql(query.user_cast_sql)
    data_recast.createOrReplaceTempView('conviva_streams')
    # Per-user streaming-performance features over the trailing 28 days
    user_perf_features = spark.sql(query.get_perf_sql(query_dt, days_back=28))
    user_perf_features.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')
def run(spark, args):
    query_dt = args['dt']
    next_month_date = get_next_month(query_dt)
    all_subs = spark.read.load(args['input_path'])
    all_subs.createOrReplaceTempView("sub_table")
    # Compare subscribers as of query_dt against their status one month later
    sub_sql = get_sub_sql(query_dt=query_dt,
                          month_later_dt=next_month_date.strftime('%Y-%m-%d'))
    subs = spark.sql(sub_sql)
    subs.write.parquet(utils.dt_path(args['output_path'], query_dt),
                       mode='overwrite')
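get_next_month is also not shown; since its result is formatted with strftime, a plausible sketch returns a date object one calendar month after query_dt. The use of dateutil.relativedelta here is an assumption about the original implementation.

# Sketch of get_next_month, assumed behavior: the date one calendar month
# after query_dt, returned as a date object for the caller to format.
from datetime import datetime

from dateutil.relativedelta import relativedelta


def get_next_month(query_dt):
    return datetime.strptime(query_dt, '%Y-%m-%d').date() + relativedelta(months=1)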
from pyspark.sql.functions import monotonically_increasing_id


def run(spark, args):
    dt = args['dt']
    users = spark.read.load(dt_path(args['input_subscription'], dt))
    predictions = spark.read.json(dt_path(args['input_prediction'], dt))
    # creating a row number is the only way to horizontally concatenate;
    # the IDs only line up when both DataFrames share the same partitioning
    # and row order
    users = users.withColumn('row_num', monotonically_increasing_id())
    predictions = predictions.withColumn('row_num', monotonically_increasing_id())
    user_prediction = users.select(['user_id', 'row_num']).join(predictions, 'row_num')
    user_prediction = user_prediction.withColumnRenamed(
        'predicted_label', 'predicted_churn')
    user_prediction = user_prediction.withColumnRenamed(
        'score', 'churn_risk_score')
    user_prediction.write.parquet(dt_path(args['output_path'], dt),
                                  mode='overwrite')
from pyspark.sql import Row


def setup_predictions(spark, args):
    # Two sample prediction records, written where the join job expects
    # to read model output
    records = [{
        'predicted_label': 1.0,
        'score': .55
    }, {
        'predicted_label': 0.0,
        'score': .05
    }]
    rows = [Row(**record) for record in records]
    data = spark.createDataFrame(rows)
    sample_input_path = dt_path(args['input_prediction'], args['dt'])
    data.write.json(sample_input_path, compression='gzip', mode='overwrite')
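setup_predictions is the fixture half of a test; one way to exercise the prediction-join job end to end is sketched below. It assumes a pytest-style spark session fixture, pytest's tmp_path, and that Row, dt_path, setup_predictions, and run are importable in the test module; none of that wiring appears in the original.

# Hypothetical test sketch: writes both inputs, runs the join, and checks
# that the prediction columns were renamed as expected.
def test_prediction_join(spark, tmp_path):
    args = {
        'dt': '2019-01-09',
        'input_subscription': str(tmp_path / 'subs'),
        'input_prediction': str(tmp_path / 'preds'),
        'output_path': str(tmp_path / 'out'),
    }
    # Minimal subscription input: just the user_id column the join job selects
    subs = spark.createDataFrame([Row(user_id='a'), Row(user_id='b')])
    subs.write.parquet(dt_path(args['input_subscription'], args['dt']))
    setup_predictions(spark, args)

    run(spark, args)

    out = spark.read.parquet(dt_path(args['output_path'], args['dt']))
    assert {'user_id', 'predicted_churn', 'churn_risk_score'} <= set(out.columns)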
from pyspark.sql.functions import expr


def run(spark, args):
    dt = args['dt']
    user_subscription = spark.read.load(dt_path(args['input_subscription'], dt))
    user_performance = spark.read.load(dt_path(args['input_performance'], dt))
    user_activity = spark.read.load(dt_path(args['input_activity'], dt))
    users = user_subscription.join(user_performance, 'user_id', 'left_outer') \
        .join(user_activity, 'user_id', 'left_outer')

    # Impute missing performance and session features with the column median
    performance_features = user_performance.columns
    performance_features.remove('user_id')
    session_features = ['avg_session_length', 'median_session_length']
    for feature in performance_features + session_features:
        median_expr = 'percentile_approx({}, 0.5)'.format(feature)
        median_rows = users.agg(expr(median_expr)).collect()
        median = median_rows[0][median_expr]
        users = users.fillna({feature: median})

    # Missing activity counts mean no recorded activity, so fill with zero
    for feature in ['total_timespent', 'num_sessions', 'active_days']:
        users = users.fillna({feature: 0})

    users.cache()
    features = list(users.columns)
    features.remove('user_id')
    features.remove('churned')
    # write data with no churn status
    users.select(features).write.csv(dt_path(args['output_path'], dt),
                                     compression='gzip',
                                     mode='overwrite')
    return users
def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=1)
    # off-the-bat 'where' for optimization
    segment = spark.read.load(args['input_path']).where(
        "dt in ('{dt_start}', '{query_dt}')".format(
            dt_start=dt_start, query_dt=query_dt))
    events = spark.createDataFrame(segment.rdd.map(parse), samplingRatio=.4)
    events.createOrReplaceTempView('events')
    sessions = spark.sql(query.get_session_sql(query_dt))
    # Other jobs will use daily session roll-up
    sessions.write.parquet(utils.dt_path(args['output_path'], query_dt),
                           mode='overwrite')
# pyspark's sum (not the builtin) is needed for the pivot aggregation
from pyspark.sql.functions import sum


def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    # Job dependent on 28 days of successful session job runs
    sessions = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(
            query_dt=query_dt, dt_start=dt_start))
    sessions.createOrReplaceTempView('sessions')
    user_device_timespent = spark.sql(query.device_sql)
    # One row per user, one minutes column per device_code
    user_device_timespent_pivoted = user_device_timespent.groupby('user_id') \
        .pivot('device_code').agg(sum('device_minutes'))
    user_device_timespent_pivoted.cache()
    user_device_timespent_pivoted.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')
def test_dt_path():
    assert utils.dt_path('s3://mypath', '2019-01-09') == 's3://mypath/dt=2019-01-09'
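The test above pins down dt_path's behavior; a sketch consistent with it (the real implementation may differ) is:

# Sketch of utils.dt_path matching the test above: appends a Hive-style
# dt= partition directory to the base path.
def dt_path(base_path, dt):
    return '{}/dt={}'.format(base_path, dt)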