Example #1
def run(spark, args):
    """Write features w/ churn status in first column"""
    users = join.spark_job.run(spark, args)
    features = list(users.columns)
    # churn must be the first column
    features.remove('churned')
    features = ['churned'] + features
    users.select(features).write.csv(dt_path(args['output_path'], args['dt']),
                                     compression='gzip',
                                     mode='overwrite')
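
A hypothetical invocation of this job could look like the following; the bucket path and date are made up, and the upstream join.spark_job.run may expect additional input_* keys not shown here.

from pyspark.sql import SparkSession

# Hypothetical wiring only; paths and dt are illustrative.
spark = SparkSession.builder.appName('churn_training_set').getOrCreate()
run(spark, {'dt': '2019-01-09', 'output_path': 's3://bucket/churn/training'})
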
Example #2
def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    sessions = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(query_dt=query_dt,
                                                          dt_start=dt_start))
    sessions.createOrReplaceTempView('sessions')

    user_activity = spark.sql(query.activity_sql)
    user_activity.write.parquet(utils.dt_path(args['output_path'], query_dt),
                                mode='overwrite')
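
utils.dt_start is not shown on this page; a minimal sketch consistent with how it is called here (a 'YYYY-MM-DD' string plus a days_back keyword, returning another 'YYYY-MM-DD' string) would be:

from datetime import datetime, timedelta

def dt_start(query_dt, days_back):
    # Sketch only: the date days_back days before query_dt, as 'YYYY-MM-DD'.
    start = datetime.strptime(query_dt, '%Y-%m-%d') - timedelta(days=days_back)
    return start.strftime('%Y-%m-%d')
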
Example #3
def run(spark, args):
    query_dt = args['dt']

    data = spark.read.option('header', 'true').csv(args['input_path'])
    data.createOrReplaceTempView('conviva_strings')
    data_recast = spark.sql(query.user_cast_sql)
    data_recast.createOrReplaceTempView('conviva_streams')

    user_perf_features = spark.sql(query.get_perf_sql(query_dt, days_back=28))
    user_perf_features.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')
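
query.user_cast_sql is not reproduced here; judging by the view names, it recasts the string-typed CSV columns before the performance query runs. An equivalent DataFrame-API sketch (the column names are made up) would be:

# Sketch only: cast string CSV columns to numeric types; names are hypothetical.
data_recast = data \
    .withColumn('buffering_ratio', data['buffering_ratio'].cast('double')) \
    .withColumn('bitrate_kbps', data['bitrate_kbps'].cast('int'))
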
Example #4
def run(spark, args):
    query_dt = args['dt']

    next_month_date = get_next_month(query_dt)
    all_subs = spark.read.load(args['input_path'])
    all_subs.createOrReplaceTempView("sub_table")

    sub_sql = get_sub_sql(query_dt=query_dt,
                          month_later_dt=next_month_date.strftime('%Y-%m-%d'))
    subs = spark.sql(sub_sql)
    subs.write.parquet(utils.dt_path(args['output_path'], query_dt),
                       mode='overwrite')
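
get_next_month is defined elsewhere in the repo; based on how it is used here (it takes the 'YYYY-MM-DD' dt string and its result is passed to strftime), a sketch could be:

import calendar
from datetime import date, datetime

def get_next_month(dt_str):
    # Sketch only: same day one month later, clamped to that month's last day.
    d = datetime.strptime(dt_str, '%Y-%m-%d').date()
    year = d.year + d.month // 12
    month = d.month % 12 + 1
    day = min(d.day, calendar.monthrange(year, month)[1])
    return date(year, month, day)
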
Example #5
from pyspark.sql.functions import monotonically_increasing_id


def run(spark, args):

    dt = args['dt']
    users = spark.read.load(dt_path(args['input_subscription'], dt))
    predictions = spark.read.json(dt_path(args['input_prediction'], dt))

    # DataFrames have no positional concat, so join on a generated row number;
    # the ids only line up if both frames keep the same partitioning and order
    users = users.withColumn('row_num', monotonically_increasing_id())
    predictions = predictions.withColumn('row_num',
                                         monotonically_increasing_id())

    user_prediction = users.select(['user_id',
                                    'row_num']).join(predictions, 'row_num')

    user_prediction = user_prediction.withColumnRenamed(
        'predicted_label', 'predicted_churn')
    user_prediction = user_prediction.withColumnRenamed(
        'score', 'churn_risk_score')

    user_prediction.write.parquet(dt_path(args['output_path'], dt),
                                  mode='overwrite')
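
A more deterministic alternative to generated ids (not what this repo does) is to assign consecutive indices with zipWithIndex before joining:

from pyspark.sql import Row

def with_row_num(df):
    # Alternative sketch: zipWithIndex gives consecutive indices 0, 1, 2, ...
    return df.rdd.zipWithIndex().map(
        lambda pair: Row(**pair[0].asDict(), row_num=pair[1])).toDF()

Applying with_row_num to both users and predictions would then support the same row_num join.
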
Example #6
from pyspark.sql import Row


def setup_predictions(spark, args):
    records = [{
        'predicted_label': 1.0,
        'score': .55
    }, {
        'predicted_label': 0.0,
        'score': .05
    }]
    rows = [Row(**record) for record in records]
    data = spark.createDataFrame(rows)
    sample_input_path = dt_path(args['input_prediction'], args['dt'])
    data.write.json(sample_input_path, compression='gzip', mode='overwrite')
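
Because this fixture feeds the join job in Example #5, a quick round-trip check (the surrounding test wiring, spark and args, is assumed to exist) might be:

# Hypothetical sanity check that the fixture reads back as expected.
setup_predictions(spark, args)
predictions = spark.read.json(dt_path(args['input_prediction'], args['dt']))
assert predictions.count() == 2
assert set(predictions.columns) == {'predicted_label', 'score'}
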
Example #7
from pyspark.sql.functions import expr


def run(spark, args):

    dt = args['dt']
    user_subscription = spark.read.load(dt_path(args['input_subscription'],
                                                dt))
    user_performance = spark.read.load(dt_path(args['input_performance'], dt))
    user_activity = spark.read.load(dt_path(args['input_activity'], dt))

    users = user_subscription.join(user_performance, 'user_id',
                                   'left_outer').join(user_activity, 'user_id',
                                                      'left_outer')

    performance_features = user_performance.columns
    performance_features.remove('user_id')
    session_features = ['avg_session_length', 'median_session_length']

    for feature in performance_features + session_features:
        median_expr = 'percentile_approx({}, 0.5)'.format(feature)
        median_rows = users.agg(expr(median_expr)).collect()
        median = median_rows[0][median_expr]
        users = users.fillna({feature: median})

    for feature in ['total_timespent', 'num_sessions', 'active_days']:
        users = users.fillna({feature: 0})

    users.cache()

    features = list(users.columns)
    features.remove('user_id')
    features.remove('churned')

    # write data with no churn status
    users.select(features).write.csv(dt_path(args['output_path'], dt),
                                     compression='gzip',
                                     mode='overwrite')

    return users
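
The median loop above builds a percentile_approx expression and collects one row per feature; DataFrame.approxQuantile computes the same approximate median without the expression string. A sketch of the equivalent fill, continuing from the variables above:

# Sketch only: equivalent median imputation via DataFrame.approxQuantile.
for feature in performance_features + session_features:
    median = users.approxQuantile(feature, [0.5], 0.01)[0]
    users = users.fillna({feature: median})
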
Example #8
def run(spark, args):

    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=1)

    # filter on dt up front so the read can prune partitions
    segment = spark.read.load(args['input_path']).where(
        "dt in ('{dt_start}', '{query_dt}')".format(
            dt_start=dt_start, query_dt=query_dt))

    events = spark.createDataFrame(segment.rdd.map(parse), samplingRatio=.4)
    events.createOrReplaceTempView('events')
    sessions = spark.sql(query.get_session_sql(query_dt))
    # Other jobs will use daily session roll-up
    sessions.write.parquet(utils.dt_path(args['output_path'], query_dt),
                           mode='overwrite')
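
samplingRatio=.4 makes createDataFrame infer the schema from a 40% sample of the parsed rows; if the event fields are known up front, an explicit schema skips that inference pass. A sketch with made-up field names:

from pyspark.sql.types import StructType, StructField, StringType, LongType

# Sketch only: hypothetical event schema; the real fields depend on parse().
event_schema = StructType([
    StructField('user_id', StringType()),
    StructField('event_ts', LongType()),
    StructField('event_name', StringType()),
])
events = spark.createDataFrame(segment.rdd.map(parse), schema=event_schema)
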
Example #9
from pyspark.sql.functions import sum as spark_sum


def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    # Job dependent on 28 days of successful session job runs
    sessions = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(
            query_dt=query_dt, dt_start=dt_start))

    sessions.createOrReplaceTempView('sessions')

    user_device_timespent = spark.sql(query.device_sql)
    # aggregate with Spark's sum (imported as spark_sum), not the Python builtin
    user_device_timespent_pivoted = user_device_timespent.groupby('user_id') \
        .pivot('device_code').agg(spark_sum('device_minutes'))
    user_device_timespent_pivoted.cache()

    user_device_timespent_pivoted.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')
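
pivot first runs a job to collect the distinct device codes; if they are known in advance, passing them explicitly avoids that extra scan. A sketch with hypothetical codes, continuing the example above:

# Sketch only: the device code list is made up.
user_device_timespent_pivoted = user_device_timespent.groupby('user_id') \
    .pivot('device_code', ['mobile', 'web', 'tv']) \
    .agg(spark_sum('device_minutes'))
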
Example #10
def test_dt_path():
    assert utils.dt_path('s3://mypath', '2019-01-09') == 's3://mypath/dt=2019-01-09'
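
The test pins down dt_path's behavior, so an implementation consistent with it would simply be:

def dt_path(path, dt):
    # Matches the assertion above: 's3://mypath', '2019-01-09' -> 's3://mypath/dt=2019-01-09'
    return '{}/dt={}'.format(path, dt)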