Code example #1
def get_perf_sql(query_dt, days_back=28):
    # startup_time_ms = -1 means EBVS (exit before video start), so treat it as null
    # average_bitrate_kbps = 0 means the bitrate could not be measured, so treat it as null
    valid_startup_time = "case when startup_time_ms > 0 then startup_time_ms else null end"
    valid_bitrate = "case when average_bitrate_kbps > 0 then average_bitrate_kbps else null end"

    # compute user aggregates
    return """
    select
        viewerid user_id,
        avg({startup_time}) avg_startup_time,
        approx_percentile({startup_time}, 0.5) median_startup_time,
        sum({startup_time}) total_startup_time,
        sum(startup_error) total_startup_errors,
        avg(interrupts) avg_interrupts,
        approx_percentile(interrupts, 0.5) median_interrupts,
        sum(interrupts) total_interrupts,
        avg(buffering_time_ms) avg_buffering_time,
        approx_percentile(buffering_time_ms, 0.5) median_buffering_time,
        sum(buffering_time_ms) total_buffering_time,
        avg({bitrate}) avg_bitrate,
        approx_percentile({bitrate}, 0.5) median_bitrate
    from
        conviva_streams
    where
        dt between '{dt_start}' and '{query_dt}'
        and from_unixtime(start_time_unix_time, 'yyyy-MM-dd') >= '{dt_start}'
        and from_unixtime(start_time_unix_time, 'yyyy-MM-dd') < '{query_dt}'
    group by
        viewerid
    """.format(startup_time=valid_startup_time,
               bitrate=valid_bitrate,
               dt_start=utils.dt_start(query_dt, days_back),
               query_dt=query_dt)
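
For context, here is a hedged sketch of how get_perf_sql might be wired into a job, mirroring the run() pattern in the other examples below. Only the conviva_streams table name and the args/utils/query usage are taken from the examples; the assumption that get_perf_sql lives in the query module and the overall wiring are hypothetical.

def run(spark, args):
    query_dt = args['dt']
    # Hypothetical wiring: register the table the query expects, then run it.
    streams = spark.read.load(args['input_path'])
    streams.createOrReplaceTempView('conviva_streams')
    user_perf = spark.sql(query.get_perf_sql(query_dt, days_back=28))
    user_perf.write.parquet(utils.dt_path(args['output_path'], query_dt),
                            mode='overwrite')
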
Code example #2
File: spark_job.py Project: Cipahi/churn
def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    sessions = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(query_dt=query_dt,
                                                          dt_start=dt_start))
    sessions.createOrReplaceTempView('sessions')

    user_activity = spark.sql(query.activity_sql)
    user_activity.write.parquet(utils.dt_path(args['output_path'], query_dt),
                                mode='overwrite')
Code example #3
def run(spark, args):

    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=1)

    # Filter on dt up front so Spark reads only the needed dates before the expensive parse
    segment = spark.read.load(args['input_path']).where(
        "dt in ('{dt_start}', '{query_dt}')".format(
            dt_start=dt_start, query_dt=query_dt))

    events = spark.createDataFrame(segment.rdd.map(parse), samplingRatio=.4)
    events.createOrReplaceTempView('events')
    sessions = spark.sql(query.get_session_sql(query_dt))
    # Other jobs will use daily session roll-up
    sessions.write.parquet(utils.dt_path(args['output_path'], query_dt), mode='overwrite')
Code example #4
from pyspark.sql.functions import sum as spark_sum  # column sum for the pivot; aliased to avoid shadowing the builtin

def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    # This job depends on 28 days of successful session job runs
    sessions = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(
            query_dt=query_dt, dt_start=dt_start))

    sessions.createOrReplaceTempView('sessions')

    user_device_timespent = spark.sql(query.device_sql)
    user_device_timespent_pivoted = user_device_timespent.groupby('user_id') \
        .pivot('device_code').agg(spark_sum('device_minutes'))
    user_device_timespent_pivoted.cache()

    user_device_timespent_pivoted.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')
Code example #5
File: test_utils.py Project: Cipahi/churn
def test_dt_start():
    test_dt = '2019-01-03'
    assert utils.dt_start(test_dt, 7) == '2018-12-27'
Code example #6
File: test_utils.py Project: Cipahi/churn
def test_dt_start_default():
    test_dt = '2019-01-03'
    assert utils.dt_start(test_dt) == '2018-12-06'
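
The two tests above pin down dt_start's contract: an explicit days_back of 7 gives '2018-12-27', and the default window is 28 days ('2018-12-06'). A minimal sketch of an implementation consistent with those tests (hypothetical, not copied from the project):

from datetime import datetime, timedelta

def dt_start(query_dt, days_back=28):
    # Return the 'YYYY-MM-DD' date days_back days before query_dt.
    dt = datetime.strptime(query_dt, '%Y-%m-%d')
    return (dt - timedelta(days=days_back)).strftime('%Y-%m-%d')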