def get_perf_sql(query_dt, days_back=28):
    # startup_time_ms = -1 means EBVS
    # average_bitrate_kbps = 0 means could not get bitrate
    valid_startup_time = "case when startup_time_ms > 0 then startup_time_ms else null end"
    valid_bitrate = "case when average_bitrate_kbps > 0 then average_bitrate_kbps else null end"
    # compute user aggregates
    return """
        select
            viewerid user_id,
            avg({startup_time}) avg_startup_time,
            approx_percentile({startup_time}, 0.5) median_startup_time,
            sum({startup_time}) total_startup_time,
            sum(startup_error) total_startup_errors,
            avg(interrupts) avg_interrupts,
            approx_percentile(interrupts, 0.5) median_interrupts,
            sum(interrupts) total_interrupts,
            avg(buffering_time_ms) avg_buffering_time,
            approx_percentile(buffering_time_ms, 0.5) median_buffering_time,
            sum(buffering_time_ms) total_buffering_time,
            avg({bitrate}) avg_bitrate,
            approx_percentile({bitrate}, 0.5) median_bitrate
        from conviva_streams
        where dt between '{dt_start}' and '{query_dt}'
            and from_unixtime(start_time_unix_time, 'yyyy-MM-dd') >= '{dt_start}'
            and from_unixtime(start_time_unix_time, 'yyyy-MM-dd') < '{query_dt}'
        group by viewerid
    """.format(startup_time=valid_startup_time,
               bitrate=valid_bitrate,
               dt_start=utils.dt_start(query_dt, days_back),
               query_dt=query_dt)
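# A hypothetical run() wrapper for this perf query, sketched to mirror the pattern of
# the other jobs below; the input path, the 'conviva_streams' temp view name, and the
# 28-day window are assumptions, not part of the original module.
def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    # Prune dt partitions before registering the view the SQL expects
    streams = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(
            dt_start=dt_start, query_dt=query_dt))
    streams.createOrReplaceTempView('conviva_streams')
    user_perf = spark.sql(get_perf_sql(query_dt))
    user_perf.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')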
def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    sessions = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(
            query_dt=query_dt, dt_start=dt_start))
    sessions.createOrReplaceTempView('sessions')
    user_activity = spark.sql(query.activity_sql)
    user_activity.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')
def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=1)
    # Filter on dt up front so only the two relevant partitions are read
    segment = spark.read.load(args['input_path']).where(
        "dt in ('{dt_start}', '{query_dt}')".format(
            dt_start=dt_start, query_dt=query_dt))
    # parse (defined elsewhere) turns raw segment records into rows;
    # the DataFrame schema is inferred from a 40% sample of them
    events = spark.createDataFrame(segment.rdd.map(parse), samplingRatio=.4)
    events.createOrReplaceTempView('events')
    sessions = spark.sql(query.get_session_sql(query_dt))
    # Other jobs will use this daily session roll-up
    sessions.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')
def run(spark, args):
    query_dt = args['dt']
    dt_start = utils.dt_start(query_dt, days_back=28)
    # Job depends on 28 days of successful session job runs
    sessions = spark.read.load(args['input_path']).where(
        "dt between '{dt_start}' and '{query_dt}'".format(
            query_dt=query_dt, dt_start=dt_start))
    sessions.createOrReplaceTempView('sessions')
    user_device_timespent = spark.sql(query.device_sql)
    # Pivot device_code into one column per device; the aggregate must be the
    # Spark SQL sum, not Python's builtin, so this assumes
    # `from pyspark.sql import functions as F` at module level
    user_device_timespent_pivoted = user_device_timespent.groupby('user_id') \
        .pivot('device_code').agg(F.sum('device_minutes'))
    user_device_timespent_pivoted.cache()
    user_device_timespent_pivoted.write.parquet(
        utils.dt_path(args['output_path'], query_dt), mode='overwrite')
def test_dt_start():
    test_dt = '2019-01-03'
    assert utils.dt_start(test_dt, 7) == '2018-12-27'
def test_dt_start_default():
    test_dt = '2019-01-03'
    # default days_back is 28: 2019-01-03 minus 28 days is 2018-12-06
    assert utils.dt_start(test_dt) == '2018-12-06'
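# The utils helpers used throughout are not shown here. Below is a minimal sketch that
# is consistent with the tests above and the call sites, assuming dt strings are
# 'yyyy-MM-dd' and that dt_path appends a dt=<date> partition directory to the output
# path; both the date format and the path layout are assumptions.
import os
from datetime import datetime, timedelta

DT_FORMAT = '%Y-%m-%d'

def dt_start(query_dt, days_back=28):
    # Walk back days_back days from query_dt; matches test_dt_start_default above
    start = datetime.strptime(query_dt, DT_FORMAT) - timedelta(days=days_back)
    return start.strftime(DT_FORMAT)

def dt_path(base_path, query_dt):
    # Hypothetical layout: one dt=<date> directory per daily run
    return os.path.join(base_path, 'dt={}'.format(query_dt))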