Example #1
0
def content_overview(dt, week_ucis, content_id, title):
    content_by_week = query_content_weekly(content_id)
    content_data = [(x['started-views'], int(x['viewing-time'] * x['weekly-active-user']))
        for x in content_by_week]
    total_views = sum([x[0] for x in content_data])
    # this is already measured in minutes
    total_viewtime = sum([x[1] for x in content_data])
    started_views_this_week = week_ucis.count()
    content_hour_of_day = normalize(view_count_by_hour_of_day(week_ucis), started_views_this_week)
    content_day_of_week = normalize(view_count_by_day_of_week(week_ucis), started_views_this_week)
    weekly_active_user = user_number(week_ucis)
    completion_ratio = avg_completion_ratio(week_ucis)
    user_viewtime = avg_user_viewtime(week_ucis)
    device_type = top_tag_by_view_count(week_ucis, 'device')
    res = {
        "id": content_id,
        'inventoryID': content_id,
        'title': title,
        "datetime": dt.replace(tzinfo=pytz.UTC),
        "content-total-viewing-time": total_viewtime,
        "content-total-started-views": total_views,
        "hour-of-day": content_hour_of_day,
        "day-of-week": content_day_of_week,
        "weekly-active-user": weekly_active_user,
        "content-completion": completion_ratio,
        "viewing-time": user_viewtime,
        "device-type": device_type,
    }
    print "#"*10
    print content_id
    print res
    r.db('gettv_insight_api').table('content_overview').insert([res], conflict='replace').run()
Example #2
0
def users_package_overview(ucis):
    """
    Group the whole user base by basic package and split the counts into
    users seen in `ucis` ("tag users") versus the remaining users.

    Returns a tuple ``(tag_user_packages_norm, user_packages_norm)``: each is
    the ``normalize``d list of (package name, count) pairs over the fixed
    package names T1/T2/T3.
    """
    package_names = ["T1", "T2", "T3"]
    users = spark_io.get_users()
    user_num = users.count()

    # Package distribution over the entire user base.
    user_packages = users\
        .groupBy('basicPackage')\
        .count()\
        .collect()
    user_packages = {x['basicPackage']: x['count'] for x in user_packages}

    # Distinct user ids appearing in the supplied interactions.
    tag_users = ucis\
        .select('userID')\
        .distinct()\
        .collect()
    tag_users = set([x['userID'] for x in tag_users])

    # UDF used to restrict the user table to tag users.
    is_in_tag_user = func.udf(lambda x: x in tag_users, BooleanType())

    tag_user_packages = users\
        .filter(is_in_tag_user(users.userID))\
        .groupBy('basicPackage')\
        .count()\
        .collect()
    tag_user_packages = {x['basicPackage']: x['count'] for x in tag_user_packages}

    # Use .get(..., 0): a package with no (tag) users is absent from the
    # groupBy result, and plain indexing would raise KeyError.
    tag_user_packages_norm = normalize(
        [(x, tag_user_packages.get(x, 0)) for x in package_names],
        len(tag_users))
    user_packages_norm = normalize(
        [(x, user_packages.get(x, 0) - tag_user_packages.get(x, 0)) for x in package_names],
        user_num - len(tag_users))
    return tag_user_packages_norm, user_packages_norm
def run_daily_kpis(timestamp, week_ucis, channel_id):
    # Weekly KPIs for one channel, written to telenortv_insight_api.channel_by_week.
    # `week_ucis` is presumably one week of interactions for this channel
    # (a Spark DataFrame) -- confirm against the caller.
    date_string = timestamp.strftime('%Y-%m-%d')
    started_views = view_count(week_ucis)
    weekly_active_user = user_number(week_ucis)
    user_viewtime = avg_user_viewtime(week_ucis)
    # NOTE(review): `views_by_action` is computed but never stored in `res`
    # below -- either dead code or a missing "views-by-action" entry; verify.
    views_by_action = normalize(action_type_view_count(week_ucis),
                                started_views, 'count')
    complete_views = avg_finished_program_by_user(week_ucis)
    completion_ratio = avg_completion_ratio(week_ucis)
    weekly_top_program = top_programs_by_view_count(week_ucis, 10)
    weekly_top_genre = normalize(top_genre_by_view_count(week_ucis, 10),
                                 started_views, 'count')
    res = {
        "id": date_string + '_' + channel_id,
        'channelID': channel_id,
        "datetime": timestamp.replace(tzinfo=pytz.UTC),
        "viewing-time": user_viewtime,
        "weekly-active-user": weekly_active_user,
        "started-views": started_views,
        # NOTE(review): "comlete-views" is misspelled but is the stored key;
        # downstream consumers may rely on it, so it is left unchanged.
        "comlete-views": complete_views,
        "content-completion": completion_ratio,
        "top-programs": weekly_top_program,
        "top-genres": weekly_top_genre
    }
    print "#" * 10
    print channel_id
    print res
    # NOTE(review): no conflict policy here, unlike the other writers that
    # pass conflict='replace' -- re-running the same day/channel will fail
    # with a duplicate primary key; confirm whether that is intended.
    r.db('telenortv_insight_api').table('channel_by_week').insert([res]).run()
    print date_string
    print "#" * 10
def run_weekly_kpis(timestamp, week_ucis, content_id, title):
    date_string = timestamp.strftime('%Y-%m-%d')
    started_views = view_count(week_ucis)
    print started_views, "started views"
    weekly_active_user = user_number(week_ucis)
    user_viewtime = avg_user_viewtime(week_ucis)
    views_by_action = normalize(action_type_view_count(week_ucis), started_views)
    complete_views = avg_finished_program_by_user(week_ucis)
    completion_ratio = avg_completion_ratio(week_ucis)
    res = {
        "id": date_string + '_' + content_id,
        "title": title,
        'inventoryID': content_id,
        "datetime": timestamp.replace(tzinfo=pytz.UTC),
        "viewing-time": user_viewtime,
        "weekly-active-user": weekly_active_user,
        "started-views": started_views,
        "comlete-views": complete_views,
        "content-completion": completion_ratio,
    }
    print "#"*10
    print content_id
    print res
    r.db('gettv_insight_api').table('content_by_week').insert([res], conflict='replace').run()
    print date_string
    print "#"*10
Example #5
0
def genre_by_hour_of_day(week_ucis, genre):
    """Normalized hour-of-day view-count distribution for a single genre."""
    filtered = week_ucis.filter(week_ucis.category == genre)
    per_hour = view_count_by_hour_of_day(filtered)
    total = sum(count for _, count in per_hour)
    doc_id = "{}-hour-of-day".format(genre.replace('/', '-'))
    return {"data": normalize(per_hour, total), "id": doc_id}
def run_daily_kpis(timestamp, users):
    # Service-wide daily KPI document, upserted into telenortv_insight_api.by_day.
    # `users` is presumably the full user table; `timestamp` is the day to report.
    daily_ucis = spark_io.get_daily_interactions(timestamp)
    date_string = timestamp.strftime('%Y-%m-%d')
    started_views = view_count(daily_ucis)
    daily_active_user = user_number(daily_ucis)
    user_viewtime = avg_user_viewtime(daily_ucis)
    views_by_action = normalize(action_type_view_count(daily_ucis),
                                started_views)
    complete_views = avg_finished_program_by_user(daily_ucis)
    completion_ratio = avg_completion_ratio(daily_ucis)
    daily_top_program = top_programs_by_view_count(daily_ucis, 10)
    daily_top_genre = normalize(
        top_tag_by_view_count(daily_ucis, 'category', 10), started_views)
    daily_top_channel = normalize(
        top_tag_by_view_count(daily_ucis, 'channelName', 10), started_views)
    # Hibernation compares this week's audience against the previous week's.
    week_ucis = spark_io.get_weekly_interactions(timestamp)
    last_week_ucis = spark_io.get_weekly_interactions(timestamp -
                                                      timedelta(days=7))
    weekly_hibernation = user_hibernation(week_ucis, last_week_ucis)
    new_users = weekly_new_user(users, timestamp)
    total_users = unique_user(users, timestamp)
    res = {
        "id": date_string,
        "datetime": timestamp.replace(tzinfo=pytz.UTC),
        "new-user": new_users,
        "unique-user": total_users,
        "viewing-time": user_viewtime,
        "daily-active-user": daily_active_user,
        "started-views": started_views,
        "views-by-action": views_by_action,
        "complete-views": complete_views,
        "content-completion": completion_ratio,
        "user-hibernation": weekly_hibernation,
        "top-programs": daily_top_program,
        "top-genres": daily_top_genre,
        "top-channels": daily_top_channel,
    }
    print res
    # Upsert keyed by date string, so re-running a day replaces its document.
    r.db('telenortv_insight_api')\
        .table('by_day')\
        .insert([res], conflict='replace').run()
    print date_string
    print "#" * 10
Example #7
0
def service_overview(dt):
    spark_io = SparkParquetIO()
    week_ucis = spark_io.get_weekly_interactions(dt)
    total_views = week_ucis.count()
    total_viewtime = week_ucis.groupBy().sum('duration').collect()[0]['sum(duration)'] / 60
    hour_of_day = normalize(
        view_count_by_hour_of_day(week_ucis), total_views)
    day_of_week = normalize(
        view_count_by_day_of_week(week_ucis), total_views)
    device_overview = normalize(
        top_tag_by_view_count(week_ucis, 'device'), total_views)
    device_conpletion_ratio = top_tag_by_completion_ratio(week_ucis, 'device')
    location_overview = normalize(
        top_tag_by_view_count(week_ucis, 'city'), total_views)
    ###################
    # this frist one should be wraped with genre title count as well
    genre_by_views = normalize(
        top_tag_by_view_count(week_ucis, 'category', row_limit= 20), total_views)
    ###################
    genre_by_viewtime = normalize(
        top_tag_by_total_viewtime(week_ucis, 'category', row_limit=20), total_viewtime)
    genre_by_completion_ratio = top_tag_by_completion_ratio(week_ucis, 'category', row_limit=20)
    genre_by_user_viewtime = top_tag_by_user_viewtime(week_ucis, 'category', row_limit=20)
    # about users
    user_complete_view = user_by_complete_views(week_ucis)
    user_viewtime = user_by_viewtime(week_ucis, 7)
    action_type_overview = top_tag_by_total_viewtime(week_ucis, 'actionType')
    action_type_overview = normalize(
        action_type_overview, sum([x[1] for x in action_type_overview])
    )
    res = [
        {"id": "hour-of-day", "data": hour_of_day},
        {"id": "day-of-week", "data": day_of_week},
        {"id": "genre-by-started-views", "data": genre_by_views},
        {"id": "genre-by-viewtime", "data": genre_by_viewtime},
        {"id": "genre-by-user-viewtime", "data": genre_by_user_viewtime},
        {"id": "genre-by-completion-ratio", "data": genre_by_completion_ratio},
        {"id": "device-by-started-views", "data": device_overview},
        {"id": "device-by-completion-ratio", "data": device_conpletion_ratio},
        {"id": "location-by-started-views", "data": location_overview},
        {"id": "genre-by-completion-ratio", "data": genre_by_completion_ratio},
        {"id": "user-by-complete-views", "data": user_complete_view},
        {"id": "user-by-viewtime", "data": user_viewtime},
        {"id": "action-type-overview", "data": action_type_overview}
    ]

    for x in res:
        print x
    r.db('gettv_insight_api').table('overview').insert(res, conflict='replace').run()
Example #8
0
def top_genre_vod(ucis, vod):
    """Top-10 normalized genre distribution for one VOD action type."""
    counts_per_item = ucis\
        .filter(ucis.actionType == vod)\
        .groupBy(ucis.inventoryID)\
        .count()\
        .collect()
    # (genre, count) pairs, dropping items whose genre lookup is falsy,
    # summed per genre and ordered by descending count.
    genre_counts = seq(counts_per_item)\
        .map(lambda row: (vod_genre(row['inventoryID']), row['count']))\
        .filter(lambda gc: gc[0])\
        .group_by(lambda gc: gc[0])\
        .map(lambda kg: (kg[0], sum(c for _, c in kg[1])))\
        .order_by(lambda gc: -gc[1])\
        .to_list()
    total = sum(c for _, c in genre_counts)
    return {
        "data": normalize(genre_counts, total)[:10],
        "id": "{}-genre-by-views".format(vod)
    }
Example #9
0
def run_vod_kpis(ucis, view_type):
    # VOD KPI dashboard for one view type; each KPI becomes a document keyed
    # by its id and upserted into the RethinkDB table named after `view_type`.
    # NOTE(review): `dt_end` is not defined in this function -- presumably a
    # module-level global set elsewhere; verify before reusing this code.
    started_views = view_count(ucis)
    # Current week vs. the preceding week, bounded on `firstEvent`.
    week_ucis = ucis.filter((dt_end - timedelta(days=6) < ucis.firstEvent)
                            & (ucis.firstEvent < dt_end + timedelta(days=1)))
    week_ago_ucis = ucis.filter((dt_end - timedelta(days=13) < ucis.firstEvent)
                                &
                                (ucis.firstEvent < dt_end - timedelta(days=6)))
    weekly_active_user = user_number(week_ucis)
    total_active_user = user_number(ucis)
    total_viewtime = total_viewing_time(ucis)
    user_viewtime = avg_user_viewtime(week_ucis)
    weekly_hibernation = user_hibernation(week_ucis, week_ago_ucis)
    top_program = top_programs_in_vod(ucis, 20)
    top_channel = normalize(top_tag_by_view_count(ucis, 'channelName'),
                            started_views)
    hour_of_day = normalize(view_count_by_hour_of_day(ucis), started_views)
    day_of_week = normalize(view_count_by_day_of_week(ucis), started_views)
    tag_user_package, user_package = users_package_overview(ucis)
    package_overview = {
        "{} user".format(view_type): tag_user_package,
        "linear TV user": user_package
    }
    # One document per KPI so each chart can be fetched individually.
    res = [{
        "title": 'started-views',
        "id": 'started-views',
        "started-views": started_views
    }, {
        "title": 'weekly-active-user',
        "id": 'weekly-active-user',
        "weekly-active-user": weekly_active_user
    }, {
        "title": 'total-active-user',
        "id": 'total-active-user',
        "total-active-user": total_active_user
    }, {
        "title": 'total-viewing-time',
        "id": 'total-viewing-time',
        "total-viewing-time": total_viewtime
    }, {
        "title": 'viewing-time',
        "id": 'viewing-time',
        "viewing-time": user_viewtime
    }, {
        "title": 'user-hibernation',
        "id": 'user-hibernation',
        "user-hibernation": weekly_hibernation
    }, {
        "title": 'top-programs',
        "id": 'top-programs',
        "data": top_program
    }, {
        "title": 'top-provider',
        "id": 'top-provider',
        "data": top_channel
    }, {
        "title": 'hour-of-day',
        "id": 'hour-of-day',
        "data": hour_of_day
    }, {
        "title": 'day-of-week',
        "id": 'day-of-week',
        "data": day_of_week
    }, {
        "title": 'package-overview',
        "id": 'package-overview',
        "data": package_overview
    }]
    r.db('telenortv_insight_api').table(view_type).insert(
        res, conflict='replace').run()
Example #10
0
def service_overview(dt):
    # Weekly service-level overview for telenortv: one document per chart id,
    # upserted into telenortv_insight_api.overview.
    spark_io = SparkParquetIO()
    week_ucis = spark_io.get_weekly_interactions(dt)
    total_views = week_ucis.count()
    # weekly sum of 'duration' divided by 60 -- presumably seconds -> minutes;
    # TODO confirm the unit of 'duration'
    total_viewtime = week_ucis.groupBy().sum(
        'duration').collect()[0]['sum(duration)'] / 60
    hour_of_day = normalize(view_count_by_hour_of_day(week_ucis), total_views)
    day_of_week = normalize(view_count_by_day_of_week(week_ucis), total_views)
    # about channels
    channel_by_views = normalize(
        top_tag_by_view_count(week_ucis, 'channelName', row_limit=20),
        total_views)
    channel_by_viewtime = normalize(
        top_tag_by_total_viewtime(week_ucis, 'channelName', row_limit=20),
        total_viewtime)
    channel_by_completion_ratio = top_tag_by_completion_ratio(week_ucis,
                                                              'channelName',
                                                              row_limit=20)
    channel_by_user_viewtime = top_tag_by_user_viewtime(week_ucis,
                                                        'channelName',
                                                        row_limit=20)
    # about genres
    ###################
    # this frist one should be wraped with genre title count as well
    genre_by_views = normalize(
        top_tag_by_view_count(week_ucis, 'category', row_limit=20),
        total_views)
    # Genre title counts are normalized by the catalogue-wide title count.
    genre_by_title_count = normalize([(x[0], title_count_for_genre(x[0]))
                                      for x in genre_by_views],
                                     total_title_count())
    genre_overview = {"views": genre_by_views, "titles": genre_by_title_count}
    ###################
    genre_by_viewtime = normalize(
        top_tag_by_total_viewtime(week_ucis, 'category', row_limit=20),
        total_viewtime)
    genre_by_completion_ratio = top_tag_by_completion_ratio(week_ucis,
                                                            'category',
                                                            row_limit=20)
    genre_by_user_viewtime = top_tag_by_user_viewtime(week_ucis,
                                                      'category',
                                                      row_limit=20)
    # about users
    user_complete_view = user_by_complete_views(week_ucis)
    user_viewtime = user_by_viewtime(week_ucis, 7)
    action_type_overview = top_tag_by_total_viewtime(week_ucis, 'actionType')
    action_type_overview = normalize(action_type_overview,
                                     sum([x[1] for x in action_type_overview]))
    # NOTE(review): "primeium" is the spelling of the external helper's name;
    # only the stored id below uses the correct "premium" spelling.
    basic_package_by_viewing_time = basic_package_user_viewing_time(week_ucis)
    primeium_package_by_viewing_time = primeium_package_user_viewing_time(
        week_ucis)
    package_overview = basic_additional_package_overview()
    res = [{
        "id": "user-package-overview",
        "data": package_overview
    }, {
        "id": "basic-package-user-viewing-time",
        "data": basic_package_by_viewing_time
    }, {
        "id": "premium-package-user-viewing-time",
        "data": primeium_package_by_viewing_time
    }, {
        "id": "hour-of-day",
        "data": hour_of_day
    }, {
        "id": "day-of-week",
        "data": day_of_week
    }, {
        "id": "channel-by-completion-ratio",
        "data": channel_by_completion_ratio
    }, {
        "id": "channel-by-views",
        "data": channel_by_views
    }, {
        "id": "channel-by-viewtime",
        "data": channel_by_viewtime
    }, {
        "id": "channel-by-user-viewtime",
        "data": channel_by_user_viewtime
    }, {
        "id": "genre-overview",
        "data": genre_overview
    }, {
        "id": "genre-by-viewtime",
        "data": genre_by_viewtime
    }, {
        "id": "genre-by-user-viewtime",
        "data": genre_by_user_viewtime
    }, {
        "id": "genre-by-completion-ratio",
        "data": genre_by_completion_ratio
    }, {
        "id": "user-by-complete-views",
        "data": user_complete_view
    }, {
        "id": "user-by-viewtime",
        "data": user_viewtime
    }, {
        "id": "action-type-overview",
        "data": action_type_overview
    }]

    print res
    r.db('telenortv_insight_api').table('overview').insert(
        res, conflict='replace').run()