コード例 #1
0
def extract_engagement_dynamics_from_file(filepath, engagement_map_series, split_key_series, window_size, min_view=100):
    """Extract cumulative and sliding-window engagement dynamics per video.

    Reads a tab-separated file (header line skipped) and, for each video and
    each day index up to the series length, records the watch percentage and
    the relative engagement, both cumulatively and over a trailing window of
    ``window_size`` days. Rows are written to the module-level ``cum_output``
    and ``sliding_output`` handles.

    :param filepath: path to the input TSV file
    :param engagement_map_series: one engagement map per day index
    :param split_key_series: one lookup-key collection per day index
    :param window_size: trailing window length in days
    :param min_view: minimum view count for a data point to be recorded
    """
    age = len(engagement_map_series)
    with open(filepath, 'r') as fin:
        fin.readline()  # discard header row
        for line in fin:
            vid, _, duration, _, _, _, _, _, _, _, _, days, daily_view, daily_watch = line.rstrip().split('\t')
            duration = int(duration)
            days = read_as_int_array(days, delimiter=',', truncated=age)
            daily_view = read_as_int_array(daily_view, delimiter=',', truncated=age)
            daily_watch = read_as_float_array(daily_watch, delimiter=',', truncated=age)

            cum_day_list, cum_wp_list, cum_re_list = [], [], []
            sliding_day_list, sliding_wp_list, sliding_re_list = [], [], []

            for day in range(age):
                seen_so_far = days <= day

                # cumulative watch percentage and relative engagement
                cum_views = np.sum(daily_view[seen_so_far])
                if cum_views >= min_view:
                    cum_watches = np.sum(daily_watch[seen_so_far])
                    # watch percentage is upper-bounded at 1
                    cum_wp = min(cum_watches * 60 / cum_views / duration, 1)
                    cum_day_list.append(day)
                    cum_wp_list.append(cum_wp)
                    cum_re_list.append(to_relative_engagement(engagement_map_series[day], duration, cum_wp,
                                                              lookup_keys=split_key_series[day]))

                # trailing-window watch percentage and relative engagement
                if day < window_size:
                    in_window = seen_so_far
                else:
                    in_window = (days > day - window_size) & seen_so_far
                sliding_views = np.sum(daily_view[in_window])
                if sliding_views >= min_view:
                    sliding_watches = np.sum(daily_watch[in_window])
                    sliding_wp = min(sliding_watches * 60 / sliding_views / duration, 1)
                    sliding_day_list.append(day)
                    sliding_wp_list.append(sliding_wp)
                    sliding_re_list.append(to_relative_engagement(engagement_map_series[day], duration, sliding_wp,
                                                                  lookup_keys=split_key_series[day]))

            # emit one row per video, only when at least one point was collected
            if cum_day_list:
                cum_output.write('{0}\t{1}\t{2}\t{3}\n'.format(vid, strify(cum_day_list, delimiter=','),
                                                               strify(cum_wp_list, delimiter=','),
                                                               strify(cum_re_list, delimiter=',')))
            if sliding_day_list:
                sliding_output.write('{0}\t{1}\t{2}\t{3}\n'.format(vid, strify(sliding_day_list, delimiter=','),
                                                                   strify(sliding_wp_list, delimiter=','),
                                                                   strify(sliding_re_list, delimiter=',')))
コード例 #2
0
def _load_data(filepath):
    """Collect relative engagement measured at the 30/60/90/120-day horizons.

    Reads a tab-separated file (header line skipped) and, for every video
    with at least one view in its first 30 days, appends its relative
    engagement at each horizon into the module-level
    ``relative_engagement_quad`` lists.

    :param filepath: path to the input TSV file
    """
    with open(filepath, 'r') as fin:
        fin.readline()  # discard header row
        for line in fin:
            # only the 3rd column and the last three columns are needed
            _, _, duration, dump = line.split('\t', 3)
            _, days, views, watches = dump.rstrip().rsplit('\t', 3)

            duration = int(duration)
            days = read_as_int_array(days, delimiter=',', truncated=age)
            daily_view = read_as_int_array(views, delimiter=',', truncated=age)
            daily_watch = read_as_float_array(watches, delimiter=',', truncated=age)

            # skip videos with no views at all in their first 30 days
            if np.sum(daily_view[days < 30]) == 0:
                continue

            for quad_idx, horizon in enumerate((30, 60, 90, 120)):
                early = days < horizon
                wp_t = np.sum(daily_watch[early]) * 60 / np.sum(daily_view[early]) / duration
                relative_engagement_quad[quad_idx].append(
                    to_relative_engagement(engagement_map, duration, wp_t, lookup_keys=lookup_durations))
コード例 #3
0
def extract_engagement_data_from_file(filepath, age, handles, threshold=100):
    """Write per-day cumulative watch statistics for every video in *filepath*.

    For each day ``i`` in ``range(age)`` on which the video has accumulated
    at least ``threshold`` views, appends a CSV row
    ``vid,category,duration,watch_percentage,watch_time`` to ``handles[i]``.

    :param filepath: path to the input TSV file (header line skipped)
    :param age: number of day indices to evaluate
    :param handles: writable file handles, one per day index
    :param threshold: minimum cumulative views before a row is written
    """
    with open(filepath, 'r') as fin:
        fin.readline()  # discard header row
        for line in fin:
            vid, _, duration, _, category, _, _, _, _, _, _, days, daily_view, daily_watch = line.rstrip().split('\t')
            duration = int(duration)
            days = read_as_int_array(days, delimiter=',', truncated=age)
            daily_view = read_as_int_array(daily_view, delimiter=',', truncated=age)
            daily_watch = read_as_float_array(daily_watch, delimiter=',', truncated=age)
            for day in range(age):
                seen = days <= day
                views = np.sum(daily_view[seen])
                if views < threshold:
                    continue  # guard clause: not enough views accumulated yet
                # average watch time in seconds, bounded above by the video length
                watch_time = min(np.sum(daily_watch[seen]) * 60 / views, duration)
                handles[day].write('{0},{1},{2},{3},{4}\n'.format(
                    vid, category, duration, watch_time / duration, watch_time))
コード例 #4
0
                    863, 897, 1039, 1348, 1659, 1259, 1559
                ]
                fig_cnt += 1
            elif vid == 'hxUh6dS5Q_Q':
                fig_idx = 1
                daily_view = [
                    20634, 15162, 6925, 5132, 4348, 3625, 3437, 5255, 6226,
                    6021, 10104, 7183, 11172, 10655, 15246, 15990, 17911,
                    14262, 12128, 11120, 7191, 5882, 3867, 5271, 2352, 2004,
                    2677, 2817, 3266, 2968
                ]
                fig_cnt += 1
            else:
                continue

            days = read_as_int_array(days, delimiter=',')
            re_list = read_as_float_array(re_list, delimiter=',')

            # power-law fitting
            a, b, c = fit_with_powerlaw(days, re_list)

            print(vid, r'model: {0:.3f}t^{1:.3f}+{2:.3f}'.format(a, b, c))

            # == == == == == == == == Part 2: Plot fitting result == == == == == == == == #
            to_plot = True
            if to_plot:
                ax1 = fig.add_subplot(1, 2, 1 + fig_idx)
                ax2 = ax1.twinx()
                ax1.plot(ts, daily_view, 'b-')
                ax2.plot(ts, re_list, 'k--')
                ax2.plot(ts, [func_powerlaw(x, a, b, c) for x in ts], 'r-')
コード例 #5
0
def extract_info(input_path, output_file, truncated=None):
    """
    Extract essential information from each video.
    :param input_path: input file path
    :param output_file: output file handler
    :param truncated: head number of elements extracted; None keeps the full series
    :return:
    """
    with open(input_path, 'r') as fin:
        for line in fin:
            # skip if data is corrupted or reading duration fails
            # (narrowed from a bare except so SystemExit/KeyboardInterrupt propagate)
            try:
                video = json.loads(line.rstrip())
                duration = isodate.parse_duration(video['contentDetails']['duration']).seconds
            except Exception:
                continue

            # skip if not insights data or not watching data
            if 'insights' not in video or video['insights']['avgWatch'] == 'N' or duration == 0:
                continue

            published_at = video['snippet']['publishedAt'][:10]
            start_date = video['insights']['startDate']
            # insights days are counted from startDate; shift them onto the publish date
            time_diff = (datetime(*map(int, start_date.split('-'))) - datetime(*map(int, published_at.split('-')))).days
            days = read_as_int_array(video['insights']['days'], delimiter=',', truncated=truncated) + time_diff
            # bug fix: `days < None` raises TypeError when `truncated` keeps its
            # default, so only cut the series when a limit is actually given
            if truncated is not None:
                days = days[days < truncated]
            daily_view = read_as_int_array(video['insights']['dailyView'], delimiter=',', truncated=len(days))
            view30 = np.sum(daily_view[days < 30])

            # pre-filtering: have at least 100 views in first 30 days
            if view30 < 100:
                continue

            daily_watch = read_as_float_array(video['insights']['dailyWatch'], delimiter=',', truncated=len(days))
            watch30 = np.sum(daily_watch[days < 30])
            wp30 = watch30 * 60 / view30 / duration
            # upper bound watch percentage to 1
            if wp30 > 1:
                wp30 = 1
            re30 = to_relative_engagement(engagement_map, duration, wp30, lookup_keys=lookup_durations)

            # topic information: union of direct and relevant topic ids
            if 'topicDetails' in video:
                topic_ids = set(video['topicDetails'].get('topicIds', []))
                relevant_topic_ids = set(video['topicDetails'].get('relevantTopicIds', []))
                topics = strify(topic_ids.union(relevant_topic_ids))
            else:
                topics = 'NA'

            # detect description language, falling back to 'NA' when detection fails
            # (narrowed from a bare except so KeyboardInterrupt propagates)
            description = video['snippet']['description']
            try:
                detect_lang = detect(description)
            except Exception:
                detect_lang = 'NA'

            vid = video['id']
            definition = [0, 1][video['contentDetails']['definition'] == 'hd']
            category = video['snippet']['categoryId']
            channel = video['snippet']['channelId']

            output_file.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\n'
                              .format(vid, published_at, duration, definition, category, detect_lang, channel, topics,
                                      view30, watch30, wp30, re30, strify(days), strify(daily_view), strify(daily_watch)))
コード例 #6
0
    for subdir, _, files in os.walk(data_loc):
        for f in files:
            with open(os.path.join(subdir, f), 'r') as fin:
                fin.readline()
                for line in fin:
                    dump, days, daily_view, daily_watch = line.rstrip().rsplit('\t', 3)
                    vid, _, duration, _ = dump.split('\t', 3)
                    if vid == 'RzvS7OmShAE':
                        fig_idx = 0
                    elif vid == 'rKdNjlNYMKk':
                        fig_idx = 1
                    else:
                        continue
                    duration = int(duration)
                    days = read_as_int_array(days, delimiter=',', truncated=age)
                    daily_view = read_as_int_array(daily_view, delimiter=',', truncated=age)
                    daily_watch = read_as_float_array(daily_watch, delimiter=',', truncated=age)

                    # a moving windows solution, using past 7 days to calculate wp
                    cumulative_wp = []
                    for i in range(days[-1]+1):
                        if i < window_size:
                            past_window_views = np.sum(daily_view[days <= i])
                            past_window_watches = np.sum(daily_watch[days <= i])
                        else:
                            past_window_views = np.sum(daily_view[(i-window_size < days) & (days <= i)])
                            past_window_watches = np.sum(daily_watch[(i-window_size < days) & (days <= i)])
                        if past_window_views < min_window_view:
                            break
                        cumulative_wp.append(past_window_watches * 60 / past_window_views / duration)
コード例 #7
0
def extract_info(input_path, output_path, truncated=None):
    """
    Extract essential information from each video.
    :param input_path: input file path
    :param output_path: output file path
    :param truncated: head number of extracted elements in attention dynamics;
        None keeps the full series
    :return:
    """
    # bug fix: the output file was opened and closed manually, leaking the
    # handle on any exception; a context manager guarantees it is closed
    with open(output_path, 'w') as fout:
        fout.write(
            '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\n'
            .format('id', 'publish', 'duration', 'definition', 'category',
                    'detect_lang', 'channel', 'topics', 'view30', 'watch30',
                    'wp30', 'days', 'daily_view', 'daily_watch'))

        with open(input_path, 'r') as fin:
            for line in fin:
                # skip if data is corrupted
                # (narrowed from a bare except so KeyboardInterrupt propagates)
                try:
                    video = json.loads(line.rstrip())
                except Exception:
                    continue

                vid = video['id']
                published_at = video['snippet']['publishedAt'][:10]
                duration = isodate.parse_duration(
                    video['contentDetails']['duration']).seconds
                definition = [0, 1][video['contentDetails']['definition'] == 'hd']
                category = video['snippet']['categoryId']
                detect_lang = video['snippet']['detectLang']
                channel = video['snippet']['channelId']

                # freebase topic information: union of direct and relevant ids
                if 'topicDetails' in video:
                    topic_ids = set(video['topicDetails'].get('topicIds', []))
                    relevant_topic_ids = set(
                        video['topicDetails'].get('relevantTopicIds', []))
                    topics = strify(topic_ids.union(relevant_topic_ids))
                else:
                    topics = 'NA'

                # attention dynamics information: insights days are counted from
                # startDate, so shift them onto the publish date
                start_date = video['insights']['startDate']
                time_diff = (datetime(*map(int, start_date.split('-'))) -
                             datetime(*map(int, published_at.split('-')))).days
                days = read_as_int_array(video['insights']['days'],
                                         delimiter=',',
                                         truncated=truncated) + time_diff
                # bug fix: `days < None` raises TypeError when `truncated` keeps
                # its default, so only cut the series when a limit is given
                if truncated is not None:
                    days = days[days < truncated]
                daily_view = read_as_int_array(video['insights']['dailyView'],
                                               delimiter=',',
                                               truncated=len(days))
                view30 = np.sum(daily_view[days < 30])
                daily_watch = read_as_float_array(video['insights']['dailyWatch'],
                                                  delimiter=',',
                                                  truncated=len(days))
                watch30 = np.sum(daily_watch[days < 30])
                # I have cleaned the data, so views in the first 30 days will be greater than 100
                # take care of zero view and very occasionally (streamed video) zero duration
                wp30 = watch30 * 60 / view30 / duration
                # upper bound watch percentage to 1
                if wp30 > 1:
                    wp30 = 1

                fout.write(
                    '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\n'
                    .format(vid, published_at, duration, definition, category,
                            detect_lang, channel, topics, view30, watch30, wp30,
                            strify(days), strify(daily_view), strify(daily_watch)))