def _load_data(filename):
    """Load a per-video series file and return a {video_id: total volume} dict."""
    vid_volume_dict = {}
    with open(os.path.join(data_prefix_dir, '{0}.csv'.format(filename))) as fin:
        fin.readline()  # skip the header line
        for line in fin:
            vid, series = line.rstrip().split('\t', 1)
            vid_volume_dict[vid] = np.sum(read_as_float_array(series, delimiter='\t'))
    return vid_volume_dict
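
# Usage sketch (hypothetical filename; not part of the original module): sum the daily
# series stored in <data_prefix_dir>/train_view.csv and look up a single video, e.g.
#   view_volume = _load_data('train_view')
#   print(view_volume['RzvS7OmShAE'])
# This assumes `data_prefix_dir`, `np` (numpy) and `read_as_float_array` are defined at
# module level, as in the original project.
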
def _load_data(filepath):
    """Accumulate relative engagement at 30, 60, 90 and 120 days for each qualified video."""
    with open(filepath, 'r') as fin:
        fin.readline()  # skip the header line
        for line in fin:
            _, _, duration, dump = line.split('\t', 3)
            _, days, views, watches = dump.rstrip().rsplit('\t', 3)

            duration = int(duration)
            days = read_as_int_array(days, delimiter=',', truncated=age)
            daily_view = read_as_int_array(views, delimiter=',', truncated=age)
            daily_watch = read_as_float_array(watches, delimiter=',', truncated=age)

            # skip videos with no views in the first 30 days
            if np.sum(daily_view[days < 30]) == 0:
                continue

            # watch percentage and relative engagement over the first 30, 60, 90 and 120 days
            for idx, t in enumerate([30, 60, 90, 120]):
                wp_t = np.sum(daily_watch[days < t]) * 60 / np.sum(daily_view[days < t]) / duration
                relative_engagement_quad[idx].append(to_relative_engagement(engagement_map, duration, wp_t, lookup_keys=lookup_durations))
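
# Context sketch (assumption; the driver code is not shown in this snippet): the loader
# above fills a module-level accumulator with one list of relative engagement values per
# time horizon, e.g.
#   relative_engagement_quad = [[], [], [], []]   # 30, 60, 90 and 120 days
#   _load_data('data/active_videos.tsv')
# `age`, `engagement_map` and `lookup_durations` are likewise assumed to be module-level
# globals, as in the original project.
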
def extract_info(input_path, output_file, truncated=None):
    """
    Extract essential information from each video.
    :param input_path: input file path
    :param output_file: output file handle
    :param truncated: number of leading elements to keep from the attention-dynamics series
    :return: None, rows are written to output_file
    """
    with open(input_path, 'r') as fin:
        for line in fin:
            # skip if data is corrupted or reading duration fails
            try:
                video = json.loads(line.rstrip())
                duration = isodate.parse_duration(video['contentDetails']['duration']).seconds
            except:
                continue

            # skip if there is no insights data, no watch-time data, or the duration is zero
            if 'insights' not in video or video['insights']['avgWatch'] == 'N' or duration == 0:
                continue

            published_at = video['snippet']['publishedAt'][:10]
            start_date = video['insights']['startDate']
            time_diff = (datetime(*map(int, start_date.split('-'))) - datetime(*map(int, published_at.split('-')))).days
            days = read_as_int_array(video['insights']['days'], delimiter=',', truncated=truncated) + time_diff
            if truncated is not None:
                days = days[days < truncated]
            daily_view = read_as_int_array(video['insights']['dailyView'], delimiter=',', truncated=len(days))
            view30 = np.sum(daily_view[days < 30])

            # pre-filtering: require at least 100 views in the first 30 days
            if view30 < 100:
                continue

            daily_watch = read_as_float_array(video['insights']['dailyWatch'], delimiter=',', truncated=len(days))
            watch30 = np.sum(daily_watch[days < 30])
            # watch percentage: total watch time (minutes converted to seconds) / total views / duration
            wp30 = watch30 * 60 / view30 / duration
            # cap watch percentage at 1
            if wp30 > 1:
                wp30 = 1
            re30 = to_relative_engagement(engagement_map, duration, wp30, lookup_keys=lookup_durations)

            # topic information
            if 'topicDetails' in video:
                if 'topicIds' in video['topicDetails']:
                    topic_ids = set(video['topicDetails']['topicIds'])
                else:
                    topic_ids = set()
                if 'relevantTopicIds' in video['topicDetails']:
                    relevant_topic_ids = set(video['topicDetails']['relevantTopicIds'])
                else:
                    relevant_topic_ids = set()
                topics_set = topic_ids.union(relevant_topic_ids)
                topics = strify(topics_set)
            else:
                topics = 'NA'

            # detect description language
            description = video['snippet']['description']
            try:
                detect_lang = detect(description)
            except:
                detect_lang = 'NA'

            vid = video['id']
            definition = int(video['contentDetails']['definition'] == 'hd')
            category = video['snippet']['categoryId']
            channel = video['snippet']['channelId']

            output_file.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\n'
                              .format(vid, published_at, duration, definition, category, detect_lang, channel, topics,
                                      view30, watch30, wp30, re30, strify(days), strify(daily_view), strify(daily_watch)))
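
# Usage sketch for extract_info above (hypothetical paths; not the original driver):
#   with open('output/train_data.tsv', 'w') as fout:
#       extract_info('data/video_metadata.json', fout, truncated=120)

# The fragment below comes from a separate plotting script whose enclosing loop is not
# shown. A plausible reconstruction (assumption) is a directory walk such as
#   for subdir, _, files in os.walk(input_dir):
# with `age`, `window_size` (a sliding window in days, e.g. 7) and `min_window_view`
# defined at module level.
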
        for f in files:
            with open(os.path.join(subdir, f), 'r') as fin:
                fin.readline()
                for line in fin:
                    dump, days, daily_view, daily_watch = line.rstrip().rsplit('\t', 3)
                    vid, _, duration, _ = dump.split('\t', 3)
                    if vid == 'RzvS7OmShAE':
                        fig_idx = 0
                    elif vid == 'rKdNjlNYMKk':
                        fig_idx = 1
                    else:
                        continue
                    duration = int(duration)
                    days = read_as_int_array(days, delimiter=',', truncated=age)
                    daily_view = read_as_int_array(daily_view, delimiter=',', truncated=age)
                    daily_watch = read_as_float_array(daily_watch, delimiter=',', truncated=age)

                    # moving-window solution: use the past window_size days to calculate watch percentage
                    cumulative_wp = []
                    for i in range(days[-1]+1):
                        if i < window_size:
                            past_window_views = np.sum(daily_view[days <= i])
                            past_window_watches = np.sum(daily_watch[days <= i])
                        else:
                            past_window_views = np.sum(daily_view[(i-window_size < days) & (days <= i)])
                            past_window_watches = np.sum(daily_watch[(i-window_size < days) & (days <= i)])
                        if past_window_views < min_window_view:
                            break
                        cumulative_wp.append(past_window_watches * 60 / past_window_views / duration)

                    cumulative_engagement = to_relative_engagement(engagement_map, duration, cumulative_wp, lookup_keys=lookup_durations)
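
                    # Plotting sketch (assumption; the original figure code is not shown):
                    # the two selected videos (fig_idx 0 and 1) could be drawn as the two
                    # panels of a matplotlib figure created before the file loop, e.g.
                    #   fig, axes = plt.subplots(1, 2, sharey=True)
                    #   axes[fig_idx].plot(np.arange(len(cumulative_engagement)), cumulative_engagement)
                    #   axes[fig_idx].set_xlabel('video age (days)')
                    #   axes[fig_idx].set_ylabel('relative engagement')
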
def extract_info(input_path, output_path, truncated=None):
    """
    Extract essential information from each video.
    :param input_path: input file path
    :param output_path: output file path
    :param truncated: number of leading elements to keep from the attention-dynamics series
    :return: None, rows are written to output_path
    """
    fout = open(output_path, 'w')
    fout.write(
        '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\n'
        .format('id', 'publish', 'duration', 'definition', 'category',
                'detect_lang', 'channel', 'topics', 'view30', 'watch30',
                'wp30', 'days', 'daily_view', 'daily_watch'))

    with open(input_path, 'r') as fin:
        for line in fin:
            # skip if the JSON record is corrupted
            try:
                video = json.loads(line.rstrip())
            except:
                continue

            vid = video['id']
            published_at = video['snippet']['publishedAt'][:10]
            duration = isodate.parse_duration(
                video['contentDetails']['duration']).seconds
            definition = int(video['contentDetails']['definition'] == 'hd')
            category = video['snippet']['categoryId']
            detect_lang = video['snippet']['detectLang']
            channel = video['snippet']['channelId']

            # freebase topic information
            if 'topicDetails' in video:
                if 'topicIds' in video['topicDetails']:
                    topic_ids = set(video['topicDetails']['topicIds'])
                else:
                    topic_ids = set()
                if 'relevantTopicIds' in video['topicDetails']:
                    relevant_topic_ids = set(
                        video['topicDetails']['relevantTopicIds'])
                else:
                    relevant_topic_ids = set()
                topics_set = topic_ids.union(relevant_topic_ids)
                topics = strify(topics_set)
            else:
                topics = 'NA'

            # attention dynamics information
            start_date = video['insights']['startDate']
            time_diff = (datetime(*map(int, start_date.split('-'))) -
                         datetime(*map(int, published_at.split('-')))).days
            days = read_as_int_array(video['insights']['days'],
                                     delimiter=',',
                                     truncated=truncated) + time_diff
            if truncated is not None:
                days = days[days < truncated]
            daily_view = read_as_int_array(video['insights']['dailyView'],
                                           delimiter=',',
                                           truncated=len(days))
            view30 = np.sum(daily_view[days < 30])
            daily_watch = read_as_float_array(video['insights']['dailyWatch'],
                                              delimiter=',',
                                              truncated=len(days))
            watch30 = np.sum(daily_watch[days < 30])
            # the data has been cleaned beforehand, so every video has more than 100 views in
            # the first 30 days; this also rules out zero views and the very occasional
            # zero-duration (live-streamed) video
            wp30 = watch30 * 60 / view30 / duration
            # cap watch percentage at 1
            if wp30 > 1:
                wp30 = 1

            fout.write(
                '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\n'
                .format(vid, published_at, duration, definition, category,
                        detect_lang, channel, topics, view30, watch30, wp30,
                        strify(days), strify(daily_view), strify(daily_watch)))
    fout.close()
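
# Usage sketch (hypothetical paths; the original driver script is not shown here): keep
# the first 120 days of attention dynamics for every video in the JSON dump.
#   extract_info('data/video_metadata.json', 'output/train_data.tsv', truncated=120)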