def _load_data(filename):
    """ Load the total volume of every video from a tab-separated file.

    The file ``<data_prefix_dir>/<filename>.csv`` is read line by line. The
    first line is consumed and ignored; on each remaining line the text before
    the first tab is the video id and everything after it is a tab-delimited
    numeric series.

    :param filename: base name of the data file, without the ``.csv`` suffix
    :return: dict mapping video id to the sum of its series
    """
    csv_path = os.path.join(data_prefix_dir, '{0}.csv'.format(filename))
    with open(csv_path) as reader:
        # the first line is discarded (presumably a header row)
        reader.readline()
        # split lazily so the file is still streamed one row at a time
        records = (row.rstrip().split('\t', 1) for row in reader)
        return {video_id: np.sum(read_as_float_array(series, delimiter='\t'))
                for video_id, series in records}
def _load_data(filepath): with open(filepath, 'r') as fin: fin.readline() for line in fin: _, _, duration, dump = line.split('\t', 3) _, days, views, watches = dump.rstrip().rsplit('\t', 3) duration = int(duration) days = read_as_int_array(days, delimiter=',', truncated=age) daily_view = read_as_int_array(views, delimiter=',', truncated=age) daily_watch = read_as_float_array(watches, delimiter=',', truncated=age) if np.sum(daily_view[days < 30]) == 0: continue for idx, t in enumerate([30, 60, 90, 120]): wp_t = np.sum(daily_watch[days < t]) * 60 / np.sum(daily_view[days < t]) / duration relative_engagement_quad[idx].append(to_relative_engagement(engagement_map, duration, wp_t, lookup_keys=lookup_durations))
def extract_info(input_path, output_file, truncated=None):
    """ Extract essential information from each video.

    Every line of the input is parsed as one JSON video record. A record is
    skipped when it cannot be parsed, when its duration cannot be read or is
    zero, when it has no usable insights data, or when it has fewer than 100
    views in its first 30 days. For every retained record one tab-separated
    row of 15 fields is written to ``output_file``.

    :param input_path: input file path
    :param output_file: output file handler
    :param truncated: head number of elements extracted; day indices at or
        above this value are dropped. ``None`` applies no upper bound.
    :return:
    """
    with open(input_path, 'r') as fin:
        for line in fin:
            # skip if data is corrupted or reading duration fails;
            # `except Exception` keeps the best-effort skip without swallowing
            # KeyboardInterrupt/SystemExit like a bare `except:` would
            try:
                video = json.loads(line.rstrip())
                # NOTE(review): `.seconds` ignores the days component, so a
                # duration of 24h or longer is under-reported -- confirm
                # whether `total_seconds()` is wanted here
                duration = isodate.parse_duration(video['contentDetails']['duration']).seconds
            except Exception:
                continue

            # skip if not insights data or not watching data
            if 'insights' not in video or video['insights']['avgWatch'] == 'N' or duration == 0:
                continue
            insights = video['insights']

            published_at = video['snippet']['publishedAt'][:10]
            start_date = insights['startDate']
            # shift the day indices by the gap between publish date and insights start date
            time_diff = (datetime(*map(int, start_date.split('-')))
                         - datetime(*map(int, published_at.split('-')))).days
            # presumably read_as_int_array reads the whole series when truncated is None -- TODO confirm
            days = read_as_int_array(insights['days'], delimiter=',', truncated=truncated) + time_diff
            # the default truncated=None cannot be compared with the array, treat it as unbounded
            if truncated is not None:
                days = days[days < truncated]

            first_month = days < 30
            daily_view = read_as_int_array(insights['dailyView'], delimiter=',', truncated=len(days))
            view30 = np.sum(daily_view[first_month])

            # pre-filtering: have at least 100 views in first 30 days
            if view30 < 100:
                continue

            daily_watch = read_as_float_array(insights['dailyWatch'], delimiter=',', truncated=len(days))
            watch30 = np.sum(daily_watch[first_month])
            wp30 = watch30 * 60 / view30 / duration
            # upper bound watch percentage to 1
            if wp30 > 1:
                wp30 = 1
            re30 = to_relative_engagement(engagement_map, duration, wp30, lookup_keys=lookup_durations)

            # topic information: union of topic ids and relevant topic ids, 'NA' when absent
            if 'topicDetails' in video:
                topic_details = video['topicDetails']
                topic_ids = set(topic_details['topicIds']) if 'topicIds' in topic_details else set()
                if 'relevantTopicIds' in topic_details:
                    relevant_topic_ids = set(topic_details['relevantTopicIds'])
                else:
                    relevant_topic_ids = set()
                topics = strify(topic_ids.union(relevant_topic_ids))
            else:
                topics = 'NA'

            # detect description language, fall back to 'NA' when detection fails
            description = video['snippet']['description']
            try:
                detect_lang = detect(description)
            except Exception:
                detect_lang = 'NA'

            vid = video['id']
            # 1 for 'hd', 0 for anything else
            definition = int(video['contentDetails']['definition'] == 'hd')
            category = video['snippet']['categoryId']
            channel = video['snippet']['channelId']

            output_file.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\n'
                              .format(vid, published_at, duration, definition, category, detect_lang, channel,
                                      topics, view30, watch30, wp30, re30,
                                      strify(days), strify(daily_view), strify(daily_watch)))
# Scan every file listed in `files` under `subdir` for two specific video ids
# and build the relative engagement trajectory of each one over time.
for f in files:
    with open(os.path.join(subdir, f), 'r') as fin:
        # the first line is read and discarded (presumably a header row)
        fin.readline()
        for line in fin:
            # the last three tab-separated fields are the daily series,
            # the leading part holds the video id and the duration
            dump, days, daily_view, daily_watch = line.rstrip().rsplit('\t', 3)
            vid, _, duration, _ = dump.split('\t', 3)
            # only two hand-picked videos are processed, every other line is skipped;
            # fig_idx records which of the two matched (presumably a figure/panel index
            # consumed by code outside this fragment -- TODO confirm)
            if vid == 'RzvS7OmShAE':
                fig_idx = 0
            elif vid == 'rKdNjlNYMKk':
                fig_idx = 1
            else:
                continue

            duration = int(duration)
            days = read_as_int_array(days, delimiter=',', truncated=age)
            daily_view = read_as_int_array(daily_view, delimiter=',', truncated=age)
            daily_watch = read_as_float_array(daily_watch, delimiter=',', truncated=age)

            # a moving window solution, using the past window_size days to calculate wp
            # (the original note says 7 days -- confirm against window_size)
            cumulative_wp = []
            for i in range(days[-1]+1):
                if i < window_size:
                    # fewer than window_size days elapsed: use everything up to day i
                    past_window_views = np.sum(daily_view[days <= i])
                    past_window_watches = np.sum(daily_watch[days <= i])
                else:
                    # otherwise only the days in the half-open range (i-window_size, i]
                    past_window_views = np.sum(daily_view[(i-window_size < days) & (days <= i)])
                    past_window_watches = np.sum(daily_watch[(i-window_size < days) & (days <= i)])
                # stop the trajectory at the first window with too few views
                if past_window_views < min_window_view:
                    break
                cumulative_wp.append(past_window_watches * 60 / past_window_views / duration)

            # the whole list of windowed watch percentages is converted in one call
            cumulative_engagement = to_relative_engagement(engagement_map, duration, cumulative_wp, lookup_keys=lookup_durations)
def extract_info(input_path, output_path, truncated=None):
    """ Extract essential information from each video.

    A header row is written to ``output_path`` first. Every line of the input
    is then parsed as one JSON video record; lines that are not valid JSON are
    skipped, and one tab-separated row of 14 fields is written for each
    remaining record. A record missing an expected key raises ``KeyError``;
    the output file is closed in that case as well.

    :param input_path: input file path
    :param output_path: output file path
    :param truncated: head number of extracted elements in attention dynamics;
        day indices at or above this value are dropped. ``None`` applies no
        upper bound.
    :return:
    """
    # header and data rows share one 14-column layout
    row_template = '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\n'

    # the context manager closes (and flushes) the output even when a record raises
    with open(output_path, 'w') as fout:
        fout.write(row_template.format('id', 'publish', 'duration', 'definition', 'category', 'detect_lang',
                                       'channel', 'topics', 'view30', 'watch30', 'wp30',
                                       'days', 'daily_view', 'daily_watch'))

        with open(input_path, 'r') as fin:
            for line in fin:
                # skip if data is corrupted; json decoding errors are ValueError subclasses,
                # so KeyboardInterrupt/SystemExit are no longer swallowed
                try:
                    video = json.loads(line.rstrip())
                except ValueError:
                    continue

                vid = video['id']
                published_at = video['snippet']['publishedAt'][:10]
                # NOTE(review): `.seconds` ignores the days component, so a
                # duration of 24h or longer is under-reported -- confirm
                # whether `total_seconds()` is wanted here
                duration = isodate.parse_duration(video['contentDetails']['duration']).seconds
                # 1 for 'hd', 0 for anything else
                definition = int(video['contentDetails']['definition'] == 'hd')
                category = video['snippet']['categoryId']
                detect_lang = video['snippet']['detectLang']
                channel = video['snippet']['channelId']

                # freebase topic information: union of topic ids and relevant topic ids
                if 'topicDetails' in video:
                    topic_details = video['topicDetails']
                    topic_ids = set(topic_details['topicIds']) if 'topicIds' in topic_details else set()
                    if 'relevantTopicIds' in topic_details:
                        relevant_topic_ids = set(topic_details['relevantTopicIds'])
                    else:
                        relevant_topic_ids = set()
                    topics = strify(topic_ids.union(relevant_topic_ids))
                else:
                    topics = 'NA'

                # attention dynamics information
                insights = video['insights']
                start_date = insights['startDate']
                # shift the day indices by the gap between publish date and insights start date
                time_diff = (datetime(*map(int, start_date.split('-')))
                             - datetime(*map(int, published_at.split('-')))).days
                # presumably read_as_int_array reads the whole series when truncated is None -- TODO confirm
                days = read_as_int_array(insights['days'], delimiter=',', truncated=truncated) + time_diff
                # the default truncated=None cannot be compared with the array, treat it as unbounded
                if truncated is not None:
                    days = days[days < truncated]

                first_month = days < 30
                daily_view = read_as_int_array(insights['dailyView'], delimiter=',', truncated=len(days))
                view30 = np.sum(daily_view[first_month])
                daily_watch = read_as_float_array(insights['dailyWatch'], delimiter=',', truncated=len(days))
                watch30 = np.sum(daily_watch[first_month])

                # the input is expected to be pre-cleaned (at least 100 views in the first
                # 30 days), so zero views / zero duration are not guarded against here
                wp30 = watch30 * 60 / view30 / duration
                # upper bound watch percentage to 1
                if wp30 > 1:
                    wp30 = 1

                fout.write(row_template.format(vid, published_at, duration, definition, category, detect_lang,
                                               channel, topics, view30, watch30, wp30,
                                               strify(days), strify(daily_view), strify(daily_watch)))