def extract_engagement_dynamics_from_file(filepath, engagement_map_series, split_key_series, window_size, min_view=100):
    age = len(engagement_map_series)
    with open(filepath, 'r') as fin:
        fin.readline()
        for line in fin:
            vid, _, duration, _, _, _, _, _, _, _, _, days, daily_view, daily_watch = line.rstrip().split('\t')
            duration = int(duration)
            days = read_as_int_array(days, delimiter=',', truncated=age)
            daily_view = read_as_int_array(daily_view, delimiter=',', truncated=age)
            daily_watch = read_as_float_array(daily_watch, delimiter=',', truncated=age)

            cum_day_list = []
            cum_wp_list = []
            cum_re_list = []
            sliding_day_list = []
            sliding_wp_list = []
            sliding_re_list = []

            for i in range(age):
                # cumulative watch percentage and relative engagement
                cum_views = np.sum(daily_view[days <= i])
                cum_watches = np.sum(daily_watch[days <= i])
                if cum_views >= min_view:
                    cum_wp = cum_watches * 60 / cum_views / duration
                    if cum_wp > 1:
                        cum_wp = 1
                    cum_re = to_relative_engagement(engagement_map_series[i], duration, cum_wp, lookup_keys=split_key_series[i])
                    cum_day_list.append(i)
                    cum_wp_list.append(cum_wp)
                    cum_re_list.append(cum_re)

                # sliding window watch percentage and relative engagement
                if i < window_size:
                    sliding_views = np.sum(daily_view[days <= i])
                    sliding_watches = np.sum(daily_watch[days <= i])
                else:
                    sliding_views = np.sum(daily_view[(i - window_size < days) & (days <= i)])
                    sliding_watches = np.sum(daily_watch[(i - window_size < days) & (days <= i)])
                if sliding_views >= min_view:
                    sliding_wp = sliding_watches * 60 / sliding_views / duration
                    if sliding_wp > 1:
                        sliding_wp = 1
                    sliding_re = to_relative_engagement(engagement_map_series[i], duration, sliding_wp, lookup_keys=split_key_series[i])
                    sliding_day_list.append(i)
                    sliding_wp_list.append(sliding_wp)
                    sliding_re_list.append(sliding_re)

            # write to output files
            if len(cum_day_list) > 0:
                cum_output.write('{0}\t{1}\t{2}\t{3}\n'.format(vid, strify(cum_day_list, delimiter=','),
                                                               strify(cum_wp_list, delimiter=','),
                                                               strify(cum_re_list, delimiter=',')))
            if len(sliding_day_list) > 0:
                sliding_output.write('{0}\t{1}\t{2}\t{3}\n'.format(vid, strify(sliding_day_list, delimiter=','),
                                                                   strify(sliding_wp_list, delimiter=','),
                                                                   strify(sliding_re_list, delimiter=',')))
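# ---------------------------------------------------------------------------------------
# to_relative_engagement() is used throughout this section but its definition is not shown.
# The function below is a minimal sketch (not the repo's actual implementation), assuming
# the engagement map is a dict that maps each duration-bin split point to the sorted
# watch-percentage scores observed in that bin, and that lookup_keys is the sorted array of
# those split points. Relative engagement is then the percentile rank of wp_score among
# videos of comparable duration.
# ---------------------------------------------------------------------------------------
import bisect


def to_relative_engagement_sketch(engagement_map, duration, wp_score, lookup_keys=None):
    """Return the rank (in [0, 1]) of wp_score within its duration bin."""
    if lookup_keys is None:
        lookup_keys = sorted(engagement_map.keys())
    # find the duration bin this video falls into; clamp to the last bin for long videos
    bin_idx = min(bisect.bisect_left(lookup_keys, duration), len(lookup_keys) - 1)
    bin_scores = engagement_map[lookup_keys[bin_idx]]  # sorted watch percentages in this bin
    # fraction of videos in the bin whose watch percentage does not exceed wp_score
    return bisect.bisect_right(bin_scores, wp_score) / len(bin_scores)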
def _load_data(filepath):
    with open(filepath, 'r') as fin:
        fin.readline()
        for line in fin:
            _, _, duration, dump = line.split('\t', 3)
            _, days, views, watches = dump.rstrip().rsplit('\t', 3)
            duration = int(duration)
            days = read_as_int_array(days, delimiter=',', truncated=age)
            daily_view = read_as_int_array(views, delimiter=',', truncated=age)
            daily_watch = read_as_float_array(watches, delimiter=',', truncated=age)

            if np.sum(daily_view[days < 30]) == 0:
                continue

            for idx, t in enumerate([30, 60, 90, 120]):
                wp_t = np.sum(daily_watch[days < t]) * 60 / np.sum(daily_view[days < t]) / duration
                relative_engagement_quad[idx].append(to_relative_engagement(engagement_map, duration, wp_t, lookup_keys=lookup_durations))
def extract_engagement_data_from_file(filepath, age, handles, threshold=100):
    with open(filepath, 'r') as fin:
        fin.readline()
        for line in fin:
            vid, _, duration, _, category, _, _, _, _, _, _, days, daily_view, daily_watch = line.rstrip().split('\t')
            duration = int(duration)
            days = read_as_int_array(days, delimiter=',', truncated=age)
            daily_view = read_as_int_array(daily_view, delimiter=',', truncated=age)
            daily_watch = read_as_float_array(daily_watch, delimiter=',', truncated=age)

            for i in range(age):
                views = np.sum(daily_view[days <= i])
                watches = np.sum(daily_watch[days <= i])
                if views >= threshold:
                    watch_time = watches * 60 / views
                    if watch_time > duration:
                        watch_time = duration
                    watch_percentage = watch_time / duration
                    handles[i].write('{0},{1},{2},{3},{4}\n'.format(vid, category, duration, watch_percentage, watch_time))
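# ---------------------------------------------------------------------------------------
# read_as_int_array(), read_as_float_array() and strify() are repo helpers that are not
# shown in this section. The reconstructions below are sketches, assuming they simply
# convert between delimiter-separated strings and numpy arrays, where `truncated` keeps
# only the head elements (consistent with the docstrings further down).
# ---------------------------------------------------------------------------------------
import numpy as np


def read_as_int_array(text, delimiter=',', truncated=None):
    """Parse a delimited string into an int array, keeping at most `truncated` head elements."""
    values = np.array([int(x) for x in text.split(delimiter)], dtype=np.int64)
    return values if truncated is None else values[:truncated]


def read_as_float_array(text, delimiter=',', truncated=None):
    """Parse a delimited string into a float array, keeping at most `truncated` head elements."""
    values = np.array([float(x) for x in text.split(delimiter)], dtype=np.float64)
    return values if truncated is None else values[:truncated]


def strify(iterable, delimiter=','):
    """Join an iterable of values back into a single delimited string."""
    return delimiter.join(map(str, iterable))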
                      863, 897, 1039, 1348, 1659, 1259, 1559]
        fig_cnt += 1
    elif vid == 'hxUh6dS5Q_Q':
        fig_idx = 1
        daily_view = [20634, 15162, 6925, 5132, 4348, 3625, 3437, 5255, 6226, 6021,
                      10104, 7183, 11172, 10655, 15246, 15990, 17911, 14262, 12128, 11120,
                      7191, 5882, 3867, 5271, 2352, 2004, 2677, 2817, 3266, 2968]
        fig_cnt += 1
    else:
        continue

    days = read_as_int_array(days, delimiter=',')
    re_list = read_as_float_array(re_list, delimiter=',')

    # power-law fitting
    a, b, c = fit_with_powerlaw(days, re_list)
    print(vid, r'model: {0:.3f}t^{1:.3f}+{2:.3f}'.format(a, b, c))

    # == == == == == == == == Part 2: Plot fitting result == == == == == == == == #
    to_plot = True
    if to_plot:
        ax1 = fig.add_subplot(1, 2, 1 + fig_idx)
        ax2 = ax1.twinx()
        ax1.plot(ts, daily_view, 'b-')
        ax2.plot(ts, re_list, 'k--')
        ax2.plot(ts, [func_powerlaw(x, a, b, c) for x in ts], 'r-')
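# ---------------------------------------------------------------------------------------
# fit_with_powerlaw() and func_powerlaw() are defined elsewhere in the repo. Given the
# printed model string 'a t^b + c', a minimal sketch using scipy's curve_fit could look
# like the following; the p0 starting point and the strictly-positive-x assumption are
# choices made here, not values taken from the original code.
# ---------------------------------------------------------------------------------------
import numpy as np
from scipy.optimize import curve_fit


def func_powerlaw(x, a, b, c):
    """Power-law model a * x^b + c used for the relative engagement series."""
    return a * np.power(x, b) + c


def fit_with_powerlaw(xdata, ydata):
    """Fit (xdata, ydata) with func_powerlaw and return the fitted (a, b, c).

    Assumes xdata is strictly positive, since x**b diverges at 0 when b < 0.
    """
    params, _ = curve_fit(func_powerlaw,
                          np.asarray(xdata, dtype=float),
                          np.asarray(ydata, dtype=float),
                          p0=[1.0, -0.5, 0.5], maxfev=10000)
    return tuple(params)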
def extract_info(input_path, output_file, truncated=None):
    """ Extract essential information from each video.

    :param input_path: input file path
    :param output_file: output file handler
    :param truncated: head number of elements extracted
    :return:
    """
    with open(input_path, 'r') as fin:
        for line in fin:
            # skip if data is corrupted or reading duration fails
            try:
                video = json.loads(line.rstrip())
                duration = isodate.parse_duration(video['contentDetails']['duration']).seconds
            except:
                continue

            # skip if there is no insights data or no watch data
            if 'insights' not in video or video['insights']['avgWatch'] == 'N' or duration == 0:
                continue

            published_at = video['snippet']['publishedAt'][:10]
            start_date = video['insights']['startDate']
            time_diff = (datetime(*map(int, start_date.split('-'))) - datetime(*map(int, published_at.split('-')))).days
            days = read_as_int_array(video['insights']['days'], delimiter=',', truncated=truncated) + time_diff
            days = days[days < truncated]
            daily_view = read_as_int_array(video['insights']['dailyView'], delimiter=',', truncated=len(days))
            view30 = np.sum(daily_view[days < 30])

            # pre-filtering: must have at least 100 views in the first 30 days
            if view30 < 100:
                continue

            daily_watch = read_as_float_array(video['insights']['dailyWatch'], delimiter=',', truncated=len(days))
            watch30 = np.sum(daily_watch[days < 30])
            wp30 = watch30 * 60 / view30 / duration
            # cap watch percentage at 1
            if wp30 > 1:
                wp30 = 1
            re30 = to_relative_engagement(engagement_map, duration, wp30, lookup_keys=lookup_durations)

            # topic information
            if 'topicDetails' in video:
                if 'topicIds' in video['topicDetails']:
                    topic_ids = set(video['topicDetails']['topicIds'])
                else:
                    topic_ids = set()
                if 'relevantTopicIds' in video['topicDetails']:
                    relevant_topic_ids = set(video['topicDetails']['relevantTopicIds'])
                else:
                    relevant_topic_ids = set()
                topics_set = topic_ids.union(relevant_topic_ids)
                topics = strify(topics_set)
            else:
                topics = 'NA'

            # detect description language
            description = video['snippet']['description']
            try:
                detect_lang = detect(description)
            except:
                detect_lang = 'NA'

            vid = video['id']
            definition = [0, 1][video['contentDetails']['definition'] == 'hd']
            category = video['snippet']['categoryId']
            channel = video['snippet']['channelId']

            output_file.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\t{14}\n'
                              .format(vid, published_at, duration, definition, category, detect_lang, channel, topics,
                                      view30, watch30, wp30, re30, strify(days), strify(daily_view), strify(daily_watch)))
for subdir, _, files in os.walk(data_loc):
    for f in files:
        with open(os.path.join(subdir, f), 'r') as fin:
            fin.readline()
            for line in fin:
                dump, days, daily_view, daily_watch = line.rstrip().rsplit('\t', 3)
                vid, _, duration, _ = dump.split('\t', 3)
                if vid == 'RzvS7OmShAE':
                    fig_idx = 0
                elif vid == 'rKdNjlNYMKk':
                    fig_idx = 1
                else:
                    continue

                duration = int(duration)
                days = read_as_int_array(days, delimiter=',', truncated=age)
                daily_view = read_as_int_array(daily_view, delimiter=',', truncated=age)
                daily_watch = read_as_float_array(daily_watch, delimiter=',', truncated=age)

                # a moving-window solution, using the past 7 days to calculate wp
                cumulative_wp = []
                for i in range(days[-1] + 1):
                    if i < window_size:
                        past_window_views = np.sum(daily_view[days <= i])
                        past_window_watches = np.sum(daily_watch[days <= i])
                    else:
                        past_window_views = np.sum(daily_view[(i - window_size < days) & (days <= i)])
                        past_window_watches = np.sum(daily_watch[(i - window_size < days) & (days <= i)])
                    if past_window_views < min_window_view:
                        break
                    cumulative_wp.append(past_window_watches * 60 / past_window_views / duration)
def extract_info(input_path, output_path, truncated=None):
    """ Extract essential information from each video.

    :param input_path: input file path
    :param output_path: output file path
    :param truncated: head number of extracted elements in attention dynamics
    :return:
    """
    fout = open(output_path, 'w')
    fout.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\n'
               .format('id', 'publish', 'duration', 'definition', 'category', 'detect_lang', 'channel', 'topics',
                       'view30', 'watch30', 'wp30', 'days', 'daily_view', 'daily_watch'))
    with open(input_path, 'r') as fin:
        for line in fin:
            # skip if the JSON record is corrupted
            try:
                video = json.loads(line.rstrip())
            except:
                continue

            vid = video['id']
            published_at = video['snippet']['publishedAt'][:10]
            duration = isodate.parse_duration(video['contentDetails']['duration']).seconds
            definition = [0, 1][video['contentDetails']['definition'] == 'hd']
            category = video['snippet']['categoryId']
            detect_lang = video['snippet']['detectLang']
            channel = video['snippet']['channelId']

            # freebase topic information
            if 'topicDetails' in video:
                if 'topicIds' in video['topicDetails']:
                    topic_ids = set(video['topicDetails']['topicIds'])
                else:
                    topic_ids = set()
                if 'relevantTopicIds' in video['topicDetails']:
                    relevant_topic_ids = set(video['topicDetails']['relevantTopicIds'])
                else:
                    relevant_topic_ids = set()
                topics_set = topic_ids.union(relevant_topic_ids)
                topics = strify(topics_set)
            else:
                topics = 'NA'

            # attention dynamics information
            start_date = video['insights']['startDate']
            time_diff = (datetime(*map(int, start_date.split('-'))) - datetime(*map(int, published_at.split('-')))).days
            days = read_as_int_array(video['insights']['days'], delimiter=',', truncated=truncated) + time_diff
            days = days[days < truncated]
            daily_view = read_as_int_array(video['insights']['dailyView'], delimiter=',', truncated=len(days))
            view30 = np.sum(daily_view[days < 30])
            daily_watch = read_as_float_array(video['insights']['dailyWatch'], delimiter=',', truncated=len(days))
            watch30 = np.sum(daily_watch[days < 30])

            # the data has been cleaned beforehand, so views in the first 30 days exceed 100;
            # otherwise zero views or the occasional zero-duration (streamed) video would need handling here
            wp30 = watch30 * 60 / view30 / duration
            # cap watch percentage at 1
            if wp30 > 1:
                wp30 = 1

            fout.write('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t{12}\t{13}\n'
                       .format(vid, published_at, duration, definition, category, detect_lang, channel, topics,
                               view30, watch30, wp30, strify(days), strify(daily_view), strify(daily_watch)))
    fout.close()
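# ---------------------------------------------------------------------------------------
# A minimal usage sketch (added for illustration): the input/output paths and the 120-day
# truncation horizon below are assumptions, not values taken from the original script.
# ---------------------------------------------------------------------------------------
if __name__ == '__main__':
    # read raw JSON metadata dumps and write one tab-separated record per video
    extract_info('data/raw_video_metadata.json', 'data/formatted_videos.tsv', truncated=120)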