def main():
    """Plot model prediction results (figure model_prediction_results.pdf):
    (a) SMAPE box plot across five forecasting models,
    (b) mean SMAPE as the forecast horizon extends,
    (c) estimated link strength vs. source-to-target view ratio.
    Reads per-video results from ./forecast_tracker_all.json (JSON lines).
    """
    timer = Timer()
    timer.start()

    # plotting constants shared across the three panels
    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    color_cycle_4 = ColorPalette.CC4
    label_fs = ColorPalette.LABELFS
    title_fs = ColorPalette.TITLEFS
    tick_style = ColorPalette.TICKSTYLE
    bar_text_style = ColorPalette.BARTEXTSTYLE

    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    # mean daily views over the training period (all but the last NUM_OUTPUT days)
    embed_avg_train_view_dict = {
        embed: np.mean(embed_view_dict[embed][:-NUM_OUTPUT])
        for embed in embed_view_dict.keys()
    }

    net_ratio_list = []
    src_to_tar_view_ratio = []
    link_weights_record = []
    naive_smape_list, snaive_smape_list, ar_smape_list, rnn_smape_list, arnet_smape_list = [
        [] for _ in range(5)
    ]
    # FIX: np.float was removed in NumPy 1.24 -- use the builtin float
    naive_daily_smape_mat, snaive_daily_smape_mat, ar_daily_smape_mat, rnn_daily_smape_mat, arnet_daily_smape_mat = [
        np.empty((0, NUM_OUTPUT), float) for _ in range(5)
    ]

    with open('./forecast_tracker_all.json', 'r') as fin:
        for line in fin:
            result_json = json.loads(line.rstrip())
            tar_embed = result_json['embed']
            true_value = result_json['true_value']
            naive_pred = result_json['naive_pred']
            snaive_pred = result_json['snaive_pred']
            ar_pred = result_json['ar_pred']
            rnn_pred = result_json['rnn_pred']
            arnet_pred = result_json['arnet_pred']

            # smape() returns (overall SMAPE, per-day SMAPE array)
            naive_smape, naive_daily_smape_arr = smape(true_value, naive_pred)
            naive_smape_list.append(naive_smape)
            naive_daily_smape_mat = np.vstack(
                (naive_daily_smape_mat, naive_daily_smape_arr))

            snaive_smape, snaive_daily_smape_arr = smape(
                true_value, snaive_pred)
            snaive_smape_list.append(snaive_smape)
            snaive_daily_smape_mat = np.vstack(
                (snaive_daily_smape_mat, snaive_daily_smape_arr))

            ar_smape, ar_daily_smape_arr = smape(true_value, ar_pred)
            ar_smape_list.append(ar_smape)
            ar_daily_smape_mat = np.vstack(
                (ar_daily_smape_mat, ar_daily_smape_arr))

            rnn_smape, rnn_daily_smape_arr = smape(true_value, rnn_pred)
            rnn_smape_list.append(rnn_smape)
            rnn_daily_smape_mat = np.vstack(
                (rnn_daily_smape_mat, rnn_daily_smape_arr))

            arnet_smape, arnet_daily_smape_arr = smape(true_value, arnet_pred)
            arnet_smape_list.append(arnet_smape)
            arnet_daily_smape_mat = np.vstack(
                (arnet_daily_smape_mat, arnet_daily_smape_arr))

            # analyse network contribution
            arnet_net_ratio = result_json['net_ratio']
            net_ratio_list.append(arnet_net_ratio)
            incoming_embeds = result_json['incoming_embeds']
            link_weights = result_json['link_weights']
            for edge_inx, src_embed in enumerate(incoming_embeds):
                # log10 ratio of source vs. target average training views
                view_ratio = np.log10(embed_avg_train_view_dict[src_embed] /
                                      embed_avg_train_view_dict[tar_embed])
                src_to_tar_view_ratio.append(view_ratio)
                link_weights_record.append(link_weights[edge_inx])

    fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(12, 4))
    axes = axes.ravel()

    # == == == == == == Part 1: Plot performance comparison == == == == == == #
    smape_mat = [
        naive_smape_list, snaive_smape_list, ar_smape_list, rnn_smape_list,
        arnet_smape_list
    ]
    axes[0].boxplot(smape_mat, showfliers=False, meanline=True,
                    showmeans=True, widths=0.7)
    means = [np.mean(x) for x in smape_mat]
    pos = range(len(means))
    # NOTE(review): iterating axes[1] ticklabels here looks like a typo for
    # axes[0]; `label` is unused and zip caps the loop at len(pos), so the
    # result is the same in practice -- confirm intent.
    for tick, label in zip(pos, axes[1].get_xticklabels()):
        axes[0].text(pos[tick] + 1, means[tick] + 0.3,
                     '{0:.3f}'.format(means[tick]), **bar_text_style)
    axes[0].set_xticklabels(['Naive', 'SN', 'AR', 'RNN', 'ARNet'],
                            fontsize=label_fs)
    axes[0].set_ylabel('SMAPE', fontsize=label_fs)
    axes[0].tick_params(**tick_style)
    axes[0].set_title('(a)', fontsize=title_fs)

    # == == == == == == Part 2: Plot performance with forecast horizon extends == == == == == == #
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(naive_daily_smape_mat, axis=0),
                 label='Naive', c='k', mfc='none', marker='D', markersize=4)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(snaive_daily_smape_mat, axis=0),
                 label='SN', c=color_cycle_4[0], mfc='none', marker='*',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(ar_daily_smape_mat, axis=0),
                 label='AR', c=color_cycle_4[1], mfc='none', marker='s',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(rnn_daily_smape_mat, axis=0),
                 label='RNN', c=color_cycle_4[2], mfc='none', marker='^',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(arnet_daily_smape_mat, axis=0),
                 label='ARNet', c=color_cycle_4[3], marker='o', markersize=5)
    axes[1].set_xlabel('forecast horizon', fontsize=label_fs)
    axes[1].set_ylabel('SMAPE', fontsize=label_fs)
    axes[1].set_ylim([6, 23])
    axes[1].tick_params(**tick_style)
    axes[1].legend(frameon=False)
    axes[1].set_title('(b)', fontsize=title_fs)

    # == == == == == == Part 3: Plot link strength vs. view ratio from src to tar == == == == == == #
    # bucket log10 view ratios into 0.1-wide bins over [-2, 1.9)
    bin_axis = np.arange(-2, 1.9, 0.1)
    bin_records = [[] for _ in range(len(bin_axis))]
    for x, y in zip(src_to_tar_view_ratio, link_weights_record):
        # NOTE(review): no upper-bound guard -- a ratio >= 1.9 would index
        # past the last bin; presumably the data never exceeds it. Confirm.
        if x >= -2:
            bin_records[int(np.floor((x + 2) * 10))].append(y)

    # shaded percentile fan around the median, fading with distance
    for t in np.arange(5, 50, 5):
        axes[2].fill_between(bin_axis,
                             [np.percentile(x, 50 - t) for x in bin_records],
                             [np.percentile(x, 55 - t) for x in bin_records],
                             facecolor=cornflower_blue,
                             alpha=(100 - 2 * t) / 100, lw=0)
        axes[2].fill_between(bin_axis,
                             [np.percentile(x, 45 + t) for x in bin_records],
                             [np.percentile(x, 50 + t) for x in bin_records],
                             facecolor=cornflower_blue,
                             alpha=(100 - 2 * t) / 100, lw=0)
    for t in [10, 30, 70, 90]:
        axes[2].plot(bin_axis, [np.percentile(x, t) for x in bin_records],
                     color=cornflower_blue, alpha=(100 - 2 * t) / 100, lw=1,
                     zorder=15)
    median_line = [np.percentile(x, 50) for x in bin_records]
    axes[2].plot(bin_axis, median_line, color='k', alpha=0.5, zorder=20,
                 lw=1.5)
    axes[2].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: r'$10^{{{0:.0f}}}%$'.format(x)))

    # annotate the two local peaks of the median line
    peak1_idx = int(np.argmax(median_line))
    peak2_idx = 10 + int(np.argmax(median_line[10:]))
    peak1 = (bin_axis[peak1_idx], median_line[peak1_idx])
    peak2 = (bin_axis[peak2_idx], median_line[peak2_idx])
    axes[2].scatter(peak1[0], peak1[1], s=15, c=tomato, edgecolors='k',
                    zorder=30)
    axes[2].text(peak1[0] + 0.08, peak1[1] + 0.01,
                 '({0:.2f}, {1:.2f})'.format(10**peak1[0], peak1[1]),
                 ha='left', va='center')
    axes[2].scatter(peak2[0], peak2[1], s=15, c=tomato, edgecolors='k',
                    zorder=30)
    axes[2].text(peak2[0], peak2[1] + 0.02,
                 '({0:.2f}, {1:.2f})'.format(10**peak2[0], peak2[1]),
                 ha='center', va='bottom')
    axes[2].set_xlim((-2.05, 2.02))
    axes[2].set_ylim((-0.02, 1.01))
    axes[2].set_xlabel('views ratio from video ' + r'$u$' + ' to video ' +
                       r'$v$', fontsize=label_fs)
    axes[2].set_ylabel('estimated link strength ' + r'$\beta_{u, v}$',
                       fontsize=label_fs)
    axes[2].set_title('(c)', fontsize=title_fs)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/model_prediction_results.pdf', bbox_inches='tight')
    plt.show()
def main():
    """Measure how videos connect across view-percentile partitions and
    render the aggregated 4x4 edge-weight graph with network2tikz.
    Writes ../images/measure_how_videos_connect.pdf.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    # == == == == == == Part 3: Build views percentile partition == == == == == == #
    day_views = list(embed_avg_view_dict.values())
    median_value = np.median(day_views)
    # the top 1st quantile is 75th percentile and above
    first_quantile_value = np.percentile(day_views, 75)
    third_quantile_value = np.percentile(day_views, 25)
    # map each video to its quartile group: 0 = most viewed ... 3 = least
    embed_percentile_dict = {}
    for embed in np.arange(num_videos):
        if embed_avg_view_dict[embed] >= first_quantile_value:
            embed_percentile_dict[embed] = 0
        elif embed_avg_view_dict[embed] >= median_value:
            embed_percentile_dict[embed] = 1
        elif embed_avg_view_dict[embed] >= third_quantile_value:
            embed_percentile_dict[embed] = 2
        else:
            embed_percentile_dict[embed] = 3

    # == == == == == == Part 4: Load dynamic network snapshot == == == == == == #
    # average daily edge count between quartile groups (src row, tar column)
    edge_weight_mat = np.zeros((4, 4), dtype=np.float32)
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for embed_tar in range(num_videos):
                for embed_src, pos_src, _ in network_dict[embed_tar]:
                    if pos_src < NUM_REL:
                        # += 1/T accumulates a per-day average over T days
                        edge_weight_mat[(
                            embed_percentile_dict[embed_src],
                            embed_percentile_dict[embed_tar])] += 1 / T
        print('>>> Finish loading day {0}...'.format(t + 1))
    # FIX: np.int was removed in NumPy 1.24 -- use the builtin int
    edge_weight_mat = edge_weight_mat.astype(int)
    print('>>> Network structure has been loaded!')

    # == == == == == == Part 5: Plot graph by network2tikz == == == == == == #
    # Network
    # -------
    # every possible pair, including self loop
    network_structure = []
    num_partitions = 4
    for pair in itertools.product(np.arange(num_partitions), repeat=2):
        network_structure.append(pair)
    net = igraph.Graph(network_structure, directed=True)

    # Network attributes
    # ------------------

    # Network dicts
    # -------------
    layout = {0: (0, 0), 1: (1, 0), 2: (2, 0), 3: (3, 0)}

    # Visual style dict
    # -----------------
    visual_style = {}

    # node styles
    # -----------
    visual_style['vertex_size'] = 0.9
    visual_style['vertex_color'] = ColorPalette.CCRGB4
    visual_style['vertex_opacity'] = 0.6
    # FIX: raw strings avoid the invalid "\%" escape SyntaxWarning
    # (the rendered LaTeX labels are unchanged)
    visual_style['vertex_label'] = [
        r'top 25\%', r'(25\% 50\%', r'(50\% 75\%', r'bottom 25\%'
    ]
    visual_style['vertex_label_distance'] = 0
    visual_style['vertex_label_size'] = [5, 4, 4, 4]

    # edge styles
    # -----------
    edge_width = list(np.ravel(edge_weight_mat))
    visual_style['edge_width'] = scaler(edge_width)
    visual_style['edge_curved'] = 0.7
    edge_label = ['{{{:,}}}'.format(x) for x in edge_width]
    visual_style['edge_label'] = edge_label
    visual_style['edge_label_size'] = 4.5
    visual_style['edge_loop_shape'] = 60
    visual_style['edge_loop_size'] = 1
    visual_style['edge_loop_position'] = [180, 0, 0, 0]
    visual_style['edge_arrow_size'] = 0.01
    visual_style['edge_arrow_width'] = [
        0.03, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01,
        0.01, 0.02, 0.01, 0.01, 0.01
    ]

    # general options
    # ---------------
    visual_style['layout'] = layout
    visual_style['canvas'] = (10, 3.5)
    visual_style['margin'] = 1.5

    # Create pdf figure of the network
    plot(net, '../images/measure_how_videos_connect.pdf', **visual_style)
    print('>>> Generated pdf file ../images/measure_how_videos_connect.pdf')

    timer.stop()
def main():
    """Analyse ARNet prediction results (figure model_prediction_analysis.pdf):
    (a) SMAPE vs. traffic composition (same artist / same genre),
    (b) artist popularity percentile change with vs. without the network,
    (c) network contribution ratio of outlier artists by genre tag.
    Reads ./embed_prediction.json and ../data/artist_details.json (JSON lines).
    """
    timer = Timer()
    timer.start()

    # plotting constants
    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    color_cycle_4 = ColorPalette.CC4
    label_fs = ColorPalette.LABELFS
    title_fs = ColorPalette.TITLEFS
    tick_style = ColorPalette.TICKSTYLE

    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    # mean daily views over the training period (all but the last NUM_OUTPUT days)
    embed_avg_train_view_dict = {
        embed: np.mean(embed_view_dict[embed][:-NUM_OUTPUT])
        for embed in embed_view_dict.keys()
    }
    data_loader.load_embed_content_dict()
    embed_cid_dict = data_loader.embed_cid_dict
    embed_genre_dict = data_loader.embed_genre_dict

    # channel_id -> artist name / tag dict
    cid_artist_dict = {}
    cid_tag_dict = {}
    with open('../data/artist_details.json', 'r') as fin:
        for line in fin:
            artist_json = json.loads(line.rstrip())
            cid_artist_dict[
                artist_json['channel_id']] = artist_json['artist_name']
            cid_tag_dict[artist_json['channel_id']] = artist_json['tag-dict']

    cid_views_dict = defaultdict(int)
    cid_views_wo_network_dict = defaultdict(int)

    arnet_smape_list = []
    net_ratio_list = []
    same_artist_net_ratio_list = []
    same_genre_net_ratio_list = []
    total_views = 0
    network_explained_views = 0
    with open('./embed_prediction.json', 'r') as fin:
        for line in fin:
            result_json = json.loads(line.rstrip())
            tar_embed = result_json['embed']
            avg_train_views = embed_avg_train_view_dict[tar_embed]
            true_value = result_json['true_value']
            arnet_pred = result_json['arnet_pred']
            arnet_smape_list.append(smape(true_value, arnet_pred)[0])

            incoming_embeds = result_json['incoming_embeds']
            link_weights = result_json['link_weights']
            # views contributed through links from the same artist/genre
            same_artist_contributed_views = 0
            same_genre_contributed_views = 0
            for edge_inx, src_embed in enumerate(incoming_embeds):
                if embed_cid_dict[tar_embed] == embed_cid_dict[src_embed]:
                    same_artist_contributed_views += link_weights[
                        edge_inx] * embed_avg_train_view_dict[src_embed]
                if is_same_genre(embed_genre_dict[tar_embed],
                                 embed_genre_dict[src_embed]):
                    same_genre_contributed_views += link_weights[
                        edge_inx] * embed_avg_train_view_dict[src_embed]

            # analyse network contribution
            arnet_net_ratio = result_json['net_ratio']
            net_ratio_list.append(arnet_net_ratio)
            # rounding issue can make the value slightly larger than 1
            same_artist_net_ratio_list.append(
                min(same_artist_contributed_views / avg_train_views, 1))
            same_genre_net_ratio_list.append(
                min(same_genre_contributed_views / avg_train_views, 1))

            cid_views_dict[embed_cid_dict[tar_embed]] += avg_train_views
            cid_views_wo_network_dict[embed_cid_dict[
                tar_embed]] += avg_train_views * (1 - arnet_net_ratio)
            total_views += avg_train_views
            network_explained_views += avg_train_views * arnet_net_ratio

    print(
        '\nFor an average video in our dataset, we estimate {0:.1f}% of the views come from the network.'
        .format(100 * np.mean(net_ratio_list)))
    print(
        'In particular, {0:.1f}% ({1:.1f}%) of the views come from the same artist.'
        .format(
            100 * np.mean(same_artist_net_ratio_list),
            100 * np.mean(same_artist_net_ratio_list) /
            np.mean(net_ratio_list)))
    print(
        'In total, our model estimates that the recommendation network contributes {0:.1f}% of popularity in the Vevo network.'
        .format(100 * network_explained_views / total_views))
    print('total views for 13K: {0:.1f}M'.format(total_views / 1000000))
    print('explained views for 13K: {0:.1f}M'.format(network_explained_views /
                                                     1000000))
    print('total views for 60K: {0:.1f}M'.format(
        np.sum(list(embed_avg_train_view_dict.values())) / 1000000))
    print('Gini coef with network: {0:.4f}'.format(
        gini(list(cid_views_dict.values()))))
    print('Gini coef without network: {0:.4f}\n'.format(
        gini(list(cid_views_wo_network_dict.values()))))

    # layout: two stacked small axes on the left, two full-height panels
    fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(12, 4.2))
    gs = axes[0, 0].get_gridspec()
    for ax in axes[:, 1]:
        ax.remove()
    ax_mid = fig.add_subplot(gs[:, 1])
    for ax in axes[:, 2]:
        ax.remove()
    ax_right = fig.add_subplot(gs[:, 2])
    axes = [axes[0, 0], axes[1, 0], ax_mid, ax_right]

    # == == == == == == Part 1: Plot SMAPE vs. traffic composition == == == == == == #
    num_bin = 10
    sorted_same_artist_tuple_list = sorted(
        [(x, y) for x, y in zip(same_artist_net_ratio_list, arnet_smape_list)],
        key=lambda x: x[0])
    same_artist_split_values = [
        np.percentile(same_artist_net_ratio_list, x)
        for x in np.arange(10, 101, 10)
    ]
    # bucket SMAPE values by the decile of the same-artist net ratio
    same_artist_bins = [[] for _ in range(num_bin)]
    for same_artist_net_ratio, arnet_smape in sorted_same_artist_tuple_list:
        slice_idx = int(
            np.floor(
                percentileofscore(same_artist_net_ratio_list,
                                  same_artist_net_ratio) / 10))
        if slice_idx >= num_bin:
            slice_idx = num_bin - 1
        same_artist_bins[slice_idx].append(arnet_smape)

    sorted_same_genre_tuple_list = sorted(
        [(x, y) for x, y in zip(same_genre_net_ratio_list, arnet_smape_list)],
        key=lambda x: x[0])
    same_genre_split_values = [
        np.percentile(same_genre_net_ratio_list, x)
        for x in np.arange(10, 101, 10)
    ]
    same_genre_bins = [[] for _ in range(num_bin)]
    for same_genre_net_ratio, arnet_smape in sorted_same_genre_tuple_list:
        slice_idx = int(
            np.floor(
                percentileofscore(same_genre_net_ratio_list,
                                  same_genre_net_ratio) / 10))
        if slice_idx >= num_bin:
            slice_idx = num_bin - 1
        same_genre_bins[slice_idx].append(arnet_smape)

    axes[0].plot(range(1, 11, 1), [np.mean(x) for x in same_artist_bins],
                 color=cornflower_blue, label='same artist', mfc='none',
                 marker='o', markersize=4)
    axes[1].plot(range(1, 11, 1), [np.mean(x) for x in same_genre_bins],
                 color=tomato, label='same genre', mfc='none', marker='o',
                 markersize=4)
    for ax in [axes[0], axes[1]]:
        ax.set_xlim([0.5, 10.5])
        ax.set_ylim([7, 10.5])
        ax.set_ylabel('SMAPE', fontsize=label_fs)
        ax.xaxis.set_ticks(np.arange(1, 10, 2))
        ax.tick_params(**tick_style)
        ax.legend(frameon=False)
    axes[0].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '({0:.3f})'.format(same_artist_split_values[
            int(x) - 1])))
    axes[1].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%\n({1:.3f})'.format(
            10 * x, same_genre_split_values[int(x) - 1])))
    # axes[0].xaxis.set_major_formatter(
    #     FuncFormatter(lambda x, _: '({0:.3f})'.format(10 * x)))
    # axes[1].xaxis.set_major_formatter(
    #     FuncFormatter(lambda x, _: '{0:.0f}%\n({1:.3f})'.format(10 * x, 10 * x)))
    # FIX: raw string avoids the invalid "\e" escape SyntaxWarning
    axes[1].set_xlabel(r'$\eta_v$ percentile', fontsize=label_fs)
    axes[0].set_title('(a)', fontsize=title_fs)

    # == == == == == == Part 2: Plot who can utilize the network better? == == == == == == #
    artist_views_list = list(cid_views_dict.values())
    wo_network_artist_views_list = list(cid_views_wo_network_dict.values())
    cid_list = sorted(cid_views_dict.keys())
    artist_true_percentile = [
        percentileofscore(artist_views_list, cid_views_dict[cid])
        for cid in cid_list
    ]
    wo_network_artist_percentile = [
        percentileofscore(wo_network_artist_views_list,
                          cid_views_wo_network_dict[cid]) for cid in cid_list
    ]
    percentile_change = np.array([
        artist_true_percentile[i] - wo_network_artist_percentile[i]
        for i in range(len(cid_list))
    ])
    num_popularity_loss = sum(percentile_change < 0)
    num_popularity_equal = sum(percentile_change == 0)
    num_popularity_gain = sum(percentile_change > 0)
    print('{0} ({1:.2f}%) artists lose popularity with network'.format(
        num_popularity_loss, num_popularity_loss / len(cid_list) * 100))
    print('{0} ({1:.2f}%) artists with no popularity change'.format(
        num_popularity_equal, num_popularity_equal / len(cid_list) * 100))
    print('{0} ({1:.2f}%) artists gain popularity with network\n'.format(
        num_popularity_gain, num_popularity_gain / len(cid_list) * 100))

    # bucket artists by their without-network percentile decile
    artist_percentile_mat = [[] for _ in range(10)]
    artist_cid_mat = [[] for _ in range(10)]
    for idx, percentile_value in enumerate(wo_network_artist_percentile):
        bin_idx = min(int(np.floor(percentile_value / 10)), 9)
        artist_percentile_mat[bin_idx].append(artist_true_percentile[idx] -
                                              percentile_value)
        artist_cid_mat[bin_idx].append(cid_list[idx])

    red_circle = dict(markerfacecolor=tomato, marker='o', markersize=4)
    axes[2].boxplot(artist_percentile_mat, showfliers=True, widths=0.5,
                    flierprops=red_circle)
    axes[2].axhline(y=0, color=cornflower_blue, linestyle='--', lw=1,
                    zorder=0)
    axes[2].set_xlabel('artist popularity percentile without network',
                       fontsize=label_fs)
    axes[2].set_ylabel('percentile change with network', fontsize=label_fs)
    axes[2].tick_params(**tick_style)
    axes[2].set_xticks(axes[2].get_xticks()[::2])
    axes[2].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%'.format(10 * x)))
    axes[2].yaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%'.format(x)))
    axes[2].set_title('(b)', fontsize=12)

    # find outliers
    # same whisker rule as matplotlib boxplot: beyond q3/q1 -/+ 1.5*IQR
    whis = 1.5
    top_outliers_list = []
    bottom_outliers_list = []
    for box_idx, box in enumerate(artist_percentile_mat):
        q1 = np.percentile(box, 25)
        q3 = np.percentile(box, 75)
        iq = q3 - q1
        hi_val = q3 + whis * iq
        lo_val = q1 - whis * iq
        for idx, val in enumerate(box):
            if val > hi_val:
                top_outliers_list.append((artist_cid_mat[box_idx][idx], val))
            elif val < lo_val:
                bottom_outliers_list.append(
                    (artist_cid_mat[box_idx][idx], val))
    sorted_top_outliers_list = sorted(
        [(cid_artist_dict[x[0]], cid_tag_dict[x[0]], int(
            cid_views_dict[x[0]]), x[1]) for x in top_outliers_list],
        key=lambda t: t[2], reverse=True)
    for t in sorted_top_outliers_list:
        print(t)
    print('-------------------')
    sorted_bottom_outliers_list = sorted(
        [(cid_artist_dict[x[0]], cid_tag_dict[x[0]], int(
            cid_views_dict[x[0]]), x[1]) for x in bottom_outliers_list],
        key=lambda t: t[2], reverse=True)
    for t in sorted_bottom_outliers_list:
        print(t)

    # classify top outliers by genre tags for panel (c)
    indie_xaxis, indie_yaxis = [], []
    rap_xaxis, rap_yaxis = [], []
    other_xaxis, other_yaxis = [], []
    lose_xaxis, lose_yaxis = [], []
    for top_outlier, _ in top_outliers_list:
        if 'indie' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'alternative' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'new wave' in ','.join(cid_tag_dict[top_outlier].keys()):
            indie_xaxis.append(cid_views_dict[top_outlier])
            indie_yaxis.append((cid_views_dict[top_outlier] -
                                cid_views_wo_network_dict[top_outlier]) /
                               cid_views_dict[top_outlier])
        elif 'rap' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'hip hop' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'rhythm and blues' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'reggae' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'punk' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'funk' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'r&b' in ','.join(cid_tag_dict[top_outlier].keys()):
            rap_xaxis.append(cid_views_dict[top_outlier])
            rap_yaxis.append((cid_views_dict[top_outlier] -
                              cid_views_wo_network_dict[top_outlier]) /
                             cid_views_dict[top_outlier])
        else:
            other_xaxis.append(cid_views_dict[top_outlier])
            other_yaxis.append((cid_views_dict[top_outlier] -
                                cid_views_wo_network_dict[top_outlier]) /
                               cid_views_dict[top_outlier])
    for bottom_outlier, _ in bottom_outliers_list:
        lose_xaxis.append(cid_views_dict[bottom_outlier])
        lose_yaxis.append((cid_views_dict[bottom_outlier] -
                           cid_views_wo_network_dict[bottom_outlier]) /
                          cid_views_dict[bottom_outlier])

    axes[3].scatter(indie_xaxis, indie_yaxis, marker='^', facecolors='none',
                    edgecolors=color_cycle_4[0], s=20,
                    label='Indie: {0}'.format(len(indie_xaxis)))
    axes[3].scatter(rap_xaxis, rap_yaxis, marker='o', facecolors='none',
                    edgecolors=color_cycle_4[1], s=20,
                    label='Hip hop: {0}'.format(len(rap_xaxis)))
    axes[3].scatter(other_xaxis, other_yaxis, marker='s', facecolors='none',
                    edgecolors=color_cycle_4[2], s=20,
                    label='Other: {0}'.format(len(other_xaxis)))
    # axes[3].scatter(lose_xaxis, lose_yaxis, marker='x', color=color_cycle_4[3], s=20, label='artists lose popularity: {0}'.format(len(bad_xaxis)))
    axes[3].set_ylim((-0.02, 1.02))
    axes[3].set_xscale('log')
    axes[3].set_xlabel('artist average daily views', fontsize=label_fs)
    # FIX: raw string avoids the invalid "\e" escape SyntaxWarning
    axes[3].set_ylabel('network contribution ratio ' + r'$\eta_v$',
                       fontsize=label_fs)
    axes[3].tick_params(**tick_style)
    axes[3].legend(frameon=False, loc='lower left')
    axes[3].set_title('(c)', fontsize=title_fs)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(w_pad=0.2)
    plt.savefig('../images/model_prediction_analysis.pdf',
                bbox_inches='tight')
    plt.show()
def main():
    """Extract the daily bow-tie structure (SCC / IN / OUT / Tendrils /
    Disconnected) of the recommendation graph and log node and view shares
    for each component, one snapshot per day.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    num_videos = data_loader.num_videos
    total_views = []  # total daily views across all videos, one entry per day

    # == == == == == == Part 3: Load network snapshot as cutoff value changes == == == == == == #
    for t in range(T):
        total_views.append(
            sum([embed_view_dict[embed][t] for embed in range(num_videos)]))
        snapshot_date = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        snapshot_filename = 'network_{0}.p'.format(snapshot_date)
        nodes_set = set()
        num_edges = 0
        # adjacency list: embed_src -> [embed_tar, ...], links above CUTOFF dropped
        embedded_graph = defaultdict(list)
        with open(
                os.path.join(data_prefix, 'network_pickle',
                             snapshot_filename), 'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src)]
            for embed_tar in range(num_videos):
                for embed_src, pos_src, _ in network_dict[embed_tar]:
                    if pos_src < CUTOFF:
                        embedded_graph[embed_src].append(embed_tar)
                        nodes_set.add(embed_src)
                        nodes_set.add(embed_tar)
                        num_edges += 1
        logging.info('>>> Graph embedding @ date {0} has been loaded!'.format(
            snapshot_date))
        logging.info('>>> {0} nodes and {1} edges in the graph'.format(
            len(nodes_set), num_edges))
        logging.info(' {0} views throughout the graph'.format(
            total_views[t]))

        # == == == == == == Part 4: Extract bow-tie structure == == == == == == #
        # Tarjan's algorithm yields all strongly connected components
        scc_content = tarjan(embedded_graph)
        scc_content = sorted(scc_content, key=lambda x: len(x), reverse=True)
        # largest SCC
        largest_scc = scc_content.pop(0)
        logging.info('>>> {0} ({1:.2f}%) nodes in the largest SCC'.format(
            len(largest_scc), len(largest_scc) / num_videos * 100))
        largest_scc_views = sum(
            [embed_view_dict[embed][t] for embed in largest_scc])
        logging.info(' {0} ({1:.2f}%) views in the largest SCC'.format(
            largest_scc_views, largest_scc_views / total_views[t] * 100))

        # find IN, OUT, Tendrils, Disconnected
        # IN: SCCs that can reach the largest SCC
        in_component = []
        num_scc_in = 0
        to_visit_scc = []
        for scc in scc_content:
            ret = is_in_component(scc, embedded_graph, largest_scc)
            if ret:
                in_component.extend(scc)
                num_scc_in += 1
            else:
                to_visit_scc.append(scc)
        logging.info('>>> {0} ({1:.2f}%) nodes in the IN component'.format(
            len(in_component), len(in_component) / num_videos * 100))
        logging.info(' {0} scc in the IN component'.format(num_scc_in))
        in_views = sum([embed_view_dict[embed][t] for embed in in_component])
        logging.info(' {0} ({1:.2f}%) views in the IN component'.format(
            in_views, in_views / total_views[t] * 100))

        # OUT: remaining SCCs reachable from the largest SCC
        out_component = []
        num_scc_out = 0
        to_visit_scc2 = []
        for scc in to_visit_scc:
            ret = is_out_component(scc, embedded_graph, largest_scc)
            if ret:
                out_component.extend(scc)
                num_scc_out += 1
            else:
                to_visit_scc2.append(scc)
        logging.info('>>> {0} ({1:.2f}%) nodes in the OUT component'.format(
            len(out_component), len(out_component) / num_videos * 100))
        logging.info(' {0} scc in the OUT component'.format(num_scc_out))
        out_views = sum([embed_view_dict[embed][t] for embed in out_component])
        logging.info(' {0} ({1:.2f}%) views in the OUT component'.format(
            out_views, out_views / total_views[t] * 100))

        # Tendrils: hang off IN (reachable from it) or feed into OUT;
        # everything else left in an SCC is Disconnected
        tendrils = []
        num_scc_tendrils = 0
        disconnected = []
        # nodes with no surviving edges at all never entered nodes_set
        num_disconnected = num_videos - len(nodes_set)
        num_scc_disconnected = 0
        for scc in to_visit_scc2:
            ret = is_out_component(scc, embedded_graph, in_component)
            if ret:
                tendrils.extend(scc)
                num_scc_tendrils += 1
            else:
                ret = is_in_component(scc, embedded_graph, out_component)
                if ret:
                    tendrils.extend(scc)
                    num_scc_tendrils += 1
                else:
                    disconnected.extend(scc)
                    num_scc_disconnected += 1
        logging.info('>>> {0} ({1:.2f}%) nodes in the Tendrils'.format(
            len(tendrils), len(tendrils) / num_videos * 100))
        logging.info(' {0} scc in the Tendrils'.format(num_scc_tendrils))
        tendrils_views = sum([embed_view_dict[embed][t] for embed in tendrils])
        logging.info(' {0} ({1:.2f}%) views in the Tendrils'.format(
            tendrils_views, tendrils_views / total_views[t] * 100))

        logging.info('>>> {0} ({1:.2f}%) nodes in the Disconnected'.format(
            num_disconnected + len(disconnected),
            (num_disconnected + len(disconnected)) / num_videos * 100))
        logging.info(
            ' {0} scc in the Disconnected'.format(num_disconnected +
                                                  num_scc_disconnected))
        # disconnected views derived as the remainder of the other components
        disc_views = total_views[
            t] - largest_scc_views - in_views - out_views - tendrils_views
        logging.info(' {0} ({1:.2f}%) views in the Disconnected'.format(
            disc_views, disc_views / total_views[t] * 100))

        print('>>> Finish computing bowtie at day {0}...'.format(t + 1))

    timer.stop()
def main():
    """Build the persistent network: edges that appear consistently across
    the T daily snapshots, written to ../data/persistent_network.csv, with
    summary statistics on reciprocity, same-artist and same-genre edges.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos
    data_loader.load_embed_content_dict()
    embed_cid_dict = data_loader.embed_cid_dict
    embed_genre_dict = data_loader.embed_genre_dict

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    # one dict per day: embed -> [incoming src embeds within NUM_REL positions]
    network_dict_list = []
    for t in range(T):
        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        filename = 'network_{0}.p'.format(target_date_str)
        network_dict = pickle.load(
            open(os.path.join(data_prefix, 'network_pickle', filename),
                 'rb'))
        for embed in network_dict:
            network_dict[embed] = [
                x[0] for x in network_dict[embed] if x[1] < NUM_REL
            ]
        network_dict_list.append(network_dict)

    persistent_src_embed_set = set()
    persistent_tar_embed_set = set()
    existing_edges = set()  # edges written so far, keyed as 'src-tar'
    num_reciprocal_edges = 0
    num_same_artist = 0
    num_same_genre = 0
    with open(os.path.join(data_prefix, 'persistent_network.csv'),
              'w') as fout:
        fout.write('Source,Target\n')
        for tar_embed in range(num_videos):
            # all sources that ever link to this target during the T days
            src_union_set = set()
            for t in range(T):
                src_union_set.update(set(network_dict_list[t][tar_embed]))
            for src_embed in src_union_set:
                # daily presence indicator for the (src, tar) link
                linkage_list = [0] * T
                for t in range(T):
                    if src_embed in network_dict_list[t][tar_embed]:
                        linkage_list[t] = 1
                if is_persistent_link(linkage_list):
                    # filter: at least 100 daily views for target video,
                    # and the mean daily views of source video is at least 1% of the target video
                    src_mean = embed_avg_view_dict[src_embed]
                    tar_mean = embed_avg_view_dict[tar_embed]
                    if tar_mean >= 100 and src_mean >= 0.01 * tar_mean:
                        fout.write('{0},{1}\n'.format(src_embed, tar_embed))
                        persistent_src_embed_set.add(src_embed)
                        persistent_tar_embed_set.add(tar_embed)
                        # reversed key 'tar-src': the opposite-direction edge
                        # was already recorded, so this pair is reciprocal
                        if '{1}-{0}'.format(src_embed,
                                            tar_embed) in existing_edges:
                            num_reciprocal_edges += 1
                        if embed_cid_dict[src_embed] == embed_cid_dict[
                                tar_embed]:
                            num_same_artist += 1
                        if is_same_genre(embed_genre_dict[src_embed],
                                         embed_genre_dict[tar_embed]):
                            num_same_genre += 1
                        existing_edges.add('{0}-{1}'.format(
                            src_embed, tar_embed))

    print('{0} edges in the persistent network'.format(len(existing_edges)))
    print(
        '{0} source videos, {1} target videos, {2} videos appear in both set'.
        format(
            len(persistent_src_embed_set), len(persistent_tar_embed_set),
            len(persistent_src_embed_set.intersection(
                persistent_tar_embed_set))))
    print('{0} pairs of reciprocal edges'.format(num_reciprocal_edges))
    print('{0} ({1:.1f}%) edges belong to the same artist'.format(
        num_same_artist, 100 * num_same_artist / len(existing_edges)))
    print('{0} ({1:.1f}%) edges belong to the same genre'.format(
        num_same_genre, 100 * num_same_genre / len(existing_edges)))

    timer.stop()
def main():
    """Run the forecasting pipeline (Naive, SeasonalNaive, AutoRegression,
    RNN, ARNet) over target videos of the persistent network and append
    one JSON line of predictions per video to the result file.

    When load_partition is True, sys.argv[1] selects which partition file
    to process and already-finished videos are skipped (resumable runs).
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    data_prefix = '../data/'
    partition_dirname = './video_partition'
    result_dirname = './model_results'

    # == == == == == == Part 2: Load target videos set == == == == == == #
    tar_inlink_dict = defaultdict(list)
    with open(os.path.join(data_prefix, 'persistent_network.csv'),
              'r') as fin:
        fin.readline()
        for line in fin:
            src_embed, tar_embed = map(int, line.rstrip().split(','))
            tar_inlink_dict[tar_embed].append(src_embed)
    tar_embed_list = list(sorted(tar_inlink_dict.keys()))

    # == == == == == == Part 3: Load target videos set == == == == == == #
    # partition tar embed into small files,
    # each file contains num_embed_in_partition videos
    if not os.path.exists(partition_dirname):
        os.makedirs(partition_dirname)
    num_embed_in_partition = 50
    partition_idx = 1
    partition_fout = open(
        os.path.join(partition_dirname,
                     'partition_{0:03d}.txt'.format(partition_idx)), 'w')
    embed_cnt = 0
    for embed in tar_embed_list:
        # roll over to a new partition file once the current one is full
        if embed_cnt == num_embed_in_partition:
            partition_idx += 1
            partition_fout.close()
            partition_fout = open(
                os.path.join(
                    partition_dirname,
                    'partition_{0:03d}.txt'.format(partition_idx)), 'w')
            embed_cnt = 0
        partition_fout.write('{0}\n'.format(embed))
        embed_cnt += 1
    partition_fout.close()

    if not os.path.exists(result_dirname):
        os.makedirs(result_dirname)

    load_partition = True
    visited_embed_set = set()
    if load_partition:
        partition_idx = int(sys.argv[1])
        result_filename = os.path.join(
            result_dirname,
            'forecast_tracker_{0:03d}.json'.format(partition_idx))
        # collect videos already modelled in a previous (interrupted) run
        if os.path.exists(result_filename):
            with open(result_filename, 'r') as fin:
                for line in fin:
                    result_json = json.loads(line.rstrip())
                    visited_embed_set.add(result_json['embed'])
        partition_filename = os.path.join(
            partition_dirname,
            'partition_{0:03d}.txt'.format(partition_idx))
        tar_embed_list = []
        with open(partition_filename, 'r') as fin:
            for line in fin:
                embed = int(line.rstrip())
                if embed not in visited_embed_set:
                    tar_embed_list.append(embed)
    else:
        result_filename = os.path.join(result_dirname,
                                       'forecast_tracker_all.json')
    # append mode so resumed runs extend rather than overwrite
    fout = open(result_filename, 'a')
    print('{0} videos to model'.format(len(tar_embed_list)))

    # == == == == == == Part 4: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict

    # == == == == == == Part 5: Start prediction task == == == == == == #
    for tar_embed in tar_embed_list:
        timer = Timer()
        timer.start()

        tar_ts_data = embed_view_dict[tar_embed]
        true_value = tar_ts_data[-NUM_OUTPUT:]

        # naive method
        naive_model = Naive(tar_ts_data, num_output=NUM_OUTPUT)
        naive_smape = naive_model.evaluate()
        naive_pred = naive_model.pred_test_output

        # seasonal naive method
        snaive_model = SeasonalNaive(tar_ts_data, num_output=NUM_OUTPUT)
        snaive_smape = snaive_model.evaluate()
        snaive_pred = snaive_model.pred_test_output

        # autoregressive method
        ar_model = AutoRegression(tar_ts_data, num_output=NUM_OUTPUT)
        ar_model.train_ar(lag=FREQ)
        ar_smape = ar_model.evaluate()
        ar_pred = list(map(int, ar_model.pred_test_output))

        # RNN with LSTM units
        rnn_model = TemporalLSTM(tar_ts_data, num_input=NUM_INPUT,
                                 num_output=NUM_OUTPUT, num_features=1,
                                 num_neurons=NUM_NEURONS, freq=FREQ,
                                 num_ensemble=NUM_ENSEMBLE)
        rnn_model.prepare_tensor()
        rnn_model.create_model()
        rnn_model.train_lstm()
        rnn_smape = rnn_model.evaluate()
        rnn_pred = list(map(int, rnn_model.pred_test_output))

        # autoregressive with network method
        # preset AR coefficient
        preset_ar_coef = list(ar_model.fitted_params)
        # network feature method
        # FIX: np.int was removed in NumPy 1.24 -- use the builtin int
        src_ts_data_mat = np.empty((0, T), int)
        for src_embed in tar_inlink_dict[tar_embed]:
            src_ts_data = np.array(embed_view_dict[src_embed])
            src_ts_data_mat = np.vstack((src_ts_data_mat, src_ts_data))
        arnet_model = ARNet(tar_ts_data, src_ts_data_mat=src_ts_data_mat,
                            num_input=NUM_INPUT, num_output=NUM_OUTPUT,
                            num_ensemble=NUM_ENSEMBLE)
        arnet_model.train_arnet(start_params=preset_ar_coef)
        arnet_smape = arnet_model.evaluate()
        arnet_pred = list(map(int, arnet_model.pred_test_output))

        fout.write('{0}\n'.format(
            json.dumps({
                'embed': tar_embed,
                'true_value': true_value,
                'naive_pred': naive_pred,
                'snaive_pred': snaive_pred,
                'ar_pred': ar_pred,
                'rnn_pred': rnn_pred,
                'arnet_pred': arnet_pred,
                'net_ratio': arnet_model.network_ratio,
                'incoming_embeds': tar_inlink_dict[tar_embed],
                'link_weights': arnet_model.link_weights.tolist()
            })))
        print(
            'embed: {0}, Naive: {1:.3f}, SeasonalNaive: {2:.3f}, AutoRegression: {3:.3f}, RNN: {4:.3f}, ARNet: {5:.3f}'
            .format(tar_embed, naive_smape, snaive_smape, ar_smape,
                    rnn_smape, arnet_smape))

        # release model memory between videos (the RNN in particular)
        del naive_model, snaive_model, ar_model, rnn_model, arnet_model
        gc.collect()

        timer.stop()

    fout.close()
def main():
    """Plot basic statistics of the VEVO video network.

    Produces a 3-panel figure: (a) indegree CCDF with a power-law fit,
    (b) daily views vs. their percentile, and (c) video uploading trend
    by genre; also prints top-20 tables (by indegree and by views) and
    saves the figure to ../images/measure_basic_statistics.pdf.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'
    # snapshot days to highlight, as offsets from Sep 01, 2018
    target_day_indices = [0, 15, 30, 45]
    color_cycle_4 = ColorPalette.CC4
    date_labels = [
        'Sep 01, 2018', 'Sep 16, 2018', 'Oct 01, 2018', 'Oct 16, 2018'
    ]

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    # views of every video on each of the four target days
    target_day_view_list = [[], [], [], []]
    for embed in range(num_videos):
        for target_idx, target_day in enumerate(target_day_indices):
            target_day_view_list[target_idx].append(
                embed_view_dict[embed][target_day])

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    embed_indegree_dict = {
        embed: np.zeros((T, ))
        for embed in np.arange(num_videos)
    }  # daily indegree for each embed
    zero_indegree_list = []  # percentage of zero indegree for each day
    num_edges_list = []  # number of total edges for each day
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        indegree_list = []
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for tar_embed in range(num_videos):
                # only count in-links whose source position is within NUM_REL
                indegree_value = len(
                    [1 for x in network_dict[tar_embed] if x[1] < NUM_REL])
                embed_indegree_dict[tar_embed][t] = indegree_value
                indegree_list.append(indegree_value)
        indegree_counter = Counter(indegree_list)
        zero_indegree_list.append(indegree_counter[0] / num_videos)
        num_edges_list.append(sum(indegree_list))
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')
    print('\n>>> Average number of edges: {0:.0f}, max: {1:.0f}, min: {2:.0f}'.
          format(
              sum(num_edges_list) / len(num_edges_list), max(num_edges_list),
              min(num_edges_list)))

    fig, axes = plt.subplots(1, 3, figsize=(12, 4.5))
    ax1, ax2, ax3 = axes.ravel()

    # == == == == == == Part 4: Plot ax1 indegree CCDF == == == == == == #
    # average daily indegree of each video over the whole T-day window
    embed_avg_indegree_dict = defaultdict(float)
    for t in range(T):
        for embed in range(num_videos):
            embed_avg_indegree_dict[embed] += embed_indegree_dict[embed][t] / T

    indegree_ranked_embed_list = [
        x[0] for x in sorted(embed_avg_indegree_dict.items(),
                             key=lambda kv: kv[1],
                             reverse=True)
    ]
    top_20_indegree_embeds = indegree_ranked_embed_list[:20]
    popular_ranked_embed_list = [
        x[0] for x in sorted(
            embed_avg_view_dict.items(), key=lambda kv: kv[1], reverse=True)
    ]
    top_20_popular_embeds = popular_ranked_embed_list[:20]

    for target_idx, target_day in enumerate(target_day_indices):
        indegree_list = []
        for embed in range(num_videos):
            indegree_list.append(embed_indegree_dict[embed][target_day])
        print(
            'video with 10 indegree has more in-links than {0:.2f}% videos on date {1}'
            .format(percentileofscore(indegree_list, 10),
                    date_labels[target_idx]))
        print(
            'video with 20 indegree has more in-links than {0:.2f}% videos on date {1}'
            .format(percentileofscore(indegree_list, 20),
                    date_labels[target_idx]))
        plot_ccdf(indegree_list,
                  ax=ax1,
                  color=color_cycle_4[target_idx],
                  label=date_labels[target_idx])

    # compute the powerlaw fit
    powerlaw_fit = Fit(list(embed_avg_indegree_dict.values()))
    infer_alpha = powerlaw_fit.power_law.alpha
    p = powerlaw_fit.power_law.ccdf()
    # NOTE(review): reaches into the powerlaw package's private __dict__ to
    # recover the fitted data -- fragile across powerlaw versions, confirm
    # before upgrading the package
    ins_x_axis = powerlaw_fit.power_law.__dict__['parent_Fit'].__dict__[
        'data'][:int(0.9 * len(p))]
    # scale the fitted CCDF down (x0.1) so it sits beside the empirical curves
    ins_y_axis = 0.1 * p[:int(0.9 * len(p))]
    ax1.plot(ins_x_axis, ins_y_axis, 'k:')
    ax1.text(0.4,
             0.6,
             r'$x^{{{0:.2f}}}$'.format(-infer_alpha + 1),
             size=12,
             ha='right',
             va='bottom',
             transform=ax1.transAxes)

    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlabel('indegree', fontsize=11)
    # NOTE(review): non-raw string relies on '\g' not being a recognized
    # escape; consider an r'...' literal to avoid a future SyntaxWarning
    ax1.set_ylabel('$P(X) \geq x$', fontsize=11)
    ax1.tick_params(axis='both', which='major', labelsize=10)
    ax1.set_title('(a) indegree distribution', fontsize=12)
    ax1.legend(frameon=False, fontsize=11, ncol=1, fancybox=False, shadow=True)
    mean_zero_indegree = sum(zero_indegree_list) / len(zero_indegree_list)
    # dashed line marks the average fraction of videos with >= 1 in-link
    ax1.axhline(y=1 - mean_zero_indegree, color='k', linestyle='--', zorder=30)
    ax1.text(0.96,
             0.9,
             '{0:.0f}% with 0 indegree'.format(mean_zero_indegree * 100),
             size=11,
             transform=ax1.transAxes,
             ha='right',
             va='top')

    # == == == == == == Part 5: Plot ax2 views distribution == == == == == == #
    for target_idx, views_list in enumerate(target_day_view_list):
        x_values = range(100)
        y_values = [np.percentile(views_list, x) for x in x_values]
        ax2.plot(x_values,
                 y_values,
                 color=color_cycle_4[target_idx],
                 label=date_labels[target_idx])
    ax2.set_yscale('log')
    ax2.set_xlabel('views percentile', fontsize=11)
    ax2.set_ylabel('num of views', fontsize=11)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_title('(b) daily views vs. its percentile', fontsize=12)

    avg_views_list = sorted(list(embed_avg_view_dict.values()), reverse=True)
    gini_coef = gini(avg_views_list)
    print('top 1% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.01 * num_videos)]) / sum(avg_views_list) *
        100))
    print('top 10% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.1 * num_videos)]) / sum(avg_views_list) *
        100))
    print('Gini coef: {0:.3f}'.format(gini_coef))

    spearman_degree = [
        embed_avg_indegree_dict[embed] for embed in range(num_videos)
    ]
    spearman_views = [
        embed_avg_view_dict[embed] for embed in range(num_videos)
    ]
    print(
        'Spearman correlation between views and indegree: {0:.4f}, pvalue: {1:.2f}'
        .format(*spearmanr(spearman_views, spearman_degree)))

    median_views = np.median(avg_views_list)
    top_views_90th = np.percentile(avg_views_list, 90)
    top_views_99th = np.percentile(avg_views_list, 99)
    ax2_xmin = ax2.get_xlim()[0]
    ax2_ymin = ax2.get_ylim()[0]
    # dashed guide lines at the 50th/90th/99th view percentiles
    ax2.plot((50, 50), (ax2_ymin, median_views),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 50), (median_views, median_views),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.49,
             0.45,
             'median views {0:,.0f}'.format(median_views),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')
    ax2.plot((90, 90), (ax2_ymin, top_views_90th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 90), (top_views_90th, top_views_90th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.88,
             0.75,
             '90th views {0:,.0f}'.format(top_views_90th),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')
    ax2.plot((99, 99), (ax2_ymin, top_views_99th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 99), (top_views_99th, top_views_99th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.91,
             0.95,
             '99th views {0:,.0f}'.format(top_views_99th),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    # == == == == == == Part 7: Plot ax3 video uploading trend == == == == == == #
    x_axis = range(2009, 2018)
    x_labels = ["'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16", "'17"]
    upload_mat = np.zeros((len(x_axis), 8))
    target_topics = [
        'Pop_music', 'Rock_music', 'Hip_hop_music', 'Independent_music',
        'Country_music', 'Electronic_music', 'Soul_music', 'Others'
    ]
    topic_labels = [
        'Pop', 'Rock', 'Hip hop', 'Independent', 'Country', 'Electronic',
        'Soul', 'Others'
    ]
    color_cycle_8 = ColorPalette.CC8

    data_loader.load_embed_content_dict()
    embed_title_dict = data_loader.embed_title_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    embed_genre_dict = data_loader.embed_genre_dict

    for embed in range(num_videos):
        # first 4 chars of the upload time string are the year
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if 2009 <= upload_year <= 2017:
            year_idx = upload_year - 2009
            genres = embed_genre_dict[embed]
            if len(genres) == 0:
                # add one to "Others" genre
                upload_mat[year_idx, 7] += 1
            else:
                # split the count evenly across a video's genres
                for genre in genres:
                    upload_mat[year_idx,
                               target_topics.index(genre)] += 1 / len(genres)

    print()
    print([
        '{0}: {1}'.format(topic, int(num))
        for topic, num in zip(target_topics, np.sum(upload_mat, axis=0))
    ])

    stackedBarPlot(ax=ax3,
                   data=upload_mat,
                   cols=color_cycle_8,
                   edgeCols=['#000000'] * 8,
                   xlabel='uploaded year',
                   ylabel='num of videos',
                   scale=False,
                   endGaps=True)
    ax3.tick_params(axis='both', which='major', labelsize=9)
    ax3.set_xticks(np.arange(len(x_axis)))
    ax3.set_xticklabels(x_labels)
    ax3.yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    ax3.legend([
        plt.Rectangle((0, 0), 1, 1, fc=c, ec='k', alpha=0.6)
        for c in color_cycle_8
    ],
               topic_labels,
               fontsize=9,
               frameon=False,
               handletextpad=0.2,
               columnspacing=0.3,
               ncol=4,
               bbox_to_anchor=(1, -0.12),
               bbox_transform=ax3.transAxes,
               fancybox=False,
               shadow=True)
    ax3.set_title('(c) VEVO videos uploading trend', fontsize=12)

    union_top_set = set(top_20_indegree_embeds).union(top_20_popular_embeds)
    print('\n>>> Size of the union set at cutoff 15:', len(union_top_set))
    # table of the top-20 videos ranked by average indegree
    print('{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.
          format('Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views',
                 '-rank'))
    for embed in top_20_indegree_embeds:
        # title string is split as 'Artist - Title (...)'; assumes that
        # format -- TODO confirm against embed_title_dict contents
        print(
            '{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'
            .format(
                embed_title_dict[embed].split(
                    ' - ', 1)[1].split('(')[0].split('ft')[0].strip(),
                embed_title_dict[embed].split(
                    ' - ', 1)[0].split('&')[0].split(',')[0].strip(),
                '{0:,}'.format(
                    (datetime(2018, 11, 2) -
                     str2obj(embed_uploadtime_dict[embed])).days),
                '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                '{0:,}'.format(top_20_indegree_embeds.index(embed) + 1),
                '{0:,}'.format(int(embed_avg_view_dict[embed])),
                '{0:,}'.format(popular_ranked_embed_list.index(embed) + 1)))
    # table of the top-20 videos ranked by average views
    print('\n{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.
          format('Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views',
                 '-rank'))
    for embed in top_20_popular_embeds:
        print(
            '{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'
            .format(
                embed_title_dict[embed].split(
                    ' - ', 1)[1].split('(')[0].split('ft')[0].strip(),
                embed_title_dict[embed].split(
                    ' - ', 1)[0].split('&')[0].split(',')[0].strip(),
                '{0:,}'.format(
                    (datetime(2018, 11, 2) -
                     str2obj(embed_uploadtime_dict[embed])).days),
                '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                '{0:,}'.format(indegree_ranked_embed_list.index(embed) + 1),
                '{0:,}'.format(int(embed_avg_view_dict[embed])),
                '{0:,}'.format(top_20_popular_embeds.index(embed) + 1)))

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_basic_statistics.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    """Plot Spearman's rho between average daily views and average indegree.

    The first bar is the correlation over all videos; the remaining bars
    break the correlation down by upload year ('09-'18, with pre-2009
    uploads folded into the '09 bucket). Saves the figure to
    ../images/measure_spearmanr.pdf.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'
    # index 0 is the whole-corpus bar; the rest are one bar per upload year
    year_labels = [
        "all years", "'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16",
        "'17", "'18"
    ]
    num_year = len(year_labels) - 1

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    data_loader.load_embed_content_dict()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    num_videos = data_loader.num_videos

    # replace each upload-time string with its year bucket index;
    # anything uploaded before 2009 folds into bucket 0
    for embed in range(num_videos):
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if upload_year >= 2009:
            year_idx = upload_year - 2009
        else:
            year_idx = 0
        embed_uploadtime_dict[embed] = year_idx

    views_by_years_list = [[] for _ in range(num_year)]
    indegrees_by_years_list = [[] for _ in range(num_year)]

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    embed_indegree_dict_15 = {
        embed: np.zeros((T, ))
        for embed in np.arange(num_videos)
    }
    for t in range(T):
        filename = 'network_{0}.p'.format(
            obj2str(datetime(2018, 9, 1) + timedelta(days=t)))
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src)]
            for embed in range(num_videos):
                # only count in-links whose source position is within NUM_REL_15
                embed_indegree_dict_15[embed][t] = len(
                    [1 for x in network_dict[embed] if x[1] < NUM_REL_15])
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')

    # group average views and average indegree by upload-year bucket
    for embed in range(num_videos):
        views_by_years_list[embed_uploadtime_dict[embed]].append(
            embed_avg_view_dict[embed])
        indegrees_by_years_list[embed_uploadtime_dict[embed]].append(
            np.mean(embed_indegree_dict_15[embed]))

    spearman_traces = []
    all_views, all_indegrees = [], []
    for i in range(num_year):
        all_views.extend(views_by_years_list[i])
        all_indegrees.extend(indegrees_by_years_list[i])
    # compute each correlation once (was previously computed twice:
    # once for the print and again for the append)
    overall_result = spearmanr(all_views, all_indegrees)
    print('\n>>> {0}'.format(year_labels[0]), overall_result)
    spearman_traces.append(overall_result[0])
    for i in range(num_year):
        year_result = spearmanr(views_by_years_list[i],
                                indegrees_by_years_list[i])
        spearman_traces.append(year_result[0])
        print('>>> {0} year'.format(year_labels[1 + i]), year_result)

    # == == == == == == Part 4: Plotting script == == == == == == #
    fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))
    tomato = ColorPalette.TOMATO
    blue = ColorPalette.BLUE
    # the "all years" bar is highlighted in tomato, per-year bars in blue
    bar1 = ax1.bar(range(num_year + 1),
                   spearman_traces,
                   edgecolor=['k'] * (num_year + 1),
                   color=[tomato] + [blue] * num_year,
                   lw=1)
    # annotate each bar with its correlation value; use ax1.text instead of
    # plt.text so the target axes is explicit
    for rect in bar1:
        height = rect.get_height()
        ax1.text(rect.get_x() + rect.get_width() / 2.0,
                 height,
                 '{0:.3f}'.format(height),
                 ha='center',
                 va='bottom')
    # derive the tick count from num_year (was a hard-coded np.arange(11))
    # so the ticks stay in sync with year_labels if the range changes
    ax1.set_xticks(np.arange(num_year + 1))
    ax1.set_xticklabels(year_labels)
    ax1.set_ylabel(r'spearman $\rho$')

    hide_spines(ax1)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_spearmanr.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    """Plot micro-level temporal dynamics of the video network.

    Panel (a): next-day relative indegree change, shown as smoothed
    percentile bands conditioned on the current indegree (>= 10).
    Panel (b): link-frequency distribution, i.e. for how many of the T
    daily snapshots each video-to-video link appears. Saves the figure
    to ../images/measure_temporal_micro.pdf.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    num_videos = data_loader.num_videos

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    embed_indegree_dict = {embed: np.zeros((T,)) for embed in np.arange(num_videos)}
    edge_frequency_dict = defaultdict(int)  # 'src-tar' -> num of days the link appears
    for t in range(T):
        filename = 'network_{0}.p'.format((datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        with open(os.path.join(data_prefix, 'network_pickle', filename), 'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for embed_tar in range(num_videos):
                # only keep in-links whose source position is within NUM_REL
                inlinks = [x for x in network_dict[embed_tar] if x[1] < NUM_REL]
                # (empty-list guard removed: iterating an empty list is a no-op;
                #  stray trailing comma in the unpacking target also dropped)
                for embed_src, _, _ in inlinks:
                    edge_frequency_dict['{0}-{1}'.format(embed_src, embed_tar)] += 1
                embed_indegree_dict[embed_tar][t] = len(inlinks)
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')

    link_frequency_counter = Counter(edge_frequency_dict.values())

    # == == == == == == Part 4: Plot how indegree changes == == == == == == #
    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    fig, axes = plt.subplots(1, 2, figsize=(12, 4.1))
    ax1, ax2 = axes.ravel()

    # relative next-day indegree change, keyed by current indegree (>= 10 only)
    indegree_change_dict = defaultdict(list)
    for embed in range(num_videos):
        for t in range(T - 1):
            x0 = embed_indegree_dict[embed][t]
            x1 = embed_indegree_dict[embed][t + 1]
            if x0 >= 10:
                indegree_change_dict[x0].append((x1 - x0) / x0)

    # only plot indegree values backed by at least 100 observations
    x_axis = sorted([x for x in indegree_change_dict.keys() if len(indegree_change_dict[x]) >= 100])
    # nested percentile bands (5-95 down to 45-55), fading towards the tails
    for i in np.arange(5, 50, 5):
        ax1.fill_between(x_axis,
                         [smoothing(indegree_change_dict, x, 50 - i) for x in x_axis],
                         [smoothing(indegree_change_dict, x, 55 - i) for x in x_axis],
                         facecolor=cornflower_blue, alpha=(100 - 2 * i) / 100, lw=0)
        ax1.fill_between(x_axis,
                         [smoothing(indegree_change_dict, x, 45 + i) for x in x_axis],
                         [smoothing(indegree_change_dict, x, 50 + i) for x in x_axis],
                         facecolor=cornflower_blue, alpha=(100 - 2 * i) / 100, lw=0)
    # quartile traces, then the median on top
    for i in [25, 75]:
        ax1.plot(x_axis, [smoothing(indegree_change_dict, x, i) for x in x_axis],
                 color=cornflower_blue, alpha=0.8, zorder=15)
    ax1.plot(x_axis, [smoothing(indegree_change_dict, x, 50) for x in x_axis],
             color=cornflower_blue, alpha=1, zorder=15)
    ax1.set_ylim([-0.9, 0.9])
    ax1.set_xlabel('indegree', fontsize=12)
    ax1.set_ylabel('indegree change ratio the next day', fontsize=12)
    ax1.set_title('(a)', fontsize=12)
    ax1.tick_params(axis='both', which='major', labelsize=10)
    plot_contour(indegree_change_dict, target_x=100, ax=ax1)

    x_axis = range(1, 1 + T)
    y_axis = [link_frequency_counter[x] for x in x_axis]
    total_links = sum(y_axis)  # hoisted: was summed twice below
    print('\nephemeral links of frequency 1, {0}, {1:.2f}%'.format(y_axis[0], y_axis[0] / total_links * 100))
    # the last bin is frequency T; derive it from T instead of the previous
    # hard-coded "63" so the label tracks the actual observation window
    print('persistent links of frequency {0}, {1}, {2:.2f}%'.format(T, y_axis[-1], y_axis[-1] / total_links * 100))
    ax2.plot(x_axis, y_axis, 'o-', c=tomato, mfc='none', mec=tomato, ms=4)
    ax2.set_xlabel('link frequency', fontsize=12)
    ax2.set_ylabel('num of video-to-video pairs', fontsize=12)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_title('(b)', fontsize=12)
    ax2.annotate('ephemeral links', fontsize=12,
                 xy=(3, 350000), xycoords='data',
                 xytext=(17, 350000), textcoords='data',
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3'))
    ax2.annotate('frequent links', fontsize=12,
                 xy=(61, 35000), xycoords='data',
                 xytext=(35, 55000), textcoords='data',
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3'))
    ax2.yaxis.set_major_formatter(FuncFormatter(concise_fmt))

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_temporal_micro.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    """Classify video-to-video links and log pairwise view correlations.

    Links are split into three groups: reciprocal (listed in both
    directions in persistent_network.csv), persistent (one direction
    only), and ephemeral (seen in a daily snapshot, passing the view
    filters, but not persistent). For every link, the Pearson correlation
    between the `detsn`-transformed view series of source and target is
    written to a per-group log file.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict

    # == == == == == == Part 3: Load persistent and non-persistent network == == == == == == #
    reciprocal_link_set = set()
    persistent_link_set = set()
    non_persistent_link_set = set()
    with open(os.path.join(data_prefix, 'persistent_network.csv'), 'r') as fin:
        fin.readline()  # skip the header line
        for line in fin:
            src_embed, tar_embed = map(int, line.rstrip().split(','))
            link = '{0}-{1}'.format(src_embed, tar_embed)
            rec_link = '{1}-{0}'.format(src_embed, tar_embed)
            # if the reversed direction was already recorded, the pair is reciprocal
            if rec_link in persistent_link_set:
                persistent_link_set.remove(rec_link)
                reciprocal_link_set.add(link)
            else:
                persistent_link_set.add(link)

    for t in range(T):
        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        filename = 'network_{0}.p'.format(target_date_str)
        # context manager closes the pickle file handle
        # (was pickle.load(open(...)), which leaked the file object)
        with open(os.path.join(data_prefix, 'network_pickle', filename), 'rb') as fin:
            network_dict = pickle.load(fin)
        for tar_embed in network_dict:
            src_embed_list = [x[0] for x in network_dict[tar_embed] if x[1] < NUM_REL]
            for src_embed in src_embed_list:
                # filter: at least 100 daily views for target video,
                # and the mean daily views of source video is at least 1% of the target video
                if embed_avg_view_dict[tar_embed] >= 100 and embed_avg_view_dict[src_embed] >= 0.01 * embed_avg_view_dict[tar_embed]:
                    link = '{0}-{1}'.format(src_embed, tar_embed)
                    rec_link = '{1}-{0}'.format(src_embed, tar_embed)
                    # keep only links unseen (in either direction) in all three groups
                    if link not in persistent_link_set and rec_link not in persistent_link_set \
                            and link not in reciprocal_link_set and rec_link not in reciprocal_link_set \
                            and link not in non_persistent_link_set and rec_link not in non_persistent_link_set:
                        non_persistent_link_set.add(link)

    print('>>> Number of reciprocal links: {0}'.format(len(reciprocal_link_set)))
    print('>>> Number of persistent links (non-reciprocal): {0}'.format(len(persistent_link_set)))
    print('>>> Number of ephemeral links: {0}'.format(len(non_persistent_link_set)))

    for link_set, log_filename in zip([reciprocal_link_set, persistent_link_set, non_persistent_link_set],
                                      ['./reciprocal_pearsonr.log', './persistent_pearsonr.log', './ephemeral_pearsonr.log']):
        with open(log_filename, 'w') as log_file:
            for link in link_set:
                src_embed, tar_embed = map(int, link.split('-'))
                # NOTE(review): detsn presumably de-seasonalizes/detrends the
                # daily view series -- confirm against its definition
                eff_size, pvalue = pearsonr(detsn(embed_view_dict[src_embed]), detsn(embed_view_dict[tar_embed]))
                log_file.write('{0},{1},{2},{3}\n'.format(src_embed, tar_embed, eff_size, pvalue))

    timer.stop()