# NOTE: These snippets are extracted from a larger project; the imports below
# are reconstructed from usage. Project-local helpers (Timer, ColorPalette,
# DataLoader, the forecasting models Naive, SeasonalNaive, AutoRegression,
# TemporalLSTM and ARNet, and utilities such as smape, gini, hide_spines,
# plot_ccdf, stackedBarPlot, scaler, obj2str, str2obj, concise_fmt,
# is_same_genre, is_persistent_link, is_in_component and is_out_component)
# plus the constants (T, NUM_INPUT, NUM_OUTPUT, NUM_REL, NUM_REL_15, CUTOFF,
# FREQ, NUM_NEURONS, NUM_ENSEMBLE) are assumed importable from the project;
# minimal reference sketches for a few of them appear between the examples.
import gc
import itertools
import json
import logging
import os
import pickle
import platform
import sys
from collections import Counter, defaultdict
from datetime import datetime, timedelta

import igraph
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import FuncFormatter
from network2tikz import plot
from powerlaw import Fit
from scipy.stats import percentileofscore, spearmanr
from tarjan import tarjan


# Example 1
def main():
    timer = Timer()
    timer.start()

    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    color_cycle_4 = ColorPalette.CC4
    label_fs = ColorPalette.LABELFS
    title_fs = ColorPalette.TITLEFS
    tick_style = ColorPalette.TICKSTYLE
    bar_text_style = ColorPalette.BARTEXTSTYLE

    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_train_view_dict = {
        embed: np.mean(embed_view_dict[embed][:-NUM_OUTPUT])
        for embed in embed_view_dict.keys()
    }

    net_ratio_list = []

    src_to_tar_view_ratio = []
    link_weights_record = []

    naive_smape_list, snaive_smape_list, ar_smape_list, rnn_smape_list, arnet_smape_list = [
        [] for _ in range(5)
    ]
    # np.float was removed in NumPy 1.20+; use the builtin float dtype
    naive_daily_smape_mat, snaive_daily_smape_mat, ar_daily_smape_mat, rnn_daily_smape_mat, arnet_daily_smape_mat = [
        np.empty((0, NUM_OUTPUT), float) for _ in range(5)
    ]

    with open('./forecast_tracker_all.json', 'r') as fin:
        for line in fin:
            result_json = json.loads(line.rstrip())
            tar_embed = result_json['embed']

            true_value = result_json['true_value']
            naive_pred = result_json['naive_pred']
            snaive_pred = result_json['snaive_pred']
            ar_pred = result_json['ar_pred']
            rnn_pred = result_json['rnn_pred']
            arnet_pred = result_json['arnet_pred']

            naive_smape, naive_daily_smape_arr = smape(true_value, naive_pred)
            naive_smape_list.append(naive_smape)
            naive_daily_smape_mat = np.vstack(
                (naive_daily_smape_mat, naive_daily_smape_arr))

            snaive_smape, snaive_daily_smape_arr = smape(
                true_value, snaive_pred)
            snaive_smape_list.append(snaive_smape)
            snaive_daily_smape_mat = np.vstack(
                (snaive_daily_smape_mat, snaive_daily_smape_arr))

            ar_smape, ar_daily_smape_arr = smape(true_value, ar_pred)
            ar_smape_list.append(ar_smape)
            ar_daily_smape_mat = np.vstack(
                (ar_daily_smape_mat, ar_daily_smape_arr))

            rnn_smape, rnn_daily_smape_arr = smape(true_value, rnn_pred)
            rnn_smape_list.append(rnn_smape)
            rnn_daily_smape_mat = np.vstack(
                (rnn_daily_smape_mat, rnn_daily_smape_arr))

            arnet_smape, arnet_daily_smape_arr = smape(true_value, arnet_pred)
            arnet_smape_list.append(arnet_smape)
            arnet_daily_smape_mat = np.vstack(
                (arnet_daily_smape_mat, arnet_daily_smape_arr))

            # analyse network contribution
            arnet_net_ratio = result_json['net_ratio']
            net_ratio_list.append(arnet_net_ratio)

            incoming_embeds = result_json['incoming_embeds']
            link_weights = result_json['link_weights']
            for edge_inx, src_embed in enumerate(incoming_embeds):
                view_ratio = np.log10(embed_avg_train_view_dict[src_embed] /
                                      embed_avg_train_view_dict[tar_embed])
                src_to_tar_view_ratio.append(view_ratio)
                link_weights_record.append(link_weights[edge_inx])

    fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(12, 4))
    axes = axes.ravel()

    # == == == == == == Part 1: Plot performance comparison == == == == == == #
    smape_mat = [
        naive_smape_list, snaive_smape_list, ar_smape_list, rnn_smape_list,
        arnet_smape_list
    ]
    axes[0].boxplot(smape_mat,
                    showfliers=False,
                    meanline=True,
                    showmeans=True,
                    widths=0.7)
    means = [np.mean(x) for x in smape_mat]
    pos = range(len(means))
    for tick in pos:
        # boxplot positions are 1-indexed, hence the +1 offset
        axes[0].text(pos[tick] + 1, means[tick] + 0.3,
                     '{0:.3f}'.format(means[tick]), **bar_text_style)

    axes[0].set_xticklabels(['Naive', 'SN', 'AR', 'RNN', 'ARNet'],
                            fontsize=label_fs)
    axes[0].set_ylabel('SMAPE', fontsize=label_fs)
    axes[0].tick_params(**tick_style)
    axes[0].set_title('(a)', fontsize=title_fs)

    # == == == == == == Part 2: Plot performance as the forecast horizon extends == == == == == == #
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(naive_daily_smape_mat, axis=0),
                 label='Naive',
                 c='k',
                 mfc='none',
                 marker='D',
                 markersize=4)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(snaive_daily_smape_mat, axis=0),
                 label='SN',
                 c=color_cycle_4[0],
                 mfc='none',
                 marker='*',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(ar_daily_smape_mat, axis=0),
                 label='AR',
                 c=color_cycle_4[1],
                 mfc='none',
                 marker='s',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(rnn_daily_smape_mat, axis=0),
                 label='RNN',
                 c=color_cycle_4[2],
                 mfc='none',
                 marker='^',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(arnet_daily_smape_mat, axis=0),
                 label='ARNet',
                 c=color_cycle_4[3],
                 marker='o',
                 markersize=5)

    axes[1].set_xlabel('forecast horizon', fontsize=label_fs)
    axes[1].set_ylabel('SMAPE', fontsize=label_fs)
    axes[1].set_ylim([6, 23])
    axes[1].tick_params(**tick_style)
    axes[1].legend(frameon=False)
    axes[1].set_title('(b)', fontsize=title_fs)

    # == == == == == == Part 3: Plot link strength vs. view ratio from src to tar == == == == == == #
    bin_axis = np.arange(-2, 1.9, 0.1)
    bin_records = [[] for _ in range(len(bin_axis))]
    for x, y in zip(src_to_tar_view_ratio, link_weights_record):
        # keep only ratios that fall inside the binned range [-2, 1.9)
        if -2 <= x < 1.9:
            bin_records[int(np.floor((x + 2) * 10))].append(y)

    for t in np.arange(5, 50, 5):
        axes[2].fill_between(bin_axis,
                             [np.percentile(x, 50 - t) for x in bin_records],
                             [np.percentile(x, 55 - t) for x in bin_records],
                             facecolor=cornflower_blue,
                             alpha=(100 - 2 * t) / 100,
                             lw=0)
        axes[2].fill_between(bin_axis,
                             [np.percentile(x, 45 + t) for x in bin_records],
                             [np.percentile(x, 50 + t) for x in bin_records],
                             facecolor=cornflower_blue,
                             alpha=(100 - 2 * t) / 100,
                             lw=0)

    for t in [10, 30, 70, 90]:
        axes[2].plot(bin_axis, [np.percentile(x, t) for x in bin_records],
                     color=cornflower_blue,
                     alpha=(100 - 2 * t) / 100,
                     lw=1,
                     zorder=15)
    median_line = [np.percentile(x, 50) for x in bin_records]
    axes[2].plot(bin_axis,
                 median_line,
                 color='k',
                 alpha=0.5,
                 zorder=20,
                 lw=1.5)
    axes[2].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: r'$10^{{{0:.0f}}}$'.format(x)))

    peak1_idx = int(np.argmax(median_line))
    peak2_idx = 10 + int(np.argmax(median_line[10:]))
    peak1 = (bin_axis[peak1_idx], median_line[peak1_idx])
    peak2 = (bin_axis[peak2_idx], median_line[peak2_idx])
    axes[2].scatter(peak1[0],
                    peak1[1],
                    s=15,
                    c=tomato,
                    edgecolors='k',
                    zorder=30)
    axes[2].text(peak1[0] + 0.08,
                 peak1[1] + 0.01,
                 '({0:.2f}, {1:.2f})'.format(10**peak1[0], peak1[1]),
                 ha='left',
                 va='center')
    axes[2].scatter(peak2[0],
                    peak2[1],
                    s=15,
                    c=tomato,
                    edgecolors='k',
                    zorder=30)
    axes[2].text(peak2[0],
                 peak2[1] + 0.02,
                 '({0:.2f}, {1:.2f})'.format(10**peak2[0], peak2[1]),
                 ha='center',
                 va='bottom')

    axes[2].set_xlim((-2.05, 2.02))
    axes[2].set_ylim((-0.02, 1.01))
    axes[2].set_xlabel('views ratio from video ' + r'$u$' + ' to video ' +
                       r'$v$',
                       fontsize=label_fs)
    axes[2].set_ylabel('estimated link strength ' + r'$\beta_{u, v}$',
                       fontsize=label_fs)
    axes[2].set_title('(c)', fontsize=title_fs)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/model_prediction_results.pdf', bbox_inches='tight')
    plt.show()
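

# A minimal reference sketch of the smape() helper assumed above: it returns
# the mean symmetric MAPE over the forecast horizon together with the per-day
# values. Reconstructed from how it is called; the project's own
# implementation may differ in details such as zero handling.
def smape(true_value, pred_value):
    true_arr = np.asarray(true_value, dtype=float)
    pred_arr = np.asarray(pred_value, dtype=float)
    # standard SMAPE in percent; the epsilon guards against 0/0
    daily_smape_arr = 200 * np.abs(pred_arr - true_arr) / np.maximum(
        np.abs(true_arr) + np.abs(pred_arr), 1e-8)
    return float(np.mean(daily_smape_arr)), daily_smape_arr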


# Example 2
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    # == == == == == == Part 3: Build views percentile partition == == == == == == #
    day_views = list(embed_avg_view_dict.values())
    median_value = np.median(day_views)
    # the top quartile starts at the 75th percentile
    first_quantile_value = np.percentile(day_views, 75)
    third_quantile_value = np.percentile(day_views, 25)

    embed_percentile_dict = {}
    for embed in np.arange(num_videos):
        if embed_avg_view_dict[embed] >= first_quantile_value:
            embed_percentile_dict[embed] = 0
        elif embed_avg_view_dict[embed] >= median_value:
            embed_percentile_dict[embed] = 1
        elif embed_avg_view_dict[embed] >= third_quantile_value:
            embed_percentile_dict[embed] = 2
        else:
            embed_percentile_dict[embed] = 3

    # == == == == == == Part 4: Load dynamic network snapshot == == == == == == #
    edge_weight_mat = np.zeros((4, 4), dtype=np.float32)
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for embed_tar in range(num_videos):
                for embed_src, pos_src, _ in network_dict[embed_tar]:
                    if pos_src < NUM_REL:
                        edge_weight_mat[(
                            embed_percentile_dict[embed_src],
                            embed_percentile_dict[embed_tar])] += 1 / T
        print('>>> Finish loading day {0}...'.format(t + 1))
    edge_weight_mat = edge_weight_mat.astype(int)  # np.int was removed in NumPy 1.20+
    print('>>> Network structure has been loaded!')

    # == == == == == == Part 5: Plot graph with network2tikz == == == == == == #
    # Network
    # -------
    # every possible pair, including self loop
    network_structure = []
    num_partitions = 4
    for pair in itertools.product(np.arange(num_partitions), repeat=2):
        network_structure.append(pair)
    net = igraph.Graph(network_structure, directed=True)

    # Network attributes
    # ------------------
    # Network dicts
    # -------------
    layout = {0: (0, 0), 1: (1, 0), 2: (2, 0), 3: (3, 0)}

    # Visual style dict
    # -----------------
    visual_style = {}

    # node styles
    # -----------
    visual_style['vertex_size'] = 0.9
    visual_style['vertex_color'] = ColorPalette.CCRGB4
    visual_style['vertex_opacity'] = 0.6
    visual_style['vertex_label'] = [
        r'top 25\%', r'(25\%, 50\%]', r'(50\%, 75\%]', r'bottom 25\%'
    ]
    visual_style['vertex_label_distance'] = 0
    visual_style['vertex_label_size'] = [5, 4, 4, 4]

    # edge styles
    # -----------
    edge_width = list(np.ravel(edge_weight_mat))
    visual_style['edge_width'] = scaler(edge_width)
    visual_style['edge_curved'] = 0.7
    edge_label = ['{{{:,}}}'.format(x) for x in edge_width]
    visual_style['edge_label'] = edge_label
    visual_style['edge_label_size'] = 4.5
    visual_style['edge_loop_shape'] = 60
    visual_style['edge_loop_size'] = 1
    visual_style['edge_loop_position'] = [180, 0, 0, 0]
    visual_style['edge_arrow_size'] = 0.01
    visual_style['edge_arrow_width'] = [
        0.03, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01,
        0.02, 0.01, 0.01, 0.01
    ]

    # general options
    # ---------------
    visual_style['layout'] = layout
    visual_style['canvas'] = (10, 3.5)
    visual_style['margin'] = 1.5

    # Create pdf figure of the network
    plot(net, '../images/measure_how_videos_connect.pdf', **visual_style)
    print('>>> Generated pdf file ../images/measure_how_videos_connect.pdf')

    timer.stop()
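

# A plausible sketch of the scaler() helper used for the tikz edge widths: a
# simple min-max rescale into a width range that reads well on the canvas.
# The output range is an assumption, not the project's definition.
def scaler(values, lo=0.1, hi=2.0):
    arr = np.asarray(values, dtype=float)
    if arr.max() == arr.min():
        return [lo] * len(arr)
    return list(lo + (hi - lo) * (arr - arr.min()) / (arr.max() - arr.min()))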


# Example 3
def main():
    timer = Timer()
    timer.start()

    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    color_cycle_4 = ColorPalette.CC4
    label_fs = ColorPalette.LABELFS
    title_fs = ColorPalette.TITLEFS
    tick_style = ColorPalette.TICKSTYLE

    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_train_view_dict = {
        embed: np.mean(embed_view_dict[embed][:-NUM_OUTPUT])
        for embed in embed_view_dict.keys()
    }
    data_loader.load_embed_content_dict()
    embed_cid_dict = data_loader.embed_cid_dict
    embed_genre_dict = data_loader.embed_genre_dict

    cid_artist_dict = {}
    cid_tag_dict = {}
    with open('../data/artist_details.json', 'r') as fin:
        for line in fin:
            artist_json = json.loads(line.rstrip())
            cid_artist_dict[
                artist_json['channel_id']] = artist_json['artist_name']
            cid_tag_dict[artist_json['channel_id']] = artist_json['tag-dict']

    cid_views_dict = defaultdict(int)
    cid_views_wo_network_dict = defaultdict(int)

    arnet_smape_list = []
    net_ratio_list = []
    same_artist_net_ratio_list = []
    same_genre_net_ratio_list = []
    total_views = 0
    network_explained_views = 0

    with open('./embed_prediction.json', 'r') as fin:
        for line in fin:
            result_json = json.loads(line.rstrip())
            tar_embed = result_json['embed']
            avg_train_views = embed_avg_train_view_dict[tar_embed]

            true_value = result_json['true_value']
            arnet_pred = result_json['arnet_pred']
            arnet_smape_list.append(smape(true_value, arnet_pred)[0])

            incoming_embeds = result_json['incoming_embeds']
            link_weights = result_json['link_weights']
            same_artist_contributed_views = 0
            same_genre_contributed_views = 0
            for edge_inx, src_embed in enumerate(incoming_embeds):
                if embed_cid_dict[tar_embed] == embed_cid_dict[src_embed]:
                    same_artist_contributed_views += link_weights[
                        edge_inx] * embed_avg_train_view_dict[src_embed]
                if is_same_genre(embed_genre_dict[tar_embed],
                                 embed_genre_dict[src_embed]):
                    same_genre_contributed_views += link_weights[
                        edge_inx] * embed_avg_train_view_dict[src_embed]

            # analyse network contribution
            arnet_net_ratio = result_json['net_ratio']
            net_ratio_list.append(arnet_net_ratio)
            # rounding issue can make the value slightly larger than 1
            same_artist_net_ratio_list.append(
                min(same_artist_contributed_views / avg_train_views, 1))
            same_genre_net_ratio_list.append(
                min(same_genre_contributed_views / avg_train_views, 1))

            cid_views_dict[embed_cid_dict[tar_embed]] += avg_train_views
            cid_views_wo_network_dict[embed_cid_dict[
                tar_embed]] += avg_train_views * (1 - arnet_net_ratio)

            total_views += avg_train_views
            network_explained_views += avg_train_views * arnet_net_ratio

    print(
        '\nFor an average video in our dataset, we estimate {0:.1f}% of the views come from the network.'
        .format(100 * np.mean(net_ratio_list)))
    print(
        'In particular, {0:.1f}% ({1:.1f}%) of the views come from the same artist.'
        .format(
            100 * np.mean(same_artist_net_ratio_list), 100 *
            np.mean(same_artist_net_ratio_list) / np.mean(net_ratio_list)))
    print(
        'In total, our model estimates that the recommendation network contributes {0:.1f}% of popularity in the Vevo network.'
        .format(100 * network_explained_views / total_views))
    print('total views for 13K: {0:.1f}M'.format(total_views / 1000000))
    print('explained views for 13K: {0:.1f}M'.format(network_explained_views /
                                                     1000000))
    print('total views for 60K: {0:.1f}M'.format(
        np.sum(list(embed_avg_train_view_dict.values())) / 1000000))
    print('Gini coef with network: {0:.4f}'.format(
        gini(list(cid_views_dict.values()))))
    print('Gini coef without network: {0:.4f}\n'.format(
        gini(list(cid_views_wo_network_dict.values()))))

    fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(12, 4.2))
    gs = axes[0, 0].get_gridspec()
    for ax in axes[:, 1]:
        ax.remove()
    ax_mid = fig.add_subplot(gs[:, 1])
    for ax in axes[:, 2]:
        ax.remove()
    ax_right = fig.add_subplot(gs[:, 2])
    axes = [axes[0, 0], axes[1, 0], ax_mid, ax_right]

    # == == == == == == Part 1: Plot SMAPE vs. traffic composition == == == == == == #
    num_bin = 10
    sorted_same_artist_tuple_list = sorted(
        [(x, y) for x, y in zip(same_artist_net_ratio_list, arnet_smape_list)],
        key=lambda x: x[0])
    same_artist_split_values = [
        np.percentile(same_artist_net_ratio_list, x)
        for x in np.arange(10, 101, 10)
    ]
    same_artist_bins = [[] for _ in range(num_bin)]
    for same_artist_net_ratio, arnet_smape in sorted_same_artist_tuple_list:
        slice_idx = int(
            np.floor(
                percentileofscore(same_artist_net_ratio_list,
                                  same_artist_net_ratio) / 10))
        if slice_idx >= num_bin:
            slice_idx = num_bin - 1
        same_artist_bins[slice_idx].append(arnet_smape)

    sorted_same_genre_tuple_list = sorted(
        [(x, y) for x, y in zip(same_genre_net_ratio_list, arnet_smape_list)],
        key=lambda x: x[0])
    same_genre_split_values = [
        np.percentile(same_genre_net_ratio_list, x)
        for x in np.arange(10, 101, 10)
    ]
    same_genre_bins = [[] for _ in range(num_bin)]
    for same_genre_net_ratio, arnet_smape in sorted_same_genre_tuple_list:
        slice_idx = int(
            np.floor(
                percentileofscore(same_genre_net_ratio_list,
                                  same_genre_net_ratio) / 10))
        if slice_idx >= num_bin:
            slice_idx = num_bin - 1
        same_genre_bins[slice_idx].append(arnet_smape)

    axes[0].plot(range(1, 11, 1), [np.mean(x) for x in same_artist_bins],
                 color=cornflower_blue,
                 label='same artist',
                 mfc='none',
                 marker='o',
                 markersize=4)
    axes[1].plot(range(1, 11, 1), [np.mean(x) for x in same_genre_bins],
                 color=tomato,
                 label='same genre',
                 mfc='none',
                 marker='o',
                 markersize=4)

    for ax in [axes[0], axes[1]]:
        ax.set_xlim([0.5, 10.5])
        ax.set_ylim([7, 10.5])
        ax.set_ylabel('SMAPE', fontsize=label_fs)
        ax.xaxis.set_ticks(np.arange(1, 10, 2))
        ax.tick_params(**tick_style)
        ax.legend(frameon=False)

    axes[0].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '({0:.3f})'.format(same_artist_split_values[
            int(x) - 1])))
    axes[1].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%\n({1:.3f})'.format(
            10 * x, same_genre_split_values[int(x) - 1])))

    axes[1].set_xlabel(r'$\eta_v$ percentile', fontsize=label_fs)
    axes[0].set_title('(a)', fontsize=title_fs)

    # == == == == == == Part 2: Plot who can utilize the network better? == == == == == == #
    artist_views_list = list(cid_views_dict.values())
    wo_network_artist_views_list = list(cid_views_wo_network_dict.values())
    cid_list = sorted(cid_views_dict.keys())
    artist_true_percentile = [
        percentileofscore(artist_views_list, cid_views_dict[cid])
        for cid in cid_list
    ]
    wo_network_artist_percentile = [
        percentileofscore(wo_network_artist_views_list,
                          cid_views_wo_network_dict[cid]) for cid in cid_list
    ]
    percentile_change = np.array([
        artist_true_percentile[i] - wo_network_artist_percentile[i]
        for i in range(len(cid_list))
    ])

    num_popularity_loss = sum(percentile_change < 0)
    num_popularity_equal = sum(percentile_change == 0)
    num_popularity_gain = sum(percentile_change > 0)
    print('{0} ({1:.2f}%) artists lose popularity with network'.format(
        num_popularity_loss, num_popularity_loss / len(cid_list) * 100))
    print('{0} ({1:.2f}%) artists with no popularity change'.format(
        num_popularity_equal, num_popularity_equal / len(cid_list) * 100))
    print('{0} ({1:.2f}%) artists gain popularity with network\n'.format(
        num_popularity_gain, num_popularity_gain / len(cid_list) * 100))

    artist_percentile_mat = [[] for _ in range(10)]
    artist_cid_mat = [[] for _ in range(10)]
    for idx, percentile_value in enumerate(wo_network_artist_percentile):
        bin_idx = min(int(np.floor(percentile_value / 10)), 9)
        artist_percentile_mat[bin_idx].append(artist_true_percentile[idx] -
                                              percentile_value)
        artist_cid_mat[bin_idx].append(cid_list[idx])

    red_circle = dict(markerfacecolor=tomato, marker='o', markersize=4)
    axes[2].boxplot(artist_percentile_mat,
                    showfliers=True,
                    widths=0.5,
                    flierprops=red_circle)
    axes[2].axhline(y=0, color=cornflower_blue, linestyle='--', lw=1, zorder=0)
    axes[2].set_xlabel('artist popularity percentile without network',
                       fontsize=label_fs)
    axes[2].set_ylabel('percentile change with network', fontsize=label_fs)
    axes[2].tick_params(**tick_style)
    axes[2].set_xticks(axes[2].get_xticks()[::2])
    axes[2].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%'.format(10 * x)))
    axes[2].yaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%'.format(x)))
    axes[2].set_title('(b)', fontsize=title_fs)

    # find outliers
    whis = 1.5
    top_outliers_list = []
    bottom_outliers_list = []
    for box_idx, box in enumerate(artist_percentile_mat):
        q1 = np.percentile(box, 25)
        q3 = np.percentile(box, 75)
        iq = q3 - q1
        hi_val = q3 + whis * iq
        lo_val = q1 - whis * iq
        for idx, val in enumerate(box):
            if val > hi_val:
                top_outliers_list.append((artist_cid_mat[box_idx][idx], val))
            elif val < lo_val:
                bottom_outliers_list.append(
                    (artist_cid_mat[box_idx][idx], val))

    sorted_top_outliers_list = sorted(
        [(cid_artist_dict[x[0]], cid_tag_dict[x[0]], int(
            cid_views_dict[x[0]]), x[1]) for x in top_outliers_list],
        key=lambda t: t[2],
        reverse=True)
    for t in sorted_top_outliers_list:
        print(t)
    print('-------------------')
    sorted_bottom_outliers_list = sorted(
        [(cid_artist_dict[x[0]], cid_tag_dict[x[0]], int(
            cid_views_dict[x[0]]), x[1]) for x in bottom_outliers_list],
        key=lambda t: t[2],
        reverse=True)
    for t in sorted_bottom_outliers_list:
        print(t)

    indie_xaxis, indie_yaxis = [], []
    rap_xaxis, rap_yaxis = [], []
    other_xaxis, other_yaxis = [], []
    lose_xaxis, lose_yaxis = [], []
    for top_outlier, _ in top_outliers_list:
        if 'indie' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'alternative' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'new wave' in ','.join(cid_tag_dict[top_outlier].keys()):
            indie_xaxis.append(cid_views_dict[top_outlier])
            indie_yaxis.append((cid_views_dict[top_outlier] -
                                cid_views_wo_network_dict[top_outlier]) /
                               cid_views_dict[top_outlier])
        elif 'rap' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'hip hop' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'rhythm and blues' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'reggae' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'punk' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'funk' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'r&b' in ','.join(cid_tag_dict[top_outlier].keys()):
            rap_xaxis.append(cid_views_dict[top_outlier])
            rap_yaxis.append((cid_views_dict[top_outlier] -
                              cid_views_wo_network_dict[top_outlier]) /
                             cid_views_dict[top_outlier])
        else:
            other_xaxis.append(cid_views_dict[top_outlier])
            other_yaxis.append((cid_views_dict[top_outlier] -
                                cid_views_wo_network_dict[top_outlier]) /
                               cid_views_dict[top_outlier])
    for bottom_outlier, _ in bottom_outliers_list:
        lose_xaxis.append(cid_views_dict[bottom_outlier])
        lose_yaxis.append((cid_views_dict[bottom_outlier] -
                           cid_views_wo_network_dict[bottom_outlier]) /
                          cid_views_dict[bottom_outlier])

    axes[3].scatter(indie_xaxis,
                    indie_yaxis,
                    marker='^',
                    facecolors='none',
                    edgecolors=color_cycle_4[0],
                    s=20,
                    label='Indie: {0}'.format(len(indie_xaxis)))
    axes[3].scatter(rap_xaxis,
                    rap_yaxis,
                    marker='o',
                    facecolors='none',
                    edgecolors=color_cycle_4[1],
                    s=20,
                    label='Hip hop: {0}'.format(len(rap_xaxis)))
    axes[3].scatter(other_xaxis,
                    other_yaxis,
                    marker='s',
                    facecolors='none',
                    edgecolors=color_cycle_4[2],
                    s=20,
                    label='Other: {0}'.format(len(other_xaxis)))
    # axes[3].scatter(lose_xaxis, lose_yaxis, marker='x', color=color_cycle_4[3], s=20, label='artists lose popularity: {0}'.format(len(lose_xaxis)))
    axes[3].set_ylim((-0.02, 1.02))
    axes[3].set_xscale('log')
    axes[3].set_xlabel('artist average daily views', fontsize=label_fs)
    axes[3].set_ylabel('network contribution ratio ' + r'$\eta_v$',
                       fontsize=label_fs)
    axes[3].tick_params(**tick_style)
    axes[3].legend(frameon=False, loc='lower left')
    axes[3].set_title('(c)', fontsize=title_fs)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(w_pad=0.2)
    plt.savefig('../images/model_prediction_analysis.pdf', bbox_inches='tight')
    plt.show()
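

# A minimal sketch of the gini() helper: the standard (unweighted) Gini
# coefficient of a list of non-negative values, via the sorted-values
# identity. Reconstructed from usage above.
def gini(values):
    arr = np.sort(np.asarray(values, dtype=float))
    n = len(arr)
    index = np.arange(1, n + 1)
    return float(np.sum((2 * index - n - 1) * arr) / (n * np.sum(arr)))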


# Example 4
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    num_videos = data_loader.num_videos
    total_views = []

    # == == == == == == Part 3: Load network snapshot as cutoff value changes == == == == == == #
    for t in range(T):
        total_views.append(
            sum([embed_view_dict[embed][t] for embed in range(num_videos)]))

        snapshot_date = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        snapshot_filename = 'network_{0}.p'.format(snapshot_date)
        nodes_set = set()
        num_edges = 0
        embedded_graph = defaultdict(list)
        with open(
                os.path.join(data_prefix, 'network_pickle', snapshot_filename),
                'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src)]
            for embed_tar in range(num_videos):
                for embed_src, pos_src, _ in network_dict[embed_tar]:
                    if pos_src < CUTOFF:
                        embedded_graph[embed_src].append(embed_tar)
                        nodes_set.add(embed_src)
                        nodes_set.add(embed_tar)
                        num_edges += 1

        logging.info('>>> Graph embedding @ date {0} has been loaded!'.format(
            snapshot_date))
        logging.info('>>> {0} nodes and {1} edges in the graph'.format(
            len(nodes_set), num_edges))
        logging.info('    {0} views throughout the graph'.format(
            total_views[t]))

        # == == == == == == Part 4: Extract bow-tie structure == == == == == == #
        scc_content = tarjan(embedded_graph)
        scc_content = sorted(scc_content, key=lambda x: len(x), reverse=True)

        # largest SCC
        largest_scc = scc_content.pop(0)
        logging.info('>>> {0} ({1:.2f}%) nodes in the largest SCC'.format(
            len(largest_scc),
            len(largest_scc) / num_videos * 100))
        largest_scc_views = sum(
            [embed_view_dict[embed][t] for embed in largest_scc])
        logging.info('    {0} ({1:.2f}%) views in the largest SCC'.format(
            largest_scc_views, largest_scc_views / total_views[t] * 100))

        # find IN, OUT, Tendrils, Disconnected
        in_component = []
        num_scc_in = 0
        to_visit_scc = []
        for scc in scc_content:
            ret = is_in_component(scc, embedded_graph, largest_scc)
            if ret:
                in_component.extend(scc)
                num_scc_in += 1
            else:
                to_visit_scc.append(scc)
        logging.info('>>> {0} ({1:.2f}%) nodes in the IN component'.format(
            len(in_component),
            len(in_component) / num_videos * 100))
        logging.info('    {0} scc in the IN component'.format(num_scc_in))
        in_views = sum([embed_view_dict[embed][t] for embed in in_component])
        logging.info('    {0} ({1:.2f}%) views in the IN component'.format(
            in_views, in_views / total_views[t] * 100))

        out_component = []
        num_scc_out = 0
        to_visit_scc2 = []
        for scc in to_visit_scc:
            ret = is_out_component(scc, embedded_graph, largest_scc)
            if ret:
                out_component.extend(scc)
                num_scc_out += 1
            else:
                to_visit_scc2.append(scc)
        logging.info('>>> {0} ({1:.2f}%) nodes in the OUT component'.format(
            len(out_component),
            len(out_component) / num_videos * 100))
        logging.info('    {0} scc in the OUT component'.format(num_scc_out))
        out_views = sum([embed_view_dict[embed][t] for embed in out_component])
        logging.info('    {0} ({1:.2f}%) views in the OUT component'.format(
            out_views, out_views / total_views[t] * 100))

        tendrils = []
        num_scc_tendrils = 0
        disconnected = []
        num_disconnected = num_videos - len(nodes_set)
        num_scc_disconnected = 0
        for scc in to_visit_scc2:
            ret = is_out_component(scc, embedded_graph, in_component)
            if ret:
                tendrils.extend(scc)
                num_scc_tendrils += 1
            else:
                ret = is_in_component(scc, embedded_graph, out_component)
                if ret:
                    tendrils.extend(scc)
                    num_scc_tendrils += 1
                else:
                    disconnected.extend(scc)
                    num_scc_disconnected += 1
        logging.info('>>> {0} ({1:.2f}%) nodes in the Tendrils'.format(
            len(tendrils),
            len(tendrils) / num_videos * 100))
        logging.info('    {0} scc in the Tendrils'.format(num_scc_tendrils))
        tendrils_views = sum([embed_view_dict[embed][t] for embed in tendrils])
        logging.info('    {0} ({1:.2f}%) views in the Tendrils'.format(
            tendrils_views, tendrils_views / total_views[t] * 100))

        logging.info('>>> {0} ({1:.2f}%) nodes in the Disconnected'.format(
            num_disconnected + len(disconnected),
            (num_disconnected + len(disconnected)) / num_videos * 100))
        logging.info(
            '    {0} scc in the Disconnected'.format(num_disconnected +
                                                     num_scc_disconnected))
        disc_views = total_views[
            t] - largest_scc_views - in_views - out_views - tendrils_views
        logging.info('    {0} ({1:.2f}%) views in the Disconnected'.format(
            disc_views, disc_views / total_views[t] * 100))

        print('>>> Finish computing bowtie at day {0}...'.format(t + 1))

    timer.stop()
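

# Minimal sketches of the obj2str()/str2obj() date helpers assumed above,
# matching the '%Y-%m-%d' format used for the network pickle filenames; the
# [:10] slice tolerates a trailing time component and is an assumption.
def obj2str(date_obj):
    return date_obj.strftime('%Y-%m-%d')


def str2obj(date_str):
    return datetime.strptime(date_str[:10], '%Y-%m-%d')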


# Example 5
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos
    data_loader.load_embed_content_dict()
    embed_cid_dict = data_loader.embed_cid_dict
    embed_genre_dict = data_loader.embed_genre_dict

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    network_dict_list = []
    for t in range(T):
        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        filename = 'network_{0}.p'.format(target_date_str)
        network_dict = pickle.load(
            open(os.path.join(data_prefix, 'network_pickle', filename), 'rb'))
        for embed in network_dict:
            network_dict[embed] = [
                x[0] for x in network_dict[embed] if x[1] < NUM_REL
            ]
        network_dict_list.append(network_dict)

    persistent_src_embed_set = set()
    persistent_tar_embed_set = set()
    existing_edges = set()
    num_reciprocal_edges = 0
    num_same_artist = 0
    num_same_genre = 0

    with open(os.path.join(data_prefix, 'persistent_network.csv'),
              'w') as fout:
        fout.write('Source,Target\n')

        for tar_embed in range(num_videos):
            src_union_set = set()
            for t in range(T):
                src_union_set.update(set(network_dict_list[t][tar_embed]))

            for src_embed in src_union_set:
                linkage_list = [0] * T
                for t in range(T):
                    if src_embed in network_dict_list[t][tar_embed]:
                        linkage_list[t] = 1
                if is_persistent_link(linkage_list):
                    # filter: at least 100 daily views for target video,
                    # and the mean daily views of source video is at least 1% of the target video
                    src_mean = embed_avg_view_dict[src_embed]
                    tar_mean = embed_avg_view_dict[tar_embed]

                    if tar_mean >= 100 and src_mean >= 0.01 * tar_mean:
                        fout.write('{0},{1}\n'.format(src_embed, tar_embed))
                        persistent_src_embed_set.add(src_embed)
                        persistent_tar_embed_set.add(tar_embed)
                        if '{1}-{0}'.format(src_embed,
                                            tar_embed) in existing_edges:
                            num_reciprocal_edges += 1
                        if embed_cid_dict[src_embed] == embed_cid_dict[
                                tar_embed]:
                            num_same_artist += 1
                        if is_same_genre(embed_genre_dict[src_embed],
                                         embed_genre_dict[tar_embed]):
                            num_same_genre += 1
                        existing_edges.add('{0}-{1}'.format(
                            src_embed, tar_embed))

    print('{0} edges in the persistent network'.format(len(existing_edges)))
    print(
        '{0} source videos, {1} target videos, {2} videos appear in both set'.
        format(
            len(persistent_src_embed_set), len(persistent_tar_embed_set),
            len(persistent_src_embed_set.intersection(
                persistent_tar_embed_set))))
    print('{0} pairs of reciprocal edges'.format(num_reciprocal_edges))
    print('{0} ({1:.1f}%) edges belong to the same artist'.format(
        num_same_artist, 100 * num_same_artist / len(existing_edges)))
    print('{0} ({1:.1f}%) edges belong to the same genre'.format(
        num_same_genre, 100 * num_same_genre / len(existing_edges)))

    timer.stop()
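

# A plausible sketch of the is_same_genre() helper: two videos count as the
# same genre if their genre lists share at least one entry. The overlap
# criterion is an assumption reconstructed from usage.
def is_same_genre(genres_a, genres_b):
    return len(set(genres_a) & set(genres_b)) > 0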


# Example 6
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    data_prefix = '../data/'
    partition_dirname = './video_partition'
    result_dirname = './model_results'

    # == == == == == == Part 2: Load target videos set == == == == == == #
    tar_inlink_dict = defaultdict(list)
    with open(os.path.join(data_prefix, 'persistent_network.csv'), 'r') as fin:
        fin.readline()
        for line in fin:
            src_embed, tar_embed = map(int, line.rstrip().split(','))
            tar_inlink_dict[tar_embed].append(src_embed)
    tar_embed_list = list(sorted(tar_inlink_dict.keys()))

    # == == == == == == Part 3: Partition target videos == == == == == == #
    # partition tar embed into small files, each containing 50 videos
    if not os.path.exists(partition_dirname):
        os.makedirs(partition_dirname)
        num_embed_in_partition = 50
        partition_idx = 1
        partition_fout = open(
            os.path.join(partition_dirname,
                         'partition_{0:03d}.txt'.format(partition_idx)), 'w')
        embed_cnt = 0
        for embed in tar_embed_list:
            if embed_cnt == num_embed_in_partition:
                partition_idx += 1
                partition_fout.close()
                partition_fout = open(
                    os.path.join(
                        partition_dirname,
                        'partition_{0:03d}.txt'.format(partition_idx)), 'w')
                embed_cnt = 0
            partition_fout.write('{0}\n'.format(embed))
            embed_cnt += 1
        partition_fout.close()

    if not os.path.exists(result_dirname):
        os.makedirs(result_dirname)

    load_partition = True
    visited_embed_set = set()
    if load_partition:
        partition_idx = int(sys.argv[1])  # partition index from the command line
        result_filename = os.path.join(
            result_dirname,
            'forecast_tracker_{0:03d}.json'.format(partition_idx))
        if os.path.exists(result_filename):
            with open(result_filename, 'r') as fin:
                for line in fin:
                    result_json = json.loads(line.rstrip())
                    visited_embed_set.add(result_json['embed'])

        partition_filename = os.path.join(
            partition_dirname, 'partition_{0:03d}.txt'.format(partition_idx))
        tar_embed_list = []
        with open(partition_filename, 'r') as fin:
            for line in fin:
                embed = int(line.rstrip())
                if embed not in visited_embed_set:
                    tar_embed_list.append(embed)
    else:
        result_filename = os.path.join(result_dirname,
                                       'forecast_tracker_all.json')
    fout = open(result_filename, 'a')
    print('{0} videos to model'.format(len(tar_embed_list)))

    # == == == == == == Part 4: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict

    # == == == == == == Part 5: Start prediction task == == == == == == #
    for item_cnt, tar_embed in enumerate(tar_embed_list):
        timer = Timer()
        timer.start()

        tar_ts_data = embed_view_dict[tar_embed]
        true_value = tar_ts_data[-NUM_OUTPUT:]

        # naive method
        naive_model = Naive(tar_ts_data, num_output=NUM_OUTPUT)
        naive_smape = naive_model.evaluate()
        naive_pred = naive_model.pred_test_output

        # seasonal naive method
        snaive_model = SeasonalNaive(tar_ts_data, num_output=NUM_OUTPUT)
        snaive_smape = snaive_model.evaluate()
        snaive_pred = snaive_model.pred_test_output

        # autoregressive method
        ar_model = AutoRegression(tar_ts_data, num_output=NUM_OUTPUT)
        ar_model.train_ar(lag=FREQ)
        ar_smape = ar_model.evaluate()
        ar_pred = list(map(int, ar_model.pred_test_output))

        # RNN with LSTM units
        rnn_model = TemporalLSTM(tar_ts_data,
                                 num_input=NUM_INPUT,
                                 num_output=NUM_OUTPUT,
                                 num_features=1,
                                 num_neurons=NUM_NEURONS,
                                 freq=FREQ,
                                 num_ensemble=NUM_ENSEMBLE)
        rnn_model.prepare_tensor()
        rnn_model.create_model()
        rnn_model.train_lstm()
        rnn_smape = rnn_model.evaluate()
        rnn_pred = list(map(int, rnn_model.pred_test_output))

        # autoregressive with network method
        # preset AR coefficient
        preset_ar_coef = list(ar_model.fitted_params)
        # network feature method
        src_ts_data_mat = np.empty((0, T), int)  # np.int was removed in NumPy 1.20+
        for src_embed in tar_inlink_dict[tar_embed]:
            src_ts_data = np.array(embed_view_dict[src_embed])
            src_ts_data_mat = np.vstack((src_ts_data_mat, src_ts_data))

        arnet_model = ARNet(tar_ts_data,
                            src_ts_data_mat=src_ts_data_mat,
                            num_input=NUM_INPUT,
                            num_output=NUM_OUTPUT,
                            num_ensemble=NUM_ENSEMBLE)
        arnet_model.train_arnet(start_params=preset_ar_coef)
        arnet_smape = arnet_model.evaluate()
        arnet_pred = list(map(int, arnet_model.pred_test_output))

        fout.write('{0}\n'.format(
            json.dumps({
                'embed': tar_embed,
                'true_value': true_value,
                'naive_pred': naive_pred,
                'snaive_pred': snaive_pred,
                'ar_pred': ar_pred,
                'rnn_pred': rnn_pred,
                'arnet_pred': arnet_pred,
                'net_ratio': arnet_model.network_ratio,
                'incoming_embeds': tar_inlink_dict[tar_embed],
                'link_weights': arnet_model.link_weights.tolist()
            })))

        print(
            'embed: {0}, Naive: {1:.3f}, SeasonalNaive: {2:.3f}, AutoRegression: {3:.3f}, RNN: {4:.3f}, ARNet: {5:.3f}'
            .format(tar_embed, naive_smape, snaive_smape, ar_smape, rnn_smape,
                    arnet_smape))

        # release the fitted models before the next iteration
        del naive_model, snaive_model, ar_model, rnn_model, arnet_model
        gc.collect()

        timer.stop()

    fout.close()
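

# A minimal sketch of the hide_spines() helper: drop the top and right spines
# on every axis, a common cosmetic cleanup. Reconstructed from usage; the
# project's version may also adjust ticks.
def hide_spines(axes):
    for ax in np.ravel(axes):
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)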


# Example 7
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    target_day_indices = [0, 15, 30, 45]
    color_cycle_4 = ColorPalette.CC4
    date_labels = [
        'Sep 01, 2018', 'Sep 16, 2018', 'Oct 01, 2018', 'Oct 16, 2018'
    ]

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    target_day_view_list = [[], [], [], []]
    for embed in range(num_videos):
        for target_idx, target_day in enumerate(target_day_indices):
            target_day_view_list[target_idx].append(
                embed_view_dict[embed][target_day])

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    embed_indegree_dict = {
        embed: np.zeros((T, ))
        for embed in np.arange(num_videos)
    }  # daily indegree for each embed
    zero_indegree_list = []  # percentage of zero indegree for each day
    num_edges_list = []  # number of total edges for each day
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        indegree_list = []
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for tar_embed in range(num_videos):
                indegree_value = len(
                    [1 for x in network_dict[tar_embed] if x[1] < NUM_REL])
                embed_indegree_dict[tar_embed][t] = indegree_value
                indegree_list.append(indegree_value)
        indegree_counter = Counter(indegree_list)
        zero_indegree_list.append(indegree_counter[0] / num_videos)
        num_edges_list.append(sum(indegree_list))
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')
    print('\n>>> Average number of edges: {0:.0f}, max: {1:.0f}, min: {2:.0f}'.
          format(
              sum(num_edges_list) / len(num_edges_list), max(num_edges_list),
              min(num_edges_list)))

    fig, axes = plt.subplots(1, 3, figsize=(12, 4.5))
    ax1, ax2, ax3 = axes.ravel()

    # == == == == == == Part 4: Plot ax1 indegree CCDF == == == == == == #
    embed_avg_indegree_dict = defaultdict(float)
    for t in range(T):
        for embed in range(num_videos):
            embed_avg_indegree_dict[embed] += embed_indegree_dict[embed][t] / T

    indegree_ranked_embed_list = [
        x[0] for x in sorted(embed_avg_indegree_dict.items(),
                             key=lambda kv: kv[1],
                             reverse=True)
    ]
    top_20_indegree_embeds = indegree_ranked_embed_list[:20]
    popular_ranked_embed_list = [
        x[0] for x in sorted(
            embed_avg_view_dict.items(), key=lambda kv: kv[1], reverse=True)
    ]
    top_20_popular_embeds = popular_ranked_embed_list[:20]

    for target_idx, target_day in enumerate(target_day_indices):
        indegree_list = []
        for embed in range(num_videos):
            indegree_list.append(embed_indegree_dict[embed][target_day])

        print(
            'video with 10 indegree has more in-links than {0:.2f}% videos on date {1}'
            .format(percentileofscore(indegree_list, 10),
                    date_labels[target_idx]))
        print(
            'video with 20 indegree has more in-links than {0:.2f}% videos on date {1}'
            .format(percentileofscore(indegree_list, 20),
                    date_labels[target_idx]))

        plot_ccdf(indegree_list,
                  ax=ax1,
                  color=color_cycle_4[target_idx],
                  label=date_labels[target_idx])

    # compute the powerlaw fit
    powerlaw_fit = Fit(list(embed_avg_indegree_dict.values()))
    infer_alpha = powerlaw_fit.power_law.alpha
    p = powerlaw_fit.power_law.ccdf()
    ins_x_axis = powerlaw_fit.power_law.parent_Fit.data[:int(0.9 * len(p))]
    ins_y_axis = 0.1 * p[:int(0.9 * len(p))]

    ax1.plot(ins_x_axis, ins_y_axis, 'k:')
    ax1.text(0.4,
             0.6,
             r'$x^{{{0:.2f}}}$'.format(-infer_alpha + 1),
             size=12,
             ha='right',
             va='bottom',
             transform=ax1.transAxes)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlabel('indegree', fontsize=11)
    ax1.set_ylabel(r'$P(X \geq x)$', fontsize=11)
    ax1.tick_params(axis='both', which='major', labelsize=10)
    ax1.set_title('(a) indegree distribution', fontsize=12)

    ax1.legend(frameon=False, fontsize=11, ncol=1, fancybox=False, shadow=True)

    mean_zero_indegree = sum(zero_indegree_list) / len(zero_indegree_list)

    ax1.axhline(y=1 - mean_zero_indegree, color='k', linestyle='--', zorder=30)
    ax1.text(0.96,
             0.9,
             '{0:.0f}% with 0 indegree'.format(mean_zero_indegree * 100),
             size=11,
             transform=ax1.transAxes,
             ha='right',
             va='top')

    # == == == == == == Part 5: Plot ax2 views distribution == == == == == == #
    for target_idx, views_list in enumerate(target_day_view_list):
        x_values = range(100)
        y_values = [np.percentile(views_list, x) for x in x_values]
        ax2.plot(x_values,
                 y_values,
                 color=color_cycle_4[target_idx],
                 label=date_labels[target_idx])
    ax2.set_yscale('log')
    ax2.set_xlabel('views percentile', fontsize=11)
    ax2.set_ylabel('num of views', fontsize=11)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_title('(b) daily views vs. its percentile', fontsize=12)

    avg_views_list = sorted(list(embed_avg_view_dict.values()), reverse=True)
    gini_coef = gini(avg_views_list)
    print('top 1% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.01 * num_videos)]) / sum(avg_views_list) *
        100))
    print('top 10% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.1 * num_videos)]) / sum(avg_views_list) *
        100))
    print('Gini coef: {0:.3f}'.format(gini_coef))

    spearman_degree = [
        embed_avg_indegree_dict[embed] for embed in range(num_videos)
    ]
    spearman_views = [
        embed_avg_view_dict[embed] for embed in range(num_videos)
    ]

    print(
        'Spearman correlation between views and indegree: {0:.4f}, pvalue: {1:.2f}'
        .format(*spearmanr(spearman_views, spearman_degree)))

    median_views = np.median(avg_views_list)
    top_views_90th = np.percentile(avg_views_list, 90)
    top_views_99th = np.percentile(avg_views_list, 99)
    ax2_xmin = ax2.get_xlim()[0]
    ax2_ymin = ax2.get_ylim()[0]

    ax2.plot((50, 50), (ax2_ymin, median_views),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 50), (median_views, median_views),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.49,
             0.45,
             'median views {0:,.0f}'.format(median_views),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    ax2.plot((90, 90), (ax2_ymin, top_views_90th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 90), (top_views_90th, top_views_90th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.88,
             0.75,
             '90th views {0:,.0f}'.format(top_views_90th),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    ax2.plot((99, 99), (ax2_ymin, top_views_99th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 99), (top_views_99th, top_views_99th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.91,
             0.95,
             '99th views {0:,.0f}'.format(top_views_99th),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    # == == == == == == Part 6: Plot ax3 video uploading trend == == == == == == #
    x_axis = range(2009, 2018)
    x_labels = ["'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16", "'17"]
    upload_mat = np.zeros((len(x_axis), 8))

    target_topics = [
        'Pop_music', 'Rock_music', 'Hip_hop_music', 'Independent_music',
        'Country_music', 'Electronic_music', 'Soul_music', 'Others'
    ]
    topic_labels = [
        'Pop', 'Rock', 'Hip hop', 'Independent', 'Country', 'Electronic',
        'Soul', 'Others'
    ]

    color_cycle_8 = ColorPalette.CC8

    data_loader.load_embed_content_dict()
    embed_title_dict = data_loader.embed_title_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    embed_genre_dict = data_loader.embed_genre_dict

    for embed in range(num_videos):
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if 2009 <= upload_year <= 2017:
            year_idx = upload_year - 2009

            genres = embed_genre_dict[embed]
            if len(genres) == 0:
                # add one to "Others" genre
                upload_mat[year_idx, 7] += 1
            else:
                for genre in genres:
                    upload_mat[year_idx,
                               target_topics.index(genre)] += 1 / len(genres)

    print()
    print([
        '{0}: {1}'.format(topic, int(num))
        for topic, num in zip(target_topics, np.sum(upload_mat, axis=0))
    ])

    stackedBarPlot(ax=ax3,
                   data=upload_mat,
                   cols=color_cycle_8,
                   edgeCols=['#000000'] * 8,
                   xlabel='uploaded year',
                   ylabel='num of videos',
                   scale=False,
                   endGaps=True)

    ax3.tick_params(axis='both', which='major', labelsize=9)
    ax3.set_xticks(np.arange(len(x_axis)))
    ax3.set_xticklabels(x_labels)
    ax3.yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    ax3.legend([
        plt.Rectangle((0, 0), 1, 1, fc=c, ec='k', alpha=0.6)
        for c in color_cycle_8
    ],
               topic_labels,
               fontsize=9,
               frameon=False,
               handletextpad=0.2,
               columnspacing=0.3,
               ncol=4,
               bbox_to_anchor=(1, -0.12),
               bbox_transform=ax3.transAxes,
               fancybox=False,
               shadow=True)
    ax3.set_title('(c) VEVO videos uploading trend', fontsize=12)

    union_top_set = set(top_20_indegree_embeds).union(top_20_popular_embeds)
    print('\n>>> Size of the union set at cutoff 15:', len(union_top_set))
    print('{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.
          format('Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views',
                 '-rank'))
    for embed in top_20_indegree_embeds:
        print(
            '{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'
            .format(
                embed_title_dict[embed].split(
                    ' - ', 1)[1].split('(')[0].split('ft')[0].strip(),
                embed_title_dict[embed].split(
                    ' - ',
                    1)[0].split('&')[0].split(',')[0].strip(), '{0:,}'.format(
                        (datetime(2018, 11, 2) -
                         str2obj(embed_uploadtime_dict[embed])).days),
                '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                '{0:,}'.format(top_20_indegree_embeds.index(embed) + 1),
                '{0:,}'.format(int(embed_avg_view_dict[embed])),
                '{0:,}'.format(popular_ranked_embed_list.index(embed) + 1)))

    print('\n{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.
          format('Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views',
                 '-rank'))
    for embed in top_20_popular_embeds:
        # same title/artist parsing as above
        artist_part, title_part = embed_title_dict[embed].split(' - ', 1)
        song_title = title_part.split('(')[0].split('ft')[0].strip()
        artist = artist_part.split('&')[0].split(',')[0].strip()
        age_in_days = (datetime(2018, 11, 2) - str2obj(embed_uploadtime_dict[embed])).days
        print('{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'.format(
            song_title, artist,
            '{0:,}'.format(age_in_days),
            '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
            '{0:,}'.format(indegree_ranked_embed_list.index(embed) + 1),
            '{0:,}'.format(int(embed_avg_view_dict[embed])),
            '{0:,}'.format(top_20_popular_embeds.index(embed) + 1)))

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_basic_statistics.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
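
# str2obj and obj2str are date helpers imported elsewhere in this project; their
# bodies are not part of this listing. A minimal sketch, assuming the '%Y-%m-%d'
# format -- consistent with the strftime('%Y-%m-%d') call that builds the same
# 'network_{}.p' filenames in a later main():
from datetime import datetime

def obj2str(dt):
    # datetime -> 'YYYY-MM-DD' string, matching the network pickle filenames
    return dt.strftime('%Y-%m-%d')

def str2obj(s):
    # 'YYYY-MM-DD' string -> datetime; ignoring any time suffix is an
    # assumption of this sketch
    return datetime.strptime(s[:10], '%Y-%m-%d')
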
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'
    year_labels = [
        "all years", "'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16",
        "'17", "'18"
    ]
    num_year = len(year_labels) - 1

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    data_loader.load_embed_content_dict()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    num_videos = data_loader.num_videos

    for embed in range(num_videos):
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if upload_year >= 2009:
            year_idx = upload_year - 2009
        else:
            # videos uploaded before 2009 are folded into the '09 bucket
            year_idx = 0
        # reuse the dict to store each video's year index instead of its date string
        embed_uploadtime_dict[embed] = year_idx

    views_by_years_list = [[] for _ in range(num_year)]
    indegrees_by_years_list = [[] for _ in range(num_year)]

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    # daily indegree of each video, counting only incoming links that appear
    # within the first NUM_REL_15 positions of the relevant-video lists
    embed_indegree_dict_15 = {
        embed: np.zeros((T, ))
        for embed in np.arange(num_videos)
    }
    for t in range(T):
        filename = 'network_{0}.p'.format(
            obj2str(datetime(2018, 9, 1) + timedelta(days=t)))
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src)]
            for embed in range(num_videos):
                embed_indegree_dict_15[embed][t] = len(
                    [1 for x in network_dict[embed] if x[1] < NUM_REL_15])
        print('>>> Finished loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')

    for embed in range(num_videos):
        views_by_years_list[embed_uploadtime_dict[embed]].append(
            embed_avg_view_dict[embed])
        indegrees_by_years_list[embed_uploadtime_dict[embed]].append(
            np.mean(embed_indegree_dict_15[embed]))

    spearman_traces = []
    all_views, all_indegrees = [], []
    for i in range(num_year):
        all_views.extend(views_by_years_list[i])
        all_indegrees.extend(indegrees_by_years_list[i])
    all_spearman = spearmanr(all_views, all_indegrees)
    print('\n>>> {0}'.format(year_labels[0]), all_spearman)
    spearman_traces.append(all_spearman[0])
    for i in range(num_year):
        yearly_spearman = spearmanr(views_by_years_list[i],
                                    indegrees_by_years_list[i])
        spearman_traces.append(yearly_spearman[0])
        print('>>> {0} year'.format(year_labels[1 + i]), yearly_spearman)

    # == == == == == == Part 4: Plotting script == == == == == == #
    fig, ax1 = plt.subplots(1, 1, figsize=(8, 2))
    tomato = ColorPalette.TOMATO
    blue = ColorPalette.BLUE

    bar1 = ax1.bar(range(num_year + 1),
                   spearman_traces,
                   edgecolor=['k'] * (num_year + 1),
                   color=[tomato] + [blue] * num_year,
                   lw=1)
    for rect in bar1:
        height = rect.get_height()
        ax1.text(rect.get_x() + rect.get_width() / 2.0,
                 height,
                 '{0:.3f}'.format(height),
                 ha='center',
                 va='bottom')

    ax1.set_xticks(np.arange(num_year + 1))
    ax1.set_xticklabels(year_labels)
    ax1.set_ylabel(r'spearman $\rho$')

    hide_spines(ax1)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_spearmanr.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
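
# hide_spines and concise_fmt are plotting helpers imported elsewhere; their
# definitions do not appear in this listing. A plausible sketch, inferred only
# from the call sites (hide_spines receives an Axes or an array of Axes;
# concise_fmt is wrapped in a FuncFormatter, so it takes a tick value and a tick
# position) -- the exact abbreviation rules below are assumptions:
import numpy as np

def hide_spines(axes):
    # strip the top and right spines from one Axes or an array of Axes
    for ax in np.atleast_1d(axes).ravel():
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

def concise_fmt(x, pos=None):
    # abbreviate large tick values, e.g. 350000 -> '350K', 2000000 -> '2M'
    if x >= 1e6:
        return '{0:g}M'.format(x / 1e6)
    if x >= 1e3:
        return '{0:g}K'.format(x / 1e3)
    return '{0:g}'.format(x)
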
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    num_videos = data_loader.num_videos

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    embed_indegree_dict = {embed: np.zeros((T,)) for embed in np.arange(num_videos)}
    edge_frequency_dict = defaultdict(int)
    for t in range(T):
        filename = 'network_{0}.p'.format((datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        with open(os.path.join(data_prefix, 'network_pickle', filename), 'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for embed_tar in range(num_videos):
                inlinks = [x for x in network_dict[embed_tar] if x[1] < NUM_REL]
                if len(inlinks) > 0:
                    for embed_src, _, _ in inlinks:
                        edge_frequency_dict['{0}-{1}'.format(embed_src, embed_tar)] += 1
                embed_indegree_dict[embed_tar][t] = len(inlinks)
        print('>>> Finished loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')

    link_frequency_counter = Counter(edge_frequency_dict.values())

    # == == == == == == Part 4: Plot how indegree changes == == == == == == #
    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO

    fig, axes = plt.subplots(1, 2, figsize=(12, 4.1))
    ax1, ax2 = axes.ravel()
    indegree_change_dict = defaultdict(list)
    for embed in range(num_videos):
        for t in range(T-1):
            x0 = embed_indegree_dict[embed][t]
            x1 = embed_indegree_dict[embed][t+1]
            if x0 >= 10:
                indegree_change_dict[x0].append((x1-x0) / x0)

    # only keep indegree values backed by at least 100 observed day-to-day pairs
    x_axis = sorted([x for x in indegree_change_dict.keys() if len(indegree_change_dict[x]) >= 100])

    # shade nested percentile bands (5th to 95th), fading outwards from the median
    for i in np.arange(5, 50, 5):
        ax1.fill_between(x_axis, [smoothing(indegree_change_dict, x, 50 - i) for x in x_axis],
                         [smoothing(indegree_change_dict, x, 55 - i) for x in x_axis],
                         facecolor=cornflower_blue, alpha=(100 - 2 * i) / 100, lw=0)
        ax1.fill_between(x_axis, [smoothing(indegree_change_dict, x, 45 + i) for x in x_axis],
                         [smoothing(indegree_change_dict, x, 50 + i) for x in x_axis],
                         facecolor=cornflower_blue, alpha=(100 - 2 * i) / 100, lw=0)

    for i in [25, 75]:
        ax1.plot(x_axis, [smoothing(indegree_change_dict, x, i) for x in x_axis], color=cornflower_blue, alpha=0.8, zorder=15)
    ax1.plot(x_axis, [smoothing(indegree_change_dict, x, 50) for x in x_axis], color=cornflower_blue, alpha=1, zorder=15)

    ax1.set_ylim([-0.9, 0.9])
    ax1.set_xlabel('indegree', fontsize=12)
    ax1.set_ylabel('next-day indegree change ratio', fontsize=12)
    ax1.set_title('(a)', fontsize=12)
    ax1.tick_params(axis='both', which='major', labelsize=10)

    plot_contour(indegree_change_dict, target_x=100, ax=ax1)

    x_axis = range(1, 1 + T)
    y_axis = [link_frequency_counter[x] for x in x_axis]

    print('\nephemeral links of frequency 1: {0}, {1:.2f}%'.format(y_axis[0], y_axis[0] / sum(y_axis) * 100))
    print('persistent links of frequency {0}: {1}, {2:.2f}%'.format(T, y_axis[-1], y_axis[-1] / sum(y_axis) * 100))

    ax2.plot(x_axis, y_axis, 'o-', c=tomato, mfc='none', mec=tomato, ms=4)
    ax2.set_xlabel('link frequency', fontsize=12)
    ax2.set_ylabel('num of video-to-video pairs', fontsize=12)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_title('(b)', fontsize=12)

    ax2.annotate('ephemeral links', fontsize=12,
                 xy=(3, 350000), xycoords='data',
                 xytext=(17, 350000), textcoords='data',
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3'))

    ax2.annotate('persistent links', fontsize=12,
                 xy=(61, 35000), xycoords='data',
                 xytext=(35, 55000), textcoords='data',
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3'))

    ax2.yaxis.set_major_formatter(FuncFormatter(concise_fmt))

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_temporal_micro.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
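
# smoothing is another project helper not shown in this listing. From its call
# sites it maps (change-ratio dict, indegree value, percentile) to a smoothed
# percentile of the next-day change ratios. A minimal sketch, assuming a simple
# sliding window over neighbouring indegree values (the window radius is an
# assumption of this sketch, not taken from the original code):
import numpy as np

def smoothing(change_dict, x, percentile, radius=2):
    # pool the change ratios of indegrees within `radius` of x, then take the
    # requested percentile; callers only pass x values backed by >= 100 samples,
    # so the pooled window is never empty
    pooled = []
    for key in range(int(x) - radius, int(x) + radius + 1):
        pooled.extend(change_dict.get(key, []))
    return np.percentile(pooled, percentile)
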
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict

    # == == == == == == Part 3: Load persistent and non-persistent network == == == == == == #
    reciprocal_link_set = set()
    persistent_link_set = set()
    non_persistent_link_set = set()

    with open(os.path.join(data_prefix, 'persistent_network.csv'), 'r') as fin:
        fin.readline()
        for line in fin:
            src_embed, tar_embed = map(int, line.rstrip().split(','))

            link = '{0}-{1}'.format(src_embed, tar_embed)
            rec_link = '{1}-{0}'.format(src_embed, tar_embed)
            if rec_link in persistent_link_set:
                persistent_link_set.remove(rec_link)
                reciprocal_link_set.add(link)
            else:
                persistent_link_set.add(link)

    for t in range(T):
        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        filename = 'network_{0}.p'.format(target_date_str)
        network_dict = pickle.load(open(os.path.join(data_prefix, 'network_pickle', filename), 'rb'))
        for tar_embed in network_dict:
            src_embed_list = [x[0] for x in network_dict[tar_embed] if x[1] < NUM_REL]
            if len(src_embed_list) > 0:
                for src_embed in src_embed_list:
                    # filter: at least 100 daily views for target video,
                    # and the mean daily views of source video is at least 1% of the target video
                    if embed_avg_view_dict[tar_embed] >= 100 and embed_avg_view_dict[src_embed] >= 0.01 * embed_avg_view_dict[tar_embed]:
                        link = '{0}-{1}'.format(src_embed, tar_embed)
                        rec_link = '{1}-{0}'.format(src_embed, tar_embed)
                        if link not in persistent_link_set and rec_link not in persistent_link_set \
                                and link not in reciprocal_link_set and rec_link not in reciprocal_link_set \
                                and link not in non_persistent_link_set and rec_link not in non_persistent_link_set:
                            non_persistent_link_set.add(link)

    print('>>> Number of reciprocal links: {0}'.format(len(reciprocal_link_set)))
    print('>>> Number of persistent links (non-reciprocal): {0}'.format(len(persistent_link_set)))
    print('>>> Number of ephemeral links: {0}'.format(len(non_persistent_link_set)))

    for link_set, log_filename in zip([reciprocal_link_set, persistent_link_set, non_persistent_link_set],
                                      ['./reciprocal_pearsonr.log', './persistent_pearsonr.log',
                                       './ephemeral_pearsonr.log']):
        with open(log_filename, 'w') as log_file:
            for link in link_set:
                src_embed, tar_embed = map(int, link.split('-'))
                eff_size, pvalue = pearsonr(detsn(embed_view_dict[src_embed]), detsn(embed_view_dict[tar_embed]))
                log_file.write('{0},{1},{2},{3}\n'.format(src_embed, tar_embed, eff_size, pvalue))

    timer.stop()
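
# detsn is applied to both daily view series before the Pearson correlation but
# is not defined in this listing. The name suggests de-trending/de-seasonalizing,
# which would keep the weekly cycle shared by nearly all YouTube view series from
# inflating every pairwise correlation. A hypothetical stand-in (the real helper
# may differ) that removes the weekly component by seasonal differencing:
import numpy as np

def detsn(series, period=7):
    # difference the series at a 7-day lag; both inputs to pearsonr keep the
    # same length (T - period), so the correlation stays well defined
    arr = np.asarray(series, dtype=float)
    return arr[period:] - arr[:-period]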