Example #1
def test_svm(X, y, path):
    data_set = 'cardio'

    print("Predicting 1")
    probabilit_list = []
    dtc = pickle.load(open(path + 'model/' + data_set + '/svm_model_1', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    print("Predicting 2")
    dtc = pickle.load(open(path + 'model/' + data_set + '/svm_model_2', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    color_list = ['r', 'b']
    label_list = ['kernel = linear', 'kernel = polynomial']

    plt = multiple_precision_recall_curves(y, probabilit_list, color_list,
                                           label_list)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.5, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Support Vector Machine Precision-Recall Curve')
    plt.legend(loc="best")

    # plt.show()
    save_figure(plt, path + "plot/" + data_set, 'svm_pr_curve.png')
Example #2
def spatial_statistics(trainsize, r, regs):
    """Plot spatially averaged temperature and spacially itegrated (summed)
    species concentrations over the full time domain.

    Parameters
    ----------
    trainsize : int
        Number of snapshots used to train the ROM.

    r : int
        Dimension of the ROM.

    regs : two or three positive floats
        Regularization hyperparameters used to train the ROM.
    """
    # Load the true results.
    keys = [f"{var}_mean" for var in config.ROM_VARIABLES[:4]]
    keys += [f"{var}_sum" for var in config.SPECIES]
    feature_gems, t = utils.load_spatial_statistics(keys)
    keys = np.reshape(keys, (4, 2), order='F')

    # Load and simulate the ROM.
    t, V, qbar, scales, q_rom = simulate_rom(trainsize, r, regs)

    # Initialize the figure.
    fig, axes = plt.subplots(keys.shape[0],
                             keys.shape[1],
                             figsize=(9, 6),
                             sharex=True)

    # Calculate and plot the results.
    for ax, key in zip(axes.flat, keys.flat):
        with utils.timed_block("Reconstructing"):
            feature_rom = get_feature(key, q_rom, V, qbar, scales)
        ax.plot(t, feature_gems[key], lw=1, **config.GEMS_STYLE)
        ax.plot(t[:q_rom.shape[1]], feature_rom, lw=1, **config.ROM_STYLE)
        ax.axvline(t[trainsize], color='k')
        ax.set_ylabel(config.VARLABELS[key.split('_')[0]])
        ax.locator_params(axis='y', nbins=2)

    # Set titles, labels, ticks, and draw a single legend.
    for ax in axes[-1, :]:
        ax.set_xlim(t[0], t[-1])
        ax.set_xticks(np.arange(t[0], t[-1] + .001, .002))
        ax.set_xlabel("Time [s]", fontsize=12)
    axes[0, 0].set_title("Spatial Averages", fontsize=14)
    axes[0, 1].set_title("Spatial Integrals", fontsize=14)

    # Legend on the right.
    fig.tight_layout(rect=[0, 0, .85, 1])
    leg = axes[0, 0].legend(loc="center right",
                            fontsize=14,
                            bbox_to_anchor=(1, .5),
                            bbox_transform=fig.transFigure)
    for line in leg.get_lines():
        line.set_linewidth(2)

    utils.save_figure("statfeatures.pdf")
Example #3
 def test_save_and_load_figure(self):
     fig_object = plt.figure()
     file_path = 'test_file'
     utils.save_figure(fig_object=fig_object, file_path=file_path)
     loaded_fig_object = utils.load_figure(file_path=file_path)
     os.remove(file_path + '.pkl')
     os.remove(file_path + '.pdf')
     self.assertEqual(fig_object.images, loaded_fig_object.images)
     self.assertEqual(fig_object.axes, loaded_fig_object.axes)
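
The assertions above pin down the contract of this utils module's save_figure/load_figure pair: saving writes both file_path + '.pkl' and file_path + '.pdf', and loading restores the figure from the pickle. A minimal sketch consistent with that contract, not necessarily the project's implementation:

import pickle


def save_figure(fig_object, file_path):
    # Persist the live Figure for later reloading, plus a rendered PDF.
    with open(file_path + '.pkl', 'wb') as f:
        pickle.dump(fig_object, f)
    fig_object.savefig(file_path + '.pdf')


def load_figure(file_path):
    # Matplotlib Figure objects are picklable, so this round-trips the
    # full figure state, including its axes and images.
    with open(file_path + '.pkl', 'rb') as f:
        return pickle.load(f)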
Example #4
def save_supplementary_figure_1(posts_df, post_url_df, url_df):

    accounts_to_plot = [
        'Pharmaceuticals Exposed', 'Humanity vs Insanity - The CRANE Report',
        'News2morrow', 'Truth Train', 'The British Constitution Group',
        "Arnica - Parents' Support Network, Promoting Natural Immunity",
        'ROKOTUSKRIITTISET', 'Canadian Freedom Fighters',
        "'FACEBOOK CENSORED NEWS'", 'Tampa Bay Trump Club'
    ]

    fig = plt.figure(figsize=(10, 12))

    for idx in range(len(accounts_to_plot)):
        ax = plt.subplot(5, 2, idx + 1)
        plt.title(accounts_to_plot[idx])

        account_id = posts_df[posts_df['account_name'] ==
                              accounts_to_plot[idx]].account_id.unique()[0]
        fake_news_dates = compute_fake_news_dates(post_url_df, url_df,
                                                  account_id)
        plot_one_group(ax,
                       posts_df,
                       account_id,
                       fake_news_dates=fake_news_dates)

        repeat_offender_periods = compute_repeat_offender_periods(
            fake_news_dates)
        repeat_offender_periods = merge_overlapping_periods(
            repeat_offender_periods)
        for period in repeat_offender_periods:
            plt.axvspan(period[0],
                        period[1],
                        ymin=1 / 11,
                        facecolor='C3',
                        alpha=0.1)

        if idx == 0:
            legend1 = plt.legend(loc='upper left')
            plt.ylim(top=40)
        elif idx == 1:
            patch1 = mpatches.Patch(facecolor='white',
                                    alpha=0.4,
                                    edgecolor='k')
            patch2 = mpatches.Patch(facecolor='pink', alpha=0.4, edgecolor='k')
            legend2 = plt.legend(
                [patch1, patch2],
                ["'No strike' periods", "'Repeat offender' periods"],
                loc='upper right',
                framealpha=1)
        plt.axvline(x=np.datetime64("2020-06-09"),
                    color='black',
                    linestyle='--',
                    linewidth=1)

    plt.tight_layout()
    save_figure('supplementary_figure_1', folder='ip&m', dpi=100)
Example #5
 def plot_likelihood(self, prefix, suffix):
     plt.figure()
     self.llh = np.asarray(self.llh)
     N = len(self.llh)
     plt.plot(range(1, (N + 1)), self.llh, c=COLORS[0])
     plt.ylabel('log likelihood')
     plt.xlabel('EM iterations')
     lgd = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
     utils.save_figure('./img/hmm/', plt, prefix, suffix, lgd)
     return
Example #6
def test_boosted_decision_tree_loan(X, y, path):
    data_set = 'loan'
    probabilit_list = []
    dtc = pickle.load(
        open(path + 'model/' + data_set + '/boosted_dtc_model_nodes_1', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/boosted_dtc_model_nodes_2', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/boosted_dtc_model_nodes_3', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/boosted_dtc_model_nodes_4', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/boosted_dtc_none', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    color_list = ['r', 'y', 'g', 'm', 'b']
    label_list = ['MD = 4', 'MD = 8', 'MD = 15', 'MD = 30', 'MD = None']

    plt = multiple_precision_recall_curves(y, probabilit_list, color_list,
                                           label_list)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(
        'Boosted Decision Tree Classifier with Max Depth Pruning (MD) \n Precision-Recall Curve '
    )
    plt.legend(loc="best")

    save_figure(plt, path + "plot/" + data_set,
                'boosted_dtc_max_depth_plots.png')
Example #7
def test_decision_tree_max_leaf(X, y, path):
    data_set = 'cardio'
    probabilit_list = []

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/dtc_model_leaf_nodes_1', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/dtc_model_leaf_nodes_2', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/dtc_model_leaf_nodes_3', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/dtc_model_leaf_nodes_4', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(open(path + 'model/' + data_set + '/dtc_none', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)

    color_list = ['r', 'y', 'g', 'm', 'b']
    label_list = [
        'MLN = 20', 'MLN = 100', 'MLN = 1000', 'MLN = 2000', 'MLN = None'
    ]

    plt = multiple_precision_recall_curves(y, probabilit_list, color_list,
                                           label_list)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.5, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(
        'Decision Tree Classifier with Max Leaf Nodes (MLN) \n Precision-Recall Curve '
    )
    plt.legend(loc="best")

    save_figure(plt, path + "plot/" + data_set, 'dtc_max_leaf_nodes_plots.png')
Example #8
def test_best_models_loan(X, y, path):
    data_set = 'loan'
    probabilit_list = []
    dtc = pickle.load(
        open(path + 'model/' + data_set + '/dtc_model_depth_2', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/boosted_dtc_model_nodes_1', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/neural_net_model_4', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)

    # dtc = pickle.load(open(path + 'model/' + data_set + '/svm_model_1', 'rb'))
    # probs = dtc.predict_proba(X)
    # probs = probs[:, 1]
    # probabilit_list.append(probs)

    dtc = pickle.load(open(path + 'model/' + data_set + '/kNN_model_4', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)

    # with svm
    # color_list = ['r', 'b', 'm', 'y', 'g']
    # label_list = ['model = decision tree', 'model = boosted decision tree', 'model = neural network', 'model = SVM', 'model = kNN']

    # without svm
    color_list = ['r', 'b', 'm', 'g']
    label_list = [
        'model = decision tree', 'model = boosted decision tree',
        'model = neural network', 'model = kNN'
    ]

    plt = multiple_precision_recall_curves(y, probabilit_list, color_list,
                                           label_list)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall Curve of Best Model for each Algorithm')
    plt.legend(loc="best")

    # plt.show()
    save_figure(plt, path + "plot/" + data_set, 'best_models_pr_curve.png')
Example #9
 def plot_likelihood(self, prefix, suffix):
     plt.figure()
     self.llh = np.asarray(self.llh)
     N = len(self.llh)
     labels = ['train llh', 'test llh']
     for i in range(2):
         plt.plot(range(1, (N + 1)),
                  self.llh[:, i], c=COLORS[i], label=labels[i])
     plt.ylabel('log likelihood')
     plt.xlabel('EM iterations')
     lgd = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
     utils.save_figure(plt, prefix, suffix, lgd)
     return
Example #10
def save_supplementary_figure_2(posts_fake):
    list_complete_groups_id = []
    for id in posts_fake['account_id'].unique():
        posts_df_group = posts_fake[posts_fake["account_id"] == id]
        if ((np.min(posts_df_group['date']) == np.min(posts_fake['date'])) &
            (np.max(posts_df_group['date']) == np.max(posts_fake['date']))):
            list_complete_groups_id.append(id)
    posts_fake_temp = posts_fake[posts_fake["account_id"].isin(
        list_complete_groups_id)]

    plot_group_average(posts_fake_temp, title_detail="Repeat offenders")
    save_figure('supplementary_figure_2', folder='ip&m', dpi=100)
    print_evolution_percentages(posts_fake_temp)
Example #11
def test_neural_net_loan(X, y, path):
    data_set = 'loan'
    probabilit_list = []
    dtc = pickle.load(
        open(path + 'model/' + data_set + '/neural_net_model_1', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/neural_net_model_2', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/neural_net_model_3', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(
        open(path + 'model/' + data_set + '/neural_net_model_4', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    color_list = ['r', 'b', 'm', 'y']
    label_list = [
        'HLS = 20 x 5', 'HLS = 50 x 5', 'HLS = 100 x 5', 'HLS = 500 x 5'
    ]

    plt = multiple_precision_recall_curves(y, probabilit_list, color_list,
                                           label_list)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(
        'Neural Network with Varying Hidden Layer Size (HLS) \n Precision-Recall Curve'
    )
    plt.legend(loc="best")

    # plt.show()
    save_figure(plt, path + "plot/" + data_set, 'neural_net_pr_curve.png')
Example #12
 def __plot_states(self, data, T_max, title, prefix, suffix, plot_type='plot'):
     f, axarr = plt.subplots(self.K,  sharex=True)
     for i in range(self.K):
         if plot_type == 'step':
             axarr[i].step(range(T_max), data[:T_max, i],
                           c=COLORS[i], label="State %d" % (i + 1))
         else:
             axarr[i].plot(range(T_max), data[:T_max, i],
                           c=COLORS[i], label="State %d" % (i + 1))
         axarr[i].legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
     f.subplots_adjust(hspace=0.2)
     lgd = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
     plt.title(title, y=(self.K + 0.8))
     plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
     utils.save_figure(plt, prefix, suffix, lgd)
     return
Example #13
def save_all_groups_figures(posts_df, pages_df):

    group_index = 0
    for account_id in posts_df['account_id'].unique():

        if group_index % 10 == 0:
            plt.figure(figsize=(12, 14))

        ax = plt.subplot(5, 2, group_index % 10 + 1)

        account_name = posts_df[posts_df['account_id'] ==
                                account_id].account_name.unique()[0]
        plt.title(account_name, size="x-large")
        reduced_distribution_date = pages_df[pages_df['page_name'] ==
                                             account_name]['date'].values[0]

        plot_one_group(ax, posts_df, account_id, fake_news_dates=[])

        xticks = [
            np.datetime64('2019-01-01'),
            np.datetime64('2019-05-01'),
            np.datetime64('2019-09-01'),
            np.datetime64('2020-01-01'),
            np.datetime64('2020-05-01'),
            np.datetime64('2020-09-01'),
            np.datetime64(reduced_distribution_date)
        ]
        plt.xticks(xticks, rotation=30, ha='right')
        plt.gca().get_xticklabels()[-1].set_color('red')
        plt.axvline(x=np.datetime64(reduced_distribution_date),
                    color='C3',
                    linestyle='--',
                    linewidth=2)

        if group_index % 10 == 0:
            plt.legend()

        if (group_index % 10 == 9) | (group_index
                                      == posts_df['account_id'].nunique() - 1):
            plt.tight_layout()
            save_figure(
                'z_part_2_all_groups_{}'.format(int(group_index / 10) + 1),
                folder='ip&m',
                dpi=100)

        group_index += 1
Example #14
def plot_histogram(repeat_offender, free, title_detail):

    infowars = {'reaction': -94, 'share': -96, 'comment': -93}

    fig = plt.figure(figsize=(14, 12))
    fig.suptitle('Histogram of the percentage changes ' + title_detail + ' for the {} Facebook groups'\
                .format(len(repeat_offender['reaction'])), fontsize='x-large')

    xlabels = [
        '-100%', '', '', '', '', '-50%', '', '', '', '', '0%', '', '', '', '',
        '+50%', '', '', '', '', '+100%', '', '', '', '', '+150%', 'More'
    ]

    columns_to_plot = ['reaction', 'share', 'comment']

    for index, column in enumerate(columns_to_plot):
        ax = plt.subplot(3, 1, index + 1)

        evolution_percentage = [
            (ro - fr) * 100 / fr
            for ro, fr in zip(repeat_offender[column], free[column])
        ]
        evolution_percentage = [x for x in evolution_percentage if x != np.inf]
        plt.hist(evolution_percentage,
                 bins=list(range(-100, 151, 10)) + [100000],
                 color='C' + str(index))

        plt.vlines(0, 0, 28, color='k', linestyles='dotted')
        plt.vlines(infowars[column], 0, 5, color='C3', linestyles='dotted')
        plt.text(infowars[column] - 2,
                 5.5,
                 'Infowars drop',
                 color='C3',
                 rotation=50)

        plt.xlabel(column.capitalize() + 's')
        plt.xlim([-100, 160])
        plt.xticks(list(range(-100, 151, 10)) + [160], xlabels)
        ax.spines['right'].set_visible(False)
        ax.spines['left'].set_visible(False)
        ax.spines['top'].set_visible(False)
        plt.locator_params(axis='y', nbins=4)

    plt.tight_layout()
    save_figure('histogram ' + title_detail, folder='ip&m', dpi=100)
Example #15
 def plot_cluster(self, data_name, data, classification_name,
                  classification_set, prefix, suffix, show_cross=False):
     plt.figure()
     for i in range(self.K):
         cluster = data[classification_set == i, :]
         plt.scatter(cluster[:, 0], cluster[:, 1], color=COLORS[i])
         plot_ellipse(self.mu[i, :][:, None], np.linalg.inv(np.asmatrix(self.sigma[i, :, :])),
                      -2 * np.log(ALPHA), color=COLORS[i])
         if show_cross:
             plt.scatter(self.mu[i, 0], self.mu[i, 1],
                         color="k", marker="+", lw=20)
     plt.xlim(XLIM)
     plt.ylim(YLIM)
     plt.title(
         '{} classification on {}'.format(
             classification_name,
             data_name))
     utils.save_figure(plt, prefix, suffix)
     return
Example #16
def test_kNN(X, y, path):
    data_set = 'cardio'

    probabilit_list = []
    dtc = pickle.load(open(path + 'model/' + data_set + '/kNN_model_1', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(open(path + 'model/' + data_set + '/kNN_model_2', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(open(path + 'model/' + data_set + '/kNN_model_3', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    dtc = pickle.load(open(path + 'model/' + data_set + '/kNN_model_4', 'rb'))
    probs = dtc.predict_proba(X)
    probs = probs[:, 1]
    probabilit_list.append(probs)
    calculate_f1_score(dtc, X, y)

    color_list = ['r', 'b', 'm', 'y']
    label_list = ['k = 25', 'k = 150', 'k = 225', 'k = 300']

    plt = multiple_precision_recall_curves(y, probabilit_list, color_list, label_list)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.5, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('k Nearest Neighbors Precision-Recall Curve')
    plt.legend(loc="best")

    # plt.show()
    save_figure(plt, path + "plot/" + data_set, 'kNN_pr_curve.png')
Example #17
def save_figure_4(posts_df, pages_df):

    account_name = 'I Love Carbon Dioxide'
    account_id = posts_df[posts_df['account_name'] ==
                          account_name].account_id.unique()[0]
    reduced_distribution_date = pages_df[pages_df['page_name'] ==
                                         account_name]['date'].values[0]

    plt.figure(figsize=(10, 4))
    ax = plt.subplot()

    plt.title("Engagement metrics for one 'reduced distribution' page ('" +
              account_name + "')",
              size="x-large")

    plot_one_group(ax, posts_df, account_id, fake_news_dates=[])

    xticks = [
        np.datetime64('2019-01-01'),
        np.datetime64('2019-03-01'),
        np.datetime64('2019-05-01'),
        np.datetime64('2019-07-01'),
        np.datetime64('2019-09-01'),
        np.datetime64('2019-11-01'),
        np.datetime64('2020-01-01'),
        np.datetime64('2020-03-01'),
        np.datetime64('2020-07-01'),
        np.datetime64('2020-09-01'),
        np.datetime64('2020-11-01'),
        np.datetime64(reduced_distribution_date)
    ]
    plt.xticks(xticks, rotation=30, ha='right')
    plt.gca().get_xticklabels()[-1].set_color('red')

    plt.axvline(x=np.datetime64(reduced_distribution_date),
                color='C3',
                linestyle='--',
                linewidth=2)

    plt.legend()
    plt.tight_layout()
    save_figure('figure_4', folder='ip&m', dpi=100)
Example #18
def save_supplementary_figure_2(posts_df, pages_df):

    accounts_to_plot = [
        'Tucker Carlson Tonight', 'Normals Are Pissed', 'Botanica Health',
        'Jodie Meschuk', 'The PROOF Blog', "The Rational Capitalist",
        'Mark Levin', 'POVnow', "Tell The USA to DUMP Trump", 'Florida Boys TV'
    ]

    fig = plt.figure(figsize=(10, 12))

    for idx in range(len(accounts_to_plot)):
        ax = plt.subplot(5, 2, idx + 1)
        plt.title(accounts_to_plot[idx])

        account_id = posts_df[posts_df['account_name'] ==
                              accounts_to_plot[idx]].account_id.unique()[0]
        reduced_distribution_date = pages_df[
            pages_df['page_name'] == accounts_to_plot[idx]]['date'].values[0]

        plot_one_group(ax, posts_df, account_id, fake_news_dates=[])

        xticks = [
            np.datetime64('2019-01-01'),
            np.datetime64('2019-05-01'),
            np.datetime64('2019-09-01'),
            np.datetime64('2020-01-01'),
            np.datetime64('2020-05-01'),
            np.datetime64('2020-09-01'),
            np.datetime64(reduced_distribution_date)
        ]
        plt.xticks(xticks, rotation=30, ha='right')
        plt.gca().get_xticklabels()[-1].set_color('red')
        plt.axvline(x=np.datetime64(reduced_distribution_date),
                    color='C3',
                    linestyle='--',
                    linewidth=2)

        if idx == 0:
            plt.legend()

    plt.tight_layout()
    save_figure('supplementary_figure_3', folder='ip&m', dpi=100)
Example #19
def generate_check(sess, model):
    xss, mus, sigmas = model.generate(sess, 500)

    image_dir = flags.save_dir + "/generated"
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

    for i in range(len(xss)):
        xs = xss[i]
        file_name = "generated_{0:0>2}.png".format(i)
        file_path = image_dir + "/" + file_name
        utils.save_figure(xs, file_path)

    # Print mu and sigma at each step.
    for i in range(mus.shape[0]):
        mu = mus[i]
        sigma = sigmas[i]
        mean_mu = np.mean(mu)
        mean_sigma = np.mean(sigma)
        print("mu[{0}]    = {1:.5f}".format(i, mean_mu))
        print("sigma[{0}] = {1:.5f}".format(i, mean_sigma))
Example #20
def save_all_groups_figures(posts_df, post_url_df, url_df):

    group_index = 0
    for account_id in posts_df['account_id'].unique():

        if group_index % 10 == 0:
            plt.figure(figsize=(12, 14))

        ax = plt.subplot(5, 2, group_index % 10 + 1)

        fake_news_dates = compute_fake_news_dates(post_url_df, url_df,
                                                  account_id)
        plot_one_group(ax,
                       posts_df,
                       account_id,
                       fake_news_dates=fake_news_dates)
        plt.title(posts_df[posts_df['account_id'] ==
                           account_id].account_name.unique()[0])

        repeat_offender_periods = compute_repeat_offender_periods(
            fake_news_dates)
        repeat_offender_periods = merge_overlapping_periods(
            repeat_offender_periods)
        for period in repeat_offender_periods:
            plt.axvspan(period[0],
                        period[1],
                        ymin=1 / 11,
                        facecolor='C3',
                        alpha=0.1)

        if (group_index % 10 == 9) | (group_index
                                      == posts_df['account_id'].nunique() - 1):
            plt.tight_layout()
            save_figure(
                'z_part_1_all_groups_{}'.format(int(group_index / 10) + 1),
                folder='ip&m',
                dpi=100)

        group_index += 1
Example #21
    def visualize_components(self, x_reduced, y, dataset):
        """Visualize components.

            Args:
               x_reduced (ndarray): reduced data.
               y (ndarray): true labels.
               dataset (string): dataset, WDBC or MNIST.

            Returns:
              None.
            """

        component1 = '{}1'.format(self.name)  # first component
        component2 = '{}2'.format(self.name)  # second component

        # Create dataframe for visualization
        df = pd.DataFrame(x_reduced[:, :2], columns=[component1, component2])
        df['y'] = y

        # Plot components and save figure
        utils.plot_components(component1, component2, df, self.name)
        utils.save_figure('{}_{}_components'.format(dataset, self.name))
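
A hypothetical call site for the method above; `reducer`, `x_pca`, and `labels` are illustrative names, since the enclosing class is not shown in this snippet:

# Hypothetical usage; assumes `reducer` is an instance of the class that
# defines visualize_components, with reducer.name set to e.g. 'PCA'.
reducer.visualize_components(x_reduced=x_pca, y=labels, dataset='WDBC')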
Example #22
def save_figure_1(posts_df, post_url_df, url_df):

    fig = plt.figure(figsize=(10, 8))
    gs = fig.add_gridspec(2, 5)

    # top panel
    ax = fig.add_subplot(gs[0, :])
    plot_repeat_offender_example(posts_df, post_url_df, url_df, ax)

    # bottom panel
    repeat_offender, free = compute_periods_average(posts_df, post_url_df,
                                                    url_df)
    print_repeat_offender_statistics(repeat_offender, free)
    plot_histogram(
        repeat_offender,
        free,
        title_detail="between the 'repeat offender' and the 'free' periods")

    ax = fig.add_subplot(gs[1, 1:4])
    plot_repeat_offender_average(repeat_offender, free, ax)
    plt.tight_layout(pad=3)
    save_figure('figure_1', folder='ip&m', dpi=100)
Example #23
def train_k_NN(path, with_plots):
    data_set = 'cardio'
    x_train, y_train = load_data(path + 'data/' + data_set + '/train/')

    if with_plots == "False":
        model_1 = train_and_time(KNeighborsClassifier(n_neighbors=25), x_train,
                                 y_train)
        model_2 = train_and_time(KNeighborsClassifier(n_neighbors=150),
                                 x_train, y_train)
        model_3 = train_and_time(KNeighborsClassifier(n_neighbors=225),
                                 x_train, y_train)
        model_4 = train_and_time(KNeighborsClassifier(n_neighbors=300),
                                 x_train, y_train)

        save_model(model_1, path + 'model/' + data_set, 'kNN_model_1')
        save_model(model_2, path + 'model/' + data_set, 'kNN_model_2')
        save_model(model_3, path + 'model/' + data_set, 'kNN_model_3')
        save_model(model_4, path + 'model/' + data_set, 'kNN_model_4')

    else:
        print('Training kNN...')

        model_1 = KNeighborsClassifier(n_neighbors=25)
        model_2 = KNeighborsClassifier(n_neighbors=150)
        model_3 = KNeighborsClassifier(n_neighbors=225)
        model_4 = KNeighborsClassifier(n_neighbors=300)
        plt = multiple_learning_curves_plot(
            [model_1, model_2, model_3, model_4], x_train, y_train,
            ["r", "y", "b", "m"], ['k = 25', 'k = 150', 'k = 225', 'k = 300'])

        plt.title("k Nearest Neighbor \n Learning Curves")
        plt.xlabel("Training examples")
        plt.ylabel("F1 Score")
        plt.grid()

        plt.legend(loc="best")
        # plt.show()
        save_figure(plt, path + "plot/" + data_set, 'kNN_learning_curves.png')
Example #24
def train_boosted_dtc(path, with_plots):
    data_set = 'cardio'
    x_train, y_train = load_data(path + 'data/' + data_set + '/train/')

    if with_plots == "False":
        model_nodes_1 = train_and_time(AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=5)), x_train, y_train)
        model_nodes_2 = train_and_time(AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=10)), x_train, y_train)
        model_nodes_3 = train_and_time(AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=15)), x_train, y_train)
        model_nodes_4 = train_and_time(AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=20)), x_train, y_train)
        model_nodes_5 = train_and_time(AdaBoostClassifier(tree.DecisionTreeClassifier()), x_train, y_train)

        save_model(model_nodes_1, path + "model/" + data_set, 'boosted_dtc_model_nodes_1')
        save_model(model_nodes_2, path + "model/" + data_set, 'boosted_dtc_model_nodes_2')
        save_model(model_nodes_3, path + "model/" + data_set, 'boosted_dtc_model_nodes_3')
        save_model(model_nodes_4, path + "model/" + data_set, 'boosted_dtc_model_nodes_4')
        save_model(model_nodes_5, path + "model/" + data_set, 'boosted_dtc_none')

    else:
        print('Training boosted dtc...')
        model_1 = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=5))
        model_2 = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=10))
        model_3 = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=15))
        model_4 = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=20))
        model_5 = AdaBoostClassifier(tree.DecisionTreeClassifier())
        plt = multiple_learning_curves_plot(
            [model_1, model_2, model_3, model_4, model_5],
            x_train, y_train,
            ["r", "y", "g", "m", "b"],
            ["MD = 5", "MD = 10", "MD = 15", "MD = 20", "MD = None"]
        )
        plt.title("Boosted Decision Tree With Max Depth (MD) \n Pruning Learning Curves")
        plt.xlabel("Training examples")
        plt.ylabel("F1 Score")
        plt.grid()
        plt.legend(loc="best")
        save_figure(plt, path + "plot/" + data_set, 'boosted_dtc_md_learning_curves.png')
Example #25
def errors_in_time(trainsize, r, regs, cutoff=60000):
    """Plot spatially averaged errors, and the projection error, in time.

    Parameters
    ----------
    trainsize : int
        Number of snapshots used to train the ROM.

    r : int
        Dimension of the ROM.

    regs : two positive floats
        Regularization hyperparameters used to train the ROM.

    cutoff : int
        Number of time steps to plot.
    """
    # Load and simulate the ROM.
    t, V, scales, q_rom = simulate_rom(trainsize, r, regs, cutoff)

    # Load and lift the true results.
    data, _ = utils.load_gems_data(cols=cutoff)
    with utils.timed_block("Lifting GEMS data"):
        data_gems = dproc.lift(data[:, :cutoff])
    del data

    # Shift and project the data (unscaling done later by chunk).
    with utils.timed_block("Projecting GEMS data to POD subspace"):
        data_shifted, _ = dproc.scale(data_gems.copy(), scales)
        data_proj = V.T @ data_shifted
        del data_shifted

    # Initialize the figure.
    fig, axes = plt.subplots(3, 3, figsize=(12, 6), sharex=True)

    # Compute and plot errors in each variable.
    for var, ax in zip(config.ROM_VARIABLES, axes.flat):

        with utils.timed_block(f"Reconstructing results for {var}"):
            Vvar = dproc.getvar(var, V)
            gems_var = dproc.getvar(var, data_gems)
            proj_var = dproc.unscale(Vvar @ data_proj, scales, var)
            pred_var = dproc.unscale(Vvar @ q_rom, scales, var)

        with utils.timed_block(f"Calculating error in {var}"):
            denom = np.abs(gems_var).max(axis=0)
            proj_error = np.mean(np.abs(proj_var - gems_var), axis=0) / denom
            pred_error = np.mean(np.abs(pred_var - gems_var), axis=0) / denom

        # Plot results.
        ax.plot(t,
                proj_error,
                '-',
                lw=1,
                label="Projection Error",
                c=config.GEMS_STYLE['color'])
        ax.plot(t,
                pred_error,
                '-',
                lw=1,
                label="ROM Error",
                c=config.ROM_STYLE['color'])
        ax.axvline(t[trainsize], color='k')
        ax.set_ylabel(config.VARTITLES[var])

    # Format the figure.
    for ax in axes[-1, :]:
        ax.set_xlim(t[0], t[-1])
        ax.set_xticks(np.arange(t[0], t[-1] + .001, .002))
        ax.set_xlabel("Time [s]", fontsize=12)

    # Make legend centered below the subplots.
    fig.tight_layout(rect=[0, .1, 1, 1])
    leg = axes[0, 0].legend(ncol=2,
                            fontsize=14,
                            loc="lower center",
                            bbox_to_anchor=(.5, 0),
                            bbox_transform=fig.transFigure)
    for line in leg.get_lines():
        line.set_linestyle('-')
        line.set_linewidth(5)

    # Save the figure.
    utils.save_figure(f"errors"
                      f"_{config.TRNFMT(trainsize)}"
                      f"_{config.DIMFMT(r)}"
                      f"_{config.REGFMT(regs)}.pdf")
Example #26
def point_traces(trainsize, r, regs, elems, cutoff=60000):
    """Plot the time trace of each variable in the original data at the monitor
    location, and the time trace of each variable of the ROM reconstruction at
    the same locations. One figure is generated per variable.

    Parameters
    ----------
    trainsize : int
        Number of snapshots used to train the ROM.

    r : int
        Dimension of the ROM.

    regs : two positive floats
        Regularization hyperparameters used to train the ROM.

    elems : list(int) or ndarray(int)
        Indices in the spatial domain at which to compute the time traces.

    cutoff : int
        Number of time steps to plot.
    """
    if elems is None:
        elems = config.MONITOR_LOCATIONS

    # Get the indices for each variable.
    elems = np.atleast_1d(elems)
    nelems = elems.size
    nrows = (nelems // 2) + (1 if nelems % 2 != 0 else 0)
    elems = np.concatenate(
        [elems + i * config.DOF for i in range(config.NUM_ROMVARS)])

    # Load and lift the true results.
    data, _ = utils.load_gems_data(rows=elems[:nelems * config.NUM_GEMSVARS])
    with utils.timed_block("Lifting GEMS time trace data"):
        traces_gems = dproc.lift(data[:, :cutoff])

    # Load and simulate the ROM.
    t, V, scales, q_rom = simulate_rom(trainsize, r, regs, cutoff)

    # Reconstruct and rescale the simulation results.
    simend = q_rom.shape[1]
    with utils.timed_block("Reconstructing simulation results"):
        traces_rom = dproc.unscale(V[elems] @ q_rom, scales)

    # Save a figure for each variable.
    xticks = np.arange(t[0], t[-1] + .001, .002)
    for i, var in enumerate(config.ROM_VARIABLES):
        fig, axes = plt.subplots(nrows,
                                 2 if nelems > 1 else 1,
                                 figsize=(9, 3 * nrows),
                                 sharex=True)
        axes = np.atleast_2d(axes)
        for j, ax in enumerate(axes.flat):
            idx = j + i * nelems
            ax.plot(t, traces_gems[idx, :], lw=1, **config.GEMS_STYLE)
            ax.plot(t[:simend], traces_rom[idx, :], lw=1, **config.ROM_STYLE)
            ax.axvline(t[trainsize], color='k', lw=1)
            ax.set_xlim(t[0], t[-1])
            ax.set_xticks(xticks)
            ax.set_title(f"Location ${j+1}$", fontsize=12)
            ax.locator_params(axis='y', nbins=2)
        for ax in axes[-1, :]:
            ax.set_xlabel("Time [s]", fontsize=12)
        for ax in axes[:, 0]:
            ax.set_ylabel(config.VARLABELS[var], fontsize=12)

        # Single legend to the right of the subplots.
        fig.tight_layout(rect=[0, 0, .85, 1])
        leg = axes[0, 0].legend(loc="center right",
                                fontsize=14,
                                bbox_to_anchor=(1, .5),
                                bbox_transform=fig.transFigure)
        for line in leg.get_lines():
            line.set_linewidth(2)

        # Save the figure.
        utils.save_figure("pointtrace"
                          f"_{config.TRNFMT(trainsize)}"
                          f"_{config.DIMFMT(r)}"
                          f"_{config.REGFMT(regs)}_{var}.pdf")
Example #27
def density_plot(config_file,
                 plot_file,
                 burnin=0,
                 max_clusters=None,
                 mesh_size=101,
                 min_cluster_size=0,
                 samples=None,
                 thin=1):

    df = post_process.clusters.load_table(config_file,
                                          burnin=burnin,
                                          thin=thin,
                                          max_clusters=max_clusters,
                                          mesh_size=mesh_size,
                                          min_size=min_cluster_size)

    sizes = df[['cluster_id', 'size'
                ]].drop_duplicates().set_index('cluster_id').to_dict()['size']

    if samples is None:
        samples = sorted(df['sample_id'].unique())

    else:
        df = df[df['sample_id'].isin(samples)]

    num_samples = len(samples)

    clusters = df['cluster_id'].unique()

    postions = range(1, len(clusters) + 1)

    utils.setup_plot()

    width = 8

    height = 2 * num_samples + 1

    fig = pp.figure(figsize=(width, height))

    grid = gs.GridSpec(nrows=num_samples, ncols=1)

    colors = utils.get_clusters_color_map(pd.Series(clusters))

    for ax_index, sample_id in enumerate(samples):
        plot_df = df[df['sample_id'] == sample_id]

        plot_df = plot_df.drop(['sample_id', 'size'],
                               axis=1).set_index('cluster_id')

        ax = fig.add_subplot(grid[ax_index])

        utils.setup_axes(ax)

        ax.annotate(sample_id,
                    xy=(1.01, 0.5),
                    xycoords='axes fraction',
                    fontsize=defaults.axis_label_font_size)

        for i, (cluster_id, log_pdf) in enumerate(plot_df.iterrows()):
            pos = postions[i]

            y = log_pdf.index.astype(float)

            x = np.exp(log_pdf)

            x = (x / x.max()) * 0.3

            ax.fill_betweenx(y,
                             pos - x,
                             pos + x,
                             color=colors[cluster_id],
                             where=(x > 1e-6))

        ax.set_xticks(postions)

        if ax_index == (num_samples - 1):
            x_tick_labels = [
                '{0} (n={1})'.format(x, sizes[x]) for x in clusters
            ]

            ax.set_xticklabels(x_tick_labels, rotation=90)

            ax.set_xlabel(defaults.cluster_label,
                          fontsize=defaults.axis_label_font_size)

        else:
            ax.set_xticklabels([])

        utils.set_tick_label_font_sizes(ax, defaults.tick_label_font_size)

        ax.set_ylim(defaults.cellular_prevalence_limits)

    if num_samples == 1:
        ax.set_ylabel(defaults.cellular_prevalence_label,
                      fontsize=defaults.axis_label_font_size)

    else:
        fig.text(-0.01,
                 0.5,
                 defaults.cellular_prevalence_label,
                 fontsize=defaults.axis_label_font_size,
                 ha='center',
                 rotation=90,
                 va='center')

    grid.tight_layout(fig)

    utils.save_figure(fig, plot_file)
Example #28
def parallel_coordinates_plot(config_file,
                              plot_file,
                              burnin=0,
                              max_clusters=None,
                              mesh_size=101,
                              min_cluster_size=0,
                              samples=None,
                              thin=1):

    utils.setup_plot()

    plot_df = post_process.clusters.load_summary_table(
        config_file,
        burnin=burnin,
        max_clusters=max_clusters,
        mesh_size=mesh_size,
        min_size=min_cluster_size,
        thin=thin,
    )

    if samples is None:
        samples = sorted(plot_df['sample_id'].unique())

    else:
        plot_df = plot_df[plot_df['sample_id'].isin(samples)]

    clusters = sorted(plot_df['cluster_id'].unique())

    plot_df['sample_index'] = plot_df['sample_id'].apply(
        lambda x: samples.index(x))

    plot_df = plot_df.sort_values(by='sample_index')

    grid = sb.FacetGrid(plot_df,
                        hue='cluster_id',
                        hue_order=clusters,
                        palette='husl')

    grid.map(pp.errorbar,
             'sample_index',
             'mean',
             'std',
             marker=defaults.line_plot_marker,
             markersize=defaults.line_plot_marker_size)

    ax = grid.ax

    utils.setup_axes(ax)

    fig = grid.fig

    # Legend
    box = ax.get_position()

    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), title='Cluster')

    # Axis formatting
    ax.set_xticks(sorted(plot_df['sample_index'].unique()))

    ax.set_xticklabels(samples)

    ax.set_xlabel(defaults.sample_label,
                  fontsize=defaults.axis_label_font_size)

    ax.set_ylabel(defaults.cellular_prevalence_label,
                  fontsize=defaults.axis_label_font_size)

    utils.set_tick_label_font_sizes(ax, defaults.tick_label_font_size)

    # Plot limits
    ax.set_xlim(plot_df['sample_index'].min() - 0.1,
                plot_df['sample_index'].max() + 0.1)

    ax.set_ylim(*defaults.cellular_prevalence_limits)

    # Resize and save figure
    fig.set_size_inches(*utils.get_parallel_coordinates_figure_size(samples))

    utils.save_figure(fig, plot_file)
Example #29
def density_plot(config_file, plot_file, burnin=0, samples=None, thin=1):

    utils.setup_plot()

    df = _load_density_df(config_file, burnin, thin)

    if samples is None:
        samples = sorted(df['sample_id'].unique())

    else:
        df = df[df['sample_id'].isin(samples)]

    loci = df['mutation_id'].unique()

    num_loci = len(loci)

    width = 8

    height = 2 * num_loci + 2

    fig = pp.figure(figsize=(width, height))

    grid = gs.GridSpec(nrows=num_loci, ncols=1)

    for ax_index, locus in enumerate(loci):
        ax = fig.add_subplot(grid[ax_index])

        utils.setup_axes(ax)

        plot_df = df[df['mutation_id'] == locus]

        sb.violinplot(ax=ax,
                      data=plot_df,
                      x='sample_id',
                      y='cellular_prevalence',
                      inner=None,
                      order=samples,
                      scale='width')

        ax.set_ylabel('')

        if ax_index != (num_loci - 1):
            ax.set_xticklabels([])

            ax.set_xlabel('')

        else:
            ax.set_xlabel(defaults.sample_label)

        ax.set_ylim(*defaults.cellular_prevalence_limits)

        ax.annotate(locus,
                    xy=(1.01, 0.5),
                    xycoords='axes fraction',
                    fontsize=defaults.axis_label_font_size)

        utils.set_tick_label_font_sizes(ax, defaults.tick_label_font_size)

    fig.text(-0.01,
             0.5,
             defaults.cellular_prevalence_label,
             fontsize=defaults.axis_label_font_size,
             ha='center',
             rotation=90,
             va='center')

    grid.tight_layout(fig, h_pad=3)

    utils.save_figure(fig, plot_file)
Example #30
def parallel_coordinates_plot(config_file,
                              plot_file,
                              burnin=0,
                              max_clusters=None,
                              min_cluster_size=0,
                              samples=None,
                              thin=1,
                              value='cellular_prevalence'):

    utils.setup_plot()

    df = post_process.loci.load_table(config_file,
                                      burnin,
                                      thin,
                                      max_clusters=max_clusters,
                                      min_cluster_size=min_cluster_size)

    color_map = utils.get_clusters_color_map(df['cluster_id'])

    if samples is None:
        samples = sorted(df['sample_id'].unique())

    else:
        df = df[df['sample_id'].isin(samples)]

    df['sample_index'] = df['sample_id'].apply(lambda x: samples.index(x))

    df = df.sort_values(by='sample_index')

    fig = pp.figure()

    ax = fig.add_subplot(1, 1, 1)

    utils.setup_axes(ax)

    for cluster_id, cluster_df in df.groupby('cluster_id'):
        for _, locus_df in cluster_df.groupby('mutation_id'):
            x = locus_df['sample_index']

            y = locus_df[value]

            ax.plot(x,
                    y,
                    alpha=0.75,
                    c=color_map[cluster_id],
                    marker=defaults.line_plot_marker,
                    markersize=defaults.line_plot_marker_size)

    ax.set_xlabel(defaults.sample_label,
                  fontsize=defaults.axis_label_font_size)

    if value == 'cellular_prevalence':
        ax.set_ylabel(defaults.cellular_prevalence_label,
                      fontsize=defaults.axis_label_font_size)

    elif value == 'variant_allele_frequency':
        ax.set_ylabel(defaults.variant_allele_frequency_label)

    ax.set_xticks(sorted(df['sample_index'].unique()))

    ax.set_xticklabels(samples)

    utils.set_tick_label_font_sizes(ax, defaults.tick_label_font_size)

    ax.set_ylim(*defaults.cellular_prevalence_limits)

    box = ax.get_position()

    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    legend_handles = utils.get_legend_handles(color_map)

    legend = ax.legend(legend_handles.values(),
                       legend_handles.keys(),
                       bbox_to_anchor=(1, 0.5),
                       fontsize=defaults.legend_font_size,
                       loc='center left',
                       title=defaults.cluster_label)

    legend.get_title().set_fontsize(defaults.legend_title_font_size)

    utils.save_figure(fig, plot_file)