Beispiel #1
0
def draw_main(source, target, pred, field, itr):
    """Write the images that describe one sample to disk.

    Only the first element of each input is used.  The source, target and
    prediction are saved with ``save_fig`` and the field with ``save_field``,
    followed by two absolute-difference images: prediction vs. target
    ('_mydiff') and source vs. target ('_ogdiff').  Every output path is
    ``log_path + str(itr)`` followed by a suffix naming the image.
    """
    stem = log_path + str(itr)

    # The three plain images share the same writer.
    for batch, suffix in ((source, '_source'),
                          (target, '_target'),
                          (pred, '_pred')):
        save_fig(batch[0], stem + suffix)

    # The field has its own writer.
    save_field(field[0], stem + '_field')

    # Absolute differences against the target.
    pred_error = torch.abs(pred[0] - target[0])
    save_fig(pred_error, stem + '_mydiff')
    source_error = torch.abs(source[0] - target[0])
    save_fig(source_error, stem + '_ogdiff')
Beispiel #2
0
def plot_fig3abc():
    """Draw panels a-c of figure 3 as three vertically stacked heatmaps.

    Three CSV files are read from ``DATA_DIR``: ``fig3_fleiss.csv``,
    ``fig3_variability.csv`` and ``fig3_label_imbalance.csv`` (the latter two
    indexed by their ``label`` column).  From top to bottom the panels are
    titled 'Label imbalance', 'Annotator agreement' and 'Corpus variability'.
    The panels share the x axis, each one gets its own colorbar, and the
    figure is handed to ``save_fig`` as 'fig3abc' (png and pdf, dpi 800).
    """
    # --- inputs
    fleiss = pd.read_csv(os.path.join(DATA_DIR, 'fig3_fleiss.csv'))
    variability = pd.read_csv(
        os.path.join(DATA_DIR, 'fig3_variability.csv'), index_col='label')
    imbalance = pd.read_csv(
        os.path.join(DATA_DIR, 'fig3_label_imbalance.csv'), index_col='label')

    # --- figure skeleton: three rows of unequal height, no vertical gap
    row_heights = [2.5, .6, 3]
    fig, axes = plt.subplots(
        3, 1,
        figsize=(2.5, 4.5),
        sharex=True,
        gridspec_kw=dict(hspace=0, height_ratios=row_heights))
    imbalance_ax, fleiss_ax, variability_ax = axes

    def colorbar_kws(label, row_height):
        # The shrink factor is inversely proportional to the row height.
        return dict(label=label,
                    pad=.08,
                    shrink=.4 * 2.5 / row_height,
                    aspect=15)

    # Options shared by all three heatmaps; cell annotations are switched off.
    shared_kws = dict(cbar=True, lw=.2, ec='white', annot=False, square=True)

    # --- top panel: label imbalance
    sns.heatmap(data=imbalance,
                cmap='Purples',
                cbar_kws=colorbar_kws("Corpus size", row_heights[0]),
                ax=imbalance_ax,
                fmt='d',
                annot_kws=dict(fontsize=5),
                **shared_kws)
    imbalance_ax.set_xticklabels(imbalance.columns.tolist())
    imbalance_ax.tick_params(axis='both', direction='out')
    imbalance_ax.set_title('Label imbalance', fontsize=7)

    # --- middle panel: Fleiss' kappa as a single row
    # Kappa values formatted without the leading zero.  They are not drawn
    # while ``annot=False`` is set in the shared options.
    kappa_text = [
        f'{kappa:.2f}'.lstrip('0') for kappa in fleiss.fleiss_kappa.values
    ]
    sns.heatmap(data=[fleiss.fleiss_kappa],
                cmap='Reds',
                cbar_kws=colorbar_kws("Fleiss' Kappa", row_heights[1]),
                ax=fleiss_ax,
                fmt='',
                annot_kws=dict(fontsize=5),
                **shared_kws)
    fleiss_ax.tick_params(axis='both', direction='out')
    fleiss_ax.set_yticklabels(['all'], rotation=0)
    fleiss_ax.set_title('Annotator agreement', fontsize=7)

    # --- bottom panel: variability, colormap centred on the mean of row 'all'
    sns.heatmap(data=variability,
                cmap='Greens',
                center=variability.loc['all'].mean(),
                cbar_kws=colorbar_kws('Embedding variance', row_heights[2]),
                ax=variability_ax,
                **shared_kws)
    variability_ax.set_title('Corpus variability', fontsize=7)
    variability_ax.tick_params(axis='both', direction='out')
    variability_ax.set_xticklabels(variability_ax.get_xticklabels(),
                                   rotation=75,
                                   ha='right')
    # Shift the rotated x tick labels horizontally by a fixed amount.
    label_shift = matplotlib.transforms.ScaledTranslation(
        .06, 0, fig.dpi_scale_trans)
    for tick_label in variability_ax.xaxis.get_majorticklabels():
        tick_label.set_transform(tick_label.get_transform() + label_shift)

    # --- cosmetics applied to every panel and its colorbar
    for panel_ax, max_ticks in zip(axes, [3, 3, 4]):
        # hide the axis labels
        panel_ax.xaxis.label.set_visible(False)
        panel_ax.yaxis.label.set_visible(False)

        colorbar = panel_ax.collections[0].colorbar
        colorbar.ax.tick_params(axis='y', direction='out')
        cbar_label = colorbar.ax.yaxis.label
        cbar_label.set_rotation(90)
        cbar_label.set_ha('center')
        cbar_label.set_va('top')
        colorbar.ax.yaxis.set_major_locator(
            matplotlib.ticker.MaxNLocator(max_ticks))

    save_fig(fig, 'fig3abc', version=1, plot_formats=['png', 'pdf'], dpi=800)
Beispiel #3
0
def plot_fig3d(cached=True):
    """Plot the 2x2 grid of corpus-similarity heatmaps (figure 3d) and save it.

    One matrix per key ('all', 'positive', 'neutral', 'negative') is read
    from ``DATA_DIR/fig3d_<key>.csv``.  All matrices are rescaled with one
    shared minimum and maximum, both taken over the strictly upper triangles
    (the diagonal is excluded).  Entries below the diagonal are masked in
    every panel, and a single colorbar is added for the whole figure.  The
    result is handed to ``save_fig`` as 'fig3d' (png and pdf, dpi 800).

    Args:
        cached: Not used by this function; retained so that existing calls
            that pass it keep working.
    """
    keys = ['all', 'positive', 'neutral', 'negative']
    # One colormap for the heatmaps and for the shared colorbar.
    cmap = 'Blues_r'

    sim_matrix = {
        key: pd.read_csv(os.path.join(DATA_DIR, f'fig3d_{key}.csv'),
                         index_col=0)
        for key in keys
    }

    fig, axes_grid = plt.subplots(2,
                                  2,
                                  figsize=(4, 4),
                                  sharex=True,
                                  sharey=True)
    # Row-major order: top-left, top-right, bottom-left, bottom-right.
    axes = axes_grid.flatten()

    # Shared normalisation range over the off-diagonal upper triangles.
    upper_values = []
    for df in sim_matrix.values():
        values = df.values
        upper_values.append(values[np.triu_indices_from(values, k=1)])
    min_val = min(vals.min() for vals in upper_values)
    max_val = max(vals.max() for vals in upper_values)

    # Fixed horizontal shift applied to the rotated x tick labels.
    offset = matplotlib.transforms.ScaledTranslation(.05, 0,
                                                     fig.dpi_scale_trans)

    for key, ax in zip(keys, axes):
        # normalize with the shared range
        df = (sim_matrix[key] - min_val) / (max_val - min_val)

        # hide everything strictly below the diagonal
        mask = np.zeros(df.shape, dtype=bool)
        mask[np.tril_indices(len(df), k=-1)] = True

        sns.heatmap(data=df,
                    mask=mask,
                    cmap=cmap,
                    cbar=False,
                    cbar_kws=dict(label='Normalized\ncosine similarity',
                                  fraction=.025,
                                  pad=.08),
                    vmax=1,
                    center=.5,
                    lw=.2,
                    ec='white',
                    ax=ax,
                    square=True)

        # move axis labels
        ax.tick_params(axis='both', direction='out')
        ax.set_xticklabels(ax.get_xticklabels(), rotation=75, ha='right')
        for label in ax.xaxis.get_majorticklabels():
            label.set_transform(label.get_transform() + offset)

        # title
        ax.set_title(key, fontsize=7)

    # add one colorbar for all panels; the last (bottom-right) axis is passed
    # explicitly instead of relying on the leaked loop variable
    cbar = add_colorbar(fig,
                        axes[-1],
                        x=.9,
                        y=.42,
                        length=.013,
                        width=.2,
                        vmin=0,
                        vmax=1,
                        label='Normalized\ncosine similarity',
                        cmap=cmap,
                        orientation='vertical')
    cbar.ax.tick_params(axis='y', direction='out')
    cbar.ax.yaxis.label.set_ha('center')
    cbar.ax.yaxis.label.set_va('top')
    cbar.ax.yaxis.set_major_locator(matplotlib.ticker.MaxNLocator(5))

    fig.subplots_adjust(hspace=.2, wspace=0)
    fig.suptitle('Corpus similarity', fontsize=7, y=.96)

    # save
    save_fig(fig, 'fig3d', version=1, plot_formats=['png', 'pdf'], dpi=800)
def main():
    """Plot the sentiment time series of the FastText and BERT models (figure 4).

    Both data sets are sliced to the window 2017-07-01 .. 2020-10-24 (UTC),
    melted into long form (``created_at``, ``trained_at``, ``sentiment``) and
    passed through ``select_intervals``.  Each model gets its own panel with
    one line per ``trained_at`` value; only the top panel carries a legend.
    The figure is handed to ``save_fig`` as 'fig4' (png and pdf).
    """
    bert_scores = read_bert_data()
    fasttext_scores = read_fasttext_data()

    window_start = datetime(2017, 7, 1, tzinfo=pytz.utc)
    window_end = datetime(2020, 10, 24, tzinfo=pytz.utc)

    fig, axes = plt.subplots(2, 1, figsize=(3.5, 3.5), sharex=True,
                             sharey=True)

    panels = zip(axes, [fasttext_scores, bert_scores], ['FastText', 'BERT'])
    for row, (ax, wide, title) in enumerate(panels):
        is_top_panel = row == 0

        # Restrict to the window, then reshape to one row per observation.
        windowed = wide[window_start:window_end]
        tidy = windowed.reset_index().melt(id_vars=['created_at'],
                                           var_name='trained_at',
                                           value_name='sentiment')
        tidy = select_intervals(tidy, window_start, window_end)

        # One colour per distinct training time.
        colours = sns.color_palette('inferno',
                                    n_colors=tidy.trained_at.nunique())
        sns.lineplot(data=tidy,
                     x='created_at',
                     y='sentiment',
                     hue='trained_at',
                     hue_order=sorted(tidy.trained_at.unique()),
                     palette=colours,
                     solid_capstyle='round',
                     legend=is_top_panel,
                     ax=ax)

        # Rebuild the legend of the top panel to the right of the axes.
        if is_top_panel:
            handles, labels = ax.get_legend_handles_labels()
            legend = ax.legend(handles,
                               labels,
                               title='Trained at',
                               loc='center left',
                               bbox_to_anchor=(1.05, 0),
                               borderaxespad=0.,
                               frameon=False)
            legend._legend_box.align = "left"

        # Grid, monthly minor ticks, major ticks every January and July.
        ax.grid(True)
        ax.xaxis.set_minor_locator(mdates.MonthLocator())
        ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=[1, 7]))
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        ax.set_title(title, pad=4)

        # Per-axis labels are hidden; one shared y label is added below.
        ax.xaxis.label.set_visible(False)
        ax.yaxis.label.set_visible(False)
        ax.set_ylim((0, .83))

    fig.subplots_adjust(hspace=.15)
    fig.text(.02, .5, 'Sentiment index $s$',
             rotation=90,
             ha='left',
             va='center')

    # save
    save_fig(plt.gcf(), 'fig4', version=1, plot_formats=['png', 'pdf'])
Beispiel #5
0
def plot_fig2():
    """Plot model performance and relative performance change over time (figure 2).

    The figure is a 2x2 grid with one column per model (FastText, BERT).
    The top row shows the ``f1_macro`` score per ``centroid_day``, one line
    per training day; the bottom row shows the ``rel_concept_drift`` column
    produced by ``compute_concept_drift_score``.  Square markers highlight
    the points where ``centroid_day_train`` equals ``centroid_day``.  Legends
    are attached to the right-hand column only.  The figure is handed to
    ``save_fig`` as 'fig2' (png and pdf).
    """
    df_fasttext = read_data_fasttext()
    df_bert = read_data_bert()

    # constants
    ms = 2
    lw = .8
    ms_square = 2.5
    metric = 'f1_macro'

    # plot stuff
    fig, all_axes = plt.subplots(2, 2, sharex=True, figsize=(3.5, 3))
    # Transposing the axes grid yields one (top axis, bottom axis) pair per
    # column, independent of how many rows and columns the grid has.
    for i, (df, (ax1, ax2), title) in enumerate(
            zip([df_fasttext, df_bert], all_axes.T, ['FastText', 'BERT'])):
        palette = sns.color_palette('inferno',
                                    n_colors=df.centroid_day_train.nunique())

        # compute drift score
        df = compute_concept_drift_score(df, metric=metric)

        # train markers: mean per (train day, eval day), restricted to the
        # points where the model is evaluated on its own training day
        df_markers = df.groupby(['centroid_day_train', 'centroid_day'
                                 ])[[metric, 'rel_concept_drift'
                                     ]].mean().reset_index().copy()
        df_markers = df_markers[df_markers.centroid_day_train ==
                                df_markers.centroid_day]

        # performance score
        # An explicit copy makes the column assignment below operate on an
        # independent frame instead of on a slice of the original one.
        df = df[[
            'centroid_day_train', 'centroid_day', metric, 'repeat',
            'rel_concept_drift'
        ]].copy()
        df['train_day'] = df['centroid_day_train'].apply(
            lambda s: s.strftime('%Y-%m-%d'))
        df = df.sort_values(['train_day', 'centroid_day'])
        sns.lineplot(x='centroid_day',
                     y=metric,
                     hue='train_day',
                     ci=95,
                     err_style='band',
                     marker='o',
                     lw=lw,
                     ms=ms,
                     mec='none',
                     palette=palette,
                     data=df,
                     legend=i == 1,
                     ax=ax1)
        sns.lineplot(x='centroid_day',
                     y=metric,
                     hue='centroid_day_train',
                     ci=None,
                     lw=0,
                     marker='s',
                     mec='none',
                     ms=ms_square,
                     palette=palette,
                     data=df_markers,
                     ax=ax1,
                     legend=False)

        # model drift score
        sns.lineplot(x='centroid_day',
                     y='rel_concept_drift',
                     hue='centroid_day_train',
                     data=df,
                     palette=palette,
                     marker='o',
                     lw=lw,
                     ms=ms,
                     mec='none',
                     legend=False,
                     ax=ax2)
        sns.lineplot(x='centroid_day',
                     y='rel_concept_drift',
                     hue='centroid_day_train',
                     ci=None,
                     lw=0,
                     marker='s',
                     mec='none',
                     ms=ms_square,
                     palette=palette,
                     data=df_markers,
                     ax=ax2,
                     legend=False)

        # axis labels: only the left column is labelled
        if i == 0:
            ax1.set_ylabel('F1-macro')
            ax2.set_ylabel('% Relative\nperformance change')
        else:
            ax1.yaxis.label.set_visible(False)
            ax2.yaxis.label.set_visible(False)
            ax1.yaxis.set_ticklabels([])
            ax2.yaxis.set_ticklabels([])

        ax2.locator_params(axis='y', nbins=5)
        ax1.set_ylim((.32, .65))
        ax2.set_ylim((-28, 13))

        # titles
        ax1.set_title(title)

        # common formatting
        for ax in [ax1, ax2]:
            ax.grid(True)
            ax.xaxis.set_minor_locator(mdates.MonthLocator())
            ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=[1]))
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

            # lims
            ax.set_xlim((datetime(2018, 7, 1), datetime(2020, 9, 1)))
            ax.xaxis.label.set_visible(False)

        # create legends (right-hand column only)
        if i == 1:
            handles, labels = ax1.get_legend_handles_labels()
            legend_opts = dict(loc='center left',
                               bbox_to_anchor=(1.1, .2),
                               borderaxespad=0.,
                               handlelength=1,
                               handletextpad=.8,
                               frameon=False)
            leg = ax1.legend(handles,
                             labels,
                             title='Trained on data up to',
                             **legend_opts)
            leg._legend_box.align = "left"

            # Hand-made legend explaining the two marker shapes.
            leg2 = [
                Line2D([0], [0],
                       lw=0,
                       marker='s',
                       mec='none',
                       ms=3,
                       color=palette[0],
                       label='Train & evaluate'),
                Line2D([0], [0],
                       lw=0,
                       marker='o',
                       mec='none',
                       ms=3,
                       color=palette[0],
                       label='Evaluate')
            ]
            ax2.legend(handles=leg2, **legend_opts)

    fig.subplots_adjust(hspace=.1, wspace=.08)

    # save
    save_fig(plt.gcf(), 'fig2', version=1, plot_formats=['png', 'pdf'])
Beispiel #6
0
def main():
    """Plot a stacked bar chart of annotation counts per time bin (figure 1).

    The data returned by ``read_data()`` is indexed by ``centroid_day`` and
    its first row is dropped.  Helper columns mark, by row position, which
    bins count as train / eval data and as train / eval data "for b_1"; what
    is left of the ``all`` column is drawn as 'Unused'.  Three bins are
    annotated as b_0, b_1 and b_8, a curly brace with the text
    'training window for $b_1$' is added, and the figure is handed to
    ``save_fig`` as 'fig1' (png and pdf).
    """
    # read data
    df = read_data()
    df = df.set_index('centroid_day')

    # first datapoint was skipped (by accident)
    df = df.iloc[1:]

    # plot
    fig, ax = plt.subplots(1, 1, figsize=(3, 1.8))

    # compute baselines
    # All four helper columns start at zero and are filled for selected rows
    # only.  Rows are selected by position (iloc) and then addressed through
    # their index labels.
    # NOTE(review): num_train_samples_per_bin / num_test_samples_per_bin are
    # defined outside this function -- presumably module-level constants.
    df['b1_train'] = 0
    df['b1_test'] = 0
    df['train'] = 0
    df['test'] = 0
    # b1 train: row positions 1..4
    df.loc[df.iloc[1:5].index, 'b1_train'] = num_train_samples_per_bin
    # b1 test: row position 4 and every later row
    df.loc[df.iloc[4:].index, 'b1_test'] = num_test_samples_per_bin
    # train: row position 0, and row position 5 and every later row
    df.loc[df.iloc[:1].index, 'train'] = num_train_samples_per_bin
    df.loc[df.iloc[5:].index, 'train'] = num_train_samples_per_bin
    # test: row position 3 only
    df.loc[df.iloc[3:4].index, 'test'] = num_test_samples_per_bin
    # 'other' is everything assigned above; 'total' keeps the original 'all'
    # count (used for the annotation heights) before 'all' is reduced to the
    # remaining, unassigned part.
    df['other'] = df['b1_train'] + df['b1_test'] + df['train'] + df['test']
    df['total'] = df['all'].copy()
    df['all'] -= df['other']

    # plot
    # Five bar series drawn on the same x positions.  'train' and 'b1_train'
    # start at zero, the two eval series are stacked on top of
    # train + b1_train, and the unused remainder is stacked on top of 'other'.
    width = 40
    ax.bar(df.index.to_pydatetime(),
           df['train'].values.tolist(),
           width=width,
           color='C0',
           label=f'Train')
    ax.bar(df.index.to_pydatetime(),
           df['test'].values.tolist(),
           bottom=df['train'] + df['b1_train'],
           color='C3',
           width=width,
           label=f'Eval')
    ax.bar(df.index.to_pydatetime(),
           df['b1_train'].values.tolist(),
           width=width,
           color='C0',
           hatch=6 * '/',
           label='Training data $b_1$',
           ec='white')
    ax.bar(df.index.to_pydatetime(),
           df['b1_test'].values.tolist(),
           bottom=df['b1_train'] + df['train'],
           width=width,
           color='C3',
           hatch=6 * '/',
           label='Eval datasets $b_1$',
           ec='white')
    ax.bar(df.index.to_pydatetime(),
           df['all'].values.tolist(),
           bottom=df['other'].values.tolist(),
           color='.8',
           width=width,
           label='Unused')

    # annotate
    # Bin names are placed one point above the full bar height ('total') of
    # the rows at positions 3, 4 and 11.
    ax.annotate('$b_0$', (df.index[3], df.iloc[3].total),
                ha='center',
                va='bottom',
                xytext=(0, 1),
                textcoords='offset points')
    ax.annotate('$b_1$', (df.index[4], df.iloc[4].total),
                ha='center',
                va='bottom',
                xytext=(0, 1),
                textcoords='offset points')
    ax.annotate('$b_{8}$', (df.index[11], df.iloc[11].total),
                ha='center',
                va='bottom',
                xytext=(0, 1),
                textcoords='offset points')

    # legend
    # Placed to the right of the axes; entries follow the order of the bar
    # calls above.
    leg = plt.legend(loc='center left',
                     title='Annotation type',
                     bbox_to_anchor=(1.05, .5),
                     frameon=False,
                     handleheight=.4,
                     handlelength=1.2)
    # left-align the legend contents (uses a private matplotlib attribute)
    leg._legend_box.align = "left"

    # tick frequency
    # x: minor tick every month, major tick every January, labelled 'YYYY-MM';
    # y: major tick every 400
    ax.xaxis.set_minor_locator(mdates.MonthLocator())
    ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=[1]))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(base=400))

    # tick direction
    ax.tick_params(axis='x', direction='out', which='minor', zorder=2, size=2)
    ax.tick_params(axis='x', direction='out', which='major', zorder=2, size=4)

    # fixed axis ranges
    ax.set_ylim((0, 1500))
    ax.set_xlim((datetime(2017, 10, 1), datetime(2020, 9, 1)))
    ax.grid(True)

    # annotations
    # The brace is drawn in axes coordinates and then rotated by 90 degrees
    # around the point that axes coordinate (-0.092, -0.065) maps to.
    # NOTE(review): curly_brace is a helper defined elsewhere; its exact
    # geometry conventions (x, y, width, height, pointing) should be confirmed
    # there.
    ts = ax.transAxes
    coords = ts.transform([-0.092, -0.065])
    tr = mpl.transforms.Affine2D().rotate_deg_around(*coords, 90)
    t = ts + tr
    brace = curly_brace(x=.1,
                        y=.1,
                        width=.03,
                        height=.54,
                        lw=.5,
                        pointing='right',
                        transform=t,
                        color='.15')
    ax.add_artist(brace)
    # caption for the brace, positioned in (unrotated) axes coordinates
    ax.text(.25,
            .85,
            'training window\nfor $b_1$',
            ha='center',
            va='bottom',
            transform=ts,
            fontsize=7)

    # labels
    ax.set_ylabel('Number of annotations')

    # cosmetics
    sns.despine()

    # save
    save_fig(fig, 'fig1', version=1, plot_formats=['png', 'pdf'])