Esempio n. 1
0
def plot_bargraph(count_plot_df, plot_df):
    """
    Build a horizontal bar chart of lemma counts, faceted by repository.

    Arguments:
        count_plot_df - dataframe providing the "lemma", "count" and
                        "repository" columns that are drawn
        plot_df - dataframe providing "lemma" and "odds_ratio"; it is only
                  used to order the lemmas along the discrete axis

    Returns the assembled plot object.
    """
    # Counts are cast to int before they are handed to the plot.
    integer_counts = count_plot_df.astype({"count": int})

    # Lemmas sorted by ascending odds ratio define the discrete axis order.
    ranked = plot_df.sort_values("odds_ratio", ascending=True)
    lemma_order = ranked.lemma.tolist()

    graph = p9.ggplot(integer_counts, p9.aes(x="lemma", y="count"))
    graph = graph + p9.geom_col(
        position=p9.position_dodge(width=0.5),
        fill="#253494",
    )
    graph = graph + p9.coord_flip()
    graph = graph + p9.facet_wrap("repository", scales='free_x')
    graph = graph + p9.scale_x_discrete(limits=lemma_order)
    graph = graph + p9.scale_y_continuous(labels=custom_format('{:,.0g}'))
    graph = graph + p9.labs(x=None)
    graph = graph + p9.theme_seaborn(
        context='paper',
        style="ticks",
        font="Arial",
        font_scale=0.95,
    )
    # figure_size is in inches; the original note equates it to 640 x 480.
    graph = graph + p9.theme(
        figure_size=(6.66, 5),
        strip_background=p9.element_rect(fill="white"),
        strip_text=p9.element_text(size=12),
        axis_title=p9.element_text(size=12),
        axis_text_x=p9.element_text(size=10),
    )
    return graph
Esempio n. 2
0
def generate_scatter_plots(
        data,
        x="pca1",
        y="pca2",
        nsample=200,
        random_state=100,
        selected_categories=('bioinformatics', 'neuroscience'),
        color_palette=('#a6cee3', '#1f78b4'),
        save_file_path="output/pca_plots/scatterplot_files/pca01_v_pca02.svg"):
    """
    Draw, save and print a scatter plot of two components for selected
    categories.

    Arguments:
        data - dataframe with a "category" column and the columns named by
               `x` and `y`
        x - column plotted on the x axis; its last character is used in the
            axis label ("pca1" -> "PC1")
        y - column plotted on the y axis; labelled the same way as `x`
        nsample - maximum number of rows kept per category; larger groups are
                  randomly downsampled to this size
        random_state - seed passed to the per-category sampling
        selected_categories - categories to keep (any sequence of values)
        color_palette - colors paired position-by-position with
                        `selected_categories`
        save_file_path - path the figure is written to

    Returns None. The figure is saved, printed, and the current matplotlib
    figure is cleared afterwards.
    """
    # Materialise as a list so the query text is identical whatever sequence
    # type the caller passed in.
    categories = list(selected_categories)

    def _downsample(group):
        # Groups at or below the cap are kept whole.
        if len(group) > nsample:
            return group.sample(nsample, random_state=random_state)
        return group

    plot_data = (data.query(f"category in {categories}")
                 .groupby("category")
                 .apply(_downsample)
                 .reset_index(drop=True))

    category_colors = dict(zip(categories, color_palette))

    g = (p9.ggplot(plot_data) +
         p9.aes(x=x, y=y, color="factor(category)") + p9.geom_point() +
         p9.scale_color_manual(category_colors) +
         p9.labs(x=f"PC{x[-1:]}",
                 y=f"PC{y[-1:]}",
                 title="PCA of BioRxiv (Word Dim: 300)",
                 color="Article Category") +
         p9.theme_seaborn(
             context="paper", style="ticks", font="Arial", font_scale=1.3) +
         p9.theme(figure_size=(6.66, 5), dpi=300))

    # The dpi passed to save() is the one given explicitly here (250).
    g.save(save_file_path, dpi=250)
    print(g)
    plt.clf()
Esempio n. 3
0
def plot_pointplot(plot_df, y_axis_label="", use_log10=False, limits=(0, 3.2)):
    """
    Plots the pointplot of odds ratios with their lower/upper bounds.

    Arguments:
        plot_df - the dataframe that contains the lemmas ("lemma"), the odds
                  ratios ("odds_ratio") and their bounds ("lower_odds",
                  "upper_odds")
        y_axis_label - the label for the y axis
        use_log10 - use log10 for the y axis?
        limits - (min, max) limits of the continuous y axis; only used when
                 use_log10 is False

    Returns the assembled plot object.
    """
    # Lemmas sorted by ascending odds ratio define the discrete axis order.
    lemma_order = plot_df.sort_values(
        "odds_ratio", ascending=True).lemma.tolist()

    if use_log10:
        y_scale = p9.scale_y_log10()
    else:
        y_scale = p9.scale_y_continuous(limits=limits)

    graph = (
        p9.ggplot(plot_df, p9.aes(x="lemma", y="odds_ratio")) +
        p9.geom_pointrange(p9.aes(ymin="lower_odds", ymax="upper_odds"),
                           position=p9.position_dodge(width=1),
                           size=0.3,
                           color="#253494") +
        p9.scale_x_discrete(limits=lemma_order) +
        y_scale +
        # Reference line at an odds ratio of 1.
        p9.geom_hline(p9.aes(yintercept=1), linetype='--', color='grey') +
        p9.coord_flip() + p9.theme_seaborn(
            context='paper', style="ticks", font_scale=1, font='Arial') +
        p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            panel_grid_minor=p9.element_blank(),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10)) +
        p9.labs(x=None, y=y_axis_label))
    return graph
Esempio n. 4
0
def plot_fitting(x, y, resonance_frequency, parameter):
    """ Plots the phase response and the corresponding fit of the harmonic damped oscillator.

    The inputs are not modified: renamed copies are used to build the plotting table.

    Args:
        x (`pandas Series`):                     X coordinates (frequency in kHz)
        y (`pandas Series`):                     Y coordinates (phase in radians)
        resonance_frequency (`float array`):     Resonance frequency given by the fit of x and y
        parameter (`float array`):               Others parameters of function fit (Q factor, offset, linear background)

    Returns:
        p (`ggplot object`):                     Returns a ggplot object
    """

    y_fit = fit_function(x, resonance_frequency, parameter[0], parameter[1],
                         parameter[2])

    # rename() returns renamed copies, so the caller's series keep their names
    data = concat([x.rename('Frequency (kHz)'),
                   y.rename('Phase (rad)'),
                   y_fit.rename('Phase fit')], axis=1)
    col_names = list(data)

    # Plot data: raw points plus the fitted curve in red
    p = ggplot(aes(x=col_names[0], y=col_names[1]), data=data) + \
        geom_point() + \
        geom_line(aes(x=col_names[0], y=col_names[2]),  color='red', size=0.5) + \
        theme_seaborn(style='ticks', context='talk', font_scale=0.75) + \
        theme(figure_size=(15, 7), strip_background=element_rect(fill='white'), axis_line_x=element_line(color='black'),
              axis_line_y=element_line(color='black'), legend_key=element_rect(fill='white', color='white'))
    return p
Esempio n. 5
0
def plot_response_shift(x, y, resonance_frequency_without, parameter_without,
                        xx, yy, resonance_frequency_with, parameter):
    """ Plots the phase response of pre start data without and with cell attached to cantilever with the
    respective function fit.

    The inputs are not modified: renamed copies are used to build the plotting table.

    Args:
        x (`pandas Series`):                             X coordinates w/o cell (frequency in kHz)
        y (`pandas Series`):                             Y coordinates w/o cell (phase in radians)
        resonance_frequency_without (`float array`):     Resonance frequency given by the fit of x and y  w/o cell
        parameter_without (`float array`):               Others parameters of function fit (Q factor, offset, linear
                                                         background) w/o cell
        xx (`pandas Series`):                            X coordinates w/ cell(frequency in kHz)
        yy (`pandas Series`):                            Y coordinates w/ cell (phase in radians)
        resonance_frequency_with (`float array`):        Resonance frequency given by the fit of xx and yy w/ cell
        parameter (`float array`):                       Others parameters of function fit (Q factor, offset, linear
                                                         background) w/ cell

    Returns:
        p (`ggplot object`):                             Returns a ggplot object
    """

    y_fit_without = fit_function(x, resonance_frequency_without,
                                 parameter_without[0], parameter_without[1],
                                 parameter_without[2])
    y_fit_with = fit_function(xx, resonance_frequency_with, parameter[0],
                              parameter[1], parameter[2])

    # rename() returns renamed copies, so the caller's series keep their names
    data = concat([x.rename('Frequency without (kHz)'),
                   y.rename('Raw phase w/o cell att.'),
                   y_fit_without.rename('Phase fit w/o cell att.'),
                   xx.rename('Frequency with (kHz)'),
                   yy.rename('Raw phase w cell att.'),
                   y_fit_with.rename('Phase fit w cell att.')], axis=1)

    # Long-format table of the two fitted curves. melt() keys every row on the
    # "with cell" frequency, so the rows of the "w/o cell" curve are
    # overwritten with their own frequencies afterwards.
    df = melt(data,
              id_vars=['Frequency with (kHz)'],
              value_vars=['Phase fit w cell att.', 'Phase fit w/o cell att.'])
    df.loc[df['variable'] == 'Phase fit w/o cell att.',
           'Frequency with (kHz)'] = x.values

    # Same reshaping for the raw measurements
    df2 = melt(data,
               id_vars=['Frequency with (kHz)'],
               value_vars=['Raw phase w cell att.', 'Raw phase w/o cell att.'])
    df2.loc[df2['variable'] == 'Raw phase w/o cell att.',
            'Frequency with (kHz)'] = x.values

    # Plot data: raw points (fill legend) and fitted curves (color legend)
    p = ggplot(data=df) + \
        geom_point(aes(x="Frequency with (kHz)", y='value', fill='variable'), data=df2, alpha=0.6) + \
        geom_line(aes(x="Frequency with (kHz)", y='value', color='variable')) + \
        xlab('Frequency (kHz)') + \
        ylab('Phase (rad)') + \
        labs(fill='Raw data', color='Function fits') + \
        theme_seaborn(style='ticks', context='talk', font_scale=0.75) + \
        theme(figure_size=(15, 7), strip_background=element_rect(fill='white'), axis_line_x=element_line(color='black'),
              axis_line_y=element_line(color='black'), legend_key=element_rect(fill='white', color='white'))
    return p
# Attach the "document" and "preprint_doi" columns of biorxiv_journal_df to the
# cosine distance table (merge with default settings, i.e. on the columns both
# frames share), write the result as a tab-separated file and preview it.
final_cosine_df = biorxiv_journal_df[["document",
                                      "preprint_doi"]].merge(final_cosine_df)
final_cosine_df.to_csv("output/annotated_links/article_distances_cosine.tsv",
                       sep="\t",
                       index=False)
final_cosine_df.head()

# # Distribution plot

# Violin plot of "distance" per "label" for final_original_df. The values
# "pre_vs_published" / "pre_vs_random" are replaced by display names first.
g = (p9.ggplot(
    final_original_df.replace({
        "pre_vs_published": "preprint-published",
        "pre_vs_random": "preprint-random",
    })) + p9.aes(x="label", y="distance") + p9.geom_violin(fill="#a6cee3") +
     p9.labs(x="Document Pair Groups", y="Euclidean Distance") +
     p9.theme_seaborn(
         context="paper", style="ticks", font="Arial", font_scale=2) +
     p9.theme(figure_size=(11, 8.5)))
print(g)

# Same violin plot for final_cosine_proxy_df.
# NOTE(review): the y label reads "Euclidean (L2 Norm) Distance" although the
# frame is named "cosine proxy" - confirm the label is the intended one.
g = (p9.ggplot(
    final_cosine_proxy_df.replace({
        "pre_vs_published": "preprint-published",
        "pre_vs_random": "preprint-random",
    })) + p9.aes(x="label", y="distance") + p9.geom_violin(fill="#a6cee3") +
     p9.labs(x="Document Pair Groups", y="Euclidean (L2 Norm) Distance") +
     p9.theme_seaborn(
         context="paper", style="ticks", font="Arial", font_scale=2) +
     p9.theme(figure_size=(11, 8.5)))
print(g)

g = (p9.ggplot(
Esempio n. 7
0
def plot_2d_distribution_per_category(
        dataframe: pandas.DataFrame,
        label_column: str,
        coordinates: Tuple[str],
        colors: List[str],
        coloring_style: str = 'manual',
        log_10_scale: bool = False,
        theme: str = 'gray',
        alpha: float = 0.5,
        save_to_file: str = None,
        dpi: int = 150
) -> p9.ggplot:
    """
    The :func:`plot_2d_distribution_per_category` helps with providing the user with a 2-dimensional scatter plot of
    the whole distribution, colored by a label column.

    Parameters
    ----------
    dataframe: `pandas.DataFrame`, required
        The dataframe to plot. It must contain the label column and the two coordinate columns.
    label_column: `str`, required
        Name of the column that holds the labels (preferably integers starting from 0). With ``manual`` coloring
        it is treated as a discrete factor, with ``gradient`` coloring as a continuous value.
    coordinates: `Tuple[str]`, required
        Tuple of two column names: the first one holds the `x` values, the second one the `y` values.
    colors: `List[str]`, required
        For ``manual`` coloring, one color per unique value of the label column. For ``gradient`` coloring,
        exactly two colors: the low end and the high end of the gradient.
    coloring_style: `str`, optional (default='manual')
        Either `manual` or `gradient` which helps assigning colors to clusters.
    log_10_scale: `bool`, optional (default=False)
        If true, the `x` axis is drawn on a log10 scale (the `y` axis is left untouched).
    theme: `str`, optional (default='gray')
        This is the `theme` types, the accepted values are: ``['gray', 'dark', 'seaborn', 'light']``, the values
        are consistent with `plotnine` package's format.
    alpha: `float`, optional (default=0.5)
        The transparency of the points.
    save_to_file: `str`, optional (default=None)
        If given, the plot is saved to this filepath; otherwise the plot is drawn.
    dpi: `int`, optional (default=150)
        The dpi for saving the plots indicating the image quality.

    Returns
    ----------
    The output of this method is of `p9.ggplot` type.

    Raises
    ----------
    AssertionError
        If `coloring_style` is unknown or the number of colors does not fit the chosen style.
    Exception
        If `theme` is not one of the supported values.
    """
    assert coloring_style in ['manual', 'gradient'], "invalid coloring style"

    if coloring_style == 'gradient':
        assert len(colors) == 2, "you have chosen gradient style coloring, for colors you have to provide a list with the \
            First element being the color for low and the second the color for high."
        pplot = p9.ggplot(data=dataframe, mapping=p9.aes(x=coordinates[0], y=coordinates[1], color=label_column))
        pplot += p9.scale_color_gradient(low=colors[0], high=colors[1])
    elif coloring_style == 'manual':
        assert len(colors) == len(dataframe[label_column].unique()), "You have chosen per category manual coloring, therefore you have to provide the same number of colors"
        pplot = p9.ggplot(data=dataframe, mapping=p9.aes(x=coordinates[0], y=coordinates[1], color='factor(' + label_column + ')'))
        # The mapped aesthetic is `color`, so the user colors must go to the
        # color scale (an alpha scale would never apply them).
        pplot += p9.scale_color_manual(colors)

    pplot += p9.geom_point(alpha=alpha)
    pplot += p9.xlab(coordinates[0]) + p9.ylab(coordinates[1])

    if log_10_scale:
        pplot += p9.scale_x_log10()

    if theme == 'gray':
        pplot += p9.theme_gray()
    elif theme == 'dark':
        pplot += p9.theme_dark()
    elif theme == 'seaborn':
        pplot += p9.theme_seaborn()
    elif theme == 'light':
        pplot += p9.theme_light()
    else:
        raise Exception('Theme type not supported, please add.')

    # Applied after the named theme so the text size override takes effect.
    pplot += p9.theme(text=p9.element_text(size=8))

    if save_to_file is not None:
        save_directory, filename = separate_path_and_file(filepath=save_to_file)
        pplot.save(filename=filename, path=save_directory, dpi=dpi)
    else:
        pplot.draw()

    return pplot
        y="KL_divergence",
        fill="Comparison",
        color="Comparison",
        group="Comparison",
    ) + p9.geom_point(size=2) + p9.geom_line(linetype="dashed") +
     p9.scale_fill_brewer(type="qual", palette="Paired", direction=-1) +
     p9.scale_color_brewer(
         type="qual",
         palette="Paired",
         direction=-1,
     ) + p9.labs(
         x="Number of terms evaluated",
         y="Kullback–Leibler Divergence",
     ) + p9.theme_seaborn(
         context="paper",
         style="ticks",
         font_scale=1.8,
     ) + p9.theme(figure_size=(11, 8.5), text=p9.element_text(family="Arial")))
g.save("output/svg_files/corpora_kl_divergence.svg")
g.save("output/figures/corpora_kl_divergence.png", dpi=500)
print(g)

kl_divergence_special_char_df = pd.read_csv(
    "output/comparison_stats/corpora_kl_divergence_special_chars_removed.tsv",
    sep="\t")
kl_divergence_special_char_df.head()

g = (p9.ggplot(
    kl_divergence_special_char_df.replace({
        "biorxiv_vs_pmc": "bioRxiv-PMC",
        "biorxiv_vs_nytac": "bioRxiv-NYTAC",
Esempio n. 9
0
def quick_color_check(target_matrix, source_matrix, num_chips):
    """ Quickly plot target matrix values against source matrix values to determine
    over saturated color chips or other issues.

    Inputs:
    source_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the source image
    target_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the target image
    num_chips          = number of color card chips included in the matrices (integer)

    The function returns nothing: it increments params.device and, depending on params.debug,
    saves the plot ('print') or prints it ('plot').

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, scale_x_continuous, \
        scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info: columns 1, 2 and 3 are read as the
    # red, green and blue values (column 0 is not used here).
    # The width-1 slices keep each channel as a (num_chips, 1) column.
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = ['red'] * num_chips
    blue = ['blue'] * num_chips
    green = ['green'] * num_chips

    # Chip numbers 0..num_chips-1, repeated once per color channel
    chips = np.tile(np.arange(0, num_chips), 3)

    # Combine info (np.vstack replaces the deprecated np.row_stack alias)
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.vstack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({
        'source': all_color_data[:, 0],
        'target': all_color_data[:, 1],
        'color': all_color_data[:, 2]
    })

    # Add chip numbers to the dataframe. Stacking with the text labels turned
    # the numeric columns into strings, so they are cast back to float here.
    dataset['chip'] = chips
    dataset = dataset.astype({
        'color': str,
        'chip': str,
        'target': float,
        'source': float
    })

    # Make the plot: one facet per color channel, each chip labelled with its number
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Autoincrement the device counter
    params.device += 1

    # Save or show the plot depending on the debug mode
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
Esempio n. 10
0
def quick_color_check(target_matrix, source_matrix, num_chips):
    """ Quickly plot target matrix values against source matrix values to determine
    over saturated color chips or other issues.

    Inputs:
    source_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the source image
    target_matrix      = a 22x4 matrix containing the average red value, average green value, and
                             average blue value for each color chip of the target image
    num_chips          = number of color card chips included in the matrices (integer)

    The function returns nothing: depending on params.debug it saves the plot ('print') or
    prints it ('plot').

    :param source_matrix: numpy.ndarray
    :param target_matrix: numpy.ndarray
    :param num_chips: int
    """
    # Imports
    from plotnine import ggplot, geom_point, geom_smooth, theme_seaborn, facet_grid, geom_label, scale_x_continuous, \
        scale_y_continuous, scale_color_manual, aes
    import pandas as pd

    # Extract and organize matrix info: columns 1, 2 and 3 are read as the
    # red, green and blue values (column 0 is not used here).
    # The width-1 slices keep each channel as a (num_chips, 1) column.
    tr = target_matrix[:num_chips, 1:2]
    tg = target_matrix[:num_chips, 2:3]
    tb = target_matrix[:num_chips, 3:4]
    sr = source_matrix[:num_chips, 1:2]
    sg = source_matrix[:num_chips, 2:3]
    sb = source_matrix[:num_chips, 3:4]

    # Create columns of color labels
    red = ['red'] * num_chips
    blue = ['blue'] * num_chips
    green = ['green'] * num_chips

    # Chip numbers 0..num_chips-1, repeated once per color channel
    chips = np.tile(np.arange(0, num_chips), 3)

    # Combine info (np.vstack replaces the deprecated np.row_stack alias)
    color_data_r = np.column_stack((sr, tr, red))
    color_data_g = np.column_stack((sg, tg, green))
    color_data_b = np.column_stack((sb, tb, blue))
    all_color_data = np.vstack((color_data_b, color_data_g, color_data_r))

    # Create a dataframe with headers
    dataset = pd.DataFrame({'source': all_color_data[:, 0], 'target': all_color_data[:, 1],
                            'color': all_color_data[:, 2]})

    # Add chip numbers to the dataframe. Stacking with the text labels turned
    # the numeric columns into strings, so they are cast back to float here.
    dataset['chip'] = chips
    dataset = dataset.astype({'color': str, 'chip': str, 'target': float, 'source': float})

    # Make the plot: one facet per color channel, each chip labelled with its number
    p1 = ggplot(dataset, aes(x='target', y='source', color='color', label='chip')) + \
        geom_point(show_legend=False, size=2) + \
        geom_smooth(method='lm', size=.5, show_legend=False) + \
        theme_seaborn() + facet_grid('.~color') + \
        geom_label(angle=15, size=7, nudge_y=-.25, nudge_x=.5, show_legend=False) + \
        scale_x_continuous(limits=(-5, 270)) + scale_y_continuous(limits=(-5, 275)) + \
        scale_color_manual(values=['blue', 'green', 'red'])

    # Save or show the plot depending on the debug mode
    if params.debug is not None:
        if params.debug == 'print':
            p1.save(os.path.join(params.debug_outdir, 'color_quick_check.png'))
        elif params.debug == 'plot':
            print(p1)
        'aupr_upper':
        lambda x: x.aupr_mean +
        (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len),
        'aupr_lower':
        lambda x: x.aupr_mean -
        (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len)
    }))
dev_set_stats_df

# In[9]:

# Plot auroc_mean against lf_num (treated as a discrete factor), with one
# colored point series and connecting line per model and error bars spanning
# auroc_lower..auroc_upper. The two model names get fixed colors.
(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="auroc_mean", color="model")) +
 p9.geom_point() + p9.geom_line(p9.aes(group="model")) + p9.geom_errorbar(
     p9.aes(ymin="auroc_lower", ymax="auroc_upper", group="model")) +
 p9.theme_seaborn() + p9.labs(title="CtD Tune Set AUROC", color="Model") +
 p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
 }))

# In[10]:

(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="aupr_mean", color="model")) +
 p9.geom_point() + p9.geom_line(p9.aes(group="model")) + p9.geom_errorbar(
     p9.aes(ymin="aupr_lower", ymax="aupr_upper", group="model")) +
 p9.theme_seaborn() + p9.labs(title="CtD Tune Set AUPR", color="Model") +
 p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
def plot_pointgraph(
        plot_df,
        x_axis_label,
        left_arrow_label,
        right_arrow_label,
        left_arrow_start=-0.5,
        left_arrow_height=38.5,
        right_arrow_start=0.5,
        right_arrow_height=1.5,
        arrow_length=2,
        left_arrow_label_x=-1.5,
        left_arrow_label_y=-1.5,
        right_arrow_label_x=-1.5,
        right_arrow_label_y=-1.5,
        limits=(-3, 3),
):
    """
    Build a horizontal error-bar graph of each lemma's odds ratio, annotated
    with one arrow pointing left and one pointing right plus a text label for
    each arrow.

    Args:
        plot_df - data frame with "lemma", "odds_ratio", "lower_odds" and
                  "upper_odds" columns
        x_axis_label - label of the x axis
        left_arrow_label - text shown for the left arrow
        right_arrow_label - text shown for the right arrow
        left_arrow_start - x position where the left arrow starts
        left_arrow_height - y position of the left arrow
        right_arrow_start - x position where the right arrow starts
        right_arrow_height - y position of the right arrow
        arrow_length - length of both arrows along the x axis
        left_arrow_label_x - x position of the left arrow's label
        left_arrow_label_y - y position of the left arrow's label
        right_arrow_label_x - x position of the right arrow's label
        right_arrow_label_y - y position of the right arrow's label
        limits - (min, max) limits of the x axis

    Returns the assembled plot object.
    """

    def _arrow_segment(start, end, height):
        # Horizontal black arrow from `start` to `end` at `height`.
        return p9.annotate(
            "segment",
            x=start,
            xend=end,
            y=height,
            yend=height,
            colour="black",
            size=0.5,
            alpha=1,
            arrow=p9.arrow(length=0.1),
        )

    def _arrow_text(text, x_pos, y_pos):
        # Text annotation that accompanies an arrow.
        return p9.annotate(
            "text",
            label=text,
            x=x_pos,
            y=y_pos,
            size=12,
            alpha=0.7,
        )

    categorical_df = plot_df.assign(
        lemma=lambda frame: pd.Categorical(frame.lemma.tolist()))
    mapping = p9.aes(
        y="lemma",
        xmin="lower_odds",
        x="odds_ratio",
        xmax="upper_odds",
        yend="lemma",
    )
    # Lemmas sorted by ascending odds ratio define the discrete axis order.
    lemma_order = plot_df.sort_values("odds_ratio",
                                      ascending=True).lemma.tolist()

    graph = p9.ggplot(categorical_df, mapping)
    graph = graph + p9.geom_errorbarh(color="#253494")
    graph = graph + p9.scale_y_discrete(limits=lemma_order)
    graph = graph + p9.scale_x_continuous(limits=limits)
    # Dashed reference line at x = 0.
    graph = graph + p9.geom_vline(
        p9.aes(xintercept=0), linetype="--", color="grey")

    graph = graph + _arrow_segment(
        left_arrow_start, left_arrow_start - arrow_length, left_arrow_height)
    graph = graph + _arrow_text(
        left_arrow_label, left_arrow_label_x, left_arrow_label_y)
    graph = graph + _arrow_segment(
        right_arrow_start, right_arrow_start + arrow_length,
        right_arrow_height)
    graph = graph + _arrow_text(
        right_arrow_label, right_arrow_label_x, right_arrow_label_y)

    graph = graph + p9.theme_seaborn(
        context="paper", style="ticks", font_scale=1, font="Arial")
    graph = graph + p9.theme(
        figure_size=(11, 8.5),
        panel_grid_minor=p9.element_blank(),
        text=p9.element_text(size=12),
    )
    graph = graph + p9.labs(y=None, x=x_axis_label)

    return graph
# Output t-test results
# Compares the 'weight' values of the mutant and wild-type groups with
# equal_var=False (no equal-variance assumption), then prints the statistic
# with two decimals and the p-value in scientific notation.
# NOTE(review): ttest_ind is presumably scipy.stats.ttest_ind - confirm.
t_results_geo_targene = ttest_ind(a = targene_geo_mutant['weight'],
                              b = targene_geo_wt['weight'], equal_var = False)
print('Statistic = {:.2f}, p = {:.2E}'.format(t_results_geo_targene[0],
                                              Decimal(t_results_geo_targene[1])))

# graphical output for predictions
p = (gg.ggplot(output,
               gg.aes(x='weight', y='dummy_y', color='factor(status_sign)')) +
     gg.geom_hline(gg.aes(yintercept=0), linetype='solid') +
     gg.geom_point(size=4) +
     gg.scale_color_manual(values=["#377eb8", "#ff7f00"], labels=['WT', 'Mutant']) +
     gg.ylim([-0.1, 0.1]) +
     gg.xlim([-0.001, 1.001]) +
     gg.theme_seaborn(style='whitegrid') +
     gg.xlab('Targene Classifier Score') +
     gg.ylab('') +
     gg.labs(color='Sample_status') +
     gg.ggtitle('Mutant vs WT \n') +
     gg.theme(
        plot_title=gg.element_text(size=22),
        axis_title_x=gg.element_text(size=16),
        axis_text_x=gg.element_text(size=16),
        axis_text_y=gg.element_blank(),
        axis_ticks_length=4,
        axis_ticks_major_y=gg.element_blank(),
        axis_ticks_minor_y=gg.element_blank(),
        axis_ticks_minor_x=gg.element_blank(),
        legend_position=(1.02, 0.8),
        legend_background=gg.element_blank(),
    # plot the x axis titles
    + p9.geom_vline(xintercept=[2.5, 14.5, 26.5, 38.5, 50.5, 62.5, 74.5]) +
    p9.geom_text(label="2014", x=8.5, y=0, color="black") +
    p9.geom_text(label="2015", x=20.5, y=0, color="black") +
    p9.geom_text(label="2016", x=32.5, y=0, color="black") +
    p9.geom_text(label="2017", x=44.5, y=0, color="black") +
    p9.geom_text(label="2018", x=56.5, y=0, color="black") +
    p9.geom_text(label="2019", x=68.5, y=0, color="black")

    # Plot the overall proportion published
    + p9.geom_hline(
        yintercept=0.4196, linetype='solid', color=color_mapper['2018']) +
    p9.geom_hline(yintercept=published / posted,
                  linetype="solid",
                  color=color_mapper['2020ML']) +
    p9.annotate("text", x=8.5, y=0.395, label="overall: 0.4196", size=8) +
    p9.annotate("text",
                x=8.5,
                y=0.48,
                label=f"overall: {published/posted:.4f}",
                size=8) +
    p9.theme_seaborn(style='ticks', context='paper', font_scale=1.5) +
    p9.theme(figure_size=(10, 4.5),
             axis_text_x=p9.element_blank(),
             axis_title_x=p9.element_text(margin={"t": 15})) +
    p9.labs(y="Proportion Published", x="Month"))
g.save("output/figures/publication_rate.svg", dpi=250)
g.save("output/figures/publication_rate.png", dpi=250)
print(g)
    entire_preprint_df = entire_preprint_df.append(
        kmf.survival_function_.reset_index().assign(category=cat))

# In[12]:

# Dashed line plot of KM_estimate over time, one colored line per category.
# The "timeline" column is converted to timedeltas measured in days and rows
# whose category is 'none' are dropped before plotting. The y axis is fixed
# to the range 0..1 and the x axis labels are formatted in days.
g = (
    p9.ggplot(
        entire_preprint_df.assign(timeline=lambda x: pd.to_timedelta(
            x.timeline, "D")).query("category != 'none'"),
        p9.aes(x="timeline", y="KM_estimate", color="category"),
    ) + p9.geom_line(linetype="dashed", size=0.7) + p9.ylim(0, 1) +
    p9.scale_x_timedelta(labels=timedelta_format("d")) + p9.labs(
        x="timeline (days)",
        y="proportion of unpublished biorxiv paper",
        title="Preprint Survival Curves",
    ) + p9.theme_seaborn(context="paper", style="white", font_scale=1.2) +
    p9.theme(axis_ticks_minor_x=p9.element_blank(),
             # legend_position=(0.5, -0.2),
             # legend_direction='horizontal'
             ))
# Write the figure as both SVG and PNG, then display it.
g.save("output/preprint_category_survival_curves.svg", dpi=500)
g.save("output/preprint_category_survival_curves.png", dpi=500)
print(g)

# In[13]:

# Build a table from the half_life records. Infinite values are replaced by
# the largest "time_to_published" value in temp_df, expressed in days
# (seconds / 60 / 60 / 24).
category_half_life = pd.DataFrame.from_records(half_life).replace(
    np.inf,
    (temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24).max())
category_half_life
Esempio n. 16
0
                 doc_distances=np.arange(1, 13) * 0.02669 + 0.8697,
             )),
         linetype="dashed",
         color="#1f78b4",
         size=1,
     ) + p9.annotate(
         "text",
         label=f"y={results.slope:0.4f}*X + {results.intercept:0.4f}",
         x=9,
         y=7.5,
         size=13,
         color="#1f78b4",
     ) + p9.labs(
         x="# of Preprint Versions",
         y="Euclidean Distance of Preprint-Published Versions",
     ) + p9.theme_seaborn(style="white", context="notebook"))
g.save("output/version_count_doc_distances.svg")
g.save("output/version_count_doc_distances.png", dpi=600)
print(g)

# Overall, preprints change with each new version; however, based on the magnitude of the slope I'd argue that these changes are minor compared to substantial changes (~6 distance units)

# # Output published dates to Excel

# Reviewer asked if manually pursuing preprints that take longer to publish would produce any interesting results. Great question, but not enough time to go into that; however, providing a supplementary file for others to look into could provide an in depth answer.

excel_print_df = published_date_distances.drop(
    ["document", "category", "pmcoa"], axis=1).rename(
        index=str,
        columns={
            "preprint_date": "posted_date",
    p9.geom_text(label="2017", x=44.5, y=0, color="black", size=13) +
    p9.geom_text(label="2018", x=56.5, y=0, color="black", size=13) +
    p9.geom_text(label="2019", x=68.5, y=0, color="black", size=13)
    # Plot the overall proportion published
    + p9.geom_hline(
        yintercept=0.4196, linetype="solid", color=color_mapper["2018"]) +
    p9.geom_hline(yintercept=published / posted,
                  linetype="solid",
                  color=color_mapper["2020ML"]) +
    p9.annotate("text", x=8.5, y=0.395, label="overall: 0.4196", size=14) +
    p9.annotate("text",
                x=8.5,
                y=0.48,
                label=f"overall: {published/posted:.4f}",
                size=14) +
    p9.theme_seaborn(
        style="ticks", context="paper", font="Arial", font_scale=2) + p9.theme(
            figure_size=(11, 6.5),
            axis_text_x=p9.element_blank(),
            axis_title_x=p9.element_text(margin={"t": 15}),
        ) + p9.labs(y="Proportion Published", x="Month"))
g.save("output/figures/publication_rate.svg")
g.save("output/figures/publication_rate.png", dpi=250)
print(g)

# # Plot Publication Rate

# +
publish_rate_df["pub_month"] = pd.Categorical(
    publish_rate_df.pub_month.values.tolist(), ordered=True)

posted_recency_adj = (
Esempio n. 18
0
# +
# Collect the result records into a dataframe. "dataset" becomes a
# categorical with the train split listed before the test split, which fixes
# the order of the facets below. The table is saved as TSV and previewed.
result_df = pd.DataFrame.from_records(results)

result_df["dataset"] = pd.Categorical(
    result_df.dataset.tolist(),
    categories=["train (cross validation)", "test"])
result_df.to_csv("output/knn_results.tsv", sep="\t", index=False)
result_df.head()

# +
# Horizontal bar chart of "fold_change" (the renamed "value" column) per
# model, restricted to the euclidean distance rows and faceted by dataset.
# NOTE(review): labs(fill=...) names a legend, but no fill aesthetic is mapped
# and show_legend=False - the fill label looks unused; confirm.
g = (p9.ggplot(
    result_df.query("distance in ['euclidean']").rename(
        index=str, columns={"value": "fold_change"}),
    p9.aes(x="model", y="fold_change"),
) + p9.geom_col(position="dodge", show_legend=False, fill="#1f78b4") +
     p9.coord_flip() + p9.facet_wrap("dataset") + p9.theme_seaborn(
         context="paper", style="ticks", font="Arial", font_scale=1.3) +
     p9.theme(figure_size=(6.66, 5)) +
     p9.labs(y="Fold Change Over Random", fill="Distance Metric"))

# Write the figure as SVG and as a 250 dpi PNG, then display it.
g.save(Path("output") / Path("figures") / Path("knn_result.svg"))

g.save(Path("output") / Path("figures") / Path("knn_result.png"), dpi=250)

print(g)

# -

# # Generate 2D Visualization

# ## Use SAUCIE on PMC
Esempio n. 19
0
) + p9.geom_boxplot(fill="#a6cee3") + p9.geom_line(
    mapping=p9.aes(x="version_count", y="time_to_published"),
    stat="smooth",
    method="lm",
    linetype="dashed",
    se=False,
    alpha=1,
    size=0.7,
    inherit_aes=False,
) + p9.scale_y_timedelta(labels=timedelta_format("d")) + p9.annotate(
    "text",
    x=9,
    y=timedelta(days=1470),
    label=f"Y={results_2.slope:.2f}*X+{results_2.intercept:.2f}",
) + p9.labs(x="# of Preprint Versions",
            y="Time Elapsed Until Preprint is Published") + p9.theme_seaborn(
                context="paper", style="ticks", font="Arial", font_scale=1.3))
# g.save("output/version_count_vs_publication_time.svg", dpi=500)
# g.save("output/version_count_vs_publication_time.png", dpi=500)
print(g)

# Seaborn boxen plot of days_to_published per version_count on an 8 x 5 inch
# figure, with axis labels set afterwards and a dashed black line overlaid.
plt.figure(figsize=(8, 5))
g = sns.boxenplot(
    x="version_count",
    y="days_to_published",
    data=published_date_distances,
    scale="linear",
    palette="YlGnBu",
)
_ = g.set_ylabel("Time Elapsed Until Preprint is Published (Days)")
_ = g.set_xlabel("# of Preprint Versions")
# NOTE(review): x_line is shifted by -1, presumably because the categorical
# axis positions start at 0 while version counts start at 1 - confirm.
_ = g.plot(x_line - 1, y_line, "--k")
Esempio n. 20
0
    def test_theme_seaborn(self):
        """Apply theme_seaborn (plus the shared _theme) to the base plot and
        check the result against the name 'theme_seaborn'."""
        titled = self.g + labs(title='Theme Seaborn')
        themed = titled + theme_seaborn()
        final_plot = themed + _theme

        assert final_plot == 'theme_seaborn'
Esempio n. 21
0
    def test_theme_seaborn(self):
        """Apply theme_seaborn (plus the shared _theme) to the base plot and
        check the result against the name 'theme_seaborn'."""
        titled = self.g + labs(title='Theme Seaborn')
        themed = titled + theme_seaborn()
        final_plot = themed + _theme

        assert final_plot == 'theme_seaborn'
Esempio n. 22
0
def plot_violinbox_plots_per_category(
        dataframe: pandas.DataFrame,
        plot_type: str,
        target_feature: str,
        label_column: str,
        colors: List[str],
        coloring_style: str,
        value_skip_list: List = None,
        jitter_alpha: float = 0.7,
        plot_alpha: float = 0.5,
        log_10_scale: bool = False,
        theme: str = 'gray',
        save_to_file: str = None,
        dpi: int = 150,
        show: bool = True
) -> p9.ggplot:
    """
    Plot the distribution of one feature per category as a box or violin plot over jittered points.

    Parameters
    ----------
    dataframe: `pandas.DataFrame`, required
        The data to plot. It must contain the label column (preferably integers starting from 0) and the
        numeric feature column whose distribution is to be monitored.
    plot_type: `str`, required
        Either `box` or `violin`; determines the type of plot.
    target_feature: `str`, required
        Name of the feature column to monitor; it is mapped to the y axis.
    label_column: `str`, required
        Name of the label column; it is mapped to the x axis as a factor.
    colors: `List[str]`, required
        For `coloring_style='manual'`, one color per distinct label. For `coloring_style='gradient'`, exactly
        two colors: the low end and the high end of the gradient.
    coloring_style: `str`, required
        Either `manual` or `gradient`; selects how colors are assigned to the categories.
    value_skip_list: `List`, optional (default=None)
        Feature values to drop before plotting (for example a sentinel such as -10000000). `None` or an empty
        list keeps every row.
    jitter_alpha: `float`, optional (default=0.7)
        Transparency of the jittered points.
    plot_alpha: `float`, optional (default=0.5)
        Transparency of the box / violin layer.
    log_10_scale: `bool`, optional (default=False)
        If True, the feature (y) axis is drawn on a base-10 logarithmic scale.
    theme: `str`, optional (default='gray')
        The `plotnine` theme to use; accepted values are ``['gray', 'dark', 'seaborn', 'light']``.
    save_to_file: `str`, optional (default=None)
        If given, the plot is saved to this filepath.
    dpi: `int`, optional (default=150)
        The dpi used when saving the plot.
    show: `bool`, optional (default=True)
        Whether the plot is drawn before returning.

    Returns
    -------
    The assembled plot, of `p9.ggplot` type.

    Raises
    ------
    AssertionError
        If the number of `colors` does not fit the chosen `coloring_style`.
    ValueError
        If `coloring_style`, `plot_type` or `theme` is not one of the supported values.
    """
    # Always bind `df`: the original only defined it when a skip list was given (NameError otherwise)
    # and then plotted the unfiltered frame, so the skip list had no effect.
    df = dataframe
    if value_skip_list is not None and len(value_skip_list) > 0:
        df = dataframe[~dataframe[target_feature].isin(value_skip_list)]

    label_factor = f'factor({label_column})'

    # AssertionError is raised explicitly (not via `assert`) so the check keeps its historical exception
    # type but is not stripped when Python runs with -O.
    if coloring_style == 'gradient':
        if len(colors) != 2:
            raise AssertionError(
                "you have chosen gradient style coloring, for colors you have to provide a list with the "
                "First element being the color for low and the second the color for high.")
        pplot = p9.ggplot(data=df, mapping=p9.aes(x=label_factor, y=target_feature, color=label_column))
        pplot += p9.scale_color_gradient(low=colors[0], high=colors[1])
    elif coloring_style == 'manual':
        if len(colors) != len(df[label_column].unique()):
            raise AssertionError(
                "You have chosen per category manual coloring, therefore you have to provide the same "
                "number of colors")
        pplot = p9.ggplot(data=df, mapping=p9.aes(x=label_factor, y=target_feature, color=label_factor))
        # The mapped aesthetic is `color`, so the manual palette must go on the color scale
        # (the original used scale_alpha_manual, which never applied the colors).
        pplot += p9.scale_color_manual(colors)
    else:
        # Previously fell through to an UnboundLocalError on `pplot`.
        raise ValueError('unknown coloring style, it must be manual or gradient.')

    pplot += p9.geom_jitter(alpha=jitter_alpha)

    geoms = {'box': p9.geom_boxplot, 'violin': p9.geom_violin}
    if plot_type not in geoms:
        raise ValueError('unknown plot type, it must be violin or box.')
    pplot += geoms[plot_type](alpha=plot_alpha)

    themes = {
        'gray': p9.theme_gray,
        'dark': p9.theme_dark,
        'seaborn': p9.theme_seaborn,
        'light': p9.theme_light,
    }
    if theme not in themes:
        raise ValueError('Theme type not supported, please add.')
    pplot += themes[theme]()

    if log_10_scale:
        # The feature lives on the y axis; x is a discrete factor and cannot take a log scale.
        pplot += p9.scale_y_log10()

    if save_to_file is not None:
        save_directory, filename = separate_path_and_file(filepath=save_to_file)
        pplot.save(filename=filename, path=save_directory, dpi=dpi)

    if show:
        pplot.draw()

    return pplot
Esempio n. 23
0
# Report the per-fold scores stored for the "polka" class at the index
# best_result[0], followed by their mean (a bare expression, so it is only
# echoed when it ends a notebook cell).
print("Best CV Fold")
print(model.scores_["polka"][:, best_result[0]])
model.scores_["polka"][:, best_result[0]].mean()

# One row per principal component, numbered from 1.
# NOTE(review): assumes model.coef_[0] holds exactly 50 weights to match
# range(1, 51) -- confirm against the fitted model.
model_weights_df = pd.DataFrame.from_dict({
    "weight": model.coef_[0],
    "pc": list(range(1, 51)),
})
# Categorical so the discrete x scale below can order the components.
model_weights_df["pc"] = pd.Categorical(model_weights_df["pc"])
model_weights_df.head()

# Horizontal bar chart of the weights. The limits run 50 -> 1 so that, after
# coord_flip, component 1 ends up at the top of the chart.
g = (p9.ggplot(model_weights_df, p9.aes(x="pc", y="weight")) +
     p9.geom_col(position=p9.position_dodge(width=5), fill="#253494") +
     p9.coord_flip() +
     p9.scale_x_discrete(limits=list(range(50, 0, -1))) +
     p9.theme_seaborn(
         context="paper", style="ticks", font_scale=1.1, font="Arial") +
     p9.theme(figure_size=(10, 8)) + p9.labs(title="Regression Model Weights",
                                             x="Principal Component",
                                             y="Model Weight"))
# g.save("output/figures/pca_log_regression_weights.svg")
# g.save("output/figures/pca_log_regression_weights.png", dpi=250)
print(g)

# Swap the first two axes of the coefficient paths stored for "polka".
# NOTE(review): presumably this turns (fold, C, feature) into
# (C, fold, feature) -- confirm against the model's documentation.
fold_features = model.coefs_paths_["polka"].transpose(1, 0, 2)

# feat_num: for each entry along the first axis, how many positions on the
# last axis are non-zero in at least one slice of the middle axis.
model_performance_df = pd.DataFrame.from_dict({
    "feat_num": fold_features.astype(bool).any(axis=1).sum(axis=1),
    "C": model.Cs_,
    "score": model.scores_["polka"].mean(axis=0),
})
result_df.head()


# In[5]:


# Horizontal, dodged bar chart of `value` per `model`, one facet per dataset,
# restricted to rows whose distance is 'euclidean' or 'N/A'. Bars are filled
# by distance using the two manual colours given below.
g = (p9.ggplot(result_df.query("distance in ['euclidean', 'N/A']"),
               p9.aes(x="model", y="value")) +
     p9.geom_col(p9.aes(fill="factor(distance)"), position="dodge") +
     p9.coord_flip() +
     p9.facet_wrap("dataset") +
     p9.scale_fill_manual(["#808080", "#1f78b4"]) +
     p9.theme_seaborn(
         context='paper', style="ticks", font="Arial", font_scale=1.3) +
     p9.theme(figure_size=(6.66, 5)) +
     p9.labs(y="Accuracy", fill="Distance Metric"))

g.save(
    Path("output")/
    Path("figures")/
    Path("knn_result.svg"),