Example 1
from itertools import product

import pandas as pd
from plotnine import (aes, geom_col, ggplot, ggtitle, save_as_pdf_pages,
                      scale_fill_manual)


def plot_hypothesis(hypothesis, file_name):
    bin_types = list(hypothesis)
    scores = list(hypothesis[bin_types[0]])
    plots = []
    for bin_type, score in product(bin_types, scores):
        mean_name = "Mean: " + score
        mean_rows = []
        test_rows = []
        for bin_ in hypothesis[bin_type][score]:
            h = list(bin_.values())[0]
            bin_name = list(bin_)[0]
            mean_rows.append({
                "Bin": bin_name,
                "Dataset": h.p1,
                mean_name: str(round(float(h.mean1), 3)),
            })
            mean_rows.append({
                "Bin": bin_name,
                "Dataset": h.p2,
                mean_name: str(round(float(h.mean2), 3)),
            })
            test_rows.append({
                "Bin": bin_name,
                "t-statistic": str(round(h.t, 3)),
                "p-value": str(h.p),
                "95% Confidence":
                    "Significant" if h.p <= 0.05 else "Not Significant",
            })
        # DataFrame.append was removed in pandas 2.0; build the frames from the
        # collected rows instead.
        df = pd.DataFrame(mean_rows, columns=["Bin", "Dataset", mean_name])
        df2 = pd.DataFrame(
            test_rows,
            columns=["Bin", "t-statistic", "p-value", "95% Confidence"])
        plots.append(
            ggplot(df, aes(x='Bin', y=mean_name, fill='Dataset')) +
            geom_col(position='dodge') +
            ggtitle("{0} bin distribution | {1}\nBin's Average Scores".format(
                bin_type, score)))
        plots.append(
            ggplot(df2, aes(x='Bin', y='p-value', fill='95% Confidence')) +
            geom_col(width=0.2) +
            ggtitle("{0} bin distribution | {1}\nBin's 95% Confidence Level Test"
                    .format(bin_type, score)) +
            scale_fill_manual(values={
                'Significant': "#214517",
                'Not Significant': '#c62f2d'
            }))
    save_as_pdf_pages(plots, file_name)
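
Every example on this page eventually hands a list of plots to plotnine's save_as_pdf_pages. A minimal, self-contained sketch of just that call, using made-up data and an illustrative file name, looks like this:

import pandas as pd
from plotnine import aes, geom_col, ggplot, ggtitle, save_as_pdf_pages

# Illustrative data only; each plot in the list becomes one page of the PDF.
df = pd.DataFrame({"x": ["a", "b", "c"], "y": [1, 3, 2]})
plots = [
    ggplot(df, aes(x="x", y="y")) + geom_col() + ggtitle("Page 1"),
    ggplot(df, aes(x="x", y="y")) + geom_col(width=0.3) + ggtitle("Page 2"),
]
save_as_pdf_pages(plots, filename="example_pages.pdf")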
Example 2
def plots_by_site(self, as_pdf=True, filename="plot/figs/All_sites_by_plot.pdf"):
    self.opts["plt_group_by"] = ["site", "plot", "julian", "type"]
    if self.plot_data is None:
        self.create_plot_data()
    sites = self.plot_data["site"].unique()
    # Update plot options
    plot_options = self.get_plot_options()
    plot_options["colour"] = "plot"
    plot_options["facet_by"] = "plot"
    plots = [self.plot_plots(site, plot_options, as_pdf) for site in sites]
    plots = [plot for plot in plots if plot is not None]
    if as_pdf:
        save_as_pdf_pages(plots, filename)
Example 3
import os
from datetime import datetime

import plotnine as p9


def save_as_pdf(plot: p9.ggplot,
                filename: str = None,
                path: str = None,
                dpi: int = None,
                verbose: bool = False) -> None:
    """Save a plotnine ggplot as pdf

    Parameters
    ----------
    plot : p9.ggplot
        The plot to save

    filename : str, optional (default: None)
        Filename to write to. If None, a name is generated.

    path : str, optional (default: None)
        Path to save to. If None, saves to "out".

    dpi : int, optional (default: None)
        DPI of saved plot. If None, set to 300.

    verbose : bool, optional (default: False)
        Print progress report.

    Returns
    -------
    None

    """

    if path is None:
        path = "out"

    if filename is None:
        dateTimeObj = datetime.now()
        filename = "{}/cellex_plot_{}.pdf".format(
            path, dateTimeObj.strftime("%y%m%d_%H%M%S"))

    if dpi is None:
        dpi = 300

    os.makedirs(path, exist_ok=True)  # make dir if it doesn't already exist

    p9.save_as_pdf_pages(plots=[plot], filename=filename, dpi=dpi)

    if verbose:
        print("Saved: {}".format(filename))
Example 4
def ologram_merge_stats(inputfiles=None,
                        pdf_width=None,
                        pdf_height=None,
                        output=None,
                        labels=None):
    # -------------------------------------------------------------------------
    # Check user provided labels
    # -------------------------------------------------------------------------

    if labels is not None:

        labels = labels.split(",")

        for elmt in labels:
            if not re.search("^[A-Za-z0-9_]+$", elmt):
                message(
                    "Only alphanumeric characters and '_' allowed for --more-bed-labels",
                    type="ERROR")
        if len(labels) != len(inputfiles):
            message("--labels: the number of labels should be"
                    " the same as the number of input files ", type="ERROR")

        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    # Loop over input files
    # -------------------------------------------------------------------------

    df_list = list()
    df_label = list()

    for pos, infile in enumerate(inputfiles):
        message("Reading file : " + infile.name)
        # Read the dataset into a temporary dataframe
        df_tmp = pd.read_csv(infile, sep='\t', header=0, index_col=None)
        # Change name of 'feature_type' column.
        df_tmp = df_tmp.rename(index=str, columns={"feature_type": "Feature"})
        # Assign the name of the dataset to a new column

        if labels is None:
            file_short_name = os.path.basename(os.path.normpath(os.path.dirname(infile.name)))
            df_label += [file_short_name]
        else:
            file_short_name = labels[pos]
            df_label += [labels[pos]]

        df_tmp = df_tmp.assign(**{"dataset": [file_short_name] * df_tmp.shape[0]})
        # P-values of 0 or -1 are changed to 1e-320 and NaN, respectively
        df_tmp.loc[df_tmp['summed_bp_overlaps_pvalue'] == 0, 'summed_bp_overlaps_pvalue'] = 1e-320
        df_tmp.loc[df_tmp['summed_bp_overlaps_pvalue'] == -1, 'summed_bp_overlaps_pvalue'] = np.nan
        # Compute -log10(pval)
        df_tmp = df_tmp.assign(**{"-log_10(pval)": -np.log10(df_tmp.summed_bp_overlaps_pvalue)})

        # Which p-values are significant?
        # TODO: For now, draws all p-values. Add Benjamini-Hochberg correction, and distinguish between NaN and 0.
        df_tmp = df_tmp.assign(**{"pval_signif": df_tmp.summed_bp_overlaps_pvalue > 0})

        # Add the df to the list to be subsequently merged
        df_list += [df_tmp]



    if len(set(df_label)) < len(df_label):
        message('Enclosing directories are ambiguous and cannot be used as labels. You may use "--labels".',
                type="ERROR")

    # -------------------------------------------------------------------------
    # Concatenate dataframes (row bind)
    # -------------------------------------------------------------------------

    message("Merging dataframes.")
    df_merged = pd.concat(df_list, axis=0)

    # -------------------------------------------------------------------------
    # Plotting
    # -------------------------------------------------------------------------

    message("Plotting")
    my_plot = ggplot(data=df_merged,
                     mapping=aes(y='Feature', x='dataset'))
    my_plot += geom_tile(aes(fill = 'summed_bp_overlaps_log2_fold_change'))
    my_plot += scale_fill_gradient2()
    my_plot += labs(fill = "log2(fold change) for summed bp overlaps")

    # Points for p-val. Must be after geom_tile()
    my_plot += geom_point(data = df_merged.loc[df_merged['pval_signif']],
        mapping = aes(x='dataset',y='Feature',color = '-log_10(pval)'), size=4, shape ='D', inherit_aes = False)
    my_plot += scale_color_gradientn(colors = ["#160E00","#FFB025","#FFE7BD"])
    my_plot += labs(color = "-log10(p-value)")

    # Theming
    my_plot += theme_bw()
    my_plot += theme(panel_grid_major=element_blank(),
                     axis_text_x=element_text(rotation=90),
                     panel_border=element_blank(),
                     axis_ticks=element_blank())

    # -------------------------------------------------------------------------
    # Saving
    # -------------------------------------------------------------------------

    message("Saving")
    nb_ft = len(list(df_merged['Feature'].unique()))
    nb_datasets = len(list(df_merged['dataset'].unique()))

    if pdf_width is None:
        panel_width = 0.6
        pdf_width = panel_width * nb_datasets

        if pdf_width > 100:
            pdf_width = 100
            message("Setting --pdf-width to 100 (limit)")

    if pdf_height is None:
        panel_height = 0.6
        pdf_height = panel_height * nb_ft

        if pdf_height > 500:
            pdf_height = 500
            message("Setting --pdf-height to 500 (limit)")

    message("Page width set to " + str(pdf_width))
    message("Page height set to " + str(pdf_height))
    figsize = (pdf_width, pdf_height)

    # -------------------------------------------------------------------------
    # Turn warnings off. Both pandas and plotnine emit warnings for deprecated
    # functions. They are silenced here, although this is not an entirely
    # satisfying solution...
    # -------------------------------------------------------------------------

    def fxn():
        warnings.warn("deprecated", DeprecationWarning)

    # -------------------------------------------------------------------------
    # Saving
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fxn()

        message("Saving diagram to file : " + output.name)
        message("Be patient. This may be long for large datasets.")

        # NOTE : We must manually specify figure size with save_as_pdf_pages
        save_as_pdf_pages(filename=output.name,
                          plots=[my_plot + theme(figure_size=figsize)],
                          width=pdf_width,
                          height=pdf_height)
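
The sizing trick at the end, attaching figure_size through theme() before handing the plot to save_as_pdf_pages, can be isolated into a short sketch; the data and dimensions below are illustrative only:

import pandas as pd
from plotnine import aes, geom_tile, ggplot, save_as_pdf_pages, theme

# Illustrative data; figure_size is given in inches (width, height).
df = pd.DataFrame({"Feature": ["f1", "f2"], "dataset": ["d1", "d1"], "value": [1.0, -0.5]})
p = ggplot(df, aes(x="dataset", y="Feature", fill="value")) + geom_tile()
save_as_pdf_pages([p + theme(figure_size=(4, 3))], filename="heatmap.pdf")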
Example 5
            }).reset_index().astype({"tag": "category"}))
            plt = ggplot(
                data=tags_summary,
                mapping=aes(
                    x="tag",
                    y="n_tags",
                ),  # "factor(species, ordered=False)",
            )

        plt = (
            plt + geom_bar(
                stat="identity", show_legend=True, position=position_dodge()) +
            xlab("Species") + ylab("Number of annotations") +
            geom_text(mapping=aes(label="n_tags"),
                      position=position_dodge(width=0.9)) + theme_classic() +
            theme(
                axis_text_x=element_text(
                    angle=90, vjust=1, hjust=1, margin={"r": -30}),
                figure_size=(20, 8),
            ) +
            ggtitle("_".join([database["name"], db_type, "tag_species.png"]) +
                    "(n = " + str(tag_df.shape[0]) + ")")
            # + scale_x_discrete(limits=SPECIES_LIST, labels=xlabels)
        )
        plots.append(plt)
        # plt.save(
        #     ), width=10, height=8
        # )
        # print(tags_summary)
save_as_pdf_pages(plots, "tag_summaries3_" + opts["class_type"] + ".pdf")
Example 6
      +p9.geom_abline(p9.aes(intercept=wyn['Intercept'],slope=wyn["btystdave"])))
print(fig1)
figures.append(fig1)

df2=beauty
df2['y_pred']=results.predict()
df2['residuals']=df2['courseevaluation']-df2['y_pred']
fig2_res=(p9.ggplot(p9.aes(x='btystdave',y='residuals'),data=beauty)
      +p9.geom_point())
print(fig2_res)
figures.append(fig2_res)

results = smf.ols("courseevaluation ~ btystdavepos + btystdave", data=dane).fit()
wyn = results.params

fig2=(p9.ggplot(p9.aes(x="btystdave",y="courseevaluation"),data=dane)
      +p9.geom_jitter(width=0.1)
      +p9.geom_abline(p9.aes(intercept=wyn['Intercept'],slope=wyn["btystdave"])))
print(fig2)
figures.append(fig2)

df2=beauty
df2['y_pred']=results.predict()
df2['residuals']=df2['courseevaluation']-df2['y_pred']
fig2_res=(p9.ggplot(p9.aes(x='btystdave',y='residuals'),data=beauty)
      +p9.geom_point())
print(fig2_res)
figures.append(fig2_res)

save_as_pdf_pages(figures, filename="./zad2results/figures.pdf")
Example 7
    plots.append(
        ggplot(max_avg_dist_df, aes(x="Metric", y="Distance", fill="Dataset"))
        + geom_bar(stat='identity', position='dodge')
        + labs(title="Feature Selection {} | Maximum Classifier Score | Average Error Distance".format(fs_method))
        + ylab("Average Error"))

    plots.append(
        ggplot(max_max_dist_df, aes(x="Metric", y="Distance", fill="Dataset"))
        + geom_bar(stat='identity', position='dodge')
        + labs(title="Feature Selection {} | Maximum Classifier Score | Maximum Error Distance".format(fs_method))
        + ylab("Max Error"))

    plots.append(
        ggplot(mean_avg_dist_df, aes(x="Metric", y="Distance", fill="Dataset"))
        + geom_bar(stat='identity', position='dodge')
        + labs(title="Feature Selection {} | Average Classifier Score | Average Error Distance".format(fs_method))
        + ylab("Avg Error"))

    plots.append(
        ggplot(mean_max_dist_df, aes(x="Metric", y="Distance", fill="Dataset"))
        + geom_bar(stat='identity', position='dodge')
        + labs(title="Feature Selection {} | Average Classifier Score | Maximum Error Distance".format(fs_method))
        + ylab("Max Error"))

path = Config.get_work_dir_path(
    os.path.join("paper", "hypothesis", "robustness.pdf"))
save_as_pdf_pages(plots, filename=path)
            + gg.ylab("UMAP Y")
            + gg.theme(
                legend_position="none",
                strip_text=gg.element_text(size=5),
                strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
                axis_text=gg.element_text(size=6),
                axis_title=gg.element_text(size=7),
                title=gg.element_text(size=7),
                figure_size=(5.5, 3)
            )
        )
        plotlist.append(embedding_gg)

output_file = pathlib.Path(output_dir[batch], f"{batch}_UMAPs.pdf")
output_file.parent.mkdir(exist_ok=True)
gg.save_as_pdf_pages(plotlist, output_file)


# In[6]:


batch = "2017_12_05_Batch2"

plotlist = []
for norm_method in norm_methods:
    for color_type in [
        "Metadata_broad_sample", "Metadata_Plate", "Metadata_cell_line", "Metadata_time_point"
    ]:
        output_file = pathlib.Path(output_dir[batch], f"{batch}_{norm_method}_colorby{color_type}.png")
        output_file.parent.mkdir(exist_ok=True)
        
Example 9
fig2 = (p9.ggplot(p9.aes(x='x2', y='y'), data=excersise) +
        p9.geom_jitter(width=0.1) +
        p9.geom_abline(p9.aes(intercept=wyn['Intercept'], slope=wyn['x2'])))
plots.append(fig2)
print(fig2)

df = excersise
df['excersise_predict'] = results.predict()
df['residuals'] = df['y'] - df['excersise_predict']

fig1_res = (p9.ggplot(p9.aes(x='x1', y='residuals'), data=excersise) +
            p9.geom_point())
print(fig1_res)
plots.append(fig1_res)

fig2_res = (p9.ggplot(p9.aes(x='x2', y='residuals'), data=excersise) +
            p9.geom_point())
print(fig2_res)
plots.append(fig2_res)

save_as_pdf_pages(plots, filename="./zad1results/figures.pdf")

exercise_v2 = pd.read_csv("exercise.csv")
results_v2 = smf.ols('y ~ x1 + x2', data=exercise_v2).fit()
X = exercise_v2[40:60].drop('y', axis=1)
Y_pred = results_v2.predict(X)

plt.plot(Y_pred)
plt.savefig('./zad1results/prediction.png')
plt.show()