Ejemplo n.º 1
0
'''
Created on Oct 30, 2019

@author: Vincentius Martin, Farica Zhuang

Make some plots for analysis
'''

import pandas as pd
import os
# Move the working directory three levels up so that the relative paths used
# below (e.g. "output/homotypic/...") are resolved from there.
# NOTE(review): presumably this is the repository root and must run before the
# chip2probe imports -- confirm against the project layout.
os.chdir("../../..")

import chip2probe.modeler.plotlib as pl
from chip2probe.modeler.cooptrain import CoopTrain

# Script entry point: load the homotypic training table, normalize its labels,
# wrap it in a CoopTrain object and write two stacked-bar plots to the current
# working directory.
if __name__ == "__main__":
    trainingpath = "output/homotypic/training/training_pwm.csv"
    df = pd.read_csv(trainingpath)  # comma-separated by default; pass sep="\t" for TSV input
    # Relabel every "additive" entry as "independent"; other labels are untouched.
    df['label'] = df['label'].replace({"additive": "independent"})
    # NOTE(review): corelen=4 is presumably the binding-site core length --
    # confirm against CoopTrain's documentation.
    train = CoopTrain(df, corelen=4)

    # Stacked bar plot grouped by the "orientation" column.
    # NOTE(review): the original comment and the output file name
    # ("dist_stackedbar.png") both say "distance", but the column plotted here
    # is "orientation" -- confirm which one is intended.
    pl.plot_stacked_categories(train.df,
                               "orientation",
                               path="dist_stackedbar.png")

    # Stacked bar plot grouped by the "distance" column, with ratio=True
    # (presumably plots per-distance ratios instead of raw counts -- verify
    # in plotlib).
    pl.plot_stacked_categories(train.df,
                               "distance",
                               path="distance_ets_ets.png",
                               ratio=True)
    #dft["orientation"] = dft.apply(lambda x: "%s%s" % (str(x["ets_ori"]), str(x["runx_ori"])),axis=1)
    dft["orientation"] = dft.apply(
        lambda x: "%s/%s" %
        (orimap[int(x["ets_ori"])], orimap[int(x["runx_ori"])]),
        axis=1)
    #print(dft)
    dft.to_csv("training_pwm.tsv", sep="\t", index=False, float_format='%.3f')
    dft.rename(columns={
        'runx_score': 'Runx1 strength\n(cooperator TF)',
        'ets_score': 'Ets1 strength\n(main TF)'
    },
               inplace=True)
    #plot.plot_stacked_categories(dft, "distance", path="distance_bar.png", title="Distance distribution", ratio=True)
    plot.plot_stacked_categories(
        dft,
        "orientation",
        path="ori_bar.png",
        title="Relative sites orientation distribution",
        ratio=True)
    plot.plot_box_categories(
        dft,
        incols=["Runx1 strength\n(cooperator TF)", "Ets1 strength\n(main TF)"],
        alternative="smaller")

    # dft["runx_score"] = predict_strength(dft, pwm_runx, "runx", flanklen)
    # dft["ets_score"] = predict_strength(dft, pwm_ets, "ets", flanklen)
    # dft = dft.drop(["fullseq"], axis=1)
    # dft = dft[dft['ets_score'].notnull() & dft['runx_score'].notnull()]
    # dft.to_csv("training.tsv",sep="\t",index=False)
    # plot.plot_stacked_categories(dft, "distance")
    # plot.plot_box_categories(dft, incols=["distance","runx_score", "ets_score"], alternative="smaller")
Ejemplo n.º 3
0
    train = gen_training(df, pwm_ets, kompas_ets)

    print(train["label"].value_counts())
    train.to_csv("output/Ets1Ets1/training/train_ets1_ets1.tsv",
                 index=False,
                 sep="\t")

    train.rename(columns={
        'site_str_score': 'Binding strength of the stronger site',
        'site_wk_score': 'Binding strength of the weaker site'
    },
                 inplace=True)
    pl.plot_stacked_categories(
        train,
        "distance",
        path="output/Ets1Ets1/training/distance_bar.png",
        title="Distance distribution",
        ratio=True,
        figsize=(17, 4))
    pl.plot_stacked_categories(
        train,
        "orientation",
        path="output/Ets1Ets1/training/ori_bar.png",
        title="Relative sites orientation\ndistribution",
        ratio=True,
        figsize=(9, 5))
    pl.plot_box_categories(train,
                           path="output/Ets1Ets1/training/boxplot.png",
                           incols=[
                               "Binding strength of the stronger site",
                               "Binding strength of the weaker site"
Ejemplo n.º 4
0
        curdf_coop.shape[0], coopfrac))
 print("Confusion matrix:")
 print(cm)
 minp = curdf.apply(lambda x: x["p_o1"]
                    if x["p_o1"] < x["p_o2"] else x["p_o2"],
                    axis=1)
 corr, _ = pearsonr(minp, curdf["main_prob"])
 print("R(model_probability,p_value): %.2f" % corr, "\n")
 fprtype, tprtype, _ = metrics.roc_curve(np.array(curdf["label"]),
                                         np.array(curdf["main_prob"]))
 auctype = metrics.auc(fprtype, tprtype)
 plt.plot(fprtype, tprtype, label='AUC %s = %.2f' % (mtype, auctype))
 if mtype == "distance":
     g = curdf.groupby(["distance", "label"])["label"].count()
     pl.plot_stacked_categories(curdf,
                                "distance",
                                path="cust_stackedbar_dist.png",
                                ratio=True)
 elif mtype == "affinity":
     g = curdf.groupby("seqid")
     ghead = g.head(1)
     ghead = ghead[ghead["label"] == 0][["seqid", "id"]]
     gtail = g.tail(1)
     gtail = gtail[gtail["label"] == 1][["seqid", "id"]]
     ght = ghead.merge(gtail, on="seqid")
     print(ght)
     # if needed, print
 elif mtype == "orientation":
     for ori in ["HH", "HT/TH", "TT"]:
         oridf = curdf[(curdf["orientation"] == ori)]
         orimatch = oridf[oridf["label"] == oridf["main_pred"]].shape[0]
         print("Orientation %s, match: (%d/%d)" %
    train.to_csv("%s/training/train_%s_%s.tsv" % (basepath, maintf, cooptf),
                 sep="\t",
                 index=False,
                 float_format='%.3f')

    train.rename(columns={
        '%s_score' % maintf:
        '%s strength\n(main TF)' % maintf.capitalize(),
        '%s_score' % cooptf:
        '%s strength\n(cooperator TF)' % cooptf.capitalize()
    },
                 inplace=True)
    plot.plot_stacked_categories(train,
                                 "distance",
                                 path="%s/training/distance_bar.png" %
                                 basepath,
                                 title="Distance distribution",
                                 ratio=True,
                                 figsize=(17, 4),
                                 color=color)
    plot.plot_stacked_categories(
        train,
        "orientation",
        path="%s/training/ori_bar.png" % basepath,
        title="Relative sites orientation\ndistribution",
        ratio=True,
        figsize=(9, 5),
        color=color)
    plot.plot_box_categories(
        train,
        path="%s/training/boxplot.png" % basepath,
        incols=[