Esempio n. 1
0
        seqcol, "site_str_pos", "site_str_score", "site_str_ori",
        "site_str_core", "site_wk_pos", "site_wk_score", "site_wk_ori",
        "site_wk_core", "distance", "orientation"
    ]]
    return df.merge(posdf, on=seqcol)


if __name__ == "__main__":
    # Script entry point: build the PWM-based training table for Ets1 and
    # write it to train_pwm.csv in the current working directory.
    basepath = "/Users/vincentiusmartin/Research/chip2gcPBM/chip2probe"
    pd.set_option("display.max_columns", None)

    # using pwm
    pwm_ets = PWM("%s/input/site_models/pwm/ets1.txt" % basepath, log=True)
    # Derive the alignment path from basepath like every other input, so that
    # relocating the project only requires changing basepath.
    kompas_ets = Kompas(
        "%s/input/site_models/kompas/Ets1_kmer_alignment.txt" % basepath,
        core_start=11,
        core_end=15,
        core_center=12)
    df = pd.read_csv("%s/output/homotypic/training/seqlbled.csv" %
                     basepath).drop_duplicates()
    # Keep only the two classes used for training.
    df = df[(df["label"] == "cooperative") | (df["label"] == "independent")]

    train = get_sites_pos(df, kompas_ets, pwm_ets, seqcol="sequence")
    # Drop rows where either site score equals -999 (presumably the sentinel
    # for a site that could not be scored -- TODO confirm against
    # get_sites_pos). The explicit copy makes the column assignment below
    # operate on an independent frame rather than on a slice.
    train = train[(train["site_wk_score"] != -999)
                  & (train["site_str_score"] != -999)].copy()
    print(train["label"].value_counts())
    print(train["site_str_core"].unique())
    # Fold "-/+" into "+/-". Assign the result back instead of calling
    # replace(inplace=True) on the selected column: the chained in-place form
    # does not update `train` when pandas Copy-on-Write is active.
    train["orientation"] = train["orientation"].replace({"-/+": "+/-"})
    train.to_csv("train_pwm.csv", index=False)

    # using custom imads model
        "/Users/faricazjj/Box/homotf/chip2probe/chip2probe/data/imads/runx1/Runx1_10nM_Bound_filtered_normalized_logistic_transformed_20bp_GCGGT_1a2a3mer_format.model",
        "/Users/faricazjj/Box/homotf/chip2probe/chip2probe/data/imads/runx1/Runx1_10nM_Bound_filtered_normalized_logistic_transformed_20bp_GTGGC_1a2a3mer_format.model",
        "/Users/faricazjj/Box/homotf/chip2probe/chip2probe/data/imads/runx1/Runx1_10nM_Bound_filtered_normalized_logistic_transformed_20bp_GTGGG_1a2a3mer_format.model",
        "/Users/faricazjj/Box/homotf/chip2probe/chip2probe/data/imads/runx1/Runx1_10nM_Bound_filtered_normalized_logistic_transformed_20bp_GTGGT_1a2a3mer_format.model"
    ]
}
# Proteins processed by this script and a cutoff value for each of them
# (going by the variable name, the cutoffs are meant for iMADS predictions).
proteins = ['ets1', 'runx1']
imads_cutoff = {'ets1': 0.2128, 'runx1': 0.3061}

# Lookup tables keyed by protein name; both are filled by the loop below.
escores, models = {}, {}

# Build one PBMEscore object and one Kompas model per protein.
for tf in proteins:
    escores[tf] = PBMEscore(escore_long_paths[tf])
    models[tf] = Kompas(
        protein=tf,
        threshold=mutate_cutoff,
        kmer_align_path=kmer_align_paths[tf],
    )

# Result containers keyed by protein name. es_preds / esplots are filled by
# the loop below; model_preds / model_plots are only created here.
es_preds = {}
esplots = {}
model_preds = {}
model_plots = {}
# One color pair per protein, looked up by the protein's position in
# `proteins`; this loop only uses the first color of each pair.
colors = [('crimson', 'plum'), ('steelblue', 'lightblue')]

# Run the E-score predictions on `df` for each protein and turn them into plot
# data. enumerate() yields each protein's position directly, instead of
# searching the list again with proteins.index() on every iteration.
for protein_num, protein in enumerate(proteins):
    es_preds[protein] = escores[protein].predict_sequences(df,
                                                           key_colname="key")
    esplots[protein] = escores[protein].make_plot_data(
        es_preds[protein], color=colors[protein_num][0])
Esempio n. 3
0
        "Name", "Sequence", "label"
    ]]
    train00["Sequence"] = train00["Sequence"].apply(
        lambda x: bio.revcompstr(x))
    train00 = get_sites_pos(train00, kompas, pwm)
    train = pd.concat([train[train["orientation"] != "-/-"], train00])
    return train.drop_duplicates()


if __name__ == "__main__":
    # Script entry point: build the Ets1/Ets1 training table and save it as a
    # tab-separated file.
    pd.set_option("display.max_columns", None)

    # Site models: a log PWM and a Kompas k-mer alignment for Ets1.
    pwm_ets = PWM("input/sitemodels/ets1.txt", log=True)
    kompas_ets = Kompas(
        "input/sitemodels/Ets1_kmer_alignment.txt",
        core_start=11,
        core_end=15,
        core_center=12,
    )

    # Load the labeled sequences, de-duplicate them, and keep only the two
    # classes used for training.
    df = pd.read_csv("output/Ets1Ets1/label_pr/ets_ets_seqlabeled.csv")
    df = df.drop_duplicates()
    df = df[df["label"].isin(["cooperative", "independent"])]
    train = gen_training(df, pwm_ets, kompas_ets)

    # Report the class balance before writing the table out.
    print(train["label"].value_counts())
    train.to_csv(
        "output/Ets1Ets1/training/train_ets1_ets1.tsv",
        index=False,
        sep="\t",
    )

    train.rename(columns={
        'site_str_score': 'Binding strength of the stronger site',
        'site_wk_score': 'Binding strength of the weaker site'
    },
Esempio n. 4
0
    #escore_pred_df = escore.predict_sequences(df, key_colname="key")

    # ========= iMADS =========
    # Load the iMADS models; the cores must be listed in the same order as the model paths.
    # imads_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"]
    # imads_cores = ["GGAA", "GGAT"]
    # imads_models = [iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads_paths, imads_cores)]
    # imads = iMADS(imads_models, 0.19) # 0.2128
    #
    # imads_pred = imads.predict_sequence(single_sequence)
    # imads_pred_list = imads.predict_sequences(many_sequences)
    # imads_pred_df = imads.predict_sequences(df, key_colname="key")

    # ========= Kompas =========
    kompas = Kompas("input/site_models/kompas/Ets1_kmer_alignment.txt",
                    core_start=11,
                    core_end=15,
                    core_center=12)
    kompas_pred = kompas.predict_sequence(single_sequence)
    kompas_pred_list = kompas.predict_sequences(many_sequences)

    # ========= Plot the sequence =========

    # Make the plot objects; make_plot_data accepts a prediction result.
    # pwm_plotr = pwmr.make_plot_data(pwm_pred_listr)
    # pwm_plote = pwme.make_plot_data(pwm_pred_liste, color="green")
    escore_plot = escore.make_plot_data(escore_pred_list, color="#ED2024")
    # imads_plot = imads.make_plot_data(imads_pred_list)
    kompas_plot = kompas.make_plot_data(kompas_pred_list, color="#F7A091")

    # Generate sequence plot
    sp = SitesPlotter()