seqcol, "site_str_pos", "site_str_score", "site_str_ori", "site_str_core", "site_wk_pos", "site_wk_score", "site_wk_ori", "site_wk_core", "distance", "orientation" ]] return df.merge(posdf, on=seqcol) if __name__ == "__main__": basepath = "/Users/vincentiusmartin/Research/chip2gcPBM/chip2probe" pd.set_option("display.max_columns", None) # using pwm pwm_ets = PWM("%s/input/site_models/pwm/ets1.txt" % basepath, log=True) kompas_ets = Kompas( "/Users/vincentiusmartin/Research/chip2gcPBM/chip2probe/input/site_models/kompas/Ets1_kmer_alignment.txt", core_start=11, core_end=15, core_center=12) df = pd.read_csv("%s/output/homotypic/training/seqlbled.csv" % basepath).drop_duplicates() df = df[(df["label"] == "cooperative") | (df["label"] == "independent")] train = get_sites_pos(df, kompas_ets, pwm_ets, seqcol="sequence") train = train[(train["site_wk_score"] != -999) & (train["site_str_score"] != -999)] print(train["label"].value_counts()) print(train["site_str_core"].unique()) train["orientation"].replace({"-/+": "+/-"}, inplace=True) train.to_csv("train_pwm.csv", index=False) # using custom imads model
"/Users/faricazjj/Box/homotf/chip2probe/chip2probe/data/imads/runx1/Runx1_10nM_Bound_filtered_normalized_logistic_transformed_20bp_GCGGT_1a2a3mer_format.model", "/Users/faricazjj/Box/homotf/chip2probe/chip2probe/data/imads/runx1/Runx1_10nM_Bound_filtered_normalized_logistic_transformed_20bp_GTGGC_1a2a3mer_format.model", "/Users/faricazjj/Box/homotf/chip2probe/chip2probe/data/imads/runx1/Runx1_10nM_Bound_filtered_normalized_logistic_transformed_20bp_GTGGG_1a2a3mer_format.model", "/Users/faricazjj/Box/homotf/chip2probe/chip2probe/data/imads/runx1/Runx1_10nM_Bound_filtered_normalized_logistic_transformed_20bp_GTGGT_1a2a3mer_format.model" ] } imads_cutoff = {'ets1': 0.2128, 'runx1': 0.3061} escores = {} models = {} proteins = ['ets1', 'runx1'] for tf in proteins: escores[tf] = PBMEscore(escore_long_paths[tf]) models[tf] = Kompas(protein=tf, threshold=mutate_cutoff, kmer_align_path=kmer_align_paths[tf]) es_preds = {} esplots = {} model_preds = {} model_plots = {} colors = [('crimson', 'plum'), ('steelblue', 'lightblue')] # initialize escore and model objects for each protein for protein in proteins: protein_num = proteins.index(protein) es_preds[protein] = escores[protein].predict_sequences(df, key_colname="key") esplots[protein] = escores[protein].make_plot_data( es_preds[protein], color=colors[protein_num][0])
"Name", "Sequence", "label" ]] train00["Sequence"] = train00["Sequence"].apply( lambda x: bio.revcompstr(x)) train00 = get_sites_pos(train00, kompas, pwm) train = pd.concat([train[train["orientation"] != "-/-"], train00]) return train.drop_duplicates() if __name__ == "__main__": pd.set_option("display.max_columns", None) # using pwm pwm_ets = PWM("input/sitemodels/ets1.txt", log=True) kompas_ets = Kompas("input/sitemodels/Ets1_kmer_alignment.txt", core_start=11, core_end=15, core_center=12) df = pd.read_csv( "output/Ets1Ets1/label_pr/ets_ets_seqlabeled.csv").drop_duplicates() df = df[(df["label"] == "cooperative") | (df["label"] == "independent")] train = gen_training(df, pwm_ets, kompas_ets) print(train["label"].value_counts()) train.to_csv("output/Ets1Ets1/training/train_ets1_ets1.tsv", index=False, sep="\t") train.rename(columns={ 'site_str_score': 'Binding strength of the stronger site', 'site_wk_score': 'Binding strength of the weaker site' },
#escore_pred_df = escore.predict_sequences(df, key_colname="key")

# ========= iMADS =========
# Load the iMADS models; the cores must be listed in the same order as the
# model paths because the two lists are zipped together.
# imads_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"]
# imads_cores = ["GGAA", "GGAT"]
# imads_models = [iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads_paths, imads_cores)]
# imads = iMADS(imads_models, 0.19) # 0.2128
#
# imads_pred = imads.predict_sequence(single_sequence)
# imads_pred_list = imads.predict_sequences(many_sequences)
# imads_pred_df = imads.predict_sequences(df, key_colname="key")

# ========= Kompas =========
# Build the Kompas model from the Ets1 k-mer alignment, then predict on a
# single sequence and on the list of sequences.
kompas = Kompas(
    "input/site_models/kompas/Ets1_kmer_alignment.txt",
    core_start=11,
    core_end=15,
    core_center=12,
)
kompas_pred = kompas.predict_sequence(single_sequence)
kompas_pred_list = kompas.predict_sequences(many_sequences)

# ========= Plot the sequence =========
# Build the plot objects; make_plot_data takes a prediction result.
# pwm_plotr = pwmr.make_plot_data(pwm_pred_listr)
# pwm_plote = pwme.make_plot_data(pwm_pred_liste, color="green")
escore_plot = escore.make_plot_data(escore_pred_list, color="#ED2024")
# imads_plot = imads.make_plot_data(imads_pred_list)
kompas_plot = kompas.make_plot_data(kompas_pred_list, color="#F7A091")

# Generate sequence plot
sp = SitesPlotter()