def predict(self, df):
    """Predict a label and a class probability for the rows of ``df``.

    The rows are split by the value of their ``orientation`` column.  Each
    subset is converted to features with ``CoopTrain`` and scored by the
    model stored under the same orientation key in ``self.model``, using
    only the feature columns listed in ``self.param[key]``.

    Args:
        df: pandas DataFrame with an ``orientation`` column, plus whatever
            columns ``CoopTrain`` needs to build the distance and shape
            features (not visible from here).

    Returns:
        A tuple ``(predlist, probalist)`` ordered by ascending index label
        of ``df``.  ``predlist`` holds the output of ``model.predict`` and
        ``probalist`` the second column (index 1) of
        ``model.predict_proba``.  Rows whose orientation has no entry in
        ``self.model`` are not part of the output.
    """
    # hardcoded for now, too tired
    print(df)
    predres = {}
    predproba = {}
    for key in self.model:
        curdf = df.loc[df["orientation"] == key]
        if curdf.empty:
            # Nothing to score for this orientation; building features and
            # calling the classifier on zero rows would fail.
            continue
        curct = CoopTrain(curdf, corelen=4, flip_th=True,
                          positive_cores=["GGAA", "GGAT"])
        feature_dict = {
            "distance": {"type": "numerical"},
            "shape_in": {"seqin": 4, "smode": "positional",
                         "direction": "inout"},  # maximum seqin is 4
            "shape_out": {"seqin": -4, "smode": "positional",
                          "direction": "inout"},
        }
        train_df = pd.DataFrame(
            curct.get_feature_all(feature_dict))[self.param[key]]
        train = train_df.values.tolist()
        model = self.model[key]
        pred = model.predict(train)
        proba = [prb[1] for prb in model.predict_proba(train)]
        # Key the results by the original index label so the per-orientation
        # outputs can be merged back into one ordering below.
        for idx, label, prob in zip(curdf.index, pred, proba):
            predres[idx] = label
            predproba[idx] = prob
    allidxs = sorted(predres)
    predlist = [predres[i] for i in allidxs]
    probalist = [predproba[i] for i in allidxs]
    return predlist, probalist
os.chdir("../..") import pandas as pd from chip2probe.modeler.cooptrain import CoopTrain from chip2probe.modeler.bestmodel import BestModel import chip2probe.modeler.plotlib as pl from sklearn import ensemble, tree import subprocess if __name__ == "__main__": trainingpath = "output/heterotypic/EtsRunx_v1/ch1_ch2/training_pwm.tsv" df = pd.read_csv(trainingpath, sep="\t") df['label'] = df['label'].replace('independent', 'additive') ct = CoopTrain(df) pd.set_option("display.max_columns", None) rf_param_grid = { 'n_estimators': [500], #[500,750,1000], 'max_depth': [5], #[5,10,15], "min_samples_leaf": [5], #[5,10,15], "min_samples_split": [5] #[5,10,15] } best_models = { "strength": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=ct.get_training_df( {"affinity": {
from chip2probe.modeler.bestmodel import BestModel import chip2probe.modeler.plotlib as pl import pickle from sklearn import ensemble, tree import subprocess if __name__ == "__main__": # basepath = "output/Ets1Runx1" # trainingpath = "output/Ets1Runx1/training/train_ets1_runx1.tsv" basepath = "output/Runx1Ets1" trainingpath = "%s/training/train_runx1_ets1.tsv" % basepath df = pd.read_csv(trainingpath, sep="\t") ct = CoopTrain(df) pd.set_option("display.max_columns",None) rf_param_grid = { 'n_estimators': [500,750,1000], 'max_depth': [5,10,15], "min_samples_leaf": [5,10,15], "min_samples_split" : [5,10,15] } best_models = { "strength": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=ct.get_training_df({ "affinity": {"colnames": ("ets1_score","runx1_score")}
import pandas as pd import os os.chdir("..") import chip2probe.modeler.plotlib as pl from chip2probe.modeler.cooptrain import CoopTrain from chip2probe.modeler.bestmodel import BestModel from chip2probe.modeler.dnashape import DNAShape if __name__ == "__main__": trainingpath = "input/modeler/training_data/training_p01_adjusted.tsv" df = pd.read_csv(trainingpath, sep="\t") # select only genomic (i.e. non-custom) sequences df = df[~df['name'].str.contains("dist|weak")] cooptr = CoopTrain(df, corelen=4, flip_th=True, positive_cores=["GGAA", "GGAT"]) rf_param_grid = { 'n_estimators': [500, 100, 1500], 'max_depth': [5, 10, 15], "min_samples_leaf": [10, 15, 20], "min_samples_split": [10, 15, 20] } # TODO: choose per orientation best_models = { "dist,ori": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=cooptr.get_training_df({
import pickle from sklearn import ensemble, tree import subprocess if __name__ == "__main__": # basepath = "output/Ets1Runx1" # trainingpath = "output/Ets1Runx1/training/train_ets1_runx1.tsv" # basepath = "output/Runx1Ets1" # trainingpath = "%s/training/train_runx1_ets1.tsv" % basepath basepath = "output/Ets1Ets1" trainingpath = "%s/training/train_ets1_ets1.tsv" % basepath df = pd.read_csv(trainingpath, sep="\t") ct = CoopTrain(df) pd.set_option("display.max_columns", None) rf_param_grid = { 'n_estimators': [1000], #[500,750,1000], 'max_depth': [10], #[5,10,15], "min_samples_leaf": [5], #[5,10,15], "min_samples_split": [5] #[5,10,15] } best_models = { "Weaker site strength": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=ct.get_training_df( {"affinity": {
imads12_cores = ["GGAA", "GGAT"]
imads12_models = [
    iMADSModel(path, core, 12, [1, 2, 3])
    for path, core in zip(imads12_paths, imads12_cores)
]
imads12 = iMADS(imads12_models, 0.19)  # 0.2128

selected = pd.read_csv(
    "output/array_design_files/Coop2Ets_validation/custom_probes_selected.csv"
)
selectedlist = selected["sequence"].values.tolist()

# Use a context manager so the file handle is closed after loading.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# model files that come from a trusted source.
with open("input/modeler/coopmodel/dist_ori_12merimads.sav", "rb") as model_file:
    model = pickle.load(model_file)

ct = CoopTrain(selectedlist,
               corelen=4,
               flip_th=True,
               positive_cores=["GGAA", "GGAT"],
               imads=imads12)
feature_dict = {
    "distance": {"type": "numerical"},
    "orientation": {"positive_cores": ["GGAA", "GGAT"], "one_hot": True},
    "affinity": {"imads": imads12},
}
# Feature rows as plain lists, one per sequence in ``selectedlist``
# (presumably in the same order -- confirm against CoopTrain).
train = pd.DataFrame(ct.get_feature_all(feature_dict)).values.tolist()
# basepath = "output/Ets1Runx1" # trainingpath = "%s/training/train_ets1_runx1.tsv" % basepath # s1, s2 = "ets1", "runx1" # rel_ori = False # one_hot_ori = False # smode = "positional" basepath = "output/Ets1Ets1" trainingpath = "%s/training/train_ets1_ets1.tsv" % basepath s1, s2 = "site_str", "site_wk" rel_ori = True one_hot_ori = True smode = "relative" df = pd.read_csv(trainingpath, sep="\t") ct = CoopTrain(df) pd.set_option("display.max_columns", None) rf_param_grid = { 'n_estimators': [500], #[500,750,1000], 'max_depth': [10], #[5,10,15], "min_samples_leaf": [10], #[5,10,15], "min_samples_split": [20], #[5,10,15] } best_models = { "distance,orientation": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=ct.get_training_df( {
def mutate_orientation(seqdf,
                       imads,
                       escore,
                       deep=0,
                       escore_cutoff=0.4,
                       escore_gap=0,
                       idcol="id"):
    """
    Make mutation for orientation

    Flip (reverse complement) the first site, the second site, or both
    sites of every wild type sequence.  We only use HH, HT, TT orientation
    (i.e. no TH): a mutant reported as "HT" is stored as "HT/TH" and a
    mutant reported as "TH" is dropped.

    Args:
        1. seqdf: input data frame with the wt sequences to mutate; the
           code reads its "sequence" and "label" columns and the column
           named by idcol
        2. imads: imads model to predict the strength of the mutants
        3. escore: escore object, passed to DNASequence together with
           imads to find the sites
        4. deep: how far we permit distance to go under imads.sitewidth.
           The minimum distance is set to imads.sitewidth - deep.  When the
           two cores are closer than imads.sitewidth, the flipped region is
           trimmed on both sides by ceil((sitewidth - distance) / 2).
        5. escore_cutoff: passed to DNASequence
        6. escore_gap: passed to DNASequence
        7. idcol: name of the column whose value is stored as "seqid"

    Returns:
        A data frame with changed orientations.  For every wt sequence that
        produced at least one mutant it contains the wt row (comment "wt")
        followed by its mutants (comment "to_<orientation>").
    """
    # we need to get orientation information, this already filter if each sequence has 2 sites
    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence",
                    how="inner")  # this already include the orientation

    mutres = []
    flipsites = [[0], [1], [0, 1]]  # which sites to flip
    nrow = om.shape[0]
    div = 1 if nrow // 100 == 0 else nrow // 100  # report roughly every 1%
    mindist = imads.sitewidth - deep
    for rownum, (_, row) in enumerate(om.iterrows()):
        if rownum % div == 0:
            print("Mutating orientation, progress {:.2f}% ({}/{})".format(
                rownum * 100 / nrow, rownum, nrow))

        sites, _ = DNASequence(row["sequence"], imads, escore, escore_cutoff,
                               escore_gap).get_sites()
        # or len(sites_specific) != 2
        if len(sites) != 2 or sites[1]["core_mid"] - sites[0]["core_mid"] < mindist:
            continue
        curdist = sites[1]["core_mid"] - sites[0]["core_mid"]

        mutres_cur = [{
            "seqid": row[idcol],
            "sequence": str(row["sequence"]),
            "site1_pos": sites[0]["core_mid"],
            "site1_affinity": sites[0]["score"],
            "site2_pos": sites[1]["core_mid"],
            "site2_affinity": sites[1]["score"],
            "distance": curdist,
            "muttype": "orientation",
            "comment": "wt",
            "wtlabel": row["label"],
            "orientation": row["orientation"]
        }]

        # When the cores are closer than one site width, shrink the flipped
        # window from both ends; the amount only depends on the wt distance.
        if curdist >= imads.sitewidth:
            adjust = 0
        else:
            adjust = int(math.ceil(float(imads.sitewidth - curdist) / 2))

        for fs in flipsites:
            newseq = row["sequence"]
            for i in fs:
                start = sites[i]["site_start"] + adjust
                end = sites[i]["site_start"] + sites[i]["site_width"] - adjust
                toflip = bio.revcompstr(row["sequence"][start:end])
                newseq = newseq[:start] + toflip + newseq[end:]
            newsites, _ = DNASequence(newseq, imads, escore, escore_cutoff,
                                      escore_gap).get_sites()
            # or len(newsites_specific) != 2:
            if len(newsites) != 2:  # we ignore if there are new sites
                continue
            newori = cg.get_relative_orientation(newseq, imads, htth=False)
            if newori == "HT":
                newori = "HT/TH"
            elif newori == "TH":
                continue  # skip if TH since we use HT
            mutres_cur.append({
                "seqid": row[idcol],
                "sequence": str(newseq),
                "site1_pos": newsites[0]["core_mid"],
                "site1_affinity": newsites[0]["score"],
                "site2_pos": newsites[1]["core_mid"],
                "site2_affinity": newsites[1]["score"],
                "distance": newsites[1]["core_mid"] - newsites[0]["core_mid"],
                "muttype": "orientation",
                "comment": "to_%s" % newori,
                "wtlabel": row["label"],
                "orientation": newori
            })
        # Keep the wt row only when at least one mutant was produced.
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
if __name__ == "__main__":
    # Build the training table for the homotypic data: keep the labeled
    # sequences, annotate them through CoopTrain and write the merged result.
    basepath = "/Users/vincentiusmartin/Research/chip2gcPBM/chip2probe"

    # using custom imads model
    imads_paths = [
        "%s/input/site_models/imads_models/Ets1_w12_GGAA.model" % basepath,
        "%s/input/site_models/imads_models/Ets1_w12_GGAT.model" % basepath
    ]
    imads_cores = ["GGAA", "GGAT"]
    imads_models = []
    for model_path, model_core in zip(imads_paths, imads_cores):
        imads_models.append(iMADSModel(model_path, model_core, 12, [1, 2, 3]))
    imads = iMADS(imads_models, 0.19)  # 0.2128

    labeled_path = "%s/output/homotypic/training/seqlbled.csv" % basepath
    df = pd.read_csv(labeled_path)
    df = df.drop_duplicates()
    print(len(df))

    # only the two labels of interest are kept
    wanted_label = df["label"].isin(["cooperative", "independent"])
    df = df[wanted_label]
    df = df.rename(columns={"Sequence": "sequence"})
    print(df["label"].value_counts())

    sequences = df["sequence"].tolist()
    ct = CoopTrain(sequences,
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    # attach the labels back to the rows produced by CoopTrain
    train_df = ct.df.merge(df, on="sequence")
    print(train_df["label"].value_counts())
    print(len(train_df))
    train_df.to_csv("training.csv", index=False)
def mutate_affinity(seqdf,
                    imads,
                    escore,
                    deep=0,
                    escore_cutoff=0.4,
                    escore_gap=0,
                    idcol="id"):
    """
    Make mutation to change the affinity (i.e. strength) prediction.

    First, mutations are made for each core to its other core versions,
    e.g. if the core is GGAA and the alternate is GGAT, we simply change
    GGAA -> GGAT. Then mutate the core flanking regions up to
    imads.sitewidth, e.g. if the core length is 4 and sitewidth is 12 then
    we can mutate up to (12-4)/2=4bp to each side. When 'deep' is set to be
    more than 0, set barrier to sitewidth - distance on the other binding
    site.

    Args:
        1. seqdf: input data frame with the wt sequences to mutate; the
           code reads its "sequence" and "label" columns and the column
           named by idcol
        2. imads: imads model to predict the strength of the mutants
        3. escore: escore object, passed to DNASequence together with
           imads to find the sites
        4. deep: the minimum distance between sequence is set to be
           imads.sitewidth - deep. Default is 0, which means we keep
           sitewidth as minimum distance. When deep is > 0, we make barrier
           at the other site so we don't change its affinity prediction.
        5. escore_cutoff: passed to DNASequence
        6. escore_gap: passed to DNASequence
        7. idcol: name of the column whose value is stored as "seqid"

    Returns:
        A data frame of sequences with SNPs that change its affinity.  For
        every wt sequence that produced at least one mutant it contains the
        wt row (comment "wt") followed by its mutants.

    Raises:
        ValueError: if deep is negative.
    """
    if deep < 0:
        raise ValueError("Minimum deep is 0")

    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence",
                    how="inner")  # this already include the orientation

    # first make map for mutating between core, for both strands
    mdlcores_fw = [m.core for m in imads.models]
    mdlcores_rc = [bio.revcompstr(core) for core in mdlcores_fw]
    fwdict = {src: dst for src, dst in itertools.permutations(mdlcores_fw, 2)}
    rcdict = {src: dst for src, dst in itertools.permutations(mdlcores_rc, 2)}
    coremap = {**fwdict, **rcdict}

    # prepare the variable
    mindist = imads.sitewidth - deep
    mutres = []
    nrow = om.shape[0]
    div = 1 if nrow // 100 == 0 else nrow // 100  # report roughly every 1%
    for rownum, (_, row) in enumerate(om.iterrows()):
        if rownum % div == 0:
            print("Mutating affinity, progress {:.2f}% ({}/{})".format(
                rownum * 100 / nrow, rownum, nrow))

        # we use DNASequence object to have overlap with escore
        sites, _ = DNASequence(row["sequence"], imads, escore, escore_cutoff,
                               escore_gap).get_sites()
        # or len(sites_specific) != 2
        if len(sites) != 2 or sites[1]["core_mid"] - sites[0]["core_mid"] < mindist:
            continue

        mutres_cur = [{
            "seqid": row[idcol],
            "sequence": str(row["sequence"]),
            "site1_pos": sites[0]["core_mid"],
            "site1_affinity": sites[0]["score"],
            "site2_pos": sites[1]["core_mid"],
            "site2_affinity": sites[1]["score"],
            "distance": sites[1]["core_mid"] - sites[0]["core_mid"],
            "muttype": "affinity",
            "comment": "wt",
            "wtlabel": row["label"],
            "orientation": row["orientation"]
        }]
        mids = [s["core_mid"] for s in sites]

        # 1. Mutate the core to the other version
        coremt = mutate_cores(row["sequence"], mids, coremap)

        # 2. Mutate the flanks up to the sitewidth
        if row["distance"] < imads.sitewidth:
            barrierlen = imads.sitewidth - row["distance"]
        else:
            barrierlen = 0
        flankmt = mutate_flanks(row["sequence"],
                                mids,
                                imads.corewidth,
                                imads.sitewidth,
                                barrier=barrierlen)

        for mutant in coremt + flankmt:
            newsites, _ = DNASequence(mutant["sequence"], imads, escore,
                                      escore_cutoff, escore_gap).get_sites()
            if len(newsites) != 2:  # or len(newsites_specific) != 2:
                continue
            newori = cg.get_relative_orientation(mutant["sequence"],
                                                 imads,
                                                 htth=True)
            mutres_cur.append({
                "seqid": row[idcol],
                "sequence": mutant["sequence"],
                "site1_pos": newsites[0]["core_mid"],
                "site1_affinity": newsites[0]["score"],
                "site2_pos": newsites[1]["core_mid"],
                "site2_affinity": newsites[1]["score"],
                "distance": newsites[1]["core_mid"] - newsites[0]["core_mid"],
                "muttype": "affinity",
                "comment": mutant["comment"],
                "wtlabel": row["label"],
                "orientation": newori
            })
        # Keep the wt row only when at least one mutant was produced.
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
# Load escore object escore = PBMEscore("input/site_models/escores/Ets1_8mers_11111111.txt") # Load imads object imads12_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"] imads12_cores = ["GGAA", "GGAT"] imads12_models = [iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads12_paths, imads12_cores)] imads12 = iMADS(imads12_models, 0.19) # 0.2128 print("Number of input rows: %d"%df.shape[0]) indf = pd.DataFrame(df[["sequence","label"]])# "id" indf["label"] = indf["label"].replace({"cooperative":1,"additive":0}) mindist = imads12.sitewidth - 3 ct = CoopTrain(indf["sequence"].values.tolist(), corelen=4, flip_th=True, imads=imads12, ignore_sites_err=True) om = ct.df.join(indf.set_index("sequence"), on="sequence", how="inner") # this already include the orientation seqs = [] passval = 0 for index, row in om.iterrows(): sites, sites_specific = DNASequence(row["sequence"], imads12, escore, 0.4, 0).get_sites() if len(sites_specific) != 2: # or sites[1]["core_mid"] - sites[0]["core_mid"] < mindist: #or len(sites_specific) != 2 #print(len(sites), sites[1]["core_mid"] - sites[0]["core_mid"], mindist) continue seqs.append(row["sequence"]) passval += 1 pd.DataFrame({'sequence':seqs}).to_csv("seqs.csv") print("Number of sequences passing the cutoff %d" % passval) """ # 1. Mutate based on affinity
# "shape": # BestModel(clf="sklearn.ensemble.RandomForestClassifier", # param_grid=rf_param_grid, # train_data=ct.get_training_df({ # "distance":{"type":"numerical"}, # "shape_in":{"seqin":3, "poscols":['ets_pos','runx_pos']}, # "shape_out":{"seqin":-2, "poscols":['ets_pos','runx_pos']} # }) # ).run_all() if __name__ == "__main__": trainingpath = "output/heterotypic/EtsRunx_v1/ch1_ch2/training_pwm.tsv" df = pd.read_csv(trainingpath, sep="\t") df['label'] = df['label'].replace('independent', 'additive') ct = CoopTrain(df) pd.set_option("display.max_columns", None) rf_param_grid = { 'n_estimators': [500], #[500,750,1000], 'max_depth': [10], #[5,10,15], "min_samples_leaf": [10], #[5,10,15], "min_samples_split": [20], #[5,10,15] } best_models = { "distance,orientation": BestModel(clf="sklearn.ensemble.RandomForestClassifier", param_grid=rf_param_grid, train_data=ct.get_training_df({ "distance": {
os.chdir("../../..") import chip2probe.modeler.plotlib as pl from chip2probe.modeler.cooptrain import CoopTrain from chip2probe.modeler.bestmodel import BestModel # TODO: fix after we finish refactoring probefilter, for now just append the path sys.path.append("/Users/vincentiusmartin/Research/chip2gcPBM/chip2probe/chip2probe/probe_generator/probefilter") from chip2probe.sitespredict.imads import iMADS from chip2probe.sitespredict.imadsmodel import iMADSModel if __name__ == "__main__": trainingpath = "output/homotypic/training/training.csv" #"input/modeler/training_data/training_p01_adjusted.tsv" df = pd.read_csv(trainingpath) #, sep="\t") # select only genomic (i.e. non-custom) sequences df = df[~df['name'].str.contains("dist|weak")] cooptr = CoopTrain(df, corelen=4) rf_param_grid = { 'n_estimators': [500], #, 1000, 1500], 'max_depth':[5], # 10, 15], "min_samples_leaf" : [10], # 15, 20], "min_samples_split" :[10], # 15 ,20] } # using custom imads model imads8_paths = ["input/site_models/imads_models/Ets1_w8_GGAA.model", "input/site_models/imads_models/Ets1_w8_GGAT.model"] imads8_cores = ["GGAA", "GGAT"] imads8_models = [iMADSModel(path, core, 8, [1, 2, 3]) for path, core in zip(imads8_paths, imads8_cores)] imads8 = iMADS(imads8_models, 0.19) # 0.2128 imads12_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"]
os.chdir("../../..") from chip2probe.modeler.cooptrain import CoopTrain from chip2probe.modeler.bestmodel import BestModel # TODO: fix after we finish refactoring probefilter, for now just append the path from chip2probe.sitespredict.imads import iMADS from chip2probe.sitespredict.imadsmodel import iMADSModel if __name__ == "__main__": trainingpath = "input/modeler/training_data/training_p01_adjusted_ets1.tsv" df = pd.read_csv(trainingpath, sep="\t") # select only genomic (i.e. non-custom) sequences # df = df[~df['name'].str.contains("dist|weak")] ct = CoopTrain(df, corelen=4, flip_th=True, positive_cores=["GGAA", "GGAT"]) # using custom imads model imads_paths = [ "input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model" ] imads_cores = ["GGAA", "GGAT"] imads_models = [ iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads_paths, imads_cores) ] imads = iMADS(imads_models, 0.19) # 0.2128 # get the features from the CoopTrain class
def mutate_dist(seqdf,
                imads,
                escore,
                deep=0,
                escore_cutoff=0.4,
                escore_gap=0,
                warning=True,
                patch=True,
                idcol="id"):
    """
    Make mutation for distance

    Make closer distance between two sites, one position at a time, until
    the minimum distance is reached.  Mutants whose number of sites is no
    longer 2 are discarded, so that WE ARE NOT CREATING NEW SITE.

    Args:
        1. seqdf: input data frame with the wt sequences to mutate; the
           code reads its "sequence" and "label" columns and the column
           named by idcol
        2. imads: imads model to predict the strength of the mutants
        3. escore: escore object, passed to DNASequence together with
           imads to find the sites
        4. deep: how far we permit distance to go under imads.sitewidth.
           The minimum distance is set to imads.sitewidth - deep. When
           deep > 0, the affinity of each site will change after the
           distance is less than sitewidth.
        5. escore_cutoff: passed to DNASequence
        6. escore_gap: passed to DNASequence
        7. warning: print warning when input has sites number != 2
        8. patch: forwarded to cg.move_single_site (presumably whether the
           nucleotides that are cut get appended back to the edge of the
           sequence -- confirm against move_single_site)
        9. idcol: name of the column whose value is stored as "seqid"

    Returns:
        A data frame where each sequence has mutants.  For every wt
        sequence that produced at least one mutant it contains the wt row
        (comment "wt") followed by its mutants (comment "closer_<n>").
    """
    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence",
                    how="inner")  # this already include the orientation

    mutres = []
    mindist = imads.sitewidth - deep
    nrow = om.shape[0]
    div = 1 if nrow // 100 == 0 else nrow // 100  # report roughly every 1%
    for rownum, (_, row) in enumerate(om.iterrows()):
        if rownum % div == 0:
            print("Mutating distance, progress {:.2f}% ({}/{})".format(
                rownum * 100 / nrow, rownum, nrow))

        seq = row["sequence"]
        sites, _ = DNASequence(seq, imads, escore, escore_cutoff,
                               escore_gap).get_sites()
        # The site count must be checked before indexing sites[1]; otherwise
        # a sequence with fewer than two sites raises IndexError instead of
        # being reported and skipped.
        if len(sites) != 2:  # or len(sites_specific) != 2:
            if warning:
                print("Found a sequence with number of sites not equal to 2: ",
                      seq, sites)
            continue
        curdist = sites[1]["core_mid"] - sites[0]["core_mid"]
        if curdist <= mindist:
            # already at (or under) the minimum distance, nothing to move
            continue

        mutres_cur = [{
            "seqid": row[idcol],
            "sequence": str(seq),
            "site1_pos": sites[0]["core_mid"],
            "site1_affinity": sites[0]["score"],
            "site2_pos": sites[1]["core_mid"],
            "site2_affinity": sites[1]["score"],
            "distance": int(curdist),
            "muttype": "distance",
            "comment": "wt",
            "wtlabel": row["label"],
            "orientation": row["orientation"]
        }]

        # Every mutant is built from the wt sequence, so the region between
        # the two sites and the wt distance are fixed for the whole row.
        initdist = int(curdist)
        s1_end = sites[0]["site_start"] + sites[0]["site_width"]
        s2_start = sites[1]["site_start"]
        # same moves as "while initdist - move >= mindist" starting at 1
        max_move = int(initdist - mindist)
        for move in range(1, max_move + 1):
            curseq = cg.move_single_site(seq,
                                         s1_end,
                                         s2_start,
                                         move,
                                         patch=patch,
                                         can_overlap=True)
            cursites, _ = DNASequence(curseq, imads, escore, escore_cutoff,
                                      escore_gap).get_sites()
            # or len(cursites_specific) != 2:
            if len(cursites) != 2:  # we ignore if there are new sites
                continue
            newdist = cursites[1]["core_mid"] - cursites[0]["core_mid"]
            newori = cg.get_relative_orientation(curseq, imads, htth=True)
            mutres_cur.append({
                "seqid": row[idcol],
                "sequence": str(curseq),
                "site1_pos": cursites[0]["core_mid"],
                "site1_affinity": cursites[0]["score"],
                "site2_pos": cursites[1]["core_mid"],
                "site2_affinity": cursites[1]["score"],
                "distance": int(newdist),
                "muttype": "distance",
                "comment": "closer_%d" % move,
                "wtlabel": row["label"],
                "orientation": newori
            })
        # Keep the wt row only when at least one mutant was produced.
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
Created on Oct 30, 2019 @author: Vincentius Martin, Farica Zhuang Make some plots for analysis ''' import pandas as pd import os os.chdir("../../..") import chip2probe.modeler.plotlib as pl from chip2probe.modeler.cooptrain import CoopTrain if __name__ == "__main__": trainingpath = "output/homotypic/training/training_pwm.csv" df = pd.read_csv(trainingpath) # , sep="\t") df['label'] = df['label'].replace({"additive": "independent"}) train = CoopTrain(df, corelen=4) # make distace stacked bar pl.plot_stacked_categories(train.df, "orientation", path="dist_stackedbar.png") # get stacked bar of ratio between different distance pl.plot_stacked_categories(train.df, "distance", path="distance_ets_ets.png", ratio=True)
os.chdir("../..") import chip2probe.modeler.plotlib as pl from chip2probe.modeler.cooptrain import CoopTrain from chip2probe.modeler.bestmodel import BestModel from chip2probe.modeler.dnashape import DNAShape if __name__ == "__main__": trainingpath = "input/modeler/training_data/training_p01_adjusted.tsv" df = pd.read_csv(trainingpath, sep="\t") # select only genomic (i.e. non-custom) sequences df = df[~df['name'].str.contains("dist|weak")] # we can add orientation to our data frame by using the cooptrain cooptr = CoopTrain(df, corelen=4, flip_th=True, positive_cores=["GGAA", "GGAT"]) x_ori = cooptr.get_feature("orientation", {"positive_cores": ["GGAA", "GGAT"]}) df["orientation"] = pd.DataFrame(x_ori)["ori"] score_type = "auc" #auc/pr rf_param_grid = { 'n_estimators': [500, 1000, 1500], 'max_depth': [5, 10, 15], "min_samples_leaf": [5, 10, 15], "min_samples_split": [5, 10, 15] } orientations = ["HH", "TT", "HT/TH"]