Esempio n. 1
0
    def predict(self, df):
        """
        Predict label and positive-class probability for every row of df.

        The rows are split by their "orientation" value. Each subset is turned
        into features through CoopTrain and scored with the model stored under
        the same orientation key in self.model, restricted to the feature
        columns listed in self.param for that key. Rows whose orientation is
        not a key of self.model get no prediction.

        Args:
            df: data frame with an "orientation" column plus the columns
                CoopTrain needs to build the distance and shape features.
        Returns:
            A tuple (predlist, probalist). Both lists are ordered by the sorted
            index labels of the rows that were predicted; probalist holds
            element 1 of each predict_proba row.
        """
        # NOTE: the CoopTrain arguments and the feature set are still hardcoded
        predres = {}
        predproba = {}
        for key in self.model:
            curdf = df.loc[df["orientation"] == key]
            if curdf.empty:
                # No row has this orientation: skip instead of handing an
                # empty frame to CoopTrain and an empty matrix to the model.
                continue

            model = self.model[key]
            curct = CoopTrain(curdf,
                              corelen=4,
                              flip_th=True,
                              positive_cores=["GGAA", "GGAT"])
            feature_dict = {
                "distance": {
                    "type": "numerical"
                },
                "shape_in": {
                    "seqin": 4,
                    "smode": "positional",
                    "direction": "inout"
                },  # maximum seqin is 4
                "shape_out": {
                    "seqin": -4,
                    "smode": "positional",
                    "direction": "inout"
                }
            }
            train_df = pd.DataFrame(
                curct.get_feature_all(feature_dict))[self.param[key]]
            train = train_df.values.tolist()
            pred = model.predict(train)
            proba = [prb[1] for prb in model.predict_proba(train)]
            # key the results by the original index label so the per
            # orientation chunks can be merged back in index order below
            for idx, label, prob in zip(curdf.index, pred, proba):
                predres[idx] = label
                predproba[idx] = prob
        allidxs = sorted(predres)
        predlist = [predres[i] for i in allidxs]
        probalist = [predproba[i] for i in allidxs]
        return predlist, probalist
Esempio n. 2
0
os.chdir("../..")

import pandas as pd

from chip2probe.modeler.cooptrain import CoopTrain
from chip2probe.modeler.bestmodel import BestModel
import chip2probe.modeler.plotlib as pl
from sklearn import ensemble, tree
import subprocess

if __name__ == "__main__":
    trainingpath = "output/heterotypic/EtsRunx_v1/ch1_ch2/training_pwm.tsv"
    df = pd.read_csv(trainingpath, sep="\t")
    df['label'] = df['label'].replace('independent', 'additive')
    ct = CoopTrain(df)
    pd.set_option("display.max_columns", None)

    rf_param_grid = {
        'n_estimators': [500],  #[500,750,1000],
        'max_depth': [5],  #[5,10,15],
        "min_samples_leaf": [5],  #[5,10,15],
        "min_samples_split": [5]  #[5,10,15]
    }

    best_models = {
        "strength":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {"affinity": {
Esempio n. 3
0
from chip2probe.modeler.bestmodel import BestModel
import chip2probe.modeler.plotlib as pl
import pickle
from sklearn import ensemble, tree
import subprocess


if __name__ == "__main__":
    # basepath = "output/Ets1Runx1"
    # trainingpath = "output/Ets1Runx1/training/train_ets1_runx1.tsv"

    basepath = "output/Runx1Ets1"
    trainingpath = "%s/training/train_runx1_ets1.tsv" % basepath

    df = pd.read_csv(trainingpath, sep="\t")
    ct = CoopTrain(df)
    pd.set_option("display.max_columns",None)

    rf_param_grid = {
        'n_estimators': [500,750,1000],
        'max_depth': [5,10,15],
        "min_samples_leaf": [5,10,15],
        "min_samples_split" : [5,10,15]
    }

    best_models = {
        "strength":
            BestModel(clf="sklearn.ensemble.RandomForestClassifier",
              param_grid=rf_param_grid,
              train_data=ct.get_training_df({
                    "affinity": {"colnames": ("ets1_score","runx1_score")}
import pandas as pd
import os
os.chdir("..")

import chip2probe.modeler.plotlib as pl
from chip2probe.modeler.cooptrain import CoopTrain
from chip2probe.modeler.bestmodel import BestModel
from chip2probe.modeler.dnashape import DNAShape

if __name__ == "__main__":
    trainingpath = "input/modeler/training_data/training_p01_adjusted.tsv"
    df = pd.read_csv(trainingpath, sep="\t")
    # select only genomic (i.e. non-custom) sequences
    df = df[~df['name'].str.contains("dist|weak")]
    cooptr = CoopTrain(df,
                       corelen=4,
                       flip_th=True,
                       positive_cores=["GGAA", "GGAT"])

    rf_param_grid = {
        'n_estimators': [500, 100, 1500],
        'max_depth': [5, 10, 15],
        "min_samples_leaf": [10, 15, 20],
        "min_samples_split": [10, 15, 20]
    }

    # TODO: choose per orientation
    best_models = {
        "dist,ori":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=cooptr.get_training_df({
Esempio n. 5
0
import pickle
from sklearn import ensemble, tree
import subprocess

if __name__ == "__main__":
    # basepath = "output/Ets1Runx1"
    # trainingpath = "output/Ets1Runx1/training/train_ets1_runx1.tsv"

    # basepath = "output/Runx1Ets1"
    # trainingpath = "%s/training/train_runx1_ets1.tsv" % basepath

    basepath = "output/Ets1Ets1"
    trainingpath = "%s/training/train_ets1_ets1.tsv" % basepath

    df = pd.read_csv(trainingpath, sep="\t")
    ct = CoopTrain(df)
    pd.set_option("display.max_columns", None)

    rf_param_grid = {
        'n_estimators': [1000],  #[500,750,1000],
        'max_depth': [10],  #[5,10,15],
        "min_samples_leaf": [5],  #[5,10,15],
        "min_samples_split": [5]  #[5,10,15]
    }

    best_models = {
        "Weaker site strength":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {"affinity": {
Esempio n. 6
0
    imads12_cores = ["GGAA", "GGAT"]
    imads12_models = [
        iMADSModel(path, core, 12, [1, 2, 3])
        for path, core in zip(imads12_paths, imads12_cores)
    ]
    imads12 = iMADS(imads12_models, 0.19)  # 0.2128

    selected = pd.read_csv(
        "output/array_design_files/Coop2Ets_validation/custom_probes_selected.csv"
    )
    selectedlist = selected["sequence"].values.tolist()
    model = pickle.load(
        open("input/modeler/coopmodel/dist_ori_12merimads.sav", "rb"))
    ct = CoopTrain(selectedlist,
                   corelen=4,
                   flip_th=True,
                   positive_cores=["GGAA", "GGAT"],
                   imads=imads12)
    feature_dict = {
        "distance": {
            "type": "numerical"
        },
        "orientation": {
            "positive_cores": ["GGAA", "GGAT"],
            "one_hot": True
        },
        "affinity": {
            "imads": imads12
        }
    }
    train = pd.DataFrame(ct.get_feature_all(feature_dict)).values.tolist()
Esempio n. 7
0
    # basepath = "output/Ets1Runx1"
    # trainingpath = "%s/training/train_ets1_runx1.tsv" % basepath
    # s1, s2 = "ets1", "runx1"
    # rel_ori = False
    # one_hot_ori = False
    # smode = "positional"

    basepath = "output/Ets1Ets1"
    trainingpath = "%s/training/train_ets1_ets1.tsv" % basepath
    s1, s2 = "site_str", "site_wk"
    rel_ori = True
    one_hot_ori = True
    smode = "relative"

    df = pd.read_csv(trainingpath, sep="\t")
    ct = CoopTrain(df)
    pd.set_option("display.max_columns", None)

    rf_param_grid = {
        'n_estimators': [500],  #[500,750,1000],
        'max_depth': [10],  #[5,10,15],
        "min_samples_leaf": [10],  #[5,10,15],
        "min_samples_split": [20],  #[5,10,15]
    }

    best_models = {
        "distance,orientation":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df(
                      {
Esempio n. 8
0
def mutate_orientation(seqdf,
                       imads,
                       escore,
                       deep=0,
                       escore_cutoff=0.4,
                       escore_gap=0,
                       idcol="id"):
    """
    Make mutation for orientation

    Flip (reverse complement) the first site, the second site, or both. We
    only use HH, HT, TT orientation (i.e. no TH): a mutant reported as "HT" is
    stored as "HT/TH" and a mutant reported as "TH" is skipped.

    Args:
        1. seqdf: input data frame with the wt sequences to mutate. It needs a
            "sequence" column, a "label" column and the column named by idcol.
        2. imads: imads model to predict the strength of the mutants
        3. escore: escore model, passed to DNASequence for the site detection
        4. deep: how far we permit distance to go under imads.sitewidth. The
            minimum distance is set to imads.sitewidth - deep. When two sites
            are closer than imads.sitewidth, the flipped region of each site
            is trimmed by ceil((sitewidth - distance) / 2) on both ends.
        5. escore_cutoff: passed to DNASequence
        6. escore_gap: passed to DNASequence
        7. idcol: column of seqdf with the sequence id, stored as "seqid"
     Returns:
        A data frame with changed orientations. Each wild type row
        (comment "wt") is followed by its mutants (comment "to_<orientation>");
        a wild type without any accepted mutant is left out.
    """
    # we need the orientation information; this also filters the sequences
    # (ignore_sites_err=True)
    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence",
                    how="inner")  # this already includes the orientation
    mutres = []
    flipsites = [[0], [1], [0, 1]]  # which sites to flip
    nrow = om.shape[0]
    div = max(nrow // 100, 1)  # report progress roughly every 1%
    mindist = imads.sitewidth - deep
    for rownum, (_, row) in enumerate(om.iterrows()):
        if rownum % div == 0:
            print("Mutating orientation, progress {:.2f}% ({}/{})".format(
                rownum * 100 / nrow, rownum, nrow))
        wtseq = row["sequence"]
        sites, _ = DNASequence(wtseq, imads, escore, escore_cutoff,
                               escore_gap).get_sites()
        if len(sites) != 2:
            continue
        curdist = sites[1]["core_mid"] - sites[0]["core_mid"]
        if curdist < mindist:
            continue
        mutres_cur = [{
            "seqid": row[idcol],
            "sequence": str(wtseq),
            "site1_pos": sites[0]["core_mid"],
            "site1_affinity": sites[0]["score"],
            "site2_pos": sites[1]["core_mid"],
            "site2_affinity": sites[1]["score"],
            "distance": curdist,
            "muttype": "orientation",
            "comment": "wt",
            "wtlabel": row["label"],
            "orientation": row["orientation"]
        }]
        # Trim both ends of the flipped window when the sites are closer than
        # one site width. It only depends on the wild type, so compute it once.
        if curdist >= imads.sitewidth:
            adjust = 0
        else:
            adjust = int(math.ceil(float(imads.sitewidth - curdist) / 2))
        for fs in flipsites:
            newseq = wtseq
            for i in fs:
                start = sites[i]["site_start"] + adjust
                end = sites[i]["site_start"] + sites[i]["site_width"] - adjust
                toflip = bio.revcompstr(wtseq[start:end])
                newseq = newseq[:start] + toflip + newseq[end:]
            newsites, _ = DNASequence(newseq, imads, escore, escore_cutoff,
                                      escore_gap).get_sites()
            if len(newsites) != 2:
                # we ignore the mutant if sites were created or lost
                continue
            newori = cg.get_relative_orientation(newseq, imads, htth=False)
            if newori == "HT":
                newori = "HT/TH"
            elif newori == "TH":
                continue  # skip if TH since we use HT
            mutres_cur.append({
                "seqid": row[idcol],
                "sequence": str(newseq),
                "site1_pos": newsites[0]["core_mid"],
                "site1_affinity": newsites[0]["score"],
                "site2_pos": newsites[1]["core_mid"],
                "site2_affinity": newsites[1]["score"],
                "distance": newsites[1]["core_mid"] - newsites[0]["core_mid"],
                "muttype": "orientation",
                "comment": "to_%s" % newori,
                "wtlabel": row["label"],
                "orientation": newori
            })
        # keep the wild type only when it has at least one mutant
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
# Script entry: build the homotypic training table (sequences with their
# cooperative/independent label joined to the CoopTrain data frame) and write
# it to training.csv in the current working directory.
if __name__ == "__main__":
    # NOTE(review): absolute, user-specific path; every input below is
    # resolved relative to it
    basepath = "/Users/vincentiusmartin/Research/chip2gcPBM/chip2probe"
    # using custom imads model
    imads_paths = [
        "%s/input/site_models/imads_models/Ets1_w12_GGAA.model" % basepath,
        "%s/input/site_models/imads_models/Ets1_w12_GGAT.model" % basepath
    ]
    imads_cores = ["GGAA", "GGAT"]
    # one iMADSModel per (model file, core) pair
    imads_models = [
        iMADSModel(path, core, 12, [1, 2, 3])
        for path, core in zip(imads_paths, imads_cores)
    ]
    imads = iMADS(imads_models, 0.19)  # 0.2128

    # labeled sequences, exact duplicate rows removed
    df = pd.read_csv("%s/output/homotypic/training/seqlbled.csv" %
                     basepath).drop_duplicates()
    print(df.shape[0])
    # keep only the cooperative/independent rows and rename the "Sequence"
    # column to the lower case name used below
    df = df[(df["label"] == "cooperative") |
            (df["label"] == "independent")].rename({"Sequence": "sequence"},
                                                   axis=1)
    print(df["label"].value_counts())
    ct = CoopTrain(df["sequence"].tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    # attach the labels (and the other df columns) to the CoopTrain rows
    train_df = ct.df.merge(df, on="sequence")
    print(train_df["label"].value_counts())
    print(train_df.shape[0])
    train_df.to_csv("training.csv", index=False)
Esempio n. 10
0
def mutate_affinity(seqdf,
                    imads,
                    escore,
                    deep=0,
                    escore_cutoff=0.4,
                    escore_gap=0,
                    idcol="id"):
    """
    Make mutation to change the affinity (i.e. strength) prediction.

    First, mutations are made for each core to its other core versions, e.g. if
    the core is GGAA and the alternate is GGAT, we simply change GGAA -> GGAT.
    Then mutate the core flanking regions up to imads.sitewidth, e.g. if the
    core length is 4 and sitewidth is 12 then we can mutate up to (12-4)/2=4bp
    to each side. When the two sites are closer than imads.sitewidth, a
    barrier of sitewidth - distance is passed to mutate_flanks.

    Args:
        1. seqdf: input data frame with the wt sequences to mutate. It needs a
            "sequence" column, a "label" column and the column named by idcol.
        2. imads: imads model to predict the strength of the mutants
        3. escore: escore model, passed to DNASequence for the site detection
        4. deep: the minimum distance between sequence is set to be
            imads.sitewidth - deep. Default is 0, which means we keep sitewidth
            as minimum distance. When deep is > 0, we make barrier at the other
            site so we don't change its affinity prediction.
        5. escore_cutoff: passed to DNASequence
        6. escore_gap: passed to DNASequence
        7. idcol: column of seqdf with the sequence id, stored as "seqid"
     Returns:
        A data frame of sequences with SNPs that change its affinity. Each
        wild type row (comment "wt") is followed by its mutants; a wild type
        without any accepted mutant is left out.
     Raises:
        ValueError: if deep is negative.
    """
    if deep < 0:
        raise ValueError("Minimum deep is 0")

    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence",
                    how="inner")  # this already includes the orientation

    # first make map for mutating between core: every core points to another
    # core of the model, on the forward and on the reverse complement strand
    mdlcores_fw = [m.core for m in imads.models]
    fwdict = dict(itertools.permutations(mdlcores_fw, 2))
    mdlcores_rc = [bio.revcompstr(m) for m in mdlcores_fw]
    rcdict = dict(itertools.permutations(mdlcores_rc, 2))
    coremap = {**fwdict, **rcdict}

    # prepare the variable
    mindist = imads.sitewidth - deep
    mutres = []

    nrow = om.shape[0]
    div = max(nrow // 100, 1)  # report progress roughly every 1%
    for rownum, (_, row) in enumerate(om.iterrows()):
        if rownum % div == 0:
            print("Mutating affinity, progress {:.2f}% ({}/{})".format(
                rownum * 100 / nrow, rownum, nrow))
        wtseq = row["sequence"]
        # we use DNASequence object to have overlap with escore
        sites, _ = DNASequence(wtseq, imads, escore, escore_cutoff,
                               escore_gap).get_sites()
        if len(sites) != 2:
            continue
        if sites[1]["core_mid"] - sites[0]["core_mid"] < mindist:
            continue
        mutres_cur = [{
            "seqid": row[idcol],
            "sequence": str(wtseq),
            "site1_pos": sites[0]["core_mid"],
            "site1_affinity": sites[0]["score"],
            "site2_pos": sites[1]["core_mid"],
            "site2_affinity": sites[1]["score"],
            "distance": sites[1]["core_mid"] - sites[0]["core_mid"],
            "muttype": "affinity",
            "comment": "wt",
            "wtlabel": row["label"],
            "orientation": row["orientation"]
        }]
        mids = [s["core_mid"] for s in sites]

        # 1. Mutate the core to the other version
        coremt = mutate_cores(wtseq, mids, coremap)

        # 2. Mutate the flanks up to the sitewidth; the barrier is only
        # needed when the sites are closer than one site width
        if row["distance"] < imads.sitewidth:
            barrierlen = imads.sitewidth - row["distance"]
        else:
            barrierlen = 0
        flankmt = mutate_flanks(wtseq,
                                mids,
                                imads.corewidth,
                                imads.sitewidth,
                                barrier=barrierlen)

        for mt in coremt + flankmt:
            newsites, _ = DNASequence(mt["sequence"], imads, escore,
                                      escore_cutoff, escore_gap).get_sites()
            if len(newsites) != 2:
                # we ignore the mutant if sites were created or lost
                continue
            newori = cg.get_relative_orientation(mt["sequence"],
                                                 imads,
                                                 htth=True)
            mutres_cur.append({
                "seqid": row[idcol],
                "sequence": mt["sequence"],
                "site1_pos": newsites[0]["core_mid"],
                "site1_affinity": newsites[0]["score"],
                "site2_pos": newsites[1]["core_mid"],
                "site2_affinity": newsites[1]["score"],
                "distance": newsites[1]["core_mid"] - newsites[0]["core_mid"],
                "muttype": "affinity",
                "comment": mt["comment"],
                "wtlabel": row["label"],
                "orientation": newori
            })
        # keep the wild type only when it has at least one mutant
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
Esempio n. 11
0
    # Load escore object
    escore = PBMEscore("input/site_models/escores/Ets1_8mers_11111111.txt")

    # Load imads object
    imads12_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"]
    imads12_cores = ["GGAA", "GGAT"]
    imads12_models = [iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads12_paths, imads12_cores)]
    imads12 = iMADS(imads12_models, 0.19) # 0.2128

    print("Number of input rows: %d"%df.shape[0])
    indf = pd.DataFrame(df[["sequence","label"]])# "id"
    indf["label"] = indf["label"].replace({"cooperative":1,"additive":0})

    mindist = imads12.sitewidth - 3
    ct = CoopTrain(indf["sequence"].values.tolist(), corelen=4, flip_th=True, imads=imads12, ignore_sites_err=True)
    om = ct.df.join(indf.set_index("sequence"), on="sequence", how="inner") # this already include the orientation
    seqs = []
    passval = 0
    for index, row in om.iterrows():
        sites, sites_specific = DNASequence(row["sequence"], imads12, escore, 0.4, 0).get_sites()
        if len(sites_specific) != 2: # or sites[1]["core_mid"] - sites[0]["core_mid"] < mindist: #or len(sites_specific) != 2
            #print(len(sites), sites[1]["core_mid"] - sites[0]["core_mid"], mindist)
           continue
        seqs.append(row["sequence"])
        passval += 1
    pd.DataFrame({'sequence':seqs}).to_csv("seqs.csv")
    print("Number of sequences passing the cutoff %d" % passval)

    """
    # 1. Mutate based on affinity
# "shape":
#     BestModel(clf="sklearn.ensemble.RandomForestClassifier",
#       param_grid=rf_param_grid,
#       train_data=ct.get_training_df({
#             "distance":{"type":"numerical"},
#             "shape_in":{"seqin":3, "poscols":['ets_pos','runx_pos']},
#             "shape_out":{"seqin":-2, "poscols":['ets_pos','runx_pos']}
#         })
#     ).run_all()

if __name__ == "__main__":
    trainingpath = "output/heterotypic/EtsRunx_v1/ch1_ch2/training_pwm.tsv"
    df = pd.read_csv(trainingpath, sep="\t")
    df['label'] = df['label'].replace('independent', 'additive')
    ct = CoopTrain(df)
    pd.set_option("display.max_columns", None)

    rf_param_grid = {
        'n_estimators': [500],  #[500,750,1000],
        'max_depth': [10],  #[5,10,15],
        "min_samples_leaf": [10],  #[5,10,15],
        "min_samples_split": [20],  #[5,10,15]
    }

    best_models = {
        "distance,orientation":
        BestModel(clf="sklearn.ensemble.RandomForestClassifier",
                  param_grid=rf_param_grid,
                  train_data=ct.get_training_df({
                      "distance": {
Esempio n. 13
0
os.chdir("../../..")

import chip2probe.modeler.plotlib as pl
from chip2probe.modeler.cooptrain import CoopTrain
from chip2probe.modeler.bestmodel import BestModel
# TODO: fix after we finish refactoring probefilter, for now just append the path
sys.path.append("/Users/vincentiusmartin/Research/chip2gcPBM/chip2probe/chip2probe/probe_generator/probefilter")
from chip2probe.sitespredict.imads import iMADS
from chip2probe.sitespredict.imadsmodel import iMADSModel

if __name__ == "__main__":
    trainingpath = "output/homotypic/training/training.csv" #"input/modeler/training_data/training_p01_adjusted.tsv"
    df = pd.read_csv(trainingpath) #, sep="\t")
    # select only genomic (i.e. non-custom) sequences
    df = df[~df['name'].str.contains("dist|weak")]
    cooptr = CoopTrain(df, corelen=4)

    rf_param_grid = {
        'n_estimators': [500], #, 1000, 1500],
        'max_depth':[5], # 10, 15],
        "min_samples_leaf" : [10], # 15, 20],
        "min_samples_split" :[10], # 15 ,20]
    }

    # using custom imads model
    imads8_paths = ["input/site_models/imads_models/Ets1_w8_GGAA.model", "input/site_models/imads_models/Ets1_w8_GGAT.model"]
    imads8_cores = ["GGAA", "GGAT"]
    imads8_models = [iMADSModel(path, core, 8, [1, 2, 3]) for path, core in zip(imads8_paths, imads8_cores)]
    imads8 = iMADS(imads8_models, 0.19) # 0.2128

    imads12_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"]
Esempio n. 14
0
os.chdir("../../..")

from chip2probe.modeler.cooptrain import CoopTrain
from chip2probe.modeler.bestmodel import BestModel
# TODO: fix after we finish refactoring probefilter, for now just append the path
from chip2probe.sitespredict.imads import iMADS
from chip2probe.sitespredict.imadsmodel import iMADSModel

if __name__ == "__main__":
    trainingpath = "input/modeler/training_data/training_p01_adjusted_ets1.tsv"
    df = pd.read_csv(trainingpath, sep="\t")
    # select only genomic (i.e. non-custom) sequences
    # df = df[~df['name'].str.contains("dist|weak")]
    ct = CoopTrain(df,
                   corelen=4,
                   flip_th=True,
                   positive_cores=["GGAA", "GGAT"])

    # using custom imads model
    imads_paths = [
        "input/site_models/imads_models/Ets1_w12_GGAA.model",
        "input/site_models/imads_models/Ets1_w12_GGAT.model"
    ]
    imads_cores = ["GGAA", "GGAT"]
    imads_models = [
        iMADSModel(path, core, 12, [1, 2, 3])
        for path, core in zip(imads_paths, imads_cores)
    ]
    imads = iMADS(imads_models, 0.19)  # 0.2128

    # get the features from the CoopTrain class
Esempio n. 15
0
def mutate_dist(seqdf,
                imads,
                escore,
                deep=0,
                escore_cutoff=0.4,
                escore_gap=0,
                warning=True,
                patch=True,
                idcol="id"):
    """
    Make mutation for distance

    Make closer distance between two sites, one position at a time, by
    calling cg.move_single_site on the region between the end of the first
    site and the start of the second site. Mutants whose number of sites is
    no longer 2 are dropped, i.e. we check that we are not creating a new
    site.

    Args:
        1. seqdf: input data frame with the wt sequences to mutate. It needs a
            "sequence" column, a "label" column and the column named by idcol.
        2. imads: imads model to predict the strength of the mutants
        3. escore: escore model, passed to DNASequence for the site detection
        4. deep: how far we permit distance to go under imads.sitewidth. The
            minimum distance is set to imads.sitewidth - deep. When deep > 0,
            the affinity of each site will change after the distance is
            less than sitewidth.
        5. escore_cutoff: passed to DNASequence
        6. escore_gap: passed to DNASequence
        7. warning: print warning when input has sites number != 2
        8. patch: forwarded to cg.move_single_site (per the original notes,
            the nucleotides that are cut are used to patch the sequence;
            confirm the details in move_single_site)
        9. idcol: column of seqdf with the sequence id, stored as "seqid"
    Returns:
        A data frame where each sequence has mutants. Each wild type row
        (comment "wt") is followed by its mutants (comment "closer_<n>"); a
        wild type without any accepted mutant is left out.
    """
    ct = CoopTrain(seqdf["sequence"].values.tolist(),
                   corelen=4,
                   flip_th=True,
                   imads=imads,
                   ignore_sites_err=True)
    om = ct.df.join(seqdf.set_index("sequence"), on="sequence",
                    how="inner")  # this already includes the orientation

    mutres = []
    mindist = imads.sitewidth - deep
    nrow = om.shape[0]
    div = max(nrow // 100, 1)  # report progress roughly every 1%
    for rownum, (_, row) in enumerate(om.iterrows()):
        if rownum % div == 0:
            print("Mutating distance, progress {:.2f}% ({}/{})".format(
                rownum * 100 / nrow, rownum, nrow))
        seq = row["sequence"]
        sites, _ = DNASequence(seq, imads, escore, escore_cutoff,
                               escore_gap).get_sites()
        # Check the number of sites BEFORE indexing sites[1]; otherwise a
        # sequence with fewer than two sites raises IndexError instead of
        # being reported and skipped.
        if len(sites) != 2:
            if warning:
                print("Found a sequence with number of sites not equal to 2: ",
                      seq, sites)
            continue
        curdist = sites[1]["core_mid"] - sites[0]["core_mid"]
        if curdist <= mindist:
            # already at the minimum distance, nothing to move closer
            continue
        mutres_cur = [{
            "seqid": row[idcol],
            "sequence": str(seq),
            "site1_pos": sites[0]["core_mid"],
            "site1_affinity": sites[0]["score"],
            "site2_pos": sites[1]["core_mid"],
            "site2_affinity": sites[1]["score"],
            "distance": int(curdist),
            "muttype": "distance",
            "comment": "wt",
            "wtlabel": row["label"],
            "orientation": row["orientation"]
        }]
        # every mutant is derived from the wild type, so these stay fixed
        initdist = int(curdist)
        s1_end = sites[0]["site_start"] + sites[0]["site_width"]
        s2_start = sites[1]["site_start"]
        shift = 0
        while initdist - (shift + 1) >= mindist:
            shift += 1
            curseq = cg.move_single_site(seq,
                                         s1_end,
                                         s2_start,
                                         shift,
                                         patch=patch,
                                         can_overlap=True)
            cursites, _ = DNASequence(curseq, imads, escore, escore_cutoff,
                                      escore_gap).get_sites()
            if len(cursites) != 2:
                # we ignore the mutant if sites were created or lost
                continue
            newdist = cursites[1]["core_mid"] - cursites[0]["core_mid"]
            newori = cg.get_relative_orientation(curseq, imads, htth=True)
            mutres_cur.append({
                "seqid": row[idcol],
                "sequence": str(curseq),
                "site1_pos": cursites[0]["core_mid"],
                "site1_affinity": cursites[0]["score"],
                "site2_pos": cursites[1]["core_mid"],
                "site2_affinity": cursites[1]["score"],
                "distance": int(newdist),
                "muttype": "distance",
                "comment": "closer_%d" % shift,
                "wtlabel": row["label"],
                "orientation": newori
            })
        # keep the wild type only when it has at least one mutant
        if len(mutres_cur) > 1:
            mutres.extend(mutres_cur)
    return pd.DataFrame(mutres)
Esempio n. 16
0
Created on Oct 30, 2019

@author: Vincentius Martin, Farica Zhuang

Make some plots for analysis
'''

import pandas as pd
import os
os.chdir("../../..")

import chip2probe.modeler.plotlib as pl
from chip2probe.modeler.cooptrain import CoopTrain

# Script entry: read the homotypic training table and save two stacked bar
# plots of the CoopTrain data frame, one by orientation and one by distance.
if __name__ == "__main__":
    trainingpath = "output/homotypic/training/training_pwm.csv"
    df = pd.read_csv(trainingpath)  # , sep="\t")
    # use a single name for the negative class: "additive" -> "independent"
    df['label'] = df['label'].replace({"additive": "independent"})
    train = CoopTrain(df, corelen=4)

    # make orientation stacked bar
    # NOTE(review): the category plotted is "orientation" although the output
    # file is named dist_stackedbar.png; confirm which one is intended
    pl.plot_stacked_categories(train.df,
                               "orientation",
                               path="dist_stackedbar.png")

    # get stacked bar of the ratio between the different distances
    pl.plot_stacked_categories(train.df,
                               "distance",
                               path="distance_ets_ets.png",
                               ratio=True)
Esempio n. 17
0
os.chdir("../..")

import chip2probe.modeler.plotlib as pl
from chip2probe.modeler.cooptrain import CoopTrain
from chip2probe.modeler.bestmodel import BestModel
from chip2probe.modeler.dnashape import DNAShape

if __name__ == "__main__":
    trainingpath = "input/modeler/training_data/training_p01_adjusted.tsv"
    df = pd.read_csv(trainingpath, sep="\t")
    # select only genomic (i.e. non-custom) sequences
    df = df[~df['name'].str.contains("dist|weak")]

    # we can add orientation to our data frame by using the cooptrain
    cooptr = CoopTrain(df,
                       corelen=4,
                       flip_th=True,
                       positive_cores=["GGAA", "GGAT"])
    x_ori = cooptr.get_feature("orientation",
                               {"positive_cores": ["GGAA", "GGAT"]})
    df["orientation"] = pd.DataFrame(x_ori)["ori"]

    score_type = "auc"  #auc/pr

    rf_param_grid = {
        'n_estimators': [500, 1000, 1500],
        'max_depth': [5, 10, 15],
        "min_samples_leaf": [5, 10, 15],
        "min_samples_split": [5, 10, 15]
    }

    orientations = ["HH", "TT", "HT/TH"]