Ejemplo n.º 1
0
    # Load the labeled training sequences, then derive `wtdf` from the selected
    # custom probes. `basepath` and `get_wtdf` are defined outside this excerpt.
    # NOTE(review): the columns get_wtdf returns are not visible here -- confirm.
    seqlbled = pd.read_csv("%s/chip2probe/output/homotypic/training/seqlbled.csv" % basepath)
    wtdf = get_wtdf("%s/chip2probe/output/array_design_files/Coop2Ets_validation/custom_probes_selected.csv" % basepath, seqlbled)

    # Read the original array chamber file (key "Coop1Ets"); the result is
    # unpacked into two names. `neg_orig` is not used again in this excerpt.
    origdf, neg_orig = arr.read_chamber_file("%s/probedata/191030_coop-PBM_Ets1_v1_2nd/2.processed_gpr/20191004_258614510001_ETS1_550_5_1-4_alldata.txt"%basepath, seqcols=["Name","type","rep","ori"], negcols=["Name","rep","ori"], key="Coop1Ets")
    # Write the distinct (Sequence, type, ori) rows to seqsorig.csv (path is
    # relative to the current working directory), without the index column.
    origdf[["Sequence","type","ori"]].drop_duplicates().to_csv("seqsorig.csv",index=False)
    # NOTE(review): sys.exit() raises SystemExit here, so every statement below
    # in this excerpt is unreachable unless a caller catches that exception.
    # This looks like a debugging short-circuit -- confirm before relying on
    # the prediction/plot code that follows.
    import sys
    sys.exit()
    # Validation-array chamber files (key "Coop2Ets"); the file names indicate
    # 10nM, 20nM and 30nM Ets1. Only cust10df is used below; cust20df, cust30df
    # and the neg*_cust values are not referenced in this excerpt.
    cust10df, neg10_cust = arr.read_chamber_file("%s/probedata/201128_validation_array_ets1_v2_1/10nMEts1_alexa488_550_20_alldata.txt"%basepath, key="Coop2Ets")
    cust20df, neg20_cust = arr.read_chamber_file("%s/probedata/210102_validation_array_ets1_v2_2/20nMEts1_alexa488_550_10_alldata.txt"%basepath, key="Coop2Ets")
    cust30df, neg30_cust = arr.read_chamber_file("%s/probedata/210102_validation_array_ets1_v2_2/30nMEts1_alexa488_550_10_alldata.txt"%basepath, key="Coop2Ets")

    # Build one iMADSModel per (model file, core) pair and wrap them in iMADS.
    # NOTE(review): 12 presumably matches the "w12" in the model file names and
    # 0.19 is presumably a score threshold (0.2128 looks like an alternative
    # value that was tried) -- neither meaning is visible here; confirm.
    imads_paths = ["%s/chip2probe/input/site_models/imads_models/Ets1_w12_GGAA.model" % basepath,
                    "%s/chip2probe/input/site_models/imads_models/Ets1_w12_GGAT.model" % basepath]
    imads_cores = ["GGAA", "GGAT"]
    imads_models = [iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads_paths, imads_cores)]
    imads = iMADS(imads_models, 0.19) # 0.2128

    # Predict on the output of get_wtmt(origdf, wtdf) (helper defined elsewhere),
    # keyed by "id_numeric" with sequences taken from "Sequence", then convert
    # the predictions into plot data.
    orig_pred = imads.predict_sequences(get_wtmt(origdf, wtdf), key_colname="id_numeric", sequence_colname="Sequence")
    orig_plot = imads.make_plot_data(orig_pred)

    # Same pipeline for the 10nM validation array.
    cust_pred = imads.predict_sequences(get_wtmt(cust10df, wtdf), key_colname="id_numeric", sequence_colname="Sequence")
    cust_plot = imads.make_plot_data(cust_pred)

    # Render each plot-data set to its own PDF in the current working directory.
    sp = SitesPlotter()
    sp.plot_seq_combine([orig_plot], filepath="origplot.pdf")
    sp.plot_seq_combine([cust_plot], filepath="custplot.pdf")

    # Second explicit exit; `sys` is re-imported although it was already
    # imported above in the same scope.
    import sys
    sys.exit()
Ejemplo n.º 2
0
    # Load the tab-separated training table; `trainingpath` is defined outside
    # this excerpt.
    df = pd.read_csv(trainingpath, sep="\t")
    # The filter that would keep only genomic (i.e. non-custom) sequences is
    # commented out, so every row of the file is passed to CoopTrain here.
    # df = df[~df['name'].str.contains("dist|weak")]
    # NOTE(review): the effect of corelen / flip_th / positive_cores is defined
    # by CoopTrain, which is not visible in this excerpt.
    ct = CoopTrain(df,
                   corelen=4,
                   flip_th=True,
                   positive_cores=["GGAA", "GGAT"])

    # using custom imads model
    # The model paths are relative, so they resolve against the current
    # working directory.
    imads_paths = [
        "input/site_models/imads_models/Ets1_w12_GGAA.model",
        "input/site_models/imads_models/Ets1_w12_GGAT.model"
    ]
    imads_cores = ["GGAA", "GGAT"]
    # One iMADSModel per (path, core) pair.
    # NOTE(review): 12 presumably matches the "w12" in the file names; the
    # meaning of [1, 2, 3] is not visible here -- confirm.
    imads_models = [
        iMADSModel(path, core, 12, [1, 2, 3])
        for path, core in zip(imads_paths, imads_cores)
    ]
    # NOTE(review): 0.19 is presumably a threshold and 0.2128 an alternative
    # value that was tried -- confirm against iMADS.
    imads = iMADS(imads_models, 0.19)  # 0.2128

    # get the features from the CoopTrain class
    feature_dict = {
        "distance": {
            "type": "numerical"
        },
        "orientation": {
            "positive_cores": ["GGAA", "GGAT"],
            "one_hot": True
        },
        "affinity": {
            "imads": imads
Ejemplo n.º 3
0
    # Load the training table with read_csv's default (comma) separator; the
    # tab-separated variant is left commented out at the end of the line.
    df = pd.read_csv(trainingpath) #, sep="\t")
    # select only genomic (i.e. non-custom) sequences
    # str.contains treats "dist|weak" as a regex, so rows whose 'name' contains
    # either "dist" or "weak" are dropped.
    df = df[~df['name'].str.contains("dist|weak")]
    cooptr = CoopTrain(df, corelen=4)

    # Random-forest parameter grid, passed as param_grid to BestModel below.
    # Each entry currently holds a single value; the wider candidate lists are
    # kept in the trailing comments.
    rf_param_grid = {
        'n_estimators': [500], #, 1000, 1500],
        'max_depth':[5], # 10, 15],
        "min_samples_leaf" : [10], # 15, 20],
        "min_samples_split" :[10], # 15 ,20]
    }

    # using custom imads model
    # Two iMADS instances are built the same way, differing only in the model
    # files ("w8" vs "w12") and the third iMADSModel argument (8 vs 12).
    # NOTE(review): that argument presumably is the model width -- confirm.
    # Neither imads8 nor imads12 is referenced in the visible remainder of
    # this excerpt.
    imads8_paths = ["input/site_models/imads_models/Ets1_w8_GGAA.model", "input/site_models/imads_models/Ets1_w8_GGAT.model"]
    imads8_cores = ["GGAA", "GGAT"]
    imads8_models = [iMADSModel(path, core, 8, [1, 2, 3]) for path, core in zip(imads8_paths, imads8_cores)]
    imads8 = iMADS(imads8_models, 0.19) # 0.2128

    imads12_paths = ["input/site_models/imads_models/Ets1_w12_GGAA.model", "input/site_models/imads_models/Ets1_w12_GGAT.model"]
    imads12_cores = ["GGAA", "GGAT"]
    imads12_models = [iMADSModel(path, core, 12, [1, 2, 3]) for path, core in zip(imads12_paths, imads12_cores)]
    imads12 = iMADS(imads12_models, 0.19) # 0.2128

    best_models = {
        "distance":
            BestModel(clf="sklearn.ensemble.RandomForestClassifier",
              param_grid=rf_param_grid,
              train_data=cooptr.get_training_df({
                    "distance":{"type":"numerical"}
                })
            ).run_all(),