Ejemplo n.º 1
0
def main(infilen="train/retmetfeatures_new.csv"):
    """
    Make predictions for the evaluation of CALLC.

    For every dataset ("system") in the input file and every training-set
    size in the global ``n_all``, a Layer 1 -> Layer 2 -> Layer 3 model
    stack is trained and its train/test predictions (plus the Layer 3
    elastic-net coefficients) are written to ``test_preds/``.

    Parameters
    ----------
    infilen : str
        location of train data; a CSV with feature columns plus
        "time", "IDENTIFIER" and "system"

    Returns
    -------
    None
    """
    global adds
    global n_all

    infile = pd.read_csv(infilen)
    infile.fillna(0.0, inplace=True)
    try:
        # Reuse a previously cached feature selection when available.
        with open("features/selected_features.txt") as feat_file:
            keep_f = [x.strip() for x in feat_file.readlines()]
        infile = infile[keep_f]
    except IOError:
        # First run: select features and cache the selection for reuse.
        # (The write handle is now closed deterministically; previously it
        # was left open and relied on interpreter exit to flush.)
        infile, keep_f = sel_features(infile)
        with open("features/selected_features.txt", "w") as feat_file:
            feat_file.write("\n".join(list(keep_f)))

    keep_f_features_only = [
        f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
    ]

    # Standardize only the molecular features; meta columns are untouched.
    scaler = StandardScaler()
    infile[keep_f_features_only] = scaler.fit_transform(
        infile[keep_f_features_only])

    sets = get_sets(infile)

    for k in sets.keys():
        for ind in range(len(n_all)):
            selected_set = sets[k]
            select_index = list(range(len(selected_set.index)))

            n = n_all[ind]

            # Not enough molecules in this set for the requested train size.
            if n > len(selected_set.index):
                continue

            # Random split: first n shuffled positions train, the rest test.
            shuffle(select_index)
            train = selected_set.iloc[select_index[0:n], ]
            test = selected_set.iloc[select_index[n:], ]

            # Require at least 10 test molecules for a meaningful evaluation.
            if len(select_index[n:]) < 10:
                continue

            cv = KFold(n_splits=10, shuffle=True)
            cv_spl = cv.split(list(train.index))

            cv_list = cv_to_fold(cv_spl, len(train.index))

            print("Training L1 %s,%s,%s" % (k, n, adds[ind]))

            # Set aside any existing models of this set so they are not
            # reused while evaluating it.
            move_models(k)
            preds_own = train_l1_func(train,
                                      names=[k, k, k, k, k, k, k],
                                      adds=[n, n, n, n, n, n, n, n],
                                      cv=cv)

            print("Applying L1 %s,%s,%s" % (k, n, adds[ind]))

            preds_l1_train, skipped_train = apply_models(
                train.drop(["time", "IDENTIFIER", "system"], axis=1),
                known_rt=train["time"],
                row_identifiers=train["IDENTIFIER"],
                skip_cont=[k])
            preds_l1_test, skipped_test = apply_models(
                test.drop(["time", "IDENTIFIER", "system"], axis=1),
                known_rt=test["time"],
                row_identifiers=test["IDENTIFIER"])

            preds_l1_train = pd.concat(
                [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

            print("Applying L2 %s,%s,%s" % (k, n, adds[ind]))

            preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                                     preds_l1_test,
                                                     cv_list=cv_list,
                                                     name=k)

            # Re-attach the raw features so Layer 3 can use them as input.
            preds_l2_train = pd.concat([
                preds_l2_train.reset_index(drop=True),
                train.drop(["IDENTIFIER", "system", "time"],
                           axis=1).reset_index(drop=True)
            ],
                                       axis=1)
            preds_l2_test = pd.concat([
                preds_l2_test.reset_index(drop=True),
                test.drop(["IDENTIFIER", "system", "time"],
                          axis=1).reset_index(drop=True)
            ],
                                      axis=1)

            print("Applying L3 %s,%s,%s" % (k, n, adds[ind]))

            preds_l3_train, preds_l3_test, coefs_list = train_l3(
                preds_l2_train, preds_l2_test, cv=cv)

            # Common filename suffix: "<n><add-tag>".
            suffix = "%s%s" % (n, adds[ind])

            # Elastic-net coefficients of the Layer 3 models.
            with open(
                    "test_preds/%s_preds_l3_%s_elasticnet_coefs.csv" %
                    (k, suffix), "w") as coef_file:
                for model, coefs in coefs_list:
                    coef_file.write("%s_%s\t%s\t%s\n" %
                                    (k, suffix, model, str(coefs)))

            # Per-layer test predictions (paths passed straight to pandas;
            # no manual file handles to close).
            preds_l1_test.to_csv(
                "test_preds/%s_preds_l1_%s.csv" % (k, suffix), index=False)
            preds_l2_test.to_csv(
                "test_preds/%s_preds_l2_%s.csv" % (k, suffix), index=False)
            preds_l3_test.to_csv(
                "test_preds/%s_preds_l3_%s.csv" % (k, suffix), index=False)

            all_test = pd.concat([
                preds_l1_test.reset_index(drop=True),
                preds_l2_test.reset_index(drop=True),
                preds_l3_test.reset_index(drop=True)
            ],
                                 axis=1)
            # Drop columns duplicated between layers before writing.
            all_test = all_test.T.drop_duplicates().T
            all_test.to_csv(
                "test_preds/%s_preds_ALL_%s.csv" % (k, suffix), index=False)

            # Per-layer train predictions.
            preds_l1_train.to_csv(
                "test_preds/%s_preds_train_l1_%s.csv" % (k, suffix),
                index=False)
            preds_l2_train.to_csv(
                "test_preds/%s_preds_train_l2_%s.csv" % (k, suffix),
                index=False)
            preds_l3_train.to_csv(
                "test_preds/%s_preds_train_l3_%s.csv" % (k, suffix),
                index=False)

            all_train = pd.concat([
                preds_l1_train.reset_index(drop=True),
                preds_l2_train.reset_index(drop=True),
                preds_l3_train.reset_index(drop=True)
            ],
                                  axis=1)
            all_train = all_train.T.drop_duplicates().T
            all_train.to_csv(
                "test_preds/%s_ALL_train_l3_%s.csv" % (k, suffix),
                index=False)

            # Clean up the models trained for this (set, n) combination and
            # restore the ones moved aside earlier.
            remove_models(k, n)
            move_models_back(k)
def main(infilen="train/retmetfeatures_new.csv"):
    """
    Make predictions for the evaluation of CALLC.

    Debug variant of the evaluation driver: it is hard-wired (see the
    filter at the top of the dataset loop) to evaluate only the
    "kohlbacher" dataset.

    Parameters
    ----------
    infilen : str
        location of train data

    Returns
    -------
    None
    """
    global adds
    global n_all

    # NOTE(review): unlike the other variant of this driver, the input is
    # not fillna'd here, so missing values reach feature selection as NaN
    # -- confirm this is intended.
    infile = pd.read_csv(infilen)
    try:
        # Reuse a previously cached feature selection when available.
        keep_f = [
            x.strip()
            for x in open("features/selected_features.txt").readlines()
        ]
        infile = infile[keep_f]
    except IOError:
        # First run: select features and cache them.
        # NOTE(review): this write handle is never closed; the cache file
        # is only flushed at interpreter exit.
        infile, keep_f = sel_features(infile)
        outfile = open("features/selected_features.txt", "w")
        outfile.write("\n".join(list(keep_f)))

    keep_f_features_only = [
        f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
    ]

    # Standardize only the feature columns; meta columns stay untouched.
    scaler = StandardScaler()
    infile[keep_f_features_only] = scaler.fit_transform(
        infile[keep_f_features_only])

    sets = get_sets(infile)

    for k in sets.keys():
        # Debug filter: evaluate only the "kohlbacher" dataset.
        if k != "kohlbacher": continue
        for ind in range(len(n_all)):
            selected_set = sets[k]
            select_index = list(range(len(selected_set.index)))

            # Training-set size for this experiment.
            n = n_all[ind]

            # Not enough molecules in this set for the requested train size.
            if n > len(selected_set.index): continue

            # Random split: first n shuffled positions train, the rest test.
            shuffle(select_index)
            train = selected_set.iloc[select_index[0:n], ]
            test = selected_set.iloc[select_index[n:], ]

            # Require at least 10 test molecules.
            if len(select_index[n:]) < 10: continue

            cv = KFold(n_splits=10, shuffle=True)
            cv_spl = cv.split(list(train.index))

            cv_list = cv_to_fold(cv_spl, len(train.index))

            print("Training L1 %s,%s,%s" % (k, n, adds[ind]))

            # Set aside existing models of this set so they are not reused.
            move_models(k)
            preds_own = train_l1_func(train,
                                      names=[k, k, k, k, k, k, k],
                                      adds=[n, n, n, n, n, n, n, n],
                                      cv=cv)

            print("Applying L1 %s,%s,%s" % (k, n, adds[ind]))

            preds_l1_train, skipped_train = apply_models(
                train.drop(["time", "IDENTIFIER", "system"], axis=1),
                known_rt=train["time"],
                row_identifiers=train["IDENTIFIER"],
                skip_cont=[k])
            preds_l1_test, skipped_test = apply_models(
                test.drop(["time", "IDENTIFIER", "system"], axis=1),
                known_rt=test["time"],
                row_identifiers=test["IDENTIFIER"])

            preds_l1_train = pd.concat(
                [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

            print("Applying L2 %s,%s,%s" % (k, n, adds[ind]))

            preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                                     preds_l1_test,
                                                     cv_list=cv_list,
                                                     name=k)

            # Re-attach the raw features so Layer 3 can use them as input.
            preds_l2_train = pd.concat([
                preds_l2_train.reset_index(drop=True),
                train.drop(["IDENTIFIER", "system", "time"],
                           axis=1).reset_index(drop=True)
            ],
                                       axis=1)
            preds_l2_test = pd.concat([
                preds_l2_test.reset_index(drop=True),
                test.drop(["IDENTIFIER", "system", "time"],
                          axis=1).reset_index(drop=True)
            ],
                                      axis=1)

            print("Applying L3 %s,%s,%s" % (k, n, adds[ind]))

            preds_l3_train, preds_l3_test, coefs_list = train_l3(
                preds_l2_train, preds_l2_test, cv=cv)

            # Output files for the per-layer test/train predictions.
            outfilel1 = open(
                "test_preds/%s_preds_l1_%s%s.csv" % (k, n, adds[ind]), "w")
            outfilel2 = open(
                "test_preds/%s_preds_l2_%s%s.csv" % (k, n, adds[ind]), "w")
            outfilel3 = open(
                "test_preds/%s_preds_l3_%s%s.csv" % (k, n, adds[ind]), "w")
            outfilel = open(
                "test_preds/%s_preds_ALL_%s%s.csv" % (k, n, adds[ind]), "w")

            outfilel1train = open(
                "test_preds/%s_preds_train_l1_%s%s.csv" % (k, n, adds[ind]),
                "w")
            outfilel2train = open(
                "test_preds/%s_preds_train_l2_%s%s.csv" % (k, n, adds[ind]),
                "w")
            outfilel3train = open(
                "test_preds/%s_preds_train_l3_%s%s.csv" % (k, n, adds[ind]),
                "w")
            outfileltrain = open(
                "test_preds/%s_ALL_train_l3_%s%s.csv" % (k, n, adds[ind]), "w")

            # Elastic-net coefficients of the Layer 3 models.
            outfilel3_coefs = open(
                "test_preds/%s_preds_l3_%s%s_elasticnet_coefs.csv" %
                (k, n, adds[ind]), "w")

            for line in coefs_list:
                model, coefs = line
                outfilel3_coefs.write("%s_%s%s\t%s\t%s\n" %
                                      (k, n, adds[ind], model, str(coefs)))
            outfilel3_coefs.close()

            preds_l1_test.to_csv(outfilel1, index=False)
            preds_l2_test.to_csv(outfilel2, index=False)
            preds_l3_test.to_csv(outfilel3, index=False)

            all_test = pd.concat([
                preds_l1_test.reset_index(drop=True),
                preds_l2_test.reset_index(drop=True),
                preds_l3_test.reset_index(drop=True)
            ],
                                 axis=1)
            # Drop columns duplicated between layers before writing.
            all_test = all_test.T.drop_duplicates().T
            all_test.to_csv(outfilel, index=False)

            preds_l1_train.to_csv(outfilel1train, index=False)
            preds_l2_train.to_csv(outfilel2train, index=False)
            preds_l3_train.to_csv(outfilel3train, index=False)

            all_train = pd.concat([
                preds_l1_train.reset_index(drop=True),
                preds_l2_train.reset_index(drop=True),
                preds_l3_train.reset_index(drop=True)
            ],
                                  axis=1)
            all_train = all_train.T.drop_duplicates().T
            all_train.to_csv(outfileltrain, index=False)

            outfilel1.close()
            outfilel2.close()
            outfilel3.close()
            outfilel.close()

            outfilel1train.close()
            outfilel2train.close()
            outfilel3train.close()
            outfileltrain.close()

            # Clean up the models trained for this (set, n) combination and
            # restore the ones moved aside earlier.
            remove_models(k, n)
            move_models_back(k)
Ejemplo n.º 3
0
def make_preds(reference_infile="train_set_lpp2.csv",
               pred_infile="lmfeatures.csv",
               k="MASSTRPLAN",
               outfile="",
               outfile_modname="",
               num_jobs=4,
               GUI_obj=None,
               ch_size=100000):
    """
    Make predictions for the evaluation of CALLC.

    Trains the L1/L2/L3 stack on ``reference_infile`` and then applies it
    chunk-wise to ``pred_infile``, writing the final Layer 3 predictions
    to ``<outfile>.csv`` and the train predictions to
    ``<outfile>_train.csv``.

    Parameters
    ----------
    reference_infile : str
		location of train data
    pred_infile : str
        location of file to make predictions for
    k : str
        key name to add to predictions and models
    outfile : str
        outfile for the predictions
    outfile_modname : str
        name for the models it will train
    num_jobs : int
        number of threads to spawn
    GUI_obj : object
        gui object to update log
    ch_size : int
        chunk size for generating predictions

    Returns
    -------
    None
    """
    try:
        # Run from rt/ when it exists; otherwise assume we already are
        # in the right working directory.
        os.chdir("rt/")
    except OSError:
        pass
    ref_infile = pd.read_csv(reference_infile)

    # Downcast numeric columns so the (potentially huge) prediction
    # chunks are read with a smaller memory footprint.
    dict_dtypes = dict(
        ref_infile.select_dtypes(include=['int']).apply(
            pd.to_numeric, downcast="integer").dtypes)
    float_dtypes = dict(
        ref_infile.select_dtypes(include=['float']).apply(
            pd.to_numeric, downcast="float").dtypes)
    dict_dtypes.update(float_dtypes)

    # Count rows only to report progress; close the handle deterministically.
    with open(pred_infile, "r") as pred_handle:
        tot_preds = sum(1 for row in pred_handle) / ch_size
    p_infile = pd.read_csv(pred_infile, dtype=dict_dtypes, chunksize=ch_size)

    counter_fold = 0

    with open("features/selected_features.txt") as feat_file:
        keep_f = [x.strip() for x in feat_file.readlines()]

    # Train data keeps "time"; prediction chunks keep neither meta column.
    keep_f.remove("system")
    ref_infile = ref_infile[keep_f]

    keep_f.remove("time")

    n = len(ref_infile)
    remove_models(k, n)

    print("===========")
    print("Total number of train molecules with tR: %s" % (n))

    train = ref_infile
    # Guard against infinities/NaNs produced during feature generation.
    train = train.replace([np.inf, -np.inf], np.nan)
    train = train.fillna(0.0)

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    cv = list(cv.split(train.index))

    cv_list = cv_to_fold(cv, len(train.index))

    # Layer 1 training happens once, outside of the chunk loop.
    preds_own = train_l1_func(train,
                              names=[k, k, k, k, k, k, k],
                              adds=[n, n, n, n, n, n, n, n],
                              cv=cv,
                              outfile_modname=outfile_modname,
                              n_jobs=num_jobs)
    preds_l1_train, skipped_train = apply_models(
        train.drop(["time", "IDENTIFIER", "system"], axis=1, errors='ignore'),
        known_rt=train["time"],
        row_identifiers=train["IDENTIFIER"],
        skip_cont=[k])
    preds_l1_train = pd.concat(
        [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

    for test in p_infile:
        counter_fold += 1
        print("----------")
        print("Read chunk (out of %s): %s" %
              (int(tot_preds) + 1, counter_fold))
        test = test[keep_f]
        test = test.replace([np.inf, -np.inf], np.nan)
        test = test.fillna(0.0)

        print("Applying Layer 1...")

        preds_l1_test, skipped_test = apply_models(
            test.drop(["time", "IDENTIFIER", "system"],
                      axis=1,
                      errors='ignore'),
            row_identifiers=test["IDENTIFIER"])

        print("Applying Layer 2...")

        preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                                 preds_l1_test,
                                                 cv_list=cv_list,
                                                 name=k)

        # Drop L1 columns that were carried through into the L2 frames so
        # Layer 3 only sees the L2 predictions.
        rem_col = preds_l1_train.drop(["time", "IDENTIFIER"],
                                      axis=1,
                                      errors='ignore').columns
        rem_col = [r for r in rem_col if r in preds_l2_train.columns]
        preds_l2_train = preds_l2_train.drop(rem_col, axis=1)
        preds_l2_test = preds_l2_test.drop(rem_col, axis=1)

        print("Applying Layer 3...")

        preds_l3_train, preds_l3_test, coefs = train_l3(preds_l2_train,
                                                        preds_l2_test,
                                                        cv=cv)

        preds_l3_train.columns = ["identifiers", "predictions", "tR"]
        preds_l3_test.columns = ["identifiers", "predictions"]

        # BUGFIX: append chunks after the first one -- reopening in "w"
        # mode per chunk used to overwrite all earlier chunk predictions,
        # leaving only the last chunk in the output file.
        first_chunk = counter_fold == 1
        preds_l3_test.to_csv("%s.csv" % (outfile),
                             index=False,
                             mode="w" if first_chunk else "a",
                             header=first_chunk)
        # Train predictions are rewritten per chunk (latest fit wins),
        # matching the previous observable end state of this file.
        preds_l3_train.to_csv("%s_train.csv" % (outfile), index=False)

    print("Done, predictions can be found here: %s.csv" % (outfile))
    print("===========")

    if len(outfile_modname) > 0:
        # Temporary per-run L1 models are no longer needed.
        rem_files = [
            "mods_l1/%s_brr.pickle" % (k),
            "mods_l1/%s_SVM.pickle" % (k),
            "mods_l1/%s_xgb.pickle" % (k),
            "mods_l1/%s_adaboost.pickle" % (k),
            "mods_l1/%s_lasso.pickle" % (k)
        ]
        for fn in rem_files:
            if os.path.exists(fn):
                os.remove(fn)
            else:
                print(
                    "Can not remove %s file. You need to remove it manually." %
                    fn)
Ejemplo n.º 4
0
def main(infilen="retmet_features_streamlit.csv"):
    """
    Make predictions for the evaluation of CALLC.

    Runs a 10-fold cross-validated evaluation of the L1/L2/L3 model stack
    for a fixed whitelist of datasets ("systems"), scaling features with a
    previously fitted scaler (``scaler.pkl``) and writing all intermediate
    and final predictions to ``test_preds/``.

    Parameters
    ----------
    infilen : str
        location of train data

    Returns
    -------
    None
    """
    global adds
    global n_all

    infile = pd.read_csv(infilen)
    infile.fillna(0.0, inplace=True)

    try:
        # Reuse a previously cached feature selection when available.
        with open("features/selected_features_v2.txt",
                  encoding="utf-8") as feat_file:
            keep_f = [x.strip() for x in feat_file.readlines()]
    except IOError:
        # First run: select features and cache the selection for reuse.
        # (Handle closed deterministically; it used to be left open.)
        infile, keep_f = sel_features(infile)
        with open("features/selected_features_v2.txt", "w") as feat_file:
            feat_file.write("\n".join(list(keep_f)))

    keep_f_features_only = [
        f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
    ]
    # Stray string values in feature columns become NaN, then 0.0.
    infile[keep_f_features_only] = infile[keep_f_features_only].applymap(
        lambda x: np.nan if isinstance(x, str) else x)
    infile.fillna(0.0, inplace=True)

    keep_f_time_sys_ident = list(keep_f)
    keep_f_time_sys_ident.extend(["time", "IDENTIFIER", "system"])

    # Apply the scaler fitted when the deployed models were trained.
    with open('scaler.pkl', 'rb') as scaler_file:
        scaler = load(scaler_file)

    infile[keep_f_features_only] = scaler.transform(
        infile[keep_f_features_only])

    infile.fillna(0.0, inplace=True)
    infile.replace(np.inf, 0.0, inplace=True)
    infile.replace(-np.inf, 0.0, inplace=True)

    sets = get_sets(infile)

    for k in sets.keys():
        if k == "Waters ACQUITY UPLC with Synapt G1 Q-TOF":
            continue
        # Only evaluate this fixed whitelist of datasets.
        if k not in [
                "MPI_Symmetry", "PFR-TK72", "Cao_HILIC", "Eawag_Xbridge",
                "UniToyama_Atlantis", "LIFE_old", "MTBLS4", "RIKEN", "MTBLS52",
                "Beck", "FEM_lipids", "Nikiforos", "MTBLS36", "FEM_short",
                "MTBLS", "LIFE_new", "MTBLS20", "FEM_orbitrap_urine",
                "Matsuura_15", "Kojima", "MTBLS87", "MTBLS38", "Huntscha",
                "Aicheler", "Matsuura", "Takahashi", "Ken",
                "FEM_orbitrap_plasma", "UFZ_phenomenex", "Otto", "Tohge",
                "MTBLS19", "FEM_long", "Ales_18", "Taguchi", "IPB_Halle",
                "Stravs_22", "Krauss", "MTBLS39"
        ]:
            continue
        selected_set = sets[k]

        kf = KFold(shuffle=True, random_state=1, n_splits=10)

        # Need at least 20 molecules for a meaningful 10-fold CV.
        if len(selected_set.index) < 20:
            continue

        exp_counter = 0
        ind = -1
        for train_index, test_index in kf.split(selected_set):
            ind += 1
            exp_counter += 1
            n = exp_counter
            print("TRAIN:", train_index, "TEST:", test_index)
            print(selected_set)

            train = selected_set.iloc[train_index]
            test = selected_set.iloc[test_index]

            cv = KFold(n_splits=10, shuffle=True, random_state=42)
            cv = list(cv.split(train.index))

            cv_list = cv_to_fold(cv, len(train.index))

            print("Training L1 %s,%s,%s" % (k, n, adds[ind]))

            # Set aside existing models of this set so they are not reused.
            move_models(k)
            preds_own, mods_own = train_l1_func(train[keep_f_time_sys_ident],
                                                names=[k, k, k, k, k, k, k],
                                                adds=[n, n, n, n, n, n, n, n],
                                                cv=cv)

            print("Applying L1 %s,%s,%s" % (k, n, adds[ind]))

            preds_l1_train, skipped_train = apply_models(
                train.drop(["time", "IDENTIFIER", "system"], axis=1)[keep_f],
                known_rt=train["time"],
                row_identifiers=train["IDENTIFIER"],
                skip_cont=[k])

            preds_l1_test, skipped_test = apply_models(
                test.drop(["time", "IDENTIFIER", "system"], axis=1)[keep_f],
                known_rt=test["time"],
                row_identifiers=test["IDENTIFIER"],
                additional_models=mods_own)

            preds_l1_train = pd.concat(
                [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

            print("Applying L2 %s,%s,%s" % (k, n, adds[ind]))

            preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                                     preds_l1_test,
                                                     cv_list=cv_list,
                                                     name=k)

            # Re-attach the raw features; they are dropped again below so
            # only prediction columns reach Layer 3.
            preds_l2_train = pd.concat([
                preds_l2_train.reset_index(drop=True),
                train.drop(["IDENTIFIER", "system", "time"],
                           axis=1).reset_index(drop=True)
            ],
                                       axis=1)
            preds_l2_test = pd.concat([
                preds_l2_test.reset_index(drop=True),
                test.drop(["IDENTIFIER", "system", "time"],
                          axis=1).reset_index(drop=True)
            ],
                                      axis=1)

            preds_l2_test.drop(keep_f_features_only, axis=1, inplace=True)
            preds_l2_train.drop(keep_f_features_only, axis=1, inplace=True)

            print("Applying L3 %s,%s,%s" % (k, n, adds[ind]))

            preds_l3_train, preds_l3_test, coefs_list = train_l3(
                preds_l2_train, preds_l2_test, cv=cv)

            # Common filename suffix: "<fold-number><add-tag>".
            suffix = "%s%s" % (n, adds[ind])

            # Elastic-net coefficients of the Layer 3 models.
            with open(
                    "test_preds/%s_preds_l3_%s_elasticnet_coefs.csv" %
                    (k, suffix), "w") as coef_file:
                for model, coefs in coefs_list:
                    coef_file.write("%s_%s\t%s\t%s\n" %
                                    (k, suffix, model, str(coefs)))

            # Per-layer test predictions.
            preds_l1_test.to_csv(
                "test_preds/%s_preds_l1_%s.csv" % (k, suffix), index=False)
            preds_l2_test.to_csv(
                "test_preds/%s_preds_l2_%s.csv" % (k, suffix), index=False)
            preds_l3_test.to_csv(
                "test_preds/%s_preds_l3_%s.csv" % (k, suffix), index=False)

            all_test = pd.concat([
                preds_l1_test.reset_index(drop=True),
                preds_l2_test.reset_index(drop=True),
                preds_l3_test.reset_index(drop=True)
            ],
                                 axis=1)
            # Drop columns duplicated between layers before writing.
            all_test = all_test.T.drop_duplicates().T
            all_test.to_csv(
                "test_preds/%s_preds_ALL_%s.csv" % (k, suffix), index=False)

            # Per-layer train predictions.
            preds_l1_train.to_csv(
                "test_preds/%s_preds_train_l1_%s.csv" % (k, suffix),
                index=False)
            preds_l2_train.to_csv(
                "test_preds/%s_preds_train_l2_%s.csv" % (k, suffix),
                index=False)
            preds_l3_train.to_csv(
                "test_preds/%s_preds_train_l3_%s.csv" % (k, suffix),
                index=False)

            all_train = pd.concat([
                preds_l1_train.reset_index(drop=True),
                preds_l2_train.reset_index(drop=True),
                preds_l3_train.reset_index(drop=True)
            ],
                                  axis=1)
            all_train = all_train.T.drop_duplicates().T
            all_train.to_csv(
                "test_preds/%s_ALL_train_l3_%s.csv" % (k, suffix),
                index=False)

            # Clean up this fold's models and restore the ones moved aside.
            remove_models(k, n)
            move_models_back(k)
Ejemplo n.º 5
0
def make_preds(reference_infile="train_set_lpp2.csv",
               pred_infile="lmfeatures.csv",
               k="CALLCtemp",
               outfile="",
               extra_pred_file="",
               outfile_modname="",
               num_jobs=4,
               GUI_obj=None,
               ch_size=100000):
    """
    Make predictions for the evaluation of CALLC.

    Parameters
    ----------
    reference_infile : str
		location of train data
    pred_infile : str
        location of file to make predictions for
    k : str
        key name to add to predictions and models
    outfile : str
        outfile for the predictions
    outfile_modname : str
        name for the models it will train
    num_jobs : int
        number of threads to spawn
    GUI_obj : object
        gui object to update log
    ch_size : int
        chunk size for generating predictions

    Returns
    -------
    tuple
        (preds_l3_train, preds_l3_test, plot_setups, preds_l1_test, coefs)
    """
    try:
        # Run from rt/ when it exists; otherwise assume we already are
        # in the right working directory.
        os.chdir("rt/")
    except OSError:
        pass

    # The reference input may be a path or an uploaded stream of bytes.
    if type(reference_infile) == str:
        ref_infile = pd.read_csv(reference_infile)
    else:
        ref_infile = get_feats("".join([l.decode() for l in reference_infile]))

    ref_infile["IDENTIFIER"] = ref_infile["IDENTIFIER"].apply(replace_non_ascii)

    # Make sure we have the correct (downcast) data types
    dict_dtypes = dict(
        ref_infile.select_dtypes(include=['int']).apply(
            pd.to_numeric, downcast="integer").dtypes)
    float_dtypes = dict(
        ref_infile.select_dtypes(include=['float']).apply(
            pd.to_numeric, downcast="float").dtypes)
    dict_dtypes.update(float_dtypes)

    if type(reference_infile) == str:
        # BUGFIX: read the prediction file as a single DataFrame. The
        # previous chunked read returned a TextFileReader, which crashed
        # later at ``test["IDENTIFIER"]`` since there is no chunk loop in
        # this version. (The unused row count / leaked handle went too.)
        p_infile = pd.read_csv(pred_infile, dtype=dict_dtypes)
    else:
        p_infile = get_feats("".join([l.decode() for l in pred_infile]))

    # Reference data used to fit/cache the feature selection.
    infile = pd.read_csv("datasets/input_for_scaler.csv", low_memory=False)
    infile.fillna(0.0, inplace=True)

    try:
        # Reuse a previously cached feature selection when available.
        with open("features/selected_features_v2.txt",
                  encoding="utf-8") as feat_file:
            keep_f = [x.strip() for x in feat_file.readlines()]
        infile = infile[keep_f]
    except IOError:
        infile, keep_f = sel_features(infile)
        # BUGFIX: write through a dedicated handle. Rebinding the
        # ``outfile`` parameter here used to corrupt the prediction file
        # names built from it further down.
        with open("features/selected_features_v2.txt", "w") as feat_file:
            feat_file.write("\n".join(list(keep_f)))

    keep_f_features_only = [
        f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
    ]
    # Stray string values in feature columns become NaN, then 0.0.
    infile[keep_f_features_only] = infile[keep_f_features_only].applymap(
        lambda x: np.nan if isinstance(x, str) else x)
    infile.fillna(0.0, inplace=True)

    n = len(ref_infile.index)

    print("===========")
    print("Total number of train molecules with tR: %s" % (n))

    keep_f_withoutid = list(keep_f_features_only)

    # Apply the scaler fitted when the deployed models were trained.
    with open('scaler.pkl', 'rb') as scaler_file:
        scaler = load(scaler_file)

    ref_infile[keep_f_withoutid] = scaler.transform(
        ref_infile[keep_f_withoutid])

    train = ref_infile

    # Define the folds to make predictions
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    cv = list(cv.split(train.index))

    cv_list = cv_to_fold(cv, len(train.index))

    # Make sure the training data has no infinities or NaNs.
    train = train.replace([np.inf, -np.inf], np.nan)
    train.fillna(0.0, inplace=True)

    keep_f_all = ["IDENTIFIER", "time"]
    keep_f_all.extend(copy.deepcopy(keep_f_withoutid))

    # Collision-free name for the models trained in this call.
    ms = str(int(time.time_ns()))
    hash_object = hashlib.sha1(ms.encode())
    hex_dig = hash_object.hexdigest()

    preds_own, mods_own = train_l1_func(
        train[keep_f_all],
        names=[hex_dig, hex_dig, hex_dig, hex_dig, hex_dig, hex_dig, hex_dig],
        adds=[n, n, n, n, n, n, n, n],
        cv=cv,
        outfile_modname=outfile_modname,
        n_jobs=num_jobs)

    preds_l1_train, skipped_train = apply_models(
        train.drop(["time", "IDENTIFIER", "system"], axis=1,
                   errors='ignore')[keep_f_withoutid],
        known_rt=train["time"],
        row_identifiers=train["IDENTIFIER"],
        skip_cont=[hex_dig])
    preds_l1_train = pd.concat(
        [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

    test = p_infile
    test["IDENTIFIER"] = test["IDENTIFIER"].apply(replace_non_ascii)

    test[keep_f_withoutid] = scaler.transform(test[keep_f_withoutid])
    # BUGFIX: assign the result -- the previous bare ``test.replace(...)``
    # call discarded its return value, leaving infinities in place.
    test = test.replace([np.inf, -np.inf], np.nan)
    test.fillna(0.0, inplace=True)

    print("Applying Layer 1...")

    preds_l1_test, skipped_test = apply_models(
        test.drop(["time", "IDENTIFIER", "system"], axis=1,
                  errors='ignore')[keep_f_withoutid],
        row_identifiers=test["IDENTIFIER"],
        skip_cont=[],
        additional_models=mods_own)
    preds_l1_test.fillna(0.0, inplace=True)

    # Min-max normalise the L1 predictions, keep the SVM columns and rank
    # setups by overlap to pick the 5 most similar ones for plotting.
    pred_cols = preds_l1_test.loc[:, preds_l1_test.columns != "IDENTIFIER"]
    preds_diff_l1 = (pred_cols - pred_cols.min()) / (pred_cols.max() -
                                                     pred_cols.min())
    preds_diff_l1.fillna(0.0, inplace=True)
    preds_diff_l1 = preds_diff_l1.loc[:, [
        c for c in preds_diff_l1.columns if c.endswith("SVM")
    ]]
    dist_l1 = preds_diff_l1.apply(calc_overlap_compounds)

    dist_l1 = dist_l1.sort_values()
    plot_setups = dist_l1[:5]

    print("Applying Layer 2...")

    preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                             preds_l1_test,
                                             cv_list=cv_list,
                                             name=k)

    print("Applying Layer 3...")

    preds_l3_train, preds_l3_test, coefs = train_l3(preds_l2_train,
                                                    preds_l2_test,
                                                    cv=cv)

    preds_l3_train.columns = ["identifiers", "predictions", "tR"]
    preds_l3_test.columns = ["identifiers", "predictions"]

    preds_l3_test.to_csv("%s.csv" % (outfile), index=False)
    preds_l3_train.to_csv("%s_train.csv" % (outfile), index=False)

    print("Done, predictions can be found here: %s.csv" % (outfile))
    print("===========")

    if len(outfile_modname) > 0:
        # Temporary per-run L1 models are no longer needed.
        rem_files = [
            "mods_l1/%s_brr.pickle" % (k),
            "mods_l1/%s_SVM.pickle" % (k),
            "mods_l1/%s_xgb.pickle" % (k),
            "mods_l1/%s_adaboost.pickle" % (k),
            "mods_l1/%s_lasso.pickle" % (k)
        ]
        for fn in rem_files:
            if os.path.exists(fn):
                os.remove(fn)
            else:
                print("Can not remove %s file. You need to remove it manually." % fn)

    return preds_l3_train, preds_l3_test, plot_setups, preds_l1_test, coefs