def main(infilen="train/retmetfeatures_new.csv"):
    """Run the full CALLC evaluation (Layer 1 -> 2 -> 3) on every dataset.

    For each dataset key and each training-set size in the module-global
    ``n_all``, the data is shuffled into a train/test split, the three
    stacked layers are trained/applied, and all per-layer predictions plus
    the elastic-net coefficients are written to ``test_preds/``.

    Parameters
    ----------
    infilen : str
        Location of the train data (CSV with feature columns plus
        "time", "IDENTIFIER" and "system").

    Returns
    -------
    None
        Results are written to ``test_preds/*.csv``.
    """
    global adds
    global n_all

    infile = pd.read_csv(infilen)
    infile.fillna(0.0, inplace=True)

    # Reuse a previously selected feature set when available; otherwise
    # run feature selection now and persist the selection for later runs.
    try:
        keep_f = [
            x.strip()
            for x in open("features/selected_features.txt").readlines()
        ]
        infile = infile[keep_f]
    except IOError:
        infile, keep_f = sel_features(infile)
        # FIX: use a context manager so the selection file is flushed and
        # closed (the original handle was never closed).
        with open("features/selected_features.txt", "w") as sel_out:
            sel_out.write("\n".join(list(keep_f)))

    keep_f_features_only = [
        f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
    ]
    scaler = StandardScaler()
    infile[keep_f_features_only] = scaler.fit_transform(
        infile[keep_f_features_only])

    sets = get_sets(infile)
    for k in sets.keys():
        for ind in range(len(n_all)):
            selected_set = sets[k]
            select_index = list(range(len(selected_set.index)))
            n = n_all[ind]
            # Not enough molecules in this dataset for this train size.
            if n > len(selected_set.index):
                continue
            shuffle(select_index)
            train = selected_set.iloc[select_index[0:n], ]
            # FIX: the hold-out split was commented out in the original,
            # so every later use of `test` raised a NameError.  Restore the
            # split and the minimum-test-size guard.
            test = selected_set.iloc[select_index[n:], ]
            if len(select_index[n:]) < 10:
                continue

            cv = KFold(n_splits=10, shuffle=True)
            cv_spl = cv.split(list(train.index))
            cv_list = cv_to_fold(cv_spl, len(train.index))

            print("Training L1 %s,%s,%s" % (k, n, adds[ind]))
            move_models(k)
            preds_own = train_l1_func(train,
                                      names=[k, k, k, k, k, k, k],
                                      adds=[n, n, n, n, n, n, n, n],
                                      cv=cv)

            print("Applying L1 %s,%s,%s" % (k, n, adds[ind]))
            preds_l1_train, skipped_train = apply_models(
                train.drop(["time", "IDENTIFIER", "system"], axis=1),
                known_rt=train["time"],
                row_identifiers=train["IDENTIFIER"],
                skip_cont=[k])
            preds_l1_test, skipped_test = apply_models(
                test.drop(["time", "IDENTIFIER", "system"], axis=1),
                known_rt=test["time"],
                row_identifiers=test["IDENTIFIER"])
            preds_l1_train = pd.concat(
                [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

            print("Applying L2 %s,%s,%s" % (k, n, adds[ind]))
            preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                                     preds_l1_test,
                                                     cv_list=cv_list,
                                                     name=k)
            # Re-attach the raw feature columns for Layer 3.
            preds_l2_train = pd.concat([
                preds_l2_train.reset_index(drop=True),
                train.drop(["IDENTIFIER", "system", "time"],
                           axis=1).reset_index(drop=True)
            ], axis=1)
            preds_l2_test = pd.concat([
                preds_l2_test.reset_index(drop=True),
                test.drop(["IDENTIFIER", "system", "time"],
                          axis=1).reset_index(drop=True)
            ], axis=1)

            print("Applying L3 %s,%s,%s" % (k, n, adds[ind]))
            preds_l3_train, preds_l3_test, coefs_list = train_l3(
                preds_l2_train, preds_l2_test, cv=cv)

            # Persist the elastic-net coefficients of each L3 model.
            with open(
                    "test_preds/%s_preds_l3_%s%s_elasticnet_coefs.csv" %
                    (k, n, adds[ind]), "w") as coef_out:
                for model, coefs in coefs_list:
                    coef_out.write("%s_%s%s\t%s\t%s\n" %
                                   (k, n, adds[ind], model, str(coefs)))

            # Concatenate all layers; the transpose round-trip removes
            # duplicated columns (same trick as the original).
            all_test = pd.concat([
                preds_l1_test.reset_index(drop=True),
                preds_l2_test.reset_index(drop=True),
                preds_l3_test.reset_index(drop=True)
            ], axis=1)
            all_test = all_test.T.drop_duplicates().T
            all_train = pd.concat([
                preds_l1_train.reset_index(drop=True),
                preds_l2_train.reset_index(drop=True),
                preds_l3_train.reset_index(drop=True)
            ], axis=1)
            all_train = all_train.T.drop_duplicates().T

            # FIX: pass paths to to_csv so pandas opens and closes each
            # file itself; the original opened eight handles up front and
            # leaked all of them if any to_csv raised.
            preds_l1_test.to_csv(
                "test_preds/%s_preds_l1_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l2_test.to_csv(
                "test_preds/%s_preds_l2_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l3_test.to_csv(
                "test_preds/%s_preds_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            all_test.to_csv(
                "test_preds/%s_preds_ALL_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l1_train.to_csv(
                "test_preds/%s_preds_train_l1_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l2_train.to_csv(
                "test_preds/%s_preds_train_l2_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l3_train.to_csv(
                "test_preds/%s_preds_train_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            all_train.to_csv(
                "test_preds/%s_ALL_train_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)

            remove_models(k, n)
            move_models_back(k)
def main(infilen="train/retmetfeatures_new.csv"):
    """Run the CALLC evaluation for a single dataset ("kohlbacher").

    NOTE(review): this is a second definition of ``main`` in the same
    module and therefore overrides the one defined above it; it differs
    only in the hard-coded dataset filter and in not zero-filling the
    input.  Consider merging the two or renaming one of them.

    Parameters
    ----------
    infilen : str
        Location of the train data (CSV with feature columns plus
        "time", "IDENTIFIER" and "system").

    Returns
    -------
    None
        Results are written to ``test_preds/*.csv``.
    """
    global adds
    global n_all

    infile = pd.read_csv(infilen)

    # Reuse a previously selected feature set when available; otherwise
    # run feature selection now and persist the selection for later runs.
    try:
        keep_f = [
            x.strip()
            for x in open("features/selected_features.txt").readlines()
        ]
        infile = infile[keep_f]
    except IOError:
        infile, keep_f = sel_features(infile)
        # FIX: context manager so the selection file is flushed and closed
        # (the original handle was never closed).
        with open("features/selected_features.txt", "w") as sel_out:
            sel_out.write("\n".join(list(keep_f)))

    keep_f_features_only = [
        f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
    ]
    scaler = StandardScaler()
    infile[keep_f_features_only] = scaler.fit_transform(
        infile[keep_f_features_only])

    sets = get_sets(infile)
    for k in sets.keys():
        # Debug filter kept from the original: only evaluate this dataset.
        if k != "kohlbacher":
            continue
        for ind in range(len(n_all)):
            selected_set = sets[k]
            select_index = list(range(len(selected_set.index)))
            n = n_all[ind]
            # Not enough molecules in this dataset for this train size.
            if n > len(selected_set.index):
                continue
            shuffle(select_index)
            train = selected_set.iloc[select_index[0:n], ]
            # FIX: the hold-out split was commented out in the original,
            # so every later use of `test` raised a NameError.  Restore the
            # split and the minimum-test-size guard.
            test = selected_set.iloc[select_index[n:], ]
            if len(select_index[n:]) < 10:
                continue

            cv = KFold(n_splits=10, shuffle=True)
            cv_spl = cv.split(list(train.index))
            cv_list = cv_to_fold(cv_spl, len(train.index))

            print("Training L1 %s,%s,%s" % (k, n, adds[ind]))
            move_models(k)
            preds_own = train_l1_func(train,
                                      names=[k, k, k, k, k, k, k],
                                      adds=[n, n, n, n, n, n, n, n],
                                      cv=cv)

            print("Applying L1 %s,%s,%s" % (k, n, adds[ind]))
            preds_l1_train, skipped_train = apply_models(
                train.drop(["time", "IDENTIFIER", "system"], axis=1),
                known_rt=train["time"],
                row_identifiers=train["IDENTIFIER"],
                skip_cont=[k])
            preds_l1_test, skipped_test = apply_models(
                test.drop(["time", "IDENTIFIER", "system"], axis=1),
                known_rt=test["time"],
                row_identifiers=test["IDENTIFIER"])
            preds_l1_train = pd.concat(
                [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

            print("Applying L2 %s,%s,%s" % (k, n, adds[ind]))
            preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                                     preds_l1_test,
                                                     cv_list=cv_list,
                                                     name=k)
            # Re-attach the raw feature columns for Layer 3.
            preds_l2_train = pd.concat([
                preds_l2_train.reset_index(drop=True),
                train.drop(["IDENTIFIER", "system", "time"],
                           axis=1).reset_index(drop=True)
            ], axis=1)
            preds_l2_test = pd.concat([
                preds_l2_test.reset_index(drop=True),
                test.drop(["IDENTIFIER", "system", "time"],
                          axis=1).reset_index(drop=True)
            ], axis=1)

            print("Applying L3 %s,%s,%s" % (k, n, adds[ind]))
            preds_l3_train, preds_l3_test, coefs_list = train_l3(
                preds_l2_train, preds_l2_test, cv=cv)

            # Persist the elastic-net coefficients of each L3 model.
            with open(
                    "test_preds/%s_preds_l3_%s%s_elasticnet_coefs.csv" %
                    (k, n, adds[ind]), "w") as coef_out:
                for model, coefs in coefs_list:
                    coef_out.write("%s_%s%s\t%s\t%s\n" %
                                   (k, n, adds[ind], model, str(coefs)))

            # Concatenate all layers; the transpose round-trip removes
            # duplicated columns (same trick as the original).
            all_test = pd.concat([
                preds_l1_test.reset_index(drop=True),
                preds_l2_test.reset_index(drop=True),
                preds_l3_test.reset_index(drop=True)
            ], axis=1)
            all_test = all_test.T.drop_duplicates().T
            all_train = pd.concat([
                preds_l1_train.reset_index(drop=True),
                preds_l2_train.reset_index(drop=True),
                preds_l3_train.reset_index(drop=True)
            ], axis=1)
            all_train = all_train.T.drop_duplicates().T

            # FIX: pass paths to to_csv so pandas opens/closes each file
            # itself; the original opened eight handles up front and leaked
            # all of them if any to_csv raised.
            preds_l1_test.to_csv(
                "test_preds/%s_preds_l1_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l2_test.to_csv(
                "test_preds/%s_preds_l2_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l3_test.to_csv(
                "test_preds/%s_preds_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            all_test.to_csv(
                "test_preds/%s_preds_ALL_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l1_train.to_csv(
                "test_preds/%s_preds_train_l1_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l2_train.to_csv(
                "test_preds/%s_preds_train_l2_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l3_train.to_csv(
                "test_preds/%s_preds_train_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            all_train.to_csv(
                "test_preds/%s_ALL_train_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)

            remove_models(k, n)
            move_models_back(k)
def make_preds(reference_infile="train_set_lpp2.csv",
               pred_infile="lmfeatures.csv",
               k="MASSTRPLAN",
               outfile="",
               outfile_modname="",
               num_jobs=4,
               GUI_obj=None,
               ch_size=100000):
    """Train on a reference set and predict retention times in chunks.

    Layer 1 models are trained once on the reference data; the (possibly
    very large) prediction file is then processed in ``ch_size``-row
    chunks through layers 1-3, appending each chunk's predictions to
    ``<outfile>.csv`` / ``<outfile>_train.csv``.

    Parameters
    ----------
    reference_infile : str
        Location of train data.
    pred_infile : str
        Location of file to make predictions for.
    k : str
        Key name to add to predictions and models.
    outfile : str
        Outfile for the predictions.
    outfile_modname : str
        Name for the models it will train.
    num_jobs : int
        Number of threads to spawn.
    GUI_obj : object
        GUI object to update log (currently unused here).
    ch_size : int
        Chunk size for generating predictions.

    Returns
    -------
    None
        Predictions are written to ``<outfile>.csv``.
    """
    # Best-effort chdir; we may already be inside rt/.
    try:
        os.chdir("rt/")
    except OSError:
        pass

    ref_infile = pd.read_csv(reference_infile)

    # Downcast numeric dtypes so the big prediction file is read with a
    # compact memory footprint.
    dict_dtypes = dict(
        ref_infile.select_dtypes(include=['int']).apply(
            pd.to_numeric, downcast="integer").dtypes)
    float_dtypes = dict(
        ref_infile.select_dtypes(include=['float']).apply(
            pd.to_numeric, downcast="float").dtypes)
    dict_dtypes.update(float_dtypes)

    # FIX: count lines with a context manager (the original leaked the
    # file handle opened just for counting).
    with open(pred_infile, "r") as pred_fh:
        tot_preds = sum(1 for row in pred_fh) / ch_size
    p_infile = pd.read_csv(pred_infile, dtype=dict_dtypes, chunksize=ch_size)

    counter_fold = 0
    keep_f = [
        x.strip()
        for x in open("features/selected_features.txt").readlines()
    ]
    keep_f.remove("system")
    ref_infile = ref_infile[keep_f]
    keep_f.remove("time")

    n = len(ref_infile)
    remove_models(k, n)
    print("===========")
    print("Total number of train molecules with tR: %s" % (n))

    train = ref_infile
    train = train.replace([np.inf, -np.inf], np.nan)
    train = train.fillna(0.0)

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    cv = list(cv.split(train.index))
    cv_list = cv_to_fold(cv, len(train.index))

    # Layer 1 is trained once, outside of the chunk loop.
    preds_own = train_l1_func(train,
                              names=[k, k, k, k, k, k, k],
                              adds=[n, n, n, n, n, n, n, n],
                              cv=cv,
                              outfile_modname=outfile_modname,
                              n_jobs=num_jobs)
    preds_l1_train, skipped_train = apply_models(
        train.drop(["time", "IDENTIFIER", "system"], axis=1,
                   errors='ignore'),
        known_rt=train["time"],
        row_identifiers=train["IDENTIFIER"],
        skip_cont=[k])
    preds_l1_train = pd.concat(
        [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

    for test in p_infile:
        counter_fold += 1
        print("----------")
        print("Read chunk (out of %s): %s" %
              (int(tot_preds) + 1, counter_fold))
        test = test[keep_f]
        test = test.replace([np.inf, -np.inf], np.nan)
        test = test.fillna(0.0)

        print("Applying Layer 1...")
        preds_l1_test, skipped_test = apply_models(
            test.drop(["time", "IDENTIFIER", "system"], axis=1,
                      errors='ignore'),
            row_identifiers=test["IDENTIFIER"])

        print("Applying Layer 2...")
        preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                                 preds_l1_test,
                                                 cv_list=cv_list,
                                                 name=k)
        # Drop L1 columns that L2 duplicated.
        rem_col = preds_l1_train.drop(["time", "IDENTIFIER"], axis=1,
                                      errors='ignore').columns
        rem_col = [r for r in rem_col if r in preds_l2_train.columns]
        preds_l2_train = preds_l2_train.drop(rem_col, axis=1)
        preds_l2_test = preds_l2_test.drop(rem_col, axis=1)

        print("Applying Layer 3...")
        preds_l3_train, preds_l3_test, coefs = train_l3(preds_l2_train,
                                                        preds_l2_test,
                                                        cv=cv)

        preds_l3_train.columns = ["identifiers", "predictions", "tR"]
        preds_l3_test.columns = ["identifiers", "predictions"]

        # FIX: the original re-opened both output files in "w" mode for
        # every chunk, so each chunk overwrote the previous chunk's
        # predictions.  Write the first chunk with a header, then append.
        first_chunk = counter_fold == 1
        mode = "w" if first_chunk else "a"
        with open("%s.csv" % (outfile), mode) as outfilel3:
            preds_l3_test.to_csv(outfilel3, index=False, header=first_chunk)
        with open("%s_train.csv" % (outfile), mode) as outfilel3train:
            preds_l3_train.to_csv(outfilel3train, index=False,
                                  header=first_chunk)

    print("Done, predictions can be found here: %s.csv" % (outfile))
    print("===========")

    # Clean up the temporary L1 model pickles when a model name was given.
    # NOTE(review): the condition looks inverted (removes models exactly
    # when the caller asked for named models) — kept as in the original;
    # confirm intent.
    if len(outfile_modname) > 0:
        rem_files = [
            "mods_l1/%s_brr.pickle" % (k),
            "mods_l1/%s_SVM.pickle" % (k),
            "mods_l1/%s_xgb.pickle" % (k),
            "mods_l1/%s_adaboost.pickle" % (k),
            "mods_l1/%s_lasso.pickle" % (k)
        ]
        for fn in rem_files:
            if os.path.exists(fn):
                os.remove(fn)
            else:
                print(
                    "Can not remove %s file. You need to remove it manually."
                    % fn)
def main(infilen="retmet_features_streamlit.csv"):
    """Run the CALLC evaluation with 10-fold CV on a fixed dataset list.

    Unlike the size-sweep variants above, this version evaluates each
    allowed dataset with a 10-fold outer split (each fold is one
    "experiment"), scaling features with a pre-fitted, pickled scaler.

    Parameters
    ----------
    infilen : str
        Location of the train data (CSV with feature columns plus
        "time", "IDENTIFIER" and "system").

    Returns
    -------
    None
        Results are written to ``test_preds/*.csv``.
    """
    global adds
    global n_all

    infile = pd.read_csv(infilen)
    infile.fillna(0.0, inplace=True)
    # NOTE(review): this result is overwritten after scaling below; kept
    # only in case get_sets has side effects — confirm and remove.
    sets = get_sets(infile)

    # Reuse a previously selected feature set when available; otherwise
    # run feature selection now and persist the selection for later runs.
    # Either way, non-numeric (string) feature values become NaN -> 0.0.
    try:
        keep_f = [
            x.strip()
            for x in open("features/selected_features_v2.txt",
                          encoding="utf-8").readlines()
        ]
        keep_f_features_only = [
            f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
        ]
        infile[keep_f_features_only] = infile[
            keep_f_features_only].applymap(
                lambda x: np.nan if isinstance(x, str) else x)
        infile.fillna(0.0, inplace=True)
    except IOError:
        infile, keep_f = sel_features(infile)
        # FIX: context manager so the selection file is flushed and closed
        # (the original handle was never closed).
        with open("features/selected_features_v2.txt", "w") as sel_out:
            sel_out.write("\n".join(list(keep_f)))
        keep_f_features_only = [
            f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
        ]
        infile[keep_f_features_only] = infile[
            keep_f_features_only].applymap(
                lambda x: np.nan if isinstance(x, str) else x)
        infile.fillna(0.0, inplace=True)

    keep_f_time_sys_ident = list(keep_f)
    keep_f_time_sys_ident.extend(["time", "IDENTIFIER", "system"])

    # FIX: close the pickle handle (the original left it open).
    with open('scaler.pkl', 'rb') as scaler_fh:
        scaler = load(scaler_fh)
    infile[keep_f_features_only] = scaler.transform(
        infile[keep_f_features_only])
    infile.fillna(0.0, inplace=True)
    infile.replace(np.inf, 0.0, inplace=True)
    infile.replace(-np.inf, 0.0, inplace=True)

    sets = get_sets(infile)

    # Only these datasets are evaluated in this variant.
    allowed_sets = [
        "MPI_Symmetry", "PFR-TK72", "Cao_HILIC", "Eawag_Xbridge",
        "UniToyama_Atlantis", "LIFE_old", "MTBLS4", "RIKEN", "MTBLS52",
        "Beck", "FEM_lipids", "Nikiforos", "MTBLS36", "FEM_short", "MTBLS",
        "LIFE_new", "MTBLS20", "FEM_orbitrap_urine", "Matsuura_15",
        "Kojima", "MTBLS87", "MTBLS38", "Huntscha", "Aicheler", "Matsuura",
        "Takahashi", "Ken", "FEM_orbitrap_plasma", "UFZ_phenomenex",
        "Otto", "Tohge", "MTBLS19", "FEM_long", "Ales_18", "Taguchi",
        "IPB_Halle", "Stravs_22", "Krauss", "MTBLS39"
    ]
    for k in sets.keys():
        if k == "Waters ACQUITY UPLC with Synapt G1 Q-TOF":
            continue
        if k not in allowed_sets:
            continue

        selected_set = sets[k]
        kf = KFold(shuffle=True, random_state=1, n_splits=10)
        # Need at least 20 molecules for a meaningful 10-fold split.
        if len(selected_set.index) < 20:
            continue

        exp_counter = 0
        ind = -1
        for train_index, test_index in kf.split(selected_set):
            ind += 1
            exp_counter += 1
            n = exp_counter
            print("TRAIN:", train_index, "TEST:", test_index)
            print(selected_set)
            train = selected_set.iloc[train_index]
            test = selected_set.iloc[test_index]

            cv = KFold(n_splits=10, shuffle=True, random_state=42)
            cv = list(cv.split(train.index))
            cv_list = cv_to_fold(cv, len(train.index))

            print("Training L1 %s,%s,%s" % (k, n, adds[ind]))
            move_models(k)
            preds_own, mods_own = train_l1_func(
                train[keep_f_time_sys_ident],
                names=[k, k, k, k, k, k, k],
                adds=[n, n, n, n, n, n, n, n],
                cv=cv)

            print("Applying L1 %s,%s,%s" % (k, n, adds[ind]))
            preds_l1_train, skipped_train = apply_models(
                train.drop(["time", "IDENTIFIER", "system"],
                           axis=1)[keep_f],
                known_rt=train["time"],
                row_identifiers=train["IDENTIFIER"],
                skip_cont=[k])
            preds_l1_test, skipped_test = apply_models(
                test.drop(["time", "IDENTIFIER", "system"],
                          axis=1)[keep_f],
                known_rt=test["time"],
                row_identifiers=test["IDENTIFIER"],
                additional_models=mods_own)
            preds_l1_train = pd.concat(
                [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

            print("Applying L2 %s,%s,%s" % (k, n, adds[ind]))
            preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                                     preds_l1_test,
                                                     cv_list=cv_list,
                                                     name=k)
            # Re-attach raw features, then strip them again after the
            # concat (original keeps only prediction columns for L3).
            preds_l2_train = pd.concat([
                preds_l2_train.reset_index(drop=True),
                train.drop(["IDENTIFIER", "system", "time"],
                           axis=1).reset_index(drop=True)
            ], axis=1)
            preds_l2_test = pd.concat([
                preds_l2_test.reset_index(drop=True),
                test.drop(["IDENTIFIER", "system", "time"],
                          axis=1).reset_index(drop=True)
            ], axis=1)
            preds_l2_test.drop(keep_f_features_only, axis=1, inplace=True)
            preds_l2_train.drop(keep_f_features_only, axis=1, inplace=True)

            print("Applying L3 %s,%s,%s" % (k, n, adds[ind]))
            preds_l3_train, preds_l3_test, coefs_list = train_l3(
                preds_l2_train, preds_l2_test, cv=cv)

            # Persist the elastic-net coefficients of each L3 model.
            with open(
                    "test_preds/%s_preds_l3_%s%s_elasticnet_coefs.csv" %
                    (k, n, adds[ind]), "w") as coef_out:
                for model, coefs in coefs_list:
                    coef_out.write("%s_%s%s\t%s\t%s\n" %
                                   (k, n, adds[ind], model, str(coefs)))

            # Concatenate all layers; the transpose round-trip removes
            # duplicated columns (same trick as the original).
            all_test = pd.concat([
                preds_l1_test.reset_index(drop=True),
                preds_l2_test.reset_index(drop=True),
                preds_l3_test.reset_index(drop=True)
            ], axis=1)
            all_test = all_test.T.drop_duplicates().T
            all_train = pd.concat([
                preds_l1_train.reset_index(drop=True),
                preds_l2_train.reset_index(drop=True),
                preds_l3_train.reset_index(drop=True)
            ], axis=1)
            all_train = all_train.T.drop_duplicates().T

            # FIX: pass paths to to_csv so pandas opens/closes each file
            # itself; the original opened eight handles up front and leaked
            # all of them if any to_csv raised.
            preds_l1_test.to_csv(
                "test_preds/%s_preds_l1_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l2_test.to_csv(
                "test_preds/%s_preds_l2_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l3_test.to_csv(
                "test_preds/%s_preds_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            all_test.to_csv(
                "test_preds/%s_preds_ALL_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l1_train.to_csv(
                "test_preds/%s_preds_train_l1_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l2_train.to_csv(
                "test_preds/%s_preds_train_l2_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            preds_l3_train.to_csv(
                "test_preds/%s_preds_train_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)
            all_train.to_csv(
                "test_preds/%s_ALL_train_l3_%s%s.csv" % (k, n, adds[ind]),
                index=False)

            remove_models(k, n)
            move_models_back(k)
def make_preds(reference_infile="train_set_lpp2.csv",
               pred_infile="lmfeatures.csv",
               k="CALLCtemp",
               outfile="",
               extra_pred_file="",
               outfile_modname="",
               num_jobs=4,
               GUI_obj=None,
               ch_size=100000):
    """Train on a reference set and predict retention times (GUI variant).

    Accepts either CSV paths or uploaded file-like objects (iterables of
    byte lines) for the reference and prediction inputs, scales features
    with a pre-fitted pickled scaler, runs layers 1-3 and returns the
    predictions plus the five most-similar setups for plotting.

    Parameters
    ----------
    reference_infile : str or iterable of bytes
        Location of train data, or an uploaded file's byte lines.
    pred_infile : str or iterable of bytes
        Location of file to make predictions for, or byte lines.
    k : str
        Key name to add to predictions and models.
    outfile : str
        Outfile path stem for the predictions.
    extra_pred_file : str
        Unused here; kept for interface compatibility.
    outfile_modname : str
        Name for the models it will train.
    num_jobs : int
        Number of threads to spawn.
    GUI_obj : object
        GUI object to update log (currently unused here).
    ch_size : int
        Chunk size for generating predictions (only used for CSV input).

    Returns
    -------
    tuple
        (preds_l3_train, preds_l3_test, plot_setups, preds_l1_test, coefs)
    """
    # Best-effort chdir; we may already be inside rt/.
    try:
        os.chdir("rt/")
    except OSError:
        pass

    # The reference input is either a CSV path or an uploaded file's
    # byte lines that first need feature calculation.
    if type(reference_infile) == str:
        ref_infile = pd.read_csv(reference_infile)
    else:
        ref_infile = get_feats("".join(
            [l.decode() for l in reference_infile]))
    ref_infile["IDENTIFIER"] = ref_infile["IDENTIFIER"].apply(
        replace_non_ascii)

    # Make sure we have compact numeric data types.
    dict_dtypes = dict(
        ref_infile.select_dtypes(include=['int']).apply(
            pd.to_numeric, downcast="integer").dtypes)
    float_dtypes = dict(
        ref_infile.select_dtypes(include=['float']).apply(
            pd.to_numeric, downcast="float").dtypes)
    dict_dtypes.update(float_dtypes)

    if type(reference_infile) == str:
        # FIX: count lines with a context manager (handle was leaked).
        with open(pred_infile, "r") as pred_fh:
            tot_preds = sum(1 for row in pred_fh) / ch_size
        p_infile = pd.read_csv(pred_infile,
                               dtype=dict_dtypes,
                               chunksize=ch_size)
    else:
        p_infile = get_feats("".join([l.decode() for l in pred_infile]))

    infile = pd.read_csv("datasets/input_for_scaler.csv", low_memory=False)
    infile.fillna(0.0, inplace=True)

    # Reuse a previously selected feature set when available; otherwise
    # run feature selection now and persist the selection for later runs.
    try:
        keep_f = [
            x.strip()
            for x in open("features/selected_features_v2.txt",
                          encoding="utf-8").readlines()
        ]
        infile = infile[keep_f]
        keep_f_features_only = [
            f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
        ]
        infile[keep_f_features_only] = infile[
            keep_f_features_only].applymap(
                lambda x: np.nan if isinstance(x, str) else x)
        infile.fillna(0.0, inplace=True)
    except IOError:
        infile, keep_f = sel_features(infile)
        # FIX: the original rebound the *outfile parameter* to this file
        # handle, so the later '"%s.csv" % (outfile)' formatted a file
        # object instead of the requested output path.  Use a separate
        # local name and close the handle.
        with open("features/selected_features_v2.txt", "w") as sel_out:
            sel_out.write("\n".join(list(keep_f)))
        keep_f_features_only = [
            f for f in keep_f if f not in ["time", "IDENTIFIER", "system"]
        ]
        infile[keep_f_features_only] = infile[
            keep_f_features_only].applymap(
                lambda x: np.nan if isinstance(x, str) else x)
        infile.fillna(0.0, inplace=True)

    n = len(ref_infile.index)
    print("===========")
    print("Total number of train molecules with tR: %s" % (n))

    keep_f_withoutid = list(keep_f_features_only)

    # Scale with the pre-fitted scaler.  (Removed the dead
    # `StandardScaler()` that was immediately overwritten.)
    # FIX: close the pickle handle (the original left it open).
    infile.fillna(0.0, inplace=True)
    with open('scaler.pkl', 'rb') as scaler_fh:
        scaler = load(scaler_fh)
    ref_infile[keep_f_withoutid] = scaler.transform(
        ref_infile[keep_f_withoutid])

    train = ref_infile

    # Define the folds used by every layer.
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    cv = list(cv.split(train.index))
    cv_list = cv_to_fold(cv, len(train.index))

    # Make sure the training data has no infinite or NaN values.
    train = train.replace([np.inf, -np.inf], np.nan)
    train.fillna(0.0, inplace=True)

    # Layer 1 is trained once, under a unique time-derived key so
    # concurrent runs do not collide on model names.
    keep_f_all = ["IDENTIFIER", "time"]
    keep_f_all.extend(copy.deepcopy(keep_f_withoutid))
    ms = str(int(time.time_ns()))
    hash_object = hashlib.sha1(ms.encode())
    hex_dig = hash_object.hexdigest()

    preds_own, mods_own = train_l1_func(
        train[keep_f_all],
        names=[hex_dig, hex_dig, hex_dig, hex_dig, hex_dig, hex_dig,
               hex_dig],
        adds=[n, n, n, n, n, n, n, n],
        cv=cv,
        outfile_modname=outfile_modname,
        n_jobs=num_jobs)
    preds_l1_train, skipped_train = apply_models(
        train.drop(["time", "IDENTIFIER", "system"], axis=1,
                   errors='ignore')[keep_f_withoutid],
        known_rt=train["time"],
        row_identifiers=train["IDENTIFIER"],
        skip_cont=[hex_dig])
    preds_l1_train = pd.concat(
        [preds_l1_train.reset_index(drop=True), preds_own], axis=1)

    test = p_infile
    test["IDENTIFIER"] = test["IDENTIFIER"].apply(replace_non_ascii)
    test[keep_f_withoutid] = scaler.transform(test[keep_f_withoutid])
    # FIX: replace() is not in-place; the original discarded its result,
    # so infinities survived into Layer 1.
    test = test.replace([np.inf, -np.inf], np.nan)
    test.fillna(0.0, inplace=True)

    print("Applying Layer 1...")
    preds_l1_test, skipped_test = apply_models(
        test.drop(["time", "IDENTIFIER", "system"], axis=1,
                  errors='ignore')[keep_f_withoutid],
        row_identifiers=test["IDENTIFIER"],
        skip_cont=[],
        additional_models=mods_own)
    preds_l1_test.fillna(0.0, inplace=True)

    # Min-max normalise the L1 predictions, keep only the SVM columns and
    # rank setups by overlap to pick the five most similar for plotting.
    pred_cols = preds_l1_test.loc[:, preds_l1_test.columns != "IDENTIFIER"]
    preds_diff_l1 = (pred_cols - pred_cols.min()) / (pred_cols.max() -
                                                     pred_cols.min())
    preds_diff_l1.fillna(0.0, inplace=True)
    preds_diff_l1 = preds_diff_l1.loc[:, [
        c for c in preds_diff_l1.columns if c.endswith("SVM")
    ]]
    dist_l1 = preds_diff_l1.apply(calc_overlap_compounds)
    dist_l1 = dist_l1.sort_values()
    plot_setups = dist_l1[:5]

    print("Applying Layer 2...")
    preds_l2_test, preds_l2_train = apply_l2(preds_l1_train,
                                             preds_l1_test,
                                             cv_list=cv_list,
                                             name=k)

    print("Applying Layer 3...")
    preds_l3_train, preds_l3_test, coefs = train_l3(preds_l2_train,
                                                    preds_l2_test,
                                                    cv=cv)

    preds_l3_train.columns = ["identifiers", "predictions", "tR"]
    preds_l3_test.columns = ["identifiers", "predictions"]
    with open("%s.csv" % (outfile), "w") as outfilel3:
        preds_l3_test.to_csv(outfilel3, index=False)
    with open("%s_train.csv" % (outfile), "w") as outfilel3train:
        preds_l3_train.to_csv(outfilel3train, index=False)

    print("Done, predictions can be found here: %s.csv" % (outfile))
    print("===========")

    # Clean up the temporary L1 model pickles when a model name was given.
    # NOTE(review): the condition looks inverted (removes models exactly
    # when the caller asked for named models) — kept as in the original;
    # confirm intent.
    if len(outfile_modname) > 0:
        rem_files = [
            "mods_l1/%s_brr.pickle" % (k),
            "mods_l1/%s_SVM.pickle" % (k),
            "mods_l1/%s_xgb.pickle" % (k),
            "mods_l1/%s_adaboost.pickle" % (k),
            "mods_l1/%s_lasso.pickle" % (k)
        ]
        for fn in rem_files:
            if os.path.exists(fn):
                os.remove(fn)
            else:
                print(
                    "Can not remove %s file. You need to remove it manually."
                    % fn)

    return preds_l3_train, preds_l3_test, plot_setups, preds_l1_test, coefs