def ssl_global(model_zoo, pipeline):
    """Run the co-training semi-supervised loop over a fixed number of batches.

    Each iteration trains every architecture in ``model_zoo``, ranks them by
    ``val_acc``, evaluates the three best ones with ``evaluate_cotrain``,
    labels the next batch of ``datos["batch_set"]`` with ``labeling``, pickles
    the accumulated EL/LC tables plus their statistics, and re-estimates
    ``pipeline["ssl_threshold"]`` as the mean of ``arch_scores_mean``.

    Args:
        model_zoo: Iterable of architecture identifiers. Each one is passed to
            ``training`` and used as a key of ``models_info``. At least three
            are needed because the top three are indexed unconditionally.
        pipeline: Experiment configuration dict. Keys read here:
            ``"path_label_stats"``, ``"id"``, ``"x_col_name"``,
            ``"y_col_name"``, ``"ssl_threshold"`` (fallback only) and
            ``"LC"`` (only in the branch for ``iteracion >= numero_lotes``).
            ``"ssl_threshold"`` is overwritten when the estimate succeeds.

    Returns:
        None. Results are written to pickle files and through ``save_logs``.

    NOTE(review): ``logs`` and ``logs_label`` are not defined in this function
    (their local initialisation is commented out in the original); they are
    presumably module-level lists - confirm.
    NOTE(review): a later ``def ssl_global`` in this file rebinds the name, so
    this definition is shadowed once the whole module has been executed.
    """
    import time

    numero_lotes = 5
    semi_method = 'co-training-multi'
    models_info = {}

    datos = {"df_base": get_dataset(pipeline)}
    datos = split_train_test(datos, pipeline)

    # Measure total execution time.
    start = time.time()

    for kfold in range(1):
        for iteracion in range(numero_lotes):
            print("\n######################")
            print("K-FOLD {} - ITERACION {}".format(kfold, iteracion))
            print("######################\n")

            # NOTE(review): the fold is re-fetched on every iteration, as in
            # the original statement order - confirm this is intended.
            datos = get_Fold(kfold, datos, pipeline)

            etapa = 'train' if iteracion == 0 else 'train_EL'

            # Common prefix of every pickle written during this iteration.
            stats_prefix = (pipeline["path_label_stats"] + str(pipeline["id"])
                            + '_' + str(iteracion))
            print(stats_prefix + '.pickle')

            for model in model_zoo:
                model_memory, model_performance = training(
                    kfold, etapa, datos, model, iteracion, models_info,
                    pipeline)
                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            # Rank architectures by validation accuracy and keep the best 3.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            arch_top1, mod_top1 = top_models[0], models_info[
                top_models[0]]['model_memory']
            arch_top2, mod_top2 = top_models[1], models_info[
                top_models[1]]['model_memory']
            arch_top3, mod_top3 = top_models[2], models_info[
                top_models[2]]['model_memory']

            print("\n")
            print(
                "Co-train: ",
                evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                 arch_top2, arch_top3, datos, etapa, kfold,
                                 iteracion, pipeline, models_info, logs))
            print("\n")

            if semi_method == 'supervised':
                break

            if iteracion < numero_lotes:
                df_batchset = datos["batch_set"][iteracion]
                df_batchset.columns = [
                    pipeline["x_col_name"], pipeline["y_col_name"]
                ]
                # Overwrite the label column with the placeholder '0'.
                df_batchset[pipeline["y_col_name"]] = '0'
            else:
                # Not reachable while the loop stops at numero_lotes; kept so
                # the LC re-labeling rounds can be re-enabled by widening the
                # range above.
                if iteracion == numero_lotes:
                    df_lc_pool = pd.DataFrame(pipeline["LC"])
                    batch_set_LC = list(
                        dividir_lotes(df_lc_pool, numero_lotes))
                    # Fixed: the original iterated ``enumerate(...)`` and
                    # indexed the list with the resulting tuple (TypeError).
                    for lote_lc in batch_set_LC:
                        print(len(lote_lc.iloc[:, 0].values.tolist()))
                    pipeline["LC"] = []

                df_batchset = pd.DataFrame([
                    batch_set_LC[int(iteracion - numero_lotes)].iloc[:, 0]
                    .values.tolist()
                ]).T
                df_batchset.columns = [pipeline["x_col_name"]]
                df_batchset[pipeline["y_col_name"]] = '0'

            datos['df_batchset'] = df_batchset

            datos, EL_iter, LC_iter = labeling(etapa, mod_top1, mod_top2,
                                               mod_top3, arch_top1, arch_top2,
                                               arch_top3, datos, pipeline,
                                               iteracion, models_info)

            print("EL_iter", len(EL_iter))
            print("LC_iter", len(LC_iter))

            # Tables are built from the accumulated datos["EL"]/datos["LC"],
            # not from the per-iteration EL_iter/LC_iter (the EXP30 variant).
            label_cols = [
                pipeline["x_col_name"], pipeline["y_col_name"], 'arch_scores'
            ]
            df_EL = pd.DataFrame(datos["EL"], columns=label_cols)
            df_LC = pd.DataFrame(datos["LC"], columns=label_cols)

            # Create the whole directory part of the output prefix. The
            # original created only the first path component, which breaks
            # for nested or absolute paths.
            stats_dir = os.path.dirname(stats_prefix)
            if stats_dir:
                os.makedirs(stats_dir, exist_ok=True)

            df_EL.to_pickle(stats_prefix + '_EL.pickle')
            df_LC.to_pickle(stats_prefix + '_LC.pickle')

            df_label_stats = label_stats(df_EL, df_LC, pipeline)
            df_label_stats.to_pickle(stats_prefix + '.pickle')

            # Train on the original TRAIN samples plus the EL samples
            # (the EXP30 variant used the EL samples only).
            datos['df_train_EL'] = pd.concat(
                [datos["df_train"], df_EL.iloc[:, :2]])

            try:
                print("AUTO-ESTIMATING OF SSL THRESHOLD ...")
                df_EL_stats = df_label_stats["df_EL_stats"]["df"]
                df_LC_stats = df_label_stats["df_LC_stats"]["df"]
                df_U_iter = pd.concat([df_EL_stats, df_LC_stats],
                                      ignore_index=True)
                ssl_th = df_U_iter.describe()["arch_scores_mean"]["mean"]
                pipeline["ssl_threshold"] = ssl_th
                print("NEW SSL THRESHOLD: ", ssl_th)
            except Exception:
                # Best effort: keep the previous threshold when the estimate
                # fails. ``Exception`` (not a bare except) lets Ctrl-C and
                # SystemExit propagate.
                print("ERROR - AUTO-ESTIMATING SSL THRESHOLD")
                ssl_th = pipeline["ssl_threshold"]
                traceback.print_exc()

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)

    end = time.time()
    print(end - start)
def ssl_global(archivos, model_zoo, csvs, pipeline):
    """Run the co-training loop for the 'gleasson' dataset (EXP30 variant).

    Splits the training data into train / validation / unlabeled (U) parts,
    cuts U into ``numero_lotes`` batches and, per iteration, trains every
    architecture in ``model_zoo``, evaluates the three with the best
    ``val_acc`` against both pathologists' test sets, labels the next batch
    and retrains on the samples labeled in that iteration only.

    Args:
        archivos: Forwarded to ``get_data``.
        model_zoo: Iterable of architecture identifiers. At least three are
            needed because the top three are indexed unconditionally.
        csvs: Forwarded to ``get_data``.
        pipeline: Experiment configuration dict. Keys read here:
            ``"path_label_stats"``, ``"id"`` and ``"ssl_threshold"``.

    Returns:
        None. Results are written to CSV/pickle files and via ``save_logs``.

    NOTE(review): ``dataset``, ``x_col_name``, ``y_col_name``,
    ``numero_lotes``, ``SEED``, ``semi_method``, ``train_epochs``,
    ``batch_epochs``, ``early_stopping`` and ``logs_label`` are not defined in
    this function; they are presumably module-level - confirm.
    NOTE(review): a later ``def ssl_global`` in this file rebinds the name, so
    this definition is shadowed once the whole module has been executed.
    """
    datos = {}
    models_info = {}

    # Restored from the commented-out line in the original. Without it
    # ``df_train`` is a local read before assignment, so the function always
    # raised UnboundLocalError on the first split below.
    # NOTE(review): confirm get_data's signature and return order.
    df_train, df_val, df_test1, df_test2 = get_data(archivos, csvs, pipeline)

    # Measure total execution time.
    start = time.time()

    fold = dividir_balanceado2(df_train, 4)

    for kfold in range(1):

        if dataset == 'gleasson':
            df_train_58 = pd.DataFrame([fold[kfold][0], fold[kfold][2]]).T
            df_train_58.columns = [x_col_name, y_col_name]

            df_val = pd.DataFrame([fold[kfold][1], fold[kfold][3]]).T
            df_val.columns = [x_col_name, y_col_name]

            fold1 = dividir_balanceado2(df_train_58, 4)
            df_train = pd.DataFrame([fold1[0][1], fold1[0][3]]).T
            df_train.columns = [x_col_name, y_col_name]

            df_train.to_csv('data/train.csv', index=False)
            df_val.to_csv('data/val.csv', index=False)
            df_test1.to_csv('data/test1.csv', index=False)
            df_test2.to_csv('data/test2.csv', index=False)

            df_U = pd.DataFrame([fold1[0][0], fold1[0][2]]).T
            df_U.columns = [x_col_name, y_col_name]

        EL, LC = [], []

        print("train :", len(df_train))
        print("val :", len(df_val))
        print("u :", len(df_U))

        # Split U into batches to be labeled one per iteration.
        batch_set = list(dividir_lotes(df_U, numero_lotes))
        for lote in batch_set:
            print(len(lote.iloc[:, 0].values.tolist()))

        datos['df_train'] = df_train
        datos['df_val'] = df_val
        datos['df_test1'] = df_test1
        datos['df_test2'] = df_test2

        for iteracion in range(numero_lotes):

            # Re-seed every RNG at the start of each iteration.
            random.seed(SEED)
            np.random.seed(SEED)
            tensorflow.random.set_random_seed(SEED)

            print("\n######################")
            print("K-FOLD {} - ITERACION {}".format(kfold, iteracion))
            print("######################\n")

            etapa = 'train' if iteracion == 0 else 'train_EL'

            stats_pickle = (pipeline["path_label_stats"] +
                            str(pipeline["id"]) + '_' + str(iteracion) +
                            '.pickle')
            print(stats_pickle)

            for model in model_zoo:
                model_memory, model_performance = entrenamiento(
                    kfold, etapa, datos, model, train_epochs, batch_epochs,
                    early_stopping, iteracion, models_info, pipeline)
                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            # Rank architectures by validation accuracy and keep the best 3.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            arch_top1, mod_top1 = top_models[0], models_info[
                top_models[0]]['model_memory']
            arch_top2, mod_top2 = top_models[1], models_info[
                top_models[1]]['model_memory']
            arch_top3, mod_top3 = top_models[2], models_info[
                top_models[2]]['model_memory']

            if dataset == 'gleasson':
                print(
                    "\nCo-train1: \n",
                    evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                     arch_top2, arch_top3,
                                     'gleasson-patologo1', datos, etapa,
                                     kfold, iteracion, pipeline, models_info))
                print(
                    "\nCo-train2: \n",
                    evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                     arch_top2, arch_top3,
                                     'gleasson-patologo2', datos, etapa,
                                     kfold, iteracion, pipeline, models_info))

            if semi_method == 'supervised':
                break

            if iteracion < numero_lotes:
                df_batchset = batch_set[iteracion]
                df_batchset.columns = [x_col_name, y_col_name]
                # Overwrite the label column with the placeholder '0'.
                df_batchset[y_col_name] = '0'
            else:
                # Not reachable while the loop stops at numero_lotes; kept so
                # the LC re-labeling rounds can be re-enabled by widening the
                # range above.
                if iteracion == numero_lotes:
                    df_lc_pool = pd.DataFrame(LC)
                    batch_set_LC = list(
                        dividir_lotes(df_lc_pool, numero_lotes))
                    for lote_lc in batch_set_LC:
                        print(len(lote_lc.iloc[:, 0].values.tolist()))
                    LC = []

                df_batchset = pd.DataFrame([
                    batch_set_LC[int(iteracion - numero_lotes)].iloc[:, 0]
                    .values.tolist()
                ]).T
                df_batchset.columns = [x_col_name]
                df_batchset[y_col_name] = '0'

            datos['df_batchset'] = df_batchset

            EL, LC, EL_iter, LC_iter = labeling(etapa, mod_top1, mod_top2,
                                                mod_top3, arch_top1,
                                                arch_top2, arch_top3, EL, LC,
                                                datos, pipeline, iteracion,
                                                models_info)

            # EXP30: statistics and retraining use only the samples labeled
            # in this iteration (EL_iter / LC_iter), not the accumulated sets.
            label_cols = [x_col_name, y_col_name, 'arch_scores']
            df_EL = pd.DataFrame(EL_iter, columns=label_cols)
            df_LC = pd.DataFrame(LC_iter, columns=label_cols)

            df_label_stats = label_stats(df_EL, df_LC)
            print(df_label_stats)
            df_label_stats.to_pickle(stats_pickle)

            df_train_EL = df_EL.iloc[:, :2].copy()  # EXP30: EL samples only
            print("df_train_EL")
            print(df_train_EL)
            datos['df_train_EL'] = df_train_EL

            # The automatic threshold estimate is disabled in this variant
            # (it was commented out), which left ``ssl_th`` unassigned in
            # this function; log the configured threshold instead.
            ssl_th = pipeline["ssl_threshold"]

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)

    end = time.time()
    print(end - start)
def ssl_global(model_zoo, pipeline):
    """Run the k-fold co-training semi-supervised experiment.

    For every fold the function fetches the fold data, pickles it, and then
    runs one iteration per entry of ``datos["batch_set"]``: train every
    architecture in ``model_zoo``, pick the three with the best ``val_acc``,
    evaluate them with ``evaluate_cotrain`` (timing the evaluation), label the
    batch with ``labeling``, pickle the EL/LC tables and their statistics, and
    build ``datos["df_train_EL"]`` for the next iteration.

    Args:
        model_zoo: Iterable of architecture identifiers. At least three are
            needed because the top three are indexed unconditionally.
        pipeline: Experiment configuration dict. Keys read here:
            ``"num_kfold"``, ``"save_path_stats"``, ``"id"``,
            ``"x_col_name"``, ``"y_col_name"``, ``"aug_factor"`` and
            ``"ssl_threshold"``.

    Returns:
        None. Results are written to pickle files and via ``save_logs``.

    NOTE(review): ``datos_total``, ``cotrain_list``, ``label_list``,
    ``logs``, ``logs_label`` and ``classification_metrics`` are not defined
    in this function; they are presumably module-level - confirm.
    NOTE(review): the per-iteration pickle names do not include the fold
    index, so with ``num_kfold > 1`` later folds overwrite earlier files.
    NOTE(review): a later ``def ssl_global`` in this file rebinds the name, so
    this definition is shadowed once the whole module has been executed.
    """
    import time

    datos = {"df_base": get_dataset(pipeline)}
    datos = split_train_test(datos, pipeline)

    # Total execution time. Kept in its own variable: the original reused
    # ``start`` for the per-iteration inference timer, so the final print
    # reported only the time since the last co-training evaluation.
    run_start = time.time()

    num_kfold = pipeline["num_kfold"]

    for kfold in range(num_kfold):
        models_info = {}

        datos = get_Fold(kfold, datos, pipeline)

        # Persist a snapshot of the data of every fold processed so far.
        datos_total.append({"kfold": kfold, "datos": datos})
        df_datos = pd.DataFrame(datos_total)
        datos_path = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(kfold) + '_data.pkl'
        df_datos.to_pickle(datos_path)

        numero_lotes = len(datos["batch_set"])

        for iteracion in range(numero_lotes):
            kfold_info = f"K-FOLD {kfold}/{num_kfold} - ITERACION {iteracion}/{numero_lotes}"
            banner = "#" * len(kfold_info)
            print("\n")
            print(banner)
            print(kfold_info)
            print(banner)
            print("\n")

            print("\n")
            print(f"CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print(datos["batch_set"][iteracion].groupby(
                pipeline["y_col_name"]).count())
            print(f"OK - CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print("\n")

            etapa = 'train' if iteracion == 0 else 'train_EL'

            for model in model_zoo:
                print("AUG_FACTOR: ", pipeline["aug_factor"])
                model_memory, model_performance = training(
                    kfold, etapa, datos, model, iteracion, models_info,
                    classification_metrics, pipeline)
                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            # Rank architectures by validation accuracy and keep the best 3.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            arch_top1, mod_top1 = top_models[0], models_info[
                top_models[0]]['model_memory']
            arch_top2, mod_top2 = top_models[1], models_info[
                top_models[1]]['model_memory']
            arch_top3, mod_top3 = top_models[2], models_info[
                top_models[2]]['model_memory']

            # Time the co-training evaluation, including pickling its output.
            infer_start = time.time()

            print("EVALUATING CO-TRAINING ...")
            print("\n")

            cotrain_acc, cotrain_infer_dfs = evaluate_cotrain(
                mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, etapa, kfold, iteracion, pipeline,
                models_info, logs)
            print("Co-train: ", cotrain_acc)

            cotrain_list.append({
                "kfold": kfold,
                "iteracion": iteracion,
                "df_arch1": cotrain_infer_dfs[0],
                "df_arch2": cotrain_infer_dfs[1],
                "df_arch3": cotrain_infer_dfs[2]
            })
            df_cotrain_list = pd.DataFrame(cotrain_list)

            # Common prefix of every pickle written during this iteration.
            exp_prefix = pipeline["save_path_stats"] + 'exp_' + str(
                pipeline["id"]) + '_' + str(iteracion)

            infer_pkl = exp_prefix + '_cotrain_eval.pkl'
            print("SAVING COTRAIN EVAL PICKLE")
            df_cotrain_list.to_pickle(infer_pkl)
            print("OK - SAVING COTRAIN EVAL PICKLE")

            print("\n")
            print("OK - EVALUATING CO-TRAINING")

            infer_time = time.time() - infer_start

            # Save the inference time for df_test of this iteration.
            print(infer_time, len(datos["df_test"]))
            logs_infer_time = [[
                kfold, iteracion, 'co-train', infer_time,
                len(datos["df_test"])
            ]]
            save_logs(logs_infer_time, 'infer_time', pipeline)

            print(f"GETTING BATCH_SET OF ITERATION {iteracion}...")

            df_batchset = datos["batch_set"][iteracion]
            df_batchset.columns = [
                pipeline["x_col_name"], pipeline["y_col_name"]
            ]
            # Overwrite the label column with the placeholder '0'.
            df_batchset[pipeline["y_col_name"]] = '0'
            datos['df_batchset'] = df_batchset

            print("LABELING ...")

            datos, EL_iter, LC_iter, label_infer_df = labeling(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)

            label_list.append({
                "kfold": kfold,
                "iteracion": iteracion,
                "df_arch1": label_infer_df[0],
                "df_arch2": label_infer_df[1],
                "df_arch3": label_infer_df[2]
            })
            df_label_list = pd.DataFrame(label_list)

            label_pkl = exp_prefix + '_labeling.pkl'
            print("SAVING LABEL PICKLE")
            df_label_list.to_pickle(label_pkl)
            print("OK - SAVING LABEL PICKLE")

            print("OK - LABELING")

            print("EL_iter", len(EL_iter))
            print("LC_iter", len(LC_iter))

            # Tables are built from the accumulated datos["EL"]/datos["LC"].
            label_cols = [
                pipeline["x_col_name"], pipeline["y_col_name"], 'arch_scores'
            ]
            df_EL = pd.DataFrame(datos["EL"], columns=label_cols)
            df_LC = pd.DataFrame(datos["LC"], columns=label_cols)

            df_EL.to_pickle(exp_prefix + '_EL.pickle')
            df_LC.to_pickle(exp_prefix + '_LC.pickle')

            df_label_stats = label_stats(df_EL, df_LC, pipeline)
            df_label_stats.to_pickle(exp_prefix + '_stats.pickle')

            # Next iteration trains on the labeled TRAIN set plus EL samples.
            datos['df_train_EL'] = pd.concat(
                [datos["df_train"], df_EL.iloc[:, :2]])

            ssl_th = pipeline["ssl_threshold"]

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)

            reset_keras(pipeline)

    print(time.time() - run_start)
def ssl_global(model_zoo, pipeline):
    """Run a supervised or co-training semi-supervised experiment.

    ``pipeline["method"]`` selects the mode:

    * ``"supervised"``: a single stage that trains every architecture in
      ``model_zoo`` and evaluates the three with the best ``val_acc``.
    * ``"semi-supervised"``: after each evaluation the top three models label
      unlabeled data, with ``labeling`` when
      ``pipeline["labeling_method"] == "decision"`` (one stage per entry of
      ``datos["batch_set"]``) or ``labeling_v2`` when it is ``"democratic"``
      (``pipeline["labeling_stages"]`` stages).

    The co-training evaluation is run twice per stage, once for
    ``'patologo1'`` and once for ``'patologo2'``.

    Args:
        model_zoo: Iterable of architecture identifiers. At least three are
            needed because the top three are indexed unconditionally.
        pipeline: Experiment configuration dict. Keys read here:
            ``"method"``, ``"labeling_method"``, ``"labeling_stages"``,
            ``"save_path_stats"``, ``"id"``, ``"x_col_name"``,
            ``"y_col_name"``, ``"aug_factor"`` and ``"ssl_threshold"``.

    Returns:
        None. Results are written to pickle files and via ``save_logs``.

    Raises:
        ValueError: If ``method`` or ``labeling_method`` is not one of the
            supported values. The original failed later with an unbound
            ``total_stages``.

    NOTE(review): ``cotrain_list``, ``label_list``, ``logs``, ``logs_label``
    and ``classification_metrics`` are not defined in this function; they are
    presumably module-level - confirm.
    """
    import time

    datos = get_dataset(pipeline)

    # Total execution time. Kept in its own variable: the original reused
    # ``start`` for the per-stage inference timer, so the final print
    # reported only the time since the last co-training evaluation.
    run_start = time.time()

    method = pipeline["method"]
    models_info = {}

    # The k-fold loop is disabled in this version, so everything runs as
    # fold 0. The original assigned ``kfold`` only on the supervised path,
    # which made the semi-supervised ``get_Fold`` call raise
    # UnboundLocalError.
    kfold = 0

    if method == "supervised":
        total_stages = 1
    elif method == "semi-supervised":
        datos = get_Fold(kfold, datos, pipeline)
        if pipeline["labeling_method"] == 'decision':
            total_stages = len(datos["batch_set"])
        elif pipeline["labeling_method"] == 'democratic':
            total_stages = pipeline["labeling_stages"]
        else:
            raise ValueError("unsupported labeling_method: {!r}".format(
                pipeline["labeling_method"]))
    else:
        raise ValueError("unsupported method: {!r}".format(method))

    for iteracion in range(total_stages):

        if method == "semi-supervised":
            print("\n")
            print(f"CLASS DISTRIBUTION - BATCH_SET {iteracion}")

            # Show the pending low-confidence (LC) samples when there are
            # any, otherwise the unlabeled (U) set.
            if len(datos["LC"]) > 0:
                U_set = pd.DataFrame(datos["LC"],
                                     columns=[
                                         pipeline["x_col_name"],
                                         pipeline["y_col_name"], 'arch_scores'
                                     ])
            else:
                U_set = datos['U']
            print(U_set.groupby(pipeline["y_col_name"]).count())

            print(f"OK - CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print("\n")

        if iteracion == 0 or method == "supervised":
            etapa = 'train'
        else:
            etapa = 'train_EL'

        for model in model_zoo:
            print("AUG_FACTOR: ", pipeline["aug_factor"])
            model_memory, model_performance = training(
                kfold, etapa, datos, model, iteracion, models_info,
                classification_metrics, pipeline)
            models_info[model] = {
                'model_memory': model_memory,
                'model_performance': model_performance['val_acc']
            }

        # Rank architectures by validation accuracy and keep the best 3.
        df_temp = pd.DataFrame(models_info).T
        top_models = df_temp.sort_values('model_performance',
                                         ascending=False)
        top_models = top_models.reset_index()['index'].values.tolist()[:3]

        arch_top1, mod_top1 = top_models[0], models_info[
            top_models[0]]['model_memory']
        arch_top2, mod_top2 = top_models[1], models_info[
            top_models[1]]['model_memory']
        arch_top3, mod_top3 = top_models[2], models_info[
            top_models[2]]['model_memory']

        # Time the co-training evaluation, including pickling its output.
        infer_start = time.time()

        print("EVALUATING CO-TRAINING ...")
        print("\n")

        cotrain_acc1, cotrain_infer_dfs1 = evaluate_cotrain(
            mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
            datos, etapa, kfold, iteracion, pipeline, models_info,
            'patologo1', logs)
        print("Co-train - Patologo 1: ", cotrain_acc1)

        cotrain_acc2, cotrain_infer_dfs2 = evaluate_cotrain(
            mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
            datos, etapa, kfold, iteracion, pipeline, models_info,
            'patologo2', logs)
        print("Co-train - Patologo 2: ", cotrain_acc2)

        cotrain_list.append({
            "kfold": kfold,
            "iteracion": iteracion,
            "patologo1": {
                "df_arch1": cotrain_infer_dfs1[0],
                "df_arch2": cotrain_infer_dfs1[1],
                "df_arch3": cotrain_infer_dfs1[2]
            },
            "patologo2": {
                "df_arch1": cotrain_infer_dfs2[0],
                "df_arch2": cotrain_infer_dfs2[1],
                "df_arch3": cotrain_infer_dfs2[2]
            },
        })
        df_cotrain_list = pd.DataFrame(cotrain_list)

        # Common prefix of every pickle written during this stage.
        exp_prefix = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(iteracion)

        infer_pkl = exp_prefix + '_cotrain_eval.pkl'
        print("SAVING COTRAIN EVAL PICKLE")
        df_cotrain_list.to_pickle(infer_pkl)
        print("OK - SAVING COTRAIN EVAL PICKLE")

        print("\n")
        print("OK - EVALUATING CO-TRAINING")

        infer_time = time.time() - infer_start

        # Save the inference time, logged against the size of df_test1.
        print(infer_time, len(datos["df_test1"]))
        logs_infer_time = [[
            kfold, iteracion, 'co-train1', infer_time,
            len(datos["df_test1"])
        ]]
        save_logs(logs_infer_time, 'infer_time', pipeline)

        if method == "supervised":
            # Nothing to label in supervised mode.
            print(f"SUPERVISED METHOD COMPLETED FOR ITERATION: {iteracion}")
            continue

        print(f"GETTING BATCH_SET OF ITERATION {iteracion}...")
        print("LABELING ...")

        # labeling_method was validated above, so one branch always runs.
        if pipeline["labeling_method"] == "decision":
            datos, EL_iter, LC_iter, EL_accu, LC_accu, label_infer_df = labeling(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)
        elif pipeline["labeling_method"] == "democratic":
            datos, EL_iter, LC_iter, EL_accu, LC_accu, label_infer_df = labeling_v2(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)

        label_list.append({
            "kfold": kfold,
            "iteracion": iteracion,
            "df_arch1": label_infer_df[0],
            "df_arch2": label_infer_df[1],
            "df_arch3": label_infer_df[2]
        })
        df_label_list = pd.DataFrame(label_list)

        label_pkl = exp_prefix + '_labeling.pkl'
        print("SAVING LABEL PICKLE")
        df_label_list.to_pickle(label_pkl)
        print("OK - SAVING LABEL PICKLE")

        print("OK - LABELING")

        print("EL_iter", len(EL_iter))
        print("LC_iter", len(LC_iter))

        # Tables are built from the accumulated datos["EL"]/datos["LC"].
        label_cols = [
            pipeline["x_col_name"], pipeline["y_col_name"], 'arch_scores'
        ]
        df_EL = pd.DataFrame(datos["EL"], columns=label_cols)
        df_LC = pd.DataFrame(datos["LC"], columns=label_cols)

        df_EL.to_pickle(exp_prefix + '_EL.pickle')
        df_LC.to_pickle(exp_prefix + '_LC.pickle')

        df_label_stats = label_stats(df_EL, df_LC, pipeline)
        df_label_stats.to_pickle(exp_prefix + '_stats.pickle')

        # Next stage trains on the labeled TRAIN set plus the EL samples.
        datos['df_train_EL'] = pd.concat(
            [datos["df_train"], df_EL.iloc[:, :2]])

        ssl_th = pipeline["ssl_threshold"]

        logs_label.append([
            kfold, iteracion, arch_top1, arch_top2, arch_top3,
            len(EL_iter),
            len(LC_iter), EL_accu, LC_accu, ssl_th
        ])
        save_logs(logs_label, 'label', pipeline)

        reset_keras(pipeline)

    print(time.time() - run_start)