コード例 #1
0
def ssl_global(model_zoo, pipeline):
    """Run the multi-architecture co-training loop and persist its statistics.

    For every iteration, each architecture in ``model_zoo`` is trained, the
    three with the highest ``val_acc`` are selected, evaluated together via
    ``evaluate_cotrain`` and passed to ``labeling`` together with the current
    batch (``datos['df_batchset']``). After labeling, ``datos["EL"]`` and
    ``datos["LC"]`` are pickled, their statistics are computed with
    ``label_stats`` and ``pipeline["ssl_threshold"]`` is re-estimated as the
    mean of ``arch_scores_mean`` over both sets.

    Args:
        model_zoo: Iterable of architecture identifiers. At least three are
            required, because the three best ones are indexed explicitly.
        pipeline: Experiment configuration dict. Keys read here:
            ``"path_label_stats"``, ``"id"``, ``"x_col_name"``,
            ``"y_col_name"``, ``"LC"`` and ``"ssl_threshold"``. The keys
            ``"ssl_threshold"`` and ``"LC"`` may be overwritten.

    Returns:
        None. Results are written to pickle files and to the label log.

    Note:
        ``logs`` and ``logs_label`` are not assigned inside this function, so
        they are resolved as module-level names.
    """
    import time

    numero_lotes = 5
    semi_method = 'co-training-multi'

    datos = {}
    models_info = {}

    datos["df_base"] = get_dataset(pipeline)
    datos = split_train_test(datos, pipeline)

    # Measure total execution time.
    start = time.time()

    for kfold in range(1):
        for iteracion in range(numero_lotes * 1):

            print("\n######################")
            print("K-FOLD {} - ITERACION {}".format(kfold, iteracion))
            print("######################\n")

            datos = get_Fold(kfold, datos, pipeline)

            # The first iteration runs the 'train' stage, later ones 'train_EL'.
            etapa = 'train' if iteracion == 0 else 'train_EL'

            # Common prefix of every file written during this iteration.
            stats_prefix = (pipeline["path_label_stats"] +
                            str(pipeline["id"]) + '_' + str(iteracion))
            print(stats_prefix + '.pickle')

            for model in model_zoo:
                model_memory, model_performance = training(
                    kfold, etapa, datos, model, iteracion, models_info,
                    pipeline)

                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            # Rank the architectures by validation accuracy; keep the best 3.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            arch_top1 = top_models[0]
            arch_top2 = top_models[1]
            arch_top3 = top_models[2]
            mod_top1 = models_info[arch_top1]['model_memory']
            mod_top2 = models_info[arch_top2]['model_memory']
            mod_top3 = models_info[arch_top3]['model_memory']

            print("\n")
            print(
                "Co-train: ",
                evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                 arch_top2, arch_top3, datos, etapa, kfold,
                                 iteracion, pipeline, models_info, logs))
            print("\n")

            if semi_method == 'supervised':
                break

            if iteracion < numero_lotes:
                # NOTE: this renames the columns of the stored batch in place.
                df_batchset = datos["batch_set"][iteracion]
                df_batchset.columns = [
                    pipeline["x_col_name"], pipeline["y_col_name"]
                ]
                df_batchset[pipeline["y_col_name"]] = '0'
            else:
                # Only reachable if the iteration range is extended beyond
                # numero_lotes: the accumulated pipeline["LC"] samples are
                # then split into batches and relabeled.
                if iteracion == numero_lotes:
                    df_LC = pd.DataFrame(pipeline["LC"])
                    batch_set_LC = list(dividir_lotes(df_LC, numero_lotes))

                    # Iterate the batches themselves; indexing the list with
                    # the (index, item) tuples from enumerate() is a TypeError.
                    for lote in batch_set_LC:
                        print(len(lote.iloc[:, 0].values.tolist()))
                    pipeline["LC"] = []

                df_batchset = pd.DataFrame([
                    batch_set_LC[int(iteracion -
                                     numero_lotes)].iloc[:, 0].values.tolist()
                ]).T
                df_batchset.columns = [pipeline["x_col_name"]]
                df_batchset[pipeline["y_col_name"]] = '0'

            datos['df_batchset'] = df_batchset

            datos, EL_iter, LC_iter = labeling(etapa, mod_top1, mod_top2,
                                               mod_top3, arch_top1, arch_top2,
                                               arch_top3, datos, pipeline,
                                               iteracion, models_info)

            print("EL_iter", len(EL_iter))
            print("LC_iter", len(LC_iter))

            label_columns = [
                pipeline["x_col_name"], pipeline["y_col_name"], 'arch_scores'
            ]
            df_EL = pd.DataFrame(datos["EL"], columns=label_columns)
            df_LC = pd.DataFrame(datos["LC"], columns=label_columns)

            # Only the first component of the path is created here.
            os.makedirs(pipeline["path_label_stats"].split('/')[0],
                        exist_ok=True)

            df_EL.to_pickle(stats_prefix + '_EL.pickle')
            df_LC.to_pickle(stats_prefix + '_LC.pickle')

            df_label_stats = label_stats(df_EL, df_LC, pipeline)
            df_label_stats.to_pickle(stats_prefix + '.pickle')

            # The next 'train_EL' stage uses both the original training
            # samples and the EL samples (first two columns only).
            df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
            datos['df_train_EL'] = df_train_EL

            try:
                print("AUTO-ESTIMATING OF SSL THRESHOLD ...")
                df_EL_stats = df_label_stats["df_EL_stats"]["df"]
                df_LC_stats = df_label_stats["df_LC_stats"]["df"]

                df_U_iter = pd.concat([df_EL_stats, df_LC_stats],
                                      ignore_index=True)

                ssl_th = df_U_iter.describe()["arch_scores_mean"]["mean"]
                pipeline["ssl_threshold"] = ssl_th
                print("NEW SSL THRESHOLD: ", ssl_th)
            except Exception:
                # Best effort: keep the previous threshold, but do not swallow
                # KeyboardInterrupt / SystemExit as a bare ``except`` would.
                print("ERROR - AUTO-ESTIMATING SSL THRESHOLD")
                ssl_th = pipeline["ssl_threshold"]
                traceback.print_exc()

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)

    end = time.time()
    print(end - start)
コード例 #2
0
def ssl_global(archivos, model_zoo, csvs, pipeline):
    """Run the co-training loop for the 'gleasson' dataset (EXP30 variant).

    The training data is split with ``dividir_balanceado2`` into train,
    validation and unlabeled (U) subsets, U is divided into batches with
    ``dividir_lotes`` and, for each batch, every architecture in
    ``model_zoo`` is trained, the three with the best ``val_acc`` are
    evaluated with ``evaluate_cotrain`` and passed to ``labeling``. In this
    variant ``datos['df_train_EL']`` holds only the samples labeled in the
    current iteration (``EL_iter``).

    Args:
        archivos: Forwarded to ``get_data``.
        model_zoo: Iterable of architecture identifiers; at least three are
            required because the three best ones are indexed explicitly.
        csvs: Forwarded to ``get_data``.
        pipeline: Experiment configuration dict; the keys
            ``"path_label_stats"`` and ``"id"`` are read here.

    Returns:
        None. Results are written to CSV/pickle files and to the label log.

    Note:
        ``dataset``, ``numero_lotes``, ``semi_method``, ``x_col_name``,
        ``y_col_name``, ``SEED``, ``train_epochs``, ``batch_epochs``,
        ``early_stopping``, ``logs_label`` and ``ssl_th`` are not assigned in
        this function, so they are resolved as module-level names.
        ``batch_set``, ``EL`` and ``LC`` are only initialised when
        ``dataset == 'gleasson'``.
    """
    datos = {}
    models_info = {}

    # ``df_train`` is reassigned further down, which makes it a local name;
    # reading it before any assignment raised UnboundLocalError on every call.
    # NOTE(review): this call is restored from a line the original author had
    # commented out - confirm get_data's signature and return order.
    df_train, df_val, df_test1, df_test2 = get_data(archivos, csvs, pipeline)

    # Measure total execution time.
    start = time.time()
    fold = dividir_balanceado2(df_train, 4)

    for kfold in range(1):

        if dataset == 'gleasson':
            df_train_58 = pd.DataFrame([fold[kfold][0], fold[kfold][2]]).T
            df_train_58.columns = [x_col_name, y_col_name]

            df_val = pd.DataFrame([fold[kfold][1], fold[kfold][3]]).T
            df_val.columns = [x_col_name, y_col_name]

            # Second split: one part becomes the labeled training set, the
            # other the unlabeled pool U.
            fold1 = dividir_balanceado2(df_train_58, 4)
            df_train = pd.DataFrame([fold1[0][1], fold1[0][3]]).T
            df_train.columns = [x_col_name, y_col_name]

            df_train.to_csv('data/train.csv', index=False)
            df_val.to_csv('data/val.csv', index=False)
            df_test1.to_csv('data/test1.csv', index=False)
            df_test2.to_csv('data/test2.csv', index=False)

            df_U = pd.DataFrame([fold1[0][0], fold1[0][2]]).T
            df_U.columns = [x_col_name, y_col_name]
            EL, LC = [], []

            print("train :", len(df_train))
            print("val   :", len(df_val))
            print("u     :", len(df_U))

            # Split U into batches to be labeled.
            batch_set = list(dividir_lotes(df_U, numero_lotes))
            for lote in batch_set:
                print(len(lote.iloc[:, 0].values.tolist()))

        datos['df_train'] = df_train
        datos['df_val'] = df_val
        datos['df_test1'] = df_test1
        datos['df_test2'] = df_test2

        for iteracion in range(numero_lotes * 1):

            # Re-seed every iteration.
            random.seed(SEED)
            np.random.seed(SEED)
            tensorflow.random.set_random_seed(SEED)

            print("\n######################")
            print("K-FOLD {} - ITERACION {}".format(kfold, iteracion))
            print("######################\n")

            # The first iteration runs the 'train' stage, later ones 'train_EL'.
            etapa = 'train' if iteracion == 0 else 'train_EL'

            print(pipeline["path_label_stats"] + str(pipeline["id"]) + '_' +
                  str(iteracion) + '.pickle')

            for model in model_zoo:
                model_memory, model_performance = entrenamiento(
                    kfold, etapa, datos, model, train_epochs, batch_epochs,
                    early_stopping, iteracion, models_info, pipeline)

                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            # Rank the architectures by validation accuracy; keep the best 3.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            arch_top1 = top_models[0]
            arch_top2 = top_models[1]
            arch_top3 = top_models[2]
            mod_top1 = models_info[arch_top1]['model_memory']
            mod_top2 = models_info[arch_top2]['model_memory']
            mod_top3 = models_info[arch_top3]['model_memory']

            if dataset == 'gleasson':
                print(
                    "\nCo-train1: \n",
                    evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                     arch_top2, arch_top3,
                                     'gleasson-patologo1', datos, etapa, kfold,
                                     iteracion, pipeline, models_info))
                print(
                    "\nCo-train2: \n",
                    evaluate_cotrain(mod_top1, mod_top2, mod_top3, arch_top1,
                                     arch_top2, arch_top3,
                                     'gleasson-patologo2', datos, etapa, kfold,
                                     iteracion, pipeline, models_info))

            if semi_method == 'supervised':
                break

            if iteracion < numero_lotes:
                # NOTE: this renames the columns of the stored batch in place.
                df_batchset = batch_set[iteracion]
                df_batchset.columns = [x_col_name, y_col_name]
                df_batchset[y_col_name] = '0'
            else:
                # Only reachable if the iteration range is extended beyond
                # numero_lotes: the accumulated LC samples are then split
                # into batches and relabeled.
                if iteracion == numero_lotes:
                    df_LC = pd.DataFrame(LC)
                    batch_set_LC = list(dividir_lotes(df_LC, numero_lotes))
                    for lote in batch_set_LC:
                        print(len(lote.iloc[:, 0].values.tolist()))
                    LC = []

                df_batchset = pd.DataFrame([
                    batch_set_LC[int(iteracion -
                                     numero_lotes)].iloc[:, 0].values.tolist()
                ]).T
                df_batchset.columns = [x_col_name]
                df_batchset[y_col_name] = '0'

            datos['df_batchset'] = df_batchset

            EL, LC, EL_iter, LC_iter = labeling(etapa, mod_top1, mod_top2,
                                                mod_top3, arch_top1, arch_top2,
                                                arch_top3, EL, LC, datos,
                                                pipeline, iteracion,
                                                models_info)

            # EXP30: statistics and retraining use only the samples labeled
            # in this iteration, not the accumulated EL / LC lists.
            label_columns = [x_col_name, y_col_name, 'arch_scores']
            df_EL = pd.DataFrame(EL_iter, columns=label_columns)
            df_LC = pd.DataFrame(LC_iter, columns=label_columns)

            df_label_stats = label_stats(df_EL, df_LC)
            print(df_label_stats)
            df_label_stats.to_pickle(pipeline["path_label_stats"] +
                                     str(pipeline["id"]) + '_' +
                                     str(iteracion) + '.pickle')

            df_train_EL = df_EL.iloc[:, :2].copy()  # EXP30
            print("df_train_EL")
            print(df_train_EL)
            datos['df_train_EL'] = df_train_EL

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)

    end = time.time()
    print(end - start)
コード例 #3
0
def ssl_global(model_zoo, pipeline):
    """Run the k-fold co-training loop and persist per-iteration artefacts.

    For every fold, the fold data is obtained with ``get_Fold`` and pickled.
    Then, for every entry of ``datos["batch_set"]``, each architecture in
    ``model_zoo`` is trained, the three with the best ``val_acc`` are
    evaluated with ``evaluate_cotrain`` and passed to ``labeling``. The
    evaluation frames, labeling frames, ``datos["EL"]`` / ``datos["LC"]``,
    their statistics and the inference-time / label logs are saved on every
    iteration, and ``reset_keras`` is called at the end of each one.

    Args:
        model_zoo: Iterable of architecture identifiers; at least three are
            required because the three best ones are indexed explicitly.
        pipeline: Experiment configuration dict. Keys read here:
            ``"num_kfold"``, ``"save_path_stats"``, ``"id"``,
            ``"x_col_name"``, ``"y_col_name"``, ``"aug_factor"`` and
            ``"ssl_threshold"``.

    Returns:
        None. The total elapsed time in seconds is printed at the end.

    Note:
        ``datos_total``, ``cotrain_list``, ``label_list``, ``logs``,
        ``logs_label`` and ``classification_metrics`` are not assigned in
        this function, so they are resolved as module-level names.
    """
    import time

    datos = {}
    datos["df_base"] = get_dataset(pipeline)
    datos = split_train_test(datos, pipeline)

    # Measure total execution time. This value must not be overwritten by the
    # per-iteration inference timer below.
    start = time.time()

    num_kfold = pipeline["num_kfold"]

    for kfold in range(num_kfold):

        models_info = {}
        datos = get_Fold(kfold, datos, pipeline)

        datos_by_fold = {"kfold": kfold, "datos": datos}

        datos_total.append(datos_by_fold)
        df_datos = pd.DataFrame(datos_total)
        datos_path = pipeline["save_path_stats"] + 'exp_' + str(
            pipeline["id"]) + '_' + str(kfold) + '_data.pkl'
        df_datos.to_pickle(datos_path)

        numero_lotes = len(datos["batch_set"])

        for iteracion in range(numero_lotes * 1):

            kfold_info = f"K-FOLD {kfold}/{num_kfold} - ITERACION {iteracion}/{numero_lotes}"
            print("\n")
            print("#" * len(kfold_info))
            print(kfold_info)
            print("#" * len(kfold_info))
            print("\n")

            print("\n")
            print(f"CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print(datos["batch_set"][iteracion].groupby(
                pipeline["y_col_name"]).count())
            print(f"OK - CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print("\n")

            # The first iteration runs the 'train' stage, later ones 'train_EL'.
            etapa = 'train' if iteracion == 0 else 'train_EL'

            # Common prefix of every file written during this iteration.
            stats_prefix = (pipeline["save_path_stats"] + 'exp_' +
                            str(pipeline["id"]) + '_' + str(iteracion))

            for model in model_zoo:

                print("AUG_FACTOR: ", pipeline["aug_factor"])

                model_memory, model_performance = training(
                    kfold, etapa, datos, model, iteracion, models_info,
                    classification_metrics, pipeline)

                models_info[model] = {
                    'model_memory': model_memory,
                    'model_performance': model_performance['val_acc']
                }

            # Rank the architectures by validation accuracy; keep the best 3.
            df_temp = pd.DataFrame(models_info).T
            top_models = df_temp.sort_values('model_performance',
                                             ascending=False)
            top_models = top_models.reset_index()['index'].values.tolist()[:3]

            arch_top1 = top_models[0]
            arch_top2 = top_models[1]
            arch_top3 = top_models[2]
            mod_top1 = models_info[arch_top1]['model_memory']
            mod_top2 = models_info[arch_top2]['model_memory']
            mod_top3 = models_info[arch_top3]['model_memory']

            # Time the co-training evaluation with its own variable so the
            # total-run timer ``start`` stays intact.
            infer_start = time.time()

            print("EVALUATING CO-TRAINING ...")
            print("\n")

            cotrain_acc, cotrain_infer_dfs = evaluate_cotrain(
                mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
                datos, etapa, kfold, iteracion, pipeline, models_info, logs)

            print("Co-train: ", cotrain_acc)
            df_cotrain_info = {
                "kfold": kfold,
                "iteracion": iteracion,
                "df_arch1": cotrain_infer_dfs[0],
                "df_arch2": cotrain_infer_dfs[1],
                "df_arch3": cotrain_infer_dfs[2]
            }

            cotrain_list.append(df_cotrain_info)
            df_cotrain_list = pd.DataFrame(cotrain_list)

            infer_pkl = stats_prefix + '_cotrain_eval.pkl'

            print("SAVING COTRAIN EVAL PICKLE")
            df_cotrain_list.to_pickle(infer_pkl)
            print("OK - SAVING COTRAIN EVAL PICKLE")

            print("\n")
            print("OK - EVALUATING CO-TRAINING")

            infer_time = time.time() - infer_start

            # Save the inference time together with the test-set size.
            print(infer_time, len(datos["df_test"]))

            logs_infer_time = []
            logs_infer_time.append([
                kfold, iteracion, 'co-train', infer_time,
                len(datos["df_test"])
            ])
            save_logs(logs_infer_time, 'infer_time', pipeline)

            print(f"GETTING BATCH_SET OF ITERATION {iteracion}...")

            # NOTE: this renames the columns of the stored batch in place.
            df_batchset = datos["batch_set"][iteracion]
            df_batchset.columns = [
                pipeline["x_col_name"], pipeline["y_col_name"]
            ]
            df_batchset[pipeline["y_col_name"]] = '0'

            datos['df_batchset'] = df_batchset

            print("LABELING ...")

            datos, EL_iter, LC_iter, label_infer_df = labeling(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)

            df_label_info = {
                "kfold": kfold,
                "iteracion": iteracion,
                "df_arch1": label_infer_df[0],
                "df_arch2": label_infer_df[1],
                "df_arch3": label_infer_df[2]
            }

            label_list.append(df_label_info)
            df_label_list = pd.DataFrame(label_list)

            label_pkl = stats_prefix + '_labeling.pkl'

            print("SAVING LABEL PICKLE")
            df_label_list.to_pickle(label_pkl)
            print("OK - SAVING LABEL PICKLE")

            print("OK - LABELING")
            print("EL_iter", len(EL_iter))
            print("LC_iter", len(LC_iter))

            label_columns = [
                pipeline["x_col_name"], pipeline["y_col_name"], 'arch_scores'
            ]
            df_EL = pd.DataFrame(datos["EL"], columns=label_columns)
            df_LC = pd.DataFrame(datos["LC"], columns=label_columns)

            df_EL.to_pickle(stats_prefix + '_EL.pickle')
            df_LC.to_pickle(stats_prefix + '_LC.pickle')

            df_label_stats = label_stats(df_EL, df_LC, pipeline)
            df_label_stats.to_pickle(stats_prefix + '_stats.pickle')

            # The next 'train_EL' stage uses both the original training
            # samples and the EL samples (first two columns only).
            df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
            datos['df_train_EL'] = df_train_EL

            ssl_th = pipeline["ssl_threshold"]

            logs_label.append([
                kfold, iteracion, arch_top1, arch_top2, arch_top3,
                len(EL_iter),
                len(LC_iter), ssl_th
            ])
            save_logs(logs_label, 'label', pipeline)

            reset_keras(pipeline)

    end = time.time()
    print(end - start)
コード例 #4
0
ファイル: baseline.py プロジェクト: ambigus9/msc-tesis
def ssl_global(model_zoo, pipeline):
    """Run the supervised baseline or the semi-supervised co-training loop.

    ``pipeline["method"]`` selects the mode:

    * ``"supervised"``: a single stage; the models are trained and evaluated
      with ``evaluate_cotrain`` against both annotators ('patologo1' and
      'patologo2'), and no labeling takes place.
    * ``"semi-supervised"``: the fold is prepared with ``get_Fold`` and the
      number of stages depends on ``pipeline["labeling_method"]``
      (``'decision'`` -> ``len(datos["batch_set"])``, ``'democratic'`` ->
      ``pipeline["labeling_stages"]``). After each evaluation the top three
      models are passed to ``labeling`` / ``labeling_v2`` and the resulting
      ``datos["EL"]`` / ``datos["LC"]`` sets, statistics and logs are saved.

    Args:
        model_zoo: Iterable of architecture identifiers; at least three are
            required because the three best ones are indexed explicitly.
        pipeline: Experiment configuration dict. Keys read here:
            ``"method"``, ``"labeling_method"``, ``"labeling_stages"``,
            ``"x_col_name"``, ``"y_col_name"``, ``"aug_factor"``,
            ``"save_path_stats"``, ``"id"`` and ``"ssl_threshold"``.

    Returns:
        None. The total elapsed time in seconds is printed at the end.

    Raises:
        ValueError: If the ``method`` / ``labeling_method`` combination is
            not one of the supported ones.

    Note:
        ``cotrain_list``, ``label_list``, ``logs``, ``logs_label`` and
        ``classification_metrics`` are not assigned in this function, so they
        are resolved as module-level names.
    """
    import time

    datos = get_dataset(pipeline)

    # Measure total execution time. This value must not be overwritten by the
    # per-iteration inference timer below.
    start = time.time()

    method = pipeline["method"]

    # The k-fold loop is disabled in this version, so a single fold index is
    # used everywhere. It has to be bound before get_Fold() reads it.
    kfold = 0

    models_info = {}

    if method == "semi-supervised":
        datos = get_Fold(kfold, datos, pipeline)

    if method == "supervised":
        total_stages = 1
    elif pipeline[
            "labeling_method"] == 'decision' and method == "semi-supervised":
        total_stages = len(datos["batch_set"])
    elif pipeline[
            "labeling_method"] == 'democratic' and method == "semi-supervised":
        total_stages = pipeline["labeling_stages"]
    else:
        # Fail with a clear message instead of an unbound ``total_stages``.
        raise ValueError(
            "Unsupported configuration: method={!r}, labeling_method={!r}".
            format(method, pipeline["labeling_method"]))

    for iteracion in range(total_stages * 1):

        if method == "semi-supervised":
            print("\n")
            print(f"CLASS DISTRIBUTION - BATCH_SET {iteracion}")

            # Use the accumulated LC samples when there are any; otherwise
            # fall back to the unlabeled set U.
            if len(datos["LC"]) > 0:
                U_set = pd.DataFrame(datos["LC"],
                                     columns=[
                                         pipeline["x_col_name"],
                                         pipeline["y_col_name"], 'arch_scores'
                                     ])
            else:
                U_set = datos['U']
            print(U_set.groupby(pipeline["y_col_name"]).count())

            print(f"OK - CLASS DISTRIBUTION - BATCH_SET {iteracion}")
            print("\n")

        if iteracion == 0 or method == "supervised":
            etapa = 'train'
        else:
            etapa = 'train_EL'

        # Common prefix of every file written during this iteration.
        stats_prefix = (pipeline["save_path_stats"] + 'exp_' +
                        str(pipeline["id"]) + '_' + str(iteracion))

        for model in model_zoo:

            print("AUG_FACTOR: ", pipeline["aug_factor"])

            model_memory, model_performance = training(kfold, etapa, datos,
                                                       model, iteracion,
                                                       models_info,
                                                       classification_metrics,
                                                       pipeline)

            models_info[model] = {
                'model_memory': model_memory,
                'model_performance': model_performance['val_acc']
            }

        # Rank the architectures by validation accuracy; keep the best 3.
        df_temp = pd.DataFrame(models_info).T
        top_models = df_temp.sort_values('model_performance', ascending=False)
        top_models = top_models.reset_index()['index'].values.tolist()[:3]

        arch_top1 = top_models[0]
        arch_top2 = top_models[1]
        arch_top3 = top_models[2]
        mod_top1 = models_info[arch_top1]['model_memory']
        mod_top2 = models_info[arch_top2]['model_memory']
        mod_top3 = models_info[arch_top3]['model_memory']

        # Time the co-training evaluation with its own variable so the
        # total-run timer ``start`` stays intact.
        infer_start = time.time()

        print("EVALUATING CO-TRAINING ...")
        print("\n")

        cotrain_acc1, cotrain_infer_dfs1 = evaluate_cotrain(
            mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
            datos, etapa, kfold, iteracion, pipeline, models_info, 'patologo1',
            logs)

        print("Co-train - Patologo 1: ", cotrain_acc1)

        cotrain_acc2, cotrain_infer_dfs2 = evaluate_cotrain(
            mod_top1, mod_top2, mod_top3, arch_top1, arch_top2, arch_top3,
            datos, etapa, kfold, iteracion, pipeline, models_info, 'patologo2',
            logs)

        print("Co-train - Patologo 2: ", cotrain_acc2)

        df_cotrain_info = {
            "kfold": kfold,
            "iteracion": iteracion,
            "patologo1": {
                "df_arch1": cotrain_infer_dfs1[0],
                "df_arch2": cotrain_infer_dfs1[1],
                "df_arch3": cotrain_infer_dfs1[2]
            },
            "patologo2": {
                "df_arch1": cotrain_infer_dfs2[0],
                "df_arch2": cotrain_infer_dfs2[1],
                "df_arch3": cotrain_infer_dfs2[2]
            },
        }

        cotrain_list.append(df_cotrain_info)
        df_cotrain_list = pd.DataFrame(cotrain_list)

        infer_pkl = stats_prefix + '_cotrain_eval.pkl'

        print("SAVING COTRAIN EVAL PICKLE")
        df_cotrain_list.to_pickle(infer_pkl)
        print("OK - SAVING COTRAIN EVAL PICKLE")

        print("\n")
        print("OK - EVALUATING CO-TRAINING")

        infer_time = time.time() - infer_start

        # Save the inference time together with the size of df_test1.
        print(infer_time, len(datos["df_test1"]))

        logs_infer_time = []
        logs_infer_time.append([
            kfold, iteracion, 'co-train1', infer_time,
            len(datos["df_test1"])
        ])
        save_logs(logs_infer_time, 'infer_time', pipeline)

        if method == "supervised":
            print(f"SUPERVISED METHOD COMPLETED FOR ITERATION: {iteracion}")
            continue

        print(f"GETTING BATCH_SET OF ITERATION {iteracion}...")
        print("LABELING ...")

        if pipeline["labeling_method"] == "decision":
            datos, EL_iter, LC_iter, EL_accu, LC_accu, label_infer_df = labeling(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)
        elif pipeline["labeling_method"] == "democratic":
            datos, EL_iter, LC_iter, EL_accu, LC_accu, label_infer_df = labeling_v2(
                etapa, mod_top1, mod_top2, mod_top3, arch_top1, arch_top2,
                arch_top3, datos, pipeline, iteracion, models_info)

        df_label_info = {
            "kfold": kfold,
            "iteracion": iteracion,
            "df_arch1": label_infer_df[0],
            "df_arch2": label_infer_df[1],
            "df_arch3": label_infer_df[2]
        }

        label_list.append(df_label_info)
        df_label_list = pd.DataFrame(label_list)

        label_pkl = stats_prefix + '_labeling.pkl'

        print("SAVING LABEL PICKLE")
        df_label_list.to_pickle(label_pkl)
        print("OK - SAVING LABEL PICKLE")

        print("OK - LABELING")
        print("EL_iter", len(EL_iter))
        print("LC_iter", len(LC_iter))

        label_columns = [
            pipeline["x_col_name"], pipeline["y_col_name"], 'arch_scores'
        ]
        df_EL = pd.DataFrame(datos["EL"], columns=label_columns)
        df_LC = pd.DataFrame(datos["LC"], columns=label_columns)

        df_EL.to_pickle(stats_prefix + '_EL.pickle')
        df_LC.to_pickle(stats_prefix + '_LC.pickle')

        df_label_stats = label_stats(df_EL, df_LC, pipeline)
        df_label_stats.to_pickle(stats_prefix + '_stats.pickle')

        # The next 'train_EL' stage uses both the original training samples
        # and the EL samples (first two columns only).
        df_train_EL = pd.concat([datos["df_train"], df_EL.iloc[:, :2]])
        datos['df_train_EL'] = df_train_EL

        ssl_th = pipeline["ssl_threshold"]

        logs_label.append([
            kfold, iteracion, arch_top1, arch_top2, arch_top3,
            len(EL_iter),
            len(LC_iter), EL_accu, LC_accu, ssl_th
        ])
        save_logs(logs_label, 'label', pipeline)

        reset_keras(pipeline)

    end = time.time()
    print(end - start)