def prediction(pid, tasks, queue, results, model_type, lib_path_server, criterion):
    """Worker-process loop: train one model per queued training job and score test tasks.

    :param pid: worker identifier, used only in console traces.
    :param tasks: queue of test tasks ({'X_test', 'y_test'} dicts), terminated by None.
    :param queue: joinable queue of training jobs (kwargs for model.learn), terminated by None.
    :param results: queue receiving one {'u65', 'u80'} summary per training job.
    :param model_type: key passed to __factory_model to build the imprecise model.
    :param lib_path_server: MATLAB library path forwarded to the model factory.
    :param criterion: decision criterion forwarded to model.evaluate.
    """
    model = __factory_model(model_type, init_matlab=True,
                            add_path_matlab=lib_path_server, DEBUG=False)
    while True:
        training = queue.get()
        if training is None:  # poison pill: no more training jobs
            break
        model.learn(**training)
        sum80, sum65 = 0, 0
        while True:
            task = tasks.get()
            if task is None:  # poison pill: end of test tasks for this training job
                break
            evaluate, _ = model.evaluate(task['X_test'], criterion=criterion)
            print("(pid, prediction, ground-truth) (", pid, evaluate, task["y_test"], ")",
                  flush=True)
            # Discounted-accuracy utilities only accrue when the set prediction
            # actually contains the ground truth.
            if task['y_test'] in evaluate:
                sum65 += u65(evaluate)
                sum80 += u80(evaluate)
        # FIX: dropped the redundant dict({...}) wrapper around an already-literal dict.
        results.put({'u65': sum65, 'u80': sum80})
        queue.task_done()
    print("Worker PID finished", pid, flush=True)
def computing_precise_vs_imprecise(in_path=None, ell_optimal=0.1, cv_n_fold=10, seeds=None,
                                   lib_path_server=None, model_type_precise='lda',
                                   model_type_imprecise='ilda', scaling=True):
    """Compare a precise and an imprecise classifier on the instances where the
    imprecise model is actually cautious (predicts more than one class).

    Runs cv_n_fold repetitions of cv_n_fold-fold cross-validation; within each
    fold only test points with a non-singleton imprecise prediction contribute.
    Averages are logged, not returned.

    :param in_path: CSV dataset path; falls back to the bundled iris data when None.
    :param ell_optimal: contamination parameter passed to the imprecise model.
    :param cv_n_fold: number of repetitions and of folds per repetition.
    :param seeds: per-repetition random seeds; generated when None.
    :param lib_path_server: MATLAB library path for the imprecise model factory.
    :param model_type_precise: factory key for the precise baseline model.
    :param model_type_imprecise: factory key for the imprecise model.
    :param scaling: min-max normalize features when True.
    """
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    logger = create_logger("computing_precise_vs_imprecise", True)
    logger.info('Training dataset and models (%s, %s, %s, %s)',
                in_path, model_type_precise, model_type_imprecise, ell_optimal)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    seeds = generate_seeds(cv_n_fold) if seeds is None else seeds
    model_impr = __factory_model(model_type_imprecise, init_matlab=True,
                                 add_path_matlab=lib_path_server, DEBUG=False)
    model_prec = __factory_model_precise(model_type_precise, store_covariance=True)
    avg_imprecise, avg_precise, n_real_times = 0, 0, 0
    # FIX: loop variable renamed from `time` — it shadowed the `time` module
    # used elsewhere in this file.
    for k_time in range(cv_n_fold):
        kf = KFold(n_splits=cv_n_fold, random_state=seeds[k_time], shuffle=True)
        imprecise_mean, precise_mean, n_real_fold = 0, 0, 0
        for idx_train, idx_test in kf.split(y):
            X_cv_train, y_cv_train = X[idx_train], y[idx_train]
            X_cv_test, y_cv_test = X[idx_test], y[idx_test]
            model_impr.learn(X=X_cv_train, y=y_cv_train, ell=ell_optimal)
            model_prec.fit(X_cv_train, y_cv_train)
            n_real_tests, time_precise, time_imprecise = 0, 0, 0
            n_test, _ = X_cv_test.shape
            for i, test in enumerate(X_cv_test):
                evaluate_imp, _ = model_impr.evaluate(test)
                evaluate = model_prec.predict([test])
                # Only cautious (set-valued) imprecise predictions are compared.
                if len(evaluate_imp) > 1:
                    n_real_tests += 1
                    if y_cv_test[i] in evaluate_imp:
                        time_imprecise += 1
                    if y_cv_test[i] in evaluate:
                        time_precise += 1
                logger.debug(
                    "(time, iTest, ellOptimal, cautious, prediction, ground-truth)(%s, %s, %s, %s, %s, %s)",
                    k_time, i, ell_optimal, evaluate_imp, evaluate, y_cv_test[i])
            logger.debug(
                "(time, ellOptimal, nRealTests, timeImprecise, timePrecise) (%s, %s, %s, %s, %s)",
                k_time, ell_optimal, n_real_tests, time_imprecise, time_precise)
            if n_real_tests > 0:
                n_real_fold += 1
                imprecise_mean += time_imprecise / n_real_tests
                precise_mean += time_precise / n_real_tests
        logger.debug("(time, nRealFold, imprecise, precise) (%s, %s, %s, %s)",
                     k_time, n_real_fold, imprecise_mean, precise_mean)
        if n_real_fold > 0:
            n_real_times += 1
            avg_imprecise += imprecise_mean / n_real_fold
            avg_precise += precise_mean / n_real_fold
    # FIX: guard against ZeroDivisionError when no repetition ever produced
    # a cautious prediction (n_real_times == 0).
    if n_real_times > 0:
        logger.debug("(dataset, models, imprec, prec) (%s, %s, %s, %s, %s)",
                     in_path, model_type_imprecise, model_type_precise,
                     avg_imprecise / n_real_times, avg_precise / n_real_times)
    else:
        logger.debug("(dataset, models) (%s, %s, %s): no cautious predictions observed",
                     in_path, model_type_imprecise, model_type_precise)
def output_paper_result(model_type="ieda", ell=0.5, hgrid=0.1):
    """Reproduce the paper's figure on the synthetic binary-normal data set.

    :param model_type: factory key for the imprecise model (default "ieda").
    :param ell: contamination parameter forwarded to the plotting helper.
    :param hgrid: grid step for the decision-boundary plot.
    """
    dataset = export_data_set('bin_normal_rnd.data')
    classifier = __factory_model(model_type, DEBUG=True)
    __test_imprecise_model(classifier, dataset,
                           features=[1, 2], hgrid=hgrid, ell=ell, clazz=0)
def computing_best_imprecise_mean(in_path=None, out_path=None, cv_nfold=10, model_type="ieda",
                                  test_size=0.4, from_ell=0.1, to_ell=1.0, by_ell=0.1,
                                  seed=None, lib_path_server=None, scaling=False):
    """Grid-search the ell hyper-parameter by k-fold cross-validation on a
    train split, appending per-ell mean u65/u80 scores to a CSV file.

    :param in_path: CSV training dataset path (must exist).
    :param out_path: existing CSV file results are appended to.
    :param cv_nfold: number of cross-validation folds.
    :param model_type: factory key for the imprecise model.
    :param test_size: fraction held out by the initial train/test split.
    :param from_ell: first ell value of the grid (inclusive).
    :param to_ell: end of the ell grid (exclusive, per np.arange).
    :param by_ell: grid step.
    :param seed: train/test split seed; randomly drawn when None.
    :param lib_path_server: MATLAB library path for the model factory.
    :param scaling: min-max normalize features when True.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    assert os.path.exists(out_path), "File for putting results does not exist"
    logger = create_logger("computing_best_imprecise_mean", True)
    logger.info('Training dataset %s', in_path)
    data = pd.read_csv(in_path)  # , header=None)
    X = data.iloc[:, :-1].values
    if scaling:
        X = normalize_minmax(X)
    y = np.array(data.iloc[:, -1].tolist())
    ell_u65, ell_u80 = dict(), dict()
    seed = random.randrange(pow(2, 30)) if seed is None else seed
    logger.debug("MODEL: %s, SEED: %s", model_type, seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=seed)
    kf = KFold(n_splits=cv_nfold, random_state=None, shuffle=True)
    # Materialize the folds once so every ell value is evaluated on identical splits.
    splits = []
    for idx_train, idx_test in kf.split(y_train):
        splits.append((idx_train, idx_test))
        logger.info("Splits %s train %s", len(splits), idx_train)
        logger.info("Splits %s test %s", len(splits), idx_test)
    model = __factory_model(model_type, solver_matlab=True,
                            add_path_matlab=lib_path_server, DEBUG=True)
    # FIX: the results file is now managed by a context manager, so it is
    # closed even if the (long-running) grid search raises.
    with open(out_path, 'a') as file_csv:
        writer = csv.writer(file_csv)
        for ell_current in np.arange(from_ell, to_ell, by_ell):
            ell_u65[ell_current], ell_u80[ell_current] = 0, 0
            logger.info("ELL_CURRENT %s", ell_current)
            for idx_train, idx_test in splits:
                logger.info("Splits train %s", idx_train)
                logger.info("Splits test %s", idx_test)
                X_cv_train, y_cv_train = X_train[idx_train], y_train[idx_train]
                X_cv_test, y_cv_test = X_train[idx_test], y_train[idx_test]
                model.learn(X=X_cv_train, y=y_cv_train, ell=ell_current)
                sum_u65, sum_u80 = 0, 0
                n_test = len(idx_test)
                for i, test in enumerate(X_cv_test):
                    evaluate = model.evaluate(test)
                    logger.debug("(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                                 i, ell_current, evaluate, y_cv_test[i])
                    if y_cv_test[i] in evaluate:
                        sum_u65 += u65(evaluate)
                        sum_u80 += u80(evaluate)
                ell_u65[ell_current] += sum_u65 / n_test
                ell_u80[ell_current] += sum_u80 / n_test
                logger.debug("Partial-kfold (%s, %s, %s)",
                             ell_current, ell_u65[ell_current], ell_u80[ell_current])
            ell_u65[ell_current] = ell_u65[ell_current] / cv_nfold
            ell_u80[ell_current] = ell_u80[ell_current] / cv_nfold
            writer.writerow([ell_current, ell_u65[ell_current], ell_u80[ell_current]])
            file_csv.flush()  # persist partial results per ell value
            logger.debug("Partial-ell (%s, %s, %s)", ell_current, ell_u65, ell_u80)
    logger.debug("Total-ell %s %s %s", in_path, ell_u65, ell_u80)
def output_paper_zone_im_precise(is_imprecise=True, model_type="ieda", in_train=None,
                                 ell=2.0, hgrid=0.1, features=None,
                                 criterion="maximality", cmap_color=None):
    """Plot (im)precise decision zones for two features of a dataset.

    :param is_imprecise: build an imprecise model when True, else pass None through.
    :param model_type: factory key for the imprecise model.
    :param in_train: CSV path; bundled iris data when None.
    :param ell: contamination parameter forwarded to the plotting helper.
    :param hgrid: grid step for the zone plot.
    :param features: two feature indices to plot; defaults to [0, 1].
    :param criterion: decision criterion forwarded to the plotting helper.
    :param cmap_color: matplotlib colormap; gist_ncar when None.
    """
    if in_train is None:
        dataset = export_data_set('iris.data')
    else:
        dataset = pd.read_csv(in_train)
    if features is None:
        features = [0, 1]
    classifier = None
    if is_imprecise:
        classifier = __factory_model(model_type, DEBUG=True, solver_matlab=False)
    colormap = plt.cm.gist_ncar if cmap_color is None else cmap_color
    __test_imprecise_model(classifier, dataset, features=features, hgrid=hgrid,
                           ell=ell, query=None, is_imprecise=is_imprecise,
                           cmap_color=colormap, criterion=criterion)
def performance_accuracy_hold_out(in_path=None, model_type="ilda", ell_optimal=0.1,
                                  lib_path_server=None, seeds=None, DEBUG=False,
                                  scaling=False, n_times=10):
    """Estimate mean u65/u80 accuracy with repeated 60/40 hold-out splits.

    :param in_path: CSV training dataset path (must exist).
    :param model_type: factory key for the imprecise model.
    :param ell_optimal: contamination parameter used for every repetition.
    :param lib_path_server: MATLAB library path for the model factory.
    :param seeds: per-repetition split seeds; generated when None.
    :param DEBUG: forwarded to the model factory.
    :param scaling: forwarded to dataset_to_Xy for feature normalization.
    :param n_times: number of hold-out repetitions when seeds is None
                    (new parameter, default keeps prior intent).
    """
    assert os.path.exists(
        in_path
    ), "Without training data, cannot performing cross hold-out accuracy"
    logger = create_logger("performance_accuracy_hold_out", True)
    logger.info('Training dataset (%s, %s, %s)', in_path, model_type, ell_optimal)
    X, y = dataset_to_Xy(in_path, scaling=scaling)
    # BUG FIX: the original called generate_seeds(cv_n_fold) with cv_n_fold
    # undefined in this function (NameError); n_times parameterizes it.
    seeds = generate_seeds(n_times) if seeds is None else seeds
    logger.info('Seeds used for accuracy %s', seeds)
    n_time = len(seeds)
    mean_u65, mean_u80 = 0, 0
    model = __factory_model(model_type, solver_matlab=True,
                            add_path_matlab=lib_path_server, DEBUG=DEBUG)
    for k in range(0, n_time):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.4, random_state=seeds[k])
        # BUG FIX: the original trained on undefined X_cv_train/y_cv_train.
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        sum_u65, sum_u80 = 0, 0
        n_test, _ = X_test.shape
        for i, test in enumerate(X_test):
            # BUG FIX: the original called lqa.evaluate with `lqa` undefined.
            evaluate = model.evaluate(test)
            logger.debug(
                "(testing, ell_current, prediction, ground-truth) (%s, %s, %s, %s)",
                i, ell_optimal, evaluate, y_test[i])
            if y_test[i] in evaluate:
                sum_u65 += u65(evaluate)
                sum_u80 += u80(evaluate)
        # BUG FIX: the original logged undefined `ell_current`; this run uses
        # the single fixed ell_optimal.
        logger.debug("Partial-kfold (%s, %s, %s, %s)",
                     ell_optimal, k, sum_u65 / n_test, sum_u80 / n_test)
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    mean_u65 = mean_u65 / n_time
    mean_u80 = mean_u80 / n_time
    logger.debug("Total-ell (%s, %s, %s, %s)", in_path, ell_optimal, mean_u65, mean_u80)
def computing_time_prediction(in_path=None, ell_optimal=0.1, lib_path_server=None,
                              model_type="ilda", criterion="maximality",
                              k_repetition=10, seeds=None):
    """Measure average per-test-set inference time over repeated 80/20 splits.

    :param in_path: headerless CSV dataset path (must exist).
    :param ell_optimal: contamination parameter used for every repetition.
    :param lib_path_server: MATLAB library path for the model factory.
    :param model_type: factory key for the imprecise model.
    :param criterion: decision criterion forwarded to model.evaluate.
    :param k_repetition: number of random-split repetitions.
    :param seeds: per-repetition split seeds; generated when None.
    """
    assert os.path.exists(in_path), "Without training data, not testing"
    data = pd.read_csv(in_path, header=None)
    logger = create_logger("computing_time_prediction", True)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    if seeds is None:
        seeds = generate_seeds(k_repetition)
    logger.info(
        'Training dataset %s with maximality version (%s) and model (%s), ell_optimal (%s) and seeds %s',
        in_path, criterion, model_type, ell_optimal, seeds)
    model = __factory_model(model_type, solver_matlab=True,
                            add_path_matlab=lib_path_server, DEBUG=False)
    avg = np.array([])
    for rep in range(k_repetition):
        logger.info("%s-fold repetition randomly, seed %s", rep, seeds[rep])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seeds[rep])
        model.learn(X=X_train, y=y_train, ell=ell_optimal)
        n, _ = X_test.shape
        sum_time = 0
        for idx, sample in enumerate(X_test):
            start = time.time()
            evaluate = model.evaluate(sample, criterion=criterion)
            end = time.time()
            logger.info("Evaluate %s, Ground-truth %s, Time %s ",
                        evaluate, y_test[idx], (end - start))
            sum_time += (end - start)
        avg = np.append(avg, sum_time / n)
    # NOTE: `n` here is the size of the last repetition's test split.
    logger.info("Total time (%s, %s) and average %s and sd %s of %s testing",
                in_path, avg, np.mean(avg), np.std(avg), n)