def normal_experiment(args):
    """Run a single train/test experiment with label noise injected into the train split.

    Loads the dataset named by ``args.name``, splits it 70/30, corrupts the
    training labels at ``args.noise_rate``, learns clauses with beam search,
    and prints the test MSE and AUC.
    """
    test_size = 0.3
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()

    pos_train, pos_test = train_test_split(pos, test_size=test_size, random_state=7014)
    neg_train, neg_test = train_test_split(neg, test_size=test_size, random_state=7014)
    pos_train_, neg_train_ = get_dataset_with_noise(
        pos_train, neg_train, noise_rate=args.noise_rate)

    # Per-dataset beam-search hyperparameters (search depth, beam width).
    if args.name == 'member':
        beam_step, N_beam = 3, 3
    elif args.name == 'subtree':
        beam_step, N_beam = 3, 15
    else:
        beam_step, N_beam = 5, 10
    N_max = 50
    N = 1  # single weight-learning run

    ilp_train = ILPProblem(pos_train_, neg_train_, bk, lang, name=args.name)
    ilp_train.print()
    CG = ClauseGenerator(ilp_train, infer_step=args.T, max_depth=1, max_body_len=1)
    solver = ILPSolver(ilp_train, C_0=clauses, CG=CG, m=args.m, infer_step=args.T)
    clauses_, Ws_list, loss_list_list = solver.train_N(
        N=N, gen_mode='beam', N_max=N_max, T_beam=beam_step, N_beam=N_beam,
        epoch=args.epoch, lr=args.lr, wd=0.0)

    # Evaluate the single learned model (index 0) on the held-out test split.
    v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)
    mse = compute_mse(pos_test, neg_test, v_list[0], facts)
    auc = compute_auc(pos_test, neg_test, v_list[0], facts)
    print('====== TEST SCORE =======')
    print('Mean-squared test error: ', mse)
    print('AUC: ', auc)
def pre_process(X):
    """Vectorize raw text samples and normalize the resulting count vectors."""
    X = nlpUtils.create_vector_model(X)
    return nlpUtils.normalize_count_vector(X)


def build_model(X, y, category_names):
    """Train a classifier on an 80/20 split, print its evaluation, return the model.

    Returning the fitted model is backward-compatible (the caller ignores the
    return value) and makes it possible to persist the model afterwards.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)
    model = modelUtils.build_model()
    model.fit(X_train, Y_train)
    modelUtils.evaluate_model(model, X_test, Y_test, category_names)
    return model


if __name__ == '__main__':
    print('Main')
    X, y, category_names = dataUtils.load_data('dataset/tweets_public.csv')
    print(y)
    # X = pre_process(X)  # preprocessing currently disabled
    print('Data preprocessed')
    # Track the training run in MLflow.
    with mlflow.start_run():
        build_model(X, y, category_names)
def step_experiment(args, max_n=5, test_size=0.3):
    """Compare beam-search vs naive clause generation over increasing clause budgets.

    For each budget ``N_max`` it trains both variants ``N`` times, records
    mean/std of test AUC and MSE, plots per-run loss curves, and finally plots
    AUC/MSE vs the number of clauses for both variants.
    """
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    # `seed` is assumed to be a module-level constant defined elsewhere in this
    # file (not visible in this chunk).
    pos_train, pos_test = train_test_split(
        pos, test_size=test_size, random_state=seed)
    neg_train, neg_test = train_test_split(
        neg, test_size=test_size, random_state=seed)
    ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)

    # Clause budgets and beam settings per dataset.
    if args.name in ['member']:
        N_max_list = [3, 6, 9, 12]
    else:
        N_max_list = [10, 15, 20, 25, 30, 35, 40]
    if args.name in ['subtree']:
        N_beam, T_beam = 15, 3
    else:
        N_beam, T_beam = 10, 7

    N = 5  # how many times to perform weight learn
    AUCs, AUC_stds, MSEs, MSE_stds = [], [], [], []
    naive_AUCs, naive_AUC_stds, naive_MSEs, naive_MSE_stds = [], [], [], []

    def _run(gen_mode, N_max):
        # One full train/evaluate cycle with a fresh generator and solver.
        # Returns (per-run loss lists, AUC array, MSE array) over the N runs.
        CG = ClauseGenerator(ilp_train, infer_step=args.T,
                             max_depth=1, max_body_len=1)
        solver = ILPSolver(ilp_train, C_0=clauses, CG=CG,
                           m=args.m, infer_step=args.T)
        clauses_, Ws_list, loss_lists = solver.train_N(
            N=N, gen_mode=gen_mode, N_max=N_max, T_beam=T_beam,
            N_beam=N_beam, epoch=args.epoch, lr=args.lr, wd=0.0)
        v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)
        aucs = np.array(
            [compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
        mses = np.array(
            [compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
        return loss_lists, aucs, mses

    for N_max in N_max_list:
        # Proposed (beam) variant.
        loss_list_list, aucs, mses = _run('beam', N_max)
        AUCs.append(np.mean(aucs))
        AUC_stds.append(np.std(aucs))
        MSEs.append(np.mean(mses))
        MSE_stds.append(np.std(mses))

        # Naive enumeration variant.
        naive_loss_list_list, aucs, mses = _run('naive', N_max)
        naive_AUCs.append(np.mean(aucs))
        naive_AUC_stds.append(np.std(aucs))
        naive_MSEs.append(np.mean(mses))
        naive_MSE_stds.append(np.std(mses))

        # Plot the loss curves of both variants for each of the N runs.
        for j in range(N):
            loss_path = 'imgs/step/loss/' + args.name + \
                '[N_max:' + str(N_max) + ']-' + str(j) + '.pdf'
            ys_list = [loss_list_list[j], naive_loss_list_list[j]]
            plot_loss_compare(loss_path, ys_list,
                              args.name + ':[N_max:' + str(N_max) + ']-' + str(j))

    path_auc = 'imgs/step/' + args.name + '_AUC.pdf'
    path_mse = 'imgs/step/' + args.name + '_MSE.pdf'
    print(AUC_stds)
    print(MSE_stds)
    labels = ['proposed', 'naive']
    plot_line_graph_compare_err(path=path_auc, xs=N_max_list,
                                ys_list=[AUCs, naive_AUCs],
                                err_list=[AUC_stds, naive_AUC_stds],
                                xlabel='Number of clauses', ylabel='AUC',
                                title=args.name, labels=labels)
    plot_line_graph_compare_err(path=path_mse, xs=N_max_list,
                                ys_list=[MSEs, naive_MSEs],
                                err_list=[MSE_stds, naive_MSE_stds],
                                xlabel='Number of clauses',
                                ylabel='Mean-squared test error',
                                title=args.name, labels=labels)
def noise_experiment(args, test_size=0.3):
    """Measure robustness to label noise: train at increasing noise rates, plot AUC/MSE.

    Trains ``N`` models per noise rate on progressively corrupted training
    sets, evaluates on the clean test split, and plots AUC (against a fixed
    baseline) and MSE with error bars.
    """
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    # `seed` is assumed to be a module-level constant defined elsewhere in this
    # file (not visible in this chunk).
    pos_train, pos_test = train_test_split(pos, test_size=test_size,
                                           random_state=seed)
    neg_train, neg_test = train_test_split(neg, test_size=test_size,
                                           random_state=seed)

    noise_rates = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25,
                   0.30, 0.35, 0.40, 0.45, 0.50]
    # Reference AUC: perfect at zero noise, chance level elsewhere.
    baseline_auc = [1.0] + [0.5] * 10
    datasets = get_datasets_with_noise(pos_train, neg_train, noise_rates)

    AUCs, AUC_stds, MSEs, MSE_stds = [], [], [], []
    N = 5  # how many times to perform weight learn

    # Per-dataset beam-search hyperparameters.
    if args.name == 'member':
        T_beam, N_beam = 3, 3
    elif args.name == 'subtree':
        T_beam, N_beam = 3, 15
    else:
        T_beam, N_beam = 5, 10
    N_max = 50

    for i, (pos_train, neg_train) in enumerate(datasets):
        ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)
        print('NOISE RATE: ', noise_rates[i])
        ilp_train.print()
        CG = ClauseGenerator(ilp_train, infer_step=args.T,
                             max_depth=1, max_body_len=1)
        solver = ILPSolver(ilp_train, C_0=clauses, CG=CG,
                           m=args.m, infer_step=args.T)
        clauses_, Ws_list, loss_list_list = solver.train_N(
            N=N, gen_mode='beam', N_max=N_max, T_beam=T_beam, N_beam=N_beam,
            epoch=args.epoch, lr=args.lr, wd=0.0)
        v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)

        auc_scores = np.array(
            [compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
        AUCs.append(np.mean(auc_scores))
        AUC_stds.append(np.std(auc_scores))
        mse_scores = np.array(
            [compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
        MSEs.append(np.mean(mse_scores))
        MSE_stds.append(np.std(mse_scores))

        # One loss-curve plot per weight-learning run at this noise rate.
        for j in range(N):
            loss_path = 'imgs/noise/loss/' + args.name + \
                '[noise:' + str(noise_rates[i]) + ']-' + str(j) + '.pdf'
            plot_loss(loss_path, loss_list_list[j],
                      args.name + ':[noise:' + str(noise_rates[i]) + ']-' + str(j))

    # plot AUC with baseline
    path_auc = 'imgs/noise/' + args.name + '_AUC.pdf'
    path_mse = 'imgs/noise/' + args.name + '_MSE.pdf'
    print(AUC_stds)
    print(MSE_stds)
    plot_line_graph_baseline_err(
        path=path_auc, xs=noise_rates, ys=AUCs, err=AUC_stds,
        xlabel='Proportion of mislabeled training data', ylabel='AUC',
        title=args.name, baseline=baseline_auc)
    # plot MSR with std
    plot_line_graph_err(path=path_mse, xs=noise_rates, ys=MSEs, err=MSE_stds,
                        xlabel='Proportion of mislabeled training data',
                        ylabel='Mean-squared test error', title=args.name)
def softor_experiment(args, max_n=5, test_size=0.3):
    """Compare 'softor' (softmax) and 'pair' inference modes on one dataset.

    Trains each variant once with beam search while timing the run, then
    reports AUC, MSE, parameter count, mean/std training time, and weight
    entropy, saving each result dict under ``results/`` and plotting the two
    loss curves together.
    """
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos, test_size=test_size,
                                           random_state=7014)
    neg_train, neg_test = train_test_split(neg, test_size=test_size,
                                           random_state=7014)
    ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)

    N_max = 50
    # Per-dataset beam-search hyperparameters.
    if args.name in ['member']:
        T_beam, N_beam = 3, 3
    elif args.name in ['subtree']:
        T_beam, N_beam = 3, 15
    else:
        T_beam, N_beam = 5, 10
    # NOTE(review): the original also computed a per-dataset local `m`
    # (3 / 4 / 3) here but never used it -- both solvers below receive
    # m=args.m. The dead assignments are removed; confirm args.m is the
    # intended program size for every dataset.

    # --- softor (softmax) inference mode ---
    CG = ClauseGenerator(ilp_train, infer_step=args.T,
                         max_depth=1, max_body_len=1)
    solver = ILPSolver(ilp_train, C_0=clauses, CG=CG,
                       m=args.m, infer_step=args.T)
    clauses_, Ws_, loss_list, times = solver.train_time(
        gen_mode='beam', N_max=N_max, T_beam=T_beam, N_beam=N_beam,
        epoch=args.epoch, lr=args.lr, wd=0.0)
    print('Ws: ')
    for W in Ws_:
        print(F.softmax(W, dim=0))
    v_, facts = solver.predict(pos_test, neg_test, clauses_, Ws_)
    auc = compute_auc(pos_test, neg_test, v_, facts)
    mse = compute_mse(pos_test, neg_test, v_, facts)
    ent = compute_ent(Ws_, gen_mode='softmax')
    print('ENT:', ent)
    print('AUC:', auc)
    df = {}
    df['AUC'] = auc
    df['N_params'] = solver.count_params()
    df['time'] = mean(times)
    df['std'] = stdev(times)
    df['MSE'] = mse
    df['ENT'] = ent  # reuse the value computed above instead of recomputing
    path = 'results/' + args.name + '_softor' + '.txt'
    save(path, df)

    # --- PAIR inference mode ---
    CG = ClauseGenerator(ilp_train, infer_step=args.T,
                         max_depth=1, max_body_len=1)
    solver = ILPSolver(ilp_train, C_0=clauses, CG=CG,
                       m=args.m, infer_step=args.T, im_mode='pair')
    clauses_, Ws_, pair_loss_list, times = solver.train_time(
        gen_mode='beam', N_max=N_max, T_beam=T_beam, N_beam=N_beam,
        epoch=args.epoch, lr=args.lr, wd=0.0)
    print('Ws: ')
    print(softmax2d(Ws_[0]))
    v_, facts = solver.predict(pos_test, neg_test, clauses_, Ws_)
    auc = compute_auc(pos_test, neg_test, v_, facts)
    mse = compute_mse(pos_test, neg_test, v_, facts)
    df = {}
    df['AUC_pair'] = auc
    df['N_params_pair'] = solver.count_params()
    df['time_pair'] = mean(times)
    df['std_pair'] = stdev(times)
    df['MSE_pair'] = mse
    df['ENT_pair'] = compute_ent(Ws_, gen_mode='pair')
    path = 'results/' + args.name + '_pair' + '.txt'
    save(path, df)
    print(df)

    # Overlay the loss curves of both inference modes.
    loss_path = 'imgs/softor/loss/' + args.name + '.pdf'
    ys_list = [loss_list, pair_loss_list]
    plot_loss_compare(loss_path, ys_list, args.name)