Beispiel #1
0
def normal_experiment(args):
    """Train once with beam-generated clauses and print test MSE and AUC.

    The positive and negative examples loaded for ``args.name`` are each
    split with ``test_size=0.3`` and ``random_state=7014``. The training
    halves are passed through ``get_dataset_with_noise`` using
    ``args.noise_rate`` before the ILP problem is built.

    Args:
        args: Object providing ``name``, ``noise_rate``, ``T``, ``m``,
            ``epoch`` and ``lr`` attributes.

    Returns:
        None. The scores are only printed.
    """
    holdout_fraction = 0.3
    loader = DataUtils(args.name)
    pos, neg, bk, clauses, lang = loader.load_data()
    pos_train, pos_test = train_test_split(
        pos, test_size=holdout_fraction, random_state=7014)
    neg_train, neg_test = train_test_split(
        neg, test_size=holdout_fraction, random_state=7014)

    noisy_pos, noisy_neg = get_dataset_with_noise(
        pos_train, neg_train, noise_rate=args.noise_rate)

    # (beam_step, N_beam) per data set; any other name uses the fallback.
    beam_table = {'member': (3, 3), 'subtree': (3, 15)}
    beam_step, beam_width = beam_table.get(args.name, (5, 10))
    max_clauses = 50
    n_runs = 1

    problem = ILPProblem(noisy_pos, noisy_neg, bk, lang, name=args.name)
    problem.print()
    generator = ClauseGenerator(
        problem, infer_step=args.T, max_depth=1, max_body_len=1)
    solver = ILPSolver(
        problem, C_0=clauses, CG=generator, m=args.m, infer_step=args.T)

    # The loss histories returned by train_N are not used in this experiment.
    learned_clauses, weights_per_run, _losses = solver.train_N(
        N=n_runs,
        gen_mode='beam',
        N_max=max_clauses,
        T_beam=beam_step,
        N_beam=beam_width,
        epoch=args.epoch,
        lr=args.lr,
        wd=0.0,
    )
    values_per_run, facts = solver.predict_N(
        pos_test, neg_test, learned_clauses, weights_per_run)

    # Only one run was trained, so only the first entry is scored.
    first_run = values_per_run[0]
    mse = compute_mse(pos_test, neg_test, first_run, facts)
    auc = compute_auc(pos_test, neg_test, first_run, facts)

    print('====== TEST SCORE =======')
    print('Mean-squared test error: ', mse)
    print('AUC: ', auc)


def pre_process(X):
    """Feed ``X`` through ``nlpUtils.create_vector_model`` and return the
    output of ``nlpUtils.normalize_count_vector`` applied to that result."""
    return nlpUtils.normalize_count_vector(nlpUtils.create_vector_model(X))


def build_model(X, y, category_names):
    """Fit a model from ``modelUtils.build_model`` and evaluate it.

    ``X`` and ``y`` are split with ``test_size=0.2`` (no fixed
    ``random_state``). The model is fitted on the training part and handed
    to ``modelUtils.evaluate_model`` together with the held-out part and
    ``category_names``. Nothing is returned.
    """
    split = train_test_split(X, y, test_size=0.2)
    features_fit, features_eval, targets_fit, targets_eval = split

    classifier = modelUtils.build_model()
    classifier.fit(features_fit, targets_fit)
    modelUtils.evaluate_model(
        classifier, features_eval, targets_eval, category_names)


# Disabled ``save_model`` helper. It is wrapped in a bare string literal, so
# the text below is never executed.
'''
def save_model():
    modelUtils.save_model(model, model_filepath)
    next(end)
'''

# Script entry point: load the tweet data set, then fit and evaluate a model
# inside an MLflow run.
if __name__ == '__main__':
    print('Main')
    X, y, category_names = dataUtils.load_data('dataset/tweets_public.csv')
    print(y)
    # Preprocessing is currently switched off; X is passed on as loaded.
    #X = pre_process(X)
    # NOTE(review): this message is printed even though the pre_process call
    # above is commented out.
    print('Data preprocesed')
    with mlflow.start_run():
        build_model(X, y, category_names)

Beispiel #3
0
def _step_train_and_score(ilp_train, clauses, args, gen_mode, N, N_max,
                          T_beam, N_beam, pos_test, neg_test):
    """Train a fresh solver ``N`` times with ``gen_mode`` and score each run.

    A new ``ClauseGenerator`` and ``ILPSolver`` are built on every call so
    that runs with different generation modes do not share solver state.

    Returns:
        Tuple ``(auc_mean, auc_std, mse_mean, mse_std, loss_list_list)``
        where the statistics are taken over the entries of the list returned
        by ``solver.predict_N`` and ``loss_list_list`` is the third value
        returned by ``solver.train_N``.
    """
    CG = ClauseGenerator(ilp_train, infer_step=args.T,
                         max_depth=1, max_body_len=1)
    solver = ILPSolver(ilp_train, C_0=clauses, CG=CG,
                       m=args.m, infer_step=args.T)
    clauses_, Ws_list, loss_list_list = solver.train_N(
        N=N, gen_mode=gen_mode, N_max=N_max, T_beam=T_beam, N_beam=N_beam,
        epoch=args.epoch, lr=args.lr, wd=0.0)
    v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)

    auc_list = np.array(
        [compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
    mse_list = np.array(
        [compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
    return (np.mean(auc_list), np.std(auc_list),
            np.mean(mse_list), np.std(mse_list), loss_list_list)


def step_experiment(args, max_n=5, test_size=0.3):
    """Compare 'beam' and 'naive' clause generation over several ``N_max``.

    For every ``N_max`` in a data-set-specific list, both generation modes
    are trained ``N = 5`` times. The mean and standard deviation of AUC and
    MSE on the test split are collected, and a loss-comparison plot is
    written per run. At the end, AUC and MSE are plotted against ``N_max``
    to ``imgs/step/<name>_AUC.pdf`` and ``imgs/step/<name>_MSE.pdf``.

    Args:
        args: Object providing ``name``, ``T``, ``m``, ``epoch`` and ``lr``.
        max_n: Unused; kept so existing callers keep working.
        test_size: Fraction passed to ``train_test_split`` for both the
            positive and the negative examples.

    Returns:
        None. Results are printed and plotted.
    """
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    # ``seed`` is not defined in this function; it must exist at module level.
    pos_train, pos_test = train_test_split(
        pos, test_size=test_size, random_state=seed)
    neg_train, neg_test = train_test_split(
        neg, test_size=test_size, random_state=seed)

    ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)
    if args.name in ['member']:
        N_max_list = [3, 6, 9, 12]
    else:
        N_max_list = [10, 15, 20, 25, 30, 35, 40]

    if args.name in ['subtree']:
        N_beam = 15
        T_beam = 3
    else:
        N_beam = 10
        T_beam = 7

    N = 5  # how many times to perform weight learning

    AUCs, AUC_stds, MSEs, MSE_stds = [], [], [], []
    naive_AUCs, naive_AUC_stds, naive_MSEs, naive_MSE_stds = [], [], [], []

    for N_max in N_max_list:
        # Proposed method: beam-search clause generation.
        auc_mean, auc_std, mse_mean, mse_std, loss_list_list = \
            _step_train_and_score(ilp_train, clauses, args, 'beam', N, N_max,
                                  T_beam, N_beam, pos_test, neg_test)
        AUCs.append(auc_mean)
        AUC_stds.append(auc_std)
        MSEs.append(mse_mean)
        MSE_stds.append(mse_std)

        # Baseline: naive clause generation with the same settings.
        auc_mean, auc_std, mse_mean, mse_std, naive_loss_list_list = \
            _step_train_and_score(ilp_train, clauses, args, 'naive', N, N_max,
                                  T_beam, N_beam, pos_test, neg_test)
        naive_AUCs.append(auc_mean)
        naive_AUC_stds.append(auc_std)
        naive_MSEs.append(mse_mean)
        naive_MSE_stds.append(mse_std)

        # One loss-comparison plot per training run at this N_max.
        for j in range(N):
            loss_path = 'imgs/step/loss/' + args.name + \
                '[N_max:' + str(N_max) + ']-' + str(j) + '.pdf'
            ys_list = [loss_list_list[j], naive_loss_list_list[j]]
            plot_loss_compare(loss_path, ys_list, args.name +
                              ':[N_max:' + str(N_max) + ']-' + str(j))

    path_auc = 'imgs/step/' + args.name + '_AUC.pdf'
    path_mse = 'imgs/step/' + args.name + '_MSE.pdf'
    print(AUC_stds)
    print(MSE_stds)
    labels = ['proposed', 'naive']

    plot_line_graph_compare_err(
        path=path_auc, xs=N_max_list, ys_list=[AUCs, naive_AUCs],
        err_list=[AUC_stds, naive_AUC_stds],
        xlabel='Number of clauses', ylabel='AUC',
        title=args.name, labels=labels)
    plot_line_graph_compare_err(
        path=path_mse, xs=N_max_list, ys_list=[MSEs, naive_MSEs],
        err_list=[MSE_stds, naive_MSE_stds],
        xlabel='Number of clauses', ylabel='Mean-squared test error',
        title=args.name, labels=labels)
Beispiel #4
0
def noise_experiment(args, test_size=0.3):
    """Measure test AUC and MSE across a range of training-noise rates.

    The training split is expanded by ``get_datasets_with_noise`` into one
    data set per entry of ``noise_rates``. For each one a solver is trained
    ``N = 5`` times with beam generation, the mean and standard deviation of
    AUC and MSE on the test split are recorded, and one loss plot per run
    is written. Finally AUC (with a fixed baseline curve) and MSE are
    plotted against the noise rate.

    Args:
        args: Object providing ``name``, ``T``, ``m``, ``epoch`` and ``lr``.
        test_size: Fraction passed to ``train_test_split`` for both the
            positive and the negative examples.

    Returns:
        None. Results are printed and plotted.
    """
    loader = DataUtils(args.name)
    pos, neg, bk, clauses, lang = loader.load_data()
    # ``seed`` is not defined in this function; it must exist at module level.
    pos_train, pos_test = train_test_split(
        pos, test_size=test_size, random_state=seed)
    neg_train, neg_test = train_test_split(
        neg, test_size=test_size, random_state=seed)

    noise_rates = [
        0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50
    ]
    baseline_auc = [1.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

    datasets = get_datasets_with_noise(pos_train, neg_train, noise_rates)
    AUCs, AUC_stds = [], []
    MSEs, MSE_stds = [], []
    N = 5  # how many times to perform weight learning

    # (T_beam, N_beam) per data set; any other name uses the fallback.
    beam_table = {'member': (3, 3), 'subtree': (3, 15)}
    T_beam, N_beam = beam_table.get(args.name, (5, 10))

    N_max = 50

    for idx, (noisy_pos, noisy_neg) in enumerate(datasets):
        problem = ILPProblem(noisy_pos, noisy_neg, bk, lang, name=args.name)
        rate = noise_rates[idx]
        print('NOISE RATE: ', rate)
        problem.print()

        generator = ClauseGenerator(
            problem, infer_step=args.T, max_depth=1, max_body_len=1)
        solver = ILPSolver(
            problem, C_0=clauses, CG=generator, m=args.m, infer_step=args.T)
        learned_clauses, weights_per_run, losses_per_run = solver.train_N(
            N=N,
            gen_mode='beam',
            N_max=N_max,
            T_beam=T_beam,
            N_beam=N_beam,
            epoch=args.epoch,
            lr=args.lr,
            wd=0.0,
        )
        values_per_run, facts = solver.predict_N(
            pos_test, neg_test, learned_clauses, weights_per_run)

        auc_scores = np.array([
            compute_auc(pos_test, neg_test, values, facts)
            for values in values_per_run
        ])
        AUCs.append(np.mean(auc_scores))
        AUC_stds.append(np.std(auc_scores))

        mse_scores = np.array([
            compute_mse(pos_test, neg_test, values, facts)
            for values in values_per_run
        ])
        MSEs.append(np.mean(mse_scores))
        MSE_stds.append(np.std(mse_scores))

        # One loss curve per training run at this noise rate.
        for run in range(N):
            tag = f'[noise:{rate}]-{run}'
            plot_loss(
                f'imgs/noise/loss/{args.name}{tag}.pdf',
                losses_per_run[run],
                f'{args.name}:{tag}')

    path_auc = f'imgs/noise/{args.name}_AUC.pdf'
    path_mse = f'imgs/noise/{args.name}_MSE.pdf'

    print(AUC_stds)
    print(MSE_stds)

    noise_axis_label = 'Proportion of mislabeled training data'

    # AUC against noise rate, drawn together with the baseline curve.
    plot_line_graph_baseline_err(
        path=path_auc,
        xs=noise_rates,
        ys=AUCs,
        err=AUC_stds,
        xlabel=noise_axis_label,
        ylabel='AUC',
        title=args.name,
        baseline=baseline_auc)
    # MSE against noise rate, with standard deviation as error bars.
    plot_line_graph_err(
        path=path_mse,
        xs=noise_rates,
        ys=MSEs,
        err=MSE_stds,
        xlabel=noise_axis_label,
        ylabel='Mean-squared test error',
        title=args.name)
Beispiel #5
0
def softor_experiment(args, max_n=5, test_size=0.3):
    """Compare the default solver against the ``im_mode='pair'`` solver.

    Both solvers are trained once via ``train_time`` with beam generation
    on the same train split and evaluated on the same test split. For each
    one, AUC, parameter count, mean and standard deviation of the returned
    times, MSE and entropy are stored through ``save`` in
    ``results/<name>_softor.txt`` and ``results/<name>_pair.txt``. The two
    loss curves are plotted together to ``imgs/softor/loss/<name>.pdf``.

    Args:
        args: Object providing ``name``, ``T``, ``m``, ``epoch`` and ``lr``.
        max_n: Unused; kept so existing callers keep working.
        test_size: Fraction passed to ``train_test_split`` for both the
            positive and the negative examples.

    Returns:
        None. Results are printed, saved and plotted.
    """
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos,
                                           test_size=test_size,
                                           random_state=7014)
    neg_train, neg_test = train_test_split(neg,
                                           test_size=test_size,
                                           random_state=7014)

    ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)

    N_max = 50

    # NOTE(review): the original also assigned a per-data-set ``m`` here
    # (member: 3, subtree: 4, otherwise: 3) that was never read. Both solvers
    # are built with ``args.m``, so the dead local was removed. Confirm that
    # ``args.m`` is the intended value.
    if args.name in ['member']:
        T_beam = 3
        N_beam = 3
    elif args.name in ['subtree']:
        T_beam = 3
        N_beam = 15
    else:
        T_beam = 5
        N_beam = 10

    # Default solver (no im_mode given).
    CG = ClauseGenerator(ilp_train,
                         infer_step=args.T,
                         max_depth=1,
                         max_body_len=1)
    solver = ILPSolver(ilp_train,
                       C_0=clauses,
                       CG=CG,
                       m=args.m,
                       infer_step=args.T)
    clauses_, Ws_, loss_list, times = solver.train_time(gen_mode='beam',
                                                        N_max=N_max,
                                                        T_beam=T_beam,
                                                        N_beam=N_beam,
                                                        epoch=args.epoch,
                                                        lr=args.lr,
                                                        wd=0.0)
    print('Ws: ')
    for W in Ws_:
        print(F.softmax(W, dim=0))
    v_, facts = solver.predict(pos_test, neg_test, clauses_, Ws_)
    auc = compute_auc(pos_test, neg_test, v_, facts)
    mse = compute_mse(pos_test, neg_test, v_, facts)
    ent = compute_ent(Ws_, gen_mode='softmax')
    print('ENT:', ent)

    print('AUC:', auc)

    df = {}
    df['AUC'] = auc
    df['N_params'] = solver.count_params()
    df['time'] = mean(times)
    df['std'] = stdev(times)
    df['MSE'] = mse
    # Reuse the entropy computed above; Ws_ has not changed since then.
    df['ENT'] = ent

    path = 'results/' + args.name + '_softor' + '.txt'
    save(path, df)

    # Pair solver: same settings, but built with im_mode='pair'.
    CG = ClauseGenerator(ilp_train,
                         infer_step=args.T,
                         max_depth=1,
                         max_body_len=1)
    solver = ILPSolver(ilp_train,
                       C_0=clauses,
                       CG=CG,
                       m=args.m,
                       infer_step=args.T,
                       im_mode='pair')
    clauses_, Ws_, pair_loss_list, times = solver.train_time(gen_mode='beam',
                                                             N_max=N_max,
                                                             T_beam=T_beam,
                                                             N_beam=N_beam,
                                                             epoch=args.epoch,
                                                             lr=args.lr,
                                                             wd=0.0)
    print('Ws: ')
    # Only the first entry of Ws_ is printed for the pair solver.
    print(softmax2d(Ws_[0]))
    v_, facts = solver.predict(pos_test, neg_test, clauses_, Ws_)
    auc = compute_auc(pos_test, neg_test, v_, facts)
    mse = compute_mse(pos_test, neg_test, v_, facts)

    df = {}
    df['AUC_pair'] = auc
    df['N_params_pair'] = solver.count_params()
    df['time_pair'] = mean(times)
    df['std_pair'] = stdev(times)
    df['MSE_pair'] = mse
    df['ENT_pair'] = compute_ent(Ws_, gen_mode='pair')

    path = 'results/' + args.name + '_pair' + '.txt'
    save(path, df)
    print(df)

    # Overlay both loss curves in one figure.
    loss_path = 'imgs/softor/loss/' + args.name + '.pdf'
    ys_list = [loss_list, pair_loss_list]
    plot_loss_compare(loss_path, ys_list, args.name)