Esempio n. 1
0
def f1_domains(dir, domains):
    """Cross-domain evaluation: train a BiLSTM-ChE model on each single
    domain and test it on every domain, then plot the test F1 scores as
    grouped bars and print them.

    Parameters
    ----------
    dir : str
        Root directory containing the per-domain data files.
    domains : sequence of str
        Domain names; the last entry (presumably 'Overall') is skipped.
    """
    # Test-domain name -> list of F1 scores, one per training domain.
    f1_te = {'Math': [], 'Med': [], 'ES': [], 'Chem': [], 'CS': [], 'Astr': [], 'Agr': [], 'MS': [], 'Bio': [],
             'Eng': []}  # test domain as the key
    for i in range(len(domains) - 1):
        # generate_datasets returns (test, train); [0] is the test split.
        test_d = generate_datasets(dir, domains[i], 10, random.sample(set(range(1, 11)), 2))[0]
        for j in range(len(domains) - 1):
            # BUG FIX: the training data must come from domains[j] (the train
            # domain named in the print below), not domains[i] — the original
            # trained and tested on the same domain on every inner iteration.
            train_d = generate_datasets(dir, domains[j], 10, random.sample(set(range(1, 11)), 2))[1]
            print('============== Trained on {}, Tested on {}'.format(domains[j], domains[i]))
            test_scores = bilstm_character(train_d, test_d)[1]
            f1_te[domains[i]].append(test_scores[0])
    width = 9 / 100
    x = np.arange(10)
    fig, ax = plt.subplots()
    # Center the 10 bars of each group symmetrically on the tick
    # (offsets -4.5..+4.5 bar-widths; the original ran -4..+5, off-center).
    for pos, (name, scores) in enumerate(f1_te.items()):
        ax.bar(x + (pos - 4.5) * width, scores, width, label=name)
    ax.set_ylabel('Scores')
    ax.set_title('BiLSTM-ChE F1 Scores Trained and Tested on Each Domain')
    ax.set_xticks(x)
    ax.set_xticklabels(f1_te.keys())
    ax.legend(bbox_to_anchor=(1.2, 1))
    plt.savefig('/data/xwang/OA-STM-domains/{}'.format('BiLSTM-ChE-per-domain'))
    plt.show()
    # Round for readable console output (after plotting the raw floats).
    for k in f1_te.keys():
        f1_te[k] = ['%.2f' % i for i in f1_te[k]]
    print('F1 scores:', '\n', f1_te)
Esempio n. 2
0
def domain_independent(dir, domains):
    """Domain-independent evaluation: train one BiLSTM-ChE model on the
    pooled 'Overall' dataset and test it on each individual domain, then
    plot and print training-validation and testing scores.

    Parameters
    ----------
    dir : str
        Root directory containing the per-domain data files.
    domains : sequence of str
        Domain names; the last entry (presumably 'Overall') is skipped
        when iterating the test domains.
    """
    f1_tr, f1_te = [], []
    precision_tr, precision_te = [], []
    recall_tr, recall_te = [], []
    # Hold out 20 of the 110 pooled files for validation.
    overall_idx = random.sample(set(range(1, 111)), 20)
    test_overall, train_overall = generate_datasets(dir, 'Overall', 110, overall_idx)
    for i in range(len(domains) - 1):
        # BUG FIX: each single domain has 10 files (cf. f1_domains /
        # domain_specific); the original passed 11 here.
        test_d, train_d = generate_datasets(dir, domains[i], 10, random.sample(set(range(1, 11)), 2))
        print('===========', domains[i])
        train_scores, test_scores = bilstm_character(train_overall, test_d)
        f1_tr.append(train_scores[0])
        f1_te.append(test_scores[0])
        precision_tr.append(train_scores[1])
        precision_te.append(test_scores[1])
        recall_tr.append(train_scores[2])
        recall_te.append(test_scores[2])
    # BUG FIX: set('Overall') is the set of *characters* {'O','v','e',...},
    # so 'Overall' was never removed; a set also loses the domain order the
    # scores were appended in. The loop above covers domains[:-1].
    s_domains = list(domains[:-1])
    plot_scores(f1_tr, precision_tr, recall_tr, s_domains, 'BiLSTM-ChE-domain-Independent-Scores(train)')
    plot_scores(f1_te, precision_te, recall_te, s_domains, 'BiLSTM-ChE-domain-Independent-Scores(test)')
    f1_1 = ['%.2f' % i for i in f1_tr]
    # BUG FIX: was formatted from f1_tr, printing train F1 as test F1.
    f1_2 = ['%.2f' % i for i in f1_te]
    recall_1 = ['%.2f' % i for i in recall_tr]
    recall_2 = ['%.2f' % i for i in recall_te]
    precision_1 = ['%.2f' % i for i in precision_tr]
    precision_2 = ['%.2f' % i for i in precision_te]
    print('Domain-independent:', '\n', 'Training Validation Scores:', '\n', f1_1, '\n', recall_1, '\n', precision_1)
    # Order aligned with the training line above (f1, recall, precision).
    print('Testing Scores:', '\n', f1_2, '\n', recall_2, '\n', precision_2)
Esempio n. 3
0
def single_d_scores(train_d, test_d):
    """Train on one domain (or the pooled 'Overall' set) and test on another,
    returning the (train_scores, test_scores) pair from bilstm_character.

    Parameters
    ----------
    train_d : str
        Training domain name, or 'Overall' for the pooled 110-file set.
    test_d : str
        Testing domain name (10 files).
    """
    # NOTE(review): `dir` is not a parameter of this function. Unless a
    # module-level variable named `dir` shadows the builtin elsewhere in this
    # file, the builtin dir() function is passed as the data directory —
    # confirm against the rest of the module.
    if train_d == 'Overall':
        overall_idx = random.sample(set(range(1, 111)), 20)
        # BUG FIX: generate_datasets returns (test, train) — see the
        # unpacking in domain_independent/domain_specific; the original
        # passed the whole tuple as the training set.
        train_loc = generate_datasets(dir, 'Overall', 110, overall_idx)[1]
    else:
        train_idx = random.sample(set(range(1, 11)), 2)
        # BUG FIX: single domains have 10 files, not 110 (cf. every other
        # per-domain call in this file); also select the train split [1].
        train_loc = generate_datasets(dir, train_d, 10, train_idx)[1]
    test_idx = random.sample(set(range(1, 11)), 2)
    # Select the test split [0] of the test domain.
    test_loc = generate_datasets(dir, test_d, 10, test_idx)[0]
    print('===========Trained on {} Tested on {}'.format(train_d,test_d))
    train_scores, test_scores = bilstm_character(train_loc, test_loc)
    return train_scores, test_scores
Esempio n. 4
0
def domain_specific(dir, domains, fun, plotname):
    """Domain-specific evaluation: for each domain, train and test the model
    `fun` on that domain's own files (80/20 split), then plot and print the
    training-validation and testing scores.

    Parameters
    ----------
    dir : str
        Root directory containing the per-domain data files.
    domains : sequence of str
        Domain names; 'Overall' uses 110 files, every other domain 10.
    fun : callable
        Model function taking (train_file, test_file) and returning
        (train_scores, test_scores), each indexable as [f1, precision, recall].
    plotname : str
        Base name for the saved score plots.
    """
    f1_tr, f1_te = [], []
    precision_tr, precision_te = [], []
    recall_tr, recall_te = [], []
    for d in domains:
        if d == 'Overall':
            num_of_files = 110
        else:
            num_of_files = 10
        # Hold out 20% of the files as the test split.
        test_idx = random.sample(set(range(1, num_of_files + 1)), int(num_of_files * 0.2))
        test_file, train_file = generate_datasets(dir, d, num_of_files, test_idx)
        print('===========', d)
        train_scores, test_scores = fun(train_file, test_file)
        f1_tr.append(train_scores[0])
        f1_te.append(test_scores[0])
        precision_tr.append(train_scores[1])
        precision_te.append(test_scores[1])
        recall_tr.append(train_scores[2])
        recall_te.append(test_scores[2])
    plot_scores(f1_tr, precision_tr, recall_tr, domains, plotname+'(train)')
    plot_scores(f1_te, precision_te, recall_te, domains, plotname+'(test)')
    f1_1 = ['%.2f' % i for i in f1_tr]
    # BUG FIX: was formatted from f1_tr, printing train F1 as test F1.
    f1_2 = ['%.2f' % i for i in f1_te]
    recall_1 = ['%.2f' % i for i in recall_tr]
    recall_2 = ['%.2f' % i for i in recall_te]
    precision_1 = ['%.2f' % i for i in precision_tr]
    precision_2 = ['%.2f' % i for i in precision_te]
    print('Domain-specific:', '\n', 'Training Validation Scores:', '\n', f1_1, '\n', recall_1, '\n', precision_1)
    # Order aligned with the training line above (f1, recall, precision).
    print('Testing Scores:', '\n', f1_2, '\n', recall_2, '\n', precision_2)
Esempio n. 5
0
def kfold_bilstm_elmo(dir, d_train, d_test, num_tr, num_te):
    """Five-fold evaluation of the BiLSTM-ELMo model.

    Each fold holds out a fresh 20% of the train-domain files (for building
    the training split) and 20% of the test-domain files (for the test
    split), never reusing an index across folds, then averages the three
    scores reported by bilstm_elmo.

    Parameters
    ----------
    dir : str
        Root directory containing the per-domain data files.
    d_train, d_test : str
        Training and testing domain names.
    num_tr, num_te : int
        Number of files available in each domain.

    Returns
    -------
    tuple of float
        Mean of each of the three bilstm_elmo scores over the five folds.
    """
    collected = {'a': [], 'b': [], 'c': []}
    pool_tr = set(range(1, num_tr + 1))
    pool_te = set(range(1, num_te + 1))
    hold_tr = int(num_tr * 0.2)
    hold_te = int(num_te * 0.2)
    for _ in range(5):
        fold_tr = random.sample(pool_tr, hold_tr)  # 20% held out this fold
        fold_te = random.sample(pool_te, hold_te)
        pool_tr -= set(fold_tr)  # never reuse an index in a later fold
        pool_te -= set(fold_te)
        train_loc = generate_datasets(dir, d_train, num_tr, fold_tr)[1]
        test_loc = generate_datasets(dir, d_test, num_te, fold_te)[0]
        fold_scores = bilstm_elmo(train_loc, test_loc)
        collected['a'].append(fold_scores[0])
        collected['b'].append(fold_scores[1])
        collected['c'].append(fold_scores[2])
    mean_a = np.mean(collected['a'])
    mean_b = np.mean(collected['b'])
    mean_c = np.mean(collected['c'])
    print('kfold mean scores:', mean_a, mean_b,
          mean_c)
    return mean_a, mean_b, mean_c