Example #1
def main(args):
    data_path = args.data_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/cantemist_data/'
    vocab_path = args.vocab_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/output/'
    out_path = args.output_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/test_aval/'

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates the dictionary of labels from the label correspondence file
    flabels = open(vocab_path + 'label_correspondence.txt', encoding='utf-8')
    labels = flabels.readlines()
    flabels.close()

    #Dict with ECIE-O codes as keys
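    #Each line of label_correspondence.txt is assumed to follow 'numeric_id=ECIE-O_code=term'
    #(assumption inferred from the split('=') indexing below)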
    dict_labels = {}
    for line in labels:
        parts = line.split('=')
        dict_labels[parts[1]] = (parts[0], parts[2].replace('\n', ''))

    #Reads dev data to fill part of the test files
    l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev1')
    l_dev_labels_ori = copy.deepcopy(l_dev_labels)
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)

    #Reads test set data
    l_tst_aval_txt, l_tst_aval_labels = pu.read_test_set_files(data_path)
    l_tst_aval_labels_ori = copy.deepcopy(l_tst_aval_labels)
    l_tst_aval_labels = pu.convert_labels(l_tst_aval_labels, dict_labels)

    #Stems the data
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')
    print('Stemming dev1 text...')
    l_stem_text_dev = su.list_stemming(l_dev_txt, stemmer)
    print('Stemming test aval text...')
    l_stem_text_tst_aval = su.list_stemming(l_tst_aval_txt, stemmer)

    #Creates the test aval files
    #The articles and their respective labels must be split into 48 sets of 250 articles each,
    #which matches the number of articles in the test set of the trained X-Transformer models.
    #The first 109 lines of each file correspond to text from the test&background set to classify.
    #The remaining 141 lines correspond to text from the dev1 set and will be used to find the best
    #confidence threshold for the predictions.
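    #(109 test&background + 141 dev1 = 250 lines per file; presumably the 48 chunks of 109
    #articles cover the whole test&background set)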
    cnt = 1
    ini = 0
    fin = 109

    while cnt <= 48:
        l_chunk_txt = l_stem_text_tst_aval[ini:fin] + l_stem_text_dev[0:141]
        l_chunk_labels = l_tst_aval_labels[ini:fin] + l_dev_labels[0:141]
        l_chunk_labels_ori = l_tst_aval_labels_ori[ini:fin] + l_dev_labels_ori[
            0:141]

        pu.write_files(l_chunk_txt, l_chunk_labels, l_chunk_labels_ori,
                       out_path, 'test_' + str(cnt))

        ini = fin
        fin = fin + 109
        cnt += 1
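
#A minimal usage sketch (not part of the original snippet): main() above expects an
#argparse-style object with data_path, vocab_path and output_path attributes.
#The flag names below are hypothetical.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', required=True)
    parser.add_argument('--vocab_path', required=True)
    parser.add_argument('--output_path', required=True)
    main(parser.parse_args())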
Example #2
def main(args):
    files_path = args.data_path  #'cantemist_data/'
    test_set_path = files_path + 'test-background-set-to-publish/'
    vocab_path = args.vocab_path  #'output/'
    npz_files = args.npz_path  #'ranker_CANTEMIST_2_DeCS_Titles_MER/'
    out_path = args.output_path  #'predictions_final/test_set/'

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates the dictionary of labels from the label correspondence file
    flabels = open(vocab_path + 'label_correspondence.txt', encoding='utf-8')
    labels = flabels.readlines()
    flabels.close()

    #Dict with the ECIE-O codes as keys and a reverse version with the
    #numeric identifiers as keys
    dict_labels, dict_labels_rev = {}, {}
    for line in labels:
        parts = line.split('=')
        dict_labels[parts[1]] = (parts[0], parts[2].replace('\n', ''))
        dict_labels_rev[parts[0]] = (parts[1], parts[2].replace('\n', ''))

    #Reads dev data that filled part of the test files
    l_dev_txt, l_dev_labels = pu.read_files(files_path, 'dev1')
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)
    l_dev_labels = [list(map(int, i)) for i in l_dev_labels]

    #Creates a list of all the .npz files in the given path.
    #The list is built manually because sorted(listdir(npz_files)) never returned
    #the files in the required order.
    l_npz_files = []
    for i in range(1, len(listdir(npz_files)) - 1):
        l_npz_files.append('tst_' + str(i) + '.pred.npz')

    #Will hold the predictions for all files from the test and background sets, using the
    #threshold set for each metric: b = baseline, f = f1, p = precision, r = recall
    l_preds_test_b, l_preds_test_f, l_preds_test_p, l_preds_test_r = [], [], [], []

    #Reads the first .npz file in order to find the best threshold value for each measure
    #using the predictions made for the texts of the dev1 set that was used to fill the files.
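    #The .npz file is assumed to store a flattened sparse prediction matrix: 'indices' holds the
    #label identifiers and 'data' holds the corresponding confidence scores.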
    data = np.load(npz_files + l_npz_files[0])

    count = 0
    l_probs, l_aux = [], []
    for i, j in zip(data['indices'], data['data']):
        if count == 19:  #number of predictions made by X-Transformer for each article
            l_probs.append(l_aux)
            l_aux = []
            count = 0
        else:
            l_aux.append((i, j))
            count += 1
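
    #Note: the pair read when count reaches 19 only triggers the flush and is not stored,
    #so each entry of l_probs holds 19 (index, score) pairs.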

    l_probs_dev1 = l_probs[109:]

    #Finds the confidence threshold value that achieves the best score for each measure
    prob_baseline = 0
    prob_best_prec = au.check_prob_min(l_probs_dev1, l_dev_labels[0:141],
                                       'prec')
    prob_best_rec = au.check_prob_min(l_probs_dev1, l_dev_labels[0:141], 'rec')
    prob_best_f1 = au.check_prob_min(l_probs_dev1, l_dev_labels[0:141], 'f1')

    #Using the previously calculated thresholds, iterates over each .npz file
    #and stores the predictions for the test-background set obtained with those thresholds.
    print('Processing .npz files...')
    for f in l_npz_files:
        print(f)  #DEBUG
        #Loads the .npz prediction file and stores the predictions in a list.
        data = np.load(npz_files + f)

        count = 0
        l_probs, l_aux = [], []
        for i, j in zip(data['indices'], data['data']):
            if count == 19:  #number of predictions made by X-Transformer for each article
                l_probs.append(l_aux)
                l_aux = []
                count = 0
            else:
                l_aux.append((i, j))
                count += 1

        l_probs_test = l_probs[0:109]

        #Stores the predictions for each test set file using the previously set threshold scores
        l_pred_labs = au.make_pred_list(l_probs_test, prob_baseline)
        for l in l_pred_labs:
            l_preds_test_b.append(l)

        l_pred_labs = au.make_pred_list(l_probs_test, prob_best_prec)
        for l in l_pred_labs:
            l_preds_test_p.append(l)

        l_pred_labs = au.make_pred_list(l_probs_test, prob_best_rec)
        for l in l_pred_labs:
            l_preds_test_r.append(l)

        l_pred_labs = au.make_pred_list(l_probs_test, prob_best_f1)
        for l in l_pred_labs:
            l_preds_test_f.append(l)

    print('Writing files...')
    pu.write_test_set_results(test_set_path, out_path, l_preds_test_b,
                              dict_labels_rev, 'baseline')
    pu.write_test_set_results(test_set_path, out_path, l_preds_test_p,
                              dict_labels_rev, 'prec')
    pu.write_test_set_results(test_set_path, out_path, l_preds_test_r,
                              dict_labels_rev, 'rec')
    pu.write_test_set_results(test_set_path, out_path, l_preds_test_f,
                              dict_labels_rev, 'f1')
Example #3
def main(args):
    data_path = args.input_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/cantemist_new/'
    out_path = args.output_path  #'output/'

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates .tsv file with all unique labels present in the txt files
    pu.gen_vocab_tsv(data_path, out_path)

    #Reads the generated .tsv file
    ecie_data = pd.read_csv(out_path + 'cantemist_terms.tsv', sep='\t')

    #Stores the Spanish terms and the respective ECIE-O codes in lists
    l_codes = ecie_data['Code'].astype(str).values.tolist()
    l_terms = ecie_data['Terms'].astype(str).values.tolist()

    #Generates vocab and label_correspondence files and returns dict with label correspondence
    dict_labels = pu.gen_vocab(l_terms, l_codes, out_path)

    #Reads training data
    l_trn_txt, l_trn_labels = pu.read_files(data_path, 'trn')
    #Creates a copy of the original labels, since it is needed by X-Transformer
    l_trn_labels_ori = copy.deepcopy(l_trn_labels)
    #Converts the labels to their corresponding numeric identifiers
    l_trn_labels = pu.convert_labels(l_trn_labels, dict_labels)

    #Reads dev data
    #l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev1')
    #dev2 is used because dev1 contains one unlabelled file, which causes X-Transformer to fail.
    #If that unlabelled file were removed, the tst_aval processing would also have to change, since the
    #X-Transformer model would no longer have a test set with 250 documents.
    l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev2')
    l_dev_labels_ori = copy.deepcopy(l_dev_labels)
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)

    #Reads extra dev data
    #It uses the dev1 files to create a larger train set.
    #The file that has no assigned labels is removed.
    l_extra_txt, l_extra_labels = pu.read_files(data_path, 'dev1')
    l_extra_txt.pop(212)  #text file with no assigned labels
    l_extra_labels.pop(212)  #text file with no assigned labels
    l_extra_labels_ori = copy.deepcopy(l_extra_labels)
    l_extra_labels = pu.convert_labels(l_extra_labels, dict_labels)

    #Stems the data
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')
    print('Stemming trn text...')
    l_stem_text_trn = su.list_stemming(l_trn_txt, stemmer)
    print('Stemming dev text...')
    l_stem_text_dev = su.list_stemming(l_dev_txt, stemmer)
    print('Stemming extra text...')
    l_stem_text_extra = su.list_stemming(l_extra_txt, stemmer)

    #Writes files
    pu.write_files(l_stem_text_trn, l_trn_labels, l_trn_labels_ori, out_path,
                   'train')
    pu.write_files(l_stem_text_dev, l_dev_labels, l_dev_labels_ori, out_path,
                   'test')

    #Joins the extra data to the train data
    for i, j, z in zip(l_stem_text_extra, l_extra_labels, l_extra_labels_ori):
        l_stem_text_trn.append(i)
        l_trn_labels.append(j)
        l_trn_labels_ori.append(z)
    #Writes larger train set
    pu.write_files(l_stem_text_trn, l_trn_labels, l_trn_labels_ori, out_path,
                   'train_750')