import copy
import os

#pu and su below are the project's own helper modules (processing and stemming
#utilities), imported elsewhere in the repository under those aliases.

def main(args):

    data_path = args.data_path    #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/cantemist_data/'
    vocab_path = args.vocab_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/output/'
    out_path = args.output_path   #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/organized version/test_aval/'

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates the dictionary of labels from the label correspondence file
    flabels = open(vocab_path + 'label_correspondence.txt', encoding='utf-8')
    labels = flabels.readlines()
    flabels.close()

    #Dict with ECIE-O codes as keys
    dict_labels = {}
    for i in range(len(labels)):
        dict_labels[labels[i].split('=')[1]] = (labels[i].split('=')[0],
                                                labels[i].split('=')[2].replace('\n', ''))

    #Reads dev data to fill part of the test files
    l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev1')
    l_dev_labels_ori = copy.deepcopy(l_dev_labels)
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)

    #Reads test set data
    l_tst_aval_txt, l_tst_aval_labels = pu.read_test_set_files(data_path)
    l_tst_aval_labels_ori = copy.deepcopy(l_tst_aval_labels)
    l_tst_aval_labels = pu.convert_labels(l_tst_aval_labels, dict_labels)

    #Stems the data
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')
    print('Stemming dev1 text...')
    l_stem_text_dev = su.list_stemming(l_dev_txt, stemmer)
    print('Stemming test aval text...')
    l_stem_text_tst_aval = su.list_stemming(l_tst_aval_txt, stemmer)

    #Creates the test aval files.
    #The articles and respective labels must be split into 48 sets of 250 articles
    #each, matching the number of articles in the test set of the trained
    #X-Transformer models.
    #The first 109 lines of each file correspond to text from the test&background
    #set to classify (48 x 109 = 5232 articles in total). The remaining 141 lines
    #correspond to text from dev set 1 and are used to find the best confidence
    #threshold for the predictions.
    cnt = 1
    ini = 0
    fin = 109
    while cnt <= 48:
        l_chunk_txt = l_stem_text_tst_aval[ini:fin] + l_stem_text_dev[0:141]
        l_chunk_labels = l_tst_aval_labels[ini:fin] + l_dev_labels[0:141]
        l_chunk_labels_ori = l_tst_aval_labels_ori[ini:fin] + l_dev_labels_ori[0:141]
        pu.write_files(l_chunk_txt, l_chunk_labels, l_chunk_labels_ori,
                       out_path, 'test_' + str(cnt))
        ini = fin
        fin = fin + 109
        cnt += 1
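#Illustrative sketch of the label_correspondence.txt line format assumed by the
#split('=') indexing above: each line reads <numeric id>=<ECIE-O code>=<term>.
#The sample line and its values are hypothetical, not taken from the real file.
example_line = '0=8000/0=neoplasia, benigna\n'
idx, code, term = example_line.replace('\n', '').split('=')
#dict_labels maps the ECIE-O code to a (numeric id, term) tuple:
#{'8000/0': ('0', 'neoplasia, benigna')}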
import os
from os import listdir

import numpy as np

#pu and au below are the project's own helper modules (processing and evaluation
#utilities), imported elsewhere in the repository under those aliases.

def main(args):

    files_path = args.data_path   #'cantemist_data/'
    test_set_path = files_path + 'test-background-set-to-publish/'
    vocab_path = args.vocab_path  #'output/'
    npz_files = args.npz_path     #'ranker_CANTEMIST_2_DeCS_Titles_MER/'
    out_path = args.output_path   #'predictions_final/test_set/'

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates the dictionary of labels from the label correspondence file
    flabels = open(vocab_path + 'label_correspondence.txt', encoding='utf-8')
    labels = flabels.readlines()
    flabels.close()

    #Dict with the ECIE-O codes as keys and a reverse version with the
    #numeric identifiers as keys
    dict_labels, dict_labels_rev = {}, {}
    for i in range(len(labels)):
        dict_labels[labels[i].split('=')[1]] = (labels[i].split('=')[0],
                                                labels[i].split('=')[2].replace('\n', ''))
        dict_labels_rev[labels[i].split('=')[0]] = (labels[i].split('=')[1],
                                                    labels[i].split('=')[2].replace('\n', ''))

    #Reads the dev data that filled part of the test files
    l_dev_txt, l_dev_labels = pu.read_files(files_path, 'dev1')
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)
    l_dev_labels = [list(map(int, i)) for i in l_dev_labels]

    #Creates the list of all the .npz files in the given path.
    #The list is built manually because sorted(listdir(npz_files)) orders the
    #names lexicographically (e.g. tst_10 before tst_2), never the intended order.
    l_npz_files = []
    for i in range(1, len(listdir(npz_files)) - 1):
        l_npz_files.append('tst_' + str(i) + '.pred.npz')

    #Will contain the predictions for all files from the test and background sets
    #using the threshold set for the different metrics:
    #b = baseline, f = f1, p = precision, r = recall
    l_preds_test_b, l_preds_test_f, l_preds_test_p, l_preds_test_r = [], [], [], []

    #Reads the first .npz file in order to find the best threshold value for each
    #measure, using the predictions made for the texts of the dev1 set that was
    #used to fill the files.
    data = np.load(npz_files + l_npz_files[0])

    #Groups the flat prediction arrays into per-article lists of (label index,
    #score) pairs: 19 predictions are stored per article, and the element on
    #which the counter resets is skipped.
    count = 0
    l_probs, l_aux = [], []
    for i, j in zip(data['indices'], data['data']):
        if count == 19:
            l_probs.append(l_aux)
            l_aux = []
            count = 0
        else:
            l_aux.append((i, j))
            count += 1

    l_probs_dev1 = l_probs[109:]

    #Finds the confidence threshold value that achieves the best score in each measure
    prob_baseline = 0
    prob_best_prec = au.check_prob_min(l_probs_dev1, l_dev_labels[0:141], 'prec')
    prob_best_rec = au.check_prob_min(l_probs_dev1, l_dev_labels[0:141], 'rec')
    prob_best_f1 = au.check_prob_min(l_probs_dev1, l_dev_labels[0:141], 'f1')

    #Using the previously calculated thresholds, iterates over each .npz file and
    #stores the predictions for the test-background set using those thresholds.
    print('Processing .npz files...')
    for f in l_npz_files:
        print(f)  #DEBUG

        #Loads the .npz prediction file and stores the predictions in a list,
        #grouped per article as above
        data = np.load(npz_files + f)

        count = 0
        l_probs, l_aux = [], []
        for i, j in zip(data['indices'], data['data']):
            if count == 19:
                l_probs.append(l_aux)
                l_aux = []
                count = 0
            else:
                l_aux.append((i, j))
                count += 1

        l_probs_test = l_probs[0:109]

        #Stores the predictions for each test set file using the previously set thresholds
        l_pred_labs = au.make_pred_list(l_probs_test, prob_baseline)
        for l in l_pred_labs:
            l_preds_test_b.append(l)

        l_pred_labs = au.make_pred_list(l_probs_test, prob_best_prec)
        for l in l_pred_labs:
            l_preds_test_p.append(l)

        l_pred_labs = au.make_pred_list(l_probs_test, prob_best_rec)
        for l in l_pred_labs:
            l_preds_test_r.append(l)

        l_pred_labs = au.make_pred_list(l_probs_test, prob_best_f1)
        for l in l_pred_labs:
            l_preds_test_f.append(l)

    print('Writing files...')
    pu.write_test_set_results(test_set_path, out_path, l_preds_test_b, dict_labels_rev, 'baseline')
    pu.write_test_set_results(test_set_path, out_path, l_preds_test_p, dict_labels_rev, 'prec')
    pu.write_test_set_results(test_set_path, out_path, l_preds_test_r, dict_labels_rev, 'rec')
    pu.write_test_set_results(test_set_path, out_path, l_preds_test_f, dict_labels_rev, 'f1')
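#A minimal sketch of a numeric-aware sort that would avoid building l_npz_files
#manually in the script above: sorted(listdir(...)) orders names
#lexicographically (tst_10 before tst_2), while sorting on the extracted file
#number yields the intended order. sorted_npz_files is an illustrative name,
#not part of the repository; the file-name pattern is taken from the loop above.
import re
from os import listdir

def sorted_npz_files(npz_dir):
    def numeric_key(name):
        #Extracts the integer in 'tst_<n>.pred.npz' so names sort numerically
        match = re.search(r'tst_(\d+)\.pred\.npz', name)
        return int(match.group(1)) if match else float('inf')
    return sorted((f for f in listdir(npz_dir) if f.endswith('.pred.npz')),
                  key=numeric_key)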
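#A minimal sketch of the threshold filtering that au.make_pred_list is assumed
#to perform: for each article, keep the label indices whose confidence score
#reaches the chosen threshold. make_pred_list_sketch is an illustrative name;
#the real implementation lives in the project's au module.
def make_pred_list_sketch(l_probs, threshold):
    return [[int(idx) for idx, score in article if score >= threshold]
            for article in l_probs]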
import copy
import os

import pandas as pd

#pu and su below are the project's own helper modules (processing and stemming
#utilities), imported elsewhere in the repository under those aliases.

def main(args):

    data_path = args.input_path  #'C:/Users/André/Documents/FCUL/2º Ano/CANTEMIST/cantemist_new/'
    out_path = args.output_path  #'output/'

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates a .tsv file with all unique labels present in the txt files
    pu.gen_vocab_tsv(data_path, out_path)

    #Reads the generated .tsv file
    ecie_data = pd.read_csv(out_path + 'cantemist_terms.tsv', sep='\t')

    #Stores the terms in Spanish and the respective ECIE-O codes in lists
    l_codes = ecie_data['Code'].astype(str).values.tolist()
    l_terms = ecie_data['Terms'].astype(str).values.tolist()

    #Generates the vocab and label_correspondence files and returns a dict
    #with the label correspondence
    dict_labels = pu.gen_vocab(l_terms, l_codes, out_path)

    #Reads the training data
    l_trn_txt, l_trn_labels = pu.read_files(data_path, 'trn')
    #Creates a copy of the original labels, which X-Transformer needs
    l_trn_labels_ori = copy.deepcopy(l_trn_labels)
    #Converts the labels to their corresponding numeric identifiers
    l_trn_labels = pu.convert_labels(l_trn_labels, dict_labels)

    #Reads the dev data
    #l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev1')
    #dev2 is used instead of dev1 because dev1 has one unlabelled file, which
    #causes X-Transformer to fail. If the unlabelled file were removed instead,
    #the tst_aval processing would have to change, since the X-Transformer model
    #would no longer have a test set with 250 documents.
    l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev2')
    l_dev_labels_ori = copy.deepcopy(l_dev_labels)
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)

    #Reads extra dev data.
    #The dev1 files are used to create a larger train set;
    #the file that has no assigned labels is removed.
    l_extra_txt, l_extra_labels = pu.read_files(data_path, 'dev1')
    l_extra_txt.pop(212)     #text file with no assigned labels
    l_extra_labels.pop(212)  #text file with no assigned labels
    l_extra_labels_ori = copy.deepcopy(l_extra_labels)
    l_extra_labels = pu.convert_labels(l_extra_labels, dict_labels)

    #Stems the data
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')
    print('Stemming trn text...')
    l_stem_text_trn = su.list_stemming(l_trn_txt, stemmer)
    print('Stemming dev text...')
    l_stem_text_dev = su.list_stemming(l_dev_txt, stemmer)
    print('Stemming extra text...')
    l_stem_text_extra = su.list_stemming(l_extra_txt, stemmer)

    #Writes the files
    pu.write_files(l_stem_text_trn, l_trn_labels, l_trn_labels_ori, out_path, 'train')
    pu.write_files(l_stem_text_dev, l_dev_labels, l_dev_labels_ori, out_path, 'test')

    #Joins the extra data to the train data
    for i, j, z in zip(l_stem_text_extra, l_extra_labels, l_extra_labels_ori):
        l_stem_text_trn.append(i)
        l_trn_labels.append(j)
        l_trn_labels_ori.append(z)

    #Writes the larger train set
    pu.write_files(l_stem_text_trn, l_trn_labels, l_trn_labels_ori, out_path, 'train_750')
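#Hypothetical sketch of what the stemming helpers are assumed to do: NLTK is a
#plausible backend given su.check_nltk_punkt() and su.set_stemmer('spanish'),
#but the real implementations live in the project's su module. stem_text is an
#illustrative name, not part of the repository.
import nltk
from nltk.stem.snowball import SnowballStemmer

nltk.download('punkt', quiet=True)    #what su.check_nltk_punkt() likely ensures
stemmer = SnowballStemmer('spanish')  #what su.set_stemmer('spanish') likely returns

def stem_text(text):
    #Tokenizes the text and stems each token, rejoining with single spaces
    return ' '.join(stemmer.stem(tok)
                    for tok in nltk.word_tokenize(text, language='spanish'))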