def main(args):
    data_path = args.data_path
    vocab_path = args.vocab_path
    out_path = args.output_path

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates the dictionary of labels from the label correspondence file
    with open(vocab_path + 'label_correspondence.txt', encoding='utf-8') as flabels:
        labels = flabels.readlines()

    #Dict with ECIE-O codes as keys: {code: (label number, term)}
    dict_labels = {}
    for line in labels:
        fields = line.split('=')
        dict_labels[fields[1]] = (fields[0], fields[2].replace('\n', ''))

    #Reads dev data to fill part of the test files
    l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev1')
    l_dev_labels_ori = copy.deepcopy(l_dev_labels)
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)

    #Reads the test set data
    l_tst_aval_txt, l_tst_aval_labels = pu.read_test_set_files(data_path)
    l_tst_aval_labels_ori = copy.deepcopy(l_tst_aval_labels)
    l_tst_aval_labels = pu.convert_labels(l_tst_aval_labels, dict_labels)

    #Stems the data
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')
    print('Stemming dev1 text...')
    l_stem_text_dev = su.list_stemming(l_dev_txt, stemmer)
    print('Stemming test aval text...')
    l_stem_text_tst_aval = su.list_stemming(l_tst_aval_txt, stemmer)

    #Creates the test aval files.
    #The articles and their labels are split into 48 sets of 250 articles each, which is
    #the number of articles in the test set of the trained X-Transformer models.
    #The first 109 lines of each file are text from the test&background set to classify;
    #the remaining 141 lines are text from dev set 1, used to find the best confidence
    #threshold for the predictions.
    cnt = 1
    ini = 0
    fin = 109
    while cnt <= 48:
        l_chunk_txt = l_stem_text_tst_aval[ini:fin] + l_stem_text_dev[0:141]
        l_chunk_labels = l_tst_aval_labels[ini:fin] + l_dev_labels[0:141]
        l_chunk_labels_ori = l_tst_aval_labels_ori[ini:fin] + l_dev_labels_ori[0:141]
        pu.write_files(l_chunk_txt, l_chunk_labels, l_chunk_labels_ori, out_path,
                       'test_' + str(cnt))
        ini = fin
        fin = fin + 109
        cnt += 1
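
#Illustrative note (not part of the original script): the dict_labels construction in
#main() above assumes each line of label_correspondence.txt has the form
#    <label index>=<ECIE-O code>=<term>
#Only the field order follows from the split('=') indices used above; the sample values
#below are made-up placeholders.
def _parse_label_correspondence_line(line):
    #Returns (code, (label index, term)) for one 'index=code=term' line.
    fields = line.rstrip('\n').split('=')
    return fields[1], (fields[0], fields[2])

#Example: _parse_label_correspondence_line('0=8000/3=neoplasia maligna')
#         -> ('8000/3', ('0', 'neoplasia maligna'))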
def main(args):
    finput = args.input_file
    finput_decs = args.input_decs_file
    out_path = args.output_path
    xmlc_alg = args.xmlc_alg
    mer = args.mer
    lexicon = args.mer_lex
    n_cores = args.mer_cores

    #Checks if input files/paths exist
    assert os.path.exists(finput), "Input file/path doesn't exist"
    assert os.path.exists(finput_decs), "DeCS file/path doesn't exist"
    assert os.path.splitext(finput)[-1].lower() == '.json', \
        "Input file isn't a '.json' file. A json file is required."
    assert os.path.splitext(finput_decs)[-1].lower() == '.tsv', \
        "DeCS input file isn't a '.tsv' file. A tsv file is required."
    assert xmlc_alg in ('X-BERT', 'X-Transformer'), \
        "Invalid XMLC algorithm. Valid values: X-BERT, X-Transformer."

    if not os.path.exists(out_path):
        logging.info('Creating path %s' % out_path)
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Reads the DeCS terms file and stores the terms in separate lists
    logging.info("Reading DeCS terms file '%s' ..." % finput_decs)
    print("Reading DeCS terms file '%s' ..." % finput_decs)
    #decs_data = pd.read_csv('DeCS.2019.both.v5.tsv', sep='\t')
    decs_data = pd.read_csv(finput_decs, sep='\t')

    #Stores the Spanish terms and the respective DeCS codes in lists
    l_term_spanish = decs_data['Term_Spanish'].astype(str).values.tolist()
    l_decs_code = decs_data['#DeCS_code'].astype(str).values.tolist()

    #Generates vocab and label_correspondence files and returns dict with label correspondence
    dict_labels = tfu.gen_dict_label_corr(l_term_spanish, l_decs_code)

    #Number of lines in the test set used to train the XMLC model
    if xmlc_alg == 'X-Transformer':
        CON_TEST_SIZE = 95598
    else:
        CON_TEST_SIZE = 63732

    logging.info("Reading MESINESP data '%s' ..." % finput)
    print("Reading MESINESP data '%s' ..." % finput)
    with open(finput, 'r', encoding='utf-8') as json_input:
        data = json.load(json_input)

    df_data = pd.json_normalize(data['articles'])
    df_size = len(df_data)

    l_abs_mesinesp = df_data['abstractText'].values.tolist()
    l_title_mesinesp = df_data['title'].values.tolist()
    l_decs_mesinesp = [0] * df_size
    l_journal_mesinesp = df_data['journal'].values.tolist()

    #Checks if all abstracts have text. If not, they are replaced by the titles of the
    #articles or, failing that, by the journal names.
    cnt_debug = 0
    for i in range(df_size):
        if (l_abs_mesinesp[i] == 'No disponible' or l_abs_mesinesp[i] == 'No disponibl'
                or l_abs_mesinesp[i] is None or len(l_abs_mesinesp[i]) <= 0):
            if (l_title_mesinesp[i] != 'No disponible' and l_title_mesinesp[i] != 'No disponibl'
                    and l_title_mesinesp[i] is not None and len(l_title_mesinesp[i]) > 0):
                l_abs_mesinesp[i] = l_title_mesinesp[i]
            else:
                l_abs_mesinesp[i] = l_journal_mesinesp[i]
            cnt_debug += 1

    #Checks if all titles have text. Otherwise, they are replaced by the abstracts
    #so that MER doesn't break.
    cnt_debug = 0
    for i in range(df_size):
        if l_title_mesinesp[i] is None or len(l_title_mesinesp[i]) <= 0:
            if l_abs_mesinesp[i] is not None and len(l_abs_mesinesp[i]) > 0:
                l_title_mesinesp[i] = l_abs_mesinesp[i]
                cnt_debug += 1

    #Reads and adds extra data
    logging.info('Reading extra data ...')
    print('Reading extra data ...')
    with open('pubmed_extra_1.json', 'r', encoding='utf-8') as json_input_extra:
        extra_data = json.load(json_input_extra)
    df_extra_data = pd.json_normalize(extra_data['articles'])

    with open('pubmed_extra_2.json') as json_input_extra:
        extra_data = json.load(json_input_extra)
    df_extra_data = df_extra_data.append(pd.json_normalize(extra_data['articles']),
                                         ignore_index=True)

    """
    for i in range(3, 9):
        with open('pubmed_extra_' + str(i) + '.json') as json_input_extra:
            extra_data = json.load(json_input_extra)
        df_extra_data = df_extra_data.append(pd.json_normalize(extra_data['articles']),
                                             ignore_index=True)
    """

    l_abs_extra = df_extra_data['abstractText.ab_es'].values.tolist()
    l_title_extra = df_extra_data['title_es'].values.tolist()
    l_decs_extra = df_extra_data['decsCodes'].values.tolist()
    l_decs_extra, _ = tfu.convert_labels(l_decs_extra, dict_labels)

    #Pads the evaluation data with extra PubMed articles until it reaches CON_TEST_SIZE,
    #so the file written below has the same number of lines as the test set used to
    #train the XMLC model.
    i = 0
    while len(l_abs_mesinesp) < CON_TEST_SIZE:
        l_abs_mesinesp.append(l_abs_extra[i])
        l_title_mesinesp.append(l_title_extra[i])
        l_decs_mesinesp.append(l_decs_extra[i])
        i += 1

    #For titles
    l_lists = [(l_abs_mesinesp, l_title_mesinesp, l_decs_mesinesp,
                out_path + '/test_aval_tits_MER', 'test aval_tits_MER')]

    #Generate stemmer
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')

    for l in l_lists:
        logging.info('Processing %s data...' % l[4])
        print('Processing %s data...' % l[4])
        l_stem_text = []

        if mer:
            l_mer = []
            logging.info('MERing using %s ...' % lexicon)
            print('MERing using %s ...' % lexicon)
            if lexicon == 'decs_lex':
                l_mer = mu.call_simple_mer(l[0], n_cores)
            else:
                l_mer = mu.call_custom_mer(l[0], lexicon, n_cores)

            #Appends to the text the corresponding MER terms identified earlier
            for i in range(len(l[1])):  #TITLES
                l[1][i] = l[1][i] + ' ' + str(l_mer[i])
            #for i in range(len(l[0])):  #ABSTRACTS
            #    l[0][i] = l[0][i] + ' ' + str(l_mer[i])

        logging.info('Stemming...')
        print('Stemming...')
        l_stem_text = tfu.list_stemming(l[1], stemmer)  #TITLES
        #l_stem_text = tfu.list_stemming(l[0], stemmer)  #ABSTRACTS

        logging.info('Writing %s file' % l[3])
        print('Writing %s file' % l[3])
        tfu.write_file(l_stem_text, l[2], l[3])
def main(args):
    data_path = args.input_path
    out_path = args.output_path

    if not os.path.exists(out_path):
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Generates a .tsv file with all unique labels present in the txt files
    pu.gen_vocab_tsv(data_path, out_path)

    #Reads the generated .tsv file
    ecie_data = pd.read_csv(out_path + 'cantemist_terms.tsv', sep='\t')

    #Stores the Spanish terms and the respective ECIE-O codes in lists
    l_codes = ecie_data['Code'].astype(str).values.tolist()
    l_terms = ecie_data['Terms'].astype(str).values.tolist()

    #Generates vocab and label_correspondence files and returns dict with label correspondence
    dict_labels = pu.gen_vocab(l_terms, l_codes, out_path)

    #Reads training data
    l_trn_txt, l_trn_labels = pu.read_files(data_path, 'trn')
    #Creates a copy of the original labels, which is needed for X-Transformer
    l_trn_labels_ori = copy.deepcopy(l_trn_labels)
    #Converts the labels to their corresponding numeric identifiers
    l_trn_labels = pu.convert_labels(l_trn_labels, dict_labels)

    #Reads dev data
    #l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev1')
    #dev2 is used instead of dev1 because dev1 has one unlabelled file, which causes
    #X-Transformer to fail. If that unlabelled file were removed, the tst_aval processing
    #would also have to change, since the X-Transformer model would no longer have a
    #test set with 250 documents.
    l_dev_txt, l_dev_labels = pu.read_files(data_path, 'dev2')
    l_dev_labels_ori = copy.deepcopy(l_dev_labels)
    l_dev_labels = pu.convert_labels(l_dev_labels, dict_labels)

    #Reads extra dev data
    #The dev1 files are used to create a larger train set.
    #The file that has no assigned labels is removed.
    l_extra_txt, l_extra_labels = pu.read_files(data_path, 'dev1')
    l_extra_txt.pop(212)     #text file with no assigned labels
    l_extra_labels.pop(212)  #text file with no assigned labels
    l_extra_labels_ori = copy.deepcopy(l_extra_labels)
    l_extra_labels = pu.convert_labels(l_extra_labels, dict_labels)

    #Stems the data
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')
    print('Stemming trn text...')
    l_stem_text_trn = su.list_stemming(l_trn_txt, stemmer)
    print('Stemming dev text...')
    l_stem_text_dev = su.list_stemming(l_dev_txt, stemmer)
    print('Stemming extra text...')
    l_stem_text_extra = su.list_stemming(l_extra_txt, stemmer)

    #Writes files
    pu.write_files(l_stem_text_trn, l_trn_labels, l_trn_labels_ori, out_path, 'train')
    pu.write_files(l_stem_text_dev, l_dev_labels, l_dev_labels_ori, out_path, 'test')

    #Joins the extra data to the train data
    for i, j, z in zip(l_stem_text_extra, l_extra_labels, l_extra_labels_ori):
        l_stem_text_trn.append(i)
        l_trn_labels.append(j)
        l_trn_labels_ori.append(z)

    #Writes the larger train set
    pu.write_files(l_stem_text_trn, l_trn_labels, l_trn_labels_ori, out_path, 'train_750')
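
#Illustrative entry point (not in the original script): main(args) above only relies on
#args.input_path and args.output_path, so a minimal argparse wiring could look like the
#sketch below. The flag names, help texts and default are assumptions, not the project's
#actual CLI.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='Preprocess CANTEMIST data for an XMLC model (sketch).')
    parser.add_argument('-i', '--input_path', type=str, required=True,
                        help='Folder containing the CANTEMIST trn/dev1/dev2 data (assumed layout).')
    parser.add_argument('-o', '--output_path', type=str, default='output/',
                        help='Folder where the processed train/test files are written.')
    main(parser.parse_args())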
def main(args):
    finput = args.input_file
    finput_decs = args.input_decs_file
    out_path = args.output_path
    xmlc_alg = args.xmlc_alg
    trn_rat = args.train_ratio
    tst_rat = args.test_ratio
    mer = args.mer
    lexicon = args.mer_lex
    n_cores = args.mer_cores

    #Checks if input files/paths exist
    assert os.path.exists(finput), "Input file/path doesn't exist"
    assert os.path.exists(finput_decs), "DeCS file/path doesn't exist"
    assert os.path.splitext(finput)[-1].lower() == '.json', \
        "Input file isn't a '.json' file. A json file is required."
    assert os.path.splitext(finput_decs)[-1].lower() == '.tsv', \
        "DeCS input file isn't a '.tsv' file. A tsv file is required."
    assert xmlc_alg in ('X-BERT', 'X-Transformer'), \
        "Invalid XMLC algorithm. Valid values: X-BERT, X-Transformer."

    if not os.path.exists(out_path):
        logging.info('Creating path %s' % out_path)
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    #Reads the DeCS terms file and stores the terms in separate lists
    logging.info("Reading DeCS terms file '%s' ..." % finput_decs)
    print("Reading DeCS terms file '%s' ..." % finput_decs)
    decs_data = pd.read_csv(finput_decs, sep='\t')

    #Stores the Spanish terms and the respective DeCS codes in lists
    l_term_spanish = decs_data['Term_Spanish'].astype(str).values.tolist()
    l_decs_code = decs_data['#DeCS_code'].astype(str).values.tolist()

    #Generates vocab and label_correspondence files
    tfu.gen_vocab(l_term_spanish, l_decs_code, out_path)
    #Generates dict with label correspondence
    dict_labels = tfu.gen_dict_label_corr(l_term_spanish, l_decs_code)

    logging.info("Reading MESINESP data '%s' ..." % finput)
    print("Reading MESINESP data '%s' ..." % finput)
    with open(finput, 'r', encoding='utf-8') as json_input:
        data = json.load(json_input)

    df_data = json_normalize(data)
    df_size = len(df_data)

    l_abs_mesinesp = df_data['abstractText'].values.tolist()
    l_title_mesinesp = df_data['title'].values.tolist()
    l_decs_mesinesp = df_data['decsCodes'].values.tolist()
    l_decs_mesinesp, l_decs_names = tfu.convert_labels(l_decs_mesinesp, dict_labels)

    #Checks if all titles have text. Otherwise, they are replaced by the abstracts
    #so that MER doesn't break.
    for i in range(df_size):
        if l_title_mesinesp[i] is None or len(l_title_mesinesp[i]) <= 0:
            if l_abs_mesinesp[i] is not None and len(l_abs_mesinesp[i]) > 0:
                l_title_mesinesp[i] = l_abs_mesinesp[i]

    logging.info('Splitting the data into different sets...')
    print('Splitting the data into different sets...')
    trn_limit = int(df_size * trn_rat / 100)
    tst_limit = int(df_size * tst_rat / 100)

    l_train_decs, l_train_decs_names, l_train_abs, l_train_title, \
        l_test_decs, l_test_decs_names, l_test_abs, l_test_title, \
        l_valid_decs, l_valid_abs, l_valid_title = tfu.split_data(
            l_decs_mesinesp, l_abs_mesinesp, l_title_mesinesp, df_size,
            trn_limit, tst_limit, xmlc_alg, l_decs_names)

    #For titles
    if xmlc_alg == 'X-Transformer':
        l_lists = [(l_train_abs, l_train_title, l_train_decs, out_path + 'train',
                    'train', l_train_decs_names),
                   (l_test_abs, l_test_title, l_test_decs, out_path + 'test',
                    'test', l_test_decs_names)]
    else:  #X-BERT
        l_lists = [(l_train_abs, l_train_title, l_train_decs, out_path + 'train', 'train'),
                   (l_test_abs, l_test_title, l_test_decs, out_path + 'test', 'test'),
                   (l_valid_abs, l_valid_title, l_valid_decs, out_path + 'valid', 'valid')]

    #Generate stemmer
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('spanish')

    for l in l_lists:
        logging.info('Processing %s data...' % l[4])
        print('Processing %s data...' % l[4])
        l_stem_text = []

        if mer:
            l_mer = []
            logging.info('MERing using %s ...' % lexicon)
            print('MERing using %s ...' % lexicon)
            if lexicon == 'decslex':
                l_mer = mu.call_simple_mer(l[0], n_cores)
            else:
                l_mer = mu.call_custom_mer(l[0], lexicon, n_cores)

            #Appends to the titles the corresponding MER terms identified earlier
            for i in range(len(l[1])):
                l[1][i] = l[1][i] + ' ' + str(l_mer[i])

        logging.info('Stemming...')
        print('Stemming...')
        l_stem_text = su.list_stemming(l[1], stemmer)

        logging.info('Writing %s file' % l[3])
        print('Writing %s file' % l[3])
        tfu.write_file(l_stem_text, l[2], l[3])

        if xmlc_alg == 'X-Transformer':
            logging.info('Writing %s raw file' % l[3])
            print('Writing %s raw file' % l[3])
            tfu.write_raw_files(l_stem_text, l[5], l[3])
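
#Plausible sketch (assumption) of a percentage-based split like the one tfu.split_data
#performs with the limits computed above. The real helper also returns the label-name
#lists and handles X-BERT and X-Transformer differently; this toy version only shows
#how absolute cut-offs derived from percentage ratios slice a list into three parts.
def _split_by_limits(items, trn_limit, tst_limit):
    #Slices a list into train / test / validation chunks using absolute cut-offs.
    train = items[:trn_limit]
    test = items[trn_limit:trn_limit + tst_limit]
    valid = items[trn_limit + tst_limit:]
    return train, test, valid

#Example: for a hypothetical 100000-article set with train_ratio=70 and test_ratio=15,
#trn_limit = int(100000 * 70 / 100) = 70000 and tst_limit = 15000, leaving the last
#15000 items for validation.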
def main(args):
    finput_folder = args.input_folder
    finput_mesh = args.input_mesh_file
    finput_bioasq = args.bioasq_file
    out_path = args.output_path
    dtype = args.dtype
    mer = args.mer
    n_cores = args.mer_cores

    assert os.path.exists(finput_folder), "Folder does not exist"
    assert os.path.exists(finput_mesh), "MeSH file/path doesn't exist"
    assert os.path.splitext(finput_mesh)[-1].lower() == '.txt', \
        "MeSH input file isn't a '.txt' file. A txt file is required."
    assert dtype in ('txt', 'json'), "Invalid data type. Valid values: txt, json"

    if not os.path.exists(out_path):
        logging.info('Creating path %s' % out_path)
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    with open(finput_mesh) as mesh_file:  #MeSH_name_id_mapping.txt
        mesh_data = mesh_file.readlines()

    l_mesh_term, l_mesh_code = [], []
    for i in range(len(mesh_data)):
        l_mesh_term.append(mesh_data[i].split('=')[0])
        l_mesh_code.append(mesh_data[i].strip('\n').split('=')[1])

    #Generates vocab and label_correspondence files
    tfu.gen_vocab(l_mesh_term, l_mesh_code, out_path)
    #Generates dict with label correspondence {MeSH term: (label number, MeSH code)}
    dict_labels = gen_dict_corr(l_mesh_term, l_mesh_code)

    with open(finput_bioasq, 'r', encoding='utf-8') as bioasq_input:
        data = json.load(bioasq_input)

    df_bioasq = json_normalize(data['documents'])
    df_size = len(df_bioasq)

    l_mesh_bioasq = [0] * df_size
    l_abs_bioasq = df_bioasq['abstractText'].values.tolist()
    l_title_bioasq = df_bioasq['title'].values.tolist()

    l_mesh, l_title, l_abs = [], [], []

    if dtype == 'json':
        with open(finput_folder + 'bioasq_data_3.json', 'r', encoding='utf-8') as json_file:
            logging.info('Loading json file 3...')
            print('Loading json file 3...')
            data = json.load(json_file)
        df = json_normalize(data)
        df = df.dropna()
        #Stores the values of the codes, abstracts and titles in different lists
        l_mesh = df['meshMajor'].values.tolist()
        l_abs = df['abstractText'].values.tolist()
        l_title = df['title'].values.tolist()
    else:  #txt
        with open(finput_folder + 'bioasq_data_extra.txt', 'r', encoding='utf-8') as txt_file:
            logging.info('Loading txt file...')
            print('Loading txt file...')
            data = txt_file.readlines()
        for l in range(len(data)):
            aux = data[l].split('\t')
            l_mesh.append([aux[0]])
            l_title.append(aux[1])
            l_abs.append(aux[2])
        #Converts the label field from string to list
        for i in range(len(l_mesh)):
            l_mesh[i] = ast.literal_eval(l_mesh[i][0])

    logging.info('Converting labels...')
    print('Converting labels...')
    l_mesh = convert_labels(l_mesh, dict_labels)

    logging.info('Preparing data...')
    print('Preparing data...')
    #This value needs to change if the size of the test.txt file used to train the
    #X-BERT model changes
    CON_TEST_SIZE = 63732
    for i in range(0, CON_TEST_SIZE):
        if i < df_size:
            l_abs_bioasq[i] = l_abs_bioasq[i].replace(',', '').replace('\n', '')
            l_title_bioasq[i] = l_title_bioasq[i].replace('\n', '')
        else:
            l_mesh_bioasq.append(l_mesh[i])
            l_abs_bioasq.append(l_abs[i].replace(',', '').replace('\n', ''))
            l_title_bioasq.append(l_title[i].replace('\n', ''))

    #Generate stemmer
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('english')

    l_lists = [(l_abs_bioasq, l_title_bioasq, l_mesh_bioasq, out_path + 'test', 'test')]

    for l in l_lists:
        logging.info('Processing %s data...' % l[4])
        print('Processing %s data...' % l[4])
        l_stem_text = []

        if mer:
            l_mer = []
            logging.info('MERing using mesh_lex...')
            print('MERing using mesh_lex...')
            l_mer = mu.call_simple_mer(l[0], n_cores, 'meshlex')

            #Appends to the titles the corresponding MER terms identified earlier
            for i in range(len(l[1])):
                l[1][i] = l[1][i] + ' ' + str(l_mer[i])

        logging.info('Stemming...')
        print('Stemming...')
        l_stem_text = su.list_stemming(l[1], stemmer)

        logging.info('Writing %s file' % l[3])
        print('Writing %s file' % l[3])
        tfu.write_file(l_stem_text, l[2], l[3])
def main(args):
    finput_folder = args.input_folder
    finput_mesh = args.input_mesh_file
    out_path = args.output_path
    dtype = args.dtype
    trn_rat = args.trr
    tst_rat = args.tsr
    mer = args.mer
    n_cores = args.mer_cores

    assert os.path.exists(finput_folder), "Folder does not exist"
    assert os.path.exists(finput_mesh), "MeSH file/path doesn't exist"
    assert os.path.splitext(finput_mesh)[-1].lower() == '.txt', \
        "MeSH input file isn't a '.txt' file. A txt file is required."
    assert dtype in ('txt', 'json'), "Invalid data type. Valid values: txt, json"

    if not os.path.exists(out_path):
        logging.info('Creating path %s' % out_path)
        print('Creating path %s' % out_path)
        os.mkdir(out_path)

    with open(finput_mesh) as mesh_file:  #MeSH_name_id_mapping.txt
        mesh_data = mesh_file.readlines()

    l_mesh_term, l_mesh_code = [], []
    for i in range(len(mesh_data)):
        l_mesh_term.append(mesh_data[i].split('=')[0])
        l_mesh_code.append(mesh_data[i].strip('\n').split('=')[1])

    #Generates vocab and label_correspondence files
    tfu.gen_vocab(l_mesh_term, l_mesh_code, out_path)
    #Generates dict with label correspondence {MeSH term: (label number, MeSH code)}
    dict_labels = gen_dict_corr(l_mesh_term, l_mesh_code)

    l_mesh, l_title, l_abs = [], [], []

    if dtype == 'json':
        with open(finput_folder + 'bioasq_data_1.json', 'r', encoding='utf-8') as json_file:
            logging.info('Loading json file 1...')
            print('Loading json file 1...')
            data = json.load(json_file)
        df = json_normalize(data)

        with open(finput_folder + 'bioasq_data_2.json', 'r', encoding='utf-8') as json_file:
            logging.info('Loading json file 2...')
            print('Loading json file 2...')
            data = json.load(json_file)
        df = df.append(json_normalize(data), ignore_index=True)
        df = df.dropna()

        #Stores the values of the codes, abstracts and titles in different lists
        l_mesh = df['meshMajor'].values.tolist()
        l_abs = df['abstractText'].values.tolist()
        l_title = df['title'].values.tolist()
    else:  #txt
        with open(finput_folder + 'bioasq_data.txt', 'r', encoding='utf-8') as txt_file:
            logging.info('Loading txt file...')
            print('Loading txt file...')
            data = txt_file.readlines()
        for l in range(len(data)):
            aux = data[l].split('\t')
            l_mesh.append([aux[0]])
            l_title.append(aux[1])
            l_abs.append(aux[2])
        #Converts the label field from string to list
        for i in range(len(l_mesh)):
            l_mesh[i] = ast.literal_eval(l_mesh[i][0])

    logging.info('Converting labels...')
    print('Converting labels...')
    l_mesh = convert_labels(l_mesh, dict_labels)

    logging.info('Splitting the data into train, test and validation...')
    print('Splitting the data into train, test and validation...')
    CON_LIMIT = 318658
    #Percentage cut-offs, cast to int so they can be used as list indices
    trn_limit = int((CON_LIMIT * trn_rat) / 100)
    tst_limit = int((CON_LIMIT * tst_rat) / 100)

    l_train_mesh, _, l_train_abs, l_train_title, \
        l_test_mesh, _, l_test_abs, l_test_title, \
        l_valid_mesh, l_valid_abs, l_valid_title = tfu.split_data(
            l_mesh, l_abs, l_title, CON_LIMIT, trn_limit, tst_limit, 'X-BERT')

    l_lists = [
        (l_train_abs, l_train_title, l_train_mesh, out_path + 'train', 'train'),
        (l_test_abs, l_test_title, l_test_mesh, out_path + 'test', 'test'),
        (l_valid_abs, l_valid_title, l_valid_mesh, out_path + 'valid', 'valid')
    ]

    #Generate stemmer
    su.check_nltk_punkt()
    stemmer = su.set_stemmer('english')

    for l in l_lists:
        logging.info('Processing %s data...' % l[4])
        print('Processing %s data...' % l[4])
        l_stem_text = []

        if mer:
            l_mer = []
            logging.info('MERing using mesh_lex...')
            print('MERing using mesh_lex...')
            l_mer = mu.call_simple_mer(l[0], n_cores, 'meshlex')

            #Appends to the titles the corresponding MER terms identified earlier
            for i in range(len(l[1])):
                l[1][i] = l[1][i] + ' ' + str(l_mer[i])

        logging.info('Stemming...')
        print('Stemming...')
        l_stem_text = su.list_stemming(l[1], stemmer)

        logging.info('Writing %s file' % l[3])
        print('Writing %s file' % l[3])
        tfu.write_file(l_stem_text, l[2], l[3])
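
#Illustrative sketch (assumption): given the comment above, gen_dict_corr presumably maps
#each MeSH term to a (label number, MeSH code) pair, with label numbers assigned by
#position in the mapping file. The real helper is defined elsewhere in the repository and
#may differ; this toy version only documents the expected shape of dict_labels.
def _gen_dict_corr_sketch(l_mesh_term, l_mesh_code):
    return {term: (str(idx), code)
            for idx, (term, code) in enumerate(zip(l_mesh_term, l_mesh_code))}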