def load_datasets(fnames, lowercase=True):
    """Load CoNLL datasets from the given files.

    Args:
        fnames: iterable of CoNLL file paths.
        lowercase: when True (default), lowercase every non-None token in
            each dataset's 'word' field, converting the dataset in place.

    Returns:
        list of loaded Dataset objects, in the same order as *fnames*.
    """
    datasets = []
    for fn in fnames:
        d = Dataset.load_conll(fn)
        # Fixed: original used the Python 2 `print` statement, which is a
        # syntax error on Python 3 and inconsistent with the rest of this
        # file, which uses the print() function.
        print("\t%d examples in %s" % (len(d), fn))
        if lowercase:
            # None tokens (e.g. padding/skips) must be preserved as-is.
            converters = {
                'word': lambda word_list: [x.lower() if x is not None else None
                                           for x in word_list]
            }
            d.convert(converters, in_place=True)
        datasets.append(d)
    return datasets
def get_counter_for_field(filelist, field):
    """Count token frequencies for one column across several CoNLL files.

    Args:
        filelist: iterable of CoNLL file paths.
        field: name of the dataset column to count (e.g. 'word').

    Returns:
        collections.Counter mapping token -> occurrence count. Tokens equal
        to SKIP_TOKEN or None are excluded from the count.
    """
    c = Counter()
    for fin_name in filelist:
        print('loading {}'.format(fin_name))
        d = Dataset.load_conll(fin_name)
        # Iterate positions of the 'word' column so `field` stays aligned
        # with it (the two columns are parallel per row).
        for row in d:  # removed unused enumerate index
            for j in range(len(row['word'])):
                t = row[field][j]
                # Inverted the original continue/else into one positive test.
                if t != SKIP_TOKEN and t is not None:
                    c[t] += 1
    return c
from stanza.text.dataset import Dataset

# Anonymize entity mentions: wherever a token is marked as the SUBJECT or
# OBJECT of the relation, replace the surface word with an 'NER-<tag>'
# placeholder so downstream models cannot memorize specific entity strings.
# (Removed a stale commented-out copy of this loop that indexed rows instead
# of tokens — it was dead code duplicating an older, incorrect version.)
for fin_name in ['train.conll', 'dev.conll', 'test.conll']:
    fout_name = fin_name.replace('.conll', '.anon.conll')
    print('loading {}'.format(fin_name))
    d = Dataset.load_conll(fin_name)
    print(d)
    for i, row in enumerate(d):
        for j in range(len(row['word'])):
            if row['subj'][j] == 'SUBJECT':
                d.fields['word'][i][j] = 'NER-' + row['subj_ner'][j]
            if row['obj'][j] == 'OBJECT':
                d.fields['word'][i][j] = 'NER-' + row['obj_ner'][j]
    d.write_conll(fout_name)
def test_load_conll(self):
    """Write the CoNLL fixture to a temp file and verify that loading it
    back yields the expected field dictionary."""
    with NamedTemporaryFile() as tmp:
        tmp.write(self.CONLL)
        tmp.flush()  # ensure bytes hit disk before load_conll reads the path
        loaded = Dataset.load_conll(tmp.name)
        self.assertDictEqual(self.CONLL_MOCK, loaded.fields)