def make_poslist():
    """Collect values via the 'tagged' -> 'poslist' transformer and save them.

    Resets the ``tagged_to_poslist.set`` accumulator, runs
    ``apply_transformer('tagged', 'poslist')``, then writes the accumulated
    values, sorted, one per line, to ``pos_list.txt`` under ``lists_dir``.
    """
    # Start from an empty accumulator so earlier runs do not leak into this one.
    tagged_to_poslist.set = set()
    apply_transformer('tagged', 'poslist')

    target_path = '{}/{}'.format(lists_dir, 'pos_list.txt')
    with open(target_path, 'w') as sink:
        sink.writelines(
            '{}\n'.format(tag) for tag in sorted(tagged_to_poslist.set)
        )
def enumerate_entities(infilename):
    """Yield the tab-separated fields of every non-blank line in a file.

    ``infilename`` is expected to hold one record per line in the form
    "surface <TAB> entity [<TAB> ...]". Each line is stripped of surrounding
    whitespace first; lines that end up empty are skipped. Every remaining
    line is yielded as the list produced by splitting it on tabs.
    """
    with open(infilename) as source:
        stripped_rows = (raw.strip() for raw in source)
        yield from (row.split('\t') for row in stripped_rows if row)
def tagged_to_full(infilename, entities_infilename, outfilename):
    """Align tagged token rows with an entity list and write the merged rows.

    Token rows are read from ``infilename`` (tab-separated, surface form in
    column 3) and entity records from ``entities_infilename`` through
    ``enumerate_entities`` (surface in field 0, label in field 1). Two
    cursors walk the sequences in parallel: ``i`` over tokens and ``j`` over
    entities. Whenever the token cursor advances, the current token row, with
    an entity label or ``'O'`` added, is written tab-separated to
    ``outfilename``.

    Mismatching (token surface, entity surface) pairs are printed to stdout,
    as are any tokens or entities left over when the loop ends.

    Raises:
        Exception: once more than 10 mismatches have been seen; the message
            is the current ``next_phrase``.
    """
    from string import punctuation

    def flat(st):
        # Normalise a surface for comparison: map Hebrew final-form letters
        # to their regular forms and drop ASCII punctuation, spaces, tabs and
        # newlines.
        d = {
            'ך': 'כ',
            'ם': 'מ',
            'ן': 'נ',
            'ף': 'פ',
            'ץ': 'צ',
        }
        return ''.join(d.get(c, c) for c in st if c not in punctuation + ' \t\n')

    with open(outfilename, 'w') as outfile:
        # Read the token file inside a context manager so the handle is
        # closed as soon as the lines are loaded.
        with open(infilename) as infile:
            tokens = list(infile)
        entities = list(enumerate_entities(entities_infilename))
        i = j = 0
        missed = 0
        while i < len(tokens) and j < len(entities):
            token_list = tokens[i].rstrip().split('\t')
            entity = entities[j]
            token_surface = token_list[3]
            b = True  # stays True until one of the two matching rules fires
            old_i = i
            # NOTE(review): this slices the entity list with the *token*
            # cursor ``i``; confirm that is intended rather than ``j``.
            next_phrase = ''.join(e[0] for e in entities[i - 3:i])

            # Rule 1: the entity surface occurs inside the token surface.
            # NOTE(review): ``flat`` already removes '-', so ``.split('-')``
            # yields a single element and the ``any(...)`` clause repeats the
            # first test.
            if flat(entity[0]) in flat(token_surface) or any(x in flat(token_surface) for x in flat(entity[0]).split('-')):
                token_list.append(entity[1])
                j += 1
                b = False

            # Rule 2: the token surface occurs inside the entity surface (or
            # equals a quoted ``next_phrase``); the token cursor advances.
            if flat(token_surface) in flat(entity[0]) or '"' in next_phrase and next_phrase == token_surface:
                if b:
                    # Only rule 2 matched: insert the previous entity's label
                    # at column 4, or 'O' for punctuation tokens (a '-' keeps
                    # the label when both neighbouring labels agree).
                    # NOTE(review): ``entities[j - 1]`` wraps to the last
                    # entity when j == 0, and ``entities[j + 1]`` raises
                    # IndexError when entity j is the last one.
                    token_list.insert(4, entities[j - 1][1] if token_surface not in punctuation or token_surface == '-' and entities[j - 1][1] == entities[j + 1][1] else 'O')
                i += 1
                b = False

            if b:
                # Neither rule matched: report the pair, skip this entity and
                # abort once too many mismatches have accumulated.
                token_list.append('O')
                print(token_surface, entity[0])
                missed += 1
                if missed > 10:
                    raise Exception(next_phrase)
                j += 1

            # A row is emitted only when the token cursor moved.
            if old_i != i:
                print(*token_list, file=outfile, sep='\t')

        # Report whatever was left unconsumed on either side.
        if tokens[i:]:
            print(tokens[i:])
        if entities[j:]:
            print(entities[j:])
def xml_to_tagged(infilename, outfilename):
    """Write every item produced by ``MilaText(infilename)`` as a TSV row.

    Each item is unpacked and its elements are written to ``outfilename``
    joined by tabs, one item per line.
    """
    with open(outfilename, 'w') as sink:
        for fields in MilaText(infilename):
            sink.write('\t'.join(map(str, fields)) + '\n')
def tagged_to_poslist(infilename, _ignore):
    """Add column 10 of every line in ``infilename`` to the shared set.

    Each line is split on tabs and the whitespace-stripped value at index 10
    is added to ``tagged_to_poslist.set``, which the caller must have
    initialised beforehand. The second argument is unused.
    """
    with open(infilename) as source:
        for row in source:
            columns = row.split('\t')
            tagged_to_poslist.set.add(columns[10].strip())
def wrap(infilename, outfilename):
    """Open both paths and hand the file objects to ``func``.

    ``infilename`` is opened for reading and ``outfilename`` for writing;
    the result of ``func(infile, outfile)`` is returned. Both files are
    closed when the call finishes.
    """
    with open(infilename) as infile:
        with open(outfilename, 'w') as outfile:
            return func(infile, outfile)