コード例 #1
0
def make_poslist():
    """Write the values gathered in ``tagged_to_poslist.set`` to a list file.

    Resets ``tagged_to_poslist.set`` to an empty set, calls
    ``apply_transformer('tagged', 'poslist')`` (presumably this is what runs
    ``tagged_to_poslist`` over the tagged files -- confirm against its
    definition), and then writes the sorted members of the set, one per
    line, to ``pos_list.txt`` under ``lists_dir``.
    """
    tagged_to_poslist.set = set()
    apply_transformer('tagged', 'poslist')
    target_path = '{}/{}'.format(lists_dir, 'pos_list.txt')
    with open(target_path, 'w') as outfile:
        outfile.writelines(
            str(tag) + '\n' for tag in sorted(tagged_to_poslist.set)
        )
コード例 #2
0
def enumerate_entities(infilename):
    """Yield the tab-separated fields of every non-blank line in a file.

    *infilename* is a file with "{surface}\\t{entity}[\\t...]" format.
    Each line is stripped of surrounding whitespace first; lines that are
    empty after stripping are skipped. Every remaining line is yielded as
    the list produced by splitting it on tab characters.
    """
    with open(infilename) as infile:
        stripped_rows = (raw.strip() for raw in infile)
        yield from (row.split('\t') for row in stripped_rows if row)
コード例 #3
0
def tagged_to_full(infilename, entities_infilename, outfilename):
    """Merge entity labels into tagged token rows and write the result.

    Token rows are read from *infilename* (one tab-separated row per line)
    and entity entries from *entities_infilename* via
    ``enumerate_entities``. Both sequences are walked in parallel with two
    cursors: ``i`` over the tokens and ``j`` over the entities. Column
    index 3 of a token row is treated as the token's surface form and
    element 0 of an entity entry as the entity's surface form; the two are
    compared after normalisation by the local ``flat`` helper. Element 1 of
    an entity entry is the label that gets added to the token row.

    A row is written to *outfilename* (tab-separated) only on iterations in
    which the token cursor advanced. Pairs that match in neither direction
    are printed to stdout and counted. After the loop, any tokens or
    entities that were not consumed are printed to stdout.

    Raises:
        Exception: when more than ten token/entity pairs failed to match;
            the message is the ``next_phrase`` value of that iteration.
    """
    from string import punctuation

    # Maps the five final-form letters to their regular forms so that both
    # spellings compare equal. Built once instead of on every flat() call.
    final_to_regular = {
          'ך':'כ',
         'ם':'מ',
         'ן':'נ',
         'ף':'פ',
         'ץ':'צ',
         }
    # Characters dropped entirely before comparing surfaces.
    ignored = punctuation + ' \t\n'

    def flat(st):
        """Return *st* without punctuation/whitespace and with final letters folded."""
        return ''.join(final_to_regular.get(c, c) for c in st if c not in ignored)

    with open(outfilename, 'w') as outfile:
        # Read the tokens through a context manager so the input handle is
        # closed right away instead of being left to the garbage collector.
        with open(infilename) as infile:
            tokens = list(infile)
        entities = list(enumerate_entities(entities_infilename))
        i = j = 0
        missed = 0
        while i < len(tokens) and j < len(entities):
            token_list = tokens[i].rstrip().split('\t')
            entity = entities[j]
            token_surface = token_list[3]
            # b stays True until one of the two matching branches fires.
            b = True
            old_i = i
            # NOTE(review): this slices ``entities`` with the token cursor
            # ``i`` rather than the entity cursor ``j`` -- confirm intent.
            next_phrase = ''.join(e[0] for e in entities[i - 3:i])
            # Entity surface contained in the token surface: label the token
            # and move on to the next entity.
            # NOTE(review): flat() already removes '-', so the split('-')
            # clause can only repeat the first test -- confirm intent.
            if flat(entity[0]) in flat(token_surface) or any(x in flat(token_surface) for x in flat(entity[0]).split('-')):
                token_list.append(entity[1])
                j += 1
                b = False
            # Token surface contained in the entity surface (or equal to the
            # quoted look-back phrase): the token is consumed.
            if flat(token_surface) in flat(entity[0]) or '"' in next_phrase and next_phrase == token_surface:
                if b:
                    # Only reached when the first branch did not fire, so
                    # entities[j - 1] is the entry before the current one.
                    # NOTE(review): entities[j + 1] is read without a bounds
                    # check and entities[j - 1] wraps around when j == 0.
                    token_list.insert(4, entities[j - 1][1] if token_surface not in punctuation or token_surface == '-' and entities[j - 1][1] == entities[j + 1][1] else 'O')
                i += 1
                b = False
            if b:
                # Neither direction matched: report the pair, skip the entity
                # and give up once too many misses have accumulated.
                token_list.append('O')
                print(token_surface, entity[0])
                missed += 1
                if missed > 10:
                    raise Exception(next_phrase)
                j += 1
            if old_i != i:
                print(*token_list, file=outfile, sep='\t')
        # Report whatever was left unconsumed on either side.
        if tokens[i:]:
            print(tokens[i:])
        if entities[j:]:
            print(entities[j:])
コード例 #4
0
def xml_to_tagged(infilename, outfilename):
    """Write every token produced by ``MilaText(infilename)`` as one row.

    The output file is opened for writing first; each token yielded by
    iterating ``MilaText(infilename)`` is then written as a single line
    with its items converted to strings and joined by tab characters.
    """
    with open(outfilename, 'w') as outfile:
        for fields in MilaText(infilename):
            outfile.write('\t'.join(map(str, fields)) + '\n')
コード例 #5
0
def tagged_to_poslist(infilename, _ignore):
    """Add column 10 of every row in *infilename* to ``tagged_to_poslist.set``.

    Each line is split on tab characters; the field at index 10, stripped
    of surrounding whitespace, is added to the set stored as the ``set``
    attribute on this function. The caller must have assigned that
    attribute beforehand. The second argument is not used.
    """
    with open(infilename) as infile:
        for row in infile:
            # Look up the collector before touching the row, as the
            # one-line form of this statement would.
            remember = tagged_to_poslist.set.add
            columns = row.split('\t')
            remember(columns[10].strip())
コード例 #6
0
 def wrap(infilename, outfilename):
     """Open both paths and hand the file objects to ``func``.

     *infilename* is opened for reading and *outfilename* for writing;
     ``func`` (taken from the enclosing scope) is called with the two open
     file objects and its return value is passed back to the caller.
     """
     with open(infilename) as infile:
         with open(outfilename, 'w') as outfile:
             result = func(infile, outfile)
     return result