Example #1
    def load_file(self, filepath: tuple):
        phrase_tree_path = get_resource(filepath[0])
        dep_tree_path = get_resource(filepath[1])
        pf = TimingFileIterator(phrase_tree_path)
        message_prefix = f'Loading {os.path.basename(phrase_tree_path)} and {os.path.basename(dep_tree_path)}'
        for i, (dep_sent, phrase_sent) in enumerate(
                zip(read_tsv_as_sents(dep_tree_path), pf)):
            # Somehow the file contains escaped literals
            phrase_sent = phrase_sent.replace('\\/', '/')

            token = [x[1] for x in dep_sent]
            pos = [x[3] for x in dep_sent]
            head = [int(x[6]) for x in dep_sent]
            rel = [x[7] for x in dep_sent]
            phrase_tree = load_trees_from_str(phrase_sent, [head], [rel],
                                              [token])
            assert len(phrase_tree) == 1, \
                f'{phrase_tree_path} must have one tree per line.'
            phrase_tree = phrase_tree[0]

            yield {
                'FORM': token,
                'CPOS': pos,
                'HEAD': head,
                'DEPREL': rel,
                'tree': phrase_tree,
                'hpsg': phrase_tree.convert()
            }
            pf.log(
                f'{message_prefix} {i + 1} samples [blink][yellow]...[/yellow][/blink]'
            )
        pf.erase()
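Every example in this list relies on a `read_tsv_as_sents` helper whose implementation is not shown. Judging from its call sites (one sentence yielded at a time as a list of split rows, plus an optional `ignore_prefix` for comment lines in Example #2), a minimal sketch might look like the following; this assumes whitespace-separated columns and blank lines between sentences, and the real helper in the source codebase may differ:

# Hypothetical sketch of the shared helper, inferred from its call sites.
def read_tsv_as_sents(path, ignore_prefix=None):
    sent = []
    with open(path, encoding='utf-8') as src:
        for line in src:
            line = line.strip()
            if ignore_prefix and line.startswith(ignore_prefix):
                continue  # skip comment lines such as '#begin document ...'
            if line:
                sent.append(line.split())  # one row of cells per token
            elif sent:
                yield sent  # a blank line terminates the current sentence
                sent = []
    if sent:  # flush the last sentence if the file lacks a trailing blank line
        yield sent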
Example #2
    def load_file(self, filepath: str):
        filepath = get_resource(filepath)
        if os.path.isfile(filepath):
            files = [filepath]
        else:
            assert os.path.isdir(filepath), \
                f'{filepath} must be a directory of CoNLL 2012 files'
            files = sorted(
                glob.glob(f'{filepath}/**/*gold_conll', recursive=True))
        timer = CountdownTimer(len(files))
        for fid, f in enumerate(files):
            timer.log('Loading files[blink][yellow]...[/yellow][/blink]')
            # 0:DOCUMENT 1:PART 2:INDEX 3:WORD 4:POS 5:PARSE 6:LEMMA 7:FRAME 8:SENSE 9:SPEAKER 10:NE 11-N:ARGS N:COREF
            for sent in read_tsv_as_sents(f, ignore_prefix='#'):
                sense = [cell[7] for cell in sent]  # '-' for non-predicate tokens
                props = [cell[11:-1] for cell in sent]
                # Transpose: one tuple of ARGS cells per predicate column
                props = list(zip(*props))
                prd_bio_labels = [
                    self._make_bio_labels(prop) for prop in props
                ]
                prd_bio_labels = [self._remove_B_V(x) for x in prd_bio_labels]
                prd_indices = [i for i, x in enumerate(sense) if x != '-']
                token = [x[3] for x in sent]
                srl = [None for x in token]
                for idx, labels in zip(prd_indices, prd_bio_labels):
                    srl[idx] = labels
                srl = [x if x else ['O'] * len(token) for x in srl]
                yield {'token': token, 'srl': srl}
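The one non-obvious step above is the `zip(*props)` transpose: each row of `props` holds one token's ARGS cells, so transposing yields one tuple per predicate column, which `_make_bio_labels` then converts into per-token labels. A tiny worked illustration with hypothetical ARGS cells:

# Rows = tokens, columns = predicates (hypothetical cells).
props = [['(ARG0*', '*'],
         ['*)', '(V*)'],
         ['(V*)', '*']]
list(zip(*props))
# [('(ARG0*', '*)', '(V*)'),  <- all cells of predicate column 1
#  ('*', '(V*)', '*')]        <- all cells of predicate column 2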
Example #3
def bmes_to_flat(inpath, outpath):
    with open(outpath, 'w', encoding='utf-8') as out:
        for sent in read_tsv_as_sents(inpath):
            chunks = get_entities([cells[1] for cells in sent])
            chars = [cells[0] for cells in sent]
            # chunks are (tag, start, end) spans over chars, end exclusive
            words = [''.join(chars[start:end]) for _, start, end in chunks]
            out.write(' '.join(f'{word}/{tag}' for word, (tag, _, _) in zip(words, chunks)))
            out.write('\n')
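Usage with hypothetical paths; the input is character-per-line TSV with BMES tags, and each sentence comes out as one line of space-separated `word/tag` pairs:

# input.tsv (character<TAB>tag, sentences separated by blank lines):
#   上    B-NR
#   海    E-NR
#   好    S-VA
# bmes_to_flat writes each sentence as one line, e.g. '上海/NR 好/VA'.
bmes_to_flat('input.tsv', 'output.txt')  # hypothetical paths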
Example #4
def ctb_pos_to_text_format(path, delimiter='_'):
    """
    Convert ctb pos tagging corpus from tsv format to text format, where each word is followed by
    its pos tag.
    Args:
        path: File to be converted.
        delimiter: Delimiter between word and tag.
    """
    path = get_resource(path)
    name, ext = os.path.splitext(path)
    with open(f'{name}.txt', 'w', encoding='utf-8') as out:
        for sent in read_tsv_as_sents(path):
            out.write(' '.join([delimiter.join(x) for x in sent]))
            out.write('\n')
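Calling it is a one-liner; the output lands next to the input with a `.txt` extension (path below is hypothetical):

# A TSV sentence with rows like '上海<TAB>NR' becomes the line '上海_NR ...'.
ctb_pos_to_text_format('data/ctb/pos/train.tsv')  # writes data/ctb/pos/train.txt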
Example #5
def make_con_txt(conll_file, output_file):
    with open(output_file, 'w') as out:
        for sent in read_tsv_as_sents(conll_file):
            tree = []
            pos_per_sent = []
            for cell in sent:
                if cell[0] == '#begin' or cell[0] == '#end':
                    continue
                if len(cell) < 8:  # report malformed rows, then skip them
                    print(cell)
                    continue
                filename, sentence_id, token_id, word, POS, parse, framefile, roleset, *_ = cell
                # The PARSE column marks the terminal slot with '*'
                parse = parse.replace('*', f'({POS} {word})')
                tree.append(parse)
                pos_per_sent.append(POS)
            bracketed = ' '.join(tree)
            out.write(bracketed)
            out.write('\n')
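The key trick is the `parse.replace` call: in CoNLL-style files the PARSE column stores only the tree skeleton, with `*` marking where the terminal goes, so substituting `(POS word)` and joining the per-token fragments rebuilds the bracketed tree. For instance:

# PARSE bits for 'The cat sat':  (S(NP*   *)   (VP*))
parse, POS, word = '(S(NP*', 'DT', 'The'
print(parse.replace('*', f'({POS} {word})'))  # (S(NP(DT The)
# After all three tokens: (S(NP(DT The) (NN cat)) (VP(VBD sat)))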
Example #6
def convert_conll03_to_json(file_path):
    dataset = []
    num_docs = [0]  # one-element list so the nested new_doc() can mutate it

    def new_doc():
        doc_key = num_docs[0]
        num_docs[0] += 1
        return {
            'doc_key': doc_key,
            'sentences': [],
            'ner': [],
        }

    doc = new_doc()
    offset = 0
    for cells in read_tsv_as_sents(file_path):
        # A -DOCSTART- marker begins a new document; flush the previous one
        # if it already holds sentences
        if cells[0][0] == '-DOCSTART-' and doc['ner']:
            dataset.append(doc)
            doc = new_doc()
            offset = 0
        sentence = [x[0] for x in cells]
        ner = [x[-1] for x in cells]
        ner = hanlp.utils.span_util.iobes_tags_to_spans(ner)
        adjusted_ner = []
        for label, (span_start, span_end) in ner:
            adjusted_ner.append(
                [span_start + offset, span_end + offset, label])
        doc['sentences'].append(sentence)
        doc['ner'].append(adjusted_ner)
        offset += len(sentence)
    if doc['ner']:
        dataset.append(doc)
    output_path = os.path.splitext(file_path)[0] + '.json'
    with open(output_path, 'w') as out:
        for each in dataset:
            json.dump(each, out)
            out.write('\n')
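Each line of the resulting `.json` file is one document in JSON-lines form. Assuming `iobes_tags_to_spans` follows the usual convention of inclusive token spans, a hypothetical single-sentence record would look like:

# One JSON object per line; spans are [start, end, label] with inclusive,
# document-level token offsets (hypothetical data):
{
    'doc_key': 0,
    'sentences': [['EU', 'rejects', 'German', 'call']],
    'ner': [[[0, 0, 'ORG'], [2, 2, 'MISC']]],
}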