Ejemplo n.º 1
0
    def load_file(self, filepath):
        """Stream the sentences of a CoNLL file as column-oriented samples.

        Both ``.conllx`` and ``.conllu`` are supported. Their descriptions can be found in
        :class:`hanlp_common.conll.CoNLLWord` and :class:`hanlp_common.conll.CoNLLUWord` respectively.

        Args:
            filepath: ``.conllx`` or ``.conllu`` file path. Any path that does not end with
                ``.conllu`` is read with the CoNLL-X column names.

        Yields:
            One ``dict`` per sentence, mapping every column name to the list of that
            column's values (one value per row of the sentence). A sentence is skipped
            when ``self._prune`` is set and returns a truthy value for its sample.
        """
        # Both layouts have ten columns; only some of the column names differ.
        # CoNLL-U names: see https://universaldependencies.org/format.html
        conllu_columns = [
            'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD',
            'DEPREL', 'DEPS', 'MISC'
        ]
        conllx_columns = [
            'ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS', 'HEAD',
            'DEPREL', 'PHEAD', 'PDEPREL'
        ]
        columns = conllu_columns if filepath.endswith('.conllu') else conllx_columns

        reader = TimingFileIterator(filepath)
        for count, sent in enumerate(read_conll(reader), start=1):
            # Transpose the sentence: rows of cells -> one list per named column.
            sample = {
                name: [row[col] for row in sent]
                for col, name in enumerate(columns)
            }
            if not (self._prune and self._prune(sample)):
                yield sample
            # Progress is reported for every sentence read, whether it was pruned or not.
            reader.log(f'{count} samples [blink][yellow]...[/yellow][/blink]')
Ejemplo n.º 2
0
 def file_to_inputs(self, filepath: str, gold=True):
     """Read a CoNLL file and yield each sentence as tokens with head/relation lists.

     For ``.conllu`` files, heads and relations come from the DEPS column (column 8,
     ``head:rel`` pairs joined by ``|``) and are combined with the HEAD/DEPREL columns
     according to the ``enhanced_only`` config entry. For any other file, consecutive
     rows sharing the same ID are merged into a single token that carries the heads and
     relations of all those rows.

     Args:
         filepath: Path handed to ``read_conll``; a ``.conllu`` suffix selects the
             CoNLL-U handling.
         gold: Must be truthy, only gold files are supported for now.

     Yields:
         One list per sentence. Each item is ``[form, pos, heads, rels]`` when
         ``self.use_pos`` is truthy and ``[form, heads, rels]`` otherwise.
     """
     assert gold, 'only support gold file for now'
     with_pos = self.use_pos
     is_conllu = filepath.endswith('.conllu')
     enhanced_only = self.config.get('enhanced_only', None)

     def make_token(form, pos, heads, rels):
         # The POS column is only part of the output when POS is in use.
         if with_pos:
             return [form, pos, heads, rels]
         return [form, heads, rels]

     for sent in read_conll(filepath):
         tokens = []
         if is_conllu:
             for row in sent:
                 form, pos, head, deprel = row[1], row[3], row[6], row[7]
                 pairs = [item.split(':', 1) for item in row[8].split('|')]
                 # Keep only entries whose head is a plain non-negative integer; this
                 # drops the '_' placeholder and decimal ids such as '8.1'.
                 numeric = [pair for pair in pairs if pair[0].isdigit()]
                 heads = [int(pair[0]) for pair in numeric]
                 rels = [pair[1] for pair in numeric]
                 # NOTE(review): the membership tests below compare HEAD with ints, so
                 # they presumably rely on read_conll yielding HEAD as int -- confirm.
                 if enhanced_only:
                     # Remove the basic arc so that only the extra enhanced arcs remain.
                     if head in heads:
                         position = heads.index(head)
                         del heads[position]
                         del rels[position]
                 elif head not in heads:
                     # Make sure the basic arc is included alongside the enhanced ones.
                     heads.append(head)
                     rels.append(deprel)
                 tokens.append(make_token(form, pos, heads, rels))
         else:
             last = None  # (ID, form, pos) of the most recently read row
             heads, rels = [], []
             for row in sent:
                 token_id, form, pos, head, deprel = row[0], row[1], row[3], row[6], row[7]
                 if last and token_id != last[0]:
                     # The ID changed: the previous token is complete, so emit it.
                     tokens.append(make_token(last[1], last[2], heads, rels))
                     heads, rels = [], []
                 heads.append(head)
                 rels.append(deprel)
                 last = (token_id, form, pos)
             # Emit the token that was still being accumulated when the rows ran out.
             tokens.append(make_token(last[1], last[2], heads, rels))
         yield tokens
Ejemplo n.º 3
0
def _normalize_ctb_tag(tag):
    """Drop everything after the first ``-`` in a tag and map the ``X`` tag to ``FW``."""
    tag = tag.split('-')[0]  # NT-SHORT ---> NT
    if tag == 'X':  # 铜_NN 30_CD x_X 25_CD x_X 14_CD cm_NT 1999_NT
        tag = 'FW'
    return tag


def make_ctb_tasks(chtbs, out_root, part):
    """Build the ``cws``, ``pos``, ``par`` and ``dep`` files of one CTB split.

    Every tree found in the given files is written three ways: its words joined by
    spaces (``cws/{part}.txt``), one ``word<TAB>tag`` line per word followed by a blank
    line (``pos/{part}.tsv``) and the bracketed tree on a single line
    (``par/{part}.txt``). The ``par`` file is then passed through ``remove_all_ec`` and
    converted to ``dep/{part}.conllx`` with ``convert_to_stanford_dependency_330``;
    finally the tags in columns 3 and 4 of that file are normalized in place.

    Args:
        chtbs: Sized collection of paths to the source tree files (UTF-8).
        out_root: Output directory; the four task sub-directories are created if needed.
        part: Name of the split, used as the file stem of every output file.

    Raises:
        SystemExit: With status 1 if a tree string cannot be parsed; the offending
            string is printed first.
    """
    for task in ['cws', 'pos', 'par', 'dep']:
        os.makedirs(join(out_root, task), exist_ok=True)
    timer = CountdownTimer(len(chtbs))
    par_path = join(out_root, 'par', f'{part}.txt')
    with open(join(out_root, 'cws', f'{part}.txt'), 'w', encoding='utf-8') as cws, \
            open(join(out_root, 'pos', f'{part}.tsv'), 'w', encoding='utf-8') as pos, \
            open(par_path, 'w', encoding='utf-8') as par:
        for f in chtbs:
            with open(f, encoding='utf-8') as src:
                content = src.read()
                trees = split_str_to_trees(content)
                for tree in trees:
                    try:
                        tree = Tree.fromstring(tree)
                    except ValueError:
                        # ``tree`` is still the raw string here; show it and abort.
                        print(tree)
                        sys.exit(1)
                    words = []
                    for word, tag in tree.pos():
                        # Skip leaves tagged -NONE- and leaves without any tag.
                        if tag == '-NONE-' or not tag:
                            continue
                        tag = _normalize_ctb_tag(tag)
                        pos.write('{}\t{}\n'.format(word, tag))
                        words.append(word)
                    cws.write(' '.join(words))
                    # An effectively unlimited margin keeps the whole tree on one line.
                    par.write(tree.pformat(margin=sys.maxsize))
                    # Terminate the record in each of the three files.
                    for fp in cws, pos, par:
                        fp.write('\n')
            timer.log(
                f'Preprocesing the [blue]{part}[/blue] set of CTB [blink][yellow]...[/yellow][/blink]',
                erase=False)
    remove_all_ec(par_path)
    dep_path = join(out_root, 'dep', f'{part}.conllx')
    convert_to_stanford_dependency_330(par_path, dep_path)
    # Load every sentence before reopening the same path for writing.
    sents = list(read_conll(dep_path))
    # Write UTF-8 explicitly, like every other file here: the platform default
    # encoding may be unable to represent the Chinese tokens.
    with open(dep_path, 'w', encoding='utf-8') as out:
        for sent in sents:
            for cells in sent:
                cells[3] = cells[4] = _normalize_ctb_tag(cells[3])
                out.write('\t'.join(str(x) for x in cells))
                out.write('\n')
            out.write('\n')
Ejemplo n.º 4
0
 def file_to_inputs(self, filepath: str, gold=True):
     """Read a CoNLL file and yield each sentence reduced to the columns that are used.

     Every row of a sentence is replaced, in place, by ``[form, pos, head, deprel]``
     when ``self.use_pos`` is truthy and by ``[form, head, deprel]`` otherwise, taken
     from columns 1, 3, 6 and 7 of the row.

     Args:
         filepath: Path handed to ``read_conll``.
         gold: Must be truthy, only gold files are supported for now.

     Yields:
         The sentence object produced by ``read_conll``, with its rows rewritten as
         described above.
     """
     assert gold, 'only support gold file for now'
     use_pos = self.use_pos
     # NOTE: ``.conllx`` and ``.conllu`` files are handled identically; the DEPS
     # column (index 8) is not consulted.
     for sent in read_conll(filepath):
         for i, cell in enumerate(sent):
             form, cpos, head, deprel = cell[1], cell[3], cell[6], cell[7]
             sent[i] = [form, cpos, head, deprel] if use_pos else [form, head, deprel]
         yield sent