Esempio n. 1
0
 def predictions_to_human(self,
                          predictions,
                          outputs,
                          data,
                          use_pos,
                          conll=True):
     """Turn predicted dependency graphs into CoNLL-U sentences.

     Args:
         predictions: One graph per sentence. Each graph holds, per token, a
             sequence of ``(head, relation_id)`` pairs. The first pair becomes
             the word's ``head``/``deprel``; the remaining pairs become ``deps``.
         outputs: Collection the built sentences are appended to (mutated in place).
         data: One sequence of cells per sentence, aligned with ``predictions``.
         use_pos: When truthy every cell is unpacked as ``(token, pos)``;
             otherwise the cell itself is the token and the POS is ``None``.
         conll: Accepted but not used by this implementation.
     """
     id_to_rel = self.vocabs['rel'].idx_to_token
     for cells, graph in zip(data, predictions):
         sentence = CoNLLSentence()
         for word_id, (cell, edges) in enumerate(zip(cells, graph), start=1):
             form, tag = cell if use_pos else (cell, None)
             # The first edge is the primary dependency of the word.
             primary_head = edges[0][0]
             primary_rel = id_to_rel[edges[0][1]]
             # Every further edge is kept as an enhanced dependency.
             extra_edges = [(h, id_to_rel[r]) for h, r in edges[1:]]
             word = CoNLLUWord(word_id,
                               form,
                               upos=tag,
                               head=primary_head,
                               deprel=primary_rel,
                               deps=extra_edges)
             sentence.append(word)
         outputs.append(sentence)
Esempio n. 2
0
    def to_conll(self,
                 tok='tok',
                 lem='lem',
                 pos='pos',
                 dep='dep',
                 sdp='sdp') -> Union[CoNLLSentence, List[CoNLLSentence]]:
        """
        Build the :class:`~hanlp_common.conll.CoNLLSentence` view of this document.

        Every field name is first resolved against the document with
        ``prefix_match``. A document whose token field directly holds strings
        is treated as a single sentence; otherwise it is treated as a list of
        sentences.

        Args:
            tok (str): Field name for tok.
            lem (str): Field name for lem.
            pos (str): Field name for upos.
            dep (str): Field name for dependency parsing.
            sdp (str): Field name for semantic dependency parsing.

        Returns:
            A single :class:`~hanlp_common.conll.CoNLLSentence` for a
            single-sentence document, otherwise a ``CoNLLSentenceList`` with one
            sentence per sample. An empty ``CoNLLSentenceList`` is returned when
            the token field is empty.

        """
        tok, lem, pos, dep, sdp = (prefix_match(name, self) for name in (tok, lem, pos, dep, sdp))
        sentences = CoNLLSentenceList()
        if not self[tok]:
            return sentences
        is_single = isinstance(self[tok][0], str)
        # Wrap each field of a single sentence in a list so both layouts share one code path.
        doc = Document((k, [v]) for k, v in self.items()) if is_single else self

        def field_at(sample, name, index):
            # Value of field ``name`` at token ``index``; None when the field is absent or empty.
            column = sample.get(name, None)
            return column[index] if column else None

        for row in zip(*doc.values()):
            sample = dict(zip(doc, row))
            sentence = CoNLLSentence()
            for index, form in enumerate(sample[tok]):
                arc = field_at(sample, dep, index) or (None, None)
                lemma = field_at(sample, lem, index)
                upos = field_at(sample, pos, index)
                head, deprel = arc[0], arc[1]
                graph = field_at(sample, sdp, index)
                # Semantic dependencies are serialized as ``head:rel`` pairs joined by ``|``.
                deps = '|'.join(f'{x[0]}:{x[1]}' for x in graph) if graph else None
                sentence.append(
                    CoNLLUWord(index + 1,
                               form=form,
                               lemma=lemma,
                               upos=upos,
                               head=head,
                               deprel=deprel,
                               deps=deps))
            sentences.append(sentence)
        return sentences[0] if is_single else sentences
Esempio n. 3
0
 def predictions_to_human(self, predictions, outputs, data, use_pos):
     """Convert arc/relation matrices into CoNLL-U sentences carrying only ``deps``.

     Args:
         predictions: One ``(arcs, rels, masks)`` triple per sentence. Row 0 of
             ``arcs`` and ``rels`` is skipped (presumably the root placeholder --
             TODO confirm). ``masks`` is not used here.
         outputs: Collection the built sentences are appended to (mutated in place).
         data: One sequence of cells per sentence, aligned with ``predictions``.
         use_pos: When truthy every cell is unpacked as ``(token, pos)``;
             otherwise the cell itself is the token and the POS is ``None``.
     """
     for cells, (arcs, rels, masks) in zip(data, predictions):
         sentence = CoNLLSentence()
         rows = zip(cells, arcs[1:], rels[1:])
         for word_id, (cell, arc_row, rel_row) in enumerate(rows, start=1):
             form, tag = cell if use_pos else (cell, None)
             # A truthy entry in the arc row marks that column index as a head of this word.
             heads = [h for h in range(len(cells) + 1) if arc_row[h]]
             deps = [(h, self.vocabs['rel'][rel_row[h]]) for h in heads]
             word = CoNLLUWord(word_id, form, upos=tag, head=None, deprel=None, deps=deps)
             sentence.append(word)
         outputs.append(sentence)
Esempio n. 4
0
 def predictions_to_human(self, predictions, outputs, data, use_pos):
     """Convert per-token head/relation predictions into CoNLL sentences.

     Args:
         predictions: One ``(arcs, rels)`` pair per sentence, each aligned
             token by token with the matching entry of ``data``.
         outputs: Collection the built sentences are appended to (mutated in place).
         data: One sequence of cells per sentence.
         use_pos: When truthy every cell is unpacked as ``(token, pos)``;
             otherwise the cell itself is the token and the POS is ``None``.
     """
     for cells, (arcs, rels) in zip(data, predictions):
         sentence = CoNLLSentence()
         aligned = zip(cells, arcs, rels)
         for word_id, (cell, head, rel_id) in enumerate(aligned, start=1):
             form, tag = cell if use_pos else (cell, None)
             # The relation is looked up in the 'rel' vocabulary by its predicted id.
             relation = self.vocabs['rel'][rel_id]
             sentence.append(
                 CoNLLWord(word_id, form, cpos=tag, head=head, deprel=relation))
         outputs.append(sentence)
Esempio n. 5
0
 def prediction_to_human(self, outputs: dict, batch):
     """Yield one ``CoNLLSentence`` per sample of a decoded batch.

     Args:
         outputs: Model outputs. Reads ``arc_preds``, ``rel_preds`` and the
             ``upos``/``feats``/``lemmas`` entries of ``class_probabilities``.
         batch: The input batch; ``batch['token']`` supplies the word forms.

     Yields:
         A ``CoNLLSentence`` of ``CoNLLUWord`` objects with form, lemma, UPOS,
         feats, head and deprel filled in.
     """
     arcs, rels = outputs['arc_preds'], outputs['rel_preds']
     class_probs = outputs['class_probabilities']

     def best_ids(task):
         # Skip position 0 on the second axis (presumably the root slot --
         # TODO confirm), then take the argmax over the last axis.
         return class_probs[task][:, 1:, :].argmax(-1).tolist()

     upos_ids = best_ids('upos')
     feat_ids = best_ids('feats')
     lemma_ids = best_ids('lemmas')
     lemma_rules = self.vocabs['lemma'].idx_to_token
     pos_tags = self.vocabs['pos'].idx_to_token
     feat_tags = self.vocabs['feat'].idx_to_token
     # noinspection PyCallByClass,PyTypeChecker
     trees = BiaffineDependencyParser.prediction_to_head_rel(
         self, arcs, rels, batch)
     samples = zip(trees, batch['token'], lemma_ids, upos_ids, feat_ids)
     for tree, tokens, sent_lemma_ids, sent_pos_ids, sent_feat_ids in samples:
         # The first token is dropped so the forms line up with the tree.
         tokens = tokens[1:]
         assert len(tokens) == len(tree)
         # Each predicted lemma id names a rule that is applied to the form.
         sent_lemmas = [
             apply_lemma_rule(t, lemma_rules[r])
             for t, r in zip(tokens, sent_lemma_ids)
         ]
         sent_pos = [pos_tags[x] for x in sent_pos_ids]
         sent_feats = [feat_tags[x] for x in sent_feat_ids]
         words = []
         columns = zip(tokens, tree, sent_lemmas, sent_pos, sent_feats)
         for word_id, (fo, (a, r), l, p, fe) in enumerate(columns, start=1):
             words.append(
                 CoNLLUWord(id=word_id,
                            form=fo,
                            lemma=l,
                            upos=p,
                            feats=fe,
                            head=a,
                            deprel=r))
         yield CoNLLSentence(words)
Esempio n. 6
0
def convert_conll_to_conllu(path):
    """Rewrite a CoNLL file as ``.conllu`` with the primary arc moved into ``deps``.

    The output is written next to the input, using the input's path with its
    extension replaced by ``.conllu``. For every word that has no ``deps``,
    its ``(head, deprel)`` pair becomes the sole ``deps`` entry and ``head``
    and ``deprel`` are cleared.

    Args:
        path: Path of the file to load via ``CoNLLSentence.from_file``.
    """
    sentences = CoNLLSentence.from_file(path, conllu=True)
    target = os.path.splitext(path)[0] + '.conllu'
    with open(target, 'w') as out:
        for sentence in sentences:
            for word in sentence:
                if word.deps:
                    # Words that already carry deps are left untouched.
                    continue
                word.deps = [(word.head, word.deprel)]
                word.head = None
                word.deprel = None
            # Sentences are separated by one empty line.
            out.write(str(sentence) + '\n\n')
Esempio n. 7
0
 def XY_to_inputs_outputs(self,
                          X: Union[tf.Tensor, Tuple[tf.Tensor]],
                          Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                          gold=False,
                          inputs=None,
                          conll=True) -> Iterable:
     """Pair the original inputs with decoded predictions, sentence by sentence.

     Args:
         X: A ``(words, feats, mask)`` triple; only ``mask`` is used here.
         Y: An ``(arc_preds, rel_preds)`` pair.
         gold: Accepted but not used by this implementation.
         inputs: Per-sentence sequences of ``(form, cpos)`` pairs. It is
             iterated directly, so ``None`` is not a usable value despite
             being the default.
         conll: When truthy each token becomes a ``CoNLLWord``; otherwise a
             plain ``[heads, deprels]`` list is appended instead.

     Returns:
         A list with one ``CoNLLSentence`` per input sentence.
     """
     (words, feats, mask), (arc_preds, rel_preds) = X, Y
     decoded = self.Y_to_outputs((arc_preds, rel_preds, mask))
     sentences = []
     for cells, arcs in zip(inputs, decoded):
         sentence = CoNLLSentence()
         for word_id, ((form, cpos), pred) in enumerate(zip(cells, arcs), start=1):
             # Every token carries a sequence of (head, deprel) pairs; split it into two lists.
             heads = [p[0] for p in pred]
             deprels = [p[1] for p in pred]
             if not conll:
                 sentence.append([heads, deprels])
                 continue
             sentence.append(
                 CoNLLWord(id=word_id,
                           form=form,
                           cpos=cpos,
                           head=heads,
                           deprel=deprels))
         sentences.append(sentence)
     return sentences
Esempio n. 8
0
def _make_ptm():
    """Split the raw PTM v1 corpus into ``train``/``dev``/``test`` ``.conllx`` files.

    The raw resource is read as blank-line separated records of four
    whitespace-tokenized lines: tokens, POS tags, relations and heads, in that
    order. The three files are written into the directory of the raw
    resource. Nothing is done when all three files already exist.

    A trailing record that is not followed by a blank line is also converted,
    so the last sentence is kept even if the file ends without an empty line.

    Raises:
        ValueError: If a record does not consist of exactly four lines.
    """
    raw = get_resource(PTM_V1_RAW)
    home = os.path.dirname(raw)
    parts = ['train', 'dev', 'test']
    # Skip the conversion when every split has been generated before.
    if all(os.path.isfile(os.path.join(home, f'{part}.conllx')) for part in parts):
        return

    def to_sentence(lines):
        # One record = four parallel lines: forms, POS tags, relations, heads.
        tok, pos, rel, arc = [x.split() for x in lines]
        sent = CoNLLSentence()
        for i, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc)):
            sent.append(CoNLLWord(i + 1, form=t, cpos=p, head=a, deprel=r))
        return sent

    sents = []
    with open(raw) as src:
        buffer = []
        for line in src:
            line = line.strip()
            if line:
                buffer.append(line)
            elif buffer:
                sents.append(to_sentence(buffer))
                buffer.clear()
        # Flush the final record: without this the last sentence is lost
        # whenever the file does not end with a blank line.
        if buffer:
            sents.append(to_sentence(buffer))

    prev_offset = 0
    # Sentences 12001-13000 and 13001-14463 are used as the development and test set, respectively. The remaining
    # sentences are used as training data.
    for part, offset in zip(parts, [12000, 13000, 14463]):
        with open(os.path.join(home, f'{part}.conllx'), 'w') as out:
            portion = sents[prev_offset:offset]
            cprint(f'[yellow]{len(portion)}[/yellow] sentences [cyan][{prev_offset + 1}:{offset})[/cyan] in {part}')
            for sent in portion:
                out.write(str(sent) + '\n\n')
        prev_offset = offset
Esempio n. 9
0
def merge_head_deprel_with_2nd(sample: dict):
    """Fold the primary arcs and relations of a sample into its 2nd-order matrices.

    For every position ``i`` except 0, ``arc_2nd[i][arc]`` is set to ``True``
    and ``rel_2nd[i][arc]`` to the primary relation. If that cell was already
    set with a different relation, a warning containing a markdown dump of
    the sample is emitted before it is overwritten.

    Args:
        sample: A sample dict. It is modified in place only when it has an
            ``'arc'`` key, in which case ``'rel'``, ``'arc_2nd'`` and
            ``'rel_2nd'`` are read as well.

    Returns:
        The same ``sample`` object.
    """
    if 'arc' not in sample:
        return sample
    second_arcs = sample['arc_2nd']
    second_rels = sample['rel_2nd']
    for idx, (head, label) in enumerate(zip(sample['arc'], sample['rel'])):
        if idx == 0:
            # Position 0 is never merged.
            continue
        conflicting = second_arcs[idx][head] and second_rels[idx][head] != label
        if conflicting:
            sample_str = CoNLLSentence.from_dict(sample, conllu=True).to_markdown()
            warnings.warn(
                f'The main dependency conflicts with 2nd dependency at ID={idx}, '
                'which means joint mode might not be suitable. '
                f'The sample is\n{sample_str}')
        second_arcs[idx][head] = True
        second_rels[idx][head] = label
    return sample
Esempio n. 10
0
 def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]], Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                          gold=False, inputs=None, conll=True, arc_scores=None, rel_scores=None) -> Iterable:
     """Pair the inputs with decoded ``(head, deprel)`` predictions, sentence by sentence.

     Args:
         X: A ``(words, feats, mask)`` triple; also passed to ``X_to_inputs``
             when ``inputs`` is ``None``.
         Y: An ``(arc_preds, rel_preds)`` pair.
         gold: Accepted but not used by this implementation.
         inputs: Per-sentence sequences of cells; recovered from ``X`` when ``None``.
         conll: Falsy to append plain ``[head, deprel]`` lists, ``'.conll'`` to
             build ``CoNLLWord`` objects, any other truthy value to build
             ``CoNLLUWord`` objects.
         arc_scores: Accepted but not used by this implementation.
         rel_scores: Accepted but not used by this implementation.

     Returns:
         A list with one ``CoNLLSentence`` per input sentence.
     """
     (words, feats, mask), (arc_preds, rel_preds) = X, Y
     if inputs is None:
         inputs = self.X_to_inputs(X)
     decoded = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs)
     sentences = []
     for cells, arcs in zip(inputs, decoded):
         sentence = CoNLLSentence()
         for word_id, (cell, (head, deprel)) in enumerate(zip(cells, arcs), start=1):
             # Cells are (form, pos) pairs only when POS is an input feature
             # and the 'joint_pos' config entry is not set.
             has_pos = self.use_pos and not self.config.get('joint_pos', None)
             form, cpos = cell if has_pos else (cell, None)
             if not conll:
                 item = [head, deprel]
             elif conll == '.conll':
                 item = CoNLLWord(id=word_id, form=form, cpos=cpos, head=head, deprel=deprel)
             else:
                 item = CoNLLUWord(id=word_id, form=form, upos=cpos, head=head, deprel=deprel)
             sentence.append(item)
         sentences.append(sentence)
     return sentences