def predictions_to_human(self, predictions, outputs, data, use_pos, conll=True):
    rel_vocab = self.vocabs['rel'].idx_to_token
    for d, graph in zip(data, predictions):
        sent = CoNLLSentence()
        for idx, (cell, hrs) in enumerate(zip(d, graph)):
            if use_pos:
                token, pos = cell
            else:
                token, pos = cell, None
            # The first (head, rel) pair is the primary dependency; the rest
            # become secondary arcs in the CoNLL-U DEPS column.
            head = hrs[0][0]
            deprel = rel_vocab[hrs[0][1]]
            deps = [(h, rel_vocab[r]) for h, r in hrs[1:]]
            sent.append(
                CoNLLUWord(idx + 1, token, upos=pos, head=head, deprel=deprel, deps=deps))
        outputs.append(sent)

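# A minimal usage sketch of the data shapes assumed above (the sentence, the
# relation vocabulary and the predictions are all made up for illustration):
# `graph` holds one list of (head, rel_id) pairs per token, whose first pair
# is the primary dependency while the rest land in DEPS.
from hanlp_common.conll import CoNLLSentence, CoNLLUWord

idx_to_token = ['nsubj', 'root', 'obj']         # stand-in for self.vocabs['rel'].idx_to_token
tokens = ['I', 'like', 'tea']                   # hypothetical sentence
graph = [[(2, 0)], [(0, 1)], [(2, 2), (1, 2)]]  # hypothetical per-token (head, rel_id) pairs

sent = CoNLLSentence()
for idx, (token, hrs) in enumerate(zip(tokens, graph)):
    deps = [(h, idx_to_token[r]) for h, r in hrs[1:]]
    sent.append(CoNLLUWord(idx + 1, token, head=hrs[0][0],
                           deprel=idx_to_token[hrs[0][1]], deps=deps))
print(sent)
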
def to_conll(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp') -> Union[CoNLLSentence, List[CoNLLSentence]]:
    """
    Convert to :class:`~hanlp_common.conll.CoNLLSentence`.

    Args:
        tok (str): Field name for tok.
        lem (str): Field name for lem.
        pos (str): Field name for upos.
        dep (str): Field name for dependency parsing.
        sdp (str): Field name for semantic dependency parsing.

    Returns:
        A :class:`~hanlp_common.conll.CoNLLSentence` representation.
    """
    tok = prefix_match(tok, self)
    lem = prefix_match(lem, self)
    pos = prefix_match(pos, self)
    dep = prefix_match(dep, self)
    sdp = prefix_match(sdp, self)
    results = CoNLLSentenceList()
    if not self[tok]:
        return results
    flat = isinstance(self[tok][0], str)
    if flat:
        d = Document((k, [v]) for k, v in self.items())
    else:
        d = self
    for sample in [dict(zip(d, t)) for t in zip(*d.values())]:

        def get(_k, _i):
            _v = sample.get(_k, None)
            if not _v:
                return None
            return _v[_i]

        sent = CoNLLSentence()
        for i, _tok in enumerate(sample[tok]):
            _dep = get(dep, i)
            if not _dep:
                _dep = (None, None)
            sent.append(
                CoNLLUWord(i + 1, form=_tok, lemma=get(lem, i), upos=get(pos, i),
                           head=_dep[0], deprel=_dep[1],
                           deps=None if not get(sdp, i) else '|'.join(
                               f'{x[0]}:{x[1]}' for x in get(sdp, i))))
        results.append(sent)
    if flat:
        return results[0]
    return results

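# Usage sketch with a flat (single-sentence) Document; the field names are the
# defaults above and the annotations are made up for illustration.
from hanlp_common.document import Document

doc = Document(tok=['I', 'like', 'tea'],
               pos=['PRON', 'VERB', 'NOUN'],
               dep=[(2, 'nsubj'), (0, 'root'), (2, 'obj')])
sent = doc.to_conll()  # a single CoNLLSentence, since the input is flat
print(sent)
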
def predictions_to_human(self, predictions, outputs, data, use_pos):
    for d, (arcs, rels, masks) in zip(data, predictions):
        sent = CoNLLSentence()
        for idx, (cell, a, r) in enumerate(zip(d, arcs[1:], rels[1:])):
            if use_pos:
                token, pos = cell
            else:
                token, pos = cell, None
            # Graph parsing: every predicted arc goes into DEPS, while the
            # single-head HEAD/DEPREL columns are left empty.
            heads = [i for i in range(len(d) + 1) if a[i]]
            deprels = [self.vocabs['rel'][r[i]] for i in range(len(d) + 1) if a[i]]
            sent.append(
                CoNLLUWord(idx + 1, token, upos=pos, head=None, deprel=None,
                           deps=list(zip(heads, deprels))))
        outputs.append(sent)

def predictions_to_human(self, predictions, outputs, data, use_pos):
    for d, (arcs, rels) in zip(data, predictions):
        sent = CoNLLSentence()
        for idx, (cell, a, r) in enumerate(zip(d, arcs, rels)):
            if use_pos:
                token, pos = cell
            else:
                token, pos = cell, None
            sent.append(
                CoNLLWord(idx + 1, token, cpos=pos, head=a, deprel=self.vocabs['rel'][r]))
        outputs.append(sent)

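# Hedged note: unlike the CoNLL-U variants above, this parser emits CoNLL-X
# rows, so the POS tag fills the coarse `cpos` column rather than `upos`.
# A minimal, made-up row:
from hanlp_common.conll import CoNLLSentence, CoNLLWord

sent = CoNLLSentence()
sent.append(CoNLLWord(1, 'tea', cpos='NN', head=0, deprel='root'))
print(sent)  # one 10-column CoNLL-X row
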
def prediction_to_human(self, outputs: dict, batch):
    arcs, rels = outputs['arc_preds'], outputs['rel_preds']
    upos = outputs['class_probabilities']['upos'][:, 1:, :].argmax(-1).tolist()
    feats = outputs['class_probabilities']['feats'][:, 1:, :].argmax(-1).tolist()
    lemmas = outputs['class_probabilities']['lemmas'][:, 1:, :].argmax(-1).tolist()
    lem_vocab = self.vocabs['lemma'].idx_to_token
    pos_vocab = self.vocabs['pos'].idx_to_token
    feat_vocab = self.vocabs['feat'].idx_to_token
    # noinspection PyCallByClass,PyTypeChecker
    for tree, form, lemma, pos, feat in zip(
            BiaffineDependencyParser.prediction_to_head_rel(self, arcs, rels, batch),
            batch['token'], lemmas, upos, feats):
        form = form[1:]  # drop the BOS placeholder
        assert len(form) == len(tree)
        lemma = [apply_lemma_rule(t, lem_vocab[r]) for t, r in zip(form, lemma)]
        pos = [pos_vocab[x] for x in pos]
        feat = [feat_vocab[x] for x in feat]
        yield CoNLLSentence([
            CoNLLUWord(id=i + 1, form=fo, lemma=l, upos=p, feats=fe, head=a, deprel=r)
            for i, (fo, (a, r), l, p, fe) in enumerate(zip(form, tree, lemma, pos, feat))
        ])

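# Hedged sketch of the argmax decoding above: each `class_probabilities` entry
# is assumed to be a (batch, 1 + seq_len, num_classes) tensor whose position 0
# is a BOS placeholder, hence the [:, 1:, :] slice before the argmax.
import torch

class_probabilities = torch.rand(2, 4, 10)  # hypothetical batch of 2, 3 real tokens each
ids = class_probabilities[:, 1:, :].argmax(-1).tolist()
print(ids)  # e.g. [[7, 2, 5], [1, 9, 0]] -- one class id per real token
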
def convert_conll_to_conllu(path):
    sents = CoNLLSentence.from_file(path, conllu=True)
    with open(os.path.splitext(path)[0] + '.conllu', 'w') as out:
        for sent in sents:
            for word in sent:
                # Move the single head/deprel pair into the DEPS column.
                if not word.deps:
                    word.deps = [(word.head, word.deprel)]
                word.head = None
                word.deprel = None
            out.write(str(sent))
            out.write('\n\n')

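# Usage sketch: given a hypothetical 'ptb/train.conll' on disk, this writes
# 'ptb/train.conllu' next to it with each head/deprel pair moved into DEPS.
convert_conll_to_conllu('ptb/train.conll')
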
def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]],
                         Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                         gold=False, inputs=None, conll=True) -> Iterable:
    (words, feats, mask), (arc_preds, rel_preds) = X, Y
    xs = inputs
    ys = self.Y_to_outputs((arc_preds, rel_preds, mask))
    sents = []
    for x, y in zip(xs, ys):
        sent = CoNLLSentence()
        for idx, ((form, cpos), pred) in enumerate(zip(x, y)):
            head = [p[0] for p in pred]
            deprel = [p[1] for p in pred]
            if conll:
                sent.append(
                    CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel))
            else:
                sent.append([head, deprel])
        sents.append(sent)
    return sents

def _make_ptm():
    raw = get_resource(PTM_V1_RAW)
    home = os.path.dirname(raw)
    done = True
    for part in ['train', 'dev', 'test']:
        if not os.path.isfile(os.path.join(home, f'{part}.conllx')):
            done = False
            break
    if done:
        return
    sents = []
    with open(raw) as src:
        buffer = []
        for line in src:
            line = line.strip()
            if line:
                buffer.append(line)
            else:
                if buffer:
                    tok, pos, rel, arc = [x.split() for x in buffer]
                    sent = CoNLLSentence()
                    for i, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc)):
                        sent.append(CoNLLWord(i + 1, form=t, cpos=p, head=a, deprel=r))
                    sents.append(sent)
                buffer.clear()
    prev_offset = 0
    # Sentences 12001-13000 and 13001-14463 are used as the development and test set,
    # respectively. The remaining sentences are used as training data.
    for part, offset in zip(['train', 'dev', 'test'], [12000, 13000, 14463]):
        with open(os.path.join(home, f'{part}.conllx'), 'w') as out:
            portion = sents[prev_offset:offset]
            cprint(f'[yellow]{len(portion)}[/yellow] sentences '
                   f'[cyan][{prev_offset + 1}:{offset})[/cyan] in {part}')
            for sent in portion:
                out.write(str(sent) + '\n\n')
        prev_offset = offset

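# Hedged sketch of the raw block format the loop above assumes: four
# whitespace-separated lines per sentence (tokens, POS tags, relations, heads),
# with sentences separated by blank lines. The sentence below is made up.
buffer = ['I like tea',
          'PN VV NN',
          'nsubj root obj',
          '2 0 2']
tok, pos, rel, arc = [x.split() for x in buffer]
for i, (t, p, r, a) in enumerate(zip(tok, pos, rel, arc)):
    print(i + 1, t, p, a, r)  # one CoNLL-X row per token
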
def merge_head_deprel_with_2nd(sample: dict):
    if 'arc' in sample:
        arc_2nd = sample['arc_2nd']
        rel_2nd = sample['rel_2nd']
        for i, (arc, rel) in enumerate(zip(sample['arc'], sample['rel'])):
            if i:  # skip the pseudo root at index 0
                if arc_2nd[i][arc] and rel_2nd[i][arc] != rel:
                    sample_str = CoNLLSentence.from_dict(sample, conllu=True).to_markdown()
                    warnings.warn(f'The main dependency conflicts with the 2nd dependency at ID={i}, '
                                  'which means joint mode might not be suitable. '
                                  f'The sample is\n{sample_str}')
                arc_2nd[i][arc] = True
                rel_2nd[i][arc] = rel
    return sample

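# Minimal sketch of the merge on hypothetical two-token data (index 0 is the
# pseudo root): each token's primary (head, rel) is forced into the 2nd-order
# adjacency so joint decoding always contains it.
sample = {'arc': [0, 2, 0],
          'rel': ['_', 'nsubj', 'root'],
          'arc_2nd': [[False] * 3 for _ in range(3)],
          'rel_2nd': [[None] * 3 for _ in range(3)]}
merge_head_deprel_with_2nd(sample)
print(sample['arc_2nd'][1][2], sample['rel_2nd'][1][2])  # True nsubj
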
def XY_to_inputs_outputs(self, X: Union[tf.Tensor, Tuple[tf.Tensor]],
                         Y: Union[tf.Tensor, Tuple[tf.Tensor]],
                         gold=False, inputs=None, conll=True,
                         arc_scores=None, rel_scores=None) -> Iterable:
    (words, feats, mask), (arc_preds, rel_preds) = X, Y
    if inputs is None:
        inputs = self.X_to_inputs(X)
    ys = self.Y_to_outputs((arc_preds, rel_preds, mask), inputs=inputs)
    sents = []
    for x, y in zip(inputs, ys):
        sent = CoNLLSentence()
        for idx, (cell, (head, deprel)) in enumerate(zip(x, y)):
            if self.use_pos and not self.config.get('joint_pos', None):
                form, cpos = cell
            else:
                form, cpos = cell, None
            if conll:
                # `conll` may carry a file extension: '.conll' selects CoNLL-X
                # rows, anything else truthy selects CoNLL-U rows.
                sent.append(
                    CoNLLWord(id=idx + 1, form=form, cpos=cpos, head=head, deprel=deprel)
                    if conll == '.conll' else
                    CoNLLUWord(id=idx + 1, form=form, upos=cpos, head=head, deprel=deprel))
            else:
                sent.append([head, deprel])
        sents.append(sent)
    return sents