Example #1
def create_parsed_sentences(doc, separate_sentences=True):
    if len(doc) == 0:
        return [ParsedSentence('', [])]
    sentences = []
    begin = 0
    base_idx = doc[0].idx
    if separate_sentences:
        for token in doc:
            # each sentence-initial token (except the very first) closes the previous span
            if token.sent_start and token.i > 0:
                end = token.i
                morphs = [
                    Morph(
                        i,
                        t.idx - base_idx,
                        t.orth_,
                        t.lemma_,
                        t.pos_,
                        t.tag_,
                        ex_attr(t).inf,
                        t.whitespace_,
                    ) for i, t in enumerate(doc[begin:end])
                ]
                # wire heads and dependency labels relative to the sentence start
                for m, t in zip(morphs, doc[begin:end]):
                    m.dep_morph = morphs[t.head.i - begin]
                    m.dep_label = t.dep_.lower()
                sentences.append(
                    ParsedSentence(doc.text[base_idx:token.idx], morphs))
                begin = end
                base_idx = token.idx
    # flush the remaining tokens as the final sentence (or the whole doc when
    # separate_sentences is False)
    if begin < len(doc):
        morphs = [
            Morph(
                i,
                t.idx - base_idx,
                t.orth_,
                t.lemma_,
                t.pos_,
                t.tag_,
                ex_attr(t).inf,
                t.whitespace_,
            ) for i, t in enumerate(doc[begin:])
        ]
        for m, t in zip(morphs, doc[begin:]):
            m.dep_morph = morphs[t.head.i - begin]
            m.dep_label = t.dep_.lower()
        sentences.append(ParsedSentence(doc.text[base_idx:], morphs))
    return sentences
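A hedged usage sketch: it assumes the GiNZA model (ja_ginza) is installed and that create_parsed_sentences, together with the Morph/ParsedSentence/ex_attr helpers it relies on, is importable, since ex_attr(t).inf is a GiNZA-specific token extension; variable names and the sample text are illustrative.

import spacy

nlp = spacy.load("ja_ginza")
doc = nlp("銀座でランチをご一緒しましょう。今度は来週にしよう。")
sentences = create_parsed_sentences(doc, separate_sentences=True)
print(len(sentences))  # one ParsedSentence per detected sentence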
Example #2
def rewrite_with_tokens(self, rewriting_morph_index, tokens):
    # overwrite the morph at rewriting_morph_index in place with the first token
    origin = self.morphs[rewriting_morph_index]
    origin_pos = origin.pos
    t = tokens[0]
    origin.surface = t.orth_
    origin.lemma = t.lemma_
    origin.pos = t.pos_
    origin.tag = t.tag_
    origin.inf = ex_attr(t).inf
    origin.trailing_space = t.whitespace_
    if origin_pos != origin.pos:
        # remember the original POS in the dependency label when the POS changed
        origin.dep_label = '{}_as_{}'.format(origin.dep_label, origin_pos)
    if len(tokens) == 1:
        return
    # the remaining tokens become new morphs attached to the rewritten one
    label = 'as_{}'.format(origin.pos)
    others = [
        Morph(
            rewriting_morph_index + i + 1,
            origin.offset + t.idx - tokens[0].idx,
            t.orth_,
            t.lemma_,
            t.pos_,
            t.tag_,
            ex_attr(t).inf,
            t.whitespace_,
        ) for i, t in enumerate(tokens[1:])
    ]
    # lay the new morphs out sequentially, advancing by each surface length
    # (plus one when a trailing space follows), and hang them off the origin
    offset = origin.offset
    if origin.trailing_space:
        offset += 1
    for m in others:
        m.offset = offset
        offset += len(m.surface)
        if m.trailing_space:
            offset += 1
        m.dep_morph = origin
        m.dep_label = label
    # shift the ids of the following morphs and splice the new morphs in
    for m in self.morphs[rewriting_morph_index + 1:]:
        m.id += len(others)
    self.morphs[rewriting_morph_index + 1:rewriting_morph_index + 1] = others
Example #3
def to_doc(self, vocab, is_parsed=False):
    # rebuild a spaCy Doc (Doc here is spacy.tokens.Doc) from the morphs
    words = [morph.surface for morph in self.morphs]
    spaces = [morph.trailing_space for morph in self.morphs]
    doc = Doc(vocab, words=words, spaces=spaces)
    for token, morph in zip(doc, self.morphs):
        token.tag_ = morph.tag
        token.pos_ = morph.pos
        ex_attr(token).inf = morph.inf
        token.lemma_ = morph.lemma  # workaround: lemma_ must be set after tag_ (a spaCy bug)
        if is_parsed and morph.dep_label:
            # restore the dependency arc recorded on the morph
            token.dep_ = morph.dep_label
            token.head = doc[morph.dep_morph.id]
    return doc
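For context, a minimal self-contained sketch of the plain spaCy API this method builds on: constructing a Doc from words plus trailing-space flags and writing token attributes directly. The sample words and tags are arbitrary.

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["This", "is", "fine"], spaces=[True, True, False])
doc[0].tag_ = "DT"       # fine-grained tag
doc[0].pos_ = "DET"      # coarse UPOS tag
doc[0].lemma_ = "this"   # set after tag_, as the workaround comment above notes
print(doc.text)          # -> "This is fine"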
Example #4
import sys

def unify_range(gold_tokens, start, end, replacing_token, extend_dep_labels):
    dep_outer_id = None
    dep_outer_label = None
    head_pos = None
    # find the single head outside [start, end) (or the root); fail if there are several
    for g in gold_tokens[start:end]:
        head_id = g['id'] + g['head']
        if head_id < start or end <= head_id or g['head'] == 0:
            if dep_outer_id is None:
                dep_outer_id = head_id
                dep_outer_label = g['dep']
                head_pos = g['pos']
            elif dep_outer_id != head_id:
                return False
    if dep_outer_id is None:
        print(gold_tokens[start:end], file=sys.stderr)
        raise Exception('unexpected state')
    elif start < dep_outer_id < end:
        dep_outer_id = start

    # the gold token at `start` absorbs the span, taking replacing_token's attributes
    g = gold_tokens[start]
    g['orth'] = replacing_token.orth_
    g['lemma'] = replacing_token.lemma_
    g['pos'] = replacing_token.pos_
    g['tag'] = replacing_token.tag_
    g['inf'] = ex_attr(replacing_token).inf
    g['whitespace'] = replacing_token.whitespace_ != ''
    g['head'] = dep_outer_id - start
    if dep_outer_label.startswith('as_'):
        g['dep'] = dep_outer_label
    else:
        dep = dep_outer_label.split('_as_')[0]
        g['dep'] = (dep if not extend_dep_labels or head_pos == g['pos']
                    else '{}_as_{}'.format(dep, head_pos))

    # adjust every surviving token's head offset now that [start, end) collapses into one token
    for g in gold_tokens:
        if g['id'] <= start and end <= g['id'] + g['head']:
            g['head'] -= end - start - 1
        elif g['id'] <= start < g['id'] + g['head']:
            g['head'] = start - g['id']
        elif g['id'] + g['head'] <= start and end <= g['id']:
            g['head'] += end - start - 1
        elif g['id'] + g['head'] < end <= g['id']:
            g['head'] = end - g['id'] - 1
    for g in gold_tokens[end:]:
        g['id'] -= end - start - 1
    del gold_tokens[start + 1:end]

    return True
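The index bookkeeping in the second half of the function can be seen in isolation with a standalone sketch (illustrative names and data, not library code): once tokens [start, end) are merged into the token at start, every surviving id and relative head offset has to be adjusted so the tree stays consistent.

def shift_after_merge(tokens, start, end):
    # Mirrors the adjustment loops above on bare {'id', 'head'} dicts.
    removed = end - start - 1
    for g in tokens:
        head_id = g['id'] + g['head']
        if g['id'] <= start and end <= head_id:
            g['head'] -= removed            # head lies at or beyond `end`: the gap closes
        elif g['id'] <= start < head_id:
            g['head'] = start - g['id']     # head was inside the span: point at the merged token
        elif head_id <= start and end <= g['id']:
            g['head'] += removed            # token follows the span, head precedes it
        elif head_id < end <= g['id']:
            g['head'] = end - g['id'] - 1   # token follows the span, head was inside it
    for g in tokens[end:]:
        g['id'] -= removed                  # renumber the tokens after the span
    del tokens[start + 1:end]

toks = [
    {'id': 0, 'head': 3},   # depends on token 3
    {'id': 1, 'head': 2},   # merged token; its head was already re-pointed at token 3
    {'id': 2, 'head': 1},   # will be merged away
    {'id': 3, 'head': 0},   # root
]
shift_after_merge(toks, 1, 3)
print(toks)  # [{'id': 0, 'head': 2}, {'id': 1, 'head': 1}, {'id': 2, 'head': 0}]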
Example #5
    def unify_range(self, start, end, replacing_token):
        # merge morphs [start, end) into a single morph taking replacing_token's attributes
        dep_outer_id = None
        dep_outer_label = None
        head = None
        # find the single head that lies outside [start, end) (or the root inside it);
        # give up if the span depends on more than one outside head
        for m in self.morphs[start:end]:
            if start <= m.dep_morph.id < end:
                if m.dep_morph.id == m.id:
                    # m is its own head, i.e. the root of the span
                    if dep_outer_id is not None:
                        return False
                    else:
                        dep_outer_id = m.id
                        dep_outer_label = m.dep_label
                        head = m
            elif dep_outer_id is not None:
                if dep_outer_id == m.dep_morph.id:
                    head = m
                else:
                    return False
            else:
                dep_outer_id = m.dep_morph.id
                dep_outer_label = m.dep_label
                head = m
        if dep_outer_id is None:
            raise Exception('unexpected state')
        elif start < dep_outer_id < end:
            dep_outer_id = start

        # the first morph of the span absorbs replacing_token's attributes
        origin = self.morphs[start]
        origin.surface = replacing_token.orth_
        origin.lemma = replacing_token.lemma_
        origin.pos = replacing_token.pos_
        origin.tag = replacing_token.tag_
        origin.inf = ex_attr(replacing_token).inf
        origin.trailing_space = replacing_token.whitespace_
        origin.dep_morph = self.morphs[dep_outer_id]
        origin.dep_label = (dep_outer_label if origin.pos == head.pos
                            else '{}_as_{}'.format(dep_outer_label, head.pos))

        # re-attach morphs that depended on a merged-away morph to the merged one,
        # drop the merged-away morphs, and renumber the remaining ids
        for m in self.morphs:
            if start < m.dep_morph.id < end:
                m.dep_morph = origin
        del self.morphs[start + 1:end]
        for m in self.morphs:
            if m.id >= end:
                m.id -= end - start - 1

        return True