Example #1
def number_by_position(out: Output = Output("{annotation}:misc.number_position"),
                       chunk: Annotation = Annotation("{annotation}"),
                       prefix: str = "",
                       zfill: bool = False,
                       start: int = START_DEFAULT):
    """Number chunks by their position."""
    spans = list(chunk.read_spans())

    def _order(index, _value):
        return spans[index]

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)
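The helper _read_chunks_and_write_new_ordering is not part of this excerpt. Below is a minimal, self-contained sketch of position-based numbering, with plain tuples standing in for the Sparv span objects (illustration only, not the real helper):

def number_spans_by_position(spans, prefix="", zfill=False, start=0):
    """Illustration: assign increasing numbers to spans, ordered by position."""
    order = sorted(range(len(spans)), key=lambda i: spans[i])
    width = len(str(len(spans) - 1 + start)) if zfill else 0
    numbering = [None] * len(spans)
    for rank, index in enumerate(order, start):
        numbering[index] = f"{prefix}{str(rank).zfill(width)}"
    return numbering

# Spans are numbered by their start position, regardless of input order
print(number_spans_by_position([(10, 20), (0, 5), (30, 40)], prefix="s", start=1))
# ['s2', 's1', 's3']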
Example #2
def text_spans(text: Text = Text(),
               chunk: Annotation = Annotation("<token>"),
               out: Output = Output("<token>:misc.word", cls="token:word"),
               keep_formatting_chars: Optional[bool] = Config(
                   "misc.keep_formatting_chars")):
    """Add the text content for each edge as a new annotation."""
    corpus_text = text.read()
    if isinstance(chunk, (str, Annotation)):
        chunk = chunk.read_spans()
    out_annotation = []
    for span in chunk:
        token = corpus_text[span[0]:span[1]]
        if not keep_formatting_chars:
            new_token = util.remove_formatting_characters(token)
            # If this token consists entirely of formatting characters, don't remove them. Empty tokens are bad!
            if new_token:
                token = new_token
        out_annotation.append(token)
    if out:
        out.write(out_annotation)
    else:
        return out_annotation
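At its core, text_spans simply slices the corpus string with each span's character offsets. A self-contained illustration with made-up data:

# Illustration only: extract the surface string for each (start, end) character span
corpus_text = "Hello world !"
spans = [(0, 5), (6, 11), (12, 13)]
print([corpus_text[start:end] for start, end in spans])
# ['Hello', 'world', '!']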
Example #3
def span_as_value(chunk: Annotation, out: Output):
    """Create new annotation, with spans as values."""
    out.write((f"{start}-{end}" for start, end in chunk.read_spans()))
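For reference, the generator above yields one "start-end" string per span. With made-up spans:

# Illustration of the value format written by span_as_value
spans = [(0, 5), (6, 11)]
print([f"{start}-{end}" for start, end in spans])
# ['0-5', '6-11']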
Example #4
def parse_swener_output(sentences: list, token: Annotation, output, out_ne: Output, out_ne_ex: Output,
                        out_ne_type: Output, out_ne_subtype: Output, out_ne_name: Output):
    """Parse the SweNER output and write annotation files."""
    out_ne_spans = []
    out_ex = []
    out_type = []
    out_subtype = []
    out_name = []

    token_spans = list(token.read_spans())

    # Loop through the NE-tagged sentences and parse each one with ElementTree
    for sent, tagged_sent in zip(sentences, output.strip().split(SENT_SEP)):
        xml_sent = "<sroot>" + tagged_sent + "</sroot>"

        # Filter out tags of the form <EnamexXxxXxx>, since they always seem to overlap with <ENAMEX> elements,
        # making the XML invalid.
        xml_sent = re.sub(r"</?Enamex[^>\s]+>", "", xml_sent)
        try:
            root = etree.fromstring(xml_sent)
        except Exception:
            log.warning("Error parsing sentence. Skipping.")
            continue

        # Init token counter; needed to get start_pos and end_pos
        i = 0
        previous_end = 0
        children = list(root.iter())

        try:
            for count, child in enumerate(children):
                start_pos = token_spans[sent[i]][0]
                start_i = i

                # If current child has text, increase token counter
                if child.text:
                    i += len(child.text.strip().split(TOK_SEP))

                    # Extract NE tags and save them in lists
                    if child.tag != "sroot":
                        if start_i < previous_end:
                            pass
                            # log.warning("Overlapping NE elements found; discarding one.")
                        else:
                            end_pos = token_spans[sent[i - 1]][1]
                            previous_end = i
                            span = (start_pos, end_pos)
                            out_ne_spans.append(span)
                            out_ex.append(child.tag)
                            out_type.append(child.get("TYPE"))
                            out_subtype.append(child.get("SBT"))
                            out_name.append(child.text)

                        # If this child has a tail and it doesn't start with a space, or if it has no tail at all
                        # despite not being the last child, it means this NE ends in the middle of a token.
                        if (child.tail and child.tail.strip() and not child.tail[0] == " ") or (
                                not child.tail and count < len(children) - 1):
                            i -= 1
                            # log.warning("Split token returned by name tagger.")

                # If current child has text in the tail, increase token counter
                if child.tail and child.tail.strip():
                    i += len(child.tail.strip().split(TOK_SEP))

                if (child.tag == "sroot" and child.text and not child.text[-1] == " ") or (
                        child.tail and not child.tail[-1] == " "):
                    # The next NE would start in the middle of a token, so decrease the counter by 1
                    i -= 1
        except IndexError:
            log.warning("Error parsing sentence. Skipping.")
            continue

    # Write annotations
    out_ne.write(out_ne_spans)
    out_ne_ex.write(out_ex)
    out_ne_type.write(out_type)
    out_ne_subtype.write(out_subtype)
    out_ne_name.write(out_name)
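A small, self-contained sketch of the XML handling above: strip the invalid <EnamexXxxXxx> wrapper tags and read the remaining ENAMEX elements with ElementTree. The tagged sentence here is a made-up example of the format, not real SweNER output:

import re
import xml.etree.ElementTree as etree

# Made-up tagged sentence; real SweNER output may differ
tagged_sent = 'Hon bor i <EnamexLocPpl><ENAMEX TYPE="LOC" SBT="PPL">Stockholm</ENAMEX></EnamexLocPpl> .'
xml_sent = "<sroot>" + re.sub(r"</?Enamex[^>\s]+>", "", tagged_sent) + "</sroot>"

root = etree.fromstring(xml_sent)
for child in root.iter():
    if child.tag != "sroot":
        print(child.tag, child.get("TYPE"), child.get("SBT"), child.text)
# ENAMEX LOC PPL Stockholm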
Example #5
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             text: Annotation = Annotation("<text>"),
             out_sentence: Output = Output("stanford.sentence", cls="sentence", description="Sentence segments"),
             out_token: Output = Output("stanford.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:stanford.word", cls="token:word", description="Token strings"),
             out_ref: Output = Output("<token>:stanford.ref", description="Token ID relative to sentence"),
             out_baseform: Output = Output("<token>:stanford.baseform", description="Baseforms from Stanford Parser"),
             out_upos: Output = Output("<token>:stanford.upos", cls="token:upos", description="Part-of-speech tags in UD"),
             out_pos: Output = Output("<token>:stanford.pos", cls="token:pos",
                                      description="Part-of-speech tags from Stanford Parser"),
             out_ne: Output = Output("<token>:stanford.ne_type", cls="token:named_entity_type",
                                     description="Named entitiy types from Stanford Parser"),
             out_deprel: Output = Output("<token>:stanford.deprel", cls="token:deprel",
                                         description="Dependency relations to the head"),
             out_dephead_ref: Output = Output("<token>:stanford.dephead_ref", cls="token:dephead_ref",
                                              description="Sentence-relative positions of the dependency heads"),
             binary: BinaryDir = BinaryDir("[stanford.bin]")):
    """Use Stanford Parser to parse and annotate text."""
    args = ["-cp", binary + "/*", "edu.stanford.nlp.pipeline.StanfordCoreNLP",
            "-annotators", "tokenize,ssplit,pos,lemma,depparse,ner",
            "-outputFormat", "conll"]
    process = util.system.call_binary("java", arguments=args, return_command=True)

    # Read corpus_text and text_spans
    text_data = corpus_text.read()
    text_spans = text.read_spans()

    sentence_segments = []
    all_tokens = []

    # Go through text elements and parse them with Stanford Parser
    for text_span in text_spans:
        inputtext = text_data[text_span[0]:text_span[1]]
        # Start a new parser process for each text element; Popen.communicate() can only be called once per process
        process = util.system.call_binary("java", arguments=args, return_command=True)
        stdout, _ = process.communicate(inputtext.encode(util.UTF8))
        processed_sentences = _parse_output(stdout.decode(util.UTF8), lang)

        # Go through output and try to match tokens with input text to get correct spans
        index_counter = text_span[0]
        for sentence in processed_sentences:
            for token in sentence:
                all_tokens.append(token)
                # Get token span
                match = re.match(r"\s*(%s)" % re.escape(token.word), inputtext)
                span = match.span(1)
                token.start = span[0] + index_counter
                token.end = span[1] + index_counter
                # Forward inputtext
                inputtext = inputtext[span[1]:]
                index_counter += span[1]
            # Extract sentence span for current sentence
            sentence_segments.append((sentence[0].start, sentence[-1].end))

    # Write annotations
    out_sentence.write(sentence_segments)
    out_token.write([(t.start, t.end) for t in all_tokens])
    out_ref.write([t.ref for t in all_tokens])
    out_word.write([t.word for t in all_tokens])
    out_baseform.write([t.baseform for t in all_tokens])
    out_upos.write([t.upos for t in all_tokens])
    out_pos.write([t.pos for t in all_tokens])
    out_ne.write([t.ne for t in all_tokens])
    out_dephead_ref.write([t.dephead_ref for t in all_tokens])
    out_deprel.write([t.deprel for t in all_tokens])
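The inner loop above aligns each parser token with its character offsets by matching it against the remaining input text and advancing a counter. A stand-alone sketch of that logic with plain strings (made-up token list):

import re

# Illustration of aligning tokens with character offsets in the original text
inputtext = "Hello,  world!"
tokens = ["Hello", ",", "world", "!"]
offset = 0
spans = []
for word in tokens:
    match = re.match(r"\s*(%s)" % re.escape(word), inputtext)
    start, end = match.span(1)
    spans.append((start + offset, end + offset))
    inputtext = inputtext[end:]   # consume the matched part
    offset += end
print(spans)
# [(0, 5), (5, 6), (8, 13), (13, 14)]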
Example #6
def annotate(
        out_phrase: Output = Output("phrase_structure.phrase",
                                    description="Phrase segments"),
        out_phrase_name: Output = Output(
            "phrase_structure.phrase:phrase_structure.name",
            description="Phrase names"),
        out_phrase_func: Output = Output(
            "phrase_structure.phrase:phrase_structure.func",
            description="Phrase functions"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("<token:word>"),
        sentence: Annotation = Annotation("<sentence>"),
        pos: Annotation = Annotation("<token:pos>"),
        msd: Annotation = Annotation("<token:msd>"),
        ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
        dephead_ref: Annotation = Annotation("<token:dephead_ref>"),
        deprel: Annotation = Annotation("<token:deprel>")):
    """Annotate sentence with phrase structures."""
    sentences, _orphans = sentence.get_children(word)
    token_annotations = list(
        ref.read_attributes([ref, word, pos, msd, dephead_ref, deprel]))
    token_spans = list(token.read_spans())

    def get_token_span(index):
        return token_spans[index]

    nodes = []

    for s in sentences:
        tokenlist = [Token(None)]
        for token_index in s:
            token = token_annotations[token_index]
            tokenlist.append(Token(token))

        # Get PS tree
        sen = Sentence(tokenlist)
        if not sen.is_cyclic():
            tree = convert_sentence(sen).top.to_tree_str()
            # print(pprint.pformat(tree), file=sys.stderr)

            # Make nodes
            children = flatten_tree(tree[1], [])
            log.debug("\n\nSENTENCE:")
            position = 0
            open_elem_stack = []
            for child in children:
                if not child[0].startswith("WORD:"):
                    start_pos = get_token_span(s[position])[0]
                    open_elem_stack.append(child + (start_pos, ))
                    log.debug(
                        f"<phrase name={child[0]} func={child[1]}> {s[position]}"
                    )
                else:
                    # Close nodes
                    while open_elem_stack[-1][2] == child[2]:
                        start_pos = open_elem_stack[-1][3]
                        end_pos = get_token_span(s[position - 1])[1]
                        nodes.append(
                            ((start_pos, end_pos), open_elem_stack[-1][0],
                             open_elem_stack[-1][1]))
                        log.debug(
                            f"</phrase name={open_elem_stack[-1][0]} func={open_elem_stack[-1][1]}> {start_pos}-{end_pos}"
                        )
                        open_elem_stack.pop()
                    position += 1
                    log.debug(f"   {child[0][5:]}")

            # Close remaining open nodes
            end_pos = get_token_span(s[-1])[1]
            for elem in reversed(open_elem_stack):
                start_pos = elem[3]
                nodes.append(((start_pos, end_pos), elem[0], elem[1]))
                log.debug(
                    f"</phrase name={elem[0]} func={elem[1]}> {start_pos}-{end_pos}"
                )

    # Sort nodes
    sorted_nodes = sorted(nodes)

    # Write annotations
    out_phrase.write([i[0] for i in sorted_nodes])
    out_phrase_name.write([i[1] for i in sorted_nodes])
    out_phrase_func.write([i[2] for i in sorted_nodes])
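The open-element stack above maps nested phrase nodes onto character spans taken from their first and last tokens. A much simplified, self-contained sketch of that idea, using a hypothetical nested tree format rather than the flattened tree produced by flatten_tree:

# Illustration only: map a nested phrase tree onto character spans, given token spans
token_spans = [(0, 3), (4, 9), (10, 14)]

def tree_to_nodes(tree, nodes):
    """tree is (label, children), where children are token indices or subtrees."""
    label, children = tree
    indices = []
    for child in children:
        if isinstance(child, int):
            indices.append(child)
        else:
            indices.extend(tree_to_nodes(child, nodes))
    span = (token_spans[indices[0]][0], token_spans[indices[-1]][1])
    nodes.append((span, label))
    return indices

nodes = []
tree_to_nodes(("S", [("NP", [0]), ("VP", [1, 2])]), nodes)
print(sorted(nodes))
# [((0, 3), 'NP'), ((0, 14), 'S'), ((4, 14), 'VP')]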