Example #1
    def save(self):
        """Save text data and annotation files to disk."""
        text = unicodedata.normalize("NFC", "".join(self.text))
        Text(self.doc).write(text)
        structure = []
        header_elements = []

        for element in self.data:
            is_header = False
            spans = []
            attributes = {attr: [] for attr in self.data[element]["attrs"]}
            for instance in self.data[element]["elements"]:
                start, start_subpos, end, end_subpos, _original_element, attrs = instance
                spans.append(((start, start_subpos), (end, end_subpos)))
                for attr in attributes:
                    attributes[attr].append(attrs.get(attr, ""))

            full_element = "{}.{}".format(self.prefix,
                                          element) if self.prefix else element

            if element in self.header_elements:
                is_header = True
                header_elements.append(full_element)
            else:
                structure.append(full_element)

            # Sort spans and annotations by span position (required by Sparv)
            if attributes:
                attr_names, attr_values = list(zip(*attributes.items()))
                spans, *attr_values = list(
                    zip(*sorted(zip(spans, *attr_values), key=lambda x: x[0])))
                attributes = dict(zip(attr_names, attr_values))
            else:
                spans.sort()

            Output(full_element, doc=self.doc).write(spans)

            for attr in attributes:
                full_attr = "{}.{}".format(self.prefix,
                                           attr) if self.prefix else attr
                Output("{}:{}".format(full_element, full_attr),
                       doc=self.doc).write(attributes[attr],
                                           allow_newlines=is_header)
                if element not in self.header_elements:
                    structure.append("{}:{}".format(full_element, full_attr))

        # Save list of all elements and attributes to a file (needed for export)
        SourceStructure(self.doc).write(structure)

        if header_elements:
            # Save list of all header elements to a file
            Headers(self.doc).write(header_elements)
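The sorting step above relies on a zip/sort/unzip idiom to keep the span list and every parallel attribute list aligned while ordering by span position. A minimal stand-alone sketch with made-up data (not part of the original module) illustrates the effect:

spans = [((5, 0), (9, 0)), ((0, 0), (4, 0))]
attributes = {"type": ["b", "a"], "id": ["2", "1"]}

# Zip spans together with each attribute list, sort by span, then unzip again
attr_names, attr_values = list(zip(*attributes.items()))
spans, *attr_values = list(
    zip(*sorted(zip(spans, *attr_values), key=lambda x: x[0])))
attributes = dict(zip(attr_names, attr_values))

print(spans)       # (((0, 0), (4, 0)), ((5, 0), (9, 0)))
print(attributes)  # {'type': ('a', 'b'), 'id': ('1', '2')}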
Example #2
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             conf_file: Model = Model("[freeling.conf]"),
             fl_binary: Binary = Binary("[freeling.binary]"),
             sentence_chunk: Optional[Annotation] = Annotation("[freeling.sentence_chunk]"),
             out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
             out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
             out_upos: Output = Output("<token>:freeling.upos", cls="token:upos", description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                      description="Part-of-speeches from FreeLing"),
             out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence", description="Sentence segments"),
             sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output sentences, tokens, baseforms, upos and pos."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos, out_pos,
         out_sentence, sentence_annotation)
Example #3
def sentence(
        text: Text = Text(),
        out: Output = Output("segment.sentence",
                             cls="sentence",
                             description="Sentence segments"),
        chunk: Optional[Annotation] = Annotation("[segment.sentence_chunk]"),
        segmenter: str = Config("segment.sentence_segmenter"),
        existing_segments: Optional[str] = Config(
            "segment.existing_sentences"),
        model: Optional[Model] = Model("[segment.sentence_model]")):
    """Split text into sentences."""
    do_segmentation(text=text,
                    out=out,
                    chunk=chunk,
                    segmenter=segmenter,
                    existing_segments=existing_segments,
                    model=model)
Example #4
def paragraph(
        text: Text = Text(),
        out: Output = Output("segment.paragraph",
                             cls="paragraph",
                             description="Paragraph segments"),
        chunk: Optional[Annotation] = Annotation("[segment.paragraph_chunk]"),
        segmenter: str = Config("segment.paragraph_segmenter"),
        existing_segments: Optional[str] = Config("segment.existing_paragraphs"
                                                  ),
        model: Optional[Model] = None):
    """Split text into paragraphs."""
    do_segmentation(text=text,
                    out=out,
                    chunk=chunk,
                    segmenter=segmenter,
                    existing_segments=existing_segments,
                    model=model)
Example #5
def tokenize(
        text: Text = Text(),
        out: Output = Output("segment.token",
                             cls="token",
                             description="Token segments"),
        chunk: Annotation = Annotation("[segment.token_chunk]"),
        segmenter: str = Config("segment.token_segmenter"),
        existing_segments: Optional[str] = Config("segment.existing_tokens"),
        model: Optional[Model] = Model("[segment.tokenizer_config]"),
        token_list: Optional[Model] = Model("[segment.token_list]")):
    """Tokenize text."""
    do_segmentation(text=text,
                    out=out,
                    chunk=chunk,
                    segmenter=segmenter,
                    existing_segments=existing_segments,
                    model=model,
                    token_list=token_list)
Example #6
def annotate_full(corpus_text: Text = Text(),
                  lang: Language = Language(),
                  conf_file: Model = Model("[freeling.conf]"),
                  fl_binary: Binary = Binary("[freeling.binary]"),
                  sentence_chunk: Annotation = Annotation("[freeling.sentence_chunk]"),
                  out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
                  out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
                  out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
                  out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                            description="Part-of-speeches in UD"),
                  out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                           description="Part-of-speeches from FreeLing"),
                  out_ne_type: Output = Output("<token>:freeling.ne_type", cls="token:named_entity_type",
                                               description="Named entity types from FreeLing"),
                  out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                          description="Sentence segments"),
                  sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output the usual annotations plus named entity types."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos, out_pos,
         out_sentence, sentence_annotation, out_ne_type)
Example #7
def text_headtail(text: Text = Text(),
                  chunk: Annotation = Annotation("<token>"),
                  out_head: Output = Output("<token>:misc.head"),
                  out_tail: Output = Output("<token>:misc.tail")):
    """Extract "head" and "tail" whitespace characters for tokens."""
    def escape(t):
        """Escape whitespace characters."""
        return t.replace(" ", "\\s").replace("\n", "\\n").replace("\t", "\\t")

    out_head_annotation = chunk.create_empty_attribute()
    out_tail_annotation = chunk.create_empty_attribute()
    head_text = None

    corpus_text = text.read()
    chunk = list(chunk.read())

    for i, span in enumerate(chunk):
        if head_text:
            out_head_annotation[i] = escape(head_text)
            head_text = None

        if i < len(chunk) - 1:
            tail_start = span[1][0]
            tail_end = chunk[i + 1][0][0]
            tail_text = corpus_text[tail_start:tail_end]

            try:
                n_pos = tail_text.rindex("\n")
            except ValueError:
                n_pos = None
            if n_pos is not None and n_pos + 1 < len(tail_text):
                head_text = tail_text[n_pos + 1:]
                tail_text = tail_text[:n_pos + 1]

            if tail_text:
                out_tail_annotation[i] = escape(tail_text)

    out_head.write(out_head_annotation)
    out_tail.write(out_tail_annotation)
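The head/tail split above hinges on one rule: the whitespace between two tokens is cut at its last newline, so everything up to and including that newline stays as the current token's tail, while the remainder becomes the next token's head. A toy sketch with made-up data (not the original helper) shows the mechanics:

tail_text = " \n  "                 # whitespace between two tokens
n_pos = tail_text.rindex("\n")      # index of the last newline
head_text = tail_text[n_pos + 1:]   # "  "  -> head of the next token
tail_text = tail_text[:n_pos + 1]   # " \n" -> tail of the current token

def escape(t):
    """Escape whitespace characters, as in text_headtail()."""
    return t.replace(" ", "\\s").replace("\n", "\\n").replace("\t", "\\t")

print(escape(tail_text))  # \s\n
print(escape(head_text))  # \s\s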
Example #8
def text_spans(text: Text = Text(),
               chunk: Annotation = Annotation("<token>"),
               out: Output = Output("<token>:misc.word", cls="token:word"),
               keep_formatting_chars: Optional[bool] = Config(
                   "misc.keep_formatting_chars")):
    """Add the text content for each edge as a new annotation."""
    corpus_text = text.read()
    if isinstance(chunk, (str, Annotation)):
        chunk = chunk.read_spans()
    out_annotation = []
    for span in chunk:
        token = corpus_text[span[0]:span[1]]
        if not keep_formatting_chars:
            new_token = util.remove_formatting_characters(token)
            # If this token consists entirely of formatting characters, don't remove them. Empty tokens are bad!
            if new_token:
                token = new_token
        out_annotation.append(token)
    if out:
        out.write(out_annotation)
    else:
        return out_annotation
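The example above calls util.remove_formatting_characters, whose implementation is not shown here. Assuming it strips Unicode "format" characters (category Cf, such as soft hyphens and zero-width spaces), a hypothetical stand-in would look roughly like the sketch below; the guard in text_spans() then keeps the original token whenever stripping would leave it empty:

import unicodedata

def remove_formatting_characters(s):
    # Assumed behaviour, not the actual Sparv helper: drop Unicode
    # category Cf characters (soft hyphen, zero-width space, ...).
    return "".join(c for c in s if unicodedata.category(c) != "Cf")

print(remove_formatting_characters("co\u00adoperate"))  # "cooperate"
print(remove_formatting_characters("\u200b"))           # "" -> keep original token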
Example #9
def preserved_format(
        doc: Document = Document(),
        text: Text = Text(),
        docid: AnnotationData = AnnotationData("<docid>"),
        out: Export = Export(
            "xml_preserved_format/[xml_export.filename_formatted]"),
        annotations: ExportAnnotations = ExportAnnotations(
            "xml_export.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "xml_export.source_annotations"),
        header_annotations: SourceAnnotations = SourceAnnotations(
            "xml_export.header_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        include_empty_attributes: bool = Config(
            "xml_export.include_empty_attributes")):
    """Export annotations to XML in export_dir and keep whitespaces and indentation from original file.

    Args:
        doc: Name of the original document.
        text: The corpus text.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read corpus text and document ID
    corpus_text = text.read()
    docid = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations,
                                                          doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(
        annotation_list,
        export_names,
        h_annotations,
        doc=doc,
        flatten=False,
        split_overlaps=True)
    sorted_positions = [(pos, span[0], span[1])
                        for pos, spans in sorted(span_positions.items())
                        for span in spans]

    # Root tag sanity check
    if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1]):
        raise util.SparvErrorMessage(
            "Root tag is missing! If you have manually specified which elements to include, "
            "make sure to include an element that encloses all other included elements and "
            "text content.")

    # Create root node
    root_span = sorted_positions[0][2]
    root_span.set_node()
    node_stack = []
    last_pos = 0  # Keeps track of the position of the processed text

    for x, (_pos, instruction, span) in enumerate(sorted_positions):
        # Open node: Create child node under the top stack node
        if instruction == "open":
            # Set tail for previous node if necessary
            if last_pos < span.start:
                # Get last closing node in this position
                _, tail_span = [
                    i for i in span_positions[last_pos] if i[0] == "close"
                ][-1]
                tail_span.node.tail = corpus_text[last_pos:span.start]
                last_pos = span.start

            # Handle headers
            if span.is_header:
                header = annotation_dict[span.name][util.HEADER_CONTENTS][
                    span.index]
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                span.node = header_xml
                node_stack[-1].node.append(header_xml)
            else:
                if node_stack:  # Don't create root node, it already exists
                    span.set_node(parent_node=node_stack[-1].node)

                xml_utils.add_attrs(span.node, span.name, annotation_dict,
                                    export_names, span.index,
                                    include_empty_attributes)
                if span.overlap_id:
                    if sparv_namespace:
                        span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}",
                                      f"{docid}-{span.overlap_id}")
                    else:
                        span.node.set(
                            f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                            f"{docid}-{span.overlap_id}")
                node_stack.append(span)

                # Set text if there should be any between this node and the next one
                next_item = sorted_positions[x + 1]
                if next_item[1] == "open" and next_item[2].start > span.start:
                    span.node.text = corpus_text[last_pos:next_item[2].start]
                    last_pos = next_item[2].start

        # Close node
        else:
            if span.is_header:
                continue
            if last_pos < span.end:
                # Set node text if necessary
                if span.start == last_pos:
                    span.node.text = corpus_text[last_pos:span.end]
                # Set tail for previous node if necessary
                else:
                    # Get last closing node in this position
                    _, tail_span = [
                        i for i in span_positions[last_pos] if i[0] == "close"
                    ][-1]
                    tail_span.node.tail = corpus_text[last_pos:span.end]
                last_pos = span.end

            # Make sure closing node == top stack node
            assert span == node_stack[-1], \
                "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Write xml to file
    etree.ElementTree(root_span.node).write(out,
                                            encoding="unicode",
                                            method="xml",
                                            xml_declaration=True)
    log.info("Exported: %s", out)
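preserved_format() reconstructs the original whitespace by assigning it either to a node's .text (content right after an opening tag) or to the previous sibling's .tail (content right after a closing tag). A short reminder of these ElementTree semantics, written against the standard library and independent of the Sparv-specific code above:

from xml.etree import ElementTree as etree

root = etree.Element("doc")
root.text = "Hello "              # text between <doc> and its first child
w = etree.SubElement(root, "w")
w.text = "world"                  # text inside <w>
w.tail = "!\n"                    # text between </w> and </doc>

print(etree.tostring(root, encoding="unicode"))
# <doc>Hello <w>world</w>!
# </doc>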
Example #10
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             text: Annotation = Annotation("<text>"),
             out_sentence: Output = Output("stanford.sentence", cls="sentence", description="Sentence segments"),
             out_token: Output = Output("stanford.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:stanford.word", cls="token:word", description="Token strings"),
             out_ref: Output = Output("<token>:stanford.ref", description="Token ID relative to sentence"),
             out_baseform: Output = Output("<token>:stanford.baseform", description="Baseforms from Stanford Parser"),
             out_upos: Output = Output("<token>:stanford.upos", cls="token:upos", description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:stanford.pos", cls="token:pos",
                                      description="Part-of-speeches from Stanford Parser"),
             out_ne: Output = Output("<token>:stanford.ne_type", cls="token:named_entity_type",
                                     description="Named entity types from Stanford Parser"),
             out_deprel: Output = Output("<token>:stanford.deprel", cls="token:deprel",
                                         description="Dependency relations to the head"),
             out_dephead_ref: Output = Output("<token>:stanford.dephead_ref", cls="token:dephead_ref",
                                              description="Sentence-relative positions of the dependency heads"),
             binary: BinaryDir = BinaryDir("[stanford.bin]")):
    """Use Stanford Parser to parse and annotate text."""
    args = ["-cp", binary + "/*", "edu.stanford.nlp.pipeline.StanfordCoreNLP",
            "-annotators", "tokenize,ssplit,pos,lemma,depparse,ner",
            "-outputFormat", "conll"]
    process = util.system.call_binary("java", arguments=args, return_command=True)

    # Read corpus_text and text_spans
    text_data = corpus_text.read()
    text_spans = text.read_spans()

    sentence_segments = []
    all_tokens = []

    # Go through text elements and parse them with Stanford Parser
    for text_span in text_spans:
        inputtext = text_data[text_span[0]:text_span[1]]
        stdout, _ = process.communicate(inputtext.encode(util.UTF8))
        processed_sentences = _parse_output(stdout.decode(util.UTF8), lang)

        # Go through output and try to match tokens with input text to get correct spans
        index_counter = text_span[0]
        for sentence in processed_sentences:
            for token in sentence:
                all_tokens.append(token)
                # Get token span
                match = re.match(r"\s*(%s)" % re.escape(token.word), inputtext)
                span = match.span(1)
                token.start = span[0] + index_counter
                token.end = span[1] + index_counter
                # Forward inputtext
                inputtext = inputtext[span[1]:]
                index_counter += span[1]
            # Extract sentence span for current sentence
            sentence_segments.append((sentence[0].start, sentence[-1].end))

    # Write annotations
    out_sentence.write(sentence_segments)
    out_token.write([(t.start, t.end) for t in all_tokens])
    out_ref.write([t.ref for t in all_tokens])
    out_word.write([t.word for t in all_tokens])
    out_baseform.write([t.baseform for t in all_tokens])
    out_upos.write([t.upos for t in all_tokens])
    out_pos.write([t.pos for t in all_tokens])
    out_ne.write([t.ne for t in all_tokens])
    out_dephead_ref.write([t.dephead_ref for t in all_tokens])
    out_deprel.write([t.deprel for t in all_tokens])
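The span-matching loop above re-locates each parser token in the remaining input text, converts the match to absolute corpus offsets via index_counter, and then advances past the match. A stand-alone sketch with made-up data (not the original parser output) shows the bookkeeping:

import re

inputtext = "  Hello   world"
index_counter = 10  # pretend this text element starts at corpus offset 10

for word in ["Hello", "world"]:
    match = re.match(r"\s*(%s)" % re.escape(word), inputtext)
    span = match.span(1)
    print(word, (span[0] + index_counter, span[1] + index_counter))
    inputtext = inputtext[span[1]:]   # forward the input text
    index_counter += span[1]

# Hello (12, 17)
# world (20, 25)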