def save(self):
    """Save text data and annotation files to disk."""
    text = unicodedata.normalize("NFC", "".join(self.text))
    Text(self.doc).write(text)
    structure = []
    header_elements = []
    for element in self.data:
        is_header = False
        spans = []
        attributes = {attr: [] for attr in self.data[element]["attrs"]}
        for instance in self.data[element]["elements"]:
            start, start_subpos, end, end_subpos, _original_element, attrs = instance
            spans.append(((start, start_subpos), (end, end_subpos)))
            for attr in attributes:
                attributes[attr].append(attrs.get(attr, ""))

        full_element = "{}.{}".format(self.prefix, element) if self.prefix else element
        if element in self.header_elements:
            is_header = True
            header_elements.append(full_element)
        else:
            structure.append(full_element)

        # Sort spans and annotations by span position (required by Sparv)
        if attributes:
            attr_names, attr_values = list(zip(*attributes.items()))
            spans, *attr_values = list(zip(*sorted(zip(spans, *attr_values), key=lambda x: x[0])))
            attributes = dict(zip(attr_names, attr_values))
        else:
            spans.sort()

        Output(full_element, doc=self.doc).write(spans)

        for attr in attributes:
            full_attr = "{}.{}".format(self.prefix, attr) if self.prefix else attr
            Output("{}:{}".format(full_element, full_attr), doc=self.doc).write(attributes[attr],
                                                                                allow_newlines=is_header)
            if element not in self.header_elements:
                structure.append("{}:{}".format(full_element, full_attr))

    # Save list of all elements and attributes to a file (needed for export)
    SourceStructure(self.doc).write(structure)

    if header_elements:
        # Save list of all header elements to a file
        Headers(self.doc).write(header_elements)
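
# Illustrative sketch (toy data, not part of the pipeline): the parallel-sort
# idiom used in save() keeps spans and their index-aligned attribute value
# lists in sync while sorting everything by span position.
def _demo_parallel_sort():
    spans = [((5, 0), (9, 0)), ((0, 0), (4, 0))]
    attr_values = [["b-val", "a-val"]]  # one list per attribute, aligned with spans

    spans, *attr_values = zip(*sorted(zip(spans, *attr_values), key=lambda x: x[0]))
    assert spans == (((0, 0), (4, 0)), ((5, 0), (9, 0)))
    assert attr_values == [("a-val", "b-val")]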
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             conf_file: Model = Model("[freeling.conf]"),
             fl_binary: Binary = Binary("[freeling.binary]"),
             sentence_chunk: Optional[Annotation] = Annotation("[freeling.sentence_chunk]"),
             out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
             out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
             out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                       description="Part-of-speech tags in UD"),
             out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                      description="Part-of-speech tags from FreeLing"),
             out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                     description="Sentence segments"),
             sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output sentences, tokens, baseforms, upos and pos."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform,
         out_upos, out_pos, out_sentence, sentence_annotation)
def sentence(text: Text = Text(),
             out: Output = Output("segment.sentence", cls="sentence", description="Sentence segments"),
             chunk: Optional[Annotation] = Annotation("[segment.sentence_chunk]"),
             segmenter: str = Config("segment.sentence_segmenter"),
             existing_segments: Optional[str] = Config("segment.existing_sentences"),
             model: Optional[Model] = Model("[segment.sentence_model]")):
    """Split text into sentences."""
    do_segmentation(text=text, out=out, chunk=chunk, segmenter=segmenter,
                    existing_segments=existing_segments, model=model)
def paragraph(text: Text = Text(),
              out: Output = Output("segment.paragraph", cls="paragraph", description="Paragraph segments"),
              chunk: Optional[Annotation] = Annotation("[segment.paragraph_chunk]"),
              segmenter: str = Config("segment.paragraph_segmenter"),
              existing_segments: Optional[str] = Config("segment.existing_paragraphs"),
              model: Optional[Model] = None):
    """Split text into paragraphs."""
    do_segmentation(text=text, out=out, chunk=chunk, segmenter=segmenter,
                    existing_segments=existing_segments, model=model)
def tokenize(text: Text = Text(),
             out: Output = Output("segment.token", cls="token", description="Token segments"),
             chunk: Annotation = Annotation("[segment.token_chunk]"),
             segmenter: str = Config("segment.token_segmenter"),
             existing_segments: Optional[str] = Config("segment.existing_tokens"),
             model: Optional[Model] = Model("[segment.tokenizer_config]"),
             token_list: Optional[Model] = Model("[segment.token_list]")):
    """Tokenize text."""
    do_segmentation(text=text, out=out, chunk=chunk, segmenter=segmenter,
                    existing_segments=existing_segments, model=model, token_list=token_list)
def annotate_full(corpus_text: Text = Text(),
                  lang: Language = Language(),
                  conf_file: Model = Model("[freeling.conf]"),
                  fl_binary: Binary = Binary("[freeling.binary]"),
                  sentence_chunk: Annotation = Annotation("[freeling.sentence_chunk]"),
                  out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
                  out_word: Output = Output("<token>:freeling.word", cls="token:word",
                                            description="Token strings"),
                  out_baseform: Output = Output("<token>:freeling.baseform",
                                                description="Baseforms from FreeLing"),
                  out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                            description="Part-of-speech tags in UD"),
                  out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                           description="Part-of-speech tags from FreeLing"),
                  out_ne_type: Output = Output("<token>:freeling.ne_type", cls="token:named_entity_type",
                                               description="Named entity types from FreeLing"),
                  out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                          description="Sentence segments"),
                  sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output the usual annotations plus named entity types."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform,
         out_upos, out_pos, out_sentence, sentence_annotation, out_ne_type)
def text_headtail(text: Text = Text(),
                  chunk: Annotation = Annotation("<token>"),
                  out_head: Output = Output("<token>:misc.head"),
                  out_tail: Output = Output("<token>:misc.tail")):
    """Extract "head" and "tail" whitespace characters for tokens."""
    def escape(t):
        """Escape whitespace characters."""
        return t.replace(" ", "\\s").replace("\n", "\\n").replace("\t", "\\t")

    out_head_annotation = chunk.create_empty_attribute()
    out_tail_annotation = chunk.create_empty_attribute()
    head_text = None

    corpus_text = text.read()
    chunk = list(chunk.read())

    for i, span in enumerate(chunk):
        if head_text:
            out_head_annotation[i] = escape(head_text)
            head_text = None

        if i < len(chunk) - 1:
            tail_start = span[1][0]
            tail_end = chunk[i + 1][0][0]
            tail_text = corpus_text[tail_start:tail_end]

            try:
                n_pos = tail_text.rindex("\n")
            except ValueError:
                n_pos = None
            if n_pos is not None and n_pos + 1 < len(tail_text):
                head_text = tail_text[n_pos + 1:]
                tail_text = tail_text[:n_pos + 1]

            if tail_text:
                out_tail_annotation[i] = escape(tail_text)

    out_head.write(out_head_annotation)
    out_tail.write(out_tail_annotation)
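
# Toy walk-through (hypothetical strings, not part of the pipeline) of the
# head/tail split in text_headtail(): whitespace after the last newline becomes
# the *head* of the following token; everything up to and including that
# newline stays as the current token's *tail*.
def _demo_headtail_split():
    def escape(t):
        return t.replace(" ", "\\s").replace("\n", "\\n").replace("\t", "\\t")

    tail_text = " \n  "  # whitespace between two consecutive tokens
    n_pos = tail_text.rindex("\n")
    head_text = tail_text[n_pos + 1:]  # "  " -> head of the next token
    tail_text = tail_text[:n_pos + 1]  # " \n" -> tail of the current token
    assert escape(tail_text) == "\\s\\n"
    assert escape(head_text) == "\\s\\s"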
def text_spans(text: Text = Text(),
               chunk: Annotation = Annotation("<token>"),
               out: Output = Output("<token>:misc.word", cls="token:word"),
               keep_formatting_chars: Optional[bool] = Config("misc.keep_formatting_chars")):
    """Add the text content for each edge as a new annotation."""
    corpus_text = text.read()
    if isinstance(chunk, (str, Annotation)):
        chunk = chunk.read_spans()
    out_annotation = []
    for span in chunk:
        token = corpus_text[span[0]:span[1]]
        if not keep_formatting_chars:
            new_token = util.remove_formatting_characters(token)
            # If this token consists entirely of formatting characters, don't remove them. Empty tokens are bad!
            if new_token:
                token = new_token
        out_annotation.append(token)
    if out:
        out.write(out_annotation)
    else:
        return out_annotation
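
# Toy sketch of the formatting-character guard in text_spans(). The helper
# below is a hypothetical stand-in for util.remove_formatting_characters
# (assumed to strip invisible characters such as soft hyphens); the point is
# the guard: never let stripping produce an empty token.
def _demo_formatting_guard():
    def remove_formatting(token):  # hypothetical stand-in
        return "".join(c for c in token if c not in "\u00ad\u200b")

    tokens = []
    for token in ["foo\u00adbar", "\u00ad"]:
        new_token = remove_formatting(token)
        if new_token:  # keep the original if stripping would empty the token
            token = new_token
        tokens.append(token)
    assert tokens == ["foobar", "\u00ad"]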
def preserved_format(doc: Document = Document(),
                     text: Text = Text(),
                     docid: AnnotationData = AnnotationData("<docid>"),
                     out: Export = Export("xml_preserved_format/[xml_export.filename_formatted]"),
                     annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
                     source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
                     header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
                     remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                     sparv_namespace: str = Config("export.sparv_namespace"),
                     source_namespace: str = Config("export.source_namespace"),
                     include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to XML in export_dir and keep whitespaces and indentation from original file.

    Args:
        doc: Name of the original document.
        text: The corpus text.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document to be kept.
            If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty.
            Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read corpus text and document ID
    corpus_text = text.read()
    docid = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, flatten=False, split_overlaps=True)
    sorted_positions = [(pos, span[0], span[1]) for pos, spans in sorted(span_positions.items())
                        for span in spans]

    # Root tag sanity check
    if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1]):
        raise util.SparvErrorMessage("Root tag is missing! If you have manually specified which elements "
                                     "to include, make sure to include an element that encloses all other "
                                     "included elements and text content.")

    # Create root node
    root_span = sorted_positions[0][2]
    root_span.set_node()
    node_stack = []
    last_pos = 0  # Keeps track of the position of the processed text

    for x, (_pos, instruction, span) in enumerate(sorted_positions):
        # Open node: Create child node under the top stack node
        if instruction == "open":
            # Set tail for previous node if necessary
            if last_pos < span.start:
                # Get last closing node in this position
                _, tail_span = [i for i in span_positions[last_pos] if i[0] == "close"][-1]
                tail_span.node.tail = corpus_text[last_pos:span.start]
                last_pos = span.start

            # Handle headers
            if span.is_header:
                header = annotation_dict[span.name][util.HEADER_CONTENTS][span.index]
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                span.node = header_xml
                node_stack[-1].node.append(header_xml)
            else:
                if node_stack:  # Don't create root node, it already exists
                    span.set_node(parent_node=node_stack[-1].node)

                xml_utils.add_attrs(span.node, span.name, annotation_dict, export_names, span.index,
                                    include_empty_attributes)
                if span.overlap_id:
                    if sparv_namespace:
                        span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}", f"{docid}-{span.overlap_id}")
                    else:
                        span.node.set(f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                                      f"{docid}-{span.overlap_id}")
                node_stack.append(span)

                # Set text if there should be any between this node and the next one
                next_item = sorted_positions[x + 1]
                if next_item[1] == "open" and next_item[2].start > span.start:
                    span.node.text = corpus_text[last_pos:next_item[2].start]
                    last_pos = next_item[2].start

        # Close node
        else:
            if span.is_header:
                continue
            if last_pos < span.end:
                # Set node text if necessary
                if span.start == last_pos:
                    span.node.text = corpus_text[last_pos:span.end]
                # Set tail for previous node if necessary
                else:
                    # Get last closing node in this position
                    _, tail_span = [i for i in span_positions[last_pos] if i[0] == "close"][-1]
                    tail_span.node.tail = corpus_text[last_pos:span.end]
                last_pos = span.end

            # Make sure closing node == top stack node
            assert span == node_stack[-1], "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Write xml to file
    etree.ElementTree(root_span.node).write(out, encoding="unicode", method="xml", xml_declaration=True)
    log.info("Exported: %s", out)
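
# Self-contained illustration (toy XML, not part of the pipeline) of the
# text/tail mechanics that preserved_format() builds on: in ElementTree, text
# between a start tag and the first child lives in .text, while text after an
# element's end tag belongs to that element's .tail.
def _demo_text_vs_tail():
    from xml.etree import ElementTree

    root = ElementTree.Element("text")
    w1 = ElementTree.SubElement(root, "w")
    w1.text = "Hello"
    w1.tail = " "  # the whitespace between </w> and the next <w>
    w2 = ElementTree.SubElement(root, "w")
    w2.text = "world"
    assert ElementTree.tostring(root, encoding="unicode") == "<text><w>Hello</w> <w>world</w></text>"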
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             text: Annotation = Annotation("<text>"),
             out_sentence: Output = Output("stanford.sentence", cls="sentence", description="Sentence segments"),
             out_token: Output = Output("stanford.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:stanford.word", cls="token:word", description="Token strings"),
             out_ref: Output = Output("<token>:stanford.ref", description="Token ID relative to sentence"),
             out_baseform: Output = Output("<token>:stanford.baseform",
                                           description="Baseforms from Stanford Parser"),
             out_upos: Output = Output("<token>:stanford.upos", cls="token:upos",
                                       description="Part-of-speech tags in UD"),
             out_pos: Output = Output("<token>:stanford.pos", cls="token:pos",
                                      description="Part-of-speech tags from Stanford Parser"),
             out_ne: Output = Output("<token>:stanford.ne_type", cls="token:named_entity_type",
                                     description="Named entity types from Stanford Parser"),
             out_deprel: Output = Output("<token>:stanford.deprel", cls="token:deprel",
                                         description="Dependency relations to the head"),
             out_dephead_ref: Output = Output("<token>:stanford.dephead_ref", cls="token:dephead_ref",
                                              description="Sentence-relative positions of the dependency heads"),
             binary: BinaryDir = BinaryDir("[stanford.bin]")):
    """Use Stanford Parser to parse and annotate text."""
    args = ["-cp", binary + "/*", "edu.stanford.nlp.pipeline.StanfordCoreNLP",
            "-annotators", "tokenize,ssplit,pos,lemma,depparse,ner",
            "-outputFormat", "conll"]

    # Read corpus_text and text_spans
    text_data = corpus_text.read()
    text_spans = text.read_spans()

    sentence_segments = []
    all_tokens = []

    # Go through text elements and parse them with Stanford Parser
    for text_span in text_spans:
        inputtext = text_data[text_span[0]:text_span[1]]
        # Start a fresh parser process for each text chunk: communicate() can
        # only be called once per process
        process = util.system.call_binary("java", arguments=args, return_command=True)
        stdout, _ = process.communicate(inputtext.encode(util.UTF8))
        processed_sentences = _parse_output(stdout.decode(util.UTF8), lang)

        # Go through output and try to match tokens with input text to get correct spans
        index_counter = text_span[0]
        for sentence in processed_sentences:
            for token in sentence:
                all_tokens.append(token)
                # Get token span
                match = re.match(r"\s*(%s)" % re.escape(token.word), inputtext)
                span = match.span(1)
                token.start = span[0] + index_counter
                token.end = span[1] + index_counter
                # Forward inputtext
                inputtext = inputtext[span[1]:]
                index_counter += span[1]
            # Extract sentence span for current sentence
            sentence_segments.append((sentence[0].start, sentence[-1].end))

    # Write annotations
    out_sentence.write(sentence_segments)
    out_token.write([(t.start, t.end) for t in all_tokens])
    out_ref.write([t.ref for t in all_tokens])
    out_word.write([t.word for t in all_tokens])
    out_baseform.write([t.baseform for t in all_tokens])
    out_upos.write([t.upos for t in all_tokens])
    out_pos.write([t.pos for t in all_tokens])
    out_ne.write([t.ne for t in all_tokens])
    out_dephead_ref.write([t.dephead_ref for t in all_tokens])
    out_deprel.write([t.deprel for t in all_tokens])
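
# Toy demonstration (made-up tokens, not part of the pipeline) of the span
# alignment step above: each parser token is matched at the front of the
# remaining input, allowing leading whitespace, to recover its character
# offsets in the original text.
def _demo_token_alignment():
    inputtext = "Hello  world"
    offset = 0
    spans = []
    for word in ["Hello", "world"]:
        match = re.match(r"\s*(%s)" % re.escape(word), inputtext)
        start, end = match.span(1)
        spans.append((start + offset, end + offset))
        inputtext = inputtext[end:]
        offset += end
    assert spans == [(0, 5), (7, 12)]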