def number_by_position(out: Output = Output("{annotation}:misc.number_position"),
                       chunk: Annotation = Annotation("{annotation}"),
                       prefix: str = "",
                       zfill: bool = False,
                       start: int = START_DEFAULT):
    """Number chunks by their position."""
    spans = list(chunk.read_spans())

    def _order(index, _value):
        return spans[index]

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)
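
# Usage sketch for number_by_position (illustrative only; the "<sentence>" annotation
# name and the exact padding behavior are assumptions, not taken from this module):
#
#     number_by_position(out=Output("<sentence>:misc.number_position"),
#                        chunk=Annotation("<sentence>"),
#                        prefix="s",
#                        zfill=True)
#
# This would number sentences by corpus position, producing values like "s01", "s02", ...
# (the padding width depends on _read_chunks_and_write_new_ordering).
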
def text_spans(text: Text = Text(),
               chunk: Annotation = Annotation("<token>"),
               out: Output = Output("<token>:misc.word", cls="token:word"),
               keep_formatting_chars: Optional[bool] = Config("misc.keep_formatting_chars")):
    """Add the text content of each span as a new annotation."""
    corpus_text = text.read()
    if isinstance(chunk, (str, Annotation)):
        chunk = chunk.read_spans()
    out_annotation = []
    for span in chunk:
        token = corpus_text[span[0]:span[1]]
        if not keep_formatting_chars:
            new_token = util.remove_formatting_characters(token)
            # If this token consists entirely of formatting characters, don't remove them.
            # Empty tokens are bad!
            if new_token:
                token = new_token
        out_annotation.append(token)
    if out:
        out.write(out_annotation)
    else:
        return out_annotation
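
# Usage sketch for text_spans (hypothetical spans; a minimal sketch relying on the
# isinstance check above, which lets chunk be a pre-read iterable of spans):
#
#     words = text_spans(chunk=[(0, 5), (6, 11)], out=None)
#
# With a falsy out, the extracted strings are returned instead of written, per the
# function's final else branch.
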
def span_as_value(chunk: Annotation, out: Output):
    """Create a new annotation with spans as values."""
    out.write((f"{start}-{end}" for start, end in chunk.read_spans()))
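
# Usage sketch for span_as_value (hypothetical annotation names): expose sentence spans
# as string attribute values such as "0-17".
#
#     span_as_value(chunk=Annotation("<sentence>"),
#                   out=Output("<sentence>:misc.span"))
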
def parse_swener_output(sentences: list, token: Annotation, output, out_ne: Output, out_ne_ex: Output,
                        out_ne_type: Output, out_ne_subtype: Output, out_ne_name: Output):
    """Parse the SweNER output and write annotation files."""
    out_ne_spans = []
    out_ex = []
    out_type = []
    out_subtype = []
    out_name = []

    token_spans = list(token.read_spans())

    # Loop through the NE-tagged sentences and parse each one with ElementTree
    for sent, tagged_sent in zip(sentences, output.strip().split(SENT_SEP)):
        xml_sent = "<sroot>" + tagged_sent + "</sroot>"

        # Filter out tags of the form <EnamexXxxXxx> since they seem to always overlap with <ENAMEX>
        # elements, making the XML invalid.
        xml_sent = re.sub(r"</?Enamex[^>\s]+>", "", xml_sent)
        try:
            root = etree.fromstring(xml_sent)
        except Exception:
            log.warning("Error parsing sentence. Skipping.")
            continue

        # Init token counter; needed to get start_pos and end_pos
        i = 0
        previous_end = 0
        children = list(root.iter())

        try:
            for count, child in enumerate(children):
                start_pos = token_spans[sent[i]][0]
                start_i = i

                # If the current child has text, increase the token counter
                if child.text:
                    i += len(child.text.strip().split(TOK_SEP))

                # Extract NE tags and save them in lists
                if child.tag != "sroot":
                    if start_i < previous_end:
                        pass
                        # log.warning("Overlapping NE elements found; discarding one.")
                    else:
                        end_pos = token_spans[sent[i - 1]][1]
                        previous_end = i
                        span = (start_pos, end_pos)
                        out_ne_spans.append(span)
                        out_ex.append(child.tag)
                        out_type.append(child.get("TYPE"))
                        out_subtype.append(child.get("SBT"))
                        out_name.append(child.text)

                    # If this child has a tail that doesn't start with a space, or no tail at all
                    # despite not being the last child, this NE ends in the middle of a token.
                    if (child.tail and child.tail.strip() and not child.tail[0] == " ") or (
                            not child.tail and count < len(children) - 1):
                        i -= 1
                        # log.warning("Split token returned by name tagger.")

                # If the current child has text in the tail, increase the token counter
                if child.tail and child.tail.strip():
                    i += len(child.tail.strip().split(TOK_SEP))

                if (child.tag == "sroot" and child.text and not child.text[-1] == " ") or (
                        child.tail and not child.tail[-1] == " "):
                    # The next NE would start in the middle of a token, so decrease the counter by 1
                    i -= 1
        except IndexError:
            log.warning("Error parsing sentence. Skipping.")
            continue

    # Write annotations
    out_ne.write(out_ne_spans)
    out_ne_ex.write(out_ex)
    out_ne_type.write(out_type)
    out_ne_subtype.write(out_subtype)
    out_ne_name.write(out_name)
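
# Call sketch for parse_swener_output (illustrative; assumes `sentences` holds token
# indices per sentence, matching the token_spans[sent[i]] lookups above, and that
# `output` is raw SweNER stdout with sentences joined by SENT_SEP and tokens by TOK_SEP):
#
#     sentences = [[0, 1, 2]]  # one sentence covering tokens 0-2
#     output = '<ENAMEX TYPE="PRS" SBT="HUM">Anna Andersson</ENAMEX> sover'
#     parse_swener_output(sentences, token, output,
#                         out_ne, out_ne_ex, out_ne_type, out_ne_subtype, out_ne_name)
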
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             text: Annotation = Annotation("<text>"),
             out_sentence: Output = Output("stanford.sentence", cls="sentence", description="Sentence segments"),
             out_token: Output = Output("stanford.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:stanford.word", cls="token:word", description="Token strings"),
             out_ref: Output = Output("<token>:stanford.ref", description="Token ID relative to sentence"),
             out_baseform: Output = Output("<token>:stanford.baseform",
                                           description="Baseforms from Stanford Parser"),
             out_upos: Output = Output("<token>:stanford.upos", cls="token:upos",
                                       description="Part-of-speech tags in UD"),
             out_pos: Output = Output("<token>:stanford.pos", cls="token:pos",
                                      description="Part-of-speech tags from Stanford Parser"),
             out_ne: Output = Output("<token>:stanford.ne_type", cls="token:named_entity_type",
                                     description="Named entity types from Stanford Parser"),
             out_deprel: Output = Output("<token>:stanford.deprel", cls="token:deprel",
                                         description="Dependency relations to the head"),
             out_dephead_ref: Output = Output("<token>:stanford.dephead_ref", cls="token:dephead_ref",
                                              description="Sentence-relative positions of the dependency heads"),
             binary: BinaryDir = BinaryDir("[stanford.bin]")):
    """Use Stanford Parser to parse and annotate text."""
    args = ["-cp", binary + "/*", "edu.stanford.nlp.pipeline.StanfordCoreNLP",
            "-annotators", "tokenize,ssplit,pos,lemma,depparse,ner",
            "-outputFormat", "conll"]

    # Read corpus_text and text_spans
    text_data = corpus_text.read()
    text_spans = text.read_spans()

    sentence_segments = []
    all_tokens = []

    # Go through text elements and parse them with the Stanford Parser.
    # A fresh process is started per text element, since communicate() can only be
    # called once on a given process.
    for text_span in text_spans:
        inputtext = text_data[text_span[0]:text_span[1]]
        process = util.system.call_binary("java", arguments=args, return_command=True)
        stdout, _ = process.communicate(inputtext.encode(util.UTF8))
        processed_sentences = _parse_output(stdout.decode(util.UTF8), lang)

        # Go through the output and match tokens with the input text to get correct spans
        index_counter = text_span[0]
        for sentence in processed_sentences:
            for token in sentence:
                all_tokens.append(token)
                # Get the token span
                match = re.match(r"\s*(%s)" % re.escape(token.word), inputtext)
                span = match.span(1)
                token.start = span[0] + index_counter
                token.end = span[1] + index_counter
                # Forward inputtext
                inputtext = inputtext[span[1]:]
                index_counter += span[1]
            # Extract the sentence span for the current sentence
            sentence_segments.append((sentence[0].start, sentence[-1].end))

    # Write annotations
    out_sentence.write(sentence_segments)
    out_token.write([(t.start, t.end) for t in all_tokens])
    out_ref.write([t.ref for t in all_tokens])
    out_word.write([t.word for t in all_tokens])
    out_baseform.write([t.baseform for t in all_tokens])
    out_upos.write([t.upos for t in all_tokens])
    out_pos.write([t.pos for t in all_tokens])
    out_ne.write([t.ne for t in all_tokens])
    out_dephead_ref.write([t.dephead_ref for t in all_tokens])
    out_deprel.write([t.deprel for t in all_tokens])
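
# The args list above corresponds to running CoreNLP on the command line roughly as
# (sketch; the classpath location is an assumption):
#
#     java -cp "/path/to/corenlp/*" edu.stanford.nlp.pipeline.StanfordCoreNLP \
#          -annotators tokenize,ssplit,pos,lemma,depparse,ner -outputFormat conll
#
# with one text element written to stdin per invocation and the CoNLL output read back
# through _parse_output.
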
def annotate(out_phrase: Output = Output("phrase_structure.phrase", description="Phrase segments"),
             out_phrase_name: Output = Output("phrase_structure.phrase:phrase_structure.name",
                                              description="Phrase names"),
             out_phrase_func: Output = Output("phrase_structure.phrase:phrase_structure.func",
                                              description="Phrase functions"),
             token: Annotation = Annotation("<token>"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             pos: Annotation = Annotation("<token:pos>"),
             msd: Annotation = Annotation("<token:msd>"),
             ref: Annotation = Annotation("<token>:misc.number_rel_<sentence>"),
             dephead_ref: Annotation = Annotation("<token:dephead_ref>"),
             deprel: Annotation = Annotation("<token:deprel>")):
    """Annotate sentences with phrase structures."""
    sentences, _orphans = sentence.get_children(word)
    token_annotations = list(ref.read_attributes([ref, word, pos, msd, dephead_ref, deprel]))
    token_spans = list(token.read_spans())

    def get_token_span(index):
        return token_spans[index]

    nodes = []

    for s in sentences:
        tokenlist = [Token(None)]
        for token_index in s:
            token = token_annotations[token_index]
            tokenlist.append(Token(token))

        # Get the phrase structure tree
        sen = Sentence(tokenlist)
        if not sen.is_cyclic():
            tree = convert_sentence(sen).top.to_tree_str()
            # print(pprint.pformat(tree), file=sys.stderr)

            # Make nodes
            children = flatten_tree(tree[1], [])
            log.debug("\n\nSENTENCE:")
            position = 0
            open_elem_stack = []
            for child in children:
                if not child[0].startswith("WORD:"):
                    start_pos = get_token_span(s[position])[0]
                    open_elem_stack.append(child + (start_pos,))
                    log.debug(f"<phrase name={child[0]} func={child[1]}> {s[position]}")
                else:
                    # Close nodes
                    while open_elem_stack[-1][2] == child[2]:
                        start_pos = open_elem_stack[-1][3]
                        end_pos = get_token_span(s[position - 1])[1]
                        nodes.append(((start_pos, end_pos), open_elem_stack[-1][0], open_elem_stack[-1][1]))
                        log.debug(f"</phrase name={open_elem_stack[-1][0]} func={open_elem_stack[-1][1]}> "
                                  f"{start_pos}-{end_pos}")
                        open_elem_stack.pop()
                    position += 1
                    log.debug(f"  {child[0][5:]}")

            # Close remaining open nodes
            end_pos = get_token_span(s[-1])[1]
            for elem in reversed(open_elem_stack):
                start_pos = elem[3]
                nodes.append(((start_pos, end_pos), elem[0], elem[1]))
                log.debug(f"</phrase name={elem[0]} func={elem[1]}> {start_pos}-{end_pos}")

    # Sort nodes
    sorted_nodes = sorted(nodes)

    # Write annotations
    out_phrase.write([i[0] for i in sorted_nodes])
    out_phrase_name.write([i[1] for i in sorted_nodes])
    out_phrase_func.write([i[2] for i in sorted_nodes])
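
# Bookkeeping note (an interpretation, not taken from this module): flatten_tree is
# assumed to yield tuples whose third element is a tree depth, with terminals prefixed
# "WORD:". The while loop above then closes every open phrase whose depth matches the
# current terminal's, collecting (span, name, func) triples, e.g.:
#
#     nodes == [((0, 4), "NP", "SS"), ((0, 12), "S", "ROOT")]
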