def _process_entity_annotations( self, pack: DataPack, label: str, word_begin: int, word_end: int, current_entity_mention: Optional[Tuple[int, str]], ) -> Optional[Tuple[int, str]]: ner_type = label.strip("()*") if "(" in label: # Entering into a span for a particular ner. current_entity_mention = (word_begin, ner_type) if ")" in label: if current_entity_mention is None: raise ValueError( "current_entity_mention is None when meet right blanket.") # Exiting a span, add and then reset the current span. kwargs_i = {"ner_type": current_entity_mention[1]} entity = EntityMention(pack, current_entity_mention[0], word_end) entity.set_fields(**kwargs_i) pack.add_or_get_entry(entity) current_entity_mention = None return current_entity_mention
def _process(self, input_pack: DataPack):
    """Run NLTK's ``ne_chunk`` over each sentence's POS-tagged tokens and
    add every recognized chunk to ``input_pack`` as an ``EntityMention``.
    """
    for sentence in input_pack.get(Sentence):
        token_entries = list(
            input_pack.get(entry_type=Token,
                           range_annotation=sentence,
                           component=self.token_component))
        pos_tagged = [(entry.text, entry.pos) for entry in token_entries]
        cursor = 0
        for node in ne_chunk(pos_tagged):
            if not hasattr(node, 'label'):
                # A plain (word, tag) pair, e.g. ('This', 'DT'):
                # not part of any named entity.
                cursor += 1
                continue
            # ``node`` is a subtree, for example:
            # Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])
            span_begin = token_entries[cursor].span.begin
            span_end = token_entries[cursor + len(node) - 1].span.end
            mention = EntityMention(input_pack, span_begin, span_end)
            mention.set_fields(ner_type=node.label())
            input_pack.add_or_get_entry(mention)
            cursor += len(node)
def _parse_pack(self, data: dict) -> Iterator[DataPack]:
    r"""Build a DataPack from one Prodigy-annotated document.

    Reads the document text, token spans, and labeled entity spans out of
    ``data`` and records them as ``Document``, ``Token``, and
    ``EntityMention`` entries.

    Args:
        data: a dict that contains information for one document.

    Returns:
        DataPack containing information extracted from `data`.
    """
    pack = DataPack()
    raw_text = data['text']

    # Create the document first, then install the (possibly replaced)
    # text, matching the original construction order.
    document = Document(pack, 0, len(raw_text))
    pack.set_text(raw_text, replace_func=self.text_replace_operation)
    pack.add_or_get_entry(document)

    for token_span in data['tokens']:
        pack.add_or_get_entry(
            Token(pack, token_span['start'], token_span['end']))

    for ner_span in data['spans']:
        mention = EntityMention(pack, ner_span['start'], ner_span['end'])
        mention.set_fields(ner_type=ner_span['label'])
        pack.add_or_get_entry(mention)

    pack.meta.doc_id = data['meta']['id']
    yield pack
def pack(self, data_pack: DataPack,
         output_dict: Optional[Dict[str, Dict[str, List[str]]]] = None):
    """Write the prediction results back to datapack: attach each
    predicted BIOES NER tag to its token and materialize completed
    mentions as ``EntityMention`` entries.
    """
    if output_dict is None:
        return

    # (begin offset, entity type) of the mention currently being built.
    current_entity_mention: Tuple[int, str] = (-1, "None")

    token_ids = output_dict["Token"]["tid"]
    predicted_tags = output_dict["Token"]["ner"]
    for i in range(len(token_ids)):
        # Each ``i`` is one instance; ``j`` walks its tokens.
        for j in range(len(token_ids[i])):
            tid: int = token_ids[i][j]  # type: ignore
            token: Token = data_pack.get_entry(tid)  # type: ignore
            ner_tag: str = predicted_tags[i][j]
            token.set_fields(ner=ner_tag)

            tag = token.get_field("ner")
            marker = tag[0]
            if marker == "B":
                # Begin a new multi-token mention.
                current_entity_mention = (token.span.begin, tag[2:])
            elif marker in ("I", "O"):
                # Inside / outside a mention: nothing to emit yet.
                pass
            elif marker == "E":
                # Close the open mention only if the types agree.
                if tag[2:] == current_entity_mention[1]:
                    entity = EntityMention(data_pack,
                                           current_entity_mention[0],
                                           token.span.end)
                    entity.set_fields(ner_type=current_entity_mention[1])
                    data_pack.add_or_get_entry(entity)
            elif marker == "S":
                # Single-token mention: open and close in one step.
                current_entity_mention = (token.span.begin, tag[2:])
                entity = EntityMention(data_pack,
                                       current_entity_mention[0],
                                       token.span.end)
                entity.set_fields(ner_type=current_entity_mention[1])
                data_pack.add_or_get_entry(entity)
def _process_ner(self, doc, input_pack):
    """Run spaCy's NER pipeline on the document and store each recognized
    entity in ``input_pack`` as an ``EntityMention``.

    Args:
        doc: The document.
        input_pack: Input pack to fill.

    Returns:

    """
    analyzed = self.nlp(doc)
    for ent in analyzed.ents:
        mention = EntityMention(input_pack, ent.start_char, ent.end_char)
        mention.set_fields(ner_type=ent.label_)
        input_pack.add_or_get_entry(mention)