def test_not_hashable(self):
    anno: Annotation = Annotation(self.pack, 0, 5)
    with self.assertRaises(TypeError):
        hash(anno)
    anno.regret_creation()

    anno1: EntityMention = EntityMention(self.pack, 0, 2)
    with self.assertRaises(TypeError):
        hash(anno1)
    anno1.regret_creation()

def _process(self, input_pack: DataPack):
    entity_text = self.configs.entities_to_insert
    input_text = input_pack.text
    if not all(entity in input_text for entity in entity_text):
        raise Exception(
            "Entities to be added are not valid for the input text.")
    for entity in entity_text:
        start = input_text.index(entity)
        end = start + len(entity)
        entity_mention = EntityMention(input_pack, start, end)
        input_pack.add_entry(entity_mention)

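# A minimal usage sketch for the processor above. The class name
# `EntityMentionInserter` is hypothetical (the snippet does not show the
# enclosing class); the Forte Pipeline, DataPack, and StringReader APIs
# are real:
from forte.pipeline import Pipeline
from forte.data.data_pack import DataPack
from forte.data.readers import StringReader

pipeline = Pipeline[DataPack]()
pipeline.set_reader(StringReader())
pipeline.add(EntityMentionInserter(),  # hypothetical class name
             config={"entities_to_insert": ["New York"]})
pipeline.initialize()
pack = pipeline.process("Alice moved to New York last year.")
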
def pack(self, data_pack: DataPack,
         output_dict: Optional[Dict[str, Dict[str, List[str]]]] = None):
    """Write the prediction results back to the datapack by writing
    the predicted NER tags to the original tokens.
    """
    if output_dict is None:
        return
    current_entity_mention: Tuple[int, str] = (-1, "None")

    for i in range(len(output_dict["Token"]["tid"])):
        # An instance.
        for j in range(len(output_dict["Token"]["tid"][i])):
            tid: int = output_dict["Token"]["tid"][i][j]  # type: ignore
            orig_token: Token = data_pack.get_entry(tid)  # type: ignore
            ner_tag: str = output_dict["Token"]["ner"][i][j]
            orig_token.set_fields(ner=ner_tag)

            token = orig_token
            token_ner = token.get_field("ner")
            if token_ner[0] == "B":
                current_entity_mention = (token.span.begin, token_ner[2:])
            elif token_ner[0] == "I":
                continue
            elif token_ner[0] == "O":
                continue
            elif token_ner[0] == "E":
                if token_ner[2:] != current_entity_mention[1]:
                    continue
                kwargs_i = {"ner_type": current_entity_mention[1]}
                entity = EntityMention(data_pack,
                                       current_entity_mention[0],
                                       token.span.end)
                entity.set_fields(**kwargs_i)
                data_pack.add_or_get_entry(entity)
            elif token_ner[0] == "S":
                current_entity_mention = (token.span.begin, token_ner[2:])
                kwargs_i = {"ner_type": current_entity_mention[1]}
                entity = EntityMention(data_pack,
                                       current_entity_mention[0],
                                       token.span.end)
                entity.set_fields(**kwargs_i)
                data_pack.add_or_get_entry(entity)

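# The BIOES decoding above can be illustrated without Forte. This is a
# minimal, self-contained sketch of the same span-recovery logic; the tag
# sequence in the assert is made up for illustration:
from typing import List, Tuple

def decode_bioes(tags: List[str]) -> List[Tuple[int, int, str]]:
    """Turn a BIOES tag sequence into (start, end, type) token spans."""
    spans = []
    start, ner_type = -1, None
    for idx, tag in enumerate(tags):
        if tag[0] == "B":            # beginning of a multi-token entity
            start, ner_type = idx, tag[2:]
        elif tag[0] == "E" and tag[2:] == ner_type:
            spans.append((start, idx, ner_type))  # close the open entity
        elif tag[0] == "S":          # single-token entity
            spans.append((idx, idx, tag[2:]))
    return spans

assert decode_bioes(["B-PER", "E-PER", "O", "S-LOC"]) == [
    (0, 1, "PER"), (3, 3, "LOC")]
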
def pack(self, data_pack: DataPack,
         output_dict: Optional[Dict[str, Dict[str, List[str]]]] = None):
    """Write the prediction results back to the datapack by writing
    the predicted NER tags to the original subwords, converting the
    predictions into something that makes sense in a word-by-word
    segmentation.
    """
    if output_dict is None:
        return
    for i in range(len(output_dict["Subword"]["tid"])):
        tids = output_dict["Subword"]["tid"][i]
        labels = output_dict["Subword"]["ner"][i]

        # Filter to labels not in `self.ft_configs.ignore_labels`.
        entities = [
            dict(idx=idx, label=label, tid=tid)
            for idx, (label, tid) in enumerate(zip(labels, tids))
            if label not in self.ft_configs.ignore_labels
        ]

        entity_groups = self._group_entities(entities, data_pack, tids)

        # Add NER tags and create EntityMention ontologies.
        for first_tid, last_tid in entity_groups:
            first_token: Subword = data_pack.get_entry(  # type: ignore
                first_tid)
            first_token.ner = 'B-' + self.ft_configs.ner_type

            for tid in range(first_tid + 1, last_tid + 1):
                token: Subword = data_pack.get_entry(tid)  # type: ignore
                token.ner = 'I-' + self.ft_configs.ner_type

            begin = first_token.span.begin
            end = data_pack.get_entry(last_tid).span.end
            entity = EntityMention(data_pack, begin, end)
            entity.ner_type = self.ft_configs.ner_type

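# `_group_entities` is defined elsewhere in the class. A plausible reading,
# given how its output is consumed above, is that it merges runs of
# consecutive subword indices into (first_tid, last_tid) pairs. This is a
# sketch under that assumption only (simplified: it drops the `data_pack`
# and `tids` arguments the real helper receives):
from typing import Dict, List, Tuple

def group_entities(entities: List[Dict]) -> List[Tuple[int, int]]:
    """Merge consecutive subword predictions into (first_tid, last_tid)."""
    groups: List[Tuple[int, int]] = []
    prev_idx = None
    for ent in entities:
        if groups and ent["idx"] == prev_idx + 1:
            groups[-1] = (groups[-1][0], ent["tid"])  # extend current group
        else:
            groups.append((ent["tid"], ent["tid"]))   # start a new group
        prev_idx = ent["idx"]
    return groups
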
def _process(self, input_pack: DataPack):
    for sentence in input_pack.get(Sentence):
        token_entries = list(
            input_pack.get(entry_type=Token,
                           range_annotation=sentence,
                           components=self.token_component))
        tokens = [(token.text, token.pos) for token in token_entries]
        ne_tree = ne_chunk(tokens)

        index = 0
        for chunk in ne_tree:
            if hasattr(chunk, 'label'):
                # For example:
                # chunk: Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])
                begin_pos = token_entries[index].span.begin
                end_pos = token_entries[index + len(chunk) - 1].span.end
                entity = EntityMention(input_pack, begin_pos, end_pos)
                entity.ner_type = chunk.label()
                index += len(chunk)
            else:
                # For example:
                # chunk: ('This', 'DT')
                index += 1

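# Standalone illustration of the NLTK call used above. These are real NLTK
# APIs; running this requires the `punkt`, `averaged_perceptron_tagger`,
# `maxent_ne_chunker`, and `words` NLTK data packages to be downloaded:
from nltk import ne_chunk, pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("Alice works in New York."))
tree = ne_chunk(tagged)
for chunk in tree:
    if hasattr(chunk, 'label'):
        # e.g. Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])
        print(chunk.label(), [token for token, _ in chunk])
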
def _process(self, input_pack: MultiPack):
    from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
    copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

    copy_pack.set_text(from_pack.text)

    if from_pack.pack_name is not None:
        copy_pack.pack_name = from_pack.pack_name + '_copy'
    else:
        copy_pack.pack_name = 'copy'

    ent: EntityMention
    for ent in from_pack.get(EntityMention):
        EntityMention(copy_pack, ent.begin, ent.end)

def _parse_pack(self, data: dict) -> Iterator[DataPack]:
    """Extract the information in one document (`data`) output by the
    Prodigy annotator, including the text, the tokens, and their
    annotations, into a DataPack.

    Args:
        data: a dict that contains information for one document.

    Returns:
        DataPack containing the information extracted from `data`.
    """
    pack = DataPack()
    text = data['text']
    tokens = data['tokens']
    spans = data['spans']

    document = Document(pack, 0, len(text))
    pack.set_text(text, replace_func=self.text_replace_operation)
    pack.add_or_get_entry(document)

    for token in tokens:
        begin = token['start']
        end = token['end']
        token_entry = Token(pack, begin, end)
        pack.add_or_get_entry(token_entry)

    for span_items in spans:
        begin = span_items['start']
        end = span_items['end']
        annotation_entry = EntityMention(pack, begin, end)
        annotation_entry.set_fields(ner_type=span_items['label'])
        pack.add_or_get_entry(annotation_entry)

    pack.meta.doc_id = data['meta']['id']
    yield pack

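# A sketch of the input this reader expects, inferred from the keys the
# code accesses. A real Prodigy export carries more fields; the values
# here are made up for illustration:
prodigy_doc = {
    "text": "Alice moved to New York.",
    "tokens": [{"start": 0, "end": 5}, {"start": 6, "end": 11}],
    "spans": [{"start": 15, "end": 23, "label": "GPE"}],
    "meta": {"id": "doc-0001"},
}
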
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack: DataPack = DataPack()
    text: str = ""
    offset: int = 0

    with open(file_path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if line != "":
                oie_component: List[str] = line.split("\t")

                # Add sentence.
                sentence = oie_component[0]
                text += sentence + "\n"
                Sentence(pack, offset, offset + len(sentence))

                # Find argument 1.
                arg1_begin = sentence.find(oie_component[3]) + offset
                arg1_end = arg1_begin + len(oie_component[3])
                arg1: EntityMention = EntityMention(
                    pack, arg1_begin, arg1_end)

                # Find argument 2.
                arg2_begin = sentence.find(oie_component[4]) + offset
                arg2_end = arg2_begin + len(oie_component[4])
                arg2: EntityMention = EntityMention(
                    pack, arg2_begin, arg2_end)

                head_relation = RelationLink(pack, arg1, arg2)
                head_relation.rel_type = oie_component[2]

                offset += len(sentence) + 1

    self.set_text(pack, text)
    pack.pack_name = os.path.basename(file_path)
    yield pack

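# A sketch of one input line for the reader above, inferred from the
# column indices it uses (column 0: sentence, column 2: relation phrase,
# columns 3 and 4: the two arguments; column 1 is unused here and shown
# only as a placeholder). The content is made up:
sample_line = (
    "Alice founded the lab in 2010\t1.0\tfounded\tAlice\tthe lab")
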
def _process(self, input_pack: MultiPack):
    from_pack: DataPack = input_pack.get_pack(self.configs.copy_from)
    copy_pack: DataPack = input_pack.add_pack(self.configs.copy_to)

    copy_pack.set_text(from_pack.text)

    if from_pack.pack_name is not None:
        copy_pack.pack_name = from_pack.pack_name + "_copy"
    else:
        copy_pack.pack_name = "copy"

    s: Sentence
    for s in from_pack.get(Sentence):
        Sentence(copy_pack, s.begin, s.end)

    e: EntityMention
    for e in from_pack.get(EntityMention):
        EntityMention(copy_pack, e.begin, e.end)

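# A minimal usage sketch for the copier above, assuming the enclosing
# class is named `PackCopier` (hypothetical) and runs in a MultiPack
# pipeline; the Pipeline and MultiPack APIs are real Forte:
from forte.pipeline import Pipeline
from forte.data.multi_pack import MultiPack

pipeline = Pipeline[MultiPack]()
# ... set a MultiPack reader here ...
pipeline.add(PackCopier(),  # hypothetical class name
             config={"copy_from": "source", "copy_to": "target"})
pipeline.initialize()
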
def pack(
    self,
    pack: DataPack,
    predict_results: Dict[str, Dict[str, List[str]]],
    _: Optional[Annotation] = None,
):
    """Write the prediction results back to the datapack by writing
    the predicted NER tags to the original tokens.
    """
    if predict_results is None:
        return
    current_entity_mention: Tuple[int, str] = (-1, "None")

    for i in range(len(predict_results["Token"]["tid"])):
        # An instance.
        for j in range(len(predict_results["Token"]["tid"][i])):
            tid: int = predict_results["Token"]["tid"][i][j]  # type: ignore
            orig_token: Token = pack.get_entry(tid)  # type: ignore
            ner_tag: str = predict_results["Token"]["ner"][i][j]
            orig_token.ner = ner_tag

            token = orig_token
            token_ner = token.ner
            assert isinstance(token_ner, str)
            if token_ner[0] == "B":
                current_entity_mention = (token.begin, token_ner[2:])
            elif token_ner[0] == "I":
                continue
            elif token_ner[0] == "O":
                continue
            elif token_ner[0] == "E":
                if token_ner[2:] != current_entity_mention[1]:
                    continue
                entity = EntityMention(pack, current_entity_mention[0],
                                       token.end)
                entity.ner_type = current_entity_mention[1]
            elif token_ner[0] == "S":
                current_entity_mention = (token.begin, token_ner[2:])
                entity = EntityMention(pack, current_entity_mention[0],
                                       token.end)
                entity.ner_type = current_entity_mention[1]

def add_to_pack(self, pack: DataPack, instance: Annotation,
                prediction: List[int]):
    r"""Add the predicted attribute values to the instance.

    We make the following assumptions about the prediction:

        1. If we encounter "I" while its tag differs from the previous
           tag, we treat this "I" as a "B" and start a new tag there.
        2. We truncate the prediction to the number of entries in the
           instance. If the prediction contains `<PAD>` elements, this
           removes them.

    Args:
        pack (DataPack): The datapack that contains the current
            instance.
        instance (Annotation): The instance to which the extractor
            adds the prediction.
        prediction (Iterable[Union[int, Any]]): The output of the
            model, which contains the index for attributes of one
            instance.
    """
    instance_tagging_unit: List[Annotation] = \
        list(pack.get(self.config.tagging_unit, instance))
    prediction = prediction[:len(instance_tagging_unit)]
    tags = [self.id2element(x) for x in prediction]

    tag_start = None
    tag_end = None
    tag_type = None
    for entry, tag in zip(instance_tagging_unit, tags):
        if tag[1] == "O" or tag[1] == "B" or \
                (tag[1] == "I" and tag[0] != tag_type):
            if tag_type:
                entity_mention = EntityMention(pack, tag_start, tag_end)
                entity_mention.ner_type = tag_type
            tag_start = entry.begin
            tag_end = entry.end
            tag_type = tag[0]
        else:
            tag_end = entry.end

    # Handle the final tag.
    if tag_type is not None and \
            tag_start is not None and \
            tag_end is not None:
        entity_mention = EntityMention(pack, tag_start, tag_end)
        entity_mention.ner_type = tag_type

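# The loop above works on tags shaped like (ner_type, scheme_letter),
# e.g. ("PER", "B"), and assumes the element for "O" carries no type.
# A self-contained sketch of the same BIO decoding, including assumption 1
# ("I" after a different type starts a new span); the tag sequence is
# made up for illustration:
tags = [("PER", "B"), ("PER", "I"), ("LOC", "I"), (None, "O")]
spans, tag_type, start = [], None, None
for idx, (typ, letter) in enumerate(tags):
    if letter == "O" or letter == "B" or (letter == "I" and typ != tag_type):
        if tag_type:                       # close the previous span
            spans.append((start, idx - 1, tag_type))
        start, tag_type = idx, typ
    # otherwise the current span simply extends
if tag_type:                               # handle the final open span
    spans.append((start, len(tags) - 1, tag_type))
assert spans == [(0, 1, "PER"), (2, 2, "LOC")]
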
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack: DataPack = DataPack()
    with open(file_path, 'r', encoding='utf8') as fp:
        txt = ""
        offset = 0

        while True:
            sent_line: str = fp.readline()
            if not sent_line:
                break
            if len(sent_line.split()) == 0:
                continue
            relation_line: str = fp.readline()
            # The comment line is not used.
            _ = fp.readline()

            sent_line = sent_line[sent_line.find('"') + 1:
                                  sent_line.rfind('"')]
            index1 = sent_line.find("<e1>")
            index2 = sent_line.find("<e2>")
            # 5 is the length of "</e1>"; include both <e1> and
            # </e1> when extracting the string.
            e1 = sent_line[index1:sent_line.find("</e1>") + 5]
            e2 = sent_line[index2:sent_line.find("</e2>") + 5]
            # Remove <e1> and </e1> from the sentence.
            sent_line = sent_line.replace(e1, e1[4:-5])
            sent_line = sent_line.replace(e2, e2[4:-5])
            # Remove <e1> and </e1> from e1 (and likewise for e2).
            e1 = e1[4:-5]
            e2 = e2[4:-5]

            # Re-calculate the indices after removing <e1> and </e1>
            # from the sentence.
            if index1 < index2:
                diff1 = 0
                diff2 = 9
            else:
                diff1 = 9
                diff2 = 0
            index1 += offset - diff1
            index2 += offset - diff2

            Sentence(pack, offset, offset + len(sent_line))
            entry1 = EntityMention(pack, index1, index1 + len(e1))
            entry2 = EntityMention(pack, index2, index2 + len(e2))
            offset += len(sent_line) + 1
            txt += sent_line + " "

            pair = relation_line[relation_line.find("(") + 1:
                                 relation_line.find(")")]
            if "," in pair:
                parent, _ = pair.split(",")
                if parent == "e1":
                    relation = RelationLink(pack, entry1, entry2)
                else:
                    relation = RelationLink(pack, entry2, entry1)
                relation.rel_type = relation_line[:relation_line.find("(")]
            else:
                # For the "Other" relation, just set the parent as e1
                # and the child as e2.
                relation = RelationLink(pack, entry1, entry2)
                relation.rel_type = relation_line.strip()

    pack.set_text(txt, replace_func=self.text_replace_operation)
    pack.pack_name = os.path.basename(file_path)
    yield pack

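# The reader above consumes records in the SemEval-2010 Task 8 format:
# a numbered, quoted sentence line with <e1>/<e2> markers, a relation
# line, and a comment line. This example record is paraphrased from that
# format, not copied from the dataset:
sample_record = '''\
1\t"The <e1>company</e1> fabricates plastic <e2>chairs</e2>."
Product-Producer(e2,e1)
Comment:
'''
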
def _parse_pack(self, collection: str) -> Iterator[DataPack]:
    with open(collection, "r", encoding="utf8") as doc:
        pack_id: int = 0

        pack: DataPack = DataPack()
        text: str = ""
        offset: int = 0
        has_rows: bool = False

        sentence_begin: int = 0
        sentence_cnt: int = 0

        # The NER tag is either "O" or in the format "X-Y",
        # where X is one of B, I,
        # and Y is a tag like ORG, PER, etc.
        prev_y = None
        prev_x = None
        start_index = -1

        for line in doc:
            line = line.strip()

            if line.find("DOCSTART") != -1:
                # Skip the first DOCSTART.
                if offset == 0:
                    continue
                # Add the remaining sentence.
                if has_rows:
                    # Add the last sentence if it exists.
                    Sentence(pack, sentence_begin, offset - 1)
                    sentence_cnt += 1

                pack.set_text(text,
                              replace_func=self.text_replace_operation)
                Document(pack, 0, len(text))
                pack.pack_name = collection + "_%d" % pack_id
                pack_id += 1
                yield pack

                # Create a new datapack.
                pack = DataPack()
                text = ""
                offset = 0
                has_rows = False

                sentence_begin = 0
                sentence_cnt = 0

                prev_y = None
                prev_x = None
                start_index = -1

            elif line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[0]
                pos = conll_components[1]
                chunk_id = conll_components[2]

                ner_tag = conll_components[3]

                # A new NER tag occurs.
                if ner_tag == "O" or ner_tag.split("-")[0] == "B":
                    # Add the previous NER tag to the pack if it exists.
                    if prev_y is not None:
                        entity_mention = EntityMention(
                            pack, start_index, offset - 1)
                        entity_mention.ner_type = prev_y

                    # Start processing the current NER tag.
                    if ner_tag == "O":
                        # The current NER tag is O; reset the state.
                        prev_x = None
                        prev_y = None
                        start_index = -1
                    else:
                        # The current NER tag is B.
                        prev_x = "B"
                        prev_y = ner_tag.split("-")[1]
                        start_index = offset
                # This NER tag is connected to the previous one.
                else:
                    x, y = ner_tag.split("-")
                    assert x == "I", "Unseen tag %s in the file." % x
                    assert y == prev_y, "Error in %s." % ner_tag
                    assert prev_x in ("B", "I"), "Error in %s." % ner_tag
                    prev_x = "I"

                word_begin = offset
                word_end = offset + len(word)

                # Add the token.
                token = Token(pack, word_begin, word_end)
                token.pos = pos
                token.chunk = chunk_id

                text += word + " "
                offset = word_end + 1
                has_rows = True

            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # Add a sentence.
                Sentence(pack, sentence_begin, offset - 1)

                # Handle the last NER tag if it exists.
                if prev_x is not None:
                    entity_mention = EntityMention(pack, start_index,
                                                   offset - 1)
                    entity_mention.ner_type = prev_y

                # Reset the state.
                sentence_cnt += 1
                has_rows = False
                prev_y = None
                prev_x = None
                sentence_begin = offset

        if has_rows:
            # Add the last sentence if it exists.
            Sentence(pack, sentence_begin, offset - 1)
            sentence_cnt += 1

        pack.set_text(text, replace_func=self.text_replace_operation)
        Document(pack, 0, len(text))
        pack.pack_name = os.path.basename(collection)

        yield pack

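# The reader above expects the CoNLL-2003 column format: one token per
# line with POS, chunk, and NER columns, blank lines between sentences,
# and -DOCSTART- lines between documents. A well-known sample from the
# shared task:
sample = """\
-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
"""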