def extracNer_text(self, text):
    """Extract NER phrases and known stopword phrases from ``text`` and label them.

    Commas, periods and parentheses are replaced by the filler ``' va '``,
    every remaining punctuation character is removed, and the cleaned text
    is tagged with ``self.model``.  Entries of ``self.stopwords`` that occur
    in the cleaned text are added to the candidate phrases.  A phrase that is
    contained in another (equal or longer) phrase later in the length-sorted
    list is dropped.

    :param text: raw input string.
    :return: list of ``[phrase, label]`` pairs; ``label`` is
        ``self.entity[self.name.index(phrase + ' ')]`` when that lookup
        succeeds and ``'Ner'`` otherwise.
    """
    text = re.sub(r'[,\.\()]', ' va ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))

    sentence = Sentence(str(text))
    self.model.predict(sentence)

    # The surface text is taken from between the first pair of double quotes
    # in the span's string form.
    candidates = unique(
        [str(span).split('"')[1] for span in sentence.get_spans('ner')]
    )
    for word in self.stopwords:
        if word in text:
            candidates.append(word.strip())

    # Shortest first, so a phrase can only be contained in a later entry.
    candidates.sort(key=len)
    kept = [
        phrase
        for pos, phrase in enumerate(candidates)
        if phrase != ''
        and not any(phrase in later for later in candidates[pos + 1:])
    ]

    labelled = []
    for phrase in kept:
        try:
            label = self.entity[self.name.index(phrase + ' ')]
        except Exception:
            # Best-effort lookup: any failure falls back to the generic
            # label, without swallowing KeyboardInterrupt/SystemExit.
            label = 'Ner'
        labelled.append([phrase, label])
    return labelled
def extract_entities():
    """Run NER over the ``message`` field of the JSON request body.

    Aborts with status 400 when the body is missing/empty or has no
    ``message`` key.  Otherwise returns a ``(jsonify(...), 200)`` pair whose
    payload holds parallel lists, one element per NER span: token texts,
    tag, score (rounded to 2 decimals, as a string), start and end
    positions.
    """
    payload = request.json
    if not payload or 'message' not in payload:
        abort(400)

    sentence = Sentence(payload['message'], use_tokenizer=True)
    tagger.predict(sentence)
    spans = sentence.get_spans('ner')

    response = {
        'entities': [[str(token.text) for token in span.tokens] for span in spans],
        'tags': [span.tag for span in spans],
        'scores': [str(round(span.score, 2)) for span in spans],
        'start_positions': [int(span.start_pos) for span in spans],
        'end_positions': [int(span.end_pos) for span in spans],
    }
    return jsonify(response), 200
def train(corpus):
    """Train a CRF sequence tagger for the ``"ner"`` tag type on ``corpus``.

    Prints the corpus and the tag dictionary, trains into
    ``resources/taggers/presidio-ner`` with checkpointing enabled, then tags
    a sample sentence and prints the spans found.

    :param corpus: corpus object providing ``make_tag_dictionary``.
    :return: None
    """
    print(corpus)

    # Tag type to predict and the label dictionary derived from the corpus.
    label_type = "ner"
    label_dictionary = corpus.make_tag_dictionary(tag_type=label_type)
    print(label_dictionary.idx2item)

    # GloVe word vectors stacked with forward/backward Flair embeddings.
    stacked = StackedEmbeddings(
        embeddings=[
            WordEmbeddings("glove"),
            FlairEmbeddings("news-forward"),
            FlairEmbeddings("news-backward"),
        ]
    )

    tagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        tag_dictionary=label_dictionary,
        tag_type=label_type,
        use_crf=True,
    )

    trainer = ModelTrainer(tagger, corpus)
    # Checkpoint location; only needed when resuming training through
    # ModelTrainer.load_checkpoint(checkpoint, corpus), which is not done here.
    checkpoint = "resources/taggers/presidio-ner/checkpoint.pt"
    trainer.train(
        "resources/taggers/presidio-ner",
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=150,
        checkpoint=True,
    )

    # Smoke check: tag one sentence with the freshly trained model.
    sample = Sentence("I am from Jerusalem")
    tagger.predict(sample)
    print(sample)
    print("The following NER tags are found:")
    for span in sample.get_spans("ner"):
        print(span)
def get_top_class(tagger, row):
    """Tag ``row`` and record every NER span in the module-level ``ner_entities``.

    Each span's text is appended to ``ner_entities[span.tag]``
    (``ner_entities`` is defined outside this function; presumably a
    mapping from tag to list -- verify against its definition).

    :param tagger: object whose ``predict`` tags a ``Sentence`` in place.
    :param row: text to tag.
    :return: ``None`` on success, the string ``"problem"`` if anything in
        the tagging/recording step raises.
    """
    try:
        sentence = Sentence(row)
        tagger.predict(sentence)
        for entity in sentence.get_spans("ner"):
            ner_entities[entity.tag].append(entity.text)
    except Exception:
        # Still best-effort, but no longer hides KeyboardInterrupt/SystemExit
        # the way a bare ``except:`` did.
        return "problem"
    return None
def get_phrases(sentence):
    """Return ``[relations, entities]`` for the given text.

    ``entities`` holds, for each NER span, the span text split on
    whitespace; ``relations`` is always an empty list.
    """
    tagged = Sentence(sentence)
    ner_tagger.predict(tagged)
    entities = [span.text.split() for span in tagged.get_spans('ner')]
    relations = []
    return [relations, entities]
def flair_pos(language, input):
    """POS-tag ``input`` and print a tag -> occurrences mapping.

    For ``language == "french"`` the ``pos-multi`` model is used and the
    tagged string is scanned for ``<...>`` markers; each marker is mapped to
    the running ordinals at which it was found.  For any other language the
    ``pos`` model is used and each tag is mapped to a slice of the string
    form of the span's tokens.

    NOTE(review): the mappings are only printed; the function always
    returns an empty dict.
    """
    sentence = Sentence(input)

    if language == "french":
        tagger = SequenceTagger.load("pos-multi")
        tagger.predict(sentence)
        remaining = sentence.to_tagged_string()
        print(remaining)

        ordinals_by_tag = {}
        ordinal = 1
        while True:
            open_at = remaining.find("<")
            close_at = remaining.find(">")
            if open_at == -1 or close_at == -1:
                break
            print("start", open_at)
            print("end", close_at)
            pos = remaining[open_at:close_at + 1]
            print("POS", pos)
            # Continue scanning after the marker just consumed.
            remaining = remaining[close_at + 1:]
            ordinals_by_tag.setdefault(pos, []).append(ordinal)
            ordinal += 1
        print(ordinals_by_tag)
    else:
        tagger = SequenceTagger.load('pos')
        tagger.predict(sentence)
        print(sentence)
        print('The following POS tags are found:')

        values_by_tag = {}
        for word in sentence.get_spans('pos'):
            # Skip the first 8 characters of the tokens' string form and
            # keep everything up to the next space.
            rendered = str(word.tokens)[8:]
            rendered = rendered[:rendered.find(" ")]
            values_by_tag.setdefault(word.tag, []).append(rendered)
        print(values_by_tag)

    return dict()
def model_ner_PRG_flair(paragraph, type_question):
    """Return the texts of NER spans in ``paragraph`` relevant to ``type_question``.

    A span is kept when its tag is in ``interesting_entities(type_question)``
    and ``normalize_answer`` of its text is non-empty.
    """
    sentence = Sentence(paragraph)
    tagger.predict(sentence)
    return [
        span.text
        for span in sentence.get_spans('ner')
        if span.tag in interesting_entities(type_question)
        and len(normalize_answer(span.text))
    ]
def evaluate(tagger: SequenceTagger, content: str) -> Dict[Any, Any]:
    """Tag ``content`` and return its NER spans as plain dictionaries.

    :param tagger: model used to predict on the sentence.
    :param content: text to analyse.
    :return: ``{"entities": [...]}`` where each element is
        ``asdict(NerEntity.from_span(span))``.
    """
    sentence = Sentence(content)
    tagger.predict(sentence)

    entities = []
    for span in sentence.get_spans('ner'):
        entities.append(asdict(NerEntity.from_span(span)))
    return {"entities": entities}
def add_ner_predictions():
    """Yield each record from ``prep_sentences()`` enriched with two predictions.

    ``record["hf"]`` receives the output of ``hf_tagger`` on the sentence
    text and ``record["flair"]`` the spans produced by ``flair_tagger``.
    """
    for record in prep_sentences():
        record["hf"] = hf_tagger(record["sentence"])

        # The Flair tagger annotates the Sentence object in place, so a
        # fresh Sentence is built from the raw text for it.
        flair_sentence = Sentence(record["sentence"])
        flair_tagger.predict(flair_sentence)
        record["flair"] = flair_sentence.get_spans()

        yield record
def test_sequence_tagger_transformer_finetune(results_base_path, tasks_base_path):
    """Fine-tune a transformer tagger (no CRF, no RNN) and check the reloaded model.

    Trains on the ``trivial/trivial_bioes`` column corpus, reloads
    ``final-model.pt`` from ``results_base_path``, verifies prediction works
    on single, batched and empty sentences, and expects a micro-average F1
    of 1.0 on the corpus test split.
    """
    flair.set_seed(123)

    # Two-column corpus: token text and its NER tag.
    corpus = ColumnCorpus(
        data_folder=tasks_base_path / "trivial" / "trivial_bioes",
        column_format={0: "text", 1: "ner"},
    )
    label_dictionary = corpus.make_label_dictionary("ner")

    # Plain linear tagging head directly on the fine-tuned transformer.
    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=TransformerWordEmbeddings("distilbert-base-uncased", fine_tune=True),
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )

    trainer = ModelTrainer(tagger, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-4,
    )

    loaded_model = SequenceTagger.load(results_base_path / "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence(" ")

    # Prediction must cope with a single sentence, a mixed batch and a
    # batch containing only an empty sentence.
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    predicted_texts = [span.text for span in sentence.get_spans("ner")]
    assert "New York" in predicted_texts

    # The trivial corpus should be fit perfectly.
    result = loaded_model.evaluate(corpus.test, gold_label_type="ner")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
def get_entities(self, the_question, model):
    """Return the texts of ``PER``, ``MISC`` and ``LOC`` spans in ``the_question``.

    :param the_question: question text to tag.
    :param model: tagger whose ``predict`` annotates the sentence.
    :return: list of strings, each the span's token texts joined by a space.
    """
    wanted_tags = ("PER", "MISC", "LOC")

    tagged_question = Sentence(the_question)
    model.predict(tagged_question)

    entities = []
    for span in tagged_question.get_spans('ner'):
        if span.tag in wanted_tags:
            entities.append(" ".join(tok.text for tok in span.tokens))
    return entities
def predict(self, text):
    """Tag ``text`` with the ``'ner'`` model and return ``(entity_text, tag)`` pairs.

    The entity text is rebuilt by joining the span's token texts with a
    single space.  The model is loaded on every call.
    """
    sentence = Sentence(text)
    ner_model = SequenceTagger.load('ner')
    ner_model.predict(sentence)
    return [
        (' '.join(token.text for token in entity.tokens), entity.tag)
        for entity in sentence.get_spans('ner')
    ]
def analyse(self, text: str, entities: List[str]) -> List[Entity]:
    """Return an ``Entity`` for every NER span whose tag is requested.

    :param text: text to tag with ``self.model``.
    :param entities: tags to keep; checked first by ``self.validate_entities``.
    :return: ``Entity(tag, start_pos, end_pos)`` objects in span order.
    """
    self.validate_entities(entities)

    sentence = Sentence(text)
    self.model.predict(sentence)

    return [
        Entity(span.tag, span.start_pos, span.end_pos)
        for span in sentence.get_spans("ner")
        if span.tag in entities
    ]
def get_named_entity_types(self, token):
    """Return the entity type of every NER span found in ``token``.

    The short tags ``LOC``, ``ORG`` and ``PER`` are expanded to
    ``LOCATION``, ``ORGANIZATION`` and ``PERSON``; any other tag is returned
    unchanged.
    """
    long_names = {
        'LOC': 'LOCATION',
        'ORG': 'ORGANIZATION',
        'PER': 'PERSON',
    }

    sentence = Sentence(token)
    self.tagger.predict(sentence)

    entity_types = []
    for span in sentence.get_spans('ner'):
        entity_types.append(long_names.get(span.tag, span.tag))
    return entity_types
def test_sequence_tagger_with_crf(results_base_path, tasks_base_path):
    """Train a CRF tagger on the trivial BIOES corpus and check the reloaded model.

    Reloads ``final-model.pt`` from ``results_base_path``, verifies that
    prediction works on single, batched and empty sentences, expects a
    micro-average F1 of 1.0 on the test split, and removes the results
    directory afterwards.
    """
    flair.set_seed(123)

    # Two-column corpus: token text and its NER tag.
    corpus = ColumnCorpus(
        data_folder=tasks_base_path / "trivial" / "trivial_bioes",
        column_format={0: "text", 1: "ner"},
    )
    label_dictionary = corpus.make_label_dictionary("ner")

    # Tagger with a CRF decoding layer on top of the shared embeddings.
    tagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=label_dictionary,
        tag_type="ner",
        use_crf=True,
    )

    trainer = ModelTrainer(tagger, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=False,
    )

    loaded_model = SequenceTagger.load(results_base_path / "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence(" ")

    # Prediction must cope with a single sentence, a mixed batch and a
    # batch containing only an empty sentence.
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    predicted_texts = [span.text for span in sentence.get_spans('ner')]
    assert "New York" in predicted_texts

    # The trivial corpus should be fit perfectly.
    result = loaded_model.evaluate(corpus.test, gold_label_type='ner')
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    # Remove everything the training run wrote.
    shutil.rmtree(results_base_path)
    del loaded_model
def test_span_tags():
    """Check label/span bookkeeping when one span carries several labels.

    Three ``ner`` labels are placed on two spans (the first span is
    labelled twice); the labels of the first span are then removed and the
    remaining state is verified.
    """
    university = "Humboldt Universität zu Berlin"

    sentence = Sentence(
        "Humboldt Universität zu Berlin is located in Berlin .")
    sentence[0:4].add_label("ner", "Organization")
    sentence[0:4].add_label("ner", "University")
    sentence[7:8].add_label("ner", "City")

    # Three labels, in insertion order, with the right text and value.
    labels = sentence.get_labels("ner")
    assert len(labels) == 3
    assert labels[0].data_point.text == university
    assert labels[0].value == "Organization"
    assert labels[1].data_point.text == university
    assert labels[1].value == "University"
    assert labels[2].data_point.text == "Berlin"
    assert labels[2].value == "City"

    # Only two distinct spans; the first one holds two labels.
    spans = sentence.get_spans("ner")
    assert len(spans) == 2
    assert spans[0].text == university
    assert len(spans[0].get_labels("ner")) == 2
    assert spans[1].text == "Berlin"
    assert spans[1].get_label("ner").value == "City"

    # Strip every NER label from the first span.
    sentence[0:4].remove_labels("ner")

    # One label is left ...
    labels = sentence.get_labels("ner")
    assert len(labels) == 1
    assert labels[0].data_point.text == "Berlin"
    assert labels[0].value == "City"

    # ... and one span.
    spans = sentence.get_spans("ner")
    assert len(spans) == 1
    assert spans[0].text == "Berlin"
    assert spans[0].get_label("ner").value == "City"
def get_reason_for_appearance(organisation: Span, sentence: Sentence):
    """Return the sentence text that follows an organisation mention.

    The text starts at the first ``frame`` span located after the end of
    ``organisation``; when there is none, at the first ``pos`` span whose
    tag contains ``"VBD"`` after it.

    :param organisation: span of the organisation; only ``end_pos`` is read.
    :param sentence: sentence carrying ``frame`` and ``pos`` annotations.
    :return: tail of the original text from the chosen span's start, or
        ``None`` when neither kind of span follows the organisation.
    """
    org_end = organisation.end_pos

    frames_after_org = [
        span for span in sentence.get_spans("frame")
        if span.start_pos > org_end
    ]
    past_verbs_after_org = [
        span for span in sentence.get_spans("pos")
        if "VBD" in span.tag and span.start_pos > org_end
    ]

    # Frame spans take precedence; past-tense verbs are the fallback.
    candidates = frames_after_org or past_verbs_after_org
    if not candidates:
        return None

    anchor = candidates[0]
    return sentence.to_original_text()[anchor.start_pos:]
def tagIt(title, noOfTags):
    """Append the text of every NER span found in ``title`` to ``noOfTags``.

    The ``'ner'`` model is loaded on each call.  ``noOfTags`` is modified
    in place; nothing is returned.
    """
    tagged_title = Sentence(title)

    ner_model = SequenceTagger.load('ner')
    ner_model.predict(tagged_title)

    for span in tagged_title.get_spans('ner'):
        noOfTags.append(span.text)
def split_to_spans(s: "Sentence"):
    """Split a tagged sentence into consecutive ``(text, tag)`` pieces.

    Text covered by a ``'ner'`` span is emitted with the span's tag; text
    between spans (and before the first / after the last one) is emitted
    with tag ``None``.  Slices are taken from ``s.to_original_text()`` using
    each span's ``start_pos`` / ``end_pos``.

    :param s: sentence providing ``to_original_text()`` and
        ``get_spans('ner')``.
    :return: list of ``(substring, tag_or_None)`` tuples in text order.
    """
    orig = s.to_original_text()
    pieces = []
    cursor = 0

    for ent in s.get_spans('ner'):
        if cursor != ent.start_pos:
            pieces.append((orig[cursor:ent.start_pos], None))
        pieces.append((orig[ent.start_pos:ent.end_pos], ent.tag))
        cursor = ent.end_pos

    # Emit whatever follows the last entity.  Comparing against len(orig)
    # (not len(orig) - 1) keeps a single trailing character such as a final
    # punctuation mark.
    if cursor < len(orig):
        pieces.append((orig[cursor:], None))
    return pieces
def extract_named_entity(Idea_Text):
    """Tag ``Idea_Text`` and return its named entities as name/type records.

    Non-ASCII characters are dropped before tagging.  Name and type are
    parsed from each span's string form: the type is the part before the
    first ``'-'`` and the name the part after the first ``':'``, cut at the
    first ``'.'`` and stripped of whitespace and double quotes.
    NOTE(review): this relies on the span string layout of the installed
    Flair version -- confirm before upgrading.

    :param Idea_Text: raw text.
    :return: ``{'Entities': [{'Name': ..., 'Type': ...}, ...]}``.
    """
    # Pre-processing: silently drop every non-ASCII character.
    text_input = Idea_Text.encode("ascii", errors="ignore").decode()

    sentence = Sentence(text_input)
    tagger.predict(sentence)

    list_ner_entity = []
    for entity in sentence.get_spans('ner'):
        rendered = str(entity).strip()
        named_entity = rendered.split('-')[0]
        entity_value = rendered.split(':')[1]
        if entity_value:
            # Keep only what precedes the first period.
            entity_value = re.sub(r'\..*', '', entity_value)
        list_ner_entity.append({
            'Name': entity_value.strip().replace('"', ''),
            'Type': named_entity,
        })

    return {'Entities': list_ner_entity}
def train(corpus):
    """Train a CRF sequence tagger for the ``'ner'`` tag type on ``corpus``.

    Uses GloVe, forward/backward Flair and BERT embeddings stacked
    together.  Prints the corpus and the tag dictionary, trains into
    ``resources/taggers/presidio-ner`` with checkpointing enabled, then tags
    a sample sentence and prints the spans found.

    :param corpus: corpus object providing ``make_tag_dictionary``.
    :return: None
    """
    print(corpus)

    # Tag type to predict and the label dictionary derived from the corpus.
    label_type = 'ner'
    label_dictionary = corpus.make_tag_dictionary(tag_type=label_type)
    print(label_dictionary.idx2item)

    stacked = StackedEmbeddings(
        embeddings=[
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings(),
        ]
    )

    tagger = SequenceTagger(
        hidden_size=256,
        embeddings=stacked,
        tag_dictionary=label_dictionary,
        tag_type=label_type,
        use_crf=True,
    )

    trainer = ModelTrainer(tagger, corpus)
    # Checkpoint location; only needed when resuming training through
    # ModelTrainer.load_checkpoint(checkpoint, corpus), which is not done here.
    checkpoint = 'resources/taggers/presidio-ner/checkpoint.pt'
    trainer.train(
        'resources/taggers/presidio-ner',
        learning_rate=0.1,
        mini_batch_size=32,
        max_epochs=150,
        checkpoint=True,
    )

    # Smoke check: tag one sentence with the freshly trained model.
    sample = Sentence('I am from Jerusalem')
    tagger.predict(sample)
    print(sample)
    print('The following NER tags are found:')
    for span in sample.get_spans('ner'):
        print(span)
def extract(self, sentence: str) -> Dict[str, Dict[str, Union[str, Tuple]]]:
    """Group the valid NER spans of ``sentence`` by tag.

    Only spans whose tag is in ``self.valid_entity_types`` are kept.  Each
    kept span becomes ``{"text": span text, "span": (start position of
    the first token, number of tokens)}``.

    :param sentence: text to tag with ``self.nlp``.
    :return: dict with tags in sorted order; each value is the list of
        records for that tag, in the order the spans were found.
    """
    doc = Sentence(sentence)
    self.nlp.predict(doc)

    records_by_tag = {}
    for entity in doc.get_spans('ner'):
        if entity.tag not in self.valid_entity_types:
            continue
        record = {
            "text": entity.text,
            "span": (entity.tokens[0].start_pos, len(entity.tokens)),
        }
        records_by_tag.setdefault(entity.tag, []).append(record)

    # Emit the groups in sorted tag order.
    return {tag: records_by_tag[tag] for tag in sorted(records_by_tag)}
def __call__(self, doc):
    """Add the entities found by ``self.tagger`` to ``doc.ents`` and return ``doc``.

    The document text is tagged, and for every NER match a span is
    requested from ``doc.char_span`` using the match's ``start_pos`` /
    ``end_pos`` and the value of its first label.  Matches that cannot be
    turned into a span, or that ``doc.ents`` rejects, are skipped.

    :param doc: document exposing ``text``, ``char_span`` and ``ents``
        (presumably a spaCy ``Doc`` -- confirm against the pipeline setup).
    :return: the same ``doc`` object.
    """
    sent = Sentence(doc.text)
    self.tagger.predict(sent)

    for match in sent.get_spans('ner'):
        info = match.to_dict()
        span = doc.char_span(
            info.get('start_pos'),
            info.get('end_pos'),
            label=info.get('labels')[0].value,
        )
        if span is None:
            # No span could be built for these character offsets.
            continue
        try:
            doc.ents = list(doc.ents) + [span]
        except ValueError:
            # An entity already covers (part of) this range: keep the
            # existing one and move on.
            continue
    return doc
def get_entities(self, text: str) -> List[str]:
    """Return the text of every named entity ``self.tagger`` finds in ``text``.

    Override this method to plug in a different NER model.

    :param text: text used for NER extraction.
    :return: entity texts, in the order the spans are reported.
    """
    tagged = Sentence(text)
    self.tagger.predict(tagged)
    return [span.text for span in tagged.get_spans('ner')]
def predict_ner(sent):
    """Tag ``sent``, print the results and return ``print_ner_tags``' output.

    Prints the tagged string and the ``{entity text: tag}`` mapping (a
    repeated entity text keeps the tag of its last occurrence), then passes
    the mapping and the original text to ``print_ner_tags``.
    """
    tagged = Sentence(sent)
    ner_model.predict(tagged)
    print(tagged.to_tagged_string())

    tags = {span.text: span.tag for span in tagged.get_spans('ner')}
    print(tags)

    return print_ner_tags(tags, sent)
def get_line_parts_flair(self, line: 'Line'):
    """Tag each part of ``line`` and keep the first entity text per tag.

    The parts come from ``self.split_line(line)`` (per the original note,
    the line is split on commas because Flair is insensitive to them).
    Every entity found is also printed as ``text, tag| `` on one line.

    :return: ``defaultdict`` mapping tag -> entity text; missing tags read
        as ``None``.
    """
    line_parts = defaultdict(lambda: None)

    for chunk in self.split_line(line):
        tagged = Sentence(chunk)
        self.flair_tagger.predict(tagged)
        for span in tagged.get_spans('ner'):
            tag, surface = span.tag, span.text
            # Only one extraction per tag is stored; later ones are ignored
            # once a non-empty value is present.
            if not line_parts[tag]:
                line_parts[tag] = surface
            print(f"{surface}, {tag}| ", end="")
    print()

    return line_parts
def get_important_words(corpus: Corpus, preprocess_pipeline: Optional[List] = None) -> Tuple[Dict, Corpus]:
    """Collect the named entities of every document in the preprocessed corpus.

    The corpus is run through a ``Preprocessor`` built from
    ``preprocess_pipeline`` and each resulting document is tagged with the
    ``'de-ner'`` model.

    :param corpus: corpus to process.
    :param preprocess_pipeline: pipeline handed to ``Preprocessor``.
    :return: ``(important_words, corpus_processed)`` where
        ``important_words`` maps ``doc.id_`` to the list of entity texts.
    """
    tagger = SequenceTagger.load('de-ner')
    corpus_processed = Preprocessor(pipeline=preprocess_pipeline).process(corpus=corpus)

    important_words = {}
    for doc in tqdm(corpus_processed.documents):
        tagged = Sentence(doc.text)
        tagger.predict(tagged)
        important_words[doc.id_] = [span.text for span in tagged.get_spans('ner')]

    return important_words, corpus_processed
def correct_who_to_whom(text):
    """Print ``text`` with object-position "who" replaced via ``whom_string``.

    A "who" token is replaced when its dependency label is ``dobj``,
    ``iobj`` or ``pobj``, ``check_for_exceptions`` does not exempt it, and
    its character offset does not fall inside any NER span of the text.
    The (possibly corrected) text is printed; nothing is returned.

    If the text contains one of the tokens 'grinch', 'whoville', 'scooby'
    or 'horton', the function returns immediately without printing.
    """
    doc = nlp(text)
    tokenized_text = []

    for token in doc:
        tokenized_text.append(token.text_with_ws)
        lowered = token.text.lower()

        # It is very hard for a named-entity recognizer to recognise 'Who'
        # in isolation (the motivating text was the repeated exclamation
        # 'Who! Who!' in a Grinch fan fiction), so such texts are skipped.
        if lowered in ['grinch', 'whoville', 'scooby', 'horton']:
            return

        if lowered != 'who':
            continue
        if token.dep_ not in ['dobj', 'iobj', 'pobj']:
            continue
        # Hard-coded exceptions.
        if check_for_exceptions(doc, token):
            continue

        sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
        tagger.predict(sentence)

        # Leave the token alone when it is part of a named entity.
        inside_entity = any(
            entity.start_pos <= token.idx <= entity.end_pos
            for entity in sentence.get_spans('ner')
        )
        if not inside_entity:
            # Only this token's slot is replaced; the surrounding pieces
            # are joined back unchanged below.
            tokenized_text[token.i] = whom_string(token.text_with_ws, True)

    # Print the text with the corrections applied.
    corrected_text = ''.join(tokenized_text)
    print(corrected_text)
def get_source_frames(sentence_tokens, frame_tagger):
    """Return the frame annotations predicted for a tokenised sentence.

    :param sentence_tokens: tokens; joined with single spaces before tagging.
    :param frame_tagger: tagger to use; when falsy no tagging happens.
    :return: list of dicts with keys ``predicate_sense`` (the frame tag),
        ``predicate_word`` (tuple of token texts) and ``predicate_ix``
        (tuple of ``tok.idx - 1`` values).  Frames tagged ``"_"`` are
        skipped.  Empty list when ``frame_tagger`` is falsy.
    """
    if not frame_tagger:
        return []

    sentence_obj = Sentence(" ".join(sentence_tokens))
    frame_tagger.predict(sentence_obj)

    frames = []
    for frame in sentence_obj.get_spans('frame'):
        if frame.tag == "_":
            continue
        # Token indices are shifted down by one before being stored.
        indices, tokens = zip(*[(tok.idx - 1, tok.text) for tok in frame.tokens])
        frames.append({
            "predicate_sense": frame.tag,
            "predicate_word": tokens,
            "predicate_ix": indices,
        })
    return frames
def get_flair_ner(text):
    """Tag ``text`` with the ``'multi-ner-fast'`` model and return its NER spans.

    The model is loaded on every call.
    """
    tagged = Sentence(text)

    ner_model = SequenceTagger.load('multi-ner-fast')
    ner_model.predict(tagged)

    spans = tagged.get_spans('ner')
    return spans