def _split_long_sentences(self, sentences):
    """Replace over-long sentences by shorter pieces.

    A sentence is "too long" when the tokenizer of the model's embeddings
    produces more than ``self.max_length`` sub-word tokens for its tokenized
    string. Such a sentence is cut with ``array_split`` into
    ``len_bpe // max_length + 1`` pieces; each piece becomes a new
    ``Sentence`` whose character offsets are rebased to start at zero.

    The tokens are moved into the new sentences (not copied), so their
    ``start_pos``/``end_pos`` are modified in place.

    Args:
        sentences (list): list of flair's Sentences

    Returns:
        list: new list where every over-long sentence is replaced, in
        position, by its pieces; short sentences are kept as-is.
    """
    bpe_tokenizer = self.model.embeddings.tokenizer
    result = []
    for original in sentences:
        bpe_count = len(bpe_tokenizer.tokenize(original.to_tokenized_string()))
        if bpe_count <= self.max_length:
            result.append(original)
            continue

        piece_count = bpe_count // self.max_length + 1
        for chunk in array_split(original, piece_count):
            # Offset of the first token; read before the tokens are shifted.
            shift = chunk[0].start_pos
            part = Sentence()
            for tok in chunk:
                tok.start_pos -= shift
                tok.end_pos -= shift
                part.add_token(tok)
            chunk[-1].whitespace_after = False
            result.append(part)

    logger.debug(f'Lengths before split: {[len(x) for x in sentences]}')
    logger.debug(f'Lengths after split: {[len(x) for x in result]}')
    return result
def predict_sentence_entities(tagger, sent, all_entities, phase=1):
    """Tag one pre-tokenised sentence and record the prediction time.

    Args:
        tagger: mapping with the keys ``'model'`` (object with a ``predict``
            method) and ``'model_id'`` (key into the module-level
            ``elapsed_times`` statistics).
        sent: mapping with ``'normwords'`` (token texts) and ``'starts'``
            (start offset per token, convertible with ``int``).
        all_entities: passed through unchanged to ``make_entities``.
        phase: unused; kept for interface compatibility.

    Side effects:
        Updates ``elapsed_times[model_id]['freqs']`` and ``['times']``, both
        keyed by the sentence length in words, and calls ``make_entities``
        with the predicted "ner" spans.
    """
    sentence = Sentence()
    for position, word in enumerate(sent['normwords']):
        sentence.add_token(
            Token(word, position, None,
                  start_position=int(sent['starts'][position])))

    seqtagger = tagger['model']
    model_id = tagger['model_id']

    # perf_counter is monotonic, so the measured duration can never be
    # negative (the reason the wall-clock version needed a "< 0" guard).
    started = time.perf_counter()
    seqtagger.predict(sentence)
    elapsed = np.round((time.perf_counter() - started) * 1000, 0)

    word_count = len(sent['normwords'])
    stats = elapsed_times[model_id]
    if stats['freqs'].get(word_count) is not None:
        stats['freqs'][word_count] += 1
        stats['times'][word_count] += elapsed
    else:
        stats['freqs'][word_count] = 1
        stats['times'][word_count] = elapsed

    ner_spans = sentence.get_spans("ner")
    make_entities(sent, all_entities, ner_spans, False, model_id)
def benchmark_flair_mdl():
    """Run the flair NER model over the module-level benchmark data.

    Reads the globals ``sentences_tokens``, ``num_sentences``, ``num_tokens``
    and ``sentences_entities``; prints the elapsed time (model loading is not
    included) and a classification report.
    """
    tagger = load_flair_ner_model()

    start = time.time()

    flair_sentences = []
    for token_texts in sentences_tokens:
        current = Sentence()
        for text in token_texts:
            current.add_token(Token(text))
        flair_sentences.append(current)

    tagger.predict(flair_sentences, verbose=True)

    predictions = []
    for tagged in flair_sentences:
        predictions.append([tok.tags['ner'].value for tok in tagged])

    print('Flair:')
    print("Made predictions on {} sentences and {} tokens in {}s".format(
        num_sentences, num_tokens, time.time() - start))

    assert len(predictions) == num_sentences

    report = classification_report(sentences_entities,
                                   remove_miscs(predictions),
                                   digits=4)
    print(report)
def pad_sequence(self, sentences, labelVoc, word_maxlen=30, sent_maxlen=35):
    """Turn labelled sentences into padded id sequences plus flair sentences.

    Args:
        sentences: iterable of sentences; each sentence is a sequence of
            items whose element ``[0]`` is the word and ``[1]`` its label.
        labelVoc: mapping from label to integer id.
        word_maxlen: unused by this method; kept for interface
            compatibility.
        sent_maxlen: target length handed to ``self.pad_sequences`` for both
            the word-id and the label-id sequences.

    Returns:
        list: ``[x, x_flair, y]`` where ``x`` is the padded word-id
        sequences, ``x_flair`` the unpadded flair ``Sentence`` objects and
        ``y`` the padded label ids as a numpy array.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    vocab = tokenizer.vocab
    # Words missing from the BERT vocabulary are mapped to the [MASK] id.
    mask_id = vocab['[MASK]']

    x = []
    x_flair = []
    y = []
    for sentence in sentences:
        word_ids = []
        label_ids = []
        flair_sentence = Sentence()
        for word_label in sentence:
            word = word_label[0]
            word_ids.append(vocab.get(word.lower(), mask_id))
            flair_sentence.add_token(word)
            label_ids.append(labelVoc[word_label[1]])
        x.append(word_ids)
        x_flair.append(flair_sentence)
        y.append(label_ids)

    y = self.pad_sequences(y, sent_maxlen)
    x = self.pad_sequences(x, sent_maxlen)
    y = np.asarray(y)
    return [x, x_flair, y]
def _convert_to_flair(self, data, labels=None):
    """
    Build flair.data.Sentence objects from token lists and optional tags.

    Parameters
    ----------
    data : list(list(str))
        one inner list of tokens per sentence.
    labels : list(list(str)), can be None
        NER tags aligned with ``data``. When None, no "ner" tag is set on
        the tokens.

    Returns
    -------
    sentences : list(flair.data.Sentence)
    """
    has_labels = labels is not None
    # Without labels, the tokens double as placeholders so zip() below still
    # yields one pair per token.
    tag_rows = labels if has_labels else data

    result = []
    for tokens, tags in zip(data, tag_rows):
        flair_sentence = Sentence()
        for text, tag in zip(tokens, tags):
            flair_token = Token(text)
            if has_labels:
                flair_token.add_tag("ner", tag)
            flair_sentence.add_token(flair_token)
        result.append(flair_sentence)
    return result
def _embed(self, x):
    """Embed a batch of index sequences with ``self.embeddings``.

    Each sequence is read up to (not including) the first ``self.pad_index``.
    Indices are mapped to words through ``self.vocab_idx`` ('[UNK]' when
    missing), embedded as one flair ``Sentence``, and the result is padded
    with zero rows back to the original sequence length.

    Returns:
        The per-sequence embedding tensors stacked along a new first
        dimension.
    """
    lookup = self.vocab_idx
    batch = []
    for sequence in x:
        # Starts at the full length and counts down once per real token, so
        # it ends up as the number of padding positions.
        remaining_padding = sequence.size(0)
        flair_sentence = Sentence()
        for raw_index in sequence:
            word_id = raw_index.item()
            if word_id == self.pad_index:
                break  # skip padding
            remaining_padding -= 1
            flair_sentence.add_token(lookup.get(word_id, '[UNK]'))

        self.embeddings.embed(flair_sentence)
        embedded = torch.stack(
            [tok.embedding for tok in flair_sentence.tokens])

        if remaining_padding:
            filler = torch.zeros(remaining_padding,
                                 embedded.size(-1),
                                 device=embedded.device)
            embedded = torch.cat((embedded, filler))
        batch.append(embedded)
    return torch.stack(batch)
def process_conll_doc(input_file_name, output_file_name):
    """Add entity-linking predictions to a tab-separated CoNLL file.

    The input is read as documents separated by lines containing
    "DOCSTART". Each other line is one token: column 0 is the text, columns
    1-3 (when present) are stored as the tags 'nero', 'nme' and 'wiki'.
    Token offsets are assigned as if the tokens were joined by single spaces.

    For each document the tokenized text is posted to ``NEURAL_EL_SERVER``;
    every returned item is read as ``(start, length, label)`` and the label
    is stored as tag 'pnme' on all tokens starting inside that range. The
    result is written as text, nero, nme, wiki, pnme separated by tabs.

    Args:
        input_file_name: path of the CoNLL file to read.
        output_file_name: path of the file to write.
    """
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0
        for line in input_file:
            if "DOCSTART" in line:
                if doc is not None:
                    docs.append(doc)
                doc = Sentence()
                spos = 0
                continue

            lsplit = line.split("\t")
            token = Token(lsplit[0].strip())
            for c, tag_type in columns.items():
                if c != 0 and c < len(lsplit):
                    token.add_tag(tag_type, lsplit[c].strip())
            token.start_pos = spos
            token.end_pos = spos + len(token.text)
            spos = token.end_pos + 1
            doc.add_token(token)

        # The final document has no DOCSTART line after it, so it must be
        # flushed here; otherwise it would never be processed or written.
        if doc is not None and len(doc.tokens) > 0:
            docs.append(doc)

        for d in docs:
            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": []
            }
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()
            for i in info:
                entity_ran = range(i[0], i[0] + i[1])
                for t in d.tokens:
                    # Compare against the offset assigned above.
                    if t.start_pos in entity_ran:
                        t.add_tag("pnme", i[2])
            for t in d:
                fields = (
                    t.text,
                    t.get_tag("nero").value,
                    t.get_tag("nme").value,
                    unidecode.unidecode(t.get_tag("wiki").value),
                    t.get_tag("pnme").value,
                )
                output_file.write("\t".join(fields) + "\n")
def test_sentence_to_tagged_string():
    """A tagged token is rendered as ``text <TAG>`` by to_tagged_string()."""
    first = Token('I', 0)
    second = Token('love', 1, 0)
    third = Token('Berlin', 2, 1)
    third.add_tag('ner', 'LOC')

    sentence = Sentence()
    for tok in (first, second, third):
        sentence.add_token(tok)

    expected = 'I love Berlin <LOC>'
    assert (expected == sentence.to_tagged_string())
def predict(self, sentence):
    """Run ``self.model`` on the given words.

    The words are added one by one to a fresh flair ``Sentence``; the model
    writes its output under the label name "predicted".

    Returns:
        list of ``(upos value, predicted value)`` tuples, one per token.
    """
    flair_sentence = Sentence()
    for word in sentence:
        flair_sentence.add_token(word)

    self.model.predict(flair_sentence, label_name="predicted")

    return [(tok.get_tag("upos").value, tok.get_tag("predicted").value)
            for tok in flair_sentence]
def predict(self, text: Generator[list[str], None, None]) -> list[list[str]]:
    """Predict one NER tag per word for every word list in *text*.

    Args:
        text: yields one list of words per sentence. It is consumed once.

    Returns:
        For each sentence, the ``'ner'`` tag value of each token, in order.
    """
    # Generator needs all three type parameters (yield, send, return);
    # typing.Generator with a single one raises before Python 3.13.
    flair_sents = []
    for words in text:
        s = Sentence()
        for word in words:
            s.add_token(Token(word))
        flair_sents.append(s)

    self.model.predict(flair_sents)
    return [[tok.tags["ner"].value for tok in s] for s in flair_sents]
def test_get_head():
    """get_head() follows the head id given as third Token argument."""
    pronoun = Token('I', 0)
    verb = Token('love', 1, 0)
    city = Token('Berlin', 2, 1)

    sentence = Sentence()
    for tok in (pronoun, verb, city):
        sentence.add_token(tok)

    assert (verb == city.get_head())
    assert (pronoun == verb.get_head())
    # The first token was created without a head id.
    assert (None == pronoun.get_head())
def sent_to_flair(sent):
    """
    Convert a tokenized sentence (list of words) to a Flair sentence object.

    After all words are added, ``infer_space_after()`` is called on the
    sentence before it is returned.
    """
    flair_sentence = Sentence()
    for word in sent:
        flair_sentence.add_token(Token(word))
    flair_sentence.infer_space_after()
    return flair_sentence
def embed_sentence(self, sentence):
    """Embed one raw sentence with ``self.embedder``.

    The sentence is padded with "__PAD__" tokens until it has at least
    ``self.MAX_LEN`` tokens; longer sentences are left as they are.

    Args:
        sentence (str): raw sentence

    Returns:
        np.array: one row per token (padding included), stacked from the
        token embeddings.
    """
    padded = Sentence(sentence)
    while len(padded) < self.MAX_LEN:
        padded.add_token(Token("__PAD__"))

    self.embedder.embed(padded)

    rows = []
    for tok in padded:
        rows.append(tok.embedding.cpu().numpy())
    return np.stack(rows)
def get_tags(line, tagger):
    """Tag the tokens in *line* and return the tags read from the tagged string.

    The tags are recovered from ``to_tagged_string()`` by taking every second
    whitespace-separated item and stripping its first and last character.
    NOTE(review): this assumes every token is followed by exactly one
    ``<TAG>`` item and that no token contains whitespace - confirm for the
    tagger in use.
    """
    sentence = Sentence()
    for word in line:
        sentence.add_token(Token(word))

    tagger.predict(sentence)

    pieces = sentence.to_tagged_string().split()
    # Items at odd positions are the "<TAG>" markers; drop the brackets.
    return [marker[1:-1] for marker in pieces[1::2]]
def test_sentence_add_token():
    """add_token() accepts Token objects as well as a plain string."""
    sentence = Sentence()

    for text in ('Munich', 'and', 'Berlin', 'are', 'nice'):
        sentence.add_token(Token(text))

    # A plain string must be accepted too.
    sentence.add_token('cities')
    sentence.add_token(Token('.'))

    expected = 'Munich and Berlin are nice cities .'
    assert (expected == sentence.to_tokenized_string())
def standoff_to_flair_sents(
        docs: List[Document],
        tokenizer: Tokenizer,
        verbose=False) -> Tuple[List[Sentence], List[ParsedDoc]]:
    """Convert standoff documents to flair sentences carrying 'ner' tags.

    The documents are first split by ``standoff_to_sents``; every resulting
    token becomes a flair ``Token`` with its ``label`` stored as 'ner' tag.

    Returns:
        The flair sentences and the parsed documents returned by
        ``standoff_to_sents``.
    """
    sents, parsed_docs = standoff_to_sents(docs=docs,
                                           tokenizer=tokenizer,
                                           verbose=verbose)

    def to_flair(sent):
        converted = Sentence()
        for source_token in sent:
            flair_token = Token(source_token.text)
            flair_token.add_tag(tag_type='ner', tag_value=source_token.label)
            converted.add_token(flair_token)
        return converted

    flair_sents = [to_flair(sent) for sent in sents]
    return flair_sents, parsed_docs
def get_flair_predictions(self, model_type, input_conllu, with_score=False):
    """Tag the tokens of a CoNLL-U string with one of the flair models.

    Args:
        model_type: "onto" selects ``self.flair_onto``, "ner" selects
            ``self.flair_ner``; anything else selects ``self.flair_gum``.
        input_conllu: CoNLL-U text; sentences are separated by blank lines.
            Only tab-separated lines are used, and lines whose first column
            contains "." or "-" are skipped. The second column is the token
            text.
        with_score: also collect the score of each predicted label.

    Returns:
        ``(output, scores)`` when *with_score* is true, otherwise
        ``[output]``, where ``output`` is the flat list of predicted values
        over all sentences.
    """
    if model_type == "onto":
        model = self.flair_onto
    elif model_type == "ner":
        model = self.flair_ner
    else:
        model = self.flair_gum

    sentences = []
    for sent in input_conllu.strip().split("\n\n"):
        rows = [l.split("\t") for l in sent.split("\n") if "\t" in l]
        words = [
            row[1] for row in rows if "." not in row[0] and "-" not in row[0]
        ]
        sentence = Sentence()
        for word in words:
            sentence.add_token(word)
        sentences.append(sentence)

    output = []
    scores = []
    preds = model.predict(sentences, all_tag_prob=with_score)
    if preds is None:
        # Newer versions of flair have void predict method, use modified Sentence list
        preds = sentences

    # The installed version cannot change while looping; check it once.
    legacy_api = str(flair.__version__).startswith("0.4")
    for sentence in preds:
        for token in sentence:
            # Take value and score from the same label object so that the
            # legacy branch yields scores as well.
            label = token.tags['pos'] if legacy_api else token.labels[0]
            output.append(label.value)
            if with_score:
                scores.append(label.score)

    if with_score:
        return (output, scores)
    else:
        return [output]
def _get_rnn_output(self, tokens: List[List[str]], mask: Tensor = None) -> Tensor:
    """Embed the token lists with flair and run them through ``self.rnn``.

    Non-breaking spaces in the token texts are replaced by normal spaces.
    Shorter sentences are padded with zeros to the longest sentence in the
    batch, and the token embeddings are cleared once they have been
    collected.

    Returns:
        The RNN output after ``self.dropout_out`` has been applied on the
        transposed (dims 1 and 2 swapped) tensor and swapped back.
    """
    sentences = []
    for words in tokens:
        flair_sentence = Sentence()
        for word in words:
            flair_sentence.add_token(Token(word.replace('\xa0', ' ')))
        sentences.append(flair_sentence)

    self.embeddings.embed(sentences)

    longest = max([len(s.tokens) for s in sentences])
    emb_len = self.embeddings.embedding_length

    # One shared zero buffer; padding for each sentence is a slice of it.
    zeros = torch.zeros(emb_len * longest,
                        dtype=torch.float,
                        device=flair.device)

    collected = list()
    for flair_sentence in sentences:
        for tok in flair_sentence.tokens:
            for emb in tok.get_each_embedding():
                collected.append(emb)

        missing = longest - len(flair_sentence)
        if missing > 0:
            collected.append(zeros[:self.embeddings.embedding_length * missing])

        for tok in flair_sentence.tokens:
            tok.clear_embeddings()

    # [batch, length, word_dim]
    target_shape = (len(sentences), longest,
                    self.embeddings.embedding_length)
    rnn_input = torch.cat(collected).view(target_shape)

    if self.device != flair.device:
        if self.device != torch.device('cpu'):
            rnn_input = rnn_input.cuda(self.device)
        else:
            rnn_input = rnn_input.cpu()

    # output from rnn [batch, length, hidden_size]
    output, _ = self.rnn(rnn_input, mask)

    # apply dropout for the output of rnn
    # [batch, length, hidden_size] --> [batch, hidden_size, length] --> [batch, length, hidden_size]
    dropped = self.dropout_out(output.transpose(1, 2))
    return dropped.transpose(1, 2)
def get_flair_predictions(sentences):
    """Predict 'ner' tags for tokenised sentences with the global ``flair`` tagger.

    Args:
        sentences: iterable of token-text sequences.

    Returns:
        One list of 'ner' tag values per input sentence.
    """
    flair_sentences = []
    for words in sentences:
        current = Sentence()
        for word in words:
            current.add_token(Token(word))
        flair_sentences.append(current)

    flair.predict(flair_sentences)

    return [[tok.tags['ner'].value for tok in tagged]
            for tagged in flair_sentences]
def test_sentence_infer_tokenization():
    """infer_space_after() gives the same strings for both construction paths."""
    tokenized = 'xyz " abc "'
    plain = 'xyz "abc"'

    # Built token by token.
    built = Sentence()
    for text in ('xyz', '"', 'abc', '"'):
        built.add_token(Token(text))
    built.infer_space_after()
    assert (tokenized == built.to_tokenized_string())
    assert (plain == built.to_plain_string())

    # Built from the whitespace-separated string.
    parsed = Sentence('xyz " abc "')
    parsed.infer_space_after()
    assert (tokenized == parsed.to_tokenized_string())
    assert (plain == parsed.to_plain_string())
def benchmark_flair_mdl():
    """Benchmark the flair NER model on the module-level benchmark data.

    Reads the globals ``sentences_tokens``, ``num_sentences``, ``num_tokens``
    and ``sentences_entities``; prints speed figures (model loading is not
    timed) and an F1 report.
    """
    tagger = load_flair_ner_model()

    start = time.time()

    flair_sentences = []
    for token_texts in sentences_tokens:
        current = Sentence()
        for text in token_texts:
            current.add_token(Token(text))
        flair_sentences.append(current)

    tagger.predict(flair_sentences, verbose=True)

    predictions = []
    for tagged in flair_sentences:
        predictions.append([tok.get_tag('ner').value for tok in tagged])

    print('Flair:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    report = f1_report(sentences_entities, remove_miscs(predictions), bio=True)
    print(report)
def get_sentences(text, lang, use_ontonotes, fast, use_embeddings,
                  char_embeddings, bpe_size, expressions, pos,
                  sentiment) -> List[Sentence]:
    """Process text using Flair and return the output from Flair.

    The text is split by ``segment``; each segment becomes a flair
    ``Sentence`` whose tokens keep the segmenter's offset and spacing. Every
    model returned by ``get_models`` is then run on the sentences, and
    embeddings are added when any of *use_embeddings*, *char_embeddings* or a
    positive *bpe_size* is given.

    Raises:
        TypeError: if *lang* is not one of 'en', 'multi', 'de', 'nl', 'fr'.
    """
    supported = ('en', 'multi', 'de', 'nl', 'fr')
    if lang not in supported:
        raise TypeError(
            f'{lang} is not supported! Try multi. See '
            'https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_2_TAGGING.md'
        )

    # tokenize sentences
    sentences = []
    for segment_tokens in segment(text):
        flair_sentence = Sentence()
        sentences.append(flair_sentence)
        for piece in segment_tokens:
            flair_token = Token(piece.value,
                                start_position=piece.offset,
                                whitespace_after=piece.space_after)
            flair_sentence.add_token(flair_token)

    # run models
    models = get_models(lang=lang,
                        use_ontonotes=use_ontonotes,
                        fast=fast,
                        expressions=expressions,
                        pos=pos,
                        sentiment=sentiment)
    for model in models:
        model.predict(sentences)

    # load embedding models
    if use_embeddings or char_embeddings or bpe_size > 0:
        names = [name.strip() for name in use_embeddings.split(',')]
        embedder = get_embeddings(names, char_embeddings, lang, bpe_size)
        embedder.embed(sentences)

    return sentences
def predictSentence():
    """Tag the word list of the current request and return the tags as JSON.

    The request's JSON payload is itself decoded with ``json.loads`` and is
    iterated as a sequence of words. The response is a JSON list with one
    'ner' tag value per word.
    """
    payload = request.get_json()

    sentence = Sentence()
    for word in json.loads(payload):
        sentence.add_token(Token(word))

    tagger.predict(sentence)

    ner_values = [tok.get_tag('ner').value for tok in sentence.tokens]
    return json.dumps(ner_values)
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):
    """Tag a CoNLL file with NER and add entity links from ``ag``.

    The input is read as documents separated by lines containing
    "DOCSTART". Each other line is one token: column 0 is the text, columns
    1-3 (when present) are stored as the tags 'nero', 'nme' and 'wiki'.

    For each document the NER tagger is run, the text is rebuilt with every
    predicted entity wrapped in ``<entity>...</entity>`` and passed to
    ``ag.disambiguate``. Results whose "namedEntity" equals a span text are
    stored as tag 'pnme' on the span's tokens. Spans still without 'pnme'
    can optionally get a "..._(disambiguation)" tag from the searcher
    returned by ``load_disambiguation``.

    Args:
        input_file_name: path of the CoNLL file to read.
        output_file_name: path of the file to write (text, nero, nme, wiki,
            pnme separated by tabs).
        ner_model: model name or path given to ``SequenceTagger.load``.
        with_disambiguation: enable the searcher fallback.
        sim_level_disambig: passed as second argument to ``searcher.search``.
    """
    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0
        for line in input_file:
            if "DOCSTART" in line:
                if doc is not None:
                    docs.append(doc)
                doc = Sentence()
                spos = 0
                continue

            lsplit = line.split("\t")
            token = Token(lsplit[0].strip())
            for c, tag_type in columns.items():
                if c != 0 and c < len(lsplit):
                    token.add_tag(tag_type, lsplit[c].strip())
            token.start_pos = spos
            token.end_pos = spos + len(token.text)
            spos = token.end_pos + 1
            doc.add_token(token)

        # The final document has no DOCSTART line after it, so it must be
        # flushed here; otherwise it would never be processed or written.
        if doc is not None and len(doc.tokens) > 0:
            docs.append(doc)

        # Loaded on first use and then shared by all documents.
        # NOTE(review): assumes the searcher can be reused across documents.
        searcher = None

        for d in docs:
            nertagger.predict(d)

            centity = []
            newsent = []
            for token in d:
                nertag = token.get_tag("ner").value
                prefix = nertag[0:2]
                if prefix in ('B-', 'S-'):
                    # A new entity starts: close the one collected so far.
                    if centity:
                        newsent.append("<entity>" + " ".join(centity) +
                                       "</entity>")
                        centity = []
                    centity.append(token.text)
                if prefix in ('E-', 'I-'):
                    centity.append(token.text)
                if nertag == "O":
                    if centity:
                        newsent.append("<entity>" + " ".join(centity) +
                                       "</entity>")
                        centity = []
                    newsent.append(token.text)
            # An entity that ends the document is still pending here.
            if centity:
                newsent.append("<entity>" + " ".join(centity) + "</entity>")

            sent_for_ag = " ".join(newsent)
            agres = ag.disambiguate(sent_for_ag)
            for entity in d.get_spans('ner'):
                for r in agres:
                    if r["namedEntity"] == entity.text:
                        for t in entity.tokens:
                            t.add_tag("pnme", r["disambiguatedURL"])
                        break

            if with_disambiguation:
                if searcher is None:
                    searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                fields = (
                    t.text,
                    t.get_tag("nero").value,
                    t.get_tag("nme").value,
                    unidecode.unidecode(t.get_tag("wiki").value),
                    t.get_tag("pnme").value,
                )
                output_file.write("\t".join(fields) + "\n")
def train(self, intent_fst) -> None:
    """Train flair intent classification and per-intent NER models.

    Training sentences are either sampled from the per-intent FST files
    (profile setting "intent.flair.do_sampling", the default) or generated
    exhaustively from *intent_fst*. Sentences are repeated so that intents
    are balanced. An intent classifier is trained only when there is more
    than one intent; one sequence tagger is trained for every intent that
    has sentences with entities. The model data directory is deleted and
    recreated on every call.
    """
    from flair.data import Sentence, Token
    from flair.models import SequenceTagger, TextClassifier
    from flair.embeddings import (
        FlairEmbeddings,
        StackedEmbeddings,
        DocumentRNNEmbeddings,
    )
    from flair.data import TaggedCorpus
    from flair.trainers import ModelTrainer

    profile = self.profile

    # Directory to look for downloaded embeddings
    cache_dir = profile.read_path(
        profile.get("intent.flair.cache_dir", "flair/cache"))
    os.makedirs(cache_dir, exist_ok=True)

    # Directory to store generated models (always start from scratch)
    data_dir = profile.write_path(
        profile.get("intent.flair.data_dir", "flair/data"))
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)

    self.embeddings = profile.get("intent.flair.embeddings", [])
    assert len(self.embeddings) > 0, "No word embeddings"

    # Create directories to write training data to
    class_data_dir = os.path.join(data_dir, "classification")
    ner_data_dir = os.path.join(data_dir, "ner")
    for directory in (class_data_dir, ner_data_dir):
        os.makedirs(directory, exist_ok=True)

    # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
    sentences_by_intent: Dict[str, Any] = {}

    # Get sentences for training
    start_time = time.time()
    if profile.get("intent.flair.do_sampling", True):
        # Sample from each intent FST
        num_samples = int(profile.get("intent.flair.num_samples", 10000))
        intent_map_path = profile.read_path(
            profile.get("training.intent.intent_map", "intent_map.json"))
        with open(intent_map_path, "r") as intent_map_file:
            intent_map = json.load(intent_map_file)

        # Gather FSTs for all known intents
        fsts_dir = profile.write_dir(profile.get("speech_to_text.fsts_dir"))
        intent_fst_paths = {}
        for intent_id in intent_map:
            intent_fst_paths[intent_id] = os.path.join(
                fsts_dir, f"{intent_id}.fst")

        # Generate samples
        self._logger.debug(
            f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
        )
        sentences_by_intent = sample_sentences_by_intent(
            intent_fst_paths, num_samples)
    else:
        # Exhaustively generate all sentences
        self._logger.debug(
            "Generating all possible sentences (may take a long time)")
        sentences_by_intent = make_sentences_by_intent(intent_fst)

    sentence_time = time.time() - start_time
    self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

    # Get least common multiple in order to balance sentences by intent
    lcm_sentences = lcm(
        *(len(sents) for sents in sentences_by_intent.values()))

    def bio_tags(intent_sent):
        """Pair each token with its O / B-<entity> / I-<entity> tag."""
        starts = {ev["start"]: ev for ev in intent_sent["entities"]}
        ends = {ev["end"]: ev for ev in intent_sent["entities"]}
        # Character offset of the current token; tokens are taken to be
        # separated by one character.
        char_pos = 0
        current = None
        pairs = []
        for word in intent_sent["tokens"]:
            label = f"I-{current}" if current else "O"
            if char_pos in starts:
                current = starts[char_pos]["entity"]
                label = f"B-{current}"
            pairs.append((word, label))
            char_pos += len(word) + 1
            if (char_pos - 1) in ends:
                current = None
        return pairs

    # Generate examples
    class_sentences = []
    ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
    for intent_name, intent_sents in sentences_by_intent.items():
        num_repeats = max(1, lcm_sentences // len(intent_sents))
        for intent_sent in intent_sents:
            # Only train an intent classifier if there's more than one intent
            if len(sentences_by_intent) > 1:
                # Add balanced copies
                for _ in range(num_repeats):
                    class_sent = Sentence(labels=[intent_name])
                    for word in intent_sent["tokens"]:
                        class_sent.add_token(Token(word))
                    class_sentences.append(class_sent)

            if len(intent_sent["entities"]) == 0:
                continue  # no entities, no sequence tagger

            # Named entity recognition (NER) example
            word_tags = bio_tags(intent_sent)

            # Add balanced copies
            for _ in range(num_repeats):
                ner_sent = Sentence()
                for word, tag in word_tags:
                    ner_token = Token(word)
                    ner_token.add_tag("ner", tag)
                    ner_sent.add_token(ner_token)
                ner_sentences[intent_name].append(ner_sent)

    # Start training
    max_epochs = int(profile.get("intent.flair.max_epochs", 100))

    # Load word embeddings
    self._logger.debug(f"Loading word embeddings from {cache_dir}")
    word_embeddings = []
    for name in self.embeddings:
        word_embeddings.append(
            FlairEmbeddings(os.path.join(cache_dir, "embeddings", name)))

    if class_sentences:
        self._logger.debug("Training intent classifier")

        # Random 80/10/10 split
        class_train, class_dev, class_test = self._split_data(
            class_sentences)
        class_corpus = TaggedCorpus(class_train, class_dev, class_test)

        # Intent classification
        doc_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

        classifier = TextClassifier(
            doc_embeddings,
            label_dictionary=class_corpus.make_label_dictionary(),
            multi_label=False,
        )

        self._logger.debug(
            f"Intent classifier has {len(class_sentences)} example(s)")
        ModelTrainer(classifier, class_corpus).train(
            class_data_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping intent classifier training")

    if ner_sentences:
        self._logger.debug(
            f"Training {len(ner_sentences)} NER sequence tagger(s)")

        # Named entity recognition
        stacked_embeddings = StackedEmbeddings(word_embeddings)

        for intent_name, intent_ner_sents in ner_sentences.items():
            ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
            ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

            tagger = SequenceTagger(
                hidden_size=256,
                embeddings=stacked_embeddings,
                tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                tag_type="ner",
                use_crf=True,
            )

            ner_intent_dir = os.path.join(ner_data_dir, intent_name)
            os.makedirs(ner_intent_dir, exist_ok=True)

            self._logger.debug(
                f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
            )
            ModelTrainer(tagger, ner_corpus).train(
                ner_intent_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping NER sequence tagger training")
def form_sentence(tokens):
    """Return a flair ``Sentence`` holding one ``Token`` per item of *tokens*."""
    sentence = Sentence()
    for text in tokens:
        flair_token = Token(text)
        sentence.add_token(flair_token)
    return sentence
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):
    """Tag a CoNLL file with NER and add links from the entity-linking server.

    The input is read as documents separated by lines containing
    "DOCSTART". Each other line is one token: column 0 is the text, columns
    1-3 (when present) are stored as the tags 'nero', 'nme' and 'wiki'.
    Token offsets are assigned as if the tokens were joined by single spaces.

    For each document the NER tagger is run and the text plus the predicted
    spans (start, length) are posted to ``NEURAL_EL_SERVER``. For every span,
    the first returned item whose element 0 equals the span start provides
    the 'pnme' tag (element 2) for the span's tokens. Spans still without
    'pnme' can optionally get a "..._(disambiguation)" tag from the searcher
    returned by ``load_disambiguation``.

    Args:
        input_file_name: path of the CoNLL file to read.
        output_file_name: path of the file to write (text, nero, nme, wiki,
            pnme separated by tabs).
        ner_model: model name or path given to ``SequenceTagger.load``.
        with_disambiguation: enable the searcher fallback.
        sim_level_disambig: passed as second argument to ``searcher.search``.
    """
    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name, "r") as input_file, \
            open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0
        for line in input_file:
            if "DOCSTART" in line:
                if doc is not None:
                    docs.append(doc)
                doc = Sentence()
                spos = 0
                continue

            lsplit = line.split("\t")
            token = Token(lsplit[0].strip())
            for c, tag_type in columns.items():
                if c != 0 and c < len(lsplit):
                    token.add_tag(tag_type, lsplit[c].strip())
            token.start_pos = spos
            token.end_pos = spos + len(token.text)
            spos = token.end_pos + 1
            doc.add_token(token)

        # The final document has no DOCSTART line after it, so it must be
        # flushed here; otherwise it would never be processed or written.
        if doc is not None and len(doc.tokens) > 0:
            docs.append(doc)

        # Loaded on first use and then shared by all documents.
        # NOTE(review): assumes the searcher can be reused across documents.
        searcher = None

        for d in docs:
            nertagger.predict(d)

            spans = []
            for nerspan in d.get_spans('ner'):
                spans.append({
                    "start": nerspan.start_pos,
                    "length": nerspan.end_pos - nerspan.start_pos,
                })

            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": spans
            }
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()

            for nerspan in d.get_spans('ner'):
                for i in info:
                    if i[0] == nerspan.start_pos:
                        for t in nerspan.tokens:
                            t.add_tag("pnme", i[2])
                        break

            if with_disambiguation:
                if searcher is None:
                    searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                fields = (
                    t.text,
                    t.get_tag("nero").value,
                    t.get_tag("nme").value,
                    unidecode.unidecode(t.get_tag("wiki").value),
                    t.get_tag("pnme").value,
                )
                output_file.write("\t".join(fields) + "\n")
def test_check_input(self):
    """ Test for check_input function """
    phone_sigs = [
        'cell', 'Cell', 'phone', 'Phone', 'Phone/fax', 'phone/fax',
        'Phone/Fax'
    ]
    fax_sigs = ['Fax', 'fax']

    def ner_value(sentence, position):
        """NER tag value of the token at *position*."""
        return sentence[position].get_tag('ner').value

    # Check for email address
    tag = 'S-email_id'
    sentence = Sentence()
    greeting = Token('hello')
    greeting.add_tag('ner', tag)
    sentence.add_token(greeting)
    app.check_input(sentence)
    self.assertNotEqual(ner_value(sentence, 0), tag)

    sentence.add_token(Token('*****@*****.**'))
    app.check_input(sentence)
    self.assertEqual(ner_value(sentence, 1), tag)

    sentence.add_token(Token('*****@*****.**'))
    app.check_input(sentence)
    self.assertNotEqual(ner_value(sentence, 2), tag)

    # Check for phone number, then for fax number: the tag is put on the
    # cue word and is expected on the number after check_input.
    for tag, cue_words in (('S-phone', phone_sigs), ('S-fax', fax_sigs)):
        for sig in cue_words:
            sentence = Sentence()
            cue = Token(sig)
            cue.add_tag('ner', tag)
            sentence.add_token(cue)
            sentence.add_token(Token('123-456-7890'))
            app.check_input(sentence)
            self.assertNotEqual(ner_value(sentence, 0), tag)
            self.assertEqual(ner_value(sentence, 1), tag)

    # Check for zipcode: digit strings of length 1..10, only length 5 is
    # expected to be tagged.
    tag = 'S-zipcode'
    num = ''
    for i in range(10):
        num += str(i)
        sentence = Sentence()
        sentence.add_token(Token(num))
        app.check_input(sentence)
        return_val = ner_value(sentence, 0)
        if len(num) == 5:
            self.assertEqual(return_val, tag)
        else:
            self.assertNotEqual(return_val, tag)
def __call__(self, doc):
    """Add entities predicted by the remote tagging service to *doc*.

    For every non-empty sentence of ``doc.sents`` the token texts are posted
    to ``self.req_address``; the response is read as one tag per token. Each
    tag other than 'O' yields a one-token span, and the spans are combined
    by ``self.merge_iob_spans`` and appended to ``doc.ents``.

    Returns:
        The same *doc*, with ``doc.ents`` extended.
    """
    # TODO: use a sentencizer or not?
    # TODO: process all sentences in one batch on GPU
    for doc_sentence in doc.sents:
        if not doc_sentence:
            continue

        words = [token.text for token in doc_sentence]
        # json.dumps output passed via `json=` makes the request body a
        # JSON-encoded string; kept unchanged because the receiving service
        # is not visible from here.
        response = requests.post(self.req_address, json=json.dumps(words))
        tags_res = response.json()

        spans = []
        tags = []
        for doc_token, tag in zip(doc_sentence, tags_res):
            if tag == 'O':
                continue
            # Split on the first hyphen only, so that labels which contain
            # a hyphen themselves do not break the unpacking.
            _, label = tag.split('-', 1)
            start = doc_token.i
            spans.append(
                Span(doc, start, start + 1,
                     label=self.nlp.vocab.strings[label]))
            tags.append(tag)

        doc.ents = list(doc.ents) + self.merge_iob_spans(doc, spans, tags)
    return doc
def create_sentlist_from_file_batchmax(self,
                                       data,
                                       maxlen=64,
                                       compare_column="cat"):
    """
    Build chunked flair Sentence objects from a token table.

    Takes a pandas dataframe with columns 'tok' and 'sentstart' and creates
    a list of flair Sentence objects with a "cat" tag on every token. Each
    flair Sentence may contain several real sentences, but at most maxlen
    tokens. A chunk ends at a sentence boundary, so it is often shorter than
    maxlen; sentences longer than maxlen are split. A row with token value
    "EOF" always closes the current chunk, so no file boundaries are crossed.

    :param data: dataframe with the columns 'tok', 'sentstart' and, unless
        compare_column is "NaN", the column named by compare_column
    :param maxlen: maximum number of tokens per chunk
    :param compare_column: column providing the "cat" tag; with "NaN" every
        token is tagged "-"
    :return: list of flair Sentence objects
    """
    chunks = []

    def emit_chunk(tokens, cats):
        """Store the buffered tokens as one Sentence (skipped when empty)."""
        chunk = Sentence()
        # Start position of the next token inside this chunk.
        position = 0
        for text, label in zip(tokens, cats):
            flair_tok = Token(str(text), start_position=position)
            position += len(text) + 1
            flair_tok.add_tag("cat", str(label))
            chunk.add_token(flair_tok)
        if len(chunk.tokens) > 0:
            chunks.append(chunk)

    # Completed sentences waiting to be emitted as one chunk.
    toklist = []
    catlist = []
    # The sentence that is currently being read.
    curr_sentence_tok = []
    curr_sentence_cat = []

    for _, row in data.iterrows():
        tok = str(row["tok"])
        cat = str(row[compare_column]) if compare_column != "NaN" else "-"

        if tok == "EOF":
            # End of a sample file: the marker itself is not stored, and
            # everything buffered so far is emitted.
            toklist.extend(curr_sentence_tok)
            catlist.extend(curr_sentence_cat)
            self.logger.debug(
                "create chunk at EOF with (len: {}): {}".format(
                    len(toklist), toklist))
            self.logger.debug("catlist with (len: {}): {}".format(
                len(catlist), catlist))
            emit_chunk(toklist, catlist)
            toklist = []
            catlist = []
            curr_sentence_tok = []
            curr_sentence_cat = []
            continue

        if row["sentstart"] == "yes":
            # A new sentence starts: the previous one is complete.
            toklist.extend(curr_sentence_tok)
            catlist.extend(curr_sentence_cat)
            curr_sentence_tok = [tok]
            curr_sentence_cat = [cat]
        else:
            curr_sentence_tok.append(tok)
            curr_sentence_cat.append(cat)

        if len(toklist) + len(curr_sentence_tok) > maxlen:
            if len(toklist) == 0:
                # A single sentence exceeds maxlen: emit all but its last
                # token and keep that token for the next chunk.
                toklist.extend(curr_sentence_tok[0:-1])
                catlist.extend(curr_sentence_cat[0:-1])
                curr_sentence_tok = [curr_sentence_tok[-1]]
                curr_sentence_cat = [curr_sentence_cat[-1]]
                self.logger.debug("Sentence is split (len: {}): {}".format(
                    len(toklist), toklist))
            self.logger.debug("create chunk with (len: {}): {}".format(
                len(toklist), toklist))
            self.logger.debug("catlist with (len: {}): {}".format(
                len(catlist), catlist))
            emit_chunk(toklist, catlist)
            toklist = []
            catlist = []
            self.logger.debug("toklist: {}, curr_sent_tok: {}".format(
                len(toklist), len(curr_sentence_tok)))

    # if the loop is complete, empty the buffers and add them to the list
    if len(curr_sentence_tok) > 0:
        toklist.extend(curr_sentence_tok)
        catlist.extend(curr_sentence_cat)
        emit_chunk(toklist, catlist)

    return chunks