def samples_generator_sorted(path, max_text_legth=10000):
    """Yield flair ``Sentence`` objects for the rows of a CSV file, longest text first.

    Each row is read as ``(id, text_id, sequence, text, ...)``. Rows are sorted
    by ``len(text)`` in descending order. A text longer than
    ``max_text_legth`` characters is cut into fragments with
    ``split_long_text`` and one sentence is yielded per fragment; every
    fragment carries the metadata of its row.

    :param path: path of the CSV file to read
    :param max_text_legth: maximum number of characters per yielded sentence
    :return: generator of ``Sentence`` objects with the extra attributes
        ``id``, ``text_id``, ``sequence``, ``ner`` (empty list) and ``length``
    """
    with open(path, newline='') as f:
        rows = list(csv.reader(f))

    # An empty file used to crash on ``rows[0]``; there is simply nothing to yield.
    if not rows:
        return

    rows.sort(key=lambda row: len(row[3]), reverse=True)
    print('Longest text', len(rows[0][3]))

    for row in rows:
        sample_id = row[0]
        print(sample_id)
        text_id = row[1]
        sequence = row[2]
        text = row[3]

        # Short texts are handled as a single fragment so that the sentence
        # construction below exists only once.
        if len(text) > max_text_legth:
            fragments = split_long_text(text, max_text_legth)
        else:
            fragments = [text]

        for fragment in fragments:
            s = Sentence(fragment, use_tokenizer='toki')
            s.id = sample_id
            s.text_id = text_id
            s.sequence = sequence
            s.ner = []
            s.length = len(fragment)
            yield s
from flair.data import Sentence
from flair.models import SequenceTagger

# load the model you trained
# Raw string: the Windows path contains backslashes that are not valid escape
# sequences (\P, \S, \m); the resulting path is unchanged.
model = SequenceTagger.load(
    r'C:\Projects\SAKI_NLP\models/flair_best-model_33.pt')

# Sample resume text used to smoke-test the tagger.
sent = "Afreen Jamadar\nActive member of IIIT Committee in Third year\n\nSangli, Maharashtra - Email me on Indeed: indeed.com/r/Afreen-Jamadar/8baf379b705e37c6\n\nI wish to use my knowledge, skills and conceptual understanding to create excellent team\nenvironments and work consistently achieving organization objectives believes in taking initiative\nand work to excellence in my work.\n\nWORK EXPERIENCE\n\nActive member of IIIT Committee in Third year\n\nCisco Networking - Kanpur, Uttar Pradesh\n\norganized by Techkriti IIT Kanpur and Azure Skynet.\nPERSONALLITY TRAITS:\n\u2022 Quick learning ability\n\u2022 hard working\n\nEDUCATION\n\nPG-DAC\n\nCDAC ACTS\n\n2017\n\nBachelor of Engg in Information Technology\n\nShivaji University Kolhapur - Kolhapur, Maharashtra\n\n2016\n\nSKILLS\n\nDatabase (Less than 1 year), HTML (Less than 1 year), Linux. (Less than 1 year), MICROSOFT\nACCESS (Less than 1 year), MICROSOFT WINDOWS (Less than 1 year)\n\nADDITIONAL INFORMATION\n\nTECHNICAL SKILLS:\n\n\u2022 Programming Languages: C, C++, Java, .net, php.\n\u2022 Web Designing: HTML, XML\n\u2022 Operating Systems: Windows [\u2026] Windows Server 2003, Linux.\n\u2022 Database: MS Access, MS SQL Server 2008, Oracle 10g, MySql.\n\n"

# create example sentence
sentence = Sentence(sent)

# predict tags and print
model.predict(sentence)
print(sentence.to_tagged_string())

from flair.datasets import WIKINER_ENGLISH

x = WIKINER_ENGLISH()
def similarities(self, mask_entity=False):
    """Link every mention in ``self.test`` to its most similar knowledge-base context.

    For each mention returned by ``self._vectorize`` the candidates from
    ``self._get_candidates`` are scored: every context in
    ``self.kb[candidate]["MENTIONS"]`` (extended by the candidate's
    ``"DESCRIPTION"`` when present) is embedded with the module-level
    ``EMBEDDING`` and compared to the mention vector by cosine similarity.
    The best scoring candidate is the prediction.

    Side effects: writes ``fps-tps.json``, ``scores.json`` and
    ``prediction.json`` into the current working directory.

    :param mask_entity: forwarded unchanged to ``self._vectorize``
    :return: dict with the keys ``accuracy``, ``precision``,
        ``num_candidates`` (mean candidates per mention, ``0.0`` when no
        mention was scored) and ``embedding``
    """
    tp = 0
    fp = 0
    tps = []
    fps = []
    prediction = []
    num_candidates = []

    for sentence in tqdm.tqdm(self.test):
        # Only sentences with at least one annotated token (third field is
        # not "-") are evaluated.
        if not any(token[2] != "-" for token in sentence):
            continue

        persons = self.get_persons(sentence)
        mention_vectors = list(
            self._vectorize(
                sentence,
                persons,
                return_id=True,
                return_type=True,
                return_str=True,
                mask_entity=mask_entity,
            )
        )

        for identifier, type_, mention, mention_vector in mention_vectors:
            pair_scores = {}
            max_score = 0.0
            best_candidate = None
            best_context = None
            best_sent = None

            candidates = self._get_candidates(mention, type_ == "ORG")
            num_candidates.append(len(candidates))

            for candidate in candidates:
                description = self.kb[candidate].get("DESCRIPTION")
                for context in self.kb[candidate]["MENTIONS"]:
                    # Tokenize the context once; its tokens come first in the
                    # text, so their positions are the indices to pool over.
                    context_tokens = list(utils.tokenize(context))
                    indices = list(range(len(context_tokens)))
                    tokens = list(context_tokens)
                    if description:
                        tokens.extend(utils.tokenize(description))
                    text = " ".join(tokens)

                    sentence_ = Sentence(text, use_tokenizer=False)
                    if isinstance(EMBEDDING, EntityEmbeddings):
                        EMBEDDING.embed(sentence_, [indices])
                        candidate_vector = (
                            sentence_.embedding.detach().numpy().reshape(1, -1)
                        )
                    else:
                        # Mean of the token embeddings of the context part.
                        EMBEDDING.embed(sentence_)
                        vector = sentence_[indices[0]].get_embedding().numpy()
                        for i in indices[1:]:
                            vector = vector + sentence_[i].get_embedding().numpy()
                        candidate_vector = (vector / len(indices)).reshape(1, -1)

                    score = cosine_similarity(mention_vector, candidate_vector)[0][0]
                    # NOTE(review): the separator between "vs. " and "gold:"
                    # was only visible as a line break in the reviewed source;
                    # confirm the exact character that is expected here.
                    pair_scores[
                        f"pred: {context} ({candidate}) vs. \ngold: {mention} ({identifier})"
                    ] = float(score)
                    if score > max_score:
                        max_score = score
                        best_candidate = candidate
                        best_context = context
                        best_sent = text

            prediction.append(
                {
                    "pred": best_candidate,
                    "gold": identifier,
                    "top3": [
                        {key: value}
                        for key, value in Counter(pair_scores).most_common(5)
                    ],
                }
            )

            def build_record():
                # ``best_sent`` is already a plain string; the previous code
                # iterated over it and emitted its characters separated by
                # spaces.
                return {
                    "true": mention,
                    "pred": best_context,
                    "true_id": identifier,
                    "pred_id": best_candidate,
                    "score": float(max_score),
                    "sentence": " ".join([token[0] for token in sentence]),
                    "context": best_sent,
                }

            if best_candidate == identifier:
                tp += 1
                tps.append(build_record())
            else:
                fp += 1
                # Misses without any scored context are counted but not logged.
                if best_sent:
                    fps.append(build_record())

    with open("fps-tps.json", "w", encoding="utf-8") as f:
        json.dump({"tps": tps, "fps": fps}, f, ensure_ascii=False, indent=4)

    scores = {
        "accuracy": self.accuracy(tp, fp),
        "precision": self.precision(tp, fp),
        # statistics.mean raises on an empty list (no mention was scored).
        "num_candidates": statistics.mean(num_candidates) if num_candidates else 0.0,
        "embedding": "language-models/presse/multi",
    }
    with open("scores.json", "w", encoding="utf-8") as f:
        json.dump(scores, f, indent=4, ensure_ascii=False)

    with open("prediction.json", "w", encoding="utf-8") as f:
        json.dump(prediction, f)

    return scores
def get_tag_sentence(model, query):
    """Build a flair ``Sentence`` from ``query``, run ``model.predict`` on it and return it.

    :param model: object exposing ``predict(sentence)``
    :param query: text passed to the ``Sentence`` constructor
    :return: the ``Sentence`` after prediction
    """
    tagged = Sentence(query)
    model.predict(tagged)
    return tagged
df["Sentence_num"] = new_sentence_nums fill_sentence_numbers(df) sentences = df.groupby('Sentence_num').apply(lambda row: " ".join(row["Word"])) from flair.models import SequenceTagger from flair.data import Sentence model = SequenceTagger.load('final-model.pt') tagged_sentences = [] # create example sentence for sentence_string in sentences: sentence = Sentence(text=sentence_string, use_tokenizer=False) # predict model.predict(sentence) tagged_sentences.append(sentence.to_tagged_string()) import re ner_regex = re.compile('^<[B|I]-.+>') # set all predictions to O df["Predicted"] = "O" row_index = 0 for tagged_sentence in tagged_sentences: sentence_tokens = tagged_sentence.split(" ") for token in sentence_tokens:
def __init__(self, path_to_conll_file: Union[str, Path], in_memory: bool = True):
    """
    Instantiates a column dataset in CoNLL-U format.

    The file is read once. With ``in_memory=True`` every parsed sentence is
    stored in ``self.sentences``; otherwise only the file offsets at which
    sentences start are stored in ``self.indices``.

    :param path_to_conll_file: Path to the CoNLL-U formatted file
    :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
    """
    if isinstance(path_to_conll_file, str):
        path_to_conll_file = Path(path_to_conll_file)
    assert path_to_conll_file.exists()

    self.in_memory = in_memory
    self.path_to_conll_file = path_to_conll_file
    self.total_sentence_count: int = 0

    if self.in_memory:
        self.sentences: List[Sentence] = []
    else:
        self.indices: List[int] = []

    with open(str(self.path_to_conll_file), encoding="utf-8") as file:
        line = file.readline()
        position = 0
        sentence: Sentence = Sentence()
        while line:
            line = line.strip()
            fields: List[str] = re.split("\t+", line)

            if line == "":
                # A blank line terminates the current sentence.
                if len(sentence) > 0:
                    self.total_sentence_count += 1
                    if self.in_memory:
                        self.sentences.append(sentence)
                    else:
                        self.indices.append(position)
                        position = file.tell()
                sentence = Sentence()

            elif line.startswith("#") or "." in fields[0] or "-" in fields[0]:
                # Comment lines and rows whose ID column contains "." or "-"
                # do not contribute a token.
                pass

            else:
                token = Token(fields[1], head_id=int(fields[6]))
                token.add_label("lemma", str(fields[2]))
                token.add_label("upos", str(fields[3]))
                token.add_label("pos", str(fields[4]))
                token.add_label("dependency", str(fields[7]))

                if len(fields) > 9 and 'SpaceAfter=No' in fields[9]:
                    token.whitespace_after = False

                for morph in str(fields[5]).split("|"):
                    if "=" not in morph:
                        continue
                    parts = morph.split("=")
                    token.add_label(parts[0].lower(), parts[1])

                # fields[11] is read, so at least 12 columns are required
                # (the previous ``> 10`` check allowed an IndexError).
                if len(fields) > 11 and str(fields[10]) == "Y":
                    token.add_label("frame", str(fields[11]))

                sentence.add_token(token)

            line = file.readline()

        # The file may end without a trailing blank line.
        if len(sentence.tokens) > 0:
            self.total_sentence_count += 1
            if self.in_memory:
                self.sentences.append(sentence)
            else:
                self.indices.append(position)
def ebm_nlp_processing(data, context_embedding, sentence, label_representations):
    """Collect one centroid vector per annotated outcome phrase, grouped by domain.

    ``data`` is iterated line by line. A line other than ``'\\n'`` is split on
    whitespace and its first two fields are appended to ``sentence`` as a
    ``(word, tag)`` pair. A ``'\\n'`` line closes the sentence: the words are
    embedded with ``context_embedding`` and, for every phrase that starts with
    a ``B-<domain>`` tag and continues with ``I-`` tags, the mean of the token
    embeddings is appended to ``label_representations[<domain>]``. Phrases of
    the domains "pain" and "mortality" (case-insensitive) are additionally
    collected under the key ``'PAIN_MORT'``.

    :param data: iterable of lines (``"word tag"`` or ``'\\n'``)
    :param context_embedding: object exposing ``embed(Sentence)``
    :param sentence: list used as the running ``(word, tag)`` buffer; it is
        mutated and cleared after every processed sentence
    :param label_representations: dict ``domain -> tensor`` that is updated in place
    :return: ``(label_representations, domain_label_count)`` where the second
        dict counts the phrases seen per domain during this call
    """
    domain_label_count = {}
    pain_mortality_domain = 'PAIN_MORT'

    for line in data:
        if line != '\n':
            parts = line.split()
            sentence.append((parts[0], parts[1]))
            continue

        # End of sentence: join the words, keep the tags aligned by position
        # and compute the contextual vectors.
        sent_unpacked = ' '.join(item[0] for item in sentence)
        tag_unpacked = [item[1] for item in sentence]
        sent = Sentence(sent_unpacked.strip())
        context_embedding.embed(sent)
        print('+++++++', len(sent), len(sentence))
        print(sent_unpacked)
        print(tag_unpacked)

        pos = 0
        while pos < len(sent):
            if not tag_unpacked[pos].startswith('B-'):
                pos += 1
                continue

            first = sent[pos].embedding
            phrase = first.reshape(1, len(first))
            out_domain = tag_unpacked[pos][2:].strip()

            # Extend the phrase over the directly following I- tokens.
            last = pos
            for j in range(pos + 1, len(sent)):
                if not tag_unpacked[j].startswith('I-'):
                    break
                inner = sent[j].embedding
                phrase = torch.cat((phrase, inner.reshape(1, len(inner))), dim=0)
                last = j

            # Centroid of the word vectors of the outcome phrase
            # (``phrase`` is always 2-D here).
            b_mean = torch.mean(phrase, 0)
            b_mean = b_mean.reshape(1, len(b_mean))

            if out_domain not in label_representations:
                label_representations[out_domain] = b_mean
                domain_label_count[out_domain] = 1
            else:
                label_representations[out_domain] = torch.cat(
                    (label_representations[out_domain], b_mean), dim=0)
                # The domain may have been supplied by the caller and not yet
                # counted in this call.
                domain_label_count[out_domain] = domain_label_count.get(out_domain, 0) + 1

            # Combined pain/mortality bucket. It must grow from its own
            # tensor; concatenating onto ``label_representations[out_domain]``
            # (as before) replaced the bucket with a copy of the single
            # domain plus a duplicated centroid.
            if out_domain.lower() in ['pain', 'mortality']:
                if pain_mortality_domain not in label_representations:
                    label_representations[pain_mortality_domain] = b_mean
                else:
                    label_representations[pain_mortality_domain] = torch.cat(
                        (label_representations[pain_mortality_domain], b_mean), dim=0)

            pos = last + 1

        sentence.clear()

    return label_representations, domain_label_count
def __create_models(self):
    """Build the text representations of the ensemble and train one model per representation.

    ``self.textModels`` maps a name to either a fitted ``TextModel`` (``mtc``)
    or a ``DocumentPoolEmbeddings`` stack. For every entry the training,
    validation and full data (``self.train``, ``self.validation``,
    ``self.data``) are vectorised. Two descriptors are created per entry: one
    with ``xt``/``xv`` (train/validation) and one whose ``xt`` holds the
    vectors of all data. Each descriptor is passed to ``self._train_model``;
    the results are stored in ``self.models`` and ``self.models_fit``.
    """
    # Parameters whose name starts with "_" are not forwarded to TextModel.
    _params = {k: v for k, v in self.params.items() if not k.startswith('_')}

    # Further variants (character-only, Flair forward/backward, BERT) were
    # disabled in the original code and are intentionally not built here.
    self.textModels = dict(
        mtc=TextModel(_params).fit(self.train),
        langEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang)]),
        charLangMultiEmb=DocumentPoolEmbeddings([
            CharacterEmbeddings(),
            BytePairEmbeddings(self.lang),
            BytePairEmbeddings('multi')
        ]),
        langMultiEmb=DocumentPoolEmbeddings(
            [BytePairEmbeddings(self.lang),
             BytePairEmbeddings('multi')]),
        bytePairEMB=DocumentPoolEmbeddings([BytePairEmbeddings('multi')]),
    )

    def embed_documents(embedder, texts):
        # One flair Sentence per document -> matrix of document embeddings.
        sentences = [Sentence(txt) for txt in texts]
        embedder.embed(sentences)
        return np.array([
            e.get_embedding().cpu().detach().numpy() for e in sentences
        ])

    models = []
    models_fit = []
    for km, tmodel in self.textModels.items():
        if km == 'mtc':
            xt = tmodel.transform(self.train)
            xv = tmodel.transform(self.validation)
            X = tmodel.transform(self.data)
        else:
            xt = embed_documents(tmodel, self.train)
            xv = embed_documents(tmodel, self.validation)
            X = embed_documents(tmodel, self.data)
        models.append({'name': km, 'xv': xv, 'xt': xt})
        models_fit.append({'name': km, 'xt': X})

    print('Fitting Ensemble')
    self.models, self.models_fit = [], []
    for md, mdf in zip(models, models_fit):
        self.models.append(self._train_model(md))
        # Train the "all data" model from its own descriptor; the previous
        # code passed ``md`` again, so ``models_fit`` duplicated ``models``.
        self.models_fit.append(self._train_model(mdf))
def edu_extract_2(model, d):
    """Extract education entries from text blocks using an NER model.

    The first element of ``d`` is dropped. The lines of the remaining blocks
    are normalised with ``combine_text_in_line`` / ``combine_text_in_block``
    and flattened into a list of spans. The entity type and colour of the
    first span that contains an entity mark the start of a new entry; every
    later span with the same leading entity type and colour opens the next
    entry. Entities are written into the current entry with
    ``complete_entiti`` and the remaining text goes to ``description``.

    :param model: tagger exposing ``predict(Sentence)``
    :param d: list of blocks, each with ``block['lines'][i]['spans']``
    :return: list of entry dicts (``start``, ``end``, ``school``, ``major``,
        ``description``); a single blank entry when no block is left
    """

    def blank_entry():
        return {
            'start': '',
            'end': '',
            'school': '',
            'major': '',
            'description': ''
        }

    def ner_result(text):
        # Tag one span text and return the NER dictionary of the sentence.
        tagged = Sentence(text)
        model.predict(tagged)
        return tagged.to_dict(tag_type='ner')

    print(d)
    entries = []
    exp = blank_entry()
    d = d[1:]

    for block in d:
        block_lines = block['lines']
        for index, raw_line in enumerate(block_lines):
            block_lines[index] = combine_text_in_line(raw_line)
        combine_text_in_block(block)

    if not d:
        return [exp]

    spans = [
        span
        for block in d
        for line in block['lines']
        for span in line['spans']
    ]

    # Style of the first span that contains an entity; size/flags only act as
    # "already found" sentinels.
    first_flags = 0
    first_size = 0
    first_type = ''
    first_color = ''
    for span in spans:
        found = ner_result(span['text'])['entities']
        if not found:
            continue
        if first_size != 0 or first_flags != 0:
            break
        first_size = span['size']
        first_flags = span['flags']
        first_type = found[0]['type']
        first_color = span['color']

    for span in spans:
        span_text = span['text']
        print(span_text)
        found = ner_result(span_text)['entities']
        if not found:
            exp['description'] += ' ' + span_text
            continue
        if found[0]['type'] == first_type and span['color'] == first_color:
            # Same leading entity type and colour as the first hit: a new
            # entry starts here.
            entries.append(exp)
            exp = blank_entry()
        for entiti in found:
            exp = complete_entiti(exp, entiti['type'], entiti['text'])
        exp['description'] += ' ' + span_text[found[-1]['end_pos']:]

    entries.append(exp)
    # The first collected element is whatever preceded the first entry start.
    return entries[1:]
# Run run_process_vec.py to add commas to feature vectors
trainingResultFile = fasttext(trainingFile)
trainFeatures = np.genfromtxt(trainingResultFile)
trainFeatures = trainFeatures.tolist()
trainLabels = np.genfromtxt(trainLabelsFile)

# Second copy of the feature rows; a signed sentiment score is appended to
# each row below.
trainFeatures_withsentiment = np.genfromtxt(trainingResultFile)
trainFeatures_withsentiment = trainFeatures_withsentiment.tolist()

# The context manager closes the tweet file (it was left open before).
with open(trainingFile, 'r') as train_file:
    trainTweetList = train_file.readlines()

for row_index, tweet in enumerate(trainTweetList):
    sentence = Sentence(tweet)
    classifier.predict(sentence)
    # Positive predictions keep their confidence, all others get it negated.
    if sentence.labels[0].value == 'POSITIVE':
        score = sentence.labels[0].score
    else:
        score = 0 - sentence.labels[0].score
    trainFeatures_withsentiment[row_index].append(score)

validationResultFile = fasttext(validationFile)
validationFeatures = np.genfromtxt(validationResultFile)
validationFeatures = validationFeatures.tolist()
validationLabels = np.genfromtxt(validationLabelsFile)
validationFeatures_withsentiment = np.genfromtxt(validationResultFile)
validationFeatures_withsentiment = validationFeatures_withsentiment.tolist()
# Collect the contextual vectors of every occurrence of ``word``.
sentences, labels = get_word_sentences(word, dataset)
km = KMeans(n_clusters=2, n_jobs=-1)
tok_vecs = []
word_obj_list = []
except_counter = 0
token_length_exceed_counter = 0
print("Getting embeddings..")
for sentence_ind, sent in enumerate(sentences):
    # Progress message every 1000 sentences.
    if sentence_ind % 1000 == 0:
        print(f"Finished sentences: {sentence_ind} out of {len(sentences)}")

    sentence = Sentence(sent)

    # Sentences with more than 200 tokens are not embedded.
    if len(sentence.tokens) > 200:
        token_length_exceed_counter += 1
        print(f"Token length exceeded for : {sentence_ind}"
              f" Token exceed counter: {token_length_exceed_counter}")
        continue

    try:
        embedding.embed(sentence)
    except Exception as e:
        # Embedding failures are counted and the sentence is skipped.
        except_counter += 1
        print("Exception Counter: ", except_counter, sentence_ind, e)
        continue

    for token_ind, token in enumerate(sentence):
        if token.text == word:
            vec = token.embedding.cpu().numpy()
def recognize(
    text: str,
    class_model: Optional[TextClassifier] = None,
    ner_models: Optional[Dict[str, SequenceTagger]] = None,
    intent_name: Optional[str] = None,
    intent_to_slots: Optional[Dict[str, Dict[str, fst.Fst]]] = None,
) -> Dict[str, Any]:
    """Recognize the intent and the named entities of ``text``.

    The intent comes from ``class_model`` when one is given. Otherwise
    ``intent_name`` (or, if that is empty, the first key of ``ner_models``)
    is assumed with confidence 1. Without a classifier and without NER
    models the empty intent is returned immediately. When a NER model exists
    for the intent, its entities are added and, if a slot FST is registered
    for the entity type, the entity text is transformed with it.

    :param text: sentence to recognize
    :param class_model: optional intent classifier
    :param ner_models: NER tagger per intent name (``None`` means none)
    :param intent_name: intent to assume when no classifier is given
    :param intent_to_slots: slot FSTs per intent name and slot name
        (``None`` means none)
    :return: intent dictionary based on ``empty_intent()``
    :raises AssertionError: if the classifier predicts no label
    """
    # ``None`` replaces the former mutable ``{}`` defaults.
    ner_models = {} if ner_models is None else ner_models
    intent_to_slots = {} if intent_to_slots is None else intent_to_slots

    intent = empty_intent()
    intent["text"] = text

    start_time = time.time()
    sentence = Sentence(text)

    if class_model is not None:
        class_model.predict(sentence)
        assert len(sentence.labels) > 0, "No intent predicted"

        label = sentence.labels[0]
        intent_id = label.value
        intent["intent"]["confidence"] = label.score
    elif len(ner_models) > 0:
        # Assume first intent
        intent_id = intent_name or next(iter(ner_models.keys()))
        intent["intent"]["confidence"] = 1
    else:
        return intent  # empty

    intent["intent"]["name"] = intent_id

    if intent_id in ner_models:
        slot_fsts = intent_to_slots.get(intent_id, {})

        # Predict entities
        ner_models[intent_id].predict(sentence)
        ner_dict = sentence.to_dict(tag_type="ner")
        for named_entity in ner_dict["entities"]:
            slot_name = named_entity["type"]
            slot_value = named_entity["text"]

            # Check for FST to transform
            slot_fst = slot_fsts.get(slot_name)
            if slot_fst is not None:
                try:
                    # Transform with FST
                    logger.debug(
                        f'Transforming "{slot_value}" for slot "{slot_name}" with FST'
                    )
                    slot_value = fstaccept(slot_fst, slot_value)[0]["text"]
                except Exception:
                    # Best effort: keep the untransformed value, but let
                    # KeyboardInterrupt/SystemExit propagate.
                    logger.exception(slot_name)

            intent["entities"].append({
                "entity": slot_name,
                "value": slot_value,
                "raw_value": named_entity["text"],
                "start": named_entity["start_pos"],
                "end": named_entity["end_pos"],
                "confidence": named_entity["confidence"],
            })

    # Add slots (a later entity of the same type overrides an earlier one)
    intent["slots"] = {}
    for ev in intent["entities"]:
        intent["slots"][ev["entity"]] = ev["value"]

    # Record recognition time
    intent["recognize_seconds"] = time.time() - start_time

    return intent
def token_list_to_sentence(self, token_list: conllu.TokenList) -> Sentence:
    """Convert a ``conllu.TokenList`` into a flair ``Sentence``.

    Every CoNLL-U token becomes a ``Token`` labelled with the fields listed
    in ``self.token_annotation_fields``. The metadata entries
    ``sentence_id`` and ``relations`` are turned into sentence-level labels,
    and every span of a token label type starting with ``"ner"`` is added as
    an ``"entity"`` label.

    :param token_list: parsed CoNLL-U sentence
    :return: the assembled ``Sentence``
    """
    result: Sentence = Sentence()

    # Tokens and their annotations.
    for raw_token in token_list:
        new_token = Token(raw_token["form"])

        for field_name in self.token_annotation_fields:
            raw_value: Any = raw_token[field_name]
            if not isinstance(raw_value, dict):
                new_token.add_label(typename=field_name, value=str(raw_value))
                continue
            # Key-value fields: the key is the label type, the value the
            # label value.
            for sub_key, sub_value in raw_value.items():
                new_token.add_label(typename=sub_key, value=str(sub_value))

        misc = raw_token.get("misc")
        if misc is not None and misc.get("SpaceAfter") == "No":
            new_token.whitespace_after = False

        result.add_token(new_token)

    metadata = token_list.metadata

    if "sentence_id" in metadata:
        result.add_label("sentence_id", metadata["sentence_id"])

    if "relations" in metadata:
        for relation in metadata["relations"]:
            head_start, head_end, tail_start, tail_end, label = relation
            # Span indices are 1-indexed and the end index is inclusive.
            head_span = Span(result.tokens[head_start - 1:head_end])
            tail_span = Span(result.tokens[tail_start - 1:tail_end])
            relation_label = RelationLabel(value=label, head=head_span, tail=tail_span)
            result.add_complex_label("relation", relation_label)

    # Every label type starting with "ner", in order of first appearance.
    ner_types = []
    for tok in result.tokens:
        for layer in tok.annotation_layers.keys():
            if layer.startswith("ner") and layer not in ner_types:
                ner_types.append(layer)

    # Each NER span becomes a sentence-level "entity" label.
    for ner_type in ner_types:
        for ner_span in result.get_spans(ner_type):
            entity = SpanLabel(span=ner_span, value=ner_span.tag, score=ner_span.score)
            result.add_complex_label("entity", label=entity)

    return result
# share parameters if len(otaggers) > 1: first_tagger = otaggers[0] tagger.embedding2nn = first_tagger.embedding2nn tagger.rnn = first_tagger.rnn if tagger.train_initial_hidden_state: tagger.hs_initializer = first_tagger.hs_initializer tagger.lstm_init_h = first_tagger.lstm_init_h tagger.lstm_init_c = first_tagger.lstm_init_c taggers += otaggers sentences = [ Sentence( "Zellecken der Lamina deutlich verdickt, die Zellwände getüpfelt." ), Sentence( "Die Spitze der Parichätialblätter entwickelt aus zahlreichen Zellen braune Rhizoiden." ), Sentence( "Blattränder überall bis zum nächsten Blatte herablaufend." ), Sentence( "Gefunden am Rande des Hammersees " ), Sentence( "Die Blumen sind von vollendeter Form mit elegant gewellten und nach Art der Petunien und chinesischen Primeln gefranzten Petalen, oft zur Füllung neigend und von edler, meist aufrechter Faltung ." ), Sentence( "Obwohl fast 70 Jahre alt, als er sich der mühevollen Aufgabe unterzog, diese schwierige Pilzgruppe systematisch zu beschreiben, widmete er dem Werke mit dem Eifer und der Schaffenskraft eines Jugendlichen seine letzten Lebensjahre fast ausschliefslich."
tag_dictionary = corpus.make_label_dictionary(tag_type) tagger.add_and_switch_to_new_task("zeroshot-moviecomplex-synonyms-to-conll3", tag_dictionary=tag_dictionary, tag_type=tag_type) result, eval_loss = tagger.evaluate(corpus.test) print(result.main_score) print(result.log_header) print(result.log_line) print(result.detailed_results) print(eval_loss) # evaluation sentences = [ Sentence( "The Parlament of the United Kingdom is discussing a variety of topics." ), Sentence( "A man fell in love with a woman. This takes place in the last century. The film received the Golden Love Film Award." ), Sentence("The Company of Coca Cola was invented in 1901."), Sentence("This is very frustrating! I was smiling since I saw you."), Sentence("The Green Party received only a small percentage of the vote."), Sentence( "Bayern Munich won the german soccer series the sixth time in a row.") ] tags = [[ "O", "B-Institution", "I-Institution", "B-Place", "I-Place", "B-Diverse", "I-Diverse" ], ["O", "B-Story", "I-Story", "B-Price", "I-Price", "B-Time", "I-Time"],
def bert_doc_embed(path):
    """Return the mean token embedding of a text file.

    The lines of the file are processed in groups of three. A group is
    embedded as one ``Sentence`` when it has fewer than ``MAX`` tokens.
    Otherwise the existing fallback rules apply:

    * a full group of three lines is embedded line by line;
    * when the file has fewer than three lines, only its first line is used;
    * an incomplete last group after at least one full group is dropped.

    If a single line of a fallback has more than ``MAX`` tokens, or the file
    yields no token at all (e.g. it is empty), ``'bad sentences'`` is printed
    and a zero vector of size 3072 is returned.

    Relies on the module-level ``embedding`` and ``MAX``.

    :param path: path of the text file, one sentence per line
    :return: tensor holding the sum of all token embeddings divided by the
        number of tokens, or ``torch.zeros(3072)`` on failure
    """
    with open(path, 'r') as handle:
        lines = handle.readlines()

    line_count = len(lines)
    print('number of sentences in text file: ' + str(line_count))

    def add_tokens(sentence, running_sum):
        # Embed the sentence and add its token vectors, in order, to the sum.
        embedding.embed(sentence)
        for token in sentence:
            if running_sum is None:
                running_sum = token.embedding
            else:
                running_sum = running_sum + token.embedding
        return running_sum

    full_groups = line_count // 3
    total = None
    token_count = 0

    for group_index, start in enumerate(range(0, line_count, 3)):
        group = lines[start:start + 3]
        sentence = Sentence(''.join(group))
        size = len(sentence)
        if size < MAX:
            token_count += size
            total = add_tokens(sentence, total)
            continue

        # The group as a whole is too long: fall back to single lines.
        if group_index < full_groups:
            fallback = group
        elif full_groups == 0:
            # NOTE(review): for a file with two lines the second line is
            # ignored here - kept as in the previous implementation.
            fallback = group[:1]
        else:
            # NOTE(review): a too-long incomplete last group is dropped
            # silently - kept as in the previous implementation.
            fallback = []

        for single_line in fallback:
            sentence = Sentence(single_line)
            size = len(sentence)
            if size > MAX:
                print('bad sentences')
                return torch.zeros(3072)
            token_count += size
            total = add_tokens(sentence, total)

    # Empty file or no tokens: same failure convention as above instead of an
    # IndexError / division by zero.
    if total is None or token_count == 0:
        print('bad sentences')
        return torch.zeros(3072)

    total = total / token_count
    print('embed success1' if full_groups == 0 else 'embed success2')
    return total
def embed(text, embedder):
    """Embed ``text`` with ``embedder`` and return the sentence embedding as a NumPy array.

    :param text: text passed to the flair ``Sentence`` constructor
    :param embedder: object exposing ``embed(Sentence)``
    :return: ``numpy`` array of the sentence-level embedding
    """
    sentence = Sentence(text)
    embedder.embed(sentence)
    # ``.cpu()`` is a no-op for CPU tensors and makes the conversion work when
    # the embedding lives on a GPU (``Tensor.numpy`` only accepts CPU tensors).
    return sentence.get_embedding().detach().cpu().numpy()
def contextualize(df, cluster_dump_dir):
    """Replace ambiguous words in ``df["sentence"]`` by ``word$<cluster id>``.

    Every row text is split into sentences and embedded with
    ``BertEmbeddings('bert-base-uncased')``. For each token that is neither a
    stop word nor empty after removing punctuation, the cluster centres are
    looked up (in memory first, then in
    ``<cluster_dump_dir><word>/cc.pkl``). If more than one centre exists, the
    token text is replaced by the word followed by ``$`` and the index of the
    most similar centre.

    :param df: DataFrame with a ``"sentence"`` column; it is modified in place
    :param cluster_dump_dir: directory prefix of the pickled cluster centres
    :return: ``(df, word_cluster)`` where ``word_cluster`` maps every word
        whose centres were loaded to those centres
    """

    def get_cluster(tok_vec, cc):
        # Index of the cluster centre with the highest cosine similarity.
        max_sim = -10
        max_sim_id = -1
        for i, cluster_center in enumerate(cc):
            sim = cosine_similarity(tok_vec, cluster_center)
            if sim > max_sim:
                max_sim = sim
                max_sim_id = i
        return max_sim_id

    def load_clusters(keys):
        # Cache first (in key order), then disk (in key order); raises the
        # error of the last attempt when nothing can be loaded.
        for key in keys:
            if key in word_cluster:
                return word_cluster[key]
        last_error = None
        for key in keys:
            try:
                # NOTE: pickle.load can execute arbitrary code; the dump
                # directory must be trusted.
                with open(cluster_dump_dir + key + "/cc.pkl", "rb") as handler:
                    centres = pickle.load(handler)
            except Exception as error:
                last_error = error
                continue
            word_cluster[key] = centres
            return centres
        raise last_error

    print("Contextualizing the corpus..")
    embedding = BertEmbeddings('bert-base-uncased')
    stop_words = set(stopwords.words('english'))
    stop_words.add('would')
    except_counter = 0
    word_cluster = {}
    # this tokenizer is used to check for length > 512
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    punctuation_table = str.maketrans('', '', string.punctuation)

    for index, row in df.iterrows():
        if index % 100 == 0:
            print("Finished rows: " + str(index) + " out of " + str(len(df)))
        line = row["sentence"]
        sentences = sent_tokenize(line)
        for sentence_ind, sent in enumerate(sentences):
            tokenized_text = tokenizer.tokenize(sent)
            if len(tokenized_text) > 512:
                print('sentence too long for Bert: truncating')
                # NOTE(review): ``sent`` is a string, so this keeps the first
                # 512 *characters* and separates them by spaces; it probably
                # was meant to keep the first 512 tokens - confirm the
                # intended truncation before changing it.
                sentence = Sentence(' '.join(sent[:512]), use_tokenizer=True)
            else:
                sentence = Sentence(sent, use_tokenizer=True)
            try:
                embedding.embed(sentence)
            except Exception:
                # Best effort as before: report the row and carry on.
                print(index)
                print(sentence)
            for token_ind, token in enumerate(sentence):
                word = token.text
                if word in stop_words:
                    continue
                word_clean = word.translate(punctuation_table)
                if len(word_clean) == 0 or word_clean in stop_words or "/" in word_clean:
                    continue
                try:
                    cc = load_clusters((word_clean, word))
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while getting clusters: ",
                          except_counter, index, e)
                    continue

                if len(cc) > 1:
                    tok_vec = token.embedding.cpu().numpy()
                    cluster = get_cluster(tok_vec, cc)
                    sentence.tokens[token_ind].text = word + "$" + str(cluster)
            sentences[sentence_ind] = to_tokenized_string(sentence)
        # Single-step label assignment: the chained form
        # ``df["sentence"][index] = ...`` may write to a temporary copy.
        df.loc[index, "sentence"] = " . ".join(sentences)
    return df, word_cluster
def ebm_comet_preprocessing(data, context_embedding, sentence,
                            label_representations, file):
    """Collect outcome-phrase centroids for multi-label annotated sentences.

    ``data`` is iterated line by line; lines containing ``'docx'`` are
    ignored. A line starting with ``[['P``, ``[['E`` or ``[['S`` (or
    containing ``[]``) is kept as the label list of the current sentence and
    parsed with ``ast.literal_eval``. Any other non-blank line is split into
    a ``(word, tag)`` pair and appended to ``sentence``. A ``'\\n'`` line
    closes the sentence: it is embedded with ``context_embedding``, every
    token is written to ``file`` as ``"<text> <tag>"`` and for every phrase
    starting with a ``B-`` tag one or more mean vectors are appended to
    ``label_representations`` under the labels of the matching annotation.

    NOTE(review): ``multi_labels`` is only bound once a label line has been
    seen; a sentence without a preceding label line would fail on first use.

    :param data: iterable of input lines
    :param context_embedding: object exposing ``embed(Sentence)``
    :param sentence: list used as running ``(word, tag)`` buffer; mutated and
        cleared after each processed sentence
    :param label_representations: dict ``label -> tensor``, updated in place
    :param file: writable text handle receiving the token/tag lines
    :return: ``label_representations``
    """
    for i in data:
        if not i.__contains__('docx'):
            if i != '\n':
                # Label line of the sentence vs. ordinary "word tag" line.
                if i.startswith("[['P") or i.startswith(
                        "[['E") or i.startswith("[['S") or re.search(
                            '\[\]', i):
                    multi_labels = i
                else:
                    i = i.split()
                    sentence.append((i[0], i[1]))
            elif i == '\n':
                if sentence:
                    sent_unpacked = ' '.join(i[0] for i in sentence)
                    tag_unpacked = [i[1] for i in sentence]
                    sent = Sentence(sent_unpacked.strip())
                    context_embedding.embed(sent)
                    v = ''
                    print('\n+++++++')
                    print(sent_unpacked)
                    print(tag_unpacked)
                    print(multi_labels, type(multi_labels))
                    # The label line is a Python literal (list of label lists).
                    multi_labels = ast.literal_eval(multi_labels)
                    # d: index of the next token to handle; ann: index of the
                    # next annotation in multi_labels; k is not used below.
                    d = k = ann = 0
                    for i in range(len(sent)):
                        # Tokens already consumed by a phrase are skipped.
                        if i == d:
                            if tag_unpacked[i].startswith('B-'):
                                b = sent[i].embedding
                                b = b.reshape(1, len(b))
                                z = sent[i].text
                                file.write('{} {}\n'.format(
                                    sent[i].text, tag_unpacked[i]))
                                out_domain = multi_labels[ann]
                                if out_domain[0][0] not in ['E', 'S']:
                                    # Plain phrase: B- followed by I- tokens.
                                    for j in range(i + 1, len(sent)):
                                        if tag_unpacked[j].startswith('I-'):
                                            file.write('{} {}\n'.format(
                                                sent[j].text, tag_unpacked[j]))
                                            inner_b = sent[j].embedding
                                            inner_b = inner_b.reshape(
                                                1, len(inner_b))
                                            b = torch.cat((b, inner_b), dim=0)
                                            d = j
                                        else:
                                            break
                                    # Mean over the stacked token vectors; the
                                    # same vector is stored under every label
                                    # of this annotation.
                                    b_mean = torch.mean(b, 0) if len(
                                        b.shape) == 2 else b
                                    b_mean = b_mean.reshape(1, len(b_mean))
                                    for dom in out_domain:
                                        if dom not in label_representations:
                                            label_representations[dom] = b_mean
                                        elif dom in label_representations:
                                            label_representations[
                                                dom] = torch.cat((
                                                    label_representations[dom],
                                                    b_mean),
                                                                 dim=0)
                                else:
                                    # First label starts with 'E' or 'S': the
                                    # annotation is split into several parts.
                                    # e_s_features holds (text, tag, stacked
                                    # vectors) per part, x_indices holds
                                    # (text, re_shape(embedding)) per token.
                                    e_s_features = []
                                    x_indices = []
                                    x_indices.append(
                                        (z, re_shape(sent[i].embedding)))
                                    for j in range(i + 1, len(sent)):
                                        inner_b = sent[j].embedding
                                        inner_b = inner_b.reshape(
                                            1, len(inner_b))
                                        if re.search(
                                                'E\d',
                                                tag_unpacked[j]) or re.search(
                                                    'S\d', tag_unpacked[j]):
                                            # Tag matching E<digit>/S<digit>
                                            # closes the annotation.
                                            b = torch.cat((b, inner_b), dim=0)
                                            z += ' ' + sent[j].text
                                            file.write('{} {}\n'.format(
                                                sent[j].text, tag_unpacked[j]))
                                            e_s_features.append(
                                                (z, tag_unpacked[j], b))
                                            x_indices.append(
                                                (sent[j].text,
                                                 re_shape(sent[j].embedding)))
                                            d = j
                                            break
                                        elif re.search(
                                                'B', tag_unpacked[j]
                                        ) or ('Seperator' == tag_unpacked[j]
                                              and out_domain[0][0] == 'S'):
                                            # Start of the next part: store the
                                            # current part and begin a new one.
                                            e_s_features.append(
                                                (z, tag_unpacked[j], b))
                                            z = '' if out_domain[0][
                                                0] == 'S' else sent[j].text
                                            file.write('{} {}\n'.format(
                                                sent[j].text, tag_unpacked[j]))
                                            b = sent[j].embedding
                                            b = b.reshape(1, len(b))
                                            x_indices.append(
                                                (sent[j].text,
                                                 re_shape(sent[j].embedding)))
                                        else:
                                            # Token continues the current part.
                                            z += ' ' + sent[j].text
                                            file.write('{} {}\n'.format(
                                                sent[j].text, tag_unpacked[j]))
                                            b = torch.cat((b, inner_b), dim=0)
                                            x_indices.append(
                                                (sent[j].text,
                                                 re_shape(sent[j].embedding)))
                                    # Last character of the first label, read
                                    # as the number of shared tokens.
                                    x = int(out_domain[0][-1])
                                    print([
                                        (i[0], i[1].shape) for i in x_indices
                                    ], '+++++++++++++++++#####################+++++++++',
                                          x, [(g[0], g[1], g[2].shape)
                                              for g in e_s_features])
                                    y_indices = []
                                    if re.search('E\d', out_domain[0]):
                                        # 'E' label: every part but the last
                                        # gets the last x token vectors
                                        # appended.
                                        for m in range(len(e_s_features)):
                                            if m < (len(e_s_features) - 1):
                                                _m_ = e_s_features[m][2]
                                                for t in range(x):
                                                    #print('Ennnnnnnnnnnnd',e_s_features[m][0])
                                                    _m_ = torch.cat(
                                                        (_m_,
                                                         x_indices[-(t + 1)][1]
                                                         ),
                                                        dim=0)
                                                y_indices.append(_m_)
                                        y_indices.append(e_s_features[-1][2])
                                    elif re.search('S\d', out_domain[0]):
                                        # 'S' label: every part but the first
                                        # gets the first x token vectors
                                        # prepended.
                                        for m in range(len(e_s_features)):
                                            if m > 0:
                                                _m_ = e_s_features[m][2]
                                                for t in range(x):
                                                    _m_ = torch.cat(
                                                        (x_indices[t][1], _m_))
                                                    #print('Staaaaaaaaaaaaaart',e_s_features[m][0])
                                                y_indices.append(_m_)
                                        y_indices.insert(0, e_s_features[0][2])
                                    # One mean vector per part, paired with the
                                    # labels after the first one.
                                    b_mean = []
                                    for d_ in y_indices:
                                        d_ = torch.mean(
                                            d_, 0) if len(d_.shape) > 1 else d_
                                        b_mean.append(d_.reshape(1, len(d_)))
                                    for b_, dom in zip(b_mean, out_domain[1:]):
                                        if dom not in label_representations:
                                            label_representations[dom] = b_
                                        elif dom in label_representations:
                                            label_representations[
                                                dom] = torch.cat((
                                                    label_representations[dom],
                                                    b_),
                                                                 dim=0)
                                ann += 1
                            else:
                                # Token outside any phrase is written as 'O'.
                                file.write('{} {}\n'.format(sent[i].text, 'O'))
                                pass
                            d += 1
                    # Blank line terminates the sentence in the output file.
                    file.write('\n')
                    sentence.clear()
    return label_representations
def interpret_sentence(flair_model_wrapper,
                       lig,
                       sentence,
                       target_label,
                       visualization_list,
                       n_steps=100,
                       estimation_method="gausslegendre",
                       internal_batch_size=None):
    """
    Attribute a single prediction to its input tokens and record the result.

    Inputs:
    flair_model_wrapper: callable wrapper around the Flair model; must expose
                         ``tokenizer`` and ``label_dictionary``.
    lig: the layer integrated gradient object used for attribution.
    sentence: raw text to interpret.
    target_label: the ground truth class-label of the sentence.
    visualization_list: list that receives one visualization record.
    n_steps, estimation_method, internal_batch_size: forwarded unchanged to
                         ``lig.attribute``.

    Returns the cleaned token strings, the per-token attributions and the
    absolute convergence delta.
    """
    tokenizer = flair_model_wrapper.tokenizer
    label_dictionary = flair_model_wrapper.label_dictionary

    # Index of the ground-truth label: attributions are computed towards it.
    target_index = label_dictionary.get_idx_for_item(target_label)

    # Run the text through Flair's own tokenization first so the token
    # boundaries agree with what the model sees.
    tokenized_text = Sentence(sentence).to_tokenized_string()
    input_ids = tokenizer.encode(tokenized_text,
                                 add_special_tokens=False,
                                 max_length=tokenizer.model_max_length,
                                 truncation=True,
                                 return_tensors="pt")

    # Baseline with the same shape as the input, filled with ones (the
    # original author treats id 1 as the padding token).
    ref_base_line = torch.ones_like(input_ids)

    # Map ids back to sub-word strings and drop the "▁" marker for display.
    sub_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    readable_tokens = [piece.replace("▁", "") for piece in sub_tokens]

    # The wrapper returns logits (preferred by Captum, see
    # https://github.com/pytorch/captum/issues/355#issuecomment-619610044),
    # so the softmax is applied here, on the first item only.
    logits = flair_model_wrapper(input_ids)
    probabilities = torch.nn.functional.softmax(logits[0], dim=0)
    conf, idx = torch.max(probabilities, 0)
    prediction_confidence = conf.item()
    predicted_index = idx.item()
    pred_label = label_dictionary.get_item_for_index(predicted_index)

    # Layer integrated gradients towards the ground-truth class.
    attributions_ig, delta = lig.attribute(
        input_ids,
        baselines=ref_base_line,
        n_steps=n_steps,
        return_convergence_delta=True,
        target=target_index,
        method=estimation_method,
        internal_batch_size=internal_batch_size)
    convergence_delta = abs(delta)

    print('pred: ', predicted_index, '(', '%.2f' % prediction_confidence, ')',
          ', delta: ', convergence_delta)

    word_attributions, attribution_score = summarize_attributions(
        attributions_ig)

    record = viz.VisualizationDataRecord(
        word_attributions=word_attributions,
        pred_prob=prediction_confidence,
        pred_class=pred_label,
        true_class=target_label,
        attr_class=target_label,
        attr_score=attribution_score,
        raw_input=readable_tokens,
        convergence_score=delta)
    visualization_list.append(record)

    # Returned for the caller's sanity checks.
    return readable_tokens, word_attributions, convergence_delta
# Minimal flair NER example: tag one sentence and print the entity spans.
from flair.data import Sentence
from flair.models import SequenceTagger

# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = SequenceTagger.load('ner')

# run NER over sentence (the predictions are stored on the sentence object)
tagger.predict(sentence)

print(sentence)
print('The following NER tags are found:')

# iterate over entities and print
for entity in sentence.get_spans('ner'):
    print(entity)
def _embed_in_batches(docs, batch_size, rest, offset, prepare, embed):
    """Yield ``(status, payload)`` for each full batch of ``docs``, then for the remainder.

    ``prepare`` turns a slice of ``docs`` into the backend's input and runs
    outside the ``try`` so only ``embed`` failures are reported as a failed
    batch. ``embed`` returns the embeddings for one prepared batch.
    """
    for i in range(0, len(docs) - rest, batch_size):
        first = offset + i
        last = offset + i + batch_size - 1
        batch = prepare(docs[i:i + batch_size])
        try:
            embeddings = embed(batch)
        except RuntimeError:
            print(
                f'could not embed sentences with index {first} '
                f'to {last}\nstoring in failed index list'
            )
            yield 0, (first, last)
        else:
            print(f'successfully embedded sentences {first} to {last}')
            yield 1, embeddings

    if rest:
        # The final, shorter batch. Its failure tuple keeps the historical
        # ``0`` end marker ("to the end") but now reports the start index
        # with ``offset`` applied, like every other index this generator
        # prints or yields.
        tail_start = offset + len(docs) - rest
        batch = prepare(docs[-rest:])
        try:
            embeddings = embed(batch)
        except RuntimeError:
            print(
                f'could not embed sentences from {tail_start} to the end'
                f'\nstoring in failed index list'
            )
            yield 0, (tail_start, 0)
        else:
            print(
                f'successfully embedded sentences from {tail_start} to the end'
            )
            yield 1, embeddings


def generate_embeddings(docs,
                        batch_size,
                        model_name='bert-base-cased',
                        pooling='mean',
                        offset=0):
    """
    Generator that embeds ``docs`` batch by batch.

    Each iteration yields a tuple ``(status, payload)``:

    * ``(1, embeddings)`` when the batch was embedded. For ``'mean'`` and
      ``'CLS'`` pooling ``embeddings`` is a list of numpy arrays (one per
      document); for ``'SentenceBert'`` it is whatever
      ``SentenceTransformer.encode`` returns.
    * ``(0, (first_index, last_index))`` when embedding the batch raised a
      ``RuntimeError``. For the final, shorter batch the tuple is
      ``(first_index, 0)``, where ``0`` stands for "to the end".

    All reported indices include ``offset``.

    :param docs: a list of strings for which embeddings should be created
    :param batch_size: how many documents are embedded at once
    :param model_name: name of the transformer model to load
    :param pooling: ``'mean'`` (mean-pooled word embeddings), ``'CLS'``
        (transformer document embeddings) or ``'SentenceBert'``
    :param offset: added to every printed/yielded index so that positions
        refer to a larger collection ``docs`` was sliced from
    :return: generator of ``(status, payload)`` tuples
    :raises Exception: ``"No Valid model"`` for an unknown ``pooling`` value;
        raised on first iteration because this is a generator
    """
    rest = len(docs) % batch_size

    if pooling in ('mean', 'CLS'):
        if pooling == 'mean':
            word_embedding = TransformerWordEmbeddings(
                model_name, layers='-1', allow_long_sentences=True)
            model = DocumentPoolEmbeddings([word_embedding],
                                           fine_tune_mode='none')
        else:
            model = TransformerDocumentEmbeddings(model_name)

        def prepare(batch):
            return [Sentence(text) for text in batch]

        def embed(sentences):
            # flair stores the embedding on each Sentence object in place.
            model.embed(sentences)
            return [
                sentence.get_embedding().detach().cpu().numpy()
                for sentence in sentences
            ]
    elif pooling == 'SentenceBert':
        model = SentenceTransformer(model_name)

        def prepare(batch):
            # SentenceTransformer consumes the raw strings directly.
            return batch

        embed = model.encode
    else:
        raise Exception("No Valid model")

    yield from _embed_in_batches(docs, batch_size, rest, offset, prepare,
                                 embed)
def get_indexed_data(self, data, data_type="train"):
    '''
    Convert raw samples into model-ready indexed samples.

    For every sample (a dict with at least a "text" key) a new dict is built
    that always contains "sample" (the original dict) and "padded_sent" (a
    flair Sentence padded with "[PAD]" tokens), plus, depending on which
    encoders are configured on ``self``:

    * subword tokenizer: "subword_input_ids", "attention_mask",
      "token_type_ids", "tok2char_span"
    * word tokenizer: "word_input_ids" and either "subword2word_idx_map"
      (subword tokenizer also present) or "tok2char_span" (word level)
    * char indexer: "char_input_ids_padded"

    if data_type == "test", matrix_spots is not included.

    An empty ``data`` returns an empty list (previously ``max()`` raised
    ``ValueError`` on the empty sequence).
    '''
    if len(data) == 0:
        return []

    indexed_sample_list = []
    max_word_num = self.max_word_num
    max_subword_num = self.max_subword_num
    max_char_num_in_tok = self.max_char_num_in_tok
    # The flair padding length is derived from the data itself so that flair
    # does not depend on max_word_num. Original author's note: if this causes
    # a bug, switch back to max_word_num; anything that uses flair should
    # then carry max_word_num along.
    max_word_num4flair = max(
        len(sample["text"].split(" ")) for sample in data)

    for sample in tqdm(data, desc="Generate indexed data"):
        text = sample["text"]
        indexed_sample = {}
        indexed_sample["sample"] = sample

        if self.subword_tokenizer is not None:
            # codes for bert input
            # NOTE(review): pad_to_max_length is reported as deprecated in
            # newer transformers releases in favour of padding="max_length";
            # confirm against the pinned version before changing it.
            bert_codes = self.subword_tokenizer.encode_plus(
                text,
                return_offsets_mapping=True,
                add_special_tokens=False,
                max_length=max_subword_num,
                truncation=True,
                pad_to_max_length=True)

            # get bert codes
            subword_input_ids = torch.tensor(bert_codes["input_ids"]).long()
            attention_mask = torch.tensor(
                bert_codes["attention_mask"]).long()
            token_type_ids = torch.tensor(
                bert_codes["token_type_ids"]).long()
            subword2char_span = bert_codes["offset_mapping"]

            indexed_sample["subword_input_ids"] = subword_input_ids
            indexed_sample["attention_mask"] = attention_mask
            indexed_sample["token_type_ids"] = token_type_ids
            # token is subword level
            indexed_sample["tok2char_span"] = subword2char_span

        # word level tokenizer
        if self.word_tokenizer is not None:  # use word enc
            if self.subword_tokenizer is not None:  # also use bert
                indexed_sample[
                    "word_input_ids"] = self.word_tokenizer.text2word_indices(
                        text, max_word_num)

                # subword2word_idx_map: map subword to corresponding word
                words = self.word_tokenizer.tokenize(text)
                subword2word_idx_map = []
                for wd_idx, wd in enumerate(words):
                    for subwd in self.subword_tokenizer.tokenize(wd):
                        if subwd != "[PAD]":
                            subword2word_idx_map.append(wd_idx)
                # Pad up to max_subword_num by repeating the last word index.
                if len(subword2word_idx_map) < max_subword_num:
                    subword2word_idx_map.extend(
                        [len(words) - 1] *
                        (max_subword_num - len(subword2word_idx_map)))
                subword2word_idx_map = torch.tensor(
                    subword2word_idx_map).long()
                indexed_sample["subword2word_idx_map"] = subword2word_idx_map
            else:  # do not use bert, but use word enc
                word_codes = self.word_tokenizer.encode_plus(
                    text, max_word_num)
                word2char_span = word_codes["offset_mapping"]
                # token is word level
                indexed_sample["tok2char_span"] = word2char_span
                indexed_sample["word_input_ids"] = word_codes["input_ids"]

        if self.text2char_indices_func is not None:  # use char enc
            char_input_ids = self.text2char_indices_func(text)
            char_input_ids_padded = []
            # Every token contributes exactly max_char_num_in_tok char ids:
            # shorter tokens are zero-padded, longer ones are truncated.
            for span in indexed_sample["tok2char_span"]:
                char_ids = char_input_ids[span[0]:span[1]]
                if len(char_ids) < max_char_num_in_tok:
                    char_ids.extend([0] *
                                    (max_char_num_in_tok - len(char_ids)))
                else:
                    char_ids = char_ids[:max_char_num_in_tok]
                char_input_ids_padded.extend(char_ids)
            char_input_ids_padded = torch.tensor(
                char_input_ids_padded).long()
            indexed_sample["char_input_ids_padded"] = char_input_ids_padded

        # prepare padded sentences for flair embeddings
        words = text.split(" ")
        words.extend(["[PAD]"] * (max_word_num4flair - len(words)))
        indexed_sample["padded_sent"] = Sentence(" ".join(words))

        # get spots
        if data_type != "test":
            matrix_spots = self.handshaking_tagger.get_spots(sample)
            indexed_sample["matrix_spots"] = matrix_spots

        indexed_sample_list.append(indexed_sample)
    return indexed_sample_list
@author: God
"""
# Demo script: run flair's NER tagger on a sample sentence, print the
# detected entity spans and the tagged string, then prepare a second sentence.

#import commands for flair ner
from flair.data import Sentence
from flair.models import SequenceTagger

#Load NER Model
tagger = SequenceTagger.load('ner')

#Sample text to run NER
text = 'Jackson is placed in Microsoft located in Redmond'

#passing text to sentence
sentence = Sentence(text)

# Run NER on sentence to identify Entities
tagger.predict(sentence)

# print the entities with below command
for entity in sentence.get_spans('ner'):
    print(entity)

# print the sentence with the predicted tags inlined
print(sentence.to_tagged_string())

#Sample text
text1 = 'Redmond is coming to New York city'

#passing text to sentence (this rebinds `sentence`; no prediction is run on it here)
sentence = Sentence(text1)
print( f"#####\n{s}\n morfeusz={is_valid1} stanza={is_valid2} krnnt={is_valid3}" ) #%% pos with flair from flair.data import Sentence from flair.models import SequenceTagger tagger = SequenceTagger.load("pos-multi") #%% sentence = sentences[0] print(f"\n>>>{sentence}") sent = Sentence(sentence) tagger.predict(sent) print(f"\n{sent.to_tagged_string()}") for t in sent.tokens: print(f"{t}- {t.get_tag('upos').value} {t.get_tag('upos').score}") conv_flair_get_pos = lambda x: x.get_tag("upos").value flair_ud_pos = list(map(conv_flair_get_pos, sent.tokens)) stats_flair_pos = Counter(flair_ud_pos) print(stats_flair_pos) # %% sentence taggers # docker run -p 9003:9003 -it djstrong/krnnt:1.0.0
def _embed_document(self, document_text: str,
                    doc_embeddings: DocumentPoolEmbeddings):
    """Embed ``document_text`` with ``doc_embeddings`` and return it as a NumPy array."""
    document = Sentence(document_text)
    # embed() stores the result on the Sentence object itself.
    doc_embeddings.embed(document)
    embedding = document.get_embedding()
    return embedding.data.cpu().numpy()
def test_sentence_to_plain_string():
    """With the tokenizer enabled, the final period becomes its own token."""
    expected = 'I love Berlin .'
    sentence: Sentence = Sentence('I love Berlin.', use_tokenizer=True)
    tokenized = sentence.to_tokenized_string()
    assert expected == tokenized
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import BertEmbeddings
import nltk

# Script: read one text file, sentence-split it with NLTK, glue the sentences
# back together and wrap the result in a flair Sentence for tagging.
filename = "/home/harsh/Downloads/data/abc_datafiles/01.txt"
# The context manager closes the handle even if read() raises; `file` stays
# bound (as a closed handle) for any later code that refers to it.
with open(filename, "r") as file:
    text = file.read()
# NOTE(review): str.replace returns a new string and the result is discarded
# here, so `text` is unchanged by this line. Left as-is because assigning the
# result would change the text that reaches the tagger — confirm the intent.
text.replace('\"', '\\"')
sent_text = nltk.sent_tokenize(text)
# Sentences are concatenated without a separator, exactly as before.
final_text = "".join(sent_text)
sentence = Sentence(final_text, use_tokenizer=True)

# load the NER tagger
# Part-of-Speech Tagging
tagger = SequenceTagger.load('pos')
# 4-class Named Entity Recognition
# tagger = SequenceTagger.load('ner')
# Semantic Frame Detection (Experimental)
# tagger = SequenceTagger.load('frame')
# Syntactic Chunking
# tagger = SequenceTagger.load('chunk')
# 12-class Named Entity Recognition
def _mark_entity_pair(sent, sent_offset, att_ent, drug_ent):
    """Return ``sent`` with the attribute and drug mentions wrapped in Start/End markers.

    Entity offsets are document-level; ``sent_offset`` is the document
    offset at which ``sent`` starts.
    """
    a1_start = att_ent.start - sent_offset
    a1_end = att_ent.end - sent_offset
    a1_text = sent[a1_start:a1_end]

    a2_start = drug_ent.start - sent_offset
    a2_end = drug_ent.end - sent_offset
    a2_text = sent[a2_start:a2_end]

    att_marked = " %sStart %s %sEnd " % (att_ent.cat, a1_text, att_ent.cat)
    if a1_start < a2_start:
        # arg1 occurs before arg2 (the drug marker has no trailing space in
        # this branch; kept byte-for-byte to match the classifier's input).
        return (sent[:a1_start] + att_marked + sent[a1_end:a2_start] +
                " DrugStart %s DrugEnd" % (a2_text) + sent[a2_end:])
    return (sent[:a2_start] + " DrugStart %s DrugEnd " % (a2_text) +
            sent[a2_end:a1_start] + att_marked + sent[a1_end:])


def main(args):
    """Classify attribute/drug entity pairs in every ``*.txt`` file and write ``.ann`` output.

    For each text file in ``args.data_dir`` the matching ``.ann`` file is read
    with ``read_brat_file``; its entities are copied to a new ``.ann`` file in
    ``args.output_dir``, followed by one ``R<n>`` line per predicted relation
    whose label is not ``'None'`` and contains the attribute entity's
    category.

    :param args: ignored — it is immediately replaced by
        ``parser.parse_args()`` (kept for interface compatibility)
    """
    args = parser.parse_args()

    # Loading classifier model:
    print("Loading classifier model")
    classifier = TextClassifier.load_from_file(
        join(args.model_dir, 'best-model.pt'))

    txt_files = glob.glob(join(args.data_dir, '*.txt'))

    sent_splitter = PunktSentenceTokenizer()
    # Number of following sentences appended to the primary one.
    sentence_lookahead = 0

    for txt_fn in txt_files:
        print("Processing %s" % (txt_fn))
        ann_input_fn = join(args.data_dir, basename(txt_fn)[:-3] + 'ann')
        ents, _ = read_brat_file(ann_input_fn)

        ann_output_fn = join(args.output_dir, basename(txt_fn)[:-3] + 'ann')
        with open(txt_fn, 'r') as myfile:
            text = myfile.read()

        # `with` guarantees the output file is flushed and closed even if
        # classification raises part-way through the document.
        with open(ann_output_fn, 'w') as ann_out:
            # Write entities right away:
            for ent_id, ent in ents.items():
                ent_text = text[ent.start:ent.end].replace('\n', ' ')
                ann_out.write(
                    '%s\t%s %d %d\t%s\n' %
                    (ent_id, ent.cat, ent.start, ent.end, ent_text))

            sent_spans = list(sent_splitter.span_tokenize(text))

            rel_ind = 0
            rel_attempts = 0
            for sent_ind, primary_sent_span in enumerate(sent_spans):
                end_window_ind = min(sent_ind + sentence_lookahead,
                                     len(sent_spans) - 1)
                end_sent_span = sent_spans[end_window_ind]

                sent = text[primary_sent_span[0]:end_sent_span[1]].replace(
                    '\n', ' ')
                drug_ents, att_ents = get_span_ents(primary_sent_span,
                                                    end_sent_span, ents)

                for att_ent in att_ents:
                    for drug_ent in drug_ents:
                        rel_text = _mark_entity_pair(sent,
                                                     primary_sent_span[0],
                                                     att_ent, drug_ent)
                        sentence = Sentence(rel_text, use_tokenizer=True)
                        labels = classifier.predict(sentence)[0].labels
                        if len(labels) > 1:
                            print(
                                '  This relation has more than one output label'
                            )
                        label = labels[0].value
                        rel_attempts += 1
                        if label == 'None':
                            continue
                        # Make sure label corresponds to entity type:
                        if label.find(att_ent.cat) < 0:
                            continue
                        ann_out.write(
                            'R%d\t%s Arg1:%s Arg2:%s\n' %
                            (rel_ind, label, att_ent.id, drug_ent.id))
                        rel_ind += 1
            # print("Finished: Found %d relations while making %d classification attempts" % (rel_ind, rel_attempts))
def read_column_data(path_to_column_file: Path,
                     column_name_map: Dict[int, str],
                     infer_whitespace_after: bool = True):
    """
    Reads a file in column format and produces a list of Sentence with tokenlevel annotation as specified in the
    column_name_map. For instance, by passing "{0: 'text', 1: 'pos', 2: 'np', 3: 'ner'}" as column_name_map you
    specify that the first column is the text (lexical value) of the token, the second the PoS tag, the third
    the chunk and the forth the NER tag.

    Lines starting with '#' are skipped. A line that is empty after stripping whitespace (and a byte-order mark)
    ends the current sentence. The file is read as UTF-8; if that fails to decode, it is re-read as latin-1.

    :param path_to_column_file: the path to the column file
    :param column_name_map: a map of column number to token annotation name
    :param infer_whitespace_after: if True, tries to infer whitespace_after field for Token
    :return: list of sentences
    """
    sentences: List[Sentence] = []

    # Only a decoding failure should trigger the latin-1 fallback; other
    # errors (e.g. a missing file) propagate immediately instead of being
    # logged as an encoding problem first.
    try:
        with open(str(path_to_column_file), encoding='utf-8') as column_file:
            lines: List[str] = column_file.read().strip().split('\n')
    except UnicodeDecodeError:
        log.info(
            'UTF-8 can\'t read: {} ... using "latin-1" instead.'.format(
                path_to_column_file))
        with open(str(path_to_column_file), encoding='latin1') as column_file:
            lines = column_file.read().strip().split('\n')

    # most data sets have the token text in the first column, if not, pass 'text' as column
    text_column: int = 0
    for column, annotation_name in column_name_map.items():
        if annotation_name == 'text':
            text_column = column

    def finish(current: Sentence) -> None:
        # Append a non-empty sentence, inferring whitespace only on request.
        if infer_whitespace_after:
            current.infer_space_after()
        sentences.append(current)

    sentence: Sentence = Sentence()
    for line in lines:

        if line.startswith('#'):
            continue

        # str.strip() does not remove U+FEFF, so a stray byte-order mark is
        # dropped explicitly before testing for a blank separator line.
        if line.strip().replace('\ufeff', '') == '':
            if len(sentence) > 0:
                finish(sentence)
            sentence = Sentence()
        else:
            fields: List[str] = re.split(r"\s+", line)
            token = Token(fields[text_column])
            for column in column_name_map:
                # Short rows simply carry fewer annotations.
                if len(fields) > column and column != text_column:
                    token.add_tag(column_name_map[column], fields[column])
            sentence.add_token(token)

    # The file may end without a trailing blank line.
    if len(sentence.tokens) > 0:
        finish(sentence)

    return sentences