def predict(sentence):
    """Tag one sentence with the custom NER model and return the tags as JSON.

    Args:
        sentence: Text to label. Any falsy value (``""``, ``None``) skips
            the model entirely.

    Returns:
        A JSON string encoding the first (and only) entry of the batch
        returned by ``NERModel.predict``, or ``None`` when ``sentence`` is
        falsy.
    """
    if not sentence:
        return None

    # NOTE(review): "I-" and "funda" look like an accidentally split
    # "I-funda"; left untouched because the label order presumably has to
    # match the saved checkpoint -- confirm before changing.
    tag_set = [
        "B-sector", "I-sector", "B-funda", "O", "operator", "threshold",
        "Join", "B-attr", "I-funda", "TPQty", "TPUnit", "Sortby", "B-eco",
        "I-eco", "B-index", "Capitalization", "I-", "funda", "B-security",
        "I-security", "Number", "Sector", "TPMonth", "TPYr", "TPRef",
    ]
    model_options = {
        "save_eval_checkpoints": False,
        "save_steps": -1,
        "output_dir": "NERMODEL",
        "overwrite_output_dir": True,
        "save_model_every_epoch": False,
        "reprocess_input_data": True,
        "train_batch_size": 10,
        "num_train_epochs": 15,
        "max_seq_length": 64,
    }

    # NOTE(review): the model is rebuilt from "NERMODEL1" on every call.
    tagger = NERModel(
        'bert',
        'NERMODEL1',
        labels=tag_set,
        args=model_options,
        use_cuda=False,
    )
    tagged_batch, _ = tagger.predict([sentence])
    return json.dumps(tagged_batch[0])
def test_named_entity_recognition():
    """Smoke-test ``NERModel``: train, evaluate and predict on tiny inline data."""
    frame_columns = ["sentence_id", "words", "labels"]

    # Toy rows in (sentence_id, word, label) form.
    training_rows = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "started", "O"],
        [1, "with", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "can", "O"],
        [1, "now", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    evaluation_rows = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "was", "O"],
        [1, "built", "O"],
        [1, "for", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "then", "O"],
        [1, "expanded", "O"],
        [1, "to", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    train_frame = pd.DataFrame(training_rows, columns=frame_columns)
    eval_frame = pd.DataFrame(evaluation_rows, columns=frame_columns)

    # CPU-only model that never writes checkpoints to disk.
    ner = NERModel(
        "bert",
        "bert-base-cased",
        args={
            "no_save": True,
            "overwrite_output_dir": True,
            "reprocess_input_data": False,
        },
        use_cuda=False,
    )

    ner.train_model(train_frame)

    # The test only checks that each stage runs; results are not asserted.
    result, model_outputs, predictions = ner.eval_model(eval_frame)
    predictions, raw_outputs = ner.predict(["Some arbitary sentence"])
def predict():
    """Tag every sentence of an uploaded CSV with the aspect model, then redirect.

    On a POST request the uploaded ``file`` is read as CSV, its ``sentence``
    column is run through the aspect ``NERModel``, the predicted tags are
    printed, and the client is redirected to the Tableau dashboard.

    Returns:
        The redirect response for POST requests; ``None`` for any other
        method.
    """
    if request.method != 'POST':
        return None

    uploaded_csv = request.files.get('file')
    test_sents = pd.read_csv(uploaded_csv)

    model_options = {
        "use_cuda": False,
        "save_eval_checkpoints": False,
        "save_steps": -1,
        "output_dir": "MODEL",
        'overwrite_output_dir': True,
        "save_model_every_epoch": False,
    }
    aspect_model = NERModel(
        "bert",
        "/Users/mutaz/Desktop/Mutaz Thesis bert/hotel_reviews",
        use_cuda=False,
        labels=["B-A", "I-A", "O"],
        args=model_options,
    )
    predictions, raw_outputs = aspect_model.predict(test_sents["sentence"])

    # Each prediction is a list of one-entry mappings; keep only the tags.
    pred = [
        [tag for token in sentence for tag in token.values()]
        for sentence in predictions
    ]
    print(pred)
    # connect(test_sents, aspects_prds)
    return redirect('https://dub01.online.tableau.com/#/site/absa/views/absa_project/MAIN?:iid=20&:original_view=y')
# Notebook cells (IPython): the `!pip` lines below are shell magics and only
# run inside a notebook. `args`, `label`, `train_data`, `test_data` and
# `accuracy_score` are defined in earlier cells that are not shown here.

# Override training hyper-parameters on the existing args object.
args.learning_rate=1e-4
args.overwrite_output_dir=True
args.train_batch_size=32
args.eval_batch_size=32

# Fine-tune bert-base-cased for NER, evaluating on the test split as it trains.
model=NERModel('bert', 'bert-base-cased', labels=label, args=args)
model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

# Evaluate on the test split; the bare name displays the value in the notebook.
result, model_outputs, preds_list=model.eval_model(test_data)
result

# Tag a sample sentence and display the prediction.
prediction, model_output=model.predict(["This is Nishi"])
prediction

# Separate experiment: extractive summarisation of a Wikipedia article.
!pip install bert-extractive-summarizer
!pip install wikipedia
import wikipedia
wiki = wikipedia.page('Amsterdam')
article=wiki.content
print(article)
from summarizer import Summarizer
# NOTE: this rebinds `model`, replacing the NER model created above.
model = Summarizer()
class NerModel:
    """Convenience wrapper around a BERT ``NERModel``.

    The wrapper keeps the dataset and the model together and offers
    ``train``, ``eval``, ``simple_test`` and ``predict`` helpers.
    """

    def __init__(self, modelname="", dataset=None, use_saved_model=False):
        """Create or reload the underlying ``NERModel``.

        Args:
            modelname: Suffix of the output directory (``outputs_<modelname>``).
            dataset: Mapping with the keys ``'labels_list'``, ``'train'`` and
                ``'val'``. May be ``None`` for prediction-only use, in which
                case no label list is passed to the model.
            use_saved_model: When true, load the model previously saved in
                the output directory instead of ``bert-base-cased``.
        """
        self.dataset = dataset
        output_dir = "outputs_{}".format(modelname)

        model_args = {
            'output_dir': output_dir,
            'overwrite_output_dir': True,
            'reprocess_input_data': True,
            'save_eval_checkpoints': False,
            'save_steps': -1,
            'save_model_every_epoch': False,
            'train_batch_size': 10,
            'num_train_epochs': 10,
            'max_seq_length': 256,
            'gradient_accumulation_steps': 8,
        }
        # train()/eval() already cope with a missing dataset, so construction
        # must not fail on the default ``dataset=None`` either.
        if dataset is not None:
            model_args['labels_list'] = dataset['labels_list']

        model_source = output_dir if use_saved_model else "bert-base-cased"
        self.model = NERModel("bert", model_source, use_cuda=False, args=model_args)

    def train(self):
        """Train the model on ``dataset['train']``.

        Raises:
            Exception: If no dataset was supplied.
        """
        if not self.dataset:
            raise Exception("dataset is None")
        self.model.train_model(self.dataset['train'])

    def eval(self):
        """Evaluate the model on ``dataset['val']`` and print the result.

        Raises:
            Exception: If no dataset was supplied.
        """
        if not self.dataset:
            raise Exception("dataset is None")
        result, model_outputs, predictions = self.model.eval_model(self.dataset['val'])
        print("Evaluation result:", result)

    def simple_test(self):
        """Print predictions and per-token probabilities for two fixed sentences."""
        sentences = ["Some arbitary sentence", "Simple Transformers sentence"]
        predictions, raw_outputs = self.model.predict(sentences)
        print(predictions)

        # More detailed predictions: one line per token.
        for sentence, preds, outs in zip(sentences, predictions, raw_outputs):
            print("\n___________________________")
            print("Sentence: ", sentence)
            for pred, out in zip(preds, outs):
                key = list(pred.keys())[0]
                new_out = out[key]
                # Average the raw outputs stored for this token, then
                # normalise them. A separate name avoids rebinding ``preds``
                # while it is being iterated.
                probs = list(softmax(np.mean(new_out, axis=0)))
                print(key, pred[key], probs[np.argmax(probs)], probs)

    def predict(self, sentences):
        """Return the model's predictions for ``sentences``; raw outputs are dropped."""
        predictions, raw_outputs = self.model.predict(sentences)
        return predictions
class DependencyModel:
    """Relation extractor: an NER tagger plus a dependency-path classifier."""

    def __init__(self, ner_path: str, deps_path: str):
        """Create/Load a new DependencyModel

        Args:
            ner_path (str): directory of NER model. (if not exists, create a new model)
            deps_path (str): directory of classification model.
        """
        self.ner_path = ner_path
        try:
            self.ner_model = NERModel("distilbert", ner_path, use_cuda=False)
        except Exception:
            # Nothing usable at ner_path: start from the pretrained base model.
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            self.ner_model = NERModel("distilbert", "distilbert-base-uncased", labels=custom_labels, use_cuda=False)
        self.deps_model = ClassifyModel(deps_path)

    def train_ner(self, ner_data: Iterable[Tuple[str, str, str]]):
        """Train the NER model

        Args:
            ner_data (Iterable[Tuple[str, str, str]]): iterable of tuple of (sentence, entity1, entity2).
        """
        rows = []
        tokenizer = nltk.RegexpTokenizer(r"[A-Za-z']+")
        for i, (sentence, e1, e2) in enumerate(ner_data):
            entity_words = {e1.lower(), e2.lower()}
            for word in tokenizer.tokenize(sentence):
                # A word is tagged "E" when it equals either entity, ignoring case.
                label = "E" if word.lower() in entity_words else "O"
                rows.append((i, word, label))
        # Passing the columns up front also works when there are no rows.
        train_data = pd.DataFrame(rows, columns=["sentence_id", "words", "labels"])
        # `train_model` is the public entry point that accepts a DataFrame.
        # ner_path may already hold the loaded model, so allow overwriting it.
        self.ner_model.train_model(
            train_data,
            output_dir=self.ner_path,
            args={"overwrite_output_dir": True},
        )

    def train_deps(self, train_data: Iterable[Tuple[str, Tuple[str, str, str]]]):
        """Train the dependency tree path classification model.

        Args:
            train_data (Iterable[Tuple[str, Tuple[str, str, str]]]): iterable of (sentence, (relation, entity1, entity2))
        """
        classify_data: List[Tuple[str, int]] = []
        for sentence, (relation, e1, e2) in train_data:
            tree = DepTree(sentence)
            classify_data.append((" ".join(tree.shortest_path(e1, e2)), relation_list.index(relation)))
        # Train on the (path text, relation index) pairs built above, not on
        # the raw input, which may be an iterator that is already exhausted.
        self.deps_model.train(classify_data)

    def train(self, train_data: Iterable[Tuple[str, Tuple[str, str, str]]]):
        """Train both the NER model AND the classification model.

        Args:
            train_data (Iterable[Tuple[str, Tuple[str, str, str]]]): iterable of (sentence, (relation, entity1, entity2))
        """
        # Materialise once so both training passes see every sample.
        train_data = list(train_data)
        self.train_ner([(sent, e1, e2) for sent, (rela, e1, e2) in train_data])
        self.train_deps(train_data)

    def predict(self, data: Iterable[str]) -> List[Tuple[str, Tuple[str, str]]]:
        """Predict the relation extracted from input sentences.

        Args:
            data (Iterable[str]): list of input sentences.

        Returns:
            List[Tuple[str, Tuple[str, str]]]: list of (relation, (entity1, entity2))
        """
        data = list(data)
        predictions, raws = self.ner_model.predict(data)
        predict_data = []
        entities = []
        for raw, sentence in zip(raws, data):
            # get_entity is expected to yield exactly two entities per sentence.
            [e1, e2] = get_entity(raw)
            tree = DepTree(sentence)
            entities.append((e1, e2))
            predict_data.append(" ".join(tree.shortest_path(e1, e2)))
        return list(zip(self.deps_model.predict(predict_data), entities))
if __name__ == '__main__':
    # Load both splits; txt_to_df is defined elsewhere in this file.
    train_df = txt_to_df("/Users/mutaz/Desktop/Mutaz Thesis bert/Data/raw/train.txt")
    test_df = txt_to_df("/Users/mutaz/Desktop/Mutaz Thesis bert/Data/raw/test.txt")
    test_df.reset_index(drop=True, inplace=True)
    # test_df["Sentence #"] = test_df["Sentence #"].values

    # Keep our own logging at INFO but silence the transformers chatter.
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    # Rename the columns to the names NERModel expects.
    train_df = pd.DataFrame(train_df.values, columns=['sentence_id', 'words', 'labels'])
    test_df = pd.DataFrame(test_df.values, columns=['sentence_id', 'words', 'labels'])

    model_used = "/Users/mutaz/Desktop/Mutaz Thesis bert/hotel_reviews"
    mm, epp, lrr, f11, results = [], [], [], [], {}
    aspect_model = NERModel(
        "bert",
        model_used,
        labels=["B-A", "I-A", "O"],
        args={
            "save_eval_checkpoints": False,
            "save_steps": -1,
            'overwrite_output_dir': True,
            "save_model_every_epoch": False,
            'reprocess_input_data': True,
            "train_batch_size": 5,
            'num_train_epochs': 2,
            "gradient_accumulation_steps": 5,
            "output_dir": "/Users/mutaz/Desktop/Mutaz Thesis bert",
            "learning_rate": 0.0001,
        },
        use_cuda=False,
    )
    # aspect_model.train_model(train_df)

    # Score only the first 100 token rows, grouped back into sentences.
    test_df = test_df.iloc[:100]
    a = test_df.groupby("sentence_id")
    gps = [group for _, group in a]
    actual = [list(g.labels.values) for g in gps]
    sentences = [" ".join(list(g.words.values)) for g in gps]

    predictions, raw_outputs = aspect_model.predict(sentences)
    # Each prediction is a list of one-entry mappings; keep only the tags.
    pred = [
        [tag for token in sentence for tag in token.values()]
        for sentence in predictions
    ]

    clear_output()
    f1 = f1_score(actual, pred, mode='strict', scheme=IOB2)
    print(f1)
line = line.split('\t') if len(line) == 3: eval_data.append(line) eval_df = pd.DataFrame(eval_data, columns=['sentence_id', 'words', 'labels']) # test = eval_df.groupby('sentence_id').count()[['words']] # print(test.query("words > 128")) # Create a NERModel model = NERModel('albert', 'albert-base-v2', use_cuda=False, args={ 'overwrite_output_dir': True, 'reprocess_input_data': True }) # # Train the model model.train_model(train_df) # Evaluate the model result, model_outputs, predictions = model.eval_model(eval_df) # Predictions on arbitary text strings predictions, raw_outputs = model.predict( ["Peter er lige begyndt til badminton."]) print(predictions)
class CustomSlotAnalysis(BaseEstimator, TransformerMixin):
    """Pipeline step that attaches slot entities predicted by a BERT NER model.

    ``transform`` receives a message dict (keys ``'text'`` and
    ``'correctness'``) and returns the same dict with an ``'entities'`` list
    added.
    """

    def __init__(self):
        """Load the fine-tuned slot model, on the GPU when one is available."""
        use_cuda = torch.cuda.is_available()
        model_args = {"use_multiprocessing": False}
        self.labels = [
            'B-aircraft_code', 'B-airline_code', 'B-airline_name',
            'B-airport_code', 'B-airport_name',
            'B-arrive_date.date_relative', 'B-arrive_date.day_name',
            'B-arrive_date.day_number', 'B-arrive_date.month_name',
            'B-arrive_date.today_relative', 'B-arrive_time.end_time',
            'B-arrive_time.period_mod', 'B-arrive_time.period_of_day',
            'B-arrive_time.start_time', 'B-arrive_time.time',
            'B-arrive_time.time_relative', 'B-booking_class', 'B-city_name',
            'B-class_type', 'B-compartment', 'B-connect', 'B-cost_relative',
            'B-day_name', 'B-day_number', 'B-days_code',
            'B-depart_date.date_relative', 'B-depart_date.day_name',
            'B-depart_date.day_number', 'B-depart_date.month_name',
            'B-depart_date.today_relative', 'B-depart_date.year',
            'B-depart_time.end_time', 'B-depart_time.period_mod',
            'B-depart_time.period_of_day', 'B-depart_time.start_time',
            'B-depart_time.time', 'B-depart_time.time_relative', 'B-economy',
            'B-fare_amount', 'B-fare_basis_code', 'B-flight', 'B-flight_days',
            'B-flight_mod', 'B-flight_number', 'B-flight_stop',
            'B-flight_time', 'B-fromloc.airport_code',
            'B-fromloc.airport_name', 'B-fromloc.city_name',
            'B-fromloc.state_code', 'B-fromloc.state_name', 'B-meal',
            'B-meal_code', 'B-meal_description', 'B-mod', 'B-month_name',
            'B-or', 'B-period_of_day', 'B-restriction_code',
            'B-return_date.date_relative', 'B-return_date.day_name',
            'B-return_date.day_number', 'B-return_date.month_name',
            'B-return_date.today_relative', 'B-return_time.period_mod',
            'B-return_time.period_of_day', 'B-round_trip', 'B-state_code',
            'B-state_name', 'B-stoploc.airport_code',
            'B-stoploc.airport_name', 'B-stoploc.city_name',
            'B-stoploc.state_code', 'B-time', 'B-time_relative',
            'B-today_relative', 'B-toloc.airport_code',
            'B-toloc.airport_name', 'B-toloc.city_name',
            'B-toloc.country_name', 'B-toloc.state_code',
            'B-toloc.state_name', 'B-transport_type', 'I-airline_name',
            'I-airport_name', 'I-arrive_date.date_relative',
            'I-arrive_date.day_number', 'I-arrive_time.end_time',
            'I-arrive_time.period_of_day', 'I-arrive_time.start_time',
            'I-arrive_time.time', 'I-arrive_time.time_relative',
            'I-city_name', 'I-class_type', 'I-cost_relative',
            'I-depart_date.date_relative', 'I-depart_date.day_number',
            'I-depart_date.today_relative', 'I-depart_time.end_time',
            'I-depart_time.period_of_day', 'I-depart_time.start_time',
            'I-depart_time.time', 'I-depart_time.time_relative', 'I-economy',
            'I-fare_amount', 'I-fare_basis_code', 'I-flight_mod',
            'I-flight_number', 'I-flight_stop', 'I-flight_time',
            'I-fromloc.airport_name', 'I-fromloc.city_name',
            'I-fromloc.state_name', 'I-meal_code', 'I-meal_description',
            'I-period_of_day', 'I-restriction_code',
            'I-return_date.date_relative', 'I-return_date.day_number',
            'I-return_date.today_relative', 'I-round_trip', 'I-state_name',
            'I-stoploc.city_name', 'I-time', 'I-today_relative',
            'I-toloc.airport_name', 'I-toloc.city_name',
            'I-toloc.state_name', 'I-transport_type', 'O',
        ]
        self.model = NERModel(
            'bert',
            '../models/checkpoint-1120-epoch-2',
            use_cuda=use_cuda,
            labels=self.labels,
            args=model_args,
        )

    def fit(self, X=None, y=None):
        """No-op fit; returns ``self``.

        ``X`` and ``y`` are accepted (and ignored) so the step also works
        when ``fit`` is called with data, as ``TransformerMixin.fit_transform``
        does.
        """
        print('\n>>>>>>>fit() called. \n')
        return self

    def convert_to_dict(self, flag, ent, text):
        """Store slot/value pairs under ``text['entities']`` and return ``text``.

        Args:
            flag: Sequence of slot labels.
            ent: Sequence of slot values, parallel to ``flag``.
            text: Message dict; it is modified in place.
        """
        text['entities'] = [
            {"slot": slot, "value": ent[i], "extractor": "slot_extractor"}
            for i, slot in enumerate(flag)
        ]
        return text

    def transform(self, text):
        """Attach predicted slots to ``text`` and return it.

        When ``text['correctness']['value']`` is not ``'correct'`` a single
        placeholder entity (slot ``'-'``, value ``'-'``) is attached and the
        model is not run at all.
        """
        slot_res = text['correctness']['value']
        if slot_res != 'correct':
            return self.convert_to_dict('-', '-', text)

        # predict() returns (predictions, raw_outputs); take the predictions
        # for the single sentence that was passed in.
        prediction = self.model.predict([text['text']])
        prediction = prediction[0][0]

        ent = []
        flag = []
        for i in prediction:
            key = list(i.keys())[0]
            if i[key] != 'O':
                flag.append(i[key])
                ent.append(key.split('{')[0])
        return self.convert_to_dict(flag, ent, text)
transformers_logger.setLevel(logging.WARNING) # Creating train_df and eval_df for demonstration train_data = [ [0, 'Simple', 'B-MISC'], [0, 'Transformers', 'I-MISC'], [0, 'started', 'O'], [1, 'with', 'O'], [0, 'text', 'O'], [0, 'classification', 'B-MISC'], [1, 'Simple', 'B-MISC'], [1, 'Transformers', 'I-MISC'], [1, 'can', 'O'], [1, 'now', 'O'], [1, 'perform', 'O'], [1, 'NER', 'B-MISC'] ] train_df = pd.DataFrame(train_data, columns=['sentence_id', 'words', 'labels']) eval_data = [ [0, 'Simple', 'B-MISC'], [0, 'Transformers', 'I-MISC'], [0, 'was', 'O'], [1, 'built', 'O'], [1, 'for', 'O'], [0, 'text', 'O'], [0, 'classification', 'B-MISC'], [1, 'Simple', 'B-MISC'], [1, 'Transformers', 'I-MISC'], [1, 'then', 'O'], [1, 'expanded', 'O'], [1, 'to', 'O'], [1, 'perform', 'O'], [1, 'NER', 'B-MISC'] ] eval_df = pd.DataFrame(eval_data, columns=['sentence_id', 'words', 'labels']) # print(train_df) # print(eval_df) # Create a NERModel model = NERModel('bert', 'bert-base-cased', args={'overwrite_output_dir': True, 'reprocess_input_data': True}) # Train the model model.train_model(train_df) # # Evaluate the model result, model_outputs, predictions = model.eval_model(eval_df) print(result, predictions) # # Predictions on arbitary text strings predictions, raw_outputs = model.predict(["Simple Transformers started with text classification"]) print(predictions) # print(raw_outputs)
class RestorePuncts:
    """Restore punctuation and capitalisation in unpunctuated text.

    Long inputs are cut into overlapping word slices, each slice is labelled
    by a token-classification model, and the per-slice predictions are
    stitched back together before the labels are applied to the words.
    """

    def __init__(self, wrds_per_pred=250):
        """Load the punctuation model.

        Args:
            wrds_per_pred: Number of words per slice sent to the model; each
                slice additionally carries ``overlap_wrds`` (30) words of the
                next slice.
        """
        self.wrds_per_pred = wrds_per_pred
        self.overlap_wrds = 30
        # Label = punctuation mark to append ('O' for none) followed by a
        # capitalisation flag ('U' to capitalise, 'O' to leave as is).
        self.valid_labels = [
            'OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O',
            ':U', "'O", '-O', '?O', '?U',
        ]
        self.model = NERModel(
            "bert",
            "felflare/bert-restore-punctuation",
            labels=self.valid_labels,
            args={
                "silent": True,
                "max_seq_length": 512,
            },
        )
        # HACK: use_cuda isn't working and this seems to load the model
        # correctly onto the GPU. NOTE(review): the device index is
        # hard-coded to "cuda:1" -- confirm this matches the deployment host.
        self.model.device = torch.device("cuda:1")
        # Dummy punctuate call to load the model onto the GPU.
        self.punctuate("hello how are you")

    def punctuate(self, text: str, batch_size: int = 32, lang: str = ''):
        """Perform punctuation restoration on arbitrarily large text.

        Args:
            text (str): Text to punctuate, from a few words to any length.
            batch_size (int): Number of slices sent to the model per call.
            lang (str): Kept for backward compatibility and currently
                ignored; language detection is disabled.

        Returns:
            The punctuated text. Blank input yields ``""`` without running
            the model.
        """
        if not text or not text.strip():
            return ""

        # Split large text into BERT-digestible slices.
        splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
        texts = [piece["text"] for piece in splits]

        # Predict slice by slice in batches; the raw outputs are discarded.
        preds_lst = []
        for start in range(0, len(texts), batch_size):
            batch_preds, _ = self.model.predict(texts[start:start + batch_size])
            preds_lst.extend(batch_preds)

        # Join the slice predictions, then apply the labels to the words.
        combined_preds = self.combine_results(text, preds_lst)
        return self.punctuate_texts(combined_preds)

    def predict(self, input_slice):
        """Pass one unpunctuated slice to the model.

        Returns:
            The ``(predictions, raw_outputs)`` pair from the model.
        """
        predictions, raw_outputs = self.model.predict([input_slice])
        return predictions, raw_outputs

    @staticmethod
    def split_on_toks(text, length, overlap):
        """Split text into overlapping slices with offsets into the original.

        Each slice holds ``length`` words plus up to ``overlap`` words of the
        following slice, which keeps every model input below the 512-token
        limit.

        Example output:
        [{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
        """
        wrds = text.replace('\n', ' ').split(" ")
        resp = []
        lst_chunk_idx = 0
        i = 0

        while True:
            # Words in the chunk and the overlapping portion.
            wrds_len = wrds[(length * i):(length * (i + 1))]
            wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
            wrds_split = wrds_len + wrds_ovlp

            # Stop once there are no more words.
            if not wrds_split:
                break

            wrds_str = " ".join(wrds_split)
            nxt_chunk_start_idx = len(" ".join(wrds_len))
            lst_char_idx = len(wrds_str)

            resp.append({
                "text": wrds_str,
                "start_idx": lst_chunk_idx,
                "end_idx": lst_char_idx + lst_chunk_idx,
            })
            # +1 accounts for the space that separated the two chunks.
            lst_chunk_idx += nxt_chunk_start_idx + 1
            i += 1
        logging.info("Sliced transcript into %d slices.", len(resp))
        return resp

    @staticmethod
    def combine_results(full_text: str, text_slices):
        """Merge per-slice predictions into one prediction list for the text.

        Args:
            full_text: The original, unpunctuated text.
            text_slices: One list of ``{word: label}`` dicts per slice.

        Returns:
            A list of ``(word, label)`` tuples, one per word of ``full_text``.

        Raises:
            AssertionError: If the merged words do not reproduce the words
                of ``full_text``.
        """
        split_full_text = [w for w in full_text.replace('\n', ' ').split(" ") if w]
        split_full_text_len = len(split_full_text)
        output_text = []
        index = 0

        # A very short trailing slice is dropped; its words are taken from
        # the overlap of the slice before it.
        if len(text_slices) > 1 and len(text_slices[-1]) <= 3:
            text_slices = text_slices[:-1]

        last_pos = len(text_slices) - 1
        for pos, _slice in enumerate(text_slices):
            # Decide "last slice" by position; comparing list contents would
            # misfire if an earlier slice happened to equal the last one.
            is_last = pos == last_pos
            slice_wrds = len(_slice)
            for ix, wrd in enumerate(_slice):
                if index == split_full_text_len:
                    break
                pred_item_tuple = list(wrd.items())[0]
                if split_full_text[index] != str(pred_item_tuple[0]):
                    continue
                # In non-final slices the last two words are not accepted.
                if is_last or ix <= slice_wrds - 3:
                    index += 1
                    output_text.append(pred_item_tuple)

        if [item[0] for item in output_text] != split_full_text:
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError("combined predictions do not match the input text")
        return output_text

    @staticmethod
    def punctuate_texts(full_pred: list):
        """Apply predicted labels to their words and return the final text.

        Args:
            full_pred: List of ``(word, label)`` pairs.

        Returns:
            The punctuated text, with a trailing period appended when the
            text ends in an alphanumeric character. Empty input yields ``""``.
        """
        punctuated = []
        for word, label in full_pred:
            punct_wrd = word.capitalize() if label[-1] == "U" else word
            if label[0] != "O":
                punct_wrd += label[0]
            punctuated.append(punct_wrd)

        punct_resp = " ".join(punctuated).strip()
        # Append trailing period if it doesn't exist (guarding against "").
        if punct_resp and punct_resp[-1].isalnum():
            punct_resp += "."
        return punct_resp
labels = f.readlines() labels = [i.strip() for i in labels] train_args = { "output_dir": "ner_output", "overwrite_output_dir": True, "use_multiprocessing": False, "save_steps": 0, "use_early_stopping": True, "early_stopping_patience": 4, "evaluate_during_training": True, "reprocess_input_data": False, "use_cached_eval_features": True, "fp16": False, "num_train_epochs": 10, "evaluate_during_training_steps": 10000, "train_batch_size": 32, 'cross_entropy_ignore_index': 0, 'classification_report': True } model = NERModel("electra", "ner_output/checkpoint-150000", args=train_args, labels=labels, use_cuda=True, crf=True) result = model.predict([list('发烧头痛3天'), list('见盲肠底,升结肠近肝曲')], split_on_space=False) print(result)
class NerModel: def __init__(self, modelname="", dataset=None, use_saved_model=False, input_dir=None, output_dir=None): #pretrained_model_name = "lm_outputs_test/from_scratch/best_model" self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') self.dataset = dataset #labels_list = ["O", "B-ACT", "I-ACT", "B-OBJ", "I-OBJ", "B-VAL", "I-VAL", "B-VAR", "I-VAR"] #labels_list = dataset.get_labels_list() labels_list = dataset['labels_list'] #labels_list = ['O', 'B-ACT', 'I-ACT', 'B-OBJ', 'I-OBJ', 'B-CNT', 'I-CNT', # 'B-OPE', 'I-OPE', 'B-ORD', 'B-PRE', 'I-PRE', 'B-TYP', # 'B-VAL', 'I-VAL', 'B-ATT', 'I-ATT', 'B-VAR', 'I-VAR'] #output_dir = "outputs_{}".format(modelname) os.system("{} -rf".format(output_dir)) use_cuda = torch.cuda.is_available() # Create a NERModel model_args = { 'labels_list': labels_list, 'output_dir': output_dir, 'overwrite_output_dir': True, 'reprocess_input_data': True, 'save_eval_checkpoints': False, 'save_steps': -1, 'save_model_every_epoch': False, #'no_save' : True, #'no_cache': True, 'evaluate_during_training': True, 'num_train_epochs': 15, # 5 'train_batch_size': 10, # 10 (<=10 for bert, <=5 for longformer) 'eval_batch_size': 10, 'max_seq_length': 128, # default 128 'gradient_accumulation_steps': 8, 'learning_rate': 0.0001, # default 4e-5; a good value is 0.0001 for albert #'max_position_embeddings': 64, } #self.model = NERModel("bert", pretrained_model_name, use_cuda=False, args=model_args) #self.model = NERModel("bert", "bert-base-uncased", use_cuda=False, args=model_args) #self.model = NERModel("longformer", "allenai/longformer-base-4096", use_cuda=False, args=model_args) #self.model = NERModel("longformer", pretrained_model_name, use_cuda=False, args=model_args) #self.model = NERModel("xlmroberta", "xlm-roberta-base", use_cuda=False, args=model_args) #self.model = NERModel("albert", "albert-base-v2", use_cuda=False, args=model_args) #self.model = NERModel("electra", 'google/electra-small-generator', use_cuda=False, args=model_args) 
#self.model = NERModel("layoutlm", 'microsoft/layoutlm-base-uncased', use_cuda=False, args=model_args) #self.model = NERModel("distilbert", "distilbert-base-cased-distilled-squad", use_cuda=False, args=model_args) #model_type, english_model_name = "longformer", "allenai/longformer-base-4096" #model_type, english_model_name = "mpnet", "microsoft/mpnet-base" #model_type, english_model_name = "electra", "google/electra-small-discriminator" #model_type, english_model_name = "squeezebert", "squeezebert/squeezebert-uncased" #model_type, english_model_name = "albert", "albert-base-v2" #model_type, english_model_name = "xlmroberta", "xlm-roberta-base" model_type, english_model_name = "roberta", "distilroberta-base" #model_type, english_model_name = "bert", "bert-base-uncased" #model_type, english_model_name = "distilbert", "distilbert-base-uncased" if input_dir: # Use a previously trained model (on NER or LM tasks) self.model = NERModel(model_type, input_dir, use_cuda=use_cuda, args=model_args) else: # Use a pre-trained (English) model self.model = NERModel(model_type, english_model_name, use_cuda=use_cuda, args=model_args) # force_download=True """ if use_saved_model: if path: # Use a model located in a given folder self.model = NERModel("longformer", path, use_cuda=False, args=model_args) else: # Use a previously trained model (on NER or LM tasks) self.model = NERModel("longformer", output_dir, use_cuda=False, args=model_args) else: # Use a pre-trained (English) model self.model = NERModel("longformer", "allenai/longformer-base-4096", use_cuda=False, args=model_args) """ """ if use_saved_model: self.model = NERModel("bert", output_dir, use_cuda=False, args=model_args) else: self.model = NERModel("bert", pretrained_model_name, use_cuda=False, args=model_args) # args={"overwrite_output_dir": True, "reprocess_input_data": True} """ self.model_info = { 'model_type': model_type, 'english_model_name': english_model_name, 'input_dir': input_dir } def train(self): # # Train the 
model if self.dataset: global_step, training_details = self.model.train_model( self.dataset['train'], eval_data=self.dataset['val']) else: raise Exception("dataset is None") print("global_step:", global_step) print("training_details:", training_details) #training_details: {'global_step': [4], 'precision': [0.6987951807228916], 'recall': [0.402777777777777 #8], 'f1_score': [0.5110132158590308], 'train_loss': [0.41127926111221313], 'eval_loss': [0.63655577600 #00229]} # it contains f1_score only for the validation dataset return training_details def eval(self): # # Evaluate the model if self.dataset: res_train, model_outputs, predictions = self.model.eval_model( self.dataset['train']) res_val, model_outputs, predictions = self.model.eval_model( self.dataset['val']) print("Evaluation") #print("On train data:", result) #{'eval_loss': 0.8920, 'precision': 0.0833, 'recall': 0.027, 'f1_score': 0.0416} print("train loss: {:.3f}; prec/recall/f1: {:.3f}/{:.3f}/{:.3f}". format(res_train['eval_loss'], res_train['precision'], res_train['recall'], res_train['f1_score'])) #print("On validation data:", result) print("valid loss: {:.3f}; prec/recall/f1: {:.3f}/{:.3f}/{:.3f}". format(res_val['eval_loss'], res_val['precision'], res_val['recall'], res_val['f1_score'])) print( "Summary. Loss (train/val): {:.3f}/{:.3f}, F1: {:.3f}/{:.3f}". 
format(res_train['eval_loss'], res_val['eval_loss'], res_train['f1_score'], res_val['f1_score'])) else: raise Exception("dataset is None") print("model_info:", self.model_info) return res_val def test(self): sentence_id = self.dataset['test']['sentence_id'] words = self.dataset['test']['words'] labels = self.dataset['test']['labels'] prev_id = 0 s_words = [] s_labels = [] samples = [] for i in range(len(sentence_id)): s_id = sentence_id[i] word = words[i] label = labels[i] if s_id != prev_id: sentence = " ".join(s_words) #print("sentence id={}: {}".format(prev_id, sentence)) samples.append({ 'text': sentence, 'tokens': s_words, 'labels': s_labels }) #print("s_labels: {}".format(s_labels)) s_words = [] s_labels = [] prev_id = s_id s_words.append(words[i]) s_labels.append(labels[i]) #print("i={}, word={}, label={}".format(s_id, word, label)) sentence = " ".join(s_words) #print("sentence id={}: {}".format(prev_id, sentence)) samples.append({ 'text': sentence, 'tokens': s_words, 'labels': s_labels }) texts = [sample['text'] for sample in samples] predictions, raw_outputs = self.model.predict(texts) #print(predictions) acc_list = [] success_list = [] # More detailed preditctions for i, (preds, raw_outs) in enumerate(zip(predictions, raw_outputs)): print() print("text: ", texts[i]) #print("\npreds: ", preds) pred_labels = [list(t.values())[0] for t in preds] print("pred_labels: ", pred_labels) true_labels = samples[i]['labels'] print("true_labels: ", true_labels) #print("raw_outs: ", raw_outs) if len(true_labels) != len(pred_labels): raise Exception("len(true_labels) != len(pred_labels)") comp = [ true_labels[i] == pred_labels[i] for i in range(len(pred_labels)) ] acc1sentence = np.mean(comp) print("acc={:.3f}".format(acc1sentence)) acc_list.append(acc1sentence) success = 1 if acc1sentence == 1.0 else 0 success_list.append(success) avg_acc = np.mean(acc_list) avg_success = np.mean(success_list) return {'avg_acc': avg_acc, 'avg_success': avg_success} #for pred, out in 
zip(preds, outs): #print("pred:", pred) #print("out:", out) #key = list(pred.keys())[0] #new_out = out[key] #preds = list(softmax(np.mean(new_out, axis=0))) #print(key, pred[key], preds[np.argmax(preds)], preds) def simple_test(self): # Predictions on arbitary text strings sentences = ["Some arbitary sentence", "Simple Transformers sentence"] predictions, raw_outputs = self.model.predict(sentences) print(predictions) # More detailed preditctions for n, (preds, outs) in enumerate(zip(predictions, raw_outputs)): print("\n___________________________") print("Sentence: ", sentences[n]) for pred, out in zip(preds, outs): key = list(pred.keys())[0] new_out = out[key] preds = list(softmax(np.mean(new_out, axis=0))) print(key, pred[key], preds[np.argmax(preds)], preds) def predict(self, sentences): predictions, raw_outputs = self.model.predict(sentences) #tokenized_sentences = [self.tokenizer.tokenize(sentence) for sentence in sentences] #predictions, raw_outputs = self.model.predict(tokenized_sentences, split_on_space=False) return predictions def raw_predict(self, sentences): predictions, raw_outputs = self.model.predict(sentences) #print("raw_outputs:", raw_outputs) #print(self.model.args.labels_list) labels_list = self.model.args.labels_list confidences = [ calc_confidence(raw_output, labels_list) for raw_output in raw_outputs ] #print("confidence:", confidence) return { 'predictions': predictions, 'raw_outputs': raw_outputs, 'confidences': confidences } """
# Training configuration for a three-label (B/I/O) tagger.
model_args = NERArgs(
    labels_list = ['B','I', 'O'],
    manual_seed = 2021,
    num_train_epochs = 4,
    max_seq_length = 512,
    use_early_stopping = True,
    overwrite_output_dir = True,
    train_batch_size = 8
)

# NOTE(review): `weight` presumably holds one loss weight per entry of
# labels_list, in the same order -- confirm against the library docs.
model = NERModel(
    "roberta", "roberta-base",
    args=model_args,
    weight = [2.52853895, 12.18978944, 0.39643544],
    use_cuda=True,
    cuda_device=-1
)

# `train_df` and `eval_df` are built earlier in the file (not shown here).
model.train_model(train_df)
result, model_outputs, wrong_predictions = model.eval_model( eval_df )

terms = []
preds = []
# `lines` is a collection of line groups; each line is tokenised with `nlp1`
# and passed as a pre-split token list (hence split_on_space=False).
for lines_ in lines:
    sentences = [[token.text for token in nlp1(line.strip())] for line in lines_]
    predictions, raw_outputs = model.predict(sentences, split_on_space=False)
    preds.extend(predictions)

# Persist all collected predictions.
with open('ann_weighted_roberta.pkl', 'wb') as f:
    pkl.dump(preds, f)
    [1, "perform", "O"],
    [1, "NER", "B-MISC"],
]
eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"])

# Create a NERModel
model = NERModel("bert", "bert-base-cased", args={"overwrite_output_dir": True, "reprocess_input_data": True})

# Training and evaluation are disabled here, so the predictions below come
# from the model exactly as it was loaded.
# # Train the model
# model.train_model(train_df)

# # Evaluate the model
# result, model_outputs, predictions = model.eval_model(eval_df)

# Predictions on arbitrary text strings
sentences = ["Some arbitary sentence", "Simple Transformers sentence"]
predictions, raw_outputs = model.predict(sentences)

print(predictions)

# More detailed predictions: one line per token with its probabilities.
for n, (preds, outs) in enumerate(zip(predictions, raw_outputs)):
    print("\n___________________________")
    print("Sentence: ", sentences[n])
    for pred, out in zip(preds, outs):
        # Each prediction is a one-entry mapping; its key also indexes `out`.
        key = list(pred.keys())[0]
        new_out = out[key]
        # NOTE: this rebinds `preds`; the zip above already holds its own
        # iterator, so the loop is unaffected.
        preds = list(softmax(np.mean(new_out, axis=0)))
        print(key, pred[key], preds[np.argmax(preds)], preds)
    [0, "text", "O"],
    [0, "classification", "B-MISC"],
    [1, "Simple", "B-MISC"],
    [1, "Transformers", "I-MISC"],
    [1, "then", "O"],
    [1, "expanded", "O"],
    [1, "to", "O"],
    [1, "perform", "O"],
    [1, "NER", "B-MISC"],
]
eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"])

# Create a NERModel; the args allow re-running without clearing old outputs.
model = NERModel("bert", "bert-base-cased", args={ "overwrite_output_dir": True, "reprocess_input_data": True })

# Train the model (`train_df` is built earlier in the file, not shown here).
model.train_model(train_df)

# Evaluate the model
result, model_outputs, predictions = model.eval_model(eval_df)

# Predictions on arbitrary text strings
predictions, raw_outputs = model.predict(["Some arbitary sentence"])

print(predictions)
def main(sentence):
    """Predicts NER labels

    Loads the model stored in ``outputs/`` (CPU only), tags ``sentence`` and
    prints the predictions.

    Args:
        sentence: The text to tag.
    """
    model = NERModel('bert', 'outputs/', use_cuda=False)

    # The predict call must run so that `predictions` is bound before it is
    # printed.
    predictions, raw_outputs = model.predict([sentence])
    print(predictions)