class TransformerModel:
    """
    This class provides the Machine Learning model and classifies tenders
    based on previous training data.

    The wrapped simpletransformers ``ClassificationModel`` is created on first
    use by ``load_model``. ``args`` and ``logger`` are not defined in this
    class and are expected to exist at module level.
    """

    def __init__(self):
        # Set by load_model() / create_new_model().
        self.model = None

    def load_model(self):
        """
        Load the model from ``./outputs/`` unless one is already set.

        If loading raises, the error is logged and a new model is created
        through ``create_new_model`` instead.
        """
        if self.model is not None:
            return
        # Local import: only needed when a model actually has to be built.
        from simpletransformers.classification import ClassificationModel
        try:
            self.model = ClassificationModel('bert',
                                             './outputs/',
                                             use_cuda=False,
                                             args=args)
        except Exception as ex:
            logger.error(
                f"could not load model from /outputs due to {str(ex)}, creating new model"
            )
            self.create_new_model()

    def __convert_to_input(self, tenders):
        """Return the "DE" title of every tender as a list."""
        return [tender.get_title("DE") for tender in tenders]

    def classify(self, tenders):
        """Return the tenders whose predicted label equals 1."""
        self.load_model()
        titles = self.__convert_to_input(tenders)
        predictions, _ = self.model.predict(titles)
        return [
            tender for tender, label in zip(tenders, predictions)
            if label == 1
        ]

    def train(self, labelled_tenders):
        """
        Train on ``(tender, label)`` pairs and log a confusion matrix.

        10% of the pairs are held out (``random_state=42``); the model is
        trained on the rest and the hold-out predictions are logged as
        tn/fp/fn/tp.
        """
        self.load_model()
        # Materialise once so a one-shot iterable is not consumed twice.
        pairs = list(labelled_tenders)
        titles = self.__convert_to_input([tender for tender, _ in pairs])
        labels = [label for _, label in pairs]
        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            titles, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))
        self.model.train_model(data_input)

        labels_pred, _ = self.model.predict(tenders_test)
        # labels=[0, 1] keeps the matrix 2x2 even when the hold-out split only
        # contains one class; without it the four-way unpacking fails.
        tn, fp, fn, tp = confusion_matrix(labels_test,
                                          labels_pred,
                                          labels=[0, 1]).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

    def create_new_model(self):
        """Replace ``self.model`` with a fresh 'bert-base-german-cased' model."""
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=False,
                                         args=args)
def bert_predictions(tweet: pd.DataFrame, model: ClassificationModel):
    """
    Bert Inference for prediction.

    :param tweet: dataframe with tweets
    :param model: Bert Model
    :return: ``collections.Counter`` mapping each predicted label to the
        number of times it was predicted
    """
    tweet = tweet.values.tolist()
    try:
        predictions, _ = model.predict(tweet)
    except Exception as batch_error:
        # The batch call failed: predict item by item so the offending input
        # is reported and the remaining items are still counted.
        print(f"STOPP: batch prediction failed: {batch_error}")
        predictions = []
        for element in tweet:
            try:
                single_prediction, _ = model.predict([element])
            except Exception as item_error:
                print(f"STOPP: could not predict {element!r}: {item_error}")
                continue
            predictions.extend(single_prediction)
    auswertung = collections.Counter(predictions)
    gc.collect()
    return auswertung
def cross_pseudo_labeling(train, pseudo_test, test, params, n_folds,
                          model_name, model_type, lb_hack):
    """
    Cross-validated training with pseudo-labelled rows added to every fold.

    For each stratified fold a new ``ClassificationModel`` is trained on the
    fold's training rows plus ``pseudo_test``, evaluated on the validation
    rows, and used to predict ``test["description"]``. Test logits are
    averaged over the folds.

    :param train: frame with "text" and "label" columns
    :param pseudo_test: rows appended to the training part of every fold
    :param test: frame with a "description" column
    :param params: args passed to ``ClassificationModel``
    :param n_folds: number of stratified folds
    :param model_name: model name/path passed to ``ClassificationModel``
    :param model_type: model type passed to ``ClassificationModel``
    :param lb_hack: forwarded to ``hack`` (defined outside this function)
    :return: ``(test_pred, f1_score, oof_pred)``; both frames hold the label
        in column 0 followed by the raw outputs
    """
    # The model is built with num_labels=4, so raw outputs have 4 columns
    # regardless of how many folds are used.
    num_labels = 4
    splits = list(
        StratifiedKFold(n_splits=n_folds, shuffle=True,
                        random_state=1234).split(train["text"],
                                                 train["label"]))

    y_pred = np.zeros((test.shape[0], num_labels))
    oof = np.zeros(train.shape[0])
    oof_raw = np.zeros((train.shape[0], num_labels))
    # Per-class weight: total rows divided by the class count.
    weight = len(train) / train["label"].value_counts().sort_index().values

    f1_score = 0
    for train_idx, valid_idx in splits:
        X_train = pd.concat([train.iloc[train_idx], pseudo_test])
        X_valid = train.iloc[valid_idx]
        model = ClassificationModel(model_type=model_type,
                                    model_name=model_name,
                                    num_labels=num_labels,
                                    args=params,
                                    use_cuda=True,
                                    weight=weight.tolist())
        model.train_model(X_train)

        result, model_outputs, wrong_predictions = model.eval_model(
            X_valid, f1=metric_f1)
        print(result)
        f1_score += result["f1"] / n_folds

        _, raw_outputs = model.predict(test["description"].values)
        # Accumulate so the result is the mean over all folds; plain
        # assignment would keep only the last fold's contribution.
        y_pred += raw_outputs / n_folds

        # Passed as a plain array; the original notes that a mysterious bug
        # occurs otherwise.
        oof_pred, oof_outputs = model.predict(X_valid["text"].values)
        oof[valid_idx] = oof_pred
        oof_raw[valid_idx, :] = oof_outputs

    print(f"mean f1_score: {f1_score}")

    raw_pred = y_pred.copy()
    y_pred = hack(y_pred, lb_hack)

    test_pred = pd.DataFrame(
        np.concatenate([y_pred.reshape(-1, 1), raw_pred], 1))
    oof_pred = pd.DataFrame(np.concatenate([oof.reshape(-1, 1), oof_raw], 1))
    return test_pred, f1_score, oof_pred
class TransformerModel(TenderClassClassifier):
    """
    This class provides the Machine Learning model and classifies tenders
    based on previous training data.

    ``args``, ``cuda_available`` and ``logger`` are not defined in this class
    and are expected to exist at module level.
    """

    def __init__(self):
        # Set by load() or create_new_model(); classify()/train() need it.
        self.model = None

    def load(self, name):
        """
        Load the model stored in ``./outputs/``.

        NOTE(review): ``name`` is currently ignored; the path is fixed.
        """
        self.model = ClassificationModel('bert',
                                         './outputs/',
                                         use_cuda=cuda_available,
                                         args=args)

    def save(self, name):
        """No-op; nothing is written by this method."""
        pass

    def __convert_to_input(self, tenders):
        """Return the "DE" title of every tender as a list."""
        return [tender.get_title("DE") for tender in tenders]

    def classify(self, tenders):
        """Return the tenders whose predicted label equals 1."""
        titles = self.__convert_to_input(tenders)
        predictions, _ = self.model.predict(titles)
        return [
            tender for tender, label in zip(tenders, predictions)
            if label == 1
        ]

    def train(self, labelled_tenders):
        """
        Train on ``(tender, label)`` pairs and log hold-out metrics.

        10% of the pairs are held out (``random_state=42``). The training
        duration in seconds is printed; the confusion matrix and accuracy on
        the hold-out set are logged.
        """
        # Materialise once so a one-shot iterable is not consumed twice.
        pairs = list(labelled_tenders)
        titles = self.__convert_to_input([tender for tender, _ in pairs])
        labels = [label for _, label in pairs]
        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            titles, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))

        start = time.time()
        self.model.train_model(data_input)
        end = time.time()
        print(end - start)

        labels_pred, _ = self.model.predict(tenders_test)
        # labels=[0, 1] keeps the matrix 2x2 even when the hold-out split only
        # contains one class; without it the four-way unpacking fails.
        tn, fp, fn, tp = confusion_matrix(labels_test,
                                          labels_pred,
                                          labels=[0, 1]).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")
        logger.info(
            f"Accuracy Score: {accuracy_score(labels_test, labels_pred)}")

    def create_new_model(self):
        """Replace ``self.model`` with a fresh 'bert-base-german-cased' model."""
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=cuda_available,
                                         args=args)
def model(train, test, params, n_folds, model_name, model_type, lb_hack,
          prediction=False):
    """
    Stratified k-fold training (or re-loading) with fold-averaged test logits.

    Each fold uses its own output directory, ``params["output_dir"]`` plus
    ``"_<fold number>"``. With ``prediction=True`` no training happens and the
    model is loaded from that directory instead of ``model_name``.

    :param train: frame with "text" and "label" columns
    :param test: frame with a "description" column
    :param params: base args; copied per fold so the caller's dict is untouched
    :param n_folds: number of stratified folds
    :param model_name: model name/path used when training
    :param model_type: model type passed to ``ClassificationModel``
    :param lb_hack: forwarded to ``hack`` (defined outside this function)
    :param prediction: load previously saved fold models instead of training
    :return: ``(test_pred, f1_score, oof_pred)``; both frames hold the label
        in column 0 followed by the raw outputs
    """
    # The model is built with num_labels=4, so raw outputs have 4 columns
    # regardless of how many folds are used.
    num_labels = 4
    kfold = StratifiedKFold(n_splits=n_folds)

    y_pred = np.zeros((test.shape[0], num_labels))
    oof = np.zeros(train.shape[0])
    oof_raw = np.zeros((train.shape[0], num_labels))
    # Per-class weight: total rows divided by the class count.
    weight = len(train) / train["label"].value_counts().sort_index().values

    f1_score = 0
    for fold, (train_idx, valid_idx) in enumerate(
            kfold.split(train["text"], train['label'])):
        args = params.copy()
        args["output_dir"] = params["output_dir"] + "_" + str(fold + 1)

        X_train = train.iloc[train_idx]
        X_valid = train.iloc[valid_idx]

        fold_model_name = args["output_dir"] if prediction else model_name
        clf = ClassificationModel(model_type=model_type,
                                  model_name=fold_model_name,
                                  num_labels=num_labels,
                                  args=args,
                                  use_cuda=True,
                                  weight=weight.tolist())
        if not prediction:
            clf.train_model(X_train)

        result, model_outputs, wrong_predictions = clf.eval_model(
            X_valid, f1=metric_f1)
        print(result)
        f1_score += result["f1"] / n_folds

        # Plain arrays are passed to predict(); the original notes that a
        # mysterious bug occurs otherwise.
        _, raw_outputs = clf.predict(test['description'].values)
        y_pred += raw_outputs / n_folds

        oof_pred, oof_outputs = clf.predict(X_valid["text"].values)
        oof[valid_idx] = oof_pred
        oof_raw[valid_idx, :] = oof_outputs

    print(f"mean f1_score: {f1_score}")

    raw_pred = y_pred.copy()
    y_pred = hack(y_pred, lb_hack)

    test_pred = pd.DataFrame(
        np.concatenate([y_pred.reshape(-1, 1), raw_pred], 1))
    oof_pred = pd.DataFrame(np.concatenate([oof.reshape(-1, 1), oof_raw], 1))
    return test_pred, f1_score, oof_pred
def predict_export(data):
    """
    Predict arousal, valence and topic for one partition and export a CSV.

    The partition is selected with ``args.predict_partition`` (``args`` and
    ``output_path`` are defined outside this function). One saved model per
    class is loaded from ``experiments/best_model/<class>False``; arousal and
    valence use 3 labels, topic uses 10. The CSV is written to
    ``output_path + args.predict_partition + '.csv'``.
    """
    partition = data[args.predict_partition]
    texts = partition['text']

    predictions = {}
    for class_name in ['arousal', 'valence', 'topic']:
        class_no = 3 if class_name in ['arousal', 'valence'] else 10
        trained_model_path = os.path.join('experiments/best_model/',
                                          class_name + str(False))
        classifier = ClassificationModel(args.model_type,
                                         trained_model_path,
                                         num_labels=class_no)
        predicted_labels, _ = classifier.predict(texts)
        predictions['prediction_' + class_name] = predicted_labels

    predictions['id'] = partition['id']
    predictions['segment_id'] = partition['segment_id']

    header_names = [
        'id', 'segment_id', 'prediction_arousal', 'prediction_valence',
        'prediction_topic'
    ]
    frame = pd.DataFrame.from_dict(predictions)
    target = output_path + args.predict_partition + '.csv'
    frame[header_names].to_csv(target, header=header_names, index=False)
class EmpathyClassifier():
    """
    RoBERTa single-output model loaded from ``models/empathy/`` next to this
    file. If the directory is missing, the archive ``models/empathy.tar.gz``
    is fetched (when absent) and unpacked first.
    """

    def __init__(self,
                 use_cuda=torch.cuda.is_available(),
                 cuda_device=0,
                 batch_size=16):
        self.model_type = "empathy"
        # train_args is a shared dict defined outside this class.
        train_args["eval_batch_size"] = batch_size

        here = os.path.dirname(__file__)
        model_path = os.path.join(here, "models/empathy/")
        model_file = os.path.join(here, "models/empathy.tar.gz")

        if not os.path.isdir(model_path):
            model = f'{self.model_type}_model'
            if not os.path.isfile(model_file):
                logger.info(
                    f'Model {self.model_type} does not exist at {model_path}. Attempting to download it.'
                )
                fetch_pretrained_model(model, model_file)
            unzip_simple_transformer_model(model, model_path, model_file)

        # Create a ClassificationModel
        self.model = ClassificationModel('roberta',
                                         model_path,
                                         num_labels=1,
                                         use_cuda=use_cuda,
                                         cuda_device=cuda_device,
                                         args=train_args)

    def predict(self, text):
        """
        Return the model's raw outputs for ``text``.

        When the first element of ``text`` is a string, ``text`` is wrapped
        in a list before being passed to the model.
        """
        batch = [text] if isinstance(text[0], str) else text
        _, raw_outputs = self.model.predict(batch)
        return raw_outputs
def eval_stance_clf(model_path, src_path, gen_path, **kwargs):
    """
    Print the distribution of predicted stance labels for (source, generated)
    sentence pairs.

    Lines of ``src_path`` and ``gen_path`` are stripped and paired in order;
    each pair is classified by the 4-label RoBERTa model at ``model_path``.
    The sorted ``(label, count)`` list is printed. ``kwargs`` is accepted but
    not used.
    """
    # Context managers close both files even if reading fails.
    with open(src_path, 'r') as src_file:
        src = [line.strip() for line in src_file]
    with open(gen_path, 'r') as gen_file:
        gen = [line.strip() for line in gen_file]

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        "fp16": False
    }
    model = ClassificationModel('roberta',
                                model_path,
                                num_labels=4,
                                use_cuda=True,
                                cuda_device=0,
                                args=train_args)

    # zip() stops at the shorter file, so unmatched trailing lines are ignored.
    pairs = [[source, generated] for source, generated in zip(src, gen)]
    predictions, _ = model.predict(pairs)

    th = sorted(Counter(predictions).items(), key=lambda item: item[0])
    print(th)
def generate_prob_matrix(arguments):
    """
    Build a square matrix of pairwise relation probabilities.

    For every ``rel_from`` (starting at index 1) and every ``rel_to`` in that
    argument's ``compare_list``, the BERT model scores the sentence pair
    ``[rel_to.sentence, rel_from.sentence]``. The softmax probability at class
    index 1 is stored in ``prob_matrix[rel_to][rel_from]``; all other cells
    stay 0. Timing and call counts are accumulated on ``Stats``.

    NOTE(review): index 0 is never used as ``rel_from`` — confirm intended.
    """
    my_args = {
        "max_seq_length": 256,
        "train_batch_size": 16,
        "eval_batch_size": 16,
        "do_lower_case": True,
        "manual_seed": 17
    }
    classifier = ClassificationModel('bert',
                                     "relation_processing/model/bert",
                                     use_cuda=False,
                                     args=my_args)

    num_arguments = len(arguments)
    prob_matrix = np.zeros((num_arguments, num_arguments))

    for rel_from in range(1, num_arguments):
        for rel_to in arguments[rel_from].compare_list:
            if rel_from != rel_to:
                logging.info("calculating: " + str(rel_from) + "-->" +
                             str(rel_to))
                started = datetime.now()
                sentence_pair = [
                    arguments[rel_to].sentence, arguments[rel_from].sentence
                ]
                _, raw_outputs = classifier.predict([sentence_pair])
                rel = softmax(raw_outputs, axis=1)
                Stats.h_bert_time += datetime.now() - started
                Stats.h_bert += 1
                logging.debug(rel)
                prob_matrix[rel_to][rel_from] = rel[0][1]

    return prob_matrix
def main(path, valid_in_cat_path, valid_out_of_cat_path):
    """
    Train roberta-base on fold 1, evaluate it, and label the Steam data.

    The fold is loaded with ``load_fold_data(path, 1)``. Accuracy and F1 on
    the fold's test part are printed, then every ``steam_df["sentence"]`` is
    classified and the frame is written to
    ``../../reports/steam-prediction.csv``.

    ``valid_in_cat_path`` and ``valid_out_of_cat_path`` are accepted but not
    used here.
    """
    steam_df = load_steam_data()

    i = 1
    print("starting training, using fold " + str(i))
    train, test = load_fold_data(path, i)

    # Train the model using roberta model
    args_dict = {
        'output_dir': '../../models/roberta-base-bs8-e6-fold' + str(i),
        'use_cached_eval_features': False,
        'reprocess_input_data': True,
        'train_batch_size': 8,
        'num_train_epochs': 6,
        'fp16': False,
        'overwrite_output_dir': True
    }
    model = ClassificationModel('roberta',
                                'roberta-base',
                                num_labels=2,
                                args=args_dict)
    model.train_model(train)
    print("done training model fold " + str(i))

    result, model_outputs, wrong_predictions = model.eval_model(
        test, acc=accuracy_score, f1=f1_score)
    acc = result['acc']
    f1 = result['f1']
    print(f"acc: {acc} , f1: {f1}")

    # Make predictions with the model
    save_path = '../../reports/steam-prediction.csv'
    print("predicting...")
    predictions, raw_outputs = model.predict(steam_df["sentence"].tolist())
    steam_df['prediction'] = predictions
    steam_df.to_csv(save_path, index=False)
    # Reported only after the file has actually been written.
    print(f"predicting finished - saved to {save_path}")
class XLNetDeBERTaClassification:
    """
    Wraps an XLNet or DeBERTa ``ClassificationModel`` plus its tokenizer.

    ``model`` must be ``'xlnet'`` or ``'deberta'`` (any other value raises
    ``KeyError``); the weights are read from the matching sub-directory of
    ``model_dir``.
    """

    def __init__(self, model, model_dir="D:/Language Models/"):
        sub_directories = {
            'xlnet': "XLNET-LARGE/",
            'deberta': "DEBERTA-LARGE/",
        }
        model_pn = f'{model_dir}{sub_directories[model]}'

        self.model = ClassificationModel(model, model_pn, use_cuda=False)
        if model == 'xlnet':
            self.tokenizer = XLNetTokenizer.from_pretrained(model_pn)
        else:
            self.tokenizer = DebertaTokenizer.from_pretrained(model_pn)
        self._text = None

    @property
    def text(self):
        """The text to classify; ``None`` until assigned."""
        return self._text

    @text.setter
    def text(self, raw_text):
        # Only plain strings are accepted.
        if isinstance(raw_text, str):
            self._text = raw_text
        else:
            raise TypeError("Error: Invalid Input Type")

    def check_probs(self):
        """
        Classify the stored text.

        :return: ``(probabilities, token_count)`` where the probabilities are
            the softmax outputs formatted as strings with five decimals and
            the count is the number of tokens without special tokens
        """
        _, logits = self.model.predict([self._text])
        formatted = ["{:.5f}".format(float(p)) for p in softmax(logits[0])]
        token_ids = self.tokenizer.encode(self._text,
                                          add_special_tokens=False)
        return formatted, len(token_ids)
def main():
    """
    Fine-tune distilroberta-base on the sentiment data and report metrics.

    Reads ``sentiment_train.csv`` / ``sentiment_test.csv``, holds out 10% of
    the training rows for validation, trains for one epoch, prints validation
    and test metrics, and writes the test frame with a ``prediction`` column
    to ``q3_ans.csv``.
    """
    # load train & test data
    df_train = pd.read_csv("sentiment_train.csv")
    df_test = pd.read_csv("sentiment_test.csv")

    # random seed (named `seed` so the stdlib module name is not shadowed)
    seed = 42

    # Train test split
    X_train, X_val, y_train, y_val = train_test_split(df_train['Sentence'],
                                                      df_train['Polarity'],
                                                      test_size=0.10,
                                                      random_state=seed)
    train_dataset = pd.concat([X_train, y_train], axis=1)
    val_dataset = pd.concat([X_val, y_val], axis=1)

    # Load a pre-trained model, and train it with our data | See all models
    # available: https://huggingface.co/transformers/pretrained_models.html
    args = {
        'reprocess_input_data': True,
        'max_seq_length': 300,
        'num_train_epochs': 1,
        'fp16': False,
        'train_batch_size': 4,
        'overwrite_output_dir': True
    }
    my_model = ClassificationModel('roberta',
                                   'distilroberta-base',
                                   num_labels=2,
                                   use_cuda=True,
                                   cuda_device=0,
                                   args=args)

    # Train the model
    my_model.train_model(train_dataset)

    # Evaluate the model (the extra metric is stored under the key "acc" but
    # is computed with f1_score)
    result, model_outputs, wrong_predictions = my_model.eval_model(
        val_dataset, acc=f1_score)
    pred_val = np.argmax(model_outputs, axis=1).tolist()

    print("Results on evaluation:")
    print("----------------------")
    print("F1 Score = {:.6f}\n".format(
        f1_score(y_val, pred_val, average='micro') * 100))
    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))

    # get results on test set; predict() is documented to take a list of
    # texts, so the column is converted instead of passing the Series
    pred_test, _ = my_model.predict(df_test['Sentence'].tolist())

    # print f1 score
    print(f1_score(df_test.Polarity, pred_test))
    # print accuracy score
    print(accuracy_score(df_test.Polarity, pred_test))

    # save input/ground truth/prediction as one csv
    df_test['prediction'] = pred_test
    df_test.to_csv('q3_ans.csv', index=False)
def predict_df(
    data_pkl,
    model_type,
    model_name,
):
    """
    Apply a fine-tuned single-output model to a pickled dataframe.

    The texts in the 'text' column are scored and the predictions are stored
    in a new column named ``pred_<stem of model_name>``. The dataframe is
    then pickled back to ``data_pkl``, overwriting the input file.

    Parameters
    ----------
    data_pkl: str
        path to pickled df with the data, which must contain the column 'text'
    model_type: str
        type of the pre-trained model, e.g. bert, roberta, electra
    model_name: str
        path to a directory containing model file

    Returns
    -------
    None
    """
    frame = pd.read_pickle(data_pkl)

    # Fall back to the CPU with a short, message-only warning.
    cuda_available = torch.cuda.is_available()
    if not cuda_available:
        # Note: this replaces the process-wide warning formatter.
        warnings.formatwarning = lambda msg, *args, **kwargs: str(msg) + '\n'
        warnings.warn('CUDA device not available; running on a CPU!')

    classifier = ClassificationModel(
        model_type,
        model_name,
        num_labels=1,
        use_cuda=cuda_available,
    )

    print("Generating predictions. This might take a while...")
    predictions, _ = classifier.predict(frame['text'].to_list())
    frame[f"pred_{Path(model_name).stem}"] = predictions

    frame.to_pickle(data_pkl)
    print(
        f"A column with predictions was added.\nThe updated df is saved: {data_pkl}"
    )
def main():
    """
    Label the relation test set with a saved BERT checkpoint.

    Reads 6400 (sentence, relation) line pairs from ``train.txt`` and 1600
    sentences from ``test.txt``, predicts a relation for every test sentence
    with ``model/checkpoint-800-epoch-1`` and writes one relation name per
    line to ``result.txt``.
    """
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    train_txt_path = "train.txt"
    test_txt_path = "test.txt"
    result_txt_path = "result.txt"
    relations = [
        "Cause-Effect", "Component-Whole", "Entity-Destination",
        "Product-Producer", "Entity-Origin", "Member-Collection",
        "Message-Topic", "Content-Container", "Instrument-Agency", "Other"
    ]

    # Each training example spans two lines: the quoted sentence, then the
    # relation name followed by "(". Files are opened with `with` so they are
    # closed even when parsing fails.
    train_data = []
    with open(train_txt_path) as file_train:
        for _ in range(6400):
            sentence = file_train.readline().split('"')[1].strip('"').strip(
                '.')
            label = relations.index(file_train.readline().split('(')[0])
            train_data.append([sentence, label])

    # Only needed by the training step, which is currently disabled.
    train_df = pd.DataFrame(train_data)
    train_df.columns = ["text", "labels"]

    # Training is disabled; the checkpoint below is loaded instead.
    # To retrain:
    #   model_args = ClassificationArgs(num_train_epochs=1)
    #   model = ClassificationModel('bert', 'bert-base-cased', num_labels=10,
    #                               args=model_args, use_cuda=False)
    #   model.train_model(train_df, output_dir='./model')
    model = ClassificationModel("bert",
                                "model/checkpoint-800-epoch-1",
                                use_cuda=False)

    with open(test_txt_path) as file_test:
        test_sentences = [
            file_test.readline().split('"')[1].strip('"').strip('.')
            for _ in range(1600)
        ]

    test_result, raw_result = model.predict(test_sentences)
    print(test_result)

    with open(result_txt_path, 'w+') as file_result:
        for i in range(1600):
            file_result.write(relations[test_result[i]] + '\n')
class Classifier:
    """
    Small wrapper that collects labelled samples, trains a
    ``ClassificationModel`` on them and maps predictions back to the
    original labels.
    """

    def __init__(self, model_type, model_name, use_cuda=True):
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)

        # The model itself is only created in train().
        self.model_type = model_type
        self.model_name = model_name
        self.use_cuda = use_cuda
        self.dat = {}  # label -> samples
        self.rerun = False

    def add(self, X, Y):
        """Store the samples ``X`` under label ``Y`` (replacing earlier ones)."""
        self.dat[Y] = X

    def train(self, split=0.7, num_epochs=10):
        """
        Train and evaluate a model on the collected samples.

        For every label the first ``split`` fraction of its samples is used
        for training and the remainder for evaluation.
        """
        self.le = preprocessing.LabelEncoder()
        label_names = list(self.dat.keys())
        print(label_names)
        self.le.fit(label_names)

        train_data = []
        eval_data = []
        for label, samples in self.dat.items():
            len_train = int(round(len(samples) * split))
            if not samples:
                continue
            # Encode the label once per class instead of once per sample.
            encoded = self.le.transform([label])[0]
            train_data.extend([sample, encoded]
                              for sample in samples[:len_train])
            eval_data.extend([sample, encoded]
                             for sample in samples[len_train:])
        print(train_data, eval_data)

        train_df = pd.DataFrame(train_data)
        eval_df = pd.DataFrame(eval_data)

        train_args = {
            'overwrite_output_dir': True,
            'num_train_epochs': num_epochs,
        }
        self.model = ClassificationModel(self.model_type,
                                         self.model_name,
                                         num_labels=len(self.dat),
                                         use_cuda=self.use_cuda,
                                         cuda_device=0,
                                         args=train_args)

        # Train the model
        self.model.train_model(train_df, eval_df=eval_df)

        # Evaluate the model
        self.model.eval_model(eval_df, acc=sklearn.metrics.accuracy_score)

    def predict(self, x):
        """Return the original labels predicted for the inputs ``x``."""
        predictions, _ = self.model.predict(x)
        return self.le.inverse_transform(predictions)
def predict(text):
    """
    Return the three most likely flairs for ``text`` with their confidences.

    The DistilBERT model is loaded from ``./models/DistilBERT/`` on every
    call. ``flairs`` (label index -> name) is defined outside this function.

    :return: ``(names, confidences)``, each a list of three entries ordered
        from most to least likely
    """
    model = ClassificationModel("distilbert",
                                './models/DistilBERT/',
                                use_cuda=False,
                                num_labels=11)
    prediction, raw_outputs = model.predict([text])

    # argsort is ascending, so the last three entries are the top three and
    # the very last one corresponds to prediction[0].
    top_3 = raw_outputs.argsort()[0][-3:]
    confidence = softmax(raw_outputs[0])
    index = confidence.argsort()[-3:]

    return [flairs[prediction[0]], flairs[top_3[1]], flairs[top_3[0]]], [
        confidence[index[-1]], confidence[index[-2]], confidence[index[-3]]
    ]
class PretrainedBert():
    """Five-label RoBERTa classifier with helpers to train, load and score."""

    @staticmethod
    def _model_args():
        """Return a fresh copy of the args shared by __init__ and load."""
        return {
            'max_seq_length': 128,
            'save_steps': 10000,
            'fp16': False,
            'logging_steps': 1,
            'train_batch_size': 16,
            'num_train_epochs': 1
        }

    def __init__(self):
        # CHANGE CUDA WHEN NEEDED
        # experiment w these args
        # change bert to other transformer
        # change pretrain to other pretrain
        self.model = ClassificationModel('roberta',
                                         'roberta-base',
                                         use_cuda=False,
                                         num_labels=5,
                                         args=self._model_args())

    def run(self, train_df):
        """Train the model on ``train_df``."""
        self.model.train_model(train_df)

    def eval(self, eval_df):
        """Return whatever the model's ``predict`` returns for ``eval_df``."""
        return self.model.predict(eval_df)

    def load(self, path):
        """Replace the model with the one stored at ``path``."""
        self.model = ClassificationModel('roberta',
                                         path,
                                         num_labels=5,
                                         use_cuda=False,
                                         args=self._model_args())

    def test_challenge_set(self, labels_pred, labels_act):
        """
        Compare predicted and actual labels position by position.

        :return: ``(mae, acc)`` — mean absolute error and the fraction of
            exact matches, both over ``len(labels_act)`` items
        """
        total = len(labels_act)
        abs_error_sum = 0.0
        exact_matches = 0.0
        for position in range(total):
            predicted = labels_pred[position]
            actual = labels_act[position]
            abs_error_sum += abs(predicted - actual)
            if predicted == actual:
                exact_matches += 1
        return abs_error_sum / total, exact_matches / total
def use_model(model_type, model_path):
    """
    Load a saved model and print its predictions for six sample hotel
    reviews, followed by the raw outputs.
    """
    comments = [
        "Good hotel, nothing great but is enough to have a decent experience and fun with the family",
        "Horrible experience, the staff was rude and the food terrible",
        "I truly loved it! Great staff and delicious food. 5/5",
        "nice boutique hotel stayed 5 nights, rooms nice clean place pretty, location good in central singapore",
        "Wonderful place perfectly located. Just opened a month ago. Very helpful staff, reasonable rates, breakfast included. The rooms are nice and completely clean.",
        "The pricing was fine, good location. Food could be better."
    ]
    classifier = ClassificationModel(model_type, model_path)
    labels, logits = classifier.predict(comments)
    print('Prediction:', labels)
    print('Raw Outputs:', logits)
def binatron(request):
    """
    Classify ``request.data["eligible_text"]`` as 'Eligible' or 'Others'.

    The BERT model is loaded from ``outputs`` on every call. A missing
    ``eligible_text`` field or a ``ValueError`` during prediction yields an
    HTTP 400 response.
    """
    try:
        inport = [request.data["eligible_text"]]
    except (KeyError, TypeError):
        # A malformed payload is a client error, not a server failure.
        return Response("Missing required field: eligible_text",
                        status.HTTP_400_BAD_REQUEST)

    try:
        model = ClassificationModel("bert", "outputs", use_cuda=False)
        predictions, raw_outputs = model.predict(inport)

        # Map the numeric label of the single input to its display name.
        df = pd.DataFrame(predictions, columns=['Criteria'])
        df = df.replace({1: 'Eligible', 0: 'Others'})
        label = df.to_dict()['Criteria'].get(0)
        return JsonResponse('Text Belongs to Class {}'.format(label),
                            safe=False)
    except ValueError as e:
        message = e.args[0] if e.args else str(e)
        return Response(message, status.HTTP_400_BAD_REQUEST)
def binary(train, eval):
    """
    Train roberta-large as a binary classifier and print accuracy and F1.

    :param train: path to the training CSV
    :param eval: path to the evaluation CSV; its ``text`` and ``labels``
        columns are used for scoring

    ``model_args`` is defined outside this function.
    """
    train_df = pd.read_csv(train, skip_blank_lines=True)
    eval_df = pd.read_csv(eval, skip_blank_lines=True)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                "roberta-large",
                                num_labels=2,
                                args=model_args)

    # Train the model. eval_df must be passed by keyword: the second
    # positional parameter of train_model is `multi_label`, not the
    # evaluation frame.
    model.train_model(train_df, eval_df=eval_df)

    # evaluate the model
    preds, outputs = model.predict(eval_df.text.tolist())
    targets = eval_df.labels.tolist()
    print("accuracy", accuracy_score(targets, preds))
    print("f1 score", f1_score(targets, preds))
def train_model(model_type, model_name):
    """
    Train a regression model on the review CSVs and print a sample prediction.

    Training data comes from ``data/reviews/train.csv`` and evaluation data
    from ``data/reviews/test.csv`` (both header-less, read as text/labels).
    The elapsed time is printed after training.
    """
    print('Starting run:', model_type, model_name)

    column_names = ["text", "labels"]
    train_df = pd.read_csv("data/reviews/train.csv", header=None)
    train_df.columns = column_names
    eval_df = pd.read_csv("data/reviews/test.csv", header=None)
    eval_df.columns = column_names

    started = time.time()
    train_args = {
        'output_dir': f'model-outputs/reviews/{model_name}-outputs',
        'max_seq_length': 256,
        'num_train_epochs': 5,
        'train_batch_size': 16,
        'eval_batch_size': 32,
        'learning_rate': 5e-5,
        'evaluate_during_training': True,
        'evaluate_during_training_steps': 50000,
        'save_model_every_epoch': False,
        'overwrite_output_dir': True,
        'no_cache': True,
        'use_early_stopping': True,
        'early_stopping_patience': 3,
        'manual_seed': 4,
        'regression': True,
        'best_model_dir': f'outputs/{model_type}/best_model'
    }
    regressor = ClassificationModel(model_type,
                                    model_name,
                                    num_labels=1,
                                    args=train_args)
    regressor.train_model(train_df, eval_df=eval_df)
    print('Run finished')

    elapsed = time.time() - started
    print('Time:', elapsed)
    print('--------------------')

    sample = "Good hotel, nothing great but is enough to have a decent experience and fun with the family"
    predictions, raw_outputs = regressor.predict([sample])
    print('Prediction:', predictions)
    print('Raw Outputs:', raw_outputs)
def model_predict(text):
    '''
    Takes in an array of text and returns predicted probability of risk.

    The model directory is read from ``../automation/curr_model.txt``. The
    text is preprocessed with ``text_processing`` and the model predicts on
    the preprocessed text.

    Input:
        text: pandas object supporting ``.apply``; presumably a Series of
              article texts, e.g. data['content'] — TODO confirm
    Output:
        pred (arr): returns label of 0 for low risk and 1 for high risk based on prob_risk
        prob_risk (list): softmax probability of class 1 for each article
        pred_risk (list): Risk score for each article (``predicted_risk`` of prob_risk)
    '''
    # read text file to get model path
    with open("../automation/curr_model.txt", "r") as model_txt:
        model_path = model_txt.read()

    # loading saved model, specifying same args as model init
    # model names: path to directory containing model files
    # model naming convention : roberta_YYYY_MM
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
    model = ClassificationModel(model_type='roberta',
                                model_name=model_path,
                                args=model_args,
                                use_cuda=False)

    # Preprocess text
    processed_text = text.apply(
        lambda x: text_processing(x,
                                  lower=False,
                                  remove_url=True,
                                  remove_punctuation=False,
                                  remove_stopwords=False,
                                  replace_entity=True,
                                  replace_hash=True,
                                  split_alphanumeric=False,
                                  lemmatize=False,
                                  stem=False))

    # predict on the preprocessed text (previously the preprocessing result
    # was computed and then discarded)
    pred, raw_outputs = model.predict(processed_text)

    # convert to probability of risk
    prob = softmax(raw_outputs, axis=1)
    prob_risk = [row[1] for row in prob]
    pred_risk = [predicted_risk(p) for p in prob_risk]
    return pred, prob_risk, pred_risk
class CamembertModel(BaseEstimator):
    """
    Eight-label CamemBERT classifier with an estimator-style interface.

    ``labels``, ``pattern`` and ``args`` are defined outside this class.
    Inputs are sequences whose element at index 1 is the text.
    """

    def __init__(self):
        encoder = preprocessing.LabelEncoder()
        encoder.fit(labels)
        self.le = encoder
        self.tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
        self.model = ClassificationModel('camembert',
                                         'camembert-base',
                                         num_labels=8)

    @staticmethod
    def _clean_texts(samples):
        """Take element 1 of every sample and replace pattern matches by a space."""
        return [pattern.sub(' ', sample[1]) for sample in samples]

    def fit(self, X_train, y_train):
        """Train on the cleaned texts and encoded labels; return the result of ``train_model``."""
        texts = self._clean_texts(X_train)
        encoded_labels = self.le.transform(y_train)
        df_train = pd.DataFrame({'text': texts, 'labels': encoded_labels})
        return self.model.train_model(df_train,
                                      tokenizer=self.tokenizer,
                                      output_dir="./model/camembert",
                                      show_running_loss=True,
                                      args=args)

    def predict_proba(self, X_test):
        """Return one softmax vector per test sample."""
        _, raw_outputs = self.model.predict(self._clean_texts(X_test))
        return [softmax(logits) for logits in raw_outputs]
def predict(self, test_file, model_type, model_dir, use_cuda=False):
    """
    Predict labels for ``test_file`` and write them to a new CSV.

    The frame returned by ``self.read_df(test_file)`` gets a ``predictions``
    column and is written, column by column, with ``write_csv`` to
    ``<test_file minus its last 4 characters>_predictions_<model_type>.csv``.
    """
    # NOTE(review): `out_dir` is not defined in this method; it must come
    # from an enclosing scope — confirm it is not meant to be `model_dir`.
    self.train_args["output_dir"] = out_dir

    test_df = self.read_df(test_file)
    classifier = ClassificationModel(
        model_type,
        f"{model_dir}/",
        num_labels=4,
        args=self.train_args,
        use_cuda=use_cuda,
    )
    labels, _ = classifier.predict(test_df['text'].to_list())
    test_df['predictions'] = labels

    column_names = list(test_df.columns.values)
    column_values = [test_df[column].to_list() for column in column_names]
    test_predictions = f"{test_file[:-4]}_predictions_{model_type}.csv"
    write_csv(test_predictions, column_names, column_values)
def all_train(train, test, params, model_name, model_type, lb_hack):
    """
    Train one model on all of ``train`` and predict ``test["description"]``.

    :return: ``(y_pred, pseudo_idx)`` where ``y_pred`` is ``hack`` applied to
        the raw outputs and ``pseudo_idx`` is a boolean Series that is True
        for rows whose largest raw output exceeds 3
    """
    class_counts = train["label"].value_counts().sort_index().values
    # Per-class weight: total rows divided by the class count.
    weight = len(train) / class_counts

    classifier = ClassificationModel(model_type=model_type,
                                     model_name=model_name,
                                     num_labels=4,
                                     args=params,
                                     use_cuda=True,
                                     weight=weight.tolist())
    classifier.train_model(train)

    _, raw_outputs = classifier.predict(test["description"])
    y_pred = hack(raw_outputs, lb_hack)

    largest_output = pd.DataFrame(raw_outputs).max(axis=1)
    pseudo_idx = (largest_output > 3)
    return y_pred, pseudo_idx
def run_model(model_name, model_type, params):
    """
    Train one model per language and predict every language's test set.

    For each of the first four entries of ``languages`` a model is trained on
    70% of that language's training data, evaluated on the remaining 30%, and
    then used to predict the test data of every language. Results are written
    to ``result/<model_name>_<train lang>_<test lang>.csv``.

    NOTE(review): ``model_name`` is passed as the first argument of
    ``ClassificationModel`` and ``model_type`` as the second — confirm the
    parameter names match what callers pass.
    """
    print('model_name:', model_name)
    print('model_type:', model_type)
    lang_dict = {
        'en': 'English',
        'de': 'German',
        'es': 'Spanish',
        'fr': 'French'
    }
    pprint.pprint(params)

    for seed in range(4):
        lang = languages[seed]
        # The model is written to this directory, so inference can be
        # separated from training there.
        params['output_dir'] = 'models/' + model_type + '-' + lang + '/'

        print('train', lang_dict[lang])
        train_data = pd.read_csv(_path(lang, 'train'))
        train_data['jobflag'] -= 1
        train, val = train_test_split(train_data,
                                      test_size=0.3,
                                      random_state=seed)
        # NOTE (original author): the split above is a mistake made while
        # reusing code from elsewhere.

        # The weights were computed with
        # sklearn.utils.class_weight.compute_class_weight.
        classifier = ClassificationModel(model_name,
                                         model_type,
                                         num_labels=4,
                                         weight=[1.17, 2.10, 0.532, 1.25],
                                         args=params,
                                         use_cuda=False)
        classifier.train_model(train)

        losses, model_outputs, _ = classifier.eval_model(val, f1=metric_f1)

        print('example :')
        # Printed as a sanity check because predictions tended to collapse
        # onto a single class.
        pprint.pprint(np.argmax(model_outputs, axis=1)[:10] + 1)

        print('loss:')
        pprint.pprint(losses)

        for lang2 in languages:
            print('predict', lang_dict[lang2])
            # NOTE(review): only the first two test rows are predicted.
            test = pd.read_csv(_path(lang2, 'test'))[:2]
            y_pred, _ = classifier.predict(test['description'])
            result = pd.DataFrame({'jobflag': y_pred + 1})
            result_file = 'result/' + model_name + '_' + lang + '_' + lang2 + '.csv'
            result.to_csv(result_file, index=False, header=False)
def main(argv):
    """
    Train and evaluate a classifier on a MolNet dataset selected by FLAGS.

    The model is trained on the train split with the validation split as
    ``eval_df``, evaluated twice on the test split (accuracy, then average
    precision) and finally asked to classify one example SMILES string.
    ``argv`` is not used.
    """
    wandb.login()

    tasks, (train_df, valid_df, test_df), transformers = load_molnet_dataset(
        FLAGS.molnet_dataset, tasks_wanted=None)

    logging.basicConfig(level=logging.INFO)
    logging.getLogger("transformers").setLevel(logging.WARNING)

    # You can set class weights by using the optional weight argument
    model_args = {
        'evaluate_each_epoch': True,
        'evaluate_during_training_verbose': True,
        'no_save': True,
        'num_train_epochs': FLAGS.num_train_epochs,
        'auto_weights': True
    }
    classifier = ClassificationModel(FLAGS.model_type,
                                     FLAGS.model_name,
                                     args=model_args)

    # Check that the dataframes are set up properly. There should only be two
    # columns: the SMILES string and its corresponding label.
    for caption, frame in (("Train Dataset: {}", train_df),
                           ("Eval Dataset: {}", valid_df),
                           ("TEST Dataset: {}", test_df)):
        print(caption.format(frame.shape))

    classifier.train_model(train_df,
                           eval_df=valid_df,
                           output_dir=FLAGS.output_dir,
                           args={'wandb_project': 'project-name'})

    # accuracy
    result, model_outputs, wrong_predictions = classifier.eval_model(
        test_df, acc=sklearn.metrics.accuracy_score)

    # ROC-PRC
    result, model_outputs, wrong_predictions = classifier.eval_model(
        test_df, acc=sklearn.metrics.average_precision_score)

    # Lets input a molecule with a toxicity value of 1
    predictions, raw_outputs = classifier.predict(['C1=C(C(=O)NC(=O)N1)F'])
    print(predictions)
    print(raw_outputs)
def train(
    arch,
    model_name,
):
    """
    Train ``arch``/``model_name`` with early stopping on MCC and print the
    evaluation result, the third value returned by ``eval_model`` and one
    sample prediction.

    ``train_df`` and ``test`` are defined outside this function.
    """
    model_args = ClassificationArgs(
        num_train_epochs=5,
        output_dir="./models",
        evaluate_during_training_steps=1000,
        train_batch_size=64,
        reprocess_input_data=True,
        evaluate_during_training=True,
        eval_batch_size=32,
        save_model_every_epoch=False,
        overwrite_output_dir=True,
        learning_rate=7e-5,
        save_eval_checkpoints=False,
        best_model_dir=f"./models/{model_name}/best_model",
        use_early_stopping=True,
        early_stopping_delta=1e-2,
        early_stopping_metric="mcc",
        tensorboard_dir='./runs/',
        early_stopping_metric_minimize=False,
        wandb_project='my_roberta',
        manual_seed=69,
        early_stopping_patience=5,
    )
    classifier = ClassificationModel(arch,
                                     model_name,
                                     args=model_args,
                                     use_cuda=True)

    def rounded_accuracy(x, y):
        # Second argument is rounded element-wise before scoring.
        return accuracy_score(x, [round(a) for a in y])

    classifier.train_model(
        train_df,
        eval_df=test,
        accuracy=rounded_accuracy,
    )

    result, model_output, top_loss = classifier.eval_model(test)
    print(result)
    print(top_loss)

    pred, _ = classifier.predict(["thanks for bearing with us"])
    print(pred)
def test_multiclass_classification(model_type, model_name):
    """
    Smoke test: train, evaluate and predict with a three-label model.

    Train and evaluation data are Pandas DataFrames with a 'text' column
    (str) and a 'labels' column (int).
    """
    columns = ["text", "labels"]
    train_df = pd.DataFrame(
        [
            ["Example sentence belonging to class 1", 1],
            ["Example sentence belonging to class 0", 0],
            ["Example eval senntence belonging to class 2", 2],
        ],
        columns=columns,
    )
    eval_df = pd.DataFrame(
        [
            ["Example eval sentence belonging to class 1", 1],
            ["Example eval sentence belonging to class 0", 0],
            ["Example eval senntence belonging to class 2", 2],
        ],
        columns=columns,
    )

    # Small sequence length and no saving keep the run light.
    model_args = {
        "no_save": True,
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "max_seq_length": 20,
    }
    classifier = ClassificationModel(
        model_type,
        model_name,
        num_labels=3,
        args=model_args,
        use_cuda=False,
    )

    classifier.train_model(train_df)
    result, model_outputs, wrong_predictions = classifier.eval_model(eval_df)
    predictions, raw_outputs = classifier.predict(["Some arbitary sentence"])
def predict_sarcasm(data_path, results, model_loc, model):
    """
    Label every response in a JSON-lines file as SARCASM or NOT_SARCASM.

    :param data_path: JSON-lines file; each record needs 'id' and 'response'
    :param results: path of the header-less CSV (id, label) to write
    :param model_loc: directory/name of the saved model
    :param model: model type passed to ``ClassificationModel``
    """
    # Bringing in the test data. The records are kept in a local list so
    # repeated calls do not share or accumulate state; blank lines are skipped.
    with open(data_path, 'r') as json_file:
        pred = [json.loads(line) for line in json_file if line.strip()]

    pred_response = [
        remove_stopwords(convert_emojis(record['response']))
        for record in pred
    ]
    pred_id = [record['id'] for record in pred]

    classifier = ClassificationModel(model, model_loc, use_cuda=False)
    predictions, raw_outputs = classifier.predict(pred_response)

    pred_bert = pd.DataFrame({'id': pred_id, 'label': predictions})
    pred_bert['label'] = pred_bert['label'].replace(
        [1, 0], ['SARCASM', 'NOT_SARCASM'])
    pd.DataFrame(pred_bert).to_csv(results,
                                   header=False,
                                   sep=',',
                                   index=False)