def eval(model_path, our_gen_file, human_file):
    """Evaluate a fine-tuned RoBERTa classifier on generated and human text.

    Reads one sample per line from each file, requires both files to hold the
    same number of lines, shuffles the combined samples with a fixed seed and
    prints the result returned by ``model.eval_model``.

    NOTE: the function name shadows the ``eval`` builtin; it is kept because
    callers use this name.

    :param model_path: Path or name of the trained model to load.
    :param our_gen_file: Text file with one generated sample per line.
    :param human_file: Text file with one human-written sample per line.
    :raises AssertionError: If the two files differ in line count.
    """
    # Context managers close the handles deterministically; previously both
    # files stayed open until garbage collection.
    with open(our_gen_file, 'r') as handle:
        gen = [line.strip() for line in handle.readlines()]
    with open(human_file, 'r') as handle:
        human = [line.strip() for line in handle.readlines()]

    assert len(human) - len(gen) == 0, "please balance the eval file"

    # NOTE(review): the frame has a single text column and no label column;
    # confirm this is the input format eval_model is expected to receive.
    test_df = pd.DataFrame(gen + human)
    test_input = test_df.sample(frac=1, random_state=123)

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        "fp16": False,
    }
    model = ClassificationModel('roberta', model_path, num_labels=4,
                                use_cuda=True, cuda_device=0, args=train_args)
    result, _model_outputs, _wrong_predictions = model.eval_model(test_input)
    print(result)
def train_model(args, output_dir, cache_dir):
    """
    Fit a SimpleTransformers classifier on the 'train' corpus and hand it back.

    :param args: Arguments as processed by parse_args(); ``arch`` and
        ``num_epochs`` are read from it.
    :param output_dir: Path to the directory in which the model should be stored.
    :param cache_dir: Path to the directory in which the cache should be stored.
    :return: The trained SimpleTransformers model.
    """
    print('=> Training model...')

    # Training configuration handed to the library.
    settings = dict(
        num_train_epochs=args.num_epochs,
        train_batch_size=32,
        eval_batch_size=32,
        output_dir=output_dir,
        cache_dir=cache_dir,
    )

    # Resolve the pretrained checkpoint for the requested architecture.
    checkpoint = get_transformer_model(args.arch)
    classifier = ClassificationModel(args.arch, checkpoint, use_cuda=True, args=settings)

    corpus = load_corpus('train')
    classifier.train_model(corpus)
    return classifier
def main():
    """Fine-tune roberta-base on the breast-cancer text data and print its F1.

    Loads the CSV, holds out 20% of the rows for testing, trains a CPU-only
    RoBERTa classifier on the rest and prints the F1 score computed from the
    tp/fp/fn counts returned by ``eval_model``.
    """
    f_path = 'Breast Cancer(Raw_data_2_Classes).csv'
    data = loadDataAsDataFrame(f_path)
    labels = data['Class'].tolist()
    training_set_size = int(0.8 * len(data))

    # One split is enough: the fixed random_state makes it deterministic, so
    # the second identical call in the earlier version was redundant.
    training_rows, test_rows, _training_classes, _test_classes = train_test_split(
        data, labels, train_size=training_set_size, random_state=42069)

    # Copy the splits before adding columns so the assignments below operate
    # on independent frames (avoids pandas' chained-assignment warning).
    training_rows = training_rows.copy()
    test_rows = test_rows.copy()

    model_args = {'overwrite_output_dir': True}

    # Create the model (CPU only; switch use_cuda to True to train on a GPU).
    model = ClassificationModel('roberta', 'roberta-base', use_cuda=False, args=model_args)

    # Change our data into a format that simpletransformers can process.
    training_rows['text'] = training_rows['Text']
    training_rows['labels'] = training_rows['Class']
    test_rows['text'] = test_rows['Text']
    test_rows['labels'] = test_rows['Class']

    # Train the model
    model.train_model(training_rows)

    # Evaluate the model
    result, _model_outputs, _wrong_predictions = model.eval_model(test_rows)

    print("f1 score")
    tp, fp, fn = result['tp'], result['fp'], result['fn']
    # Guard every ratio: with no positive predictions or no positive labels
    # the denominators are zero.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1score = 2 * precision * recall / (precision + recall)
    else:
        f1score = 0.0
    print(f1score)
def train():
    """Run one Weights & Biases sweep trial: build, fine-tune and evaluate.

    Takes no parameters. ``self`` and ``startTime`` are not defined in this
    block and must come from an enclosing scope. Hyper-parameters are read
    from ``wandb.config``. Training errors are logged instead of propagated,
    and ``wandb.join()`` runs afterwards in every case.
    """
    # NOTE(review): the project name is passed positionally; confirm that the
    # first positional parameter of wandb.init is the intended one.
    wandb.init(WAND_PROJECT_NAME)

    modelArgs = {
        "max_seq_length": self.maxSeqLength,
        "output_dir": self.modelOutputDir,
        "overwrite_output_dir": True,
        "best_model_dir": self.bestModelOutputDir,
        "wandb_project": WAND_PROJECT_NAME,
        # Was "num_training_epochs", which is not a simpletransformers
        # option, so the swept epoch count never reached the trainer.
        "num_train_epochs": wandb.config.epochs,
        "learning_rate": wandb.config.learning_rate,
        "do_lower_case": True,
        "cache_dir": self.modelCacheDir,
        "encoding": "utf-8",
        "train_batch_size": 5,
        "eval_batch_size": 5,
        "evaluate_during_training_steps": 50,
        "evaluate_during_training_verbose": True,
        "logging_steps": 5,
        "sliding_window": True,
        "reprocess_input_data": True,
        "evaluate_during_training": True,
        "use_multiprocessing": True,
        "labels_list": SECTOR_LABELS,
    }

    model = ClassificationModel(
        self.modelType,
        self.modelNameOrPath,
        args=modelArgs,
        sweep_config=wandb.config,
        use_cuda=torch.cuda.is_available(),
        num_labels=len(SECTOR_LABELS),
    )

    # Training and evaluation
    try:
        log.info("Started training/finetuning BERT on multi-class classification task..")
        model.train_model(
            train_df=self.trainDataset,
            eval_df=self.evalDataset,
            show_running_loss=True,
            output_dir=self.modelOutputDir,
            mcc=sklearn.metrics.matthews_corrcoef,
            acc=sklearn.metrics.balanced_accuracy_score,
        )
        log.info(f"Finished finetuning and evaluating our fine-tuned model on multi-class classification task. Check the folder '{self.modelOutputDir}' for finetuned weights.")
        log.info(f"It took {round((time.time() - startTime) / 3600, 1)} hours to finetune and evaluate our fine-tuned model on multi-class classification task.")
    except Exception as exc:
        # Only ordinary errors are swallowed; KeyboardInterrupt and
        # SystemExit now propagate so a sweep can be stopped by the user.
        err = f"Error occurred while training and evaluating the finetuned model on multi-class classification task. Error is: {type(exc)}; {exc}."
        log.error(err)
    finally:
        # Close the W&B run whether training succeeded, failed or was interrupted.
        wandb.join()
def train_model(train_df, num_labels):
    """Train the classifier described by ``MODELNAME`` and return it with its W&B run.

    ``MODELNAME`` has the form ``"<model_type>;<model_name>"``. The output
    directory is derived from ``TAG`` and both parts. When ``OVERWRITE`` is
    ``False`` and that directory already exists, the process exits with
    status 0 instead of training.

    :param train_df: Training data passed to ``model.train_model``.
    :param num_labels: Number of target classes.
    :return: Tuple ``(model, run)``.
    """
    model_type, model_name = MODELNAME.split(";")
    model_output = 'models/{}-{}-{}'.format(TAG, model_type, model_name.replace("/", "-"))

    already_trained = OVERWRITE is False and os.path.exists(model_output)
    if already_trained:
        logging.info("Skipping training of {}".format(model_name))
        sys.exit(0)

    logging.info("Starting training of {}".format(model_name))

    # The W&B project is named after the last path component of the output dir.
    run = wandb.init(project=model_output.split("/")[-1], reinit=True)

    is_large = "large" in model_name
    # Large checkpoints use a quarter of the batch size; two accumulation
    # steps double the effective training batch for them.
    batch_size = 8 if is_large else 32
    accumulation_steps = 2 if is_large else 1

    training_args = {
        'output_dir': model_output,
        'overwrite_output_dir': OVERWRITE,
        'best_model_dir': '{}/best'.format(model_output),
        'evaluate_during_training': False,
        'manual_seed': 42,
        'num_train_epochs': 4,
        # learning_rate and adam_epsilon are left at the library defaults.
        'train_batch_size': batch_size,
        'eval_batch_size': batch_size,
        'gradient_accumulation_steps': accumulation_steps,
        'max_seq_length': 256,
        'sliding_window': False,
        'wandb_project': model_output.split("/")[-1],
        "silent": False,
        "fp16": False,  # use 32 bit floating point
        "n_gpu": 1,
    }
    model = ClassificationModel(model_type, model_name, num_labels=num_labels, args=training_args)

    # train the model
    model.train_model(train_df)
    return model, run
def load_models(self):
    """Instantiate the CPU-only classifier and store it on ``self.model``.

    Reads ``self.model_emb`` (model type), ``self.tr_path`` (model path) and
    ``self.num_classes``; evaluation runs silently with batches of 32.
    """
    self.model = ClassificationModel(
        self.model_emb,
        self.tr_path,
        num_labels=self.num_classes,
        args=dict(eval_batch_size=32, silent=True),
        use_cuda=False,
    )
def train(human_file, gen_file, our_gen_file, output_dir, eval_fraction=0.1):
    """Train a RoBERTa human-vs-generated text classifier with early stopping.

    Lines of ``human_file`` are labelled 1; lines of ``gen_file`` and
    ``our_gen_file`` are labelled 0. A random share of the data is held out
    for validation, because the configuration enables
    ``evaluate_during_training`` and early stopping on ``eval_loss``, which
    need an evaluation frame.

    :param human_file: Text file with one human-written sample per line.
    :param gen_file: Text file with one generated sample per line.
    :param our_gen_file: Second text file with one generated sample per line.
    :param output_dir: Directory in which the model is stored.
    :param eval_fraction: Share of the samples held out for validation.
    :raises ValueError: If the hold-out split would be empty.
    """
    def _read_labelled(path, label):
        # Read stripped lines paired with their label, closing the file afterwards.
        with open(path, 'r') as handle:
            return [(line.strip(), label) for line in handle.readlines()]

    data = (_read_labelled(human_file, 1)
            + _read_labelled(gen_file, 0)
            + _read_labelled(our_gen_file, 0))
    all_df = pd.DataFrame(data)

    # Previously train_model() was called without eval_df even though
    # evaluate_during_training was switched on.
    eval_df = all_df.sample(frac=eval_fraction, random_state=0)
    if eval_df.empty:
        raise ValueError("not enough samples to hold out an evaluation split")
    train_df = all_df.drop(eval_df.index)

    train_args = {
        'overwrite_output_dir': True,
        'num_train_epochs': 10,
        'process_count': 10,
        'train_batch_size': 10,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        'reprocess_input_data': True,
        'learning_rate': 1e-5,
        "evaluate_during_training": True,
        "use_early_stopping": True,
        'early_stopping_patience': 3,
        "early_stopping_metric": "eval_loss",
        "early_stopping_metric_minimize": True,
        "no_cache": True,
        'output_dir': output_dir,
    }
    model = ClassificationModel('roberta', "roberta-base", args=train_args)

    # Train the model
    model.train_model(train_df, eval_df=eval_df)
    print("finish the training")
def train_stance_clf(data_dir, output_dir, **kwargs):
    """Fine-tune roberta-base as a four-class headline/body stance classifier.

    Loads headlines, bodies and labels through ``fnc`` from the two combined
    training CSVs in ``data_dir``, splits off a validation part (currently
    unused) and trains on the remainder.

    :param data_dir: Directory containing ``combined_stances_train.csv`` and
        ``combined_bodies_train.csv``.
    :param output_dir: Directory in which the model is stored.
    :param kwargs: Accepted for call compatibility; not used.
    """
    headlines, bodies, labels = fnc(
        os.path.join(data_dir, 'combined_stances_train.csv'),
        os.path.join(data_dir, 'combined_bodies_train.csv'))

    # simpletransformers reads the target of a text_a/text_b frame from a
    # column named 'labels'; the earlier 'label' column was not picked up.
    df = pd.DataFrame(list(zip(headlines, bodies, labels)),
                      columns=['text_a', 'text_b', 'labels'])
    train_df, _val_df = train_test_split(df, random_state=123)

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        "fp16": False,
        'output_dir': output_dir,
    }
    model = ClassificationModel('roberta', "roberta-base", num_labels=4,
                                use_cuda=True, cuda_device=0, args=train_args)
    model.train_model(train_df)
def fake_classify(train_set, eval_set, test_set, seed):
    """Train multilingual BERT on ``train_set`` and score it on ``test_set``.

    :param train_set: Training data passed to ``train_model``.
    :param eval_set: Not used by this function.
    :param test_set: Data passed to ``eval_model``.
    :param seed: Value used as ``manual_seed``.
    :return: Tuple ``(result, model_outputs, wrong_predictions)`` from
        ``eval_model``, with ``f1`` and ``acc`` as extra metrics.
    """
    settings = {
        'max_seq_length': 512,
        'num_train_epochs': 3,
        'overwrite_output_dir': True,
        'manual_seed': seed,
    }
    classifier = ClassificationModel('bert', 'bert-base-multilingual-uncased',
                                     args=settings, use_cuda=True)
    print(classifier.args)

    # Fit, then evaluate on the held-out test set.
    classifier.train_model(train_set)
    scores, outputs, misclassified = classifier.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
    )
    return scores, outputs, misclassified
def __init__(self, model_type, model_name, model_args):
    """Load the pickled train/eval frames and create a CPU-only classifier.

    :param model_type: Model type passed to ``ClassificationModel``.
    :param model_name: Model name or path passed to ``ClassificationModel``.
    :param model_args: Argument object/dict forwarded as ``args``.
    """
    # Both frames are read from fixed locations on the D: drive.
    train_pickle = "D:/Language Models/train_df_500000"
    eval_pickle = "D:/Language Models/test_df_500000"
    self.train_df = pd.read_pickle(train_pickle)
    self.eval_df = pd.read_pickle(eval_pickle)

    self.model = ClassificationModel(
        model_type,
        model_name,
        use_cuda=False,
        args=model_args,
    )
def train(self, split=0.7, num_epochs=10):
    """Fit a classifier on ``self.dat`` and evaluate it on the held-out part.

    ``self.dat`` maps a class name to a sequence of samples. For each class
    the first ``split`` share of its samples goes to training and the rest to
    evaluation. Class names are integer-encoded with a ``LabelEncoder``
    stored on ``self.le``; the trained model is stored on ``self.model``.

    :param split: Share of each class's samples used for training.
    :param num_epochs: Number of training epochs.
    """
    self.le = preprocessing.LabelEncoder()
    class_names = list(self.dat.keys())
    print(class_names)
    self.le.fit(class_names)

    train_data = []
    eval_data = []
    for name, samples in self.dat.items():
        # Encode the class once per class; previously the encoder was
        # invoked again for every single sample.
        label = self.le.transform([name])[0]
        len_train = int(round(len(samples) * split))
        train_data.extend([sample, label] for sample in samples[:len_train])
        eval_data.extend([sample, label] for sample in samples[len_train:])
    print(train_data, eval_data)

    train_df = pd.DataFrame(train_data)
    eval_df = pd.DataFrame(eval_data)

    train_args = {
        'overwrite_output_dir': True,
        'num_train_epochs': num_epochs,
    }
    self.model = ClassificationModel(self.model_type, self.model_name,
                                     num_labels=len(class_names),
                                     use_cuda=self.use_cuda, cuda_device=0,
                                     args=train_args)

    # Train the model
    self.model.train_model(train_df, eval_df=eval_df)

    # Evaluate the model; the returned values are not used further.
    self.model.eval_model(eval_df, acc=sklearn.metrics.accuracy_score)
def generate_prob_matrix(arguments):
    """Build a square matrix of pairwise relation probabilities.

    For each pair taken from an argument's ``compare_list`` the local BERT
    model predicts on ``[target sentence, source sentence]``; the softmaxed
    score of class 1 is stored at ``prob_matrix[rel_to][rel_from]``. Timing
    and call counts are accumulated on ``Stats``.

    :param arguments: Indexable collection whose items expose ``sentence``
        and ``compare_list``.
    :return: ``numpy`` array of shape ``(len(arguments), len(arguments))``.
    """
    model = ClassificationModel(
        'bert',
        "relation_processing/model/bert",
        use_cuda=False,
        args={
            "max_seq_length": 256,
            "train_batch_size": 16,
            "eval_batch_size": 16,
            "do_lower_case": True,
            "manual_seed": 17,
        },
    )

    size = len(arguments)
    prob_matrix = np.zeros((size, size))

    # NOTE(review): sources start at index 1, so argument 0 is never used as
    # rel_from — confirm this is intentional.
    for rel_from in range(1, size):
        source = arguments[rel_from]
        for rel_to in source.compare_list:
            if rel_to == rel_from:
                # An argument is never compared with itself.
                continue
            logging.info(f"calculating: {rel_from}-->{rel_to}")
            started = datetime.now()
            pair = [arguments[rel_to].sentence, source.sentence]
            _predictions, raw_outputs = model.predict([pair])
            rel = softmax(raw_outputs, axis=1)
            Stats.h_bert_time += datetime.now() - started
            Stats.h_bert += 1
            logging.debug(rel)
            prob_matrix[rel_to][rel_from] = rel[0][1]
    return prob_matrix
def train(self, train_data: object, eval_data: object) -> object:
    """
    Build a ClassificationModel from the instance settings and fit it.

    Parameters
    ----------
    train_data : object
        training portion of the data.
    eval_data : object
        validation portion of the data.

    Returns
    -------
    object
        the fitted model.

    """
    # NOTE(review): self.model_name is passed first and self.model_type
    # second — confirm the attribute names match the library's
    # (model_type, model_name) parameter order.
    # NOTE(review): num_labels is one less than len(self.labels) — confirm.
    classifier = ClassificationModel(
        self.model_name,
        self.model_type,
        args=self.model_args,
        use_cuda=self.cuda,
        num_labels=len(self.labels) - 1,
    )

    # Fit with accuracy reported as an extra evaluation metric.
    classifier.train_model(
        train_df=train_data,
        eval_df=eval_data,
        accuracy=accuracy_score,
    )
    return classifier
def test_binary_classification(model_type, model_name):
    """Smoke-test training and evaluation of a binary classifier on CPU.

    Both frames have two unnamed columns: the text (str) followed by the
    label (int).

    :param model_type: Model type passed to ``ClassificationModel``.
    :param model_name: Model name passed to ``ClassificationModel``.
    """
    train_df = pd.DataFrame([
        ["Example sentence belonging to class 1", 1],
        ["Example sentence belonging to class 0", 0],
    ])
    eval_df = pd.DataFrame([
        ["Example eval sentence belonging to class 1", 1],
        ["Example eval sentence belonging to class 0", 0],
    ])

    classifier = ClassificationModel(
        model_type,
        model_name,
        use_cuda=False,
        args={
            "reprocess_input_data": True,
            "overwrite_output_dir": True,
        },
    )

    # The test passes when training and evaluation finish without raising.
    classifier.train_model(train_df)
    scores, outputs, misclassified = classifier.eval_model(eval_df)
def run_trainers(bucket_dir, train_args=None):
    """Train one pending model from ``bucket_dir`` and record its completion.

    Every entry of ``bucket_dir`` is a run; its name with the first five
    characters removed identifies it in ``completed_irl.txt``. The first run
    not yet listed there is trained, appended to that file, and the process
    then exits. When no run is pending, a timestamp is written to
    ``done.runs``.

    :param bucket_dir: Directory containing one sub-directory per run, each
        with a ``data_all.tsv`` file.
    :param train_args: Optional base training arguments; the mapping passed
        in is not modified.
    """
    # Work on a private copy: the default None used to crash on the first
    # item assignment, and a caller-supplied dict was mutated in place.
    train_args = dict(train_args) if train_args is not None else {}

    os.makedirs('irl_models', exist_ok=True)

    # Make sure the bookkeeping file exists, then read the finished runs.
    if not os.path.isfile('completed_irl.txt'):
        open('completed_irl.txt', 'a').close()
    with open("completed_irl.txt", 'r') as f:
        done = [d.replace('\n', '') for d in f.readlines()]

    for train_file in os.listdir(bucket_dir):
        run_name = train_file[5:]
        print(run_name)
        print(done)
        if run_name in done:
            continue

        train_df = pd.read_csv(bucket_dir + '/' + train_file + '/data_all.tsv', sep='\t')
        train_args['output_dir'] = f'irl_models/{run_name}/'
        train_args['cache_dir'] = f'cache_{run_name}/'
        train_args.update({'wandb_kwargs': {'name': run_name}})

        model = ClassificationModel('roberta', 'roberta-base', args=train_args)
        print(train_df.head())
        model.train_model(train_df)

        with open("completed_irl.txt", 'a') as f:
            f.write(f"{run_name}\n")
        # Stop after a single training run; remaining runs are picked up by
        # the next invocation.
        raise SystemExit(0)

    with open("done.runs", 'w') as f:
        f.write(f"Done at {datetime.datetime.now()}")
def eval_stance_clf(model_path, src_path, gen_path, **kwargs):
    """Predict stances for source/generated line pairs and print label counts.

    Line *i* of ``src_path`` is paired with line *i* of ``gen_path``; the
    pairs are classified by the four-class RoBERTa model at ``model_path``
    and the sorted ``(label, count)`` list is printed.

    :param model_path: Path or name of the trained model to load.
    :param src_path: Text file with one source text per line.
    :param gen_path: Text file with one generated text per line.
    :param kwargs: Accepted for call compatibility; not used.
    """
    # Context managers close the handles; previously both files leaked.
    with open(src_path, 'r') as handle:
        src = [line.strip() for line in handle.readlines()]
    with open(gen_path, 'r') as handle:
        gen = [line.strip() for line in handle.readlines()]

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        "fp16": False,
    }
    model = ClassificationModel('roberta', model_path, num_labels=4,
                                use_cuda=True, cuda_device=0, args=train_args)

    # Local renamed from `input`, which shadowed the builtin.
    pairs = [[s, g] for s, g in zip(src, gen)]
    predictions, _raw_outputs = model.predict(pairs)

    th = sorted(Counter(predictions).items(), key=lambda item: item[0])
    print(th)
def transformer(train_df, eval_df, datafile):
    """Train Dutch BERT on ``train_df`` and print evaluation reports.

    :param train_df: Training data passed to ``train_model``.
    :param eval_df: Evaluation data; its ``labels`` column holds the truth.
    :param datafile: Not used by this function.
    """
    classifier = ClassificationModel(
        "bert",
        "bert-base-dutch-cased",
        use_cuda=False,
        num_labels=2,
    )

    classifier.train_model(train_df)
    result, model_outputs, wrong_predictions = classifier.eval_model(eval_df)
    print(model_outputs)

    # Class 0 wins only when its score is strictly larger; ties go to class 1.
    predlist = [0 if scores[0] > scores[1] else 1
                for scores in model_outputs.tolist()]

    labels = eval_df["labels"].tolist()
    print(labels)
    print(predlist)
    for report in (classification_report, confusion_matrix, accuracy_score):
        print(report(labels, predlist))
def predict_export(data):
    """Predict arousal, valence and topic for one partition and export a CSV.

    The partition is selected by the module-level ``args.predict_partition``.
    One saved model per target is loaded from ``experiments/best_model/``;
    the predictions are written together with ``id`` and ``segment_id`` to
    ``output_path + <partition> + '.csv'``.

    :param data: Mapping from partition name to a mapping with ``text``,
        ``id`` and ``segment_id`` entries.
    """
    partition = data[args.predict_partition]
    X = partition['text']
    predictions = {}

    for class_name in ['arousal', 'valence', 'topic']:
        # arousal and valence have 3 classes, topic has 10.
        class_no = 3 if class_name in ['arousal', 'valence'] else 10
        trained_model_path = os.path.join('experiments/best_model/',
                                          class_name + str(False))
        model = ClassificationModel(args.model_type, trained_model_path,
                                    num_labels=class_no)
        predicted, _ = model.predict(X)
        predictions['prediction_' + class_name] = predicted

    predictions['id'] = partition['id']
    predictions['segment_id'] = partition['segment_id']

    df = pd.DataFrame.from_dict(predictions)
    header_names = [
        'id',
        'segment_id',
        'prediction_arousal',
        'prediction_valence',
        'prediction_topic',
    ]
    target = output_path + args.predict_partition + '.csv'
    df[header_names].to_csv(target, header=header_names, index=False)
def __init__(self, use_cuda=torch.cuda.is_available(), cuda_device=0, batch_size=16):
    """Load the pretrained empathy regression model, downloading it if needed.

    When ``models/empathy/`` next to this file does not exist, the archive
    ``models/empathy.tar.gz`` is fetched (if missing) and unpacked first.

    :param use_cuda: Whether to run on a GPU; the default is evaluated once,
        when the function is defined.
    :param cuda_device: Index of the CUDA device to use.
    :param batch_size: Evaluation batch size.
    """
    self.model_type = "empathy"

    # Copy the shared module-level defaults instead of writing into them, so
    # building one instance cannot change the settings seen by other code.
    model_args = dict(train_args, eval_batch_size=batch_size)

    base_dir = os.path.dirname(__file__)
    model_path = os.path.join(base_dir, "models/empathy/")
    model_file = os.path.join(base_dir, "models/empathy.tar.gz")

    if not os.path.isdir(model_path):
        model = f'{self.model_type}_model'
        if not os.path.isfile(model_file):
            logger.info(
                f'Model {self.model_type} does not exist at {model_path}. Attempting to download it.'
            )
            fetch_pretrained_model(model, model_file)
        unzip_simple_transformer_model(model, model_path, model_file)

    # Create a ClassificationModel; num_labels=1 selects a single output.
    self.model = ClassificationModel('roberta', model_path, num_labels=1,
                                     use_cuda=use_cuda, cuda_device=cuda_device,
                                     args=model_args)
def train(self, args=None, cleanFN=CleanText().cleanText):
    """Train a two-class Simpletransformers model on the instance's data.

    :param args: Optional overrides passed to ``self.modelArgs``; ``None``
        means no overrides.
    :param cleanFN: Text-cleaning callable handed to ``self.loadData``.
    :return: The value returned by ``model.train_model``, or ``None`` when
        lazy loading is requested but the data are not given as paths.
    """
    # A fresh dict per call replaces the former shared mutable default.
    if args is None:
        args = {}

    self.logger.debug("Train Simpletransformer")

    isCudaAvailable = torch.cuda.is_available()
    if not isCudaAvailable:
        self.logger.warning("Training on CPU!")

    _modelArgs = self.modelArgs(args)
    self.logger.debug("ModelArgs: ")
    self.logger.debug("\n" + pformat(_modelArgs))

    self.loadData(cleanFN, _modelArgs)

    self.model = ClassificationModel(model_type=self.model_type,
                                     model_name=self.model_name,
                                     args=_modelArgs,
                                     use_cuda=isCudaAvailable,
                                     num_labels=2)

    if _modelArgs["lazy_loading"]:
        # Lazy loading reads from disk, so both data sets must be paths.
        paths_given = isinstance(self.trainData, str) and isinstance(self.testData, str)
        if not paths_given:
            self.logger.error("Lazy loading requires a string to a path.")
            self.logger.error(f"Train-Data-Type: {type(self.trainData)}")
            self.logger.error(f"Test-Data-Type: {type(self.testData)}")
            return None

    return self.model.train_model(train_df=self.trainData, eval_df=self.testData)
class TransformerModel:
    """
    This class provides the Machine Learning model and classifies tenders based on previous training data.
    """

    def __init__(self):
        # The model is created lazily by load_model() on first use.
        self.model = None

    def load_model(self):
        """Load the saved model from ./outputs/ unless one is already set.

        Falls back to a fresh pretrained model when loading fails.
        """
        if self.model:
            return
        from simpletransformers.classification import ClassificationModel
        try:
            self.model = ClassificationModel('bert',
                                             './outputs/',
                                             use_cuda=False,
                                             args=args)
        except Exception as ex:
            logger.error(
                f"could not load model from /outputs due to {str(ex)}, creating new model"
            )
            self.create_new_model()

    def __convert_to_input(self, tenders):
        """Return the German title of every tender."""
        return [tender.get_title("DE") for tender in tenders]

    def classify(self, tenders):
        """Return the tenders whose title is predicted as class 1.

        :param tenders: Objects exposing ``get_title(lang)``.
        :return: List of selected tenders; empty for empty input.
        """
        # Nothing to classify: skip loading the model and predicting on an
        # empty batch.
        if not tenders:
            return []
        self.load_model()
        titles = self.__convert_to_input(tenders)
        predictions, _raw_output = self.model.predict(titles)
        return [t for t, p in zip(tenders, predictions) if p == 1]

    def train(self, labelled_tenders):
        """Train on 90% of the labelled tenders and log a confusion matrix.

        :param labelled_tenders: Iterable of ``(tender, label)`` pairs.
        """
        self.load_model()
        tenders = self.__convert_to_input([i for i, j in labelled_tenders])
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))
        self.model.train_model(data_input)

        labels_pred, _raw_output = self.model.predict(tenders_test)
        # Fixing the label set keeps the matrix 2x2 even when the small test
        # split contains one class only, so the four-way unpack cannot fail.
        tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred,
                                          labels=[0, 1]).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

    def create_new_model(self):
        """Replace the model with a fresh pretrained German BERT classifier."""
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=False,
                                         args=args)
def objective(args):
    """Objective for hyper-parameter search: train ALBERT and return 1 - F1.

    The first three values of ``args`` are cast to ``int`` and the remaining
    ones to ``float``, then paired with the training-argument names below.
    Uses the module-level ``pbar``, ``train`` and ``test``.

    :param args: Sequence of hyper-parameter values.
    :return: ``1 - f1`` of the evaluated model, or ``1.0`` when the trial
        fails with an ordinary error.
    """
    pbar.update(1)
    try:
        # cast np values to python and convert list to dict
        values = [int(v) for v in args[:3]] + [float(v) for v in args[3:]]
        # NOTE(review): 'learning_rate' is listed twice, so the fourth value
        # is overwritten by the fifth — confirm which name was intended.
        names = [
            'train_batch_size', 'gradient_accumulation_steps',
            'weight_decay', 'learning_rate', 'learning_rate',
            'adam_epsilon', 'warmup_ratio', 'max_grad_norm'
        ]
        model_args = dict(zip(names, values))
        model_args['overwrite_output_dir'] = True
        model_args['eval_batch_size'] = model_args['train_batch_size']

        model = ClassificationModel('albert', 'albert-base-v1', num_labels=5)

        # train model, find reverse f1, drop the model reference afterwards
        model.train_model(train, args=model_args)
        result, *_ = model.eval_model(test, f1=f1_multiclass, acc=accuracy_score)
        del model
        return 1. - result['f1']
    except Exception as exc:
        # Ordinary failures score worst; KeyboardInterrupt and SystemExit
        # are no longer swallowed, so the search can be interrupted.
        print('skip', repr(exc))
        return 1.
def __init__(self, dir_path, model_path, resources_path, use_cuda, debugging=False):
    """Store the paths and load the BERT, SVM and LR models.

    When ``model_path`` does not exist a message is printed and the models
    are left unset.

    :param dir_path: Stored on the instance as ``dir_path``.
    :param model_path: Directory holding ``bert_model``,
        ``svm_estimator.sav`` and ``lr.sav``.
    :param resources_path: Passed to ``FineGrainedClassifier``.
    :param use_cuda: Whether the BERT model runs on a GPU.
    :param debugging: Stored on the instance as ``debugging``.
    """
    self.dir_path = dir_path
    self.model_path = model_path
    self.resources_path = resources_path
    self.debugging = debugging
    self.mapper = FineGrainedClassifier(self.resources_path)

    if not os.path.exists(model_path):
        print('Model Path not found!')
        return

    # initializing models
    bert_model_path = os.path.join(model_path, "bert_model")
    self.bert_model = ClassificationModel('bert', bert_model_path,
                                          use_cuda=use_cuda,
                                          args={'from_tf': False})
    print("Initialized BERT model")

    # NOTE: pickle.load can execute code embedded in the file; only point
    # model_path at trusted artifacts. The handles are now closed after
    # loading instead of being left open.
    with open(os.path.join(model_path, 'svm_estimator.sav'), 'rb') as handle:
        self.svm_est_model = pickle.load(handle)
    print("Initialized SVM model.")

    with open(os.path.join(model_path, 'lr.sav'), 'rb') as handle:
        self.lr_model = pickle.load(handle)
    print("Initialized LR model.")
def get_evaluation_parameter(model):
    """Evaluate a saved best model and export absolute prediction errors.

    Loads ``outputs/<model>/best_model``, evaluates it on
    ``data/reviews/new_test.csv`` and prints the largest, smallest and mean
    absolute difference between output and label (rounded to two decimals).
    The differences go to ``test.csv``; ``[label, output, difference]`` rows
    go to ``plots.csv``.

    :param model: Model type; also the name of the sub-directory of
        ``outputs`` that holds the saved model.
    """
    eval_df = pd.read_csv("data/reviews/new_test.csv", header=None)
    eval_df.columns = ["text", "labels"]

    best_model_path = f'outputs/{model}/best_model'
    classifier = ClassificationModel(model, best_model_path)
    result, model_outputs, wrong_predictions = classifier.eval_model(eval_df)
    print('Results:', result)
    print('Outputs:', model_outputs)

    plots = []
    differences = []
    # Running extremes start from the same sentinels as before (0 and 5).
    max_difference = 0
    min_difference = 5
    for i, predicted in enumerate(model_outputs):
        label = eval_df['labels'][i]
        value = round(abs(predicted - label), 2)
        plots.append([round(label, 2), predicted, value])
        if value > max_difference:
            max_difference = value
        if value < min_difference:
            min_difference = value
        differences.append(value)

    print('Max Difference:', max_difference)
    print('Min Difference:', min_difference)
    parameter = sum(differences) / len(differences)
    print('Parameter:', parameter)

    pd.DataFrame(differences).to_csv("test.csv", index=None)
    pd.DataFrame(plots).to_csv("plots.csv", index=None)
def train(train_df, max_sub_len, output_dir):
    """Fine-tune distilbert-base-cased as a four-class classifier.

    The model is written to a directory whose name is ``output_dir``
    followed by the current timestamp, the model type, the sequence length
    and the learning rate.

    :param train_df: Training data passed to ``train_model``.
    :param max_sub_len: Used as ``max_seq_length``.
    :param output_dir: Prefix of the output directory name.
    :return: The trained model.
    """
    model_type = 'distilbert'
    lr = 2e-5
    sent_length = max_sub_len

    timestamp = str(datetime.datetime.now())[:19]
    suffix = '_'.join([model_type, str(sent_length), str(lr)])
    OUTPUT_DIR = output_dir + timestamp + '_' + suffix
    print("model is saved at: {}".format(OUTPUT_DIR))

    training_config = dict(
        output_dir=OUTPUT_DIR,
        reprocess_input_data=True,
        overwrite_output_dir=True,
        num_train_epochs=2,
        train_batch_size=32,
        eval_batch_size=32,
        learning_rate=lr,
        max_seq_length=sent_length,
    )

    # Show INFO logs overall but only warnings from the transformers library.
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("transformers").setLevel(logging.WARNING)

    model = ClassificationModel(model_type, 'distilbert-base-cased',
                                num_labels=4, args=training_config)
    torch.cuda.empty_cache()
    model.train_model(train_df)
    return model
def bert_predictions(tweet: pd.DataFrame, model: "ClassificationModel"):
    """
    Bert inference: count the predicted labels for a frame of tweets.

    The rows are predicted in one batch. If that fails, every row is
    predicted on its own; rows that still fail are reported and skipped, so
    the function returns the counts of everything that could be predicted.

    :param tweet: dataframe with tweets
    :param model: Bert model exposing ``predict(list)``
    :return: ``collections.Counter`` mapping predicted label to its count
    """
    rows = tweet.values.tolist()
    try:
        predictions, _raw_outputs = model.predict(rows)
    except Exception:
        # The former fallback called .iteritems() on this list and never
        # bound `predictions`, so any batch failure ended in a second error.
        predictions = []
        for row in rows:
            try:
                single, _raw_output = model.predict([row])
            except Exception as exc:
                print("STOPP", row, repr(exc))
                continue
            predictions.extend(single)

    auswertung = collections.Counter(predictions)
    gc.collect()
    return auswertung
def fake_classify(train_set, eval_set, test_set, seed):
    """Train multilingual BERT on ``train_set`` and score it on ``test_set``.

    The trained model is saved by the library in its output directory.

    :param train_set: Training data passed to ``train_model``.
    :param eval_set: Not used by this function.
    :param test_set: Data passed to ``eval_model``.
    :param seed: Value used as ``manual_seed``.
    :return: Tuple ``(result, model_outputs, wrong_predictions)`` from
        ``eval_model``, with ``f1`` and ``acc`` as extra metrics.
    """
    settings = {
        'num_train_epochs': 3,
        'overwrite_output_dir': True,
        'manual_seed': seed,
    }
    classifier = ClassificationModel('bert', 'bert-base-multilingual-uncased',
                                     args=settings, use_cuda=True)
    print(classifier.args)

    # Fit, then evaluate on the held-out test set.
    classifier.train_model(train_set)
    scores, outputs, misclassified = classifier.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
    )
    return scores, outputs, misclassified
def __init__(self,
             model_type: str,
             model_name_or_path: Union[str, Path],
             output_dir: Path,
             class_weights: Optional[List[float]] = None
             ):
    """Create the classifier and derive its working directories.

    Cache, tensorboard and best-model directories are placed below
    ``output_dir``.

    :param model_type: Model type passed to ``ClassificationModel``.
    :param model_name_or_path: Model name or path; converted to ``str``.
    :param output_dir: Base directory; must support the ``/`` operator.
    :param class_weights: Optional per-class weights passed as ``weight``.
    """
    print('class weights: {}'.format(class_weights))

    self.output_dir = output_dir
    self.cache_dir = output_dir / 'cache/'
    self.tensorboard_dir = output_dir / 'runs/'
    self.best_model_dir = output_dir / 'output/best_model/'

    self.model_type = model_type
    self.model_name_or_path = model_name_or_path

    self.model = ClassificationModel(
        self.model_type,
        str(self.model_name_or_path),
        cache_dir='/media/sarthak/HDD/data_science/fnp_resources/pretrained_models/',
        args={
            # The option is spelled 'fp16'; the former 'fp_16' key was not a
            # recognised setting and therefore had no effect.
            'fp16': True,
            'output_dir': str(self.output_dir),
            'cache_dir': str(self.cache_dir),
            'tensorboard_dir': str(self.tensorboard_dir),
            'best_model_dir': str(self.best_model_dir),
        },
        weight=class_weights,
    )
    self.class_weights = class_weights
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):
    """Fine-tune roberta-large on ``source`` and evaluate on ``source_test``.

    The training split is loaded from ``source``; the validation and test
    splits come from the module-level ``source_test``. F1, accuracy and
    ``eer`` are reported during training and in the final evaluation.
    """
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir, source_test, 'valid', n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate,
    )

    classifier = ClassificationModel("roberta", model_name="roberta-large",
                                     args=model_args, use_cuda=True)

    # The same extra metrics are used for training-time and final evaluation.
    metrics = {
        'f1': sklearn.metrics.f1_score,
        'acc': sklearn.metrics.accuracy_score,
        'eer': eer,
    }
    classifier.train_model(train_df, eval_df=valid_df, **metrics)
    scores, outputs, misclassified = classifier.eval_model(test_df, **metrics)
class TransformerModel(TenderClassClassifier):
    """
    This class provides the Machine Learning model and classifies tenders based on previous training data.
    """

    def __init__(self):
        # No model until load() or create_new_model() is called.
        self.model = None

    def load(self, name):
        """Load the saved model from ./outputs/; ``name`` is not used."""
        self.model = ClassificationModel('bert',
                                         './outputs/',
                                         use_cuda=cuda_available,
                                         args=args)

    def save(self, name):
        """Do nothing; ``name`` is not used."""
        pass

    def __convert_to_input(self, tenders):
        """Return the German title of every tender."""
        return [tender.get_title("DE") for tender in tenders]

    def classify(self, tenders):
        """Return the tenders whose title is predicted as class 1.

        :param tenders: Objects exposing ``get_title(lang)``.
        :return: List of selected tenders; empty for empty input.
        """
        # Nothing to classify: avoid predicting on an empty batch.
        if not tenders:
            return []
        titles = self.__convert_to_input(tenders)
        predictions, _raw_output = self.model.predict(titles)
        return [t for t, p in zip(tenders, predictions) if p == 1]

    def train(self, labelled_tenders):
        """Train on 90% of the labelled tenders and log evaluation figures.

        Prints the training duration in seconds and logs the confusion
        matrix and accuracy on the held-out 10%.

        :param labelled_tenders: Iterable of ``(tender, label)`` pairs.
        """
        tenders = self.__convert_to_input([i for i, j in labelled_tenders])
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))

        start = time.time()
        self.model.train_model(data_input)
        end = time.time()
        print(end - start)

        labels_pred, _raw_output = self.model.predict(tenders_test)
        # Fixing the label set keeps the matrix 2x2 even when the small test
        # split contains one class only, so the four-way unpack cannot fail.
        tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred,
                                          labels=[0, 1]).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")
        logger.info(
            f"Accuracy Score: {accuracy_score(labels_test, labels_pred)}")

    def create_new_model(self):
        """Replace the model with a fresh pretrained German BERT classifier."""
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=cuda_available,
                                         args=args)