def test_seq2seq(encoder_decoder_type, encoder_decoder_name, encoder_type):
    train_data = [
        ["one", "1"],
        ["two", "2"],
    ]
    train_df = pd.DataFrame(train_data, columns=["input_text", "target_text"])

    eval_data = [
        ["three", "3"],
        ["four", "4"],
    ]
    eval_df = pd.DataFrame(eval_data, columns=["input_text", "target_text"])

    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "max_seq_length": 128,
        "train_batch_size": 2,
        "num_train_epochs": 2,
        "use_multiprocessing": False,
        "max_length": 15,
        "manual_seed": 4,
        "do_sample": False,
        "num_return_sequences": 1,
    }

    if encoder_type == "bart":
        model = Seq2SeqModel(
            encoder_decoder_type=encoder_decoder_type,
            encoder_decoder_name=encoder_decoder_name,
            args=model_args,
            use_cuda=False,
        )
    else:
        model = Seq2SeqModel(
            encoder_type=encoder_type,
            encoder_name=encoder_decoder_type,
            decoder_name=encoder_decoder_name,
            args=model_args,
            use_cuda=False,
        )

    model.train_model(train_df)
    model.eval_model(eval_df)

    a = model.predict(["five"])[0]

    model = Seq2SeqModel(
        encoder_decoder_type=encoder_decoder_type,
        encoder_decoder_name="outputs",
        args=model_args,
        use_cuda=False,
    )
    b = model.predict(["five"])[0]

    assert a == b
def __init__(self, args):
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    model_args = Seq2SeqArgs()
    model_args.num_train_epochs = 1
    model_args.no_save = True
    model_args.train_batch_size = 4
    model_args.evaluate_generated_text = False
    model_args.evaluate_during_training = False
    model_args.evaluate_during_training_verbose = True
    model_args.use_multiprocessing = False
    model_args.max_seq_length = 5
    model_args.max_length = 6
    model_args.overwrite_output_dir = True
    self.model_args = model_args

    cuda_available = torch.cuda.is_available()

    # Initialize model
    self.model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="facebook/bart-base",
        args=model_args,
        use_cuda=cuda_available,
    )
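# Hedged usage sketch (not from the original source; the enclosing class name is
# unknown, so "BartWrapper" below is a hypothetical stand-in). With no_save=True
# and very small max_seq_length / max_length, this configuration is best suited
# to quick smoke tests of the training loop.
import pandas as pd

wrapper = BartWrapper(args=None)  # hypothetical class name
smoke_df = pd.DataFrame(
    [["one", "1"], ["two", "2"]], columns=["input_text", "target_text"]
)
wrapper.model.train_model(smoke_df)
print(wrapper.model.predict(["three"]))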
def load_swedish():
    # Note: "opus-mt-en-sw" is the OPUS English->Swahili checkpoint ("sw" is the
    # ISO code for Swahili); English->Swedish is "Helsinki-NLP/opus-mt-en-sv".
    english_to_swedish_model = Seq2SeqModel(
        encoder_decoder_type="marian",
        encoder_decoder_name="Helsinki-NLP/opus-mt-en-sw",
        use_cuda=use_cuda,
        args=model_args,
    )
    return english_to_swedish_model
def load_romance():
    english_to_romance_model = Seq2SeqModel(
        encoder_decoder_type="marian",
        encoder_decoder_name="Helsinki-NLP/opus-mt-en-roa",
        use_cuda=use_cuda,
        args=model_args,
    )
    return english_to_romance_model
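# Hedged usage sketch (not part of the original source): both loaders above rely
# on `use_cuda` and `model_args` being defined in the enclosing module. A Marian
# Seq2SeqModel translates via predict() on a list of source sentences; for the
# multi-target opus-mt-en-roa checkpoint the output language is typically chosen
# with a leading language token (the exact token string, e.g. ">>fra<<", depends
# on the checkpoint's tokenizer).
romance_model = load_romance()
print(romance_model.predict([">>fra<< How are you today?"]))

swedish_model = load_swedish()
print(swedish_model.predict(["The weather is nice today."]))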
def main():
    model_args = Seq2SeqArgs()
    model_args.eval_batch_size = 4
    model_args.evaluate_during_training = True
    model_args.evaluate_during_training_steps = 2500
    model_args.evaluate_during_training_verbose = True
    model_args.fp16 = False
    model_args.learning_rate = 5e-5
    model_args.max_seq_length = 128
    model_args.num_train_epochs = 2
    model_args.overwrite_output_dir = False
    model_args.reprocess_input_data = True
    model_args.save_eval_checkpoints = False
    model_args.save_steps = -1
    model_args.train_batch_size = 16
    model_args.use_multiprocessing = False
    model_args.do_sample = True
    model_args.num_beams = None
    model_args.num_return_sequences = 3
    model_args.max_length = 128
    model_args.top_k = 50
    model_args.top_p = 0.95
    model_args.n_gpu = 1
    model_args.wandb_project = None

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.ERROR)

    model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="outputs_23-04-2021/checkpoint-144205-epoch-5",
        args=model_args,
        cuda_device=2,
    )

    while True:
        print('first sentence:')
        input_text = input()
        if input_text == 'exit':
            break
        print('second sentence:')
        target_text = input()

        prefix = 'paraphrase'
        d = dict(input_text=input_text, target_text=target_text, prefix=prefix)
        eval_df = pd.DataFrame(
            [[input_text, target_text, prefix]], columns=d.keys()
        )

        prediction, losses = model.project_inference_method(eval_df)
        print(prediction)
        print(losses)
def create_network():
    # Configure the model
    model_args = Seq2SeqArgs()
    model_args.padding = "longest"
    model_args.length_penalty = 1
    model_args.truncation = True
    model_args.max_length = 512

    model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="facebook/bart-large-cnn",
        args=model_args,
    )
    return model
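# Hedged usage sketch (not part of the original source): facebook/bart-large-cnn
# is a summarization checkpoint, so the network returned by create_network() can
# be used for abstractive summarization directly through predict(). The input
# string below is a placeholder for a real article.
summarizer = create_network()
summaries = summarizer.predict(
    ["<paste a long news article here to get a short abstractive summary>"]
)
print(summaries[0])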
def train(model_args, train_path, dev_path):
    # Data Loading
    train_df = pd.read_csv(train_path)
    eval_df = pd.read_csv(dev_path)
    train_df = data_process(train_df)
    eval_df = data_process(eval_df)

    # Model Initialization
    model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="facebook/bart-large",
        args=model_args,
    )

    # Model Training
    model.train_model(train_df, eval_data=eval_df)

    # Model Evaluation
    results = model.eval_model(eval_df)
    print(results)
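# Hypothetical sketch of data_process (its real definition is not shown in this
# snippet): Seq2SeqModel.train_model expects a DataFrame with "input_text" and
# "target_text" columns, so a minimal preprocessor, assuming the CSVs have
# "source" and "target" columns, might look like this.
def data_process(df):
    df = df.rename(columns={"source": "input_text", "target": "target_text"})
    return df[["input_text", "target_text"]].dropna()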
def __init__(self):
    super().__init__()
    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "max_seq_length": 64,
        "train_batch_size": 16,
        "num_train_epochs": 3,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": False,
        # "silent": True,
        "evaluate_generated_text": False,
        "evaluate_during_training": False,
        "evaluate_during_training_verbose": False,
        "use_multiprocessing": False,
        "save_best_model": True,
        "max_length": 200,
        "do_sample": True,
        "top_k": 3,
    }
    self.model = Seq2SeqModel(
        "bert",
        encoder_decoder_name="app/model",
        args=model_args,
        use_cuda=False,
    )

    with open('app/ingredients_and_mapper.json') as ingredients_and_mapper_file:
        _ingredients_and_mapper = json.load(ingredients_and_mapper_file)

    self.ingredients = list(dict.fromkeys(_ingredients_and_mapper['frontend']))
    self.mapper = _ingredients_and_mapper['mapper']
    self.small_ingredients = [
        i for i in self.ingredients
        if "".join(i.lower().split()) in self.mapper
    ]

    print('initializing subsetter')
    self.subsetter = Subsetter()
    print('initialized subsetter')
def main():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.ERROR)

    eval_df, train_df = import_datasets()

    model_args = Seq2SeqArgs()
    model_args.eval_batch_size = 4
    model_args.evaluate_during_training = True
    model_args.evaluate_during_training_steps = 2500
    model_args.evaluate_during_training_verbose = True
    model_args.fp16 = False
    model_args.learning_rate = 5e-5
    model_args.max_seq_length = 128
    model_args.num_train_epochs = 2
    model_args.overwrite_output_dir = False
    model_args.reprocess_input_data = True
    model_args.save_eval_checkpoints = False
    model_args.save_steps = -1
    model_args.train_batch_size = 4
    model_args.use_multiprocessing = False
    model_args.do_sample = True
    model_args.num_beams = None
    model_args.num_return_sequences = 3
    model_args.max_length = 128
    model_args.top_k = 50
    model_args.top_p = 0.95
    model_args.n_gpu = 1
    model_args.output_dir = "outputs_19-04-2021"
    model_args.wandb_project = "Paraphrasing with BART_19-04-2021"

    model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="facebook/bart-large",
        args=model_args,
        cuda_device=7,
    )

    model.train_model(train_df, eval_data=eval_df)

    to_predict = [
        prefix + ": " + str(input_text)
        for prefix, input_text in zip(
            eval_df["prefix"].tolist(), eval_df["input_text"].tolist()
        )
    ]
    truth = eval_df["target_text"].tolist()

    preds = model.predict(to_predict)

    # Saving the predictions if needed
    os.makedirs("predictions", exist_ok=True)
    with open(f"predictions/predictions_{datetime.now()}.txt", "w") as f:
        for i, text in enumerate(eval_df["input_text"].tolist()):
            f.write(str(text) + "\n\n")
            f.write("Truth:\n")
            f.write(truth[i] + "\n\n")
            f.write("Prediction:\n")
            for pred in preds[i]:
                f.write(str(pred) + "\n")
            f.write(
                "________________________________________________________________________________\n"
            )
def main():
    model_args = Seq2SeqArgs()
    model_args.eval_batch_size = 1  # don't change
    model_args.evaluate_during_training = True
    model_args.evaluate_during_training_steps = 2500
    model_args.evaluate_during_training_verbose = True
    model_args.fp16 = False
    model_args.learning_rate = 5e-5
    model_args.max_seq_length = 128
    model_args.num_train_epochs = 2
    model_args.overwrite_output_dir = False
    model_args.reprocess_input_data = True
    model_args.save_eval_checkpoints = False
    model_args.save_steps = -1
    model_args.train_batch_size = 1  # DON'T CHANGE!!!
    model_args.use_multiprocessing = False
    model_args.do_sample = True
    model_args.num_beams = None
    model_args.num_return_sequences = 3
    model_args.max_length = 128
    model_args.top_k = 50
    model_args.top_p = 0.95
    model_args.n_gpu = 1
    model_args.wandb_project = None

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.ERROR)

    dataset_to_evaluate = 'mrpc'
    cuda_device = 3
    threshold = None
    inversed = False
    score_type = 'loss'

    # encoder_decoder_name = "facebook/bart-base"
    # encoder_decoder_name = "facebook/bart-large"
    # encoder_decoder_name = "bart-base-all"
    # encoder_decoder_name = "bart-large-all"
    # encoder_decoder_name = "bart-base-mrpc"
    # encoder_decoder_name = "bart-large-mrpc"
    # encoder_decoder_name = "bart-base-paws"
    # encoder_decoder_name = "bart-large-paws"
    encoder_decoder_name = "bart-base-qqp"
    # encoder_decoder_name = "bart-large-qqp"

    print(dataset_to_evaluate)
    print(encoder_decoder_name)

    negative, positive = import_cleaned_data(dataset_to_evaluate, inversed)

    model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name=encoder_decoder_name,
        args=model_args,
        cuda_device=cuda_device,
    )

    if score_type == 'probs':
        positive_losses = model.project_inference_method(positive)
        negative_losses = model.project_inference_method(negative)
    elif score_type == 'loss':
        positive_losses = model.ce_losses(positive)
        negative_losses = model.ce_losses(negative)
    else:
        raise AssertionError("score_type has to be one of 'loss' or 'probs'")

    plot_histograms(
        positive_losses=positive_losses,
        negative_losses=negative_losses,
        plot_title=(dataset_to_evaluate + '/' + encoder_decoder_name),
        x_min=0.5,
        x_max=8,
    )

    calculate_accuracy_and_f1(
        positive_losses, negative_losses, threshold=threshold
    )
def main():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the source and target files for the task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type, choose from [seq2seq, t5]",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )

    # Other parameters
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the valid set.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to run prediction on the test set.")
    parser.add_argument("--init_model_weights", action="store_true", help="Whether to initialize the model weights")
    parser.add_argument("--overwrite_output_dir", action="store_true", help="Whether to overwrite the existing output dir")
    parser.add_argument("--use_multiprocessed_decoding", action="store_true", help="Whether to use multiprocessing when decoding")
    parser.add_argument("--save_model_every_epoch", action="store_true", help="Whether to save the model every epoch during training")
    parser.add_argument("--predict_during_training", action="store_true", help="Whether to predict after each checkpoint save during training")
    parser.add_argument("--evaluate_during_training", action="store_true", help="Whether to evaluate after each checkpoint save during training")
    parser.add_argument(
        "--output_dir",
        default="output_dir/",
        type=str,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--save_step",
        default=0,
        type=int,
        help="Save checkpoint every X update steps.",
    )
    parser.add_argument(
        "--train_batch_size",
        default=16,
        type=int,
        help="Size of each train batch",
    )
    parser.add_argument(
        "--eval_batch_size",
        default=16,
        type=int,
        help="Size of each eval/predict batch",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        default=1,
        type=int,
        help="Gradient accumulation steps",
    )
    parser.add_argument(
        "--learning_rate",
        default=4e-5,
        type=float,
        help="Learning rate",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=100,
        type=int,
        help="Number of train epochs",
    )
    parser.add_argument(
        "--max_seq_length",
        default=None,
        type=int,
        help="Max input seq length",
    )
    parser.add_argument(
        "--max_length",
        default=None,
        type=int,
        help="Max output seq length",
    )
    parser.add_argument(
        "--prediction_dir",
        default=None,
        type=str,
        help="The output directory where the prediction results will be written.",
    )
    parser.add_argument(
        "--prediction_suffix",
        default=None,
        type=str,
        help="The supplementary suffix of the prediction results name.",
    )
    parser.add_argument(
        "--mask_ratio",
        default=0.0,
        type=float,
        help="The proportion of masked words in the source",
    )
    parser.add_argument(
        "--mask_length",
        default="span-poisson",
        type=str,
        choices=["subword", "word", "span-poisson"],
        help="When masking words, the length of mask segments",
    )
    parser.add_argument(
        "--replace_length",
        default=-1,
        type=int,
        help="When masking N tokens, replace with 0, 1, or N tokens (use -1 for N)",
    )
    parser.add_argument(
        "--poisson_lambda",
        default=3.0,
        type=float,
        help="Lambda of the Poisson distribution used to draw mask span lengths",
    )
    parser.add_argument(
        "--dataloader_num_workers",
        default=0,
        type=int,
        help="The number of CPU workers used to collect data in the dataloader; "
        "note that if it is larger than the CPU count, the program may get stuck",
    )
    parser.add_argument(
        "--evaluation_metric",
        default="qa",
        type=str,
        help="If pretraining on passages, use 'passage', else use 'qa'",
    )

    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    if args.do_train:
        train_df = read_data_source_target(args.data_dir + "train.source",
                                           args.data_dir + "train.target")
    else:
        train_df = None

    if args.do_eval or args.evaluate_during_training:
        eval_df = read_data_source_target(args.data_dir + "valid.source",
                                          args.data_dir + "valid.target")
    else:
        eval_df = None

    if args.do_predict or args.predict_during_training:
        test_df = read_data_source_target(args.data_dir + "test.source",
                                          args.data_dir + "test.target")
    else:
        test_df = None

    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": args.overwrite_output_dir,
        "init_model_weights": args.init_model_weights,
        "max_seq_length": args.max_seq_length,
        "train_batch_size": args.train_batch_size,
        "eval_batch_size": args.eval_batch_size,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "learning_rate": args.learning_rate,
        "num_train_epochs": args.num_train_epochs,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": args.save_model_every_epoch,
        "save_steps": args.save_step,
        "evaluate_during_training": args.evaluate_during_training,
        "evaluate_generated_text": True,
        "evaluate_during_training_verbose": True,
        "predict_during_training": args.predict_during_training,
        "use_multiprocessing": False,
        "output_dir": args.output_dir,
        "max_length": args.max_length,
        "manual_seed": 4,
        "mask_ratio": args.mask_ratio,
        "mask_length": args.mask_length,
        "replace_length": args.replace_length,
        "poisson_lambda": args.poisson_lambda,
        "fp16": False,
        "truncation": True,
        "dataloader_num_workers": args.dataloader_num_workers,
        "use_multiprocessed_decoding": args.use_multiprocessed_decoding,
        "evaluation_metric": args.evaluation_metric,
    }

    # Initialize model
    if args.model_type == "seq2seq":
        model = Seq2SeqModel(
            encoder_decoder_type="bart",
            encoder_decoder_name=args.model_name_or_path,
            args=model_args,
        )
    elif args.model_type == "t5":
        model = T5Model(
            model_name=args.model_name_or_path,
            args=model_args,
        )
    else:
        raise ValueError("The {} model is not supported now".format(args.model_type))

    # Train the model
    if args.do_train:
        model.train_model(train_data=train_df,
                          eval_data=eval_df,
                          test_data=test_df,
                          output_dir=args.output_dir)

    # Evaluate the model
    if args.do_eval:
        results = model.eval_model(eval_data=eval_df)
        print(results)

    # Use the model for prediction
    if args.do_predict:
        print(
            model.predict(pred_data=test_df,
                          output_dir=args.prediction_dir,
                          suffix=args.prediction_suffix))
eval_df = papers.sample(frac=0.1, random_state=42)
train_df = papers.drop(eval_df.index)

model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "save_model_every_epoch": False,
    "save_eval_checkpoints": False,
    "max_seq_length": 512,
    "train_batch_size": 6,
    "num_train_epochs": 3,
}

# Create a Bart-base model
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-base",
    args=model_args,
)

# Train the model
model.train_model(train_df)

# Evaluate the model
result = model.eval_model(eval_df)
print(result)

### 1.4 Prediction 🔮
for _ in range(250):
"reprocess_input_data": True, "overwrite_output_dir": True, "max_seq_length": 10, "train_batch_size": 2, "num_train_epochs": 100, "save_eval_checkpoints": False, "save_model_every_epoch": False, # "silent": True, "evaluate_generated_text": True, "evaluate_during_training": True, "evaluate_during_training_verbose": True, "use_multiprocessing": False, "save_best_model": False, "max_length": 15, } model = Seq2SeqModel("bert-base-cased", "bert-base-cased", args=model_args) def count_matches(labels, preds): print(labels) print(preds) return sum( [1 if label == pred else 0 for label, pred in zip(labels, preds)]) model.train_model(train_df, eval_data=eval_df, matches=count_matches) print(model.eval_model(eval_df, matches=count_matches)) print(model.predict(["four", "five"]))
# NOTE: the enclosing def was missing from this fragment; the name and signature
# below are inferred from the body.
def calculate_nist_score(labels, preds):
    label = ' '.join([str(elem) for elem in labels])
    prediction = ' '.join([str(elem) for elem in preds])
    if len(prediction) < 4 or len(label) < 4:
        return 0
    return sentence_nist([label], prediction, 4)


def calculate_m_score(target, predictions, length):
    score = 0
    for t, p in zip(target, predictions):
        score += meteor_score(t, p)
    return score / length


if __name__ == "__main__":
    print('good to go')
    # model = Seq2SeqModel("bart", "facebook/bart-base", "bart", config="outputs/best_model/config.json")
    model = Seq2SeqModel(
        encoder_decoder_type='bart',
        encoder_decoder_name="facebook/bart-base",
        config="outputs/best_model/config.json",
    )
    test = "I have been having fever last few days. Any thoughts on this?"
    inference = model.predict([test])
    print(inference)
def main():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.ERROR)

    model_args = Seq2SeqArgs()
    model_args.eval_batch_size = 4
    model_args.evaluate_during_training = True
    model_args.evaluate_during_training_steps = 5000
    model_args.evaluate_during_training_verbose = True
    model_args.fp16 = False
    model_args.learning_rate = 5e-5
    model_args.max_seq_length = 128
    model_args.num_train_epochs = 5
    model_args.overwrite_output_dir = False
    model_args.reprocess_input_data = True
    model_args.save_eval_checkpoints = True
    model_args.save_steps = -1
    model_args.save_model_every_epoch = True
    model_args.train_batch_size = 4
    model_args.use_multiprocessing = False
    model_args.do_sample = True
    model_args.num_beams = None
    model_args.num_return_sequences = 3
    model_args.max_length = 128
    model_args.top_k = 50
    model_args.top_p = 0.95
    model_args.n_gpu = 1

    experiment_name = "bart-large-paws"
    model_args.output_dir = experiment_name
    model_args.best_model_dir = 'best_model/' + experiment_name
    model_args.wandb_experiment = experiment_name
    model_args.wandb_project = "NLP Project experiments"

    encoder_decoder_name = "facebook/bart-large"

    train_df = pd.read_csv(
        '/home/fodl/asafmaman/PycharmProjects/nlp_final_project_private/'
        'paraphrasing/data/cleaned_labeled/'
        'paws_train_clean.csv')
    eval_df = pd.read_csv(
        '/home/fodl/asafmaman/PycharmProjects/nlp_final_project_private/'
        'paraphrasing/data/cleaned_labeled/'
        'paws_test_clean_no_train_overlap.csv')

    train_df = train_df[train_df['is_duplicate'] == 1][['sentence1', 'sentence2']]
    train_df['prefix'] = 'paraphrase'
    train_df = train_df.rename(columns={
        "sentence1": "input_text",
        "sentence2": "target_text"
    })
    # positive = positive.rename(columns={"sentence2": "input_text", "sentence1": "target_text"})
    train_df = train_df[['input_text', 'target_text', 'prefix']]
    train_df = train_df.dropna()

    eval_df = eval_df[eval_df['is_duplicate'] == 1][['sentence1', 'sentence2']]
    eval_df['prefix'] = 'paraphrase'
    eval_df = eval_df.rename(columns={
        "sentence1": "input_text",
        "sentence2": "target_text"
    })
    # eval_df = eval_df.rename(columns={"sentence2": "input_text", "sentence1": "target_text"})
    eval_df = eval_df[['input_text', 'target_text', 'prefix']]
    eval_df = eval_df.dropna()

    model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name=encoder_decoder_name,
        args=model_args,
        cuda_device=3,
    )

    print(train_df)

    model.train_model(train_df, eval_data=eval_df)
from simpletransformers.seq2seq import (
    Seq2SeqModel,
    Seq2SeqArgs,
)

model_args = Seq2SeqArgs()
model_args.num_train_epochs = 20
model_args.no_save = True
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True

# Initialize model
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
    use_cuda=False,
)


def count_matches(labels, preds):
    print(labels)
    print(preds)
    return sum(
        [1 if label == pred else 0 for label, pred in zip(labels, preds)])


# Train the model
model.train_model(train_df, eval_data=eval_df, matches=count_matches)

# Evaluate the model
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

model_args = {
    "reprocess_input_data": True,
    "max_seq_length": 256,
    "use_multiprocessing": True,
    "max_length": 256,
    "use_cuda": True,
}

model = Seq2SeqModel(
    encoder_decoder_type="marian",
    encoder_decoder_name="Helsinki-NLP/opus-mt-de-en",
    args=model_args,
)

dev_data = pd.read_csv('dev.csv')
src = dev_data.text.tolist()

predictions = model.predict(src)

# opus-mt-de-en translates German -> English, so src is German and the
# predictions are English.
for de, en in zip(src, predictions):
    print("-------------")
    print(de)
    print(en)
    print()
import logging

from simpletransformers.seq2seq import Seq2SeqModel

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

model = Seq2SeqModel(encoder_decoder_type="bart", encoder_decoder_name="outputs")

while True:
    original = input("Enter text to paraphrase: ")
    to_predict = [original]

    preds = model.predict(to_predict)

    print("---------------------------------------------------------")
    print(original)
    print()
    print("Predictions >>>")
    for pred in preds[0]:
        print(pred)
    print("---------------------------------------------------------")
    print()
"overwrite_output_dir": True, "max_seq_length": max_seq_length, "train_batch_size": bs, "num_train_epochs": epochs, "save_eval_checkpoints": False, "save_model_every_epoch": True, "evaluate_generated_text": True, "evaluate_during_training_verbose": True, "use_multiprocessing": False, "max_length": max_length, "manual_seed": 4, } torch.cuda.empty_cache() # clear cache before training every time model = Seq2SeqModel( encoder_decoder_type="bart", encoder_decoder_name="facebook/bart-large", args=model_args,) # load pre-trained model # Train the model model.train_model(train_df) # Evaluate the model results = model.eval_model(eval_df) # Use the model for prediction input_seqs = eval_df.input_text.to_list() pred = model.predict(input_seqs) pred_df = pd.DataFrame(pred, columns=["prediction"]) pd.concat([eval_df, pred_df], axis=1).to_csv(OUTPUT_DIR + "/prediction_on_test.csv") # print some sample predictions