def main():
    """Fine-tune a QA model, evaluate it on the dev set and dump its predictions.

    Steps, in order:
      1. load the training set and instantiate a DistilBERT QA model,
      2. train the model,
      3. evaluate on the dev set and print the result,
      4. predict on the dev set and write one JSON object per line to
         ``results/squad_predictions.json``.

    Relies on module-level names defined elsewhere in the file
    (``load_standard_dataset``, ``standard_train``, ``standard_dev``,
    ``train_args``, ``f1_multiclass``, ``accuracy_score``).
    """
    # Imported locally so this function does not depend on the file's import block.
    import json
    import os

    # load data
    train_data = load_standard_dataset(standard_train)

    # ======================= instantiate model ====================
    # models are from https://huggingface.co/models?pipeline_tag=question-answering
    # model = QuestionAnsweringModel('roberta', 'csarron/roberta-large-squad-v1', args=train_args)
    # model = QuestionAnsweringModel('electra', 'mrm8488/electra-large-finetuned-squadv1', args=train_args)
    # model = QuestionAnsweringModel('albert', 'Wikidepia/albert-bahasa-cased-squad', args=train_args)
    # model = QuestionAnsweringModel('bert', 'bert-base-cased', args=train_args)
    model = QuestionAnsweringModel('distilbert', 'distilbert-base-uncased-distilled-squad', args=train_args)
    # model = QuestionAnsweringModel('bert', 'trained_models/bert', args=train_args)  # run eval from pre-trained model

    # =========================== train model ======================
    model.train_model(train_data)

    # ========================= do evaluation ======================
    dev_data = load_standard_dataset(standard_dev)
    result, _texts = model.eval_model(dev_data, f1=f1_multiclass, acc=accuracy_score)
    print(f'Result: {result}')

    # ========================= do predictions =====================
    answers, _probabilities = model.predict(dev_data, n_best_size=1)
    preds = {pred['id']: pred['answer'] for pred in answers}

    # The output directory may not exist on a fresh checkout.
    os.makedirs('results', exist_ok=True)
    with open('results/squad_predictions.json', 'w') as f:
        for qid, answer in preds.items():
            # json.dumps escapes quotes/backslashes/newlines inside the values,
            # which hand-built f-string JSON did not. str() keeps the previous
            # "everything is written as a string" output shape, and
            # ensure_ascii=False keeps non-ASCII text unescaped as before.
            record = {'qid': str(qid), 'answer': str(answer)}
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
def train_squad(train_path=ST_TRAINING_JSON, squad_path=SQUAD_TRAINING_JSON):
    """Train a distilled DistilBERT question-answering model and return it.

    The training examples come from ``load_trainset(train_path,
    squad_path=squad_path)``; the model starts from the
    ``distilbert-base-uncased-distilled-squad`` checkpoint.

    Returns:
        The trained ``QuestionAnsweringModel`` instance.
    """
    examples = load_trainset(train_path, squad_path=squad_path)

    # Always rebuild cached features and allow re-using the output directory.
    qa_args = {
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
    }
    qa_model = QuestionAnsweringModel(
        'distilbert',
        'distilbert-base-uncased-distilled-squad',
        args=qa_args,
    )
    qa_model.train_model(examples)
    return qa_model
def _json_response(message, status):
    """Return the ``(json_body, http_status)`` pair used by ``train_model``."""
    return json.dumps({'message': message}, sort_keys=False, indent=4), status


def train_model(model_name):
    """Download or further train the named QA model stored in the model repository.

    Reads two environment variables:
      * ``DB_API``     - URL that returns the training data,
      * ``MODEL_REPO`` - directory that holds one sub-directory per model.

    Behaviour for ``model_name == "bert"``:
      * if no saved model exists yet, a pre-trained checkpoint is downloaded
        and saved, and the caller is asked to request again to train;
      * otherwise the saved model is loaded, trained on the data fetched from
        ``DB_API`` and saved back to the same location.

    Any other ``model_name`` is reported as not implemented.

    Args:
        model_name: model type identifier; only ``"bert"`` is handled.

    Returns:
        Tuple ``(body, status)`` where ``body`` is a JSON string with a
        ``message`` key and ``status`` is 200 or 404.

    Raises:
        KeyError: if ``DB_API`` or ``MODEL_REPO`` is not set.
    """
    api = os.environ['DB_API']  # get the request path
    path = os.environ['MODEL_REPO'] + "/" + model_name  # get the model repository location

    # Reject unsupported models before doing any network or disk work.
    if model_name != "bert":
        return _json_response(model_name + ' has not yet been implemented', 404)

    if not os.path.exists(path):  # check whether BERT already exists
        # set the arguments for training
        train_args = {
            'learning_rate': 3e-5,
            'num_train_epochs': 2,
            'max_seq_length': 384,
            'doc_stride': 128,
            'overwrite_output_dir': True,
            'reprocess_input_data': False,
            'train_batch_size': 2,
            'gradient_accumulation_steps': 8,
        }
        # download the bert model
        model = QuestionAnsweringModel(
            model_name,
            "mrm8488/bert-tiny-5-finetuned-squadv2",
            args=train_args,
            use_cuda=False)
        model.save_model(path, model=model.model)  # save the model in the repository
        return _json_response(
            'BERT has been downloaded, in order to train request again', 200)

    # A saved model exists: fetch the training data only now that it is needed.
    response = requests.get(api)  # get the train data from db
    # NOTE(review): assumes the payload is a nested list whose [0][0] element
    # is the training set - confirm against the DB API.
    train_data = response.json()[0][0]

    model = QuestionAnsweringModel(model_name, path + "/", use_cuda=False)  # load the model
    model.train_model(train_data)  # train the model on the fetched data
    model.save_model(path, model=model.model)  # save the updated model
    return _json_response('Model has been trained on database train data', 200)
class GetCases:
    """Extract per-county death counts from news text with a DistilBERT QA model.

    The model is loaded from the local ``outputs/`` directory and runs on CPU.
    """

    def __init__(self):
        """Configure logging and load the question-answering model."""
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)
        self.model = QuestionAnsweringModel(
            'distilbert',
            'outputs/',
            args={
                'reprocess_input_data': True,
                'overwrite_output_dir': True,
                'fp16': False
            },
            use_cuda=False)

    def train_model(self, data_path='C:/Users/NathanGrant/Downloads/rona/rona/training_data.json'):
        """Train the model on the JSON training set stored at ``data_path``.

        Args:
            data_path: path of the JSON file holding the training examples;
                defaults to the location the original code hard-coded.
        """
        with open(data_path) as f:
            train_data = json.load(f)
        self.model.train_model(train_data)

    def predict(self, news, county):
        """Ask the model for the total deaths in ``county`` given ``news``.

        A trailing ``", XX"`` state suffix in ``county`` is replaced by
        ``" county"`` and the result is lower-cased before it is put into the
        question.

        Args:
            news: context text handed to the model.
            county: county name, e.g. ``"Travis, TX"``.

        Returns:
            A list with one integer per prediction, or ``0`` when the model
            returned no predictions.

        Raises:
            ValueError: propagated from ``w2n.word_to_num`` when an answer is
                neither a digit string nor a recognisable number word.
        """
        county = re.sub(", [A-Z]+", " county", county).lower()
        to_predict = [{
            'context': news,
            'qas': [{
                'question': 'Total deaths in ' + county,
                'id': '0'
            }]
        }]
        # NOTE(review): assumes model.predict returns a list of dicts with an
        # 'answer' key - confirm against the installed simpletransformers version.
        pre = self.model.predict(to_predict)
        cases = [prediction['answer'] for prediction in pre]
        print(cases)
        if not cases:
            return 0
        for i, raw in enumerate(cases):
            try:
                cases[i] = int(raw)
            except (ValueError, TypeError):
                # Not a plain digit string: fall back to parsing number words.
                # Only conversion errors are caught so KeyboardInterrupt /
                # SystemExit are no longer swallowed.
                cases[i] = w2n.word_to_num(raw)
        return cases

    def evaluate_model(self, data):
        """Evaluate the model on ``data`` and print the texts, then the result."""
        result, text = self.model.eval_model(data)
        print(text)
        print(result)
def train():
    """Command-line entry point: train a QA model for one job and upload the results.

    Command-line options (unknown options are ignored):
      * ``--job-id``     - required; used as the bucket prefix and as the local
                           working directory for the job,
      * ``--model-type`` - model family, default ``bert``,
      * ``--model-name`` - pretrained model name, default ``bert-base-cased``.

    The job folder is downloaded from the ``question_answering`` bucket and
    must contain ``train_data.json`` and ``eval_data.json``. An optional
    ``config.json`` supplies the model args. After training, the local
    ``outputs`` and ``runs`` folders are uploaded under ``<job-id>/model``.
    """
    parser = argparse.ArgumentParser()
    # Every later step joins paths onto the job id, so fail fast with a clear
    # usage error instead of a TypeError deep inside the run.
    parser.add_argument('--job-id', required=True, help='Training file name')
    parser.add_argument('--model-type', default='bert',
                        help='Type of model, eg: BERT, XLM, ROBERTA')
    parser.add_argument('--model-name', default='bert-base-cased',
                        help='Name or path of the pretrained model, eg: bert-base-cased')
    arguments, _ = parser.parse_known_args()

    bucket = connect_to_storage('question_answering')
    download_folder_structure_from_bucket(bucket, arguments.job_id, arguments.job_id)

    with open(posixpath.join(arguments.job_id, 'train_data.json'), 'r') as f:
        train_data = json.load(f)
    with open(posixpath.join(arguments.job_id, 'eval_data.json'), 'r') as f:
        eval_data = json.load(f)

    # config.json is optional; without it the model is built with args=None.
    config_path = posixpath.join(arguments.job_id, 'config.json')
    if os.path.isfile(config_path):
        with open(config_path, 'r') as f:
            args = json.load(f)
    else:
        args = None

    output_dir = posixpath.join(arguments.job_id, 'model')
    os.makedirs(output_dir, exist_ok=True)

    # Imported here, as in the original, so the library is only loaded once
    # the inputs have been fetched.
    from simpletransformers.question_answering import QuestionAnsweringModel
    model = QuestionAnsweringModel(arguments.model_type, arguments.model_name,
                                   args=args, use_cuda=True)
    model.train_model(train_data, eval_data=eval_data)

    upload_folder_to_bucket(bucket, 'outputs', output_dir, recursive_upload=True)
    upload_folder_to_bucket(bucket, 'runs', output_dir, recursive_upload=True)
'learning_rate': 3e-5, 'num_train_epochs': 2, 'max_seq_length': 384, 'doc_stride': 128, 'overwrite_output_dir': True, 'reprocess_input_data': False, 'train_batch_size': 2, 'gradient_accumulation_steps': 8, } model = QuestionAnsweringModel('camembert', 'camembert-base', args=train_args, use_cuda=True) model.train_model(train_data) # with open('data/dev-v2.0.json', 'r') as f: # dev_data = json.load(f) # dev_data = [item for topic in dev_data['data'] for item in topic['paragraphs']] # preds = model.predict(dev_data) # os.makedirs('results', exist_ok=True) # submission = {pred['id']: pred['answer'] for pred in preds} # with open('results/submission.json', 'w') as f: # json.dump(submission, f)
def test_question_answering(model_type, model_name):
    """Smoke-test a QuestionAnsweringModel: train, evaluate and predict on dummy data.

    The dummy examples are written to ``data/train.json``; the model is built
    with ``no_save`` and runs on CPU.

    Args:
        model_type: model family passed to ``QuestionAnsweringModel``.
        model_name: pretrained model name passed to ``QuestionAnsweringModel``.
    """
    # Create dummy data to use for training.
    # The long context is written as adjacent literals (not a backslash
    # continuation) so no stray characters end up inside the string and the
    # answer_start offsets below can be checked against the text.
    train_data = [
        {
            "context": "This is the first context",
            "qas": [
                {
                    "id": "00001",
                    "is_impossible": False,
                    "question": "Which context is this?",
                    "answers": [{"text": "the first", "answer_start": 8}],
                }
            ],
        },
        {
            "context": (
                "Other legislation followed, including the Migratory Bird"
                " Conservation Act of 1929, a 1937 treaty prohibiting the hunting of"
                " right and gray whales, and the Bald Eagle Protection Act of 1940."
                " These later laws had a low cost to society—the species were"
                " relatively rare—and little opposition was raised"
            ),
            "qas": [
                {
                    "id": "00002",
                    "is_impossible": False,
                    "question": "What was the cost to society?",
                    # "low cost" starts at character 213 of the context.
                    "answers": [{"text": "low cost", "answer_start": 213}],
                },
                {
                    "id": "00003",
                    "is_impossible": False,
                    "question": "What was the name of the 1937 treaty?",
                    # "Bald Eagle Protection Act" starts at character 155.
                    "answers": [{"text": "Bald Eagle Protection Act", "answer_start": 155}],
                },
                {
                    "id": "00004",
                    "is_impossible": True,
                    "question": "How did Alexandar Hamilton die?",
                    "answers": [],
                },
            ],
        },
    ]

    # Doubling four times, i.e. 16 copies of the two contexts.
    train_data *= 16

    # Save as a JSON file
    os.makedirs("data", exist_ok=True)
    with open("data/train.json", "w") as f:
        json.dump(train_data, f)

    logging.basicConfig(level=logging.WARNING)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.ERROR)

    # Create the QuestionAnsweringModel
    model = QuestionAnsweringModel(
        model_type,
        model_name,
        args={"no_save": True, "reprocess_input_data": True, "overwrite_output_dir": True},
        use_cuda=False,
    )

    # Train the model
    model.train_model("data/train.json")

    # Evaluate the model. (Being lazy and evaluating on the train data itself)
    result, text = model.eval_model("data/train.json")

    # Making predictions using the model.
    to_predict = [
        {
            "context": "This is the context used for demonstrating predictions.",
            "qas": [{"question": "What is this context?", "id": "0"}],
        }
    ]
    model.predict(to_predict)
'evaluate_during_training_steps': num_steps, 'save_eval_checkpoints': False, 'save_model_every_epoch': False, 'save_steps': -1, #500000 'train_batch_size': batch_size, 'num_train_epochs': epochs } ###### only train on first 100 samples x_train = x_train[:100] x_test = x_test[:100] x_valid = x_valid[:100] ############# qa_model = QuestionAnsweringModel(model_family[model], model_exact_id[model], args=train_args, cuda_device=cuda) qa_model.train_model(x_train, eval_data=x_valid) qa_model = QuestionAnsweringModel(model_family[model], save_dir + 'best_model/', args=train_args, cuda_device=cuda) result, text = qa_model.eval_model(x_test) r = evaluate_results(text) print (r) ''' rf = open('results/dailydialog_qa.txt', 'a') rf.write(str(args) + '\n\n') rf.write(r + '\n' + '-'*40 + '\n') rf.close() rf = open(result_file, 'a') rf.write(str(args) + '\n\n')
"no_save": True, "manual_seed": 4, "max_seq_length": 512, "no_save": True, "n_best_size": 10, "lazy_loading": True, # "use_multiprocessing": False, } # Create the QuestionAnsweringModel model = QuestionAnsweringModel("bert", "bert-base-cased", args=train_args, use_cuda=True, cuda_device=0) # Train the model with JSON file model.train_model("data/train.jsonl", eval_data="data/train.json") # Making predictions using the model. to_predict = [{ "context": "Other legislation followed, including the Migratory Bird Conservation Act of 1929, a 1937 treaty prohibiting the hunting of right and gray whales,\ and the Bald Eagle Protection Act of 1940. These later laws had a low cost to society—the species were relatively rare—and little opposition was raised", "qas": [{ "question": "What was the name of the 1937 treaty?", "id": "0" }], }] print(model.predict(to_predict, n_best_size=2))
def test_question_answering():
    """Smoke-test DistilBERT question answering: train, evaluate and predict.

    Two dummy contexts are written to ``data/train.json``; the model is built
    with ``no_save`` and runs on CPU.
    """
    # Create dummy data to use for training.
    train_data = [
        {
            "context": "This is the first context",
            "qas": [{
                "id": "00001",
                "is_impossible": False,
                "question": "Which context is this?",
                "answers": [{
                    "text": "the first",
                    "answer_start": 8
                }],
            }],
        },
        {
            "context":
            "Other legislation followed, including the Migratory Bird"
            " Conservation Act of 1929, a 1937 treaty prohibiting the hunting of"
            " right and gray whales, and the Bald Eagle Protection Act of 1940."
            " These later laws had a low cost to society—the species were"
            " relatively rare—and little opposition was raised",
            "qas": [
                {
                    "id": "00002",
                    "is_impossible": False,
                    "question": "What was the cost to society?",
                    "answers": [{
                        "text": "low cost",
                        # Character offset of "low cost" in the context above
                        # (the previous value, 225, pointed at "society").
                        "answer_start": 213
                    }],
                },
                {
                    "id": "00003",
                    "is_impossible": False,
                    "question": "What was the name of the 1937 treaty?",
                    "answers": [{
                        "text": "Bald Eagle Protection Act",
                        # Character offset of "Bald Eagle Protection Act"
                        # (the previous value, 167, fell inside the phrase).
                        "answer_start": 155
                    }],
                },
            ],
        },
    ]

    # Save as a JSON file
    os.makedirs("data", exist_ok=True)
    with open("data/train.json", "w") as f:
        json.dump(train_data, f)

    # Create the QuestionAnsweringModel
    model = QuestionAnsweringModel(
        "distilbert",
        "distilbert-base-uncased-distilled-squad",
        args={
            "no_save": True,
            "reprocess_input_data": True,
            "overwrite_output_dir": True
        },
        use_cuda=False,
    )

    # Train the model
    model.train_model("data/train.json")

    # Evaluate the model. (Being lazy and evaluating on the train data itself)
    result, text = model.eval_model("data/train.json")

    # Making predictions using the model.
    to_predict = [{
        "context": "This is the context used for demonstrating predictions.",
        "qas": [{
            "question": "What is this context?",
            "id": "0"
        }],
    }]
    model.predict(to_predict)
qas.append({'question': question, 'id': qid, 'is_impossible': False, 'answers': answers}) output.append({'context': context.lower(), 'qas': qas}) return output qa_test = do_qa_test(test) with open('/content/gdrive/My Drive/data/test.json', 'w') as outfile: json.dump(qa_test, outfile) MODEL_PATH = '/content/gdrive/My Drive/model_deeplearning/' #MODEL_PATH = 'https://drive.google.com/drive/folders/1CkjjRb6GJENfPQqfDJgVnzwipShmy4RE?usp=sharing' model = QuestionAnsweringModel('distilbert', MODEL_PATH, args={'reprocess_input_data': True, 'overwrite_output_dir': True, 'learning_rate': 5e-5, 'num_train_epochs': 3, 'max_seq_length': 192, 'doc_stride': 64, 'fp16': False, }, use_cuda=True) model.train_model('/content/gdrive/My Drive/data/train.json') predictions = model.predict(qa_test) predictions_df = pd.DataFrame.from_dict(predictions) sub_df['selected_text'] = predictions_df['answer'] sub_df.to_csv('/content/gdrive/My Drive/sample_submission.csv', index=False) print("File submitted successfully.") #test_df.head()
'learning_rate': 1e-5, 'num_train_epochs': 1, 'max_seq_length': 384, 'doc_stride': 128, 'overwrite_output_dir': True, 'reprocess_input_data': False, 'train_batch_size': 2, 'gradient_accumulation_steps': 8, 'save_model_every_epoch': False } model = QuestionAnsweringModel('bert', 'bert-base-cased', use_cuda=False, args=train_args) model.train_model(train_data, output_dir=None) #Prediction with open('dev-v2.0.json', 'r') as f: dev_data = json.load(f) dev_data = [item for topic in dev_data['data'] for item in topic['paragraphs']] preds = model.predict(dev_data) os.makedirs('results', exist_ok=True) submission = {pred['id']: pred['answer'] for pred in preds} with open('results/submission.json', 'w') as f: json.dump(submission, f)
# Save as a JSON file os.makedirs('data', exist_ok=True) with open('data/train.json', 'w') as f: json.dump(train_data, f) # Create the QuestionAnsweringModel model = QuestionAnsweringModel('distilbert', 'distilbert-base-uncased-distilled-squad', args={ 'reprocess_input_data': True, 'overwrite_output_dir': True }) # Train the model with JSON file model.train_model('data/train.json') # The list can also be used directly # model.train_model(train_data) # Evaluate the model. (Being lazy and evaluating on the train data itself) result, text = model.eval_model('data/train.json') print(result) print(text) print('-------------------') # Making predictions using the model. to_predict = [{ 'context': 'This is the context used for demonstrating predictions.',
qa_test # !pip install seqeval # !pip install transformers %%time from simpletransformers.question_answering import QuestionAnsweringModel model = QuestionAnsweringModel('distilbert', '/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/', args={'reprocess_input_data': True, 'overwrite_output_dir': True, 'learning_rate': 5e-5, 'num_train_epochs': 4, 'max_seq_length': 200, 'doc_stride': 64, 'fp16': False, }, use_cuda=True) model.train_model(qa_train) %%time preds = model.predict(qa_test) predic_df = pd.DataFrame.from_dict(preds) sub_df['selected_text'] = predic_df['answer'] sub_df.to_csv("submission.csv", sep=',', index=False) sub_df.head()
from simpletransformers.question_answering import QuestionAnsweringModel
import json
from transformers import BertModel

# Hyper-parameters for continuing the fine-tuning run.
train_args = dict(
    fp16=False,
    learning_rate=3e-5,
    num_train_epochs=2,
    max_seq_length=384,
    doc_stride=128,
    overwrite_output_dir=True,
    reprocess_input_data=False,
    train_batch_size=2,
    gradient_accumulation_steps=8,
)

# Resume from a previously saved FlauBERT checkpoint, on GPU.
model = QuestionAnsweringModel(
    'flaubert',
    './outputs/flaubert/checkpoint-2650-epoch-2/',
    args=train_args,
    use_cuda=True,
)

# Load the extra training set and flatten it from
# {'data': [{'paragraphs': [...]}, ...]} into one list of paragraph entries.
with open('./data/illiun/json_output/third_output.json', 'r') as f:
    sur_train_data = json.load(f)
sur_train_data = [
    paragraph
    for article in sur_train_data['data']
    for paragraph in article['paragraphs']
]

model.train_model(sur_train_data)
# Create the QuestionAnsweringModel model = QuestionAnsweringModel('bert', 'bert-base-uncased', use_cuda=args.use_cuda, args={ 'reprocess_input_data': True, 'overwrite_output_dir': True }) modelName = "%s_%s" % (args.data, "bert") # best_acc = 0 for epoch in range(args.epoch_nb): if args.enable_wdc: model.train_model('data/sgd/wdc.json') modelName = "%s_%s_wdc" % (args.data, "bert") model.train_model('data/sgd/%s-train.json' % args.data) # result, out = model.eval_model('data/sgd/sgd-dev-%s.json' % args.eval_mode) result, out = model.eval_model('data/sgd/%s-test.json' % args.data) slot_acc = result['correct'] / (result['correct'] + result['similar'] + result['incorrect']) if slot_acc > best_acc: best_acc = slot_acc print2file(args.out_dir, modelName, ".res", slot_acc, True) # save output to file with open('%s%s.out' % (args.out_dir, modelName), 'w') as f:
args.model] modelName = "%s_%s_%s_%s" % (args.model, pretrainFile.replace( "/", "-"), args.train, args.test) # Create the QuestionAnsweringModel model = QuestionAnsweringModel(args.model, pretrainFile, use_cuda=torch.cuda.is_available(), args={ 'reprocess_input_data': True, 'overwrite_output_dir': True }) # Train the model with JSON file model.train_model('data/dialog/%s.json' % args.train) # Evaluate the model. (Being lazy and evaluating on the train data itself) result, out = model.eval_model('data/dialog/%s.json' % args.test) print2file(args.out_dir, modelName, ".res", result, True) # save output to file with open('%s%s.out' % (args.out_dir, modelName), 'w') as f: json.dump(out, f) print('-------------------') # Making predictions using the model. # to_predict = [{'context': 'This is the context used for demonstrating predictions.', 'qas': [{'question': 'What is this context?', 'id': '0'}]}] # # print(model.predict(to_predict))
os.makedirs("data", exist_ok=True) with open("data/train.json", "w") as f: json.dump(train_data, f) # Create the QuestionAnsweringModel model = QuestionAnsweringModel( "distilbert", "distilbert-base-uncased-distilled-squad", args={ "reprocess_input_data": True, "overwrite_output_dir": True }, ) # Train the model model.train_model("data/train.json") # Evaluate the model. (Being lazy and evaluating on the train data itself) result, text = model.eval_model("data/train.json") print(result) print(text) print("-------------------") # Making predictions using the model. to_predict = [{ "context": "This is the context used for demonstrating predictions.", "qas": [{ "question": "What is this context?", "id": "0"
# Build the DistilBERT question-answering model (GPU, full precision).
model = QuestionAnsweringModel(
    'distilbert',
    'distilbert-base-uncased-distilled-squad',
    args=dict(
        reprocess_input_data=True,
        overwrite_output_dir=True,
        learning_rate=5e-5,
        num_train_epochs=3,
        max_seq_length=144,
        doc_stride=64,
        train_batch_size=80,
        fp16=False,
    ),
    use_cuda=True,
)

# Fine-tune on the prepared training file.
model.train_model('../working/train.json')

# Predict on the test questions and copy the answers into the submission frame.
predictions = model.predict(qa_test)
predictions_df = pd.DataFrame.from_dict(predictions)
sub_df['selected_text'] = predictions_df['answer']

# Write the submission file without the index column.
sub_df.to_csv('submission.csv', index=False)
print("File submitted successfully.")
json.dump(qa_test, outfile) MODEL_PATH = '/Users/DATA/Coding /Kaggle /tweet-sentiment-extraction/Distillbert_model' # Create the QuestionAnsweringModel model = QuestionAnsweringModel('distilbert', MODEL_PATH, args={ 'reprocess_input_data': True, 'overwrite_output_dir': True, 'learning_rate': 5e-5, 'num_train_epochs': 2, 'max_seq_length': 192, 'doc_stride': 64, 'fp16': False }, use_cuda=use_cuda) train_args = { 'learning_rate': 3e-5, 'num_train_epochs': 2, 'max_seq_length': 384, 'doc_stride': 128, 'overwrite_output_dir': True, 'reprocess_input_data': False, 'train_batch_size': 2, 'gradient_accumulation_steps': 8, } model = QuestionAnsweringModel('bert', 'bert-base-cased', args=train_args) model.train_model(train)
# Training parameters; given below is the default setting used for BERT large models.
# For BERT large you need at least 4 GPUs with 11 GB of GPU memory.
# For BERT base models one GPU is sufficient for a batch size of 32.
# Make the learning rate higher if you train with a larger batch size.
model_args = {
    "train_batch_size": 8,
    "n_gpu": 4,
    "eval_batch_size": 64,
    'max_answer_length': 50,
    'num_train_epochs': 6,
    'output_dir': "./output/",
    'best_model_dir': './output/best_model/',
    'evaluate_during_training': True,
    'fp16': False,
    'use_cached_eval_features': True,
    'save_eval_checkpoints': False,
    'save_model_every_epoch': False,
    'max_seq_length': 384,
    'doc_stride': 128,
    'do_lower_case': True,
    'gradient_accumulation_steps': 1,
    'learning_rate': 2e-05
}

# If you want to fine-tune a locally saved model, or continue training a
# previously saved model, give the location of the directory where the model is.
# model = QuestionAnsweringModel('bert', './models/bert-large-squad-docvqa-finetuned/', args=model_args)

# If you want to fine-tune a pretrained model from the pytorch transformers
# model zoo (https://huggingface.co/transformers/pretrained_models.html), you
# can directly give the model name; the pretrained model will be downloaded
# first to a cache dir.
# Here the model we are fine-tuning is bert-large-cased-whole-word-masking-finetuned-squad.
model = QuestionAnsweringModel('bert', 'bert-large-cased-whole-word-masking-finetuned-squad', args=model_args)

print(model.args)
# import pdb; pdb.set_trace()

# load train and val data
with open('./data_in_squad_format/docvqa_train_squad_format.json') as f:
    train_data = json.load(f)
with open('./data_in_squad_format/docvqa_val_squad_format.json') as f:
    dev_data = json.load(f)

# train (the stray trailing "~" after this call was a syntax error and is removed)
model.train_model(train_data, show_running_loss=False, eval_data=dev_data)