Ejemplo n.º 1
0
def main():
    """Fine-tune a DistilBERT QA model, evaluate it and dump dev predictions.

    Uses names defined elsewhere in the module: ``load_standard_dataset``,
    ``standard_train``, ``standard_dev``, ``train_args``, ``f1_multiclass``
    and ``accuracy_score``.

    Side effects: trains the model, prints the evaluation result and writes
    one JSON object per line to ``results/squad_predictions.json``.
    """
    # Local imports keep this function self-contained.
    import json
    import os

    # load data
    train_data = load_standard_dataset(standard_train)

    # ======================= instantiate model ====================
    # models are from https://huggingface.co/models?pipeline_tag=question-answering
    # model = QuestionAnsweringModel('roberta', 'csarron/roberta-large-squad-v1', args=train_args)
    # model = QuestionAnsweringModel('electra', 'mrm8488/electra-large-finetuned-squadv1', args=train_args)
    # model = QuestionAnsweringModel('albert', 'Wikidepia/albert-bahasa-cased-squad', args=train_args)
    # model = QuestionAnsweringModel('bert', 'bert-base-cased', args=train_args)
    model = QuestionAnsweringModel('distilbert',
                                   'distilbert-base-uncased-distilled-squad',
                                   args=train_args)
    # model = QuestionAnsweringModel('bert', 'trained_models/bert', args=train_args)  # run eval from pre-trained model

    # =========================== train model ======================
    model.train_model(train_data)

    # ========================= do evaluation ======================
    dev_data = load_standard_dataset(standard_dev)
    result, texts = model.eval_model(dev_data,
                                     f1=f1_multiclass,
                                     acc=accuracy_score)
    print(f'Result: {result}')

    # ========================= do predictions =====================
    answers, probabilities = model.predict(dev_data, n_best_size=1)
    preds = {pred['id']: pred['answer'] for pred in answers}

    # Make sure the output directory exists before opening the file.
    os.makedirs('results', exist_ok=True)
    with open('results/squad_predictions.json', 'w') as f:
        for qid, answer in preds.items():
            # json.dumps escapes quotes, backslashes and newlines, so each
            # line is valid JSON whatever the answer text contains. str()
            # keeps the textual form the previous f-string interpolation used.
            record = {'qid': str(qid), 'answer': str(answer)}
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
Ejemplo n.º 2
0
def train_squad(train_path=ST_TRAINING_JSON, squad_path=SQUAD_TRAINING_JSON):
    """Fine-tune the distilled-SQuAD DistilBERT model and return it.

    The training set comes from ``load_trainset(train_path,
    squad_path=squad_path)``. The model is created with input reprocessing
    and output-directory overwriting enabled.
    """
    # Load the data first so a bad path fails before the model is created.
    examples = load_trainset(train_path, squad_path=squad_path)

    model_args = {'reprocess_input_data': True, 'overwrite_output_dir': True}
    qa_model = QuestionAnsweringModel(
        'distilbert',
        'distilbert-base-uncased-distilled-squad',
        args=model_args,
    )
    qa_model.train_model(examples)
    return qa_model
def train_model(model_name):
    """Download or fine-tune the question-answering model ``model_name``.

    Only ``"bert"`` is implemented:

    * If no model exists yet under ``$MODEL_REPO/<model_name>``, the
      pretrained checkpoint is downloaded and saved there. No training
      happens on this call.
    * Otherwise the stored model is loaded, trained on the data returned by
      the ``$DB_API`` endpoint, and saved back to the same location.

    Args:
        model_name: Model identifier; also used as the sub-directory name
            inside the model repository.

    Returns:
        A ``(body, status)`` tuple: ``body`` is a JSON string with a
        ``message`` key and ``status`` is 200 on success or 404 for an
        unsupported model.

    Raises:
        KeyError: If ``MODEL_REPO`` (or ``DB_API`` when training) is not set.
    """
    if model_name != "bert":
        # Reject unsupported models up front, before touching the
        # environment or the database.
        return json.dumps(
            {'message': model_name + ' has not yet been implemented'},
            sort_keys=False,
            indent=4), 404

    # Location of this model inside the model repository.
    path = os.environ['MODEL_REPO'] + "/" + model_name

    if not os.path.exists(path):  # BERT has not been downloaded yet
        # Arguments used when instantiating the freshly downloaded model.
        train_args = {
            'learning_rate': 3e-5,
            'num_train_epochs': 2,
            'max_seq_length': 384,
            'doc_stride': 128,
            'overwrite_output_dir': True,
            'reprocess_input_data': False,
            'train_batch_size': 2,
            'gradient_accumulation_steps': 8,
        }
        # Download the pretrained BERT checkpoint.
        model = QuestionAnsweringModel(
            model_name,
            "mrm8488/bert-tiny-5-finetuned-squadv2",
            args=train_args,
            use_cuda=False)

        # Save it so the next request can train on it.
        model.save_model(path, model=model.model)
        return json.dumps(
            {
                'message':
                'BERT has been downloaded, in order to train request again'
            },
            sort_keys=False,
            indent=4), 200

    # A stored BERT model exists: fetch training data only now, because the
    # download branch above never uses it.
    api = os.environ['DB_API']
    response = requests.get(api)
    # NOTE(review): assumes the payload is a nested list whose [0][0]
    # element is the training set — confirm against the DB API.
    train_data = response.json()[0][0]

    model = QuestionAnsweringModel(model_name,
                                   path + "/",
                                   use_cuda=False)
    model.train_model(train_data)
    model.save_model(path, model=model.model)  # persist the updated model
    return json.dumps(
        {'message': 'Model has been trained on database train data'},
        sort_keys=False,
        indent=4), 200
Ejemplo n.º 4
0
class GetCases:
    """Extract per-county death counts from news text with a QA model.

    Wraps a DistilBERT ``QuestionAnsweringModel`` loaded from ``outputs/``
    and running on CPU.
    """

    def __init__(self):
        """Configure logging and load the model from ``outputs/``."""
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)
        self.model = QuestionAnsweringModel('distilbert',
                                            'outputs/',
                                            args={
                                                'reprocess_input_data': True,
                                                'overwrite_output_dir': True,
                                                'fp16': False
                                            },
                                            use_cuda=False)

    def train_model(self):
        """Train the model on the JSON training file at the hard-coded path."""
        with open('C:/Users/NathanGrant/Downloads/rona/rona/training_data.json'
                  ) as f:
            train_data = json.load(f)
        self.model.train_model(train_data)

    def predict(self, news, county):
        """Ask the model for the total deaths in ``county`` given ``news``.

        Args:
            news: Text used as the QA context.
            county: County label; a trailing ``", XX"`` upper-case suffix is
                replaced with ``" county"`` and the result is lower-cased.

        Returns:
            A list of integers, one per prediction, or ``0`` when the model
            returns no predictions.

        Raises:
            Whatever ``w2n.word_to_num`` raises when an answer is neither a
            numeral nor number words.
        """
        county = re.sub(r", [A-Z]+", " county", county).lower()
        to_predict = [{
            'context': news,
            'qas': [{
                'question': 'Total deaths in ' + county,
                'id': '0'
            }]
        }]
        # NOTE(review): assumes predict() returns an iterable of dicts with
        # an 'answer' key — confirm for the installed simpletransformers.
        pre = self.model.predict(to_predict)
        cases = [prediction['answer'] for prediction in pre]
        print(cases)
        if not cases:
            return 0
        for i, raw in enumerate(cases):
            try:
                cases[i] = int(raw)
            except (TypeError, ValueError):
                # Not a plain numeral: fall back to parsing number words.
                # Only conversion failures are caught, so unrelated errors
                # and KeyboardInterrupt still propagate.
                cases[i] = w2n.word_to_num(raw)
        return cases

    def evaluate_model(self, data):
        """Evaluate the model on ``data`` and print the texts and result."""
        result, text = self.model.eval_model(data)
        print(text)
        print(result)
Ejemplo n.º 5
0
def train():
    """Run one question-answering training job backed by bucket storage.

    Command-line arguments (unknown arguments are ignored):
        --job-id: Required. Name of the job folder, used both as the bucket
            prefix to download and as the local working directory.
        --model-type: Model family passed to ``QuestionAnsweringModel``.
        --model-name: Pretrained model name or path.

    The job folder must contain ``train_data.json`` and ``eval_data.json``.
    An optional ``config.json`` supplies the model args. After training,
    the local ``outputs`` and ``runs`` folders are uploaded to
    ``<job-id>/model`` in the bucket.
    """
    parser = argparse.ArgumentParser()

    # --job-id is used unconditionally as a path below, so it is required:
    # omitting it now gives a clear argparse error, not a later TypeError.
    parser.add_argument('--job-id', required=True, help='Training file name')
    parser.add_argument('--model-type',
                        default='bert',
                        help='Type of model, eg: BERT, XLM, ROBERTA')
    parser.add_argument('--model-name',
                        default='bert-base-cased',
                        help='Pretrained model name or path, '
                        'eg: bert-base-cased')

    arguments, _ = parser.parse_known_args()
    job_dir = arguments.job_id

    bucket = connect_to_storage('question_answering')
    download_folder_structure_from_bucket(bucket, job_dir, job_dir)

    with open(posixpath.join(job_dir, 'train_data.json'), 'r') as f:
        train_data = json.load(f)

    with open(posixpath.join(job_dir, 'eval_data.json'), 'r') as f:
        eval_data = json.load(f)

    # Optional per-job model arguments; None when the job has no config.
    config_path = posixpath.join(job_dir, 'config.json')
    if os.path.isfile(config_path):
        with open(config_path, 'r') as f:
            args = json.load(f)
    else:
        args = None

    output_dir = posixpath.join(job_dir, 'model')
    os.makedirs(output_dir, exist_ok=True)

    # Imported here, as in the original, not at module import time.
    from simpletransformers.question_answering import QuestionAnsweringModel

    model = QuestionAnsweringModel(arguments.model_type,
                                   arguments.model_name,
                                   args=args,
                                   use_cuda=True)
    model.train_model(train_data, eval_data=eval_data)

    # Upload the local 'outputs' and 'runs' folders to the job's model folder.
    upload_folder_to_bucket(bucket,
                            'outputs',
                            output_dir,
                            recursive_upload=True)
    upload_folder_to_bucket(bucket, 'runs', output_dir, recursive_upload=True)
Ejemplo n.º 6
0
    'learning_rate': 3e-5,
    'num_train_epochs': 2,
    'max_seq_length': 384,
    'doc_stride': 128,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'gradient_accumulation_steps': 8,
}

# Build a CamemBERT question-answering model from the 'camembert-base'
# checkpoint, configured by train_args, with use_cuda=True.
model = QuestionAnsweringModel('camembert',
                               'camembert-base',
                               args=train_args,
                               use_cuda=True)

# Fine-tune on train_data, which is defined outside this snippet.
model.train_model(train_data)

# with open('data/dev-v2.0.json', 'r') as f:
#     dev_data = json.load(f)

# dev_data = [item for topic in dev_data['data'] for item in topic['paragraphs']]

# preds = model.predict(dev_data)

# os.makedirs('results', exist_ok=True)

# submission = {pred['id']: pred['answer'] for pred in preds}

# with open('results/submission.json', 'w') as f:
#     json.dump(submission, f)
Ejemplo n.º 7
0
def test_question_answering(model_type, model_name):
    """Smoke-test train, eval and predict for one QA model variant.

    Builds a tiny SQuAD-style dataset and writes it to ``data/train.json``.
    It then trains and evaluates a CPU ``QuestionAnsweringModel`` on that
    file and runs a single prediction. Nothing is asserted; the test passes
    when no step raises.
    """
    # Dummy training data: two paragraphs, four questions in total.
    first_paragraph = {
        "context": "This is the first context",
        "qas": [
            {
                "id": "00001",
                "is_impossible": False,
                "question": "Which context is this?",
                "answers": [{"text": "the first", "answer_start": 8}],
            }
        ],
    }
    legislation_paragraph = {
        "context": "Other legislation followed, including the Migratory Bird Conservation Act of 1929, a 1937 treaty prohibiting the hunting of right and gray whales,\
                and the Bald Eagle Protection Act of 1940. These later laws had a low cost to society—the species were relatively rare—and little opposition was raised",
        "qas": [
            {
                "id": "00002",
                "is_impossible": False,
                "question": "What was the cost to society?",
                "answers": [{"text": "low cost", "answer_start": 225}],
            },
            {
                "id": "00003",
                "is_impossible": False,
                "question": "What was the name of the 1937 treaty?",
                "answers": [{"text": "Bald Eagle Protection Act", "answer_start": 167}],
            },
            {"id": "00004", "is_impossible": True, "question": "How did Alexandar Hamilton die?", "answers": []},
        ],
    }  # noqa
    train_data = [first_paragraph, legislation_paragraph]

    # Four successive doublings of the list, i.e. sixteen copies of the pair.
    train_data *= 16

    # Save as a JSON file
    train_path = "data/train.json"
    os.makedirs("data", exist_ok=True)
    with open(train_path, "w") as handle:
        json.dump(train_data, handle)

    logging.basicConfig(level=logging.WARNING)
    logging.getLogger("transformers").setLevel(logging.ERROR)

    # Create the QuestionAnsweringModel
    model_args = {"no_save": True, "reprocess_input_data": True, "overwrite_output_dir": True}
    model = QuestionAnsweringModel(model_type, model_name, args=model_args, use_cuda=False)

    # Train, then evaluate on the same file. The values are unused, but the
    # unpacking still requires eval_model to return a pair.
    model.train_model(train_path)
    _result, _text = model.eval_model(train_path)

    # Making predictions using the model.
    sample = {
        "context": "This is the context used for demonstrating predictions.",
        "qas": [{"question": "What is this context?", "id": "0"}],
    }
    model.predict([sample])
Ejemplo n.º 8
0
        'evaluate_during_training_steps': num_steps,
        'save_eval_checkpoints': False,
        'save_model_every_epoch': False,
        'save_steps': -1, #500000
        'train_batch_size': batch_size,
        'num_train_epochs': epochs
    }
    
    ###### debug truncation: keep only the first 100 samples of EVERY split
    ###### (train, test and validation), not just the training data
    x_train = x_train[:100]
    x_test = x_test[:100]
    x_valid = x_valid[:100]
    #############

    # Train a fresh model of the selected family, evaluating on x_valid.
    qa_model = QuestionAnsweringModel(model_family[model], model_exact_id[model], args=train_args, cuda_device=cuda)
    qa_model.train_model(x_train, eval_data=x_valid)
    
    # Reload from save_dir + 'best_model/' and evaluate on the test split.
    # NOTE(review): presumably the best checkpoint written during training —
    # confirm that train_args points best_model_dir at this path.
    qa_model = QuestionAnsweringModel(model_family[model], save_dir + 'best_model/', args=train_args, cuda_device=cuda)
    result, text = qa_model.eval_model(x_test)

    # evaluate_results is defined outside this snippet; its output is printed.
    r = evaluate_results(text)
    print (r)
    
    '''
    rf = open('results/dailydialog_qa.txt', 'a')
    rf.write(str(args) + '\n\n')
    rf.write(r + '\n' + '-'*40 + '\n')    
    rf.close()
    
    rf = open(result_file, 'a')
    rf.write(str(args) + '\n\n')
Ejemplo n.º 9
0
    "no_save": True,
    "manual_seed": 4,
    "max_seq_length": 512,
    "no_save": True,
    "n_best_size": 10,
    "lazy_loading": True,
    # "use_multiprocessing": False,
}

# Create the QuestionAnsweringModel: cased BERT base on CUDA device 0,
# configured by the train_args dict above.
model = QuestionAnsweringModel("bert",
                               "bert-base-cased",
                               args=train_args,
                               use_cuda=True,
                               cuda_device=0)

# Train the model from a file on disk.
# NOTE(review): training reads "data/train.jsonl" while evaluation reads
# "data/train.json" — confirm both files exist and the mismatch is intended.
model.train_model("data/train.jsonl", eval_data="data/train.json")

# Making predictions using the model.
# The backslash continuation below embeds the next line's leading spaces
# in the context string.
to_predict = [{
    "context":
    "Other legislation followed, including the Migratory Bird Conservation Act of 1929, a 1937 treaty prohibiting the hunting of right and gray whales,\
            and the Bald Eagle Protection Act of 1940. These later laws had a low cost to society—the species were relatively rare—and little opposition was raised",
    "qas": [{
        "question": "What was the name of the 1937 treaty?",
        "id": "0"
    }],
}]

# n_best_size=2 is passed to this call; train_args above sets it to 10.
print(model.predict(to_predict, n_best_size=2))
def test_question_answering():
    """Smoke-test train, eval and predict with the distilled-SQuAD DistilBERT.

    Writes a two-paragraph SQuAD-style dataset to ``data/train.json``,
    trains and evaluates a CPU model on it, then runs one prediction.
    Nothing is asserted; the test passes when no step raises.
    """
    # Dummy training data.
    legislation_context = (
        "Other legislation followed, including the Migratory Bird"
        " Conservation Act of 1929, a 1937 treaty prohibiting the hunting of"
        " right and gray whales, and the Bald Eagle Protection Act of 1940."
        " These later laws had a low cost to society—the species were"
        " relatively rare—and little opposition was raised"
    )
    train_data = [
        {
            "context": "This is the first context",
            "qas": [
                {
                    "id": "00001",
                    "is_impossible": False,
                    "question": "Which context is this?",
                    "answers": [{"text": "the first", "answer_start": 8}],
                },
            ],
        },
        {
            "context": legislation_context,
            "qas": [
                {
                    "id": "00002",
                    "is_impossible": False,
                    "question": "What was the cost to society?",
                    "answers": [{"text": "low cost", "answer_start": 225}],
                },
                {
                    "id": "00003",
                    "is_impossible": False,
                    "question": "What was the name of the 1937 treaty?",
                    "answers": [
                        {"text": "Bald Eagle Protection Act", "answer_start": 167},
                    ],
                },
            ],
        },
    ]

    # Save as a JSON file
    train_path = "data/train.json"
    os.makedirs("data", exist_ok=True)
    with open(train_path, "w") as handle:
        json.dump(train_data, handle)

    # Create the QuestionAnsweringModel
    model_args = {
        "no_save": True,
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
    }
    model = QuestionAnsweringModel(
        "distilbert",
        "distilbert-base-uncased-distilled-squad",
        args=model_args,
        use_cuda=False,
    )

    # Train, then evaluate on the same file. The values are unused, but the
    # unpacking still requires eval_model to return a pair.
    model.train_model(train_path)
    _result, _text = model.eval_model(train_path)

    # Making predictions using the model.
    sample = {
        "context": "This is the context used for demonstrating predictions.",
        "qas": [{"question": "What is this context?", "id": "0"}],
    }
    model.predict([sample])
Ejemplo n.º 11
0
        qas.append({'question': question, 'id': qid, 'is_impossible': False, 'answers': answers})
        output.append({'context': context.lower(), 'qas': qas})
    return output
# Convert the test set to QA format and persist it to Google Drive.
# NOTE(review): do_qa_test is presumably the function whose tail appears
# just above — confirm.
qa_test = do_qa_test(test)
with open('/content/gdrive/My Drive/data/test.json', 'w') as outfile:
    json.dump(qa_test, outfile)

# Local (mounted Drive) directory the DistilBERT weights are loaded from.
MODEL_PATH = '/content/gdrive/My Drive/model_deeplearning/'
#MODEL_PATH = 'https://drive.google.com/drive/folders/1CkjjRb6GJENfPQqfDJgVnzwipShmy4RE?usp=sharing'
model = QuestionAnsweringModel('distilbert', 
                               MODEL_PATH, 
                               args={'reprocess_input_data': True,
                                     'overwrite_output_dir': True,
                                     'learning_rate': 5e-5,
                                     'num_train_epochs': 3,
                                     'max_seq_length': 192,
                                     'doc_stride': 64,
                                     'fp16': False,
                                    },
                              use_cuda=True)

# Fine-tune on the training JSON, then predict on the converted test set.
model.train_model('/content/gdrive/My Drive/data/train.json')
predictions = model.predict(qa_test)
# NOTE(review): assumes predict() returns records with an 'answer' field
# that DataFrame.from_dict can tabulate — confirm for the installed version.
predictions_df = pd.DataFrame.from_dict(predictions)

# sub_df is defined outside this snippet; answers are aligned by index.
sub_df['selected_text'] = predictions_df['answer']

sub_df.to_csv('/content/gdrive/My Drive/sample_submission.csv', index=False)

print("File submitted successfully.")
#test_df.head()

Ejemplo n.º 12
0
    'learning_rate': 1e-5,
    'num_train_epochs': 1,
    'max_seq_length': 384,
    'doc_stride': 128,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'gradient_accumulation_steps': 8,
    'save_model_every_epoch': False
}

# Cased BERT base on CPU, configured by the train_args dict above.
model = QuestionAnsweringModel('bert',
                               'bert-base-cased',
                               use_cuda=False,
                               args=train_args)
# output_dir=None is passed explicitly.
# NOTE(review): presumably this falls back to the output directory from the
# model args — confirm.
model.train_model(train_data, output_dir=None)

# Prediction: load the SQuAD v2.0 dev set.
with open('dev-v2.0.json', 'r') as f:
    dev_data = json.load(f)

# Flatten data -> paragraphs into a single list of paragraph dicts.
dev_data = [item for topic in dev_data['data'] for item in topic['paragraphs']]

preds = model.predict(dev_data)

os.makedirs('results', exist_ok=True)

# Map question id -> predicted answer.
# NOTE(review): assumes predict() returns an iterable of dicts with 'id'
# and 'answer' keys — confirm for the installed version.
submission = {pred['id']: pred['answer'] for pred in preds}

with open('results/submission.json', 'w') as f:
    json.dump(submission, f)
Ejemplo n.º 13
0
# Save train_data (defined outside this snippet) as a JSON file under data/.
os.makedirs('data', exist_ok=True)
with open('data/train.json', 'w') as f:
    json.dump(train_data, f)

# Create the QuestionAnsweringModel: DistilBERT distilled on SQuAD, with
# input reprocessing and output-directory overwriting enabled.
model = QuestionAnsweringModel('distilbert',
                               'distilbert-base-uncased-distilled-squad',
                               args={
                                   'reprocess_input_data': True,
                                   'overwrite_output_dir': True
                               })

# Train the model with JSON file
model.train_model('data/train.json')

# The list can also be used directly
# model.train_model(train_data)

# Evaluate the model. (Being lazy and evaluating on the train data itself)
result, text = model.eval_model('data/train.json')

print(result)
print(text)

print('-------------------')

# Making predictions using the model.
to_predict = [{
    'context': 'This is the context used for demonstrating predictions.',
Ejemplo n.º 14
0
qa_test

# !pip install seqeval
# !pip install transformers

%%time

from simpletransformers.question_answering import QuestionAnsweringModel

# Load DistilBERT from a local Kaggle input directory (not the model hub)
# and fine-tune it on qa_train, which is defined outside this cell.
model = QuestionAnsweringModel('distilbert', 
                               '/kaggle/input/transformers-pretrained-distilbert/distilbert-base-uncased-distilled-squad/', 
                               args={'reprocess_input_data': True,
                                     'overwrite_output_dir': True,
                                     'learning_rate': 5e-5,
                                     'num_train_epochs': 4,
                                     'max_seq_length': 200,
                                     'doc_stride': 64,
                                     'fp16': False,
                                    },
                              use_cuda=True)
model.train_model(qa_train)

%%time

# Predict on the test set and write the answers into the submission frame.
# NOTE(review): assumes predict() returns records with an 'answer' field
# that DataFrame.from_dict can tabulate — confirm for the installed version.
preds = model.predict(qa_test)
predic_df = pd.DataFrame.from_dict(preds)
# sub_df is defined outside this cell; answers are aligned by index.
sub_df['selected_text'] = predic_df['answer']
sub_df.to_csv("submission.csv", sep=',', index=False)

# Notebook display of the first rows.
sub_df.head()
Ejemplo n.º 15
0
from simpletransformers.question_answering import QuestionAnsweringModel
import json

from transformers import BertModel

# Training configuration passed to the model below.
train_args = {
    'fp16': False,
    'learning_rate': 3e-5,
    'num_train_epochs': 2,
    'max_seq_length': 384,
    'doc_stride': 128,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'gradient_accumulation_steps': 8,
}

# Load FlauBERT from a previously saved local checkpoint directory.
model = QuestionAnsweringModel('flaubert',
                               './outputs/flaubert/checkpoint-2650-epoch-2/',
                               args=train_args,
                               use_cuda=True)

# Load the additional training set.
with open('./data/illiun/json_output/third_output.json', 'r') as f:
    sur_train_data = json.load(f)

# Flatten data -> paragraphs into a single list of paragraph dicts.
sur_train_data = [
    item for topic in sur_train_data['data'] for item in topic['paragraphs']
]

# Continue training the loaded checkpoint on the flattened data.
model.train_model(sur_train_data)
Ejemplo n.º 16
0
    # Create the QuestionAnsweringModel (uncased BERT base); the device
    # choice comes from args.use_cuda.
    model = QuestionAnsweringModel('bert',
                                   'bert-base-uncased',
                                   use_cuda=args.use_cuda,
                                   args={
                                       'reprocess_input_data': True,
                                       'overwrite_output_dir': True
                                   })
    # Base name for the result files written below.
    modelName = "%s_%s" % (args.data, "bert")

    # Train for args.epoch_nb rounds, tracking the best slot accuracy so far.
    best_acc = 0
    for epoch in range(args.epoch_nb):

        # Optional extra pass over wdc.json before the main data; this
        # repeats every round and switches the output name to "<data>_bert_wdc".
        if args.enable_wdc:
            model.train_model('data/sgd/wdc.json')
            modelName = "%s_%s_wdc" % (args.data, "bert")

        model.train_model('data/sgd/%s-train.json' % args.data)
        # result, out = model.eval_model('data/sgd/sgd-dev-%s.json' % args.eval_mode)
        # NOTE(review): the best round is chosen on the *test* file, not a
        # dev split — confirm this is intended.
        result, out = model.eval_model('data/sgd/%s-test.json' % args.data)

        # Accuracy = correct / (correct + similar + incorrect).
        slot_acc = result['correct'] / (result['correct'] + result['similar'] +
                                        result['incorrect'])

        if slot_acc > best_acc:
            best_acc = slot_acc

            # print2file is defined outside this snippet.
            print2file(args.out_dir, modelName, ".res", slot_acc, True)
            # save output to file
            with open('%s%s.out' % (args.out_dir, modelName), 'w') as f:
    args.model]

# Output name: "<model>_<pretrain file, '/' replaced by '-'>_<train>_<test>".
modelName = "%s_%s_%s_%s" % (args.model, pretrainFile.replace(
    "/", "-"), args.train, args.test)

# Create the QuestionAnsweringModel; CUDA is used when available.
model = QuestionAnsweringModel(args.model,
                               pretrainFile,
                               use_cuda=torch.cuda.is_available(),
                               args={
                                   'reprocess_input_data': True,
                                   'overwrite_output_dir': True
                               })

# Train the model with JSON file
model.train_model('data/dialog/%s.json' % args.train)

# Evaluate the model on the file named by args.test.
result, out = model.eval_model('data/dialog/%s.json' % args.test)

# print2file is defined outside this snippet.
print2file(args.out_dir, modelName, ".res", result, True)
# save output to file
with open('%s%s.out' % (args.out_dir, modelName), 'w') as f:
    json.dump(out, f)

print('-------------------')

# Making predictions using the model.
# to_predict = [{'context': 'This is the context used for demonstrating predictions.', 'qas': [{'question': 'What is this context?', 'id': '0'}]}]
#
# print(model.predict(to_predict))
# Save train_data (defined outside this snippet) as a JSON file under data/.
os.makedirs("data", exist_ok=True)
with open("data/train.json", "w") as f:
    json.dump(train_data, f)

# Create the QuestionAnsweringModel: DistilBERT distilled on SQuAD, with
# input reprocessing and output-directory overwriting enabled.
model = QuestionAnsweringModel(
    "distilbert",
    "distilbert-base-uncased-distilled-squad",
    args={
        "reprocess_input_data": True,
        "overwrite_output_dir": True
    },
)

# Train the model
model.train_model("data/train.json")

# Evaluate the model. (Being lazy and evaluating on the train data itself)
result, text = model.eval_model("data/train.json")

print(result)
print(text)

print("-------------------")

# Making predictions using the model.
to_predict = [{
    "context": "This is the context used for demonstrating predictions.",
    "qas": [{
        "question": "What is this context?",
        "id": "0"
Ejemplo n.º 19
0


# Create the QuestionAnsweringModel: DistilBERT distilled on SQuAD, on GPU,
# with a training batch size of 80 and max_seq_length of 144.
model = QuestionAnsweringModel('distilbert', 
			       'distilbert-base-uncased-distilled-squad',
                               args={'reprocess_input_data': True,
                                     'overwrite_output_dir': True,
                                     'learning_rate': 5e-5,
                                     'num_train_epochs': 3,
                                     'max_seq_length': 144,
                                     'doc_stride': 64,
                                     'train_batch_size': 80,
                                     'fp16': False,

                                    },
                                use_cuda=True)

# Fine-tune on the prepared training JSON.
model.train_model('../working/train.json')

# Predict on qa_test (defined outside this snippet) and tabulate the result.
# NOTE(review): assumes predict() returns records with an 'answer' field
# that DataFrame.from_dict can tabulate — confirm for the installed version.
predictions = model.predict(qa_test)
predictions_df = pd.DataFrame.from_dict(predictions)

# sub_df is defined outside this snippet; answers are aligned by index.
sub_df['selected_text'] = predictions_df['answer']

sub_df.to_csv('submission.csv', index=False)

print("File submitted successfully.")


    json.dump(qa_test, outfile)

# Local directory holding the DistilBERT weights. The spaces inside the
# path components ('Coding /Kaggle /') are part of the literal path.
MODEL_PATH = '/Users/DATA/Coding /Kaggle /tweet-sentiment-extraction/Distillbert_model'

# Create the QuestionAnsweringModel from the local weights; use_cuda is a
# variable defined outside this snippet.
model = QuestionAnsweringModel('distilbert',
                               MODEL_PATH,
                               args={
                                   'reprocess_input_data': True,
                                   'overwrite_output_dir': True,
                                   'learning_rate': 5e-5,
                                   'num_train_epochs': 2,
                                   'max_seq_length': 192,
                                   'doc_stride': 64,
                                   'fp16': False
                               },
                               use_cuda=use_cuda)
# Training configuration passed to the BERT model below.
train_args = {
    'learning_rate': 3e-5,
    'num_train_epochs': 2,
    'max_seq_length': 384,
    'doc_stride': 128,
    'overwrite_output_dir': True,
    'reprocess_input_data': False,
    'train_batch_size': 2,
    'gradient_accumulation_steps': 8,
}

# Cased BERT base; `train` (the training data) is defined outside this snippet.
model = QuestionAnsweringModel('bert', 'bert-base-cased', args=train_args)
model.train_model(train)
Ejemplo n.º 21
0
# Training parameters; given below is the default setting used for BERT large models.
# For BERT large you need at least 4 GPUs with 11 GB of GPU memory each.
# For BERT base models one GPU is sufficient for a batch size of 32.
# Make the learning rate higher if you train with a larger batch size.

model_args = {"train_batch_size": 8, "n_gpu":4, "eval_batch_size": 64, 'max_answer_length': 50,  'num_train_epochs': 6, 'output_dir': "./output/", 'best_model_dir': './output/best_model/', 'evaluate_during_training': True, 'fp16': False, 'use_cached_eval_features':True, 'save_eval_checkpoints': False, 'save_model_every_epoch': False, 'max_seq_length': 384, 'doc_stride': 128, 'do_lower_case': True, 'gradient_accumulation_steps': 1, 'learning_rate': 2e-05 }


# To fine-tune a locally saved model, or to continue training a previously
# saved one, pass the directory that holds the model:
#model = QuestionAnsweringModel('bert', './models/bert-large-squad-docvqa-finetuned/', args=model_args)


# To fine-tune a pretrained model from the transformers model zoo
# (https://huggingface.co/transformers/pretrained_models.html), pass the model
# name directly; the pretrained model is downloaded to a cache dir first.
# Here the model being fine-tuned is bert-large-cased-whole-word-masking-finetuned-squad.
# NOTE(review): model_args sets 'do_lower_case': True while this checkpoint
# is a cased model — confirm that is intended.
model = QuestionAnsweringModel('bert', 'bert-large-cased-whole-word-masking-finetuned-squad', args=model_args)

print (model.args)
#import pdb; pdb.set_trace()

# Load train and validation data (already in SQuAD format).
with open('./data_in_squad_format/docvqa_train_squad_format.json') as f:
  train_data = json.load(f)

with open('./data_in_squad_format/docvqa_val_squad_format.json') as f:
  dev_data = json.load(f)

# Train, evaluating on dev_data during training
# ('evaluate_during_training' is True in model_args).
model.train_model(train_data, show_running_loss= False, eval_data=dev_data)
~