Beispiel #1
0
def make_prediction(Description_A, Description_B, model_path, USE_GPU=True):
    # Loading tokenized using a stored Pickle Object, as it is more reliable
    # In case you'd like to create the object, you can do so here: bert_tokenizer = BertTokenizer.from_pretrained("bert-large-uncased", do_lower_case=True)

    pickle_off = open("tokenizer.pkl", "rb")
    tokenizer = pickle.load(pickle_off)

    # Tokenizes the words into format required by Bert ex: "I am playing" -> ["I","am","play","##ing"]
    hypothesis = tokenizer.tokenize(Description_A)

    #If sequence is too long it truncates it to ensure it fits into BERT's max seq len and changes the words into numbers
    if len(hypothesis) > 512 - 3:
        hypothesis = hypothesis[:512 - 3]
    input_ids = tokenizer.convert_tokens_to_ids(
        ['[CLS]'] + hypothesis + ['[SEP]'])

    #Determnines what sentence it's in, doesn't really matter for single sentence, but important for 2 sentence classification
    type_ids = [0] * (len(hypothesis) + 2)

    #Concatenates all the important labels into a dictionary
    #UID : id number (no importance) ; label: "ground truth" (no importance when making a prediction)
    # token_id: representation of words ; type_id: position within a sentence
    features = {'uid': 0, 'label': 0,
                'token_id': input_ids, 'type_id': type_ids}

    # Loads data into a BatchGen object which is needed for making a prediction, nothing needed to change here
    dev_data = BatchGen([features],
                        batch_size=8,
                        gpu=True, is_train=False,
                        task_id=0,
                        maxlen=512,
                        pairwise=False,
                        data_type=0,
                        task_type=0)

    # function to convert token ids back to words
    print(tokenizer.convert_ids_to_tokens([101, 100, 5208, 2024, 17662, 9119, 2096, 3173, 2000, 2175, 14555, 2044, 2074, 5983, 6265, 1012, 102, 100, 2308, 2024, 23581, 2096, 3173, 2000, 2175, 14555, 1012, 102]))

    #hyper parameters: whatever is necessary is added as variables at the top
    opt = {'init_checkpoint': model_path, 'data_dir': 'data/domain_adaptation', 'data_sort_on': False, 'name': 'farmer', 'train_datasets': ['sst'], 'test_datasets': ['sst'], 'pw_tasks': ['qnnli'], 'update_bert_opt': 0, 'multi_gpu_on': False, 'mem_cum_type': 'simple', 'answer_num_turn': 5, 'answer_mem_drop_p': 0.1, 'answer_att_hidden_size': 128, 'answer_att_type': 'bilinear', 'answer_rnn_type': 'gru', 'answer_sum_att_type': 'bilinear', 'answer_merge_opt': 1, 'answer_mem_type': 1, 'answer_dropout_p': 0.1, 'answer_weight_norm_on': False, 'dump_state_on': False, 'answer_opt': [
        0], 'label_size': '2', 'mtl_opt': 0, 'ratio': 0, 'mix_opt': 0, 'max_seq_len': 512, 'init_ratio': 1, 'cuda': USE_GPU, 'log_per_updates': 500, 'epochs': 5, 'batch_size': 32, 'batch_size_eval': 8, 'optimizer': 'adamax', 'grad_clipping': 0.0, 'global_grad_clipping': 1.0, 'weight_decay': 0, 'learning_rate': 5e-05, 'momentum': 0, 'warmup': 0.1, 'warmup_schedule': 'warmup_linear', 'vb_dropout': True, 'dropout_p': 0.1, 'dropout_w': 0.0, 'bert_dropout_p': 0.1, 'ema_opt': 0, 'ema_gamma': 0.995, 'have_lr_scheduler': True, 'multi_step_lr': '10,20,30', 'freeze_layers': -1, 'embedding_opt': 0, 'lr_gamma': 0.5, 'bert_l2norm': 0.0, 'scheduler_type': 'ms', 'output_dir': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19', 'seed': 2018, 'task_config_path': 'configs/tasks_config.json', 'tasks_dropout_p': [0.1]}
    state_dict = torch.load(model_path)
    config = state_dict['config']
    config['attention_probs_dropout_prob'] = 0.1
    config['hidden_dropout_prob'] = 0.1
    opt.update(config)
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=50)

    #actual prediction to be made: main outputs are predictions which is a list of size 1, and scores which is confidence in prediction for each class
    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
        model, dev_data, 0,use_cuda=True, with_label =False)
#model, data, metric_meta, use_cuda=True, with_label=True
    return dev_predictions, scores
Beispiel #2
0
def main():
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}
    tasks_class = {}
    nclass_list = []
    decoder_opts = []
    dropout_list = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = task_defs.task_type_map[prefix]
        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True

        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)

        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)

        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path,
                                            True,
                                            pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type,
                              encoder_type=encoder_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            task_defs.
            n_class_map[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = task_defs.task_type_map[prefix]

        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True

        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path,
                                              False,
                                              pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda,
                                is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type,
                                encoder_type=encoder_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path,
                                               False,
                                               pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda,
                                 is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type,
                                 encoder_type=encoder_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]

    # div number of grad accumulation.
    num_all_batches = args.epochs * sum(
        all_lens) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of step: {}'.format(args.epochs * sum(all_lens)))
    logger.info('number of grad grad_accumulation step: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) *
                                             (1 + args.ratio)))

    bert_model_path = args.init_checkpoint
    state_dict = None

    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path)
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error(
                'Could not find the init model!\n The parameters will be initialized randomly!'
            )
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        all_indices = []
        if len(train_data_list) > 1 and args.ratio > 0:
            main_indices = [0] * len(train_data_list[0])
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks = int(
                min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices,
                                             random_picks,
                                             replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()

        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.local_updates) % (args.log_per_updates *
                                        args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                ramaining_time = str((datetime.now() - start) / (i + 1) *
                                     (len(all_indices) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            ramaining_time))

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                    model,
                    dev_data,
                    metric_meta=task_defs.metric_meta_map[prefix],
                    use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    logger.warning(
                        'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                            dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                    model,
                    test_data,
                    metric_meta=task_defs.metric_meta_map[prefix],
                    use_cuda=args.cuda,
                    with_label=False)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
Beispiel #3
0
def get_confidence(words, chars):
    dev_data = BatchGen(words, is_train=False, gpu=True,
                        batch_size=1, maxlen=512)
    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
       model, dev_data, 0, use_cuda=True, with_label = False)
    return max(scores)
Beispiel #4
0
# load data
test_data = BatchGen(BatchGen.load(args.prep_input, False, pairwise=pw_task, maxlen=args.max_seq_len),
                     batch_size=args.batch_size_eval,
                     gpu=args.cuda, is_train=False,
                     task_id=args.task_id,
                     maxlen=args.max_seq_len,
                     pairwise=pw_task,
                     data_type=data_type,
                     task_type=task_type)

# load model
checkpoint_path = args.checkpoint
assert os.path.exists(checkpoint_path)
if args.cuda:
    state_dict = torch.load(checkpoint_path)
else:
    state_dict = torch.load(checkpoint_path, map_location="cpu")
config = state_dict['config']
config["cuda"] = args.cuda
model = MTDNNModel(config, state_dict=state_dict)

test_metrics, test_predictions, scores, golds, test_ids = eval_model(model, test_data,
                                                                     metric_meta=metric_meta,
                                                                     use_cuda=args.cuda, with_label=args.with_label)

results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores}
dump(args.score, results)
if args.with_label:
    print(test_metrics)