def main():
    """Extract per-layer encoder embeddings with a trained MT-DNN checkpoint
    and dump them, one JSON object per sample, to ``args.foutput``.
    """
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    # Encoder layers whose hidden states are dumped, e.g. "--layers 0,11".
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)

    # process data
    data, is_single_sentence = process_data(args)
    data_type = DataFormat.PremiseOnly if is_single_sentence else DataFormat.PremiseAndOneHypothesis
    collater = Collater(gpu=args.cuda, is_train=False, data_type=data_type)
    batcher = DataLoader(data,
                         batch_size=args.batch_size,
                         collate_fn=collater.collate_fn,
                         pin_memory=args.cuda)
    opt = vars(args)

    # load model
    if os.path.exists(args.checkpoint):
        # BUG FIX: map GPU-trained weights onto CPU when CUDA is unavailable;
        # a plain torch.load() of a GPU checkpoint fails on CPU-only hosts.
        state_dict = torch.load(args.checkpoint,
                                map_location=None if args.cuda else 'cpu')
        config = state_dict['config']
        config['dump_feature'] = True
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        # Random weights would produce meaningless features; abort.
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(
        opt, state_dict=state_dict, num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [all_encoder_layers[idx].detach().cpu().numpy()
                      for idx in layer_indexes]
        uids = batch_meta['uids']
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            # slen = number of real (non-padding) tokens for this sample.
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                # Stored as a stringified list to keep the JSON line compact.
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # save features, one JSON object per input sample
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            tokens = sample['tokens']
            feature = features_dict[uid]
            feature['tokens'] = tokens
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))
def main():
    """Run a trained MT-DNN model over a test split and write the official
    score file via ``submit``.
    """
    logger.info('MT-DNN predicting')
    opt = vars(args)
    batch_size = args.batch_size
    test_path = os.path.join(args.data_dir, args.test_file)
    official_score_file = os.path.join(output_dir, args.ouput_path)
    model_path = args.init_checkpoint
    state_dict = None
    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        config = state_dict['config']
        opt.update(config)
    model = MTDNNModel(opt, state_dict=state_dict)

    # Task type: the dataset prefix (e.g. "mnli") decides whether this is a
    # pairwise-ranking task.
    # BUG FIX: the original split on '\\', which only works for Windows paths;
    # os.path.basename is portable.
    prefix = os.path.basename(test_path).split('_')[0]
    pw_task = False
    if prefix in opt['pw_tasks']:
        pw_task = True
    test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task),
                         batch_size=batch_size,
                         gpu=args.cuda, is_train=False,
                         task_id=args.task_id,
                         pairwise=pw_task,
                         maxlen=opt['max_seq_len'])
    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)
    if args.cuda:
        model.cuda()

    prefix = args.test.split('_')[0]  # e.g. 'mnli'
    # BUG FIX: label_dict was referenced by submit() below, but its definition
    # had been commented out, raising NameError; restore the lookup.
    label_dict = GLOBAL_MAP.get(prefix, None)
    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
        model, test_data, prefix)
    logger.info('test metrics:{}'.format(test_metrics))
    results = {
        'metrics': test_metrics,
        'uids': test_ids,
        'labels': golds,
        'predictions': test_predictions,
        'scores': scores
    }
    submit(official_score_file, results, label_dict)
def main(args):
    """Evaluate a trained MT-DNN checkpoint on one task's prepared data and
    dump predictions/scores to ``args.score``.
    """
    # Task metadata: every registry must know the requested task.
    task_defs = TaskDefs(args.task_def)
    for registry in (task_defs.task_type_map,
                     task_defs.data_type_map,
                     task_defs.metric_meta_map):
        assert args.task in registry
    data_type = task_defs.data_type_map[args.task]
    task_type = task_defs.task_type_map[args.task]
    metric_meta = task_defs.metric_meta_map[args.task]

    # Restore model weights; map onto CPU when CUDA is unavailable.
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    if args.cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    config = state_dict['config']
    config["cuda"] = args.cuda
    model = MTDNNModel(config, state_dict=state_dict)
    model.load(checkpoint_path)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)

    # Batched, non-shuffled evaluation data.
    eval_set = SingleTaskDataset(args.prep_input, False,
                                 task_type=task_type,
                                 maxlen=args.max_seq_len)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    eval_loader = DataLoader(eval_set,
                             batch_size=args.batch_size_eval,
                             collate_fn=collater.collate_fn,
                             pin_memory=args.cuda)

    with torch.no_grad():
        test_metrics, test_predictions, scores, golds, test_ids = eval_model(
            model, eval_loader,
            metric_meta=metric_meta,
            use_cuda=args.cuda,
            with_label=args.with_label)

    dump(args.score, {'metrics': test_metrics,
                      'predictions': test_predictions,
                      'uids': test_ids,
                      'scores': scores})
    if args.with_label:
        print(test_metrics)
def main():
    """Select a classifier architecture from the global `model_type`, then load
    an MT-DNN checkpoint and run the mode selected by the global `mode`."""
    # Read in the data
    global model
    # Vocabulary sizes from module-level maps (word / tag / character).
    nwords = len(w2i)
    ntags = len(t2i)
    nchars = len(c2i)
    # Dispatch on the (case-insensitive) model_type string. Note the exact
    # 'bilstm' match must come before the 'bilstm'+'word'/'char' substring
    # checks, otherwise it would never be reached.
    if 'rnn' in model_type.lower():
        print ("Running a RNN model")
        model = RNN()
    elif 'cnn' in model_type.lower():
        print ("Running a CNN model")
        model = CNN()
    elif 'bilstm' == model_type.lower():
        print ("Running a BiLSTM char + word model ")
        model = biLstm_with_chars.BiLSTM()
    elif 'bilstm' in model_type.lower() and 'word' in model_type.lower():
        print ("Running a BiLSTM word only model ")
        model = biLstm.BiLSTM()
    elif 'bilstm' in model_type.lower() and 'char' in model_type.lower():
        print ("Running a BiLSTM char only model ")
        model = biLstm_char_only.BiLSTM()
    # Hard-coded MT-DNN hyper-parameter set (checkpoint-dir, optimizer, dropout
    # etc.); values loaded from the checkpoint's own config override these.
    opt = {'log_file': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19/log.log',
           'init_checkpoint': '/data/kashyap_data/mt_dnn_models/mt_dnn_large_uncased.pt',
           'data_dir': 'data/domain_adaptation', 'data_sort_on': False,
           'name': 'farmer', 'train_datasets': ['scitail'],
           'test_datasets': ['scitail'], 'pw_tasks': ['qnnli'],
           'update_bert_opt': 0, 'multi_gpu_on': False,
           'mem_cum_type': 'simple', 'answer_num_turn': 5,
           'answer_mem_drop_p': 0.1, 'answer_att_hidden_size': 128,
           'answer_att_type': 'bilinear', 'answer_rnn_type': 'gru',
           'answer_sum_att_type': 'bilinear', 'answer_merge_opt': 1,
           'answer_mem_type': 1, 'answer_dropout_p': 0.1,
           'answer_weight_norm_on': False, 'dump_state_on': False,
           'answer_opt': [ 0], 'label_size': '2', 'mtl_opt': 0, 'ratio': 0,
           'mix_opt': 0, 'max_seq_len': 512, 'init_ratio': 1, 'cuda': True,
           'log_per_updates': 500, 'epochs': 5, 'batch_size': 16,
           'batch_size_eval': 8, 'optimizer': 'adamax',
           'grad_clipping': 0.0, 'global_grad_clipping': 1.0,
           'weight_decay': 0, 'learning_rate': 5e-05, 'momentum': 0,
           'warmup': 0.1, 'warmup_schedule': 'warmup_linear',
           'vb_dropout': True, 'dropout_p': 0.1, 'dropout_w': 0.0,
           'bert_dropout_p': 0.1, 'ema_opt': 0, 'ema_gamma': 0.995,
           'have_lr_scheduler': True, 'multi_step_lr': '10,20,30',
           'freeze_layers': -1, 'embedding_opt': 0, 'lr_gamma': 0.5,
           'bert_l2norm': 0.0, 'scheduler_type': 'ms',
           'output_dir': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19',
           'seed': 2018, 'task_config_path': 'configs/tasks_config.json',
           'tasks_dropout_p': [0.1]}
    # NOTE(review): this path says 'checkpoint/' while the opt dict uses
    # 'checkpoints/' -- confirm the intended directory.
    state_dict = torch.load(
        "checkpoint/scitail_model_0.pt")
    config = state_dict['config']
    config['attention_probs_dropout_prob'] = 0.1
    config['hidden_dropout_prob'] = 0.1
    opt.update(config)
    # NOTE(review): this overwrites the architecture selected above -- the
    # RNN/CNN/BiLSTM instance is discarded once MTDNNModel is built.
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=50)
    print ("building vocabulary...")
    create_vocabulary('data/classes/train.txt')
    print ("done building vocabulary...")
    print ('size of the character vocab %s' %(len(char_vocab_set)))
    # trainer = model.build_model(nwords, nchars, ntags)
    # if input_file != "":
    #     model.load(input_file)
    # Mode dispatch: train / generate / qualitative examples / evaluate.
    if 'train' in mode.lower():
        # NOTE(review): `trainer` is passed below but its creation is commented
        # out above -- this branch will raise NameError as written; confirm.
        if params['adv_swap'] or params['adv_drop'] or params['adv_key'] \
                or params['adv_add'] or params['adv_all']:
            start_adversarial_training(trainer)
        else:
            start_training(train, dev, trainer)
    elif 'gen' in mode.lower():
        generate_ann()
    elif 'examples' in mode.lower():
        get_qualitative_examples()
    else:
        evaluate()
    # Optional robustness check against spelling attacks on the test split.
    if type_of_attack is not None:
        check_against_spell_mistakes('data/classes/test.txt')
def make_prediction(Description_A, Description_B, model_path, USE_GPU=True):
    """Run a trained MT-DNN checkpoint on one example and return
    ``(predictions, scores)`` — a one-element prediction list plus the
    per-class confidence scores.

    NOTE(review): only Description_A is tokenized and fed to the model;
    Description_B is currently unused -- confirm whether two-sentence
    input was intended.
    """
    # Loading tokenizer from a stored pickle object, as it is more reliable.
    # To recreate the object:
    # bert_tokenizer = BertTokenizer.from_pretrained("bert-large-uncased", do_lower_case=True)
    # BUG FIX: use a context manager so the pickle file handle is closed.
    with open("tokenizer.pkl", "rb") as pickle_off:
        tokenizer = pickle.load(pickle_off)

    # WordPiece-tokenize, e.g. "I am playing" -> ["I", "am", "play", "##ing"].
    hypothesis = tokenizer.tokenize(Description_A)
    # Truncate so the [CLS]/[SEP] markers still fit in BERT's 512-token limit,
    # then map tokens to vocabulary ids.
    if len(hypothesis) > 512 - 3:
        hypothesis = hypothesis[:512 - 3]
    input_ids = tokenizer.convert_tokens_to_ids(
        ['[CLS]'] + hypothesis + ['[SEP]'])
    # Segment ids: all zeros for single-sentence input (would distinguish the
    # two sentences in pairwise classification).
    type_ids = [0] * (len(hypothesis) + 2)
    # uid/label are placeholders; only token_id/type_id matter at predict time.
    features = {'uid': 0, 'label': 0, 'token_id': input_ids, 'type_id': type_ids}
    # BUG FIX: gpu was hard-coded to True, ignoring the USE_GPU parameter.
    dev_data = BatchGen([features],
                        batch_size=8,
                        gpu=USE_GPU, is_train=False,
                        task_id=0, maxlen=512,
                        pairwise=False,
                        data_type=0,
                        task_type=0)

    # Hyper-parameters; values from the checkpoint's config override these.
    opt = {'init_checkpoint': model_path, 'data_dir': 'data/domain_adaptation',
           'data_sort_on': False, 'name': 'farmer',
           'train_datasets': ['sst'], 'test_datasets': ['sst'],
           'pw_tasks': ['qnnli'], 'update_bert_opt': 0, 'multi_gpu_on': False,
           'mem_cum_type': 'simple', 'answer_num_turn': 5,
           'answer_mem_drop_p': 0.1, 'answer_att_hidden_size': 128,
           'answer_att_type': 'bilinear', 'answer_rnn_type': 'gru',
           'answer_sum_att_type': 'bilinear', 'answer_merge_opt': 1,
           'answer_mem_type': 1, 'answer_dropout_p': 0.1,
           'answer_weight_norm_on': False, 'dump_state_on': False,
           'answer_opt': [ 0], 'label_size': '2', 'mtl_opt': 0, 'ratio': 0,
           'mix_opt': 0, 'max_seq_len': 512, 'init_ratio': 1, 'cuda': USE_GPU,
           'log_per_updates': 500, 'epochs': 5, 'batch_size': 32,
           'batch_size_eval': 8, 'optimizer': 'adamax',
           'grad_clipping': 0.0, 'global_grad_clipping': 1.0,
           'weight_decay': 0, 'learning_rate': 5e-05, 'momentum': 0,
           'warmup': 0.1, 'warmup_schedule': 'warmup_linear',
           'vb_dropout': True, 'dropout_p': 0.1, 'dropout_w': 0.0,
           'bert_dropout_p': 0.1, 'ema_opt': 0, 'ema_gamma': 0.995,
           'have_lr_scheduler': True, 'multi_step_lr': '10,20,30',
           'freeze_layers': -1, 'embedding_opt': 0, 'lr_gamma': 0.5,
           'bert_l2norm': 0.0, 'scheduler_type': 'ms',
           'output_dir': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19',
           'seed': 2018, 'task_config_path': 'configs/tasks_config.json',
           'tasks_dropout_p': [0.1]}
    # BUG FIX: map GPU-trained weights onto CPU when USE_GPU is False.
    state_dict = torch.load(model_path,
                            map_location=None if USE_GPU else 'cpu')
    config = state_dict['config']
    config['attention_probs_dropout_prob'] = 0.1
    config['hidden_dropout_prob'] = 0.1
    opt.update(config)
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=50)
    # BUG FIX: use_cuda was hard-coded to True, ignoring the USE_GPU parameter.
    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
        model, dev_data, 0, use_cuda=USE_GPU, with_label=False)
    return dev_predictions, scores
def main():
    """Multi-task MT-DNN training loop with optional syntactic-parse and
    generic/domain feature inputs; evaluates on dev each epoch and keeps the
    best checkpoint."""
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}          # task prefix -> task id
    tasks_class = {}    # nclass -> shared task id (used when mtl_opt > 0)
    nclass_list = []
    dropout_list = []
    # Build one BatchGen per distinct training-task prefix.
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            # When sharing heads, tasks with the same label count share an id.
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)
        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True
        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)
        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)
        # Per-task dropout may be overridden from the tasks config file.
        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        # NOTE(review): long parses are filtered for every task except mednli
        # -- presumably mednli parses must be kept whole; confirm.
        train_data = BatchGen(BatchGen.load(train_path, True, pairwise=pw_task,
                                            maxlen=args.max_seq_len,
                                            filter_long_parses=(prefix!='mednli')),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type,
                              use_parse=args.use_parse,
                              use_generic_features=args.use_generic_features,
                              use_domain_features=args.use_domain_features,
                              feature_pkl_dir=args.feature_pkl_dir if args.feature_pkl_dir is not None else args.data_dir,
                              feature_pkl_namespace='train')
        train_data_list.append(train_data)
    opt['tasks_dropout_p'] = dropout_list
    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    # Dev/test loaders (entries may be None when the split file is absent).
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task,
                                              maxlen=args.max_seq_len,
                                              filter_long_parses=False),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda, is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type,
                                use_parse=args.use_parse,
                                use_generic_features=args.use_generic_features,
                                use_domain_features=args.use_domain_features,
                                feature_pkl_dir=args.feature_pkl_dir if args.feature_pkl_dir is not None else args.data_dir,
                                feature_pkl_namespace='dev')
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task,
                                               maxlen=args.max_seq_len,
                                               filter_long_parses=False),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda, is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type,
                                 use_parse=args.use_parse,
                                 use_generic_features=args.use_generic_features,
                                 use_domain_features=args.use_domain_features,
                                 feature_pkl_dir=args.feature_pkl_dir if args.feature_pkl_dir is not None else args.data_dir,
                                 feature_pkl_namespace='test')
        test_data_list.append(test_data)
    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)
    # Load the init checkpoint (if present) and strip run-specific keys from
    # its config so command-line settings win for this run.
    model_path = args.init_checkpoint
    state_dict = None
    pretrained_embeddings = pretrained_idx2token = None
    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        state_dict.pop('optimizer', None)
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        for k in {'epochs', 'output_dir', 'train_datasets',
                  'test_datasets', 'seed', 'local_model_idx2token', 'use_parse',
                  'stx_parse_dim', 'glove_path', 'unk_threshold',
                  'use_generic_features', 'use_domain_features', 'feature_dim',
                  'feature_pkl_dir'}:
            config.pop(k, None)
        opt.update(config)
        # Carry over TreeLSTM embeddings (and their vocab) separately so they
        # can be re-aligned to this run's vocabulary below.
        if 'treelstm.embedding.weight' in state_dict['state']:
            pretrained_embeddings = state_dict['state']['treelstm.embedding.weight']
            pretrained_idx2token = pickle.load(open(args.local_model_idx2token, 'rb'))
            del state_dict['state']['treelstm.embedding.weight']
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
        opt.update(config)
    # This script supports exactly one train/dev/test task.
    assert len(train_data_list) == len(dev_data_list) == len(test_data_list) == 1
    embedding_matrix = token2idx = unked_words = None
    if args.use_parse:
        assert args.stx_parse_dim is not None
        assert args.glove_path is not None
        # Count leaf tokens over train+dev parses to build the parse vocab.
        vocab = Counter()
        for data in train_data_list:
            for batch in data.data:
                for example in batch:
                    for leaf in Tree.from_char_indices(example['parse_id_a']).leaves():
                        vocab[leaf.content.lower()] += 1
                    for leaf in Tree.from_char_indices(example['parse_id_b']).leaves():
                        vocab[leaf.content.lower()] += 1
        for data in dev_data_list:
            for batch in data.data:
                for example in batch:
                    for leaf in Tree.from_char_indices(example['parse_id_a']).leaves():
                        vocab[leaf.content.lower()] += 1
                    for leaf in Tree.from_char_indices(example['parse_id_b']).leaves():
                        vocab[leaf.content.lower()] += 1
        # Words below the frequency threshold are mapped to <unk>.
        final_vocab = {'<unk>'}
        unked_words = set()
        for word, count in vocab.items():
            (final_vocab if count >= args.unk_threshold else unked_words).add(word)
        assert len(final_vocab) + len(unked_words) == len(vocab) + 1
        vocab = final_vocab
        idx2token = {}
        for token in vocab:
            idx2token[len(idx2token)] = token
        pickle.dump(idx2token, open(os.path.join(args.output_dir, "idx2token.pkl"), 'wb'))
        token2idx = {token: idx for idx, token in idx2token.items()}
        embedding_matrix = load_embeddings(args.glove_path, vocab, idx2token,
                                           pretrained_embeddings=pretrained_embeddings,
                                           pretrained_idx2token=pretrained_idx2token)
    # Feature widths are inferred from the first training example.
    num_generic_features = num_domain_features = None
    if args.use_generic_features:
        num_generic_features = len(train_data_list[0].data[0][0]['generic_features'])
    if args.use_domain_features:
        num_domain_features = len(train_data_list[0].data[0][0]['domain_features'])
    all_iters =[iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    # Total update count (needed by the LR scheduler); capped by `ratio` when
    # auxiliary tasks are subsampled.
    num_all_batches = args.epochs * sum(all_lens)
    if len(train_data_list)> 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) * (1 + args.ratio)))
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches,
                       use_parse=args.use_parse,
                       embedding_matrix=embedding_matrix,
                       token2idx=token2idx,
                       stx_parse_dim=args.stx_parse_dim,
                       unked_words=unked_words,
                       use_generic_features=args.use_generic_features,
                       num_generic_features=num_generic_features,
                       use_domain_features=args.use_domain_features,
                       num_domain_features=num_domain_features,
                       feature_dim=args.feature_dim)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))
    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))
    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)
    highest_dev_acc = -1
    if args.cuda:
        model.cuda()
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        # Build the per-epoch task-sampling order. With ratio > 0, auxiliary
        # tasks are subsampled to ratio * len(main task); mix_opt controls
        # whether main and auxiliary batches are interleaved or concatenated.
        all_indices=[]
        if len(train_data_list)> 1 and args.ratio > 0:
            main_indices =[0] * len(train_data_list[0])
            extra_indices=[]
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks=int(min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices, random_picks, replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()
        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)
        # One optimizer update per sampled batch.
        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data= next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates) % args.log_per_updates == 0 or model.updates == 1:
                logger.info('Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(task_id,
                    model.updates, model.train_loss.avg,
                    str((datetime.now() - start) / (i + 1) * (len(all_indices) - i - 1)).split('.')[0]))
        # Epoch-end dev evaluation; checkpoint whenever the (single) dev
        # metric improves.
        assert len(args.test_datasets) == 1
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids= eval_model(model, dev_data,
                                                                                 dataset=prefix,
                                                                                 use_cuda=args.cuda)
                assert len(dev_metrics) == 1
                for key, val in dev_metrics.items():
                    logger.warning("Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(dataset, epoch, key, val))
                score_file = os.path.join(output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids, 'scores': scores}
                dump(score_file, results)
                official_score_file = os.path.join(output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                if list(dev_metrics.values())[0] > highest_dev_acc:
                    model.save(os.path.join(output_dir, 'best_model.pt'))
                    highest_dev_acc = list(dev_metrics.values())[0]
    logger.warning(f'Best dev {highest_dev_acc}')
def main():
    """Standard multi-task MT-DNN training loop: builds per-task batch
    generators, trains with mixed task sampling, and evaluates on dev and
    test after every epoch, saving a checkpoint per epoch."""
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}          # task prefix -> task id
    tasks_class = {}    # nclass -> shared task id (used when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    dropout_list = []
    # Build one BatchGen per distinct training-task prefix.
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks: continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            # When sharing heads, tasks with the same label count share an id.
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)
        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True
        # Tasks sharing an id keep the most restrictive decoder option.
        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1: nclass_list.append(nclass)
        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0: nclass_list.append(nclass)
        # Per-task dropout may be overridden from the tasks config file.
        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path, True, pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type)
        train_data_list.append(train_data)
    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list
    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    # Dev/test loaders (entries may be None when the split file is absent).
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        if args.mtl_opt > 0:
            task_id = tasks_class[DATA_META[prefix]]
        else:
            task_id = tasks[prefix]
        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda, is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda, is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type)
        test_data_list.append(test_data)
    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)
    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    # Total update count (needed by the LR scheduler); capped by `ratio` when
    # auxiliary tasks are subsampled.
    num_all_batches = args.epochs * sum(all_lens)
    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) * (1 + args.ratio)))
    # Load the init checkpoint if present, overriding its BERT dropouts with
    # the command-line values; otherwise fall back to a fresh BERT config.
    model_path = args.init_checkpoint
    state_dict = None
    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!'
        )
        logger.error('#' * 20)
        config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
        opt.update(config)
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))
    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))
    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)
    if args.cuda:
        model.cuda()
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        # Build the per-epoch task-sampling order. With ratio > 0, auxiliary
        # tasks are subsampled to ratio * len(main task); mix_opt controls
        # whether main and auxiliary batches are interleaved or concatenated.
        all_indices = []
        if len(train_data_list) > 1 and args.ratio > 0:
            main_indices = [0] * len(train_data_list[0])
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks = int(
                min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices, random_picks, replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()
        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)
        # One optimizer update per sampled batch.
        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates
                ) % args.log_per_updates == 0 or model.updates == 1:
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(
                        task_id, model.updates, model.train_loss.avg,
                        str((datetime.now() - start) / (i + 1) *
                            (len(all_indices) - i - 1)).split('.')[0]))
        # Epoch-end evaluation on dev and (when available) test splits.
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                    model, dev_data, dataset=prefix, use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    logger.warning(
                        "Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(
                            dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                # For eval_model function, with_label = True specifies that evaluation metrics will be reported for test data -
                # this was presumably disabled by authors as it is bad practice in hyperparameter tuning, however it is the most convenient
                # way to get test scores. To avoid bias, hyperparameter decisions are made based on dev evaluation metrics, and test metrics
                # are only recorded for the final versions of models.
                test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                    model, test_data, dataset=prefix, use_cuda=args.cuda, with_label=True)
                score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')
        # Save a checkpoint after every epoch.
        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
input_ids = tokenize_fn.convert_tokens_to_ids(['[CLS]'] + tokens_a + ['[SEP]']) segment_ids = [0] * len(input_ids) input_mask = None return input_ids, input_mask, segment_ids tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) print('Enter Sentence 1:') premise = input() print('Enter Sentence 2:') hypothesis = input() input_ids, _, type_ids = bert_feature_extractor(premise, hypothesis, max_seq_length=64, tokenize_fn=tokenizer) features = { 'uid': '0', 'label': '0', 'token_id': input_ids, 'type_id': type_ids } model_path = 'checkpoints/my_mnli/model_0.pt' state_dict = torch.load(model_path) config = state_dict['config'] opt.update(config) model = MTDNNModel(opt, state_dict=state_dict)
def main():
    """Train MT-DNN over multiple tasks with the legacy BatchGen pipeline.

    Reads all configuration from the module-level ``args``/``opt`` and
    ``task``-metadata maps (DATA_META/DATA_TYPE/TASK_TYPE), builds one
    BatchGen per train/dev/test split, optionally mixes in "external"
    datasets at a capped ratio, trains for ``args.epochs`` epochs, and
    writes per-epoch scores and checkpoints under ``output_dir``.
    """
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size

    # ---- build one training BatchGen per unique task prefix ----
    train_data_list = []
    tasks = {}          # task prefix -> task id
    tasks_class = {}    # n-class count -> shared task id (used when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    dropout_list = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            # one head per task prefix; later datasets with the same prefix share it
            continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            # mtl_opt > 0: tasks with the same label-set size share one head
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)
        task_type = TASK_TYPE[prefix]
        pw_task = False
        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            # keep the most restrictive decoder option for a shared head
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1:
                nclass_list.append(nclass)
        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)
        # per-task dropout override (falls back to the global value)
        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path, True, pairwise=pw_task, maxlen=args.max_seq_len, opt=opt, dataset=dataset),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type,
                              dataset_name=dataset)
        train_data.reset()
        train_data_list.append(train_data)
    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list
    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)

    # ---- build dev/test BatchGens (None placeholders keep index alignment
    # with args.test_datasets) ----
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]
        pw_task = False
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        if args.predict_split is not None:
            # predict-only mode evaluates an arbitrary split instead of dev
            dev_path = os.path.join(data_dir, '{}_{}.json'.format(dataset, args.predict_split))
        else:
            dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task, maxlen=args.max_seq_len, opt=opt, dataset=dataset),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda, is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type,
                                dataset_name=dataset)
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task, maxlen=args.max_seq_len, opt=opt, dataset=dataset),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda, is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type,
                                 dataset_name=dataset)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    # total optimizer steps (used for LR scheduling inside MTDNNModel)
    num_all_batches = args.epochs * sum(all_lens)
    if len(args.external_datasets) > 0 and args.external_include_ratio > 0:
        # external datasets only contribute up to include_ratio * in-domain batches
        num_in_domain_batches = args.epochs * sum(all_lens[:-len(args.external_datasets)])
        num_all_batches = num_in_domain_batches * (1 + args.external_include_ratio)
    # pdb.set_trace()

    # ---- load / adapt the initial checkpoint ----
    model_path = args.init_checkpoint
    state_dict = None
    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        if args.init_config is not None:
            # load huggingface model: raw weights + external JSON config
            config = json.load(open(args.init_config))
            state_dict = {'config': config, 'state': state_dict}
        if args.finetune:
            # only resume config and state; drop optimizer state etc.
            del_keys = set(state_dict.keys()) - set(['config', 'state'])
            for key in del_keys:
                del state_dict[key]
            # keep only config keys whitelisted for resuming
            resume_configs = json.load(open('config/resume_configs.json'))
            del_keys = set(state_dict['config'].keys()) - set(resume_configs)
            for key in del_keys:
                del state_dict['config'][key]
            if args.resume_scoring is not None:
                # copy the selected task's scoring head into slot 0
                for key in state_dict['state'].keys():
                    if 'scoring_list.0' in key:
                        state_dict['state'][key] = state_dict['state'][key.replace('0', str(args.resume_scoring))]
                # other scorings will be deleted during loading process, since finetune only has one task
            elif not args.retain_scoring:
                # drop all task-specific heads so they re-initialize
                del_keys = [k for k in state_dict['state'] if 'scoring_list' in k]
                for key in del_keys:
                    print('deleted previous weight:', key)
                    del state_dict['state'][key]
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
        opt.update(config)

    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    # logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)
    if args.cuda:
        model.cuda()

    # ---- epoch loop: build a shuffled task-index schedule, train, eval ----
    best_epoch = -1
    best_performance = 0
    best_dataset_performance = {dataset: {'perf': 0, 'epoch': -1} for dataset in args.mtl_observe_datasets}
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        if epoch == 0 and args.freeze_bert_first:
            # warm up the task heads with a frozen encoder for the first epoch
            model.network.freeze_bert()
            logger.warning('Bert freezed.')
        if epoch == 1 and args.freeze_bert_first:
            model.network.unfreeze_bert()
            logger.warning('Bert unfreezed.')
        start = datetime.now()
        all_indices = []
        if len(args.external_datasets) > 0 and args.external_include_ratio > 0:
            # subsample external-dataset batches to the configured ratio
            main_indices = []
            extra_indices = []
            for data_idx, batcher in enumerate(train_data_list):
                if batcher.dataset_name not in args.external_datasets:
                    main_indices += [data_idx] * len(batcher)
                else:
                    extra_indices += [data_idx] * len(batcher)
            random_picks = int(min(len(main_indices) * args.external_include_ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices, random_picks, replace=False)
            if args.mix_opt > 0:
                extra_indices = extra_indices.tolist()
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices.tolist()
        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)
        if args.test_mode:
            # smoke-test: only two updates per epoch
            all_indices = all_indices[:2]
        if args.predict_split is not None:
            # predict-only mode: skip training entirely, evaluate the given split
            all_indices = []
            dev_split = args.predict_split
        else:
            dev_split = 'dev'

        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates) % args.log_per_updates == 0 or model.updates == 1:
                logger.info('Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'.format(task_id, model.updates, model.train_loss.avg, str((datetime.now() - start) / (i + 1) * (len(all_indices) - i - 1)).split('.')[0]))
                os.system('nvidia-smi')
        for train_data in train_data_list:
            train_data.reset()

        # ---- evaluation on dev (and test) for every test dataset ----
        this_performance = {}
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(model, dev_data, dataset=prefix, use_cuda=args.cuda)
                score_file = os.path.join(output_dir, '{}_{}_scores_{}.json'.format(dataset, dev_split, epoch))
                results = {'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids, 'scores': scores}
                dump(score_file, results)
                official_score_file = os.path.join(output_dir, '{}_{}_scores_{}.csv'.format(dataset, dev_split, epoch))
                submit(official_score_file, results, dataset_name=prefix, threshold=2.0 + args.mediqa_score_offset)
                if prefix in mediqa_name_list:
                    # MediQA: re-score with the official evaluator and replace the metrics
                    logger.warning('self test numbers:{}'.format(dev_metrics))
                    if '_' in dataset:
                        affix = dataset.split('_')[1]
                        ground_truth_path = os.path.join(args.data_root, 'mediqa/task3_qa/gt_{}_{}.csv'.format(dev_split, affix))
                    else:
                        ground_truth_path = os.path.join(args.data_root, 'mediqa/task3_qa/gt_{}.csv'.format(dev_split))
                    official_result = eval_mediqa_official(pred_path=official_score_file, ground_truth_path=ground_truth_path, eval_qa_more=args.mediqa_eval_more)
                    logger.warning("MediQA dev eval result:{}".format(official_result))
                    if args.mediqa_eval_more:
                        dev_metrics = {'ACC': official_result['score'] * 100, 'Spearman': official_result['score_secondary'] * 100, 'F1': dev_metrics['F1'], 'MRR': official_result['meta']['MRR'], 'MAP': official_result['MAP'], 'P@1': official_result['meta']['P@1']}
                    else:
                        dev_metrics = {'ACC': official_result['score'] * 100, 'Spearman': official_result['score_secondary'] * 100}
                for key, val in dev_metrics.items():
                    logger.warning("Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(dataset, epoch, key, val))
                if args.predict_split is not None:
                    # predict mode: no model selection, skip test eval
                    continue
                print('args.mtl_observe_datasets:', args.mtl_observe_datasets, dataset)
                if dataset in args.mtl_observe_datasets:
                    # model-selection score: mean of all dev metrics
                    this_performance[dataset] = np.mean([val for val in dev_metrics.values()])
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids = eval_model(model, test_data, dataset=prefix, use_cuda=args.cuda, with_label=False)
                for key, val in test_metrics.items():
                    logger.warning("Task {0} -- epoch {1} -- Test {2}: {3:.3f}".format(dataset, epoch, key, val))
                score_file = os.path.join(output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores}
                dump(score_file, results)
                # if dataset in mediqa_name_list:
                official_score_file = os.path.join(output_dir, '{}_test_scores_{}.csv'.format(dataset, epoch))
                submit(official_score_file, results, dataset_name=prefix, threshold=2.0 + args.mediqa_score_offset)
                logger.info('[new test scores saved.]')

        print('this_performance:', this_performance)
        if args.predict_split is not None:
            # predict-only mode runs a single pass
            break

        # ---- track best epoch (summed over observed datasets) and save ----
        epoch_performance = sum([val for val in this_performance.values()])
        if epoch_performance > best_performance:
            print('changed:', epoch_performance, best_performance)
            best_performance = epoch_performance
            best_epoch = epoch
        for dataset in args.mtl_observe_datasets:
            if best_dataset_performance[dataset]['perf'] < this_performance[dataset]:
                best_dataset_performance[dataset] = {'perf': this_performance[dataset], 'epoch': epoch}
        print('current best:', best_performance, 'at epoch', best_epoch)

        if not args.not_save_model:
            model_name = 'model_last.pt' if args.save_last else 'model_{}.pt'.format(epoch)
            model_file = os.path.join(output_dir, model_name)
            if args.save_last and os.path.exists(model_file):
                # keep one generation of history when overwriting model_last.pt
                model_temp = os.path.join(output_dir, 'model_secondlast.pt')
                copyfile(model_file, model_temp)
            model.save(model_file)
            if args.save_best and best_epoch == epoch:
                best_path = os.path.join(output_dir, 'best_model.pt')
                copyfile(model_file, best_path)
            for dataset in args.mtl_observe_datasets:
                if best_dataset_performance[dataset]['epoch'] == epoch:
                    best_path = os.path.join(output_dir, 'best_model_{}.pt'.format(dataset))
                    copyfile(model_file, best_path)
def main():
    """Train MT-DNN with the SingleTaskDataset/DataLoader pipeline.

    Builds one SingleTaskDataset per unique task prefix, merges them with
    MultiTaskDataset + MultiTaskBatchSampler, loads or derives the encoder
    config, then trains for ``args.epochs`` epochs with optional gradient
    accumulation, tensorboard logging, per-epoch dev/test evaluation, and
    checkpointing under ``output_dir``.  With ``args.encode_mode`` set it
    only extracts encodings for the test datasets and returns.
    """
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size

    # ---- build the multi-task training DataLoader ----
    tasks = {}          # task prefix -> task id
    task_def_list = []
    dropout_list = []
    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            # one task per prefix; duplicate datasets share the first one
            continue
        task_id = len(tasks)
        tasks[prefix] = task_id
        task_def = task_defs.get_task_def(prefix)
        task_def_list.append(task_def)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path, True, maxlen=args.max_seq_len, task_id=task_id, task_def=task_def)
        train_datasets.append(train_data_set)
    # soft_label enables knowledge-distillation targets when mkd_opt > 0
    train_collater = Collater(dropout_w=args.dropout_w, encoder_type=encoder_type, soft_label=args.mkd_opt > 0)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    multi_task_batch_sampler = MultiTaskBatchSampler(train_datasets, args.batch_size, args.mix_opt, args.ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset, batch_sampler=multi_task_batch_sampler, collate_fn=train_collater.collate_fn, pin_memory=args.cuda)

    opt['task_def_list'] = task_def_list

    # ---- build dev/test DataLoaders (None placeholders keep index
    # alignment with args.test_datasets) ----
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_def = task_defs.get_task_def(prefix)
        task_id = tasks[prefix]
        task_type = task_def.task_type
        data_type = task_def.data_type
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path, False, maxlen=args.max_seq_len, task_id=task_id, task_def=task_def)
            dev_data = DataLoader(dev_data_set, batch_size=args.batch_size_eval, collate_fn=test_collater.collate_fn, pin_memory=args.cuda)
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path, False, maxlen=args.max_seq_len, task_id=task_id, task_def=task_def)
            test_data = DataLoader(test_data_set, batch_size=args.batch_size_eval, collate_fn=test_collater.collate_fn, pin_memory=args.cuda)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # div number of grad accumulation.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of step: {}'.format(args.epochs * len(multi_task_train_data)))
    logger.info('number of grad grad_accumulation step: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    # ---- load checkpoint config, or derive one from the pretrained name ----
    init_model = args.init_checkpoint
    state_dict = None
    if os.path.exists(init_model):
        state_dict = torch.load(init_model)
        config = state_dict['config']
    else:
        # init_model is treated as a huggingface model name/path here
        if opt['encoder_type'] not in EncoderModelType._value2member_map_:
            raise ValueError("encoder_type is out of pre-defined types")
        literal_encoder_type = EncoderModelType(
            opt['encoder_type']).name.lower()
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            literal_encoder_type]
        config = config_class.from_pretrained(
            init_model, output_hidden_states=True).to_dict(
            )  # change here to enable multi-layer output
    config['output_hidden_states'] = True
    config['attention_probs_dropout_prob'] = args.bert_dropout_p
    config['hidden_dropout_prob'] = args.bert_dropout_p
    config['multi_gpu_on'] = opt["multi_gpu_on"]
    if args.num_hidden_layers != -1:
        # allow truncating the encoder depth from the command line
        config['num_hidden_layers'] = args.num_hidden_layers
    opt.update(config)

    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))

    # tensorboard
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir, args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    if args.encode_mode:
        # extraction-only mode: dump encoder outputs for each test set and exit
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            test_data = test_data_list[idx]
            with torch.no_grad():
                encoding = extract_encoding(model, test_data, use_cuda=args.cuda)
            torch.save(
                encoding,
                os.path.join(output_dir, '{}_encoding.pt'.format(dataset)))
        return

    # ---- training loop ----
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        start = datetime.now()
        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                args.cuda, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)
            # local_updates counts micro-batches; updates counts optimizer steps
            if (model.local_updates) % (args.log_per_updates * args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                ramaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            ramaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss', model.train_loss.avg,
                                           global_step=model.updates)
            if args.save_per_updates_on and (
                    (model.local_updates) %
                    (args.save_per_updates * args.grad_accumulation_step) == 0):
                # optional mid-epoch checkpointing
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        # ---- per-epoch dev/test evaluation ----
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            task_def = task_defs.get_task_def(prefix)
            label_dict = task_def.label_vocab
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                        model,
                        dev_data,
                        metric_meta=task_def.metric_meta,
                        use_cuda=args.cuda,
                        label_mapper=label_dict,
                        task_type=task_def.task_type)
                for key, val in dev_metrics.items():
                    if args.tensorboard:
                        tensorboard.add_scalar('dev/{}/{}'.format(
                            dataset, key), val, global_step=epoch)
                    if isinstance(val, str):
                        # some metrics (e.g. reports) come back as text
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(
                                dataset, epoch, key, val))
                    else:
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                        model,
                        test_data,
                        metric_meta=task_def.metric_meta,
                        use_cuda=args.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=task_def.task_type)
                score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        # end-of-epoch checkpoint
        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
    if args.tensorboard:
        tensorboard.close()
def main():
    """Train MT-DNN with a curriculum/ACL batch controller.

    Like the standard multi-task loop, but batches flow through a
    CONTROLLERS[args.controller] instance and an ACLSampler: each batch's
    loss is computed first (``model.calculate_loss``) and handed to the
    controller, which decides what actually gets trained on via
    ``controller.step(model=model)``.  Per-epoch dev/test evaluation, GLUE
    average-score tracking, and per-epoch checkpoints are written under
    ``output_dir``.
    """
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size

    # tensorboard
    tensorboard = None
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)
    json_logfile = os.path.join(args.output_dir, "runtime_log.json")

    # ---- build one training dataset per unique task prefix ----
    tasks = {}          # task prefix -> task id
    tasks_class = {}    # n-class count -> shared task id (used when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    task_types = []
    dropout_list = []
    loss_types = []
    kd_loss_types = []
    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            # mtl_opt > 0: tasks with the same label-set size share one head
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)
        task_type = task_defs.task_type_map[prefix]
        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            # keep the most restrictive decoder option for a shared head
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        task_types.append(task_type)
        loss_types.append(task_defs.loss_map[prefix])
        kd_loss_types.append(task_defs.kd_loss_map[prefix])
        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1:
                nclass_list.append(nclass)
        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)
        # per-task dropout override (falls back to the global value)
        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_type=task_type,
                                           data_type=data_type)
        train_datasets.append(train_data_set)
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)

    # ---- build the batch controller + ACL sampler ----
    # MTSampler = SAMPLERS[args.sampler]
    n_tasks = len(tasks)
    dataset_sizes = [len(dataset) for dataset in train_datasets]
    if "random" in args.controller:
        # random controllers take no curriculum hyper-parameters
        controller = CONTROLLERS[args.controller](
            n_task=n_tasks,
            dataset_names=args.train_datasets,
            dataset_sizes=dataset_sizes,
            batch_size=args.batch_size,
            rebatch_size=args.batch_size_train,
            tensorboard=tensorboard,
            log_filename=json_logfile)
    else:
        controller = CONTROLLERS[args.controller](
            n_task=n_tasks,
            phi=args.phi,
            K=args.concurrent_cnt,
            dataset_names=args.train_datasets,
            dataset_sizes=dataset_sizes,
            max_cnt=args.max_queue_cnt,
            batch_size=args.batch_size,
            rebatch_size=args.batch_size_train,
            tensorboard=tensorboard,
            log_filename=json_logfile)
    multi_task_batch_sampler = ACLSampler(train_datasets,
                                          args.batch_size,
                                          controller=controller)
    # controller.max_step = len(multi_task_batch_sampler)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['answer_opt'] = decoder_opts
    opt['task_types'] = task_types
    opt['tasks_dropout_p'] = dropout_list
    opt['loss_types'] = loss_types
    opt['kd_loss_types'] = kd_loss_types

    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)

    # ---- build dev/test DataLoaders (None placeholders keep index
    # alignment with args.test_datasets) ----
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            task_defs.
            n_class_map[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = task_defs.task_type_map[prefix]
        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_type=task_type,
                                             data_type=data_type)
            dev_data = DataLoader(dev_data_set,
                                  batch_size=args.batch_size_eval,
                                  collate_fn=test_collater.collate_fn,
                                  pin_memory=args.cuda)
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_type=task_type,
                                              data_type=data_type)
            test_data = DataLoader(test_data_set,
                                   batch_size=args.batch_size_eval,
                                   collate_fn=test_collater.collate_fn,
                                   pin_memory=args.cuda)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # div number of grad accumulation.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of step: {}'.format(args.epochs *
                                            len(multi_task_train_data)))
    logger.info('number of grad grad_accumulation step: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    # ---- load the initial encoder checkpoint (BERT or fairseq RoBERTa) ----
    bert_model_path = args.init_checkpoint
    state_dict = None
    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path)
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error(
                'Could not find the init model!\n The parameters will be initialized randomly!'
            )
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        # fairseq checkpoint layout: remap its keys into MT-DNN's namespace
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}

    # add score history
    score_history = [[] for _ in range(len(args.test_datasets))]
    total_scores = []

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))

    # ---- training loop driven by the controller ----
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {0}/{1}'.format(epoch + 1, args.epochs))
        start = datetime.now()
        total_len = len(controller)
        controller.set_epoch(epoch)
        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                args.cuda, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            # score the batch first; the controller decides what to train on
            loss = model.calculate_loss(batch_meta, batch_data)
            controller.insert(task_id, (batch_meta, batch_data), loss.item())
            if i % args.log_per_updates == 0:
                ramaining_time = str(
                    (datetime.now() - start) / (controller.cur_step + 1) *
                    (total_len - controller.cur_step - 1)).split('.')[0]
                logger.info("Epoch {0} Progress {1} / {2} ({3:.2%})".format(
                    epoch + 1, controller.cur_step, total_len,
                    controller.cur_step * 1.0 / total_len))
                # logger.info("Progress {0} / {1} ({2:.2f}%)".format(i, total_len, i*100.0/total_len))
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            ramaining_time))
                summary_str = controller.summary()
                for line in summary_str.split("\n"):
                    logger.info(line)
                # avg_loss, out_loss, loss_change, min_loss, min_out_loss = controller.get_loss()
                # logger.info('List of loss {}'.format(",".join(avg_loss)))
                # logger.info('List of out_loss {}'.format(",".join(out_loss)))
                # logger.info('List of loss_change {}'.format(",".join(loss_change)))
                # logger.info('List of min_loss {}'.format(",".join(min_loss)))
                # logger.info('List of min_out_loss {}'.format(",".join(min_out_loss)))
                # chosen = [ "%s:%.3f "%(k,v) for k, v in controller.scaled_dict.items()]
                # logger.info('List of Scaled Choosen time {}'.format(",".join(chosen)))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)
            # let the controller perform the actual model update(s)
            controller.step(model=model)
            if args.save_per_updates_on and (
                    (model.local_updates) %
                    (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        # ---- per-epoch evaluation; GLUE tasks feed an average score ----
        total_average_score = 0.0
        scoring_cnt = 0
        score_dict = dict()
        scoring_datasets = "cola,sst,mrpc,stsb,qqp,mnli,qnli,rte,wnli".split(
            ",")
        logger.info('Start Testing')
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                        model,
                        dev_data,
                        metric_meta=task_defs.metric_meta_map[prefix],
                        use_cuda=args.cuda,
                        label_mapper=label_dict,
                        task_type=task_defs.task_type_map[prefix])
                task_score = 0.0
                for key, val in dev_metrics.items():
                    if args.tensorboard:
                        tensorboard.add_scalar('dev/{}/{}'.format(
                            dataset, key), val, global_step=epoch)
                    if isinstance(val, str):
                        # text-valued metrics are logged but excluded from the score
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(
                                dataset, epoch + 1, key, val))
                    else:
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}: {3:.2f}'.format(
                                dataset, epoch + 1, key, val))
                        task_score += val
                if len(dev_metrics) > 1:
                    task_score /= len(dev_metrics)
                    logger.warning(
                        'Task {0} -- epoch {1} -- Dev {2}: {3:.2f}'.format(
                            dataset, epoch + 1, "Average", task_score))
                if prefix in scoring_datasets:
                    scoring_cnt += 1
                    if prefix not in score_dict:
                        score_dict[prefix] = task_score
                    else:
                        # e.g. mnli matched/mismatched: average the two runs
                        score_dict[prefix] = (score_dict[prefix] +
                                              task_score) / 2
                total_average_score += task_score
                score_history[idx].append("%.2f" % task_score)
                logger.warning('Task {0} -- epoch {1} -- Dev {2}: {3}'.format(
                    dataset, epoch + 1, "History", score_history[idx]))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                        model,
                        test_data,
                        metric_meta=task_defs.metric_meta_map[prefix],
                        use_cuda=args.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=task_defs.task_type_map[prefix])
                score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        scoreing_cnt = len(score_dict)
        if scoreing_cnt > 0:
            mean_value = np.mean([v for k, v in score_dict.items()])
            logger.warning(
                'Epoch {0} -- Dev {1} Tasks, Average Score : {2:.3f}'.format(
                    epoch + 1, scoring_cnt, mean_value))
            score_dict['avg'] = mean_value
            total_scores.append(score_dict)
        # end-of-epoch checkpoint
        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)

    for i, total_score in enumerate(total_scores):
        logger.info(total_score)
    if args.tensorboard:
        tensorboard.close()
device = torch.device("cpu") state_dict = torch.load(checkpoint_path, map_location=device) config = state_dict["config"] config["cuda"] = args.cuda task_def = task_defs.get_task_def(prefix) task_def_list = [task_def] config["task_def_list"] = task_def_list ## temp fix config["fp16"] = False config["answer_opt"] = 0 config["adv_train"] = False del state_dict["optimizer"] model = MTDNNModel(config, device=device, state_dict=state_dict) encoder_type = config.get("encoder_type", EncoderModelType.BERT) # load data test_data_set = SingleTaskDataset( args.prep_input, False, maxlen=args.max_seq_len, task_id=args.task_id, task_def=task_def, ) collater = Collater(is_train=False, encoder_type=encoder_type) test_data = DataLoader( test_data_set, batch_size=args.batch_size_eval, collate_fn=collater.collate_fn, pin_memory=args.cuda,
def load_model_for_viz_0(task_def_path, checkpoint_path, input_path,
                         model_type='bert-base-cased', do_lower_case=False,
                         use_cuda=True, maxlen=512, batch_size=1):
    """Load a trained MT-DNN checkpoint and its eval data for visualization.

    Args:
        task_def_path: path to the task-definition YAML; the file's basename
            (before the first '_') selects the task definition.
        checkpoint_path: path to a saved MT-DNN checkpoint (``torch.save`` dict
            with 'config', model weights and 'optimizer' entries).
        input_path: preprocessed (JSON-lines) input file for SingleTaskDataset.
        model_type: HuggingFace-style model name (kept for interface
            compatibility; not otherwise used here).
        do_lower_case: kept for interface compatibility; not used here.
        use_cuda: stored into the model config as ``config['cuda']``.
        maxlen: maximum sequence length for the dataset (was hard-coded 512).
        batch_size: eval batch size for the DataLoader (was hard-coded 1).

    Returns:
        (bert_encoder_module, config_dict, dataloader) — the raw BERT encoder
        of the wrapped network, the (patched) checkpoint config, and a
        DataLoader over ``input_path``.

    Note: dead code from the original (unused tokenizer construction,
    ``mt_dnn_suffix``/``root`` bookkeeping, unused task metadata lookups) has
    been removed; behavior of the returned values is unchanged.
    """
    # -- task info -------------------------------------------------------
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    # Validate that the file name actually names a defined task.
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    # -- model -----------------------------------------------------------
    assert os.path.exists(checkpoint_path)
    state_dict = torch.load(checkpoint_path)
    config = state_dict['config']
    config["cuda"] = use_cuda
    config['task_def_list'] = [task_def]
    # temp fix: neutralize training-only options so model construction
    # works at inference time (mirrors load_model_for_viz_1).
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    del state_dict['optimizer']  # optimizer state not needed for inference
    model = MTDNNModel(config, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    # -- data ------------------------------------------------------------
    test_data_set = SingleTaskDataset(input_path, False, maxlen=maxlen,
                                      task_id=0, task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set, batch_size=batch_size,
                           collate_fn=collater.collate_fn, pin_memory=True)
    return model.mnetwork.module.bert, config, test_data
def main():
    """Analyze attention-head importance of a trained MT-DNN LCP model.

    Pipeline: parse CLI args -> load checkpoint -> join the LCP test set with
    fine-tuned/pre-trained score files into a pandas frame -> optionally
    restrict the dev set to proper/improper nouns -> compute head entropy and
    importance over ten seeds -> optionally run iterative head masking.
    Results are pickled to a hard-coded checkpoints path.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    # NOTE(review): `default` is ignored when required=True — these defaults
    # (incl. the Colab Drive paths below) are documentation only.
    parser.add_argument("--task_def", type=str, required=True,
                        default="experiments/glue/glue_task_def.yml")
    parser.add_argument("--task", type=str, required=True)
    parser.add_argument("--task_id", type=int, default=0,
                        help="the id of this task when training")
    parser.add_argument("--checkpoint",
                        default='mt_dnn_models/bert_model_base_uncased.pt',
                        type=str)
    parser.add_argument(
        "--output_dir",
        default=
        '/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/checkpoints/bert-cased_lcp-single_2020-12-23T2029/',
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--prep_input",
        default=
        '/content/gdrive/My Drive/Colab Notebooks/cs99/mt-dnn/data_complex/bert_base_cased/lcp_dev.json',
        type=str,
        required=True,
    )
    parser.add_argument(
        '--bert_model_type',
        default='bert-base-cased',
        type=str,
        help="What type of bert model should we be using",
    )
    # Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help=
        "Pretrained config name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help=
        "Pretrained tokenizer name or path if not the same as model_name_or_path",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from huggingface.co",
    )
    parser.add_argument(
        "--data_subset", type=int, default=-1,
        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Whether to overwrite data in output directory")
    parser.add_argument(
        "--overwrite_cache", action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--dont_normalize_importance_by_layer",
                        action="store_true",
                        help="Don't normalize importance score by layers")
    parser.add_argument(
        "--dont_normalize_global_importance",
        action="store_true",
        help="Don't normalize all importance scores between 0 and 1",
    )
    parser.add_argument(
        "--try_masking", action="store_true",
        help="Whether to try to mask head until a threshold of accuracy.")
    parser.add_argument(
        "--masking_threshold",
        default=0.9,
        type=float,
        help=
        "masking threshold in term of metrics (stop masking when metric < threshold * original metric value).",
    )
    parser.add_argument(
        "--masking_amount", default=0.1, type=float,
        help="Amount to heads to masking at each masking step.")
    parser.add_argument("--metric_name", default="acc", type=str,
                        help="Metric to use for head masking.")
    parser.add_argument(
        "--max_seq_length",
        default=512,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, sequences shorter padded.",
    )
    # temp fix: technically these parameters should've already been in checkpoint's config...
    parser.add_argument("--world_size", type=int, default=1,
                        help="For distributed training: world size")
    parser.add_argument("--batch_size", default=8, type=int,
                        help="Batch size.")
    parser.add_argument("--seed", type=int, default=2018)
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    # NOTE(review): argparse `type=bool` does not parse "False" -> False;
    # any non-empty string is truthy. Same caveat applies to
    # --do_proper/--do_improper below (whose help strings are copy-pasted
    # from the debugging options).
    parser.add_argument('--cuda', type=bool,
                        default=torch.cuda.is_available(),
                        help='whether to use GPU acceleration.')
    parser.add_argument("--server_ip", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_proper", type=str, default=False,
                        help="Can be used for distant debugging.")
    parser.add_argument("--do_improper", type=str, default=False,
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()
    # Setup devices and distributed training
    device = torch.device("cuda")
    if args.local_rank > -1:
        device = initialize_distributed(args)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    # load task info
    task = args.task
    task_defs = TaskDefs(args.task_def)
    assert args.task in task_defs._task_type_map
    assert args.task in task_defs._data_type_map
    assert args.task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    # NOTE(review): data_type/task_type/metric_meta are looked up but not
    # used later in this function.
    data_type = task_defs._data_type_map[args.task]
    task_type = task_defs._task_type_map[args.task]
    metric_meta = task_defs._metric_meta_map[args.task]
    # load model (map to CPU when CUDA was not requested)
    checkpoint_path = args.checkpoint
    assert os.path.exists(checkpoint_path)
    if args.cuda:
        state_dict = torch.load(checkpoint_path)
    else:
        state_dict = torch.load(checkpoint_path, map_location="cpu")
    opt = state_dict['config']
    args.bin_on = False
    # CLI args override whatever was stored in the checkpoint config.
    opt.update(vars(args))
    model = MTDNNModel(opt, device=device, state_dict=state_dict)
    # Load pretrained model and tokenizer
    # Load data: join LCP test set with gold complexities and the
    # fine-tuned / pre-trained model score files (hard-coded paths).
    data = pd.read_csv('data_complex/lcp_test.tsv',
                       sep='\t',
                       header=None,
                       names=['idx', 'complexity', 'sentence', 'token'])
    data['complexity'] = np.load(
        '/content/gdrive/My Drive/Colab Notebooks/cs99/from_macbook/single_test_labels.npy'
    )
    # Bucket continuous complexity [0, 1] into 5 ordinal classes.
    data['class'] = pd.cut(data['complexity'],
                           labels=[1, 2, 3, 4, 5],
                           bins=[0, 0.2, 0.4, 0.6, 0.8, 1],
                           include_lowest=True)
    data['sent_len'] = data['sentence'].str.len()
    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/lcp_test_scores_epoch_4.json',
            'r') as file:
        single_dev_bert_scores = json.load(file)
    data['finetuned_complexity'] = single_dev_bert_scores['scores']
    data['finetuned_error'] = data['finetuned_complexity'] - data[
        'complexity']
    data['finetuned_abs_error'] = (data['finetuned_complexity'] -
                                   data['complexity']).abs()
    with open(
            '/content/gdrive/My Drive/Colab Notebooks/cs99/new-mt-dnn/checkpoints/bert-cased_lcp-single_2021-01-19T0309/pretrained.json',
            'r') as file:
        single_dev_bert_scores = json.load(file)
    data['pretrained_complexity'] = single_dev_bert_scores['scores']
    data['pretrained_error'] = data['pretrained_complexity'] - data[
        'complexity']
    data['pretrained_abs_error'] = (data['pretrained_complexity'] -
                                    data['complexity']).abs()
    # improvement > 0 means fine-tuning reduced the absolute error.
    data['improvement'] = data['pretrained_abs_error'] - data[
        'finetuned_abs_error']
    # Heuristic proper-noun flag: token starts with an uppercase letter.
    data['proper'] = data['token'].apply(lambda x: x[0].isupper())
    # Distributed training:
    # download model & vocab.
    printable = opt['local_rank'] in [-1, 0]
    encoder_type = opt.get('encoder_type', EncoderModelType.BERT)
    collater = Collater(is_train=True,
                        encoder_type=encoder_type,
                        max_seq_len=opt['max_seq_len'],
                        do_padding=opt['do_padding'])
    dev_data = SingleTaskDataset(opt['prep_input'],
                                 True,
                                 maxlen=opt['max_seq_len'],
                                 task_id=opt['task_id'],
                                 task_def=task_def,
                                 printable=printable)
    # Optionally restrict the dev set to (im)proper-noun rows; assumes the
    # dataset order matches the 'idx' column of the test frame — TODO confirm.
    if args.do_proper:
        dev_data._data = np.array(
            dev_data._data)[data[data['proper']]['idx'].to_numpy()].tolist()
    if args.do_improper:
        dev_data._data = np.array(
            dev_data._data)[data[~data['proper']]['idx'].to_numpy()].tolist()
    dev_data_loader = DataLoader(dev_data,
                                 batch_size=opt['batch_size_eval'],
                                 collate_fn=collater.collate_fn,
                                 pin_memory=opt['cuda'])
    # Compute head entropy and importance score over ten seeds (2011..2020)
    # to average out dropout/order effects.
    results = []
    for seed in tqdm(range(2010 + 1, 2020 + 1)):
        # Set seeds
        set_seed(seed)
        attn_entropy, head_importance, preds, labels = compute_heads_importance(
            opt, model, dev_data_loader)
        results.append((attn_entropy, head_importance))
    # NOTE(review): output path is hard-coded; consider args.output_dir.
    pkl.dump(
        results,
        open('checkpoints/bert-cased_lcp-single_2021-01-19T0309/results.pkl',
             'wb'))
    # Try head masking (set heads to zero until the score goes under a threshold)
    # and head pruning (remove masked heads and see the effect on the network)
    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
        head_mask = mask_heads(opt, model, dev_data_loader)
# load data test_data = BatchGen(BatchGen.load(args.prep_input, False, pairwise=pw_task, maxlen=args.max_seq_len), batch_size=args.batch_size_eval, gpu=args.cuda, is_train=False, task_id=args.task_id, maxlen=args.max_seq_len, pairwise=pw_task, data_type=data_type, task_type=task_type) # load model checkpoint_path = args.checkpoint assert os.path.exists(checkpoint_path) if args.cuda: state_dict = torch.load(checkpoint_path) else: state_dict = torch.load(checkpoint_path, map_location="cpu") config = state_dict['config'] config["cuda"] = args.cuda model = MTDNNModel(config, state_dict=state_dict) test_metrics, test_predictions, scores, golds, test_ids = eval_model(model, test_data, metric_meta=metric_meta, use_cuda=args.cuda, with_label=args.with_label) results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores} dump(args.score, results) if args.with_label: print(test_metrics)
def main():
    """Dump per-layer contextual embeddings from a trained MT-DNN model.

    Reads raw input via ``process_data``, round-trips it through a temporary
    file so it can be served by ``SingleTaskDataset``, runs the encoder, and
    writes one JSON object per sample to ``args.foutput`` containing the uid,
    tokens, and the string-encoded embedding of each layer in ``args.layers``.
    """
    # Task definition is hard-coded to the LCP task config.
    task_def_path = 'data_complex/lcp.yml'
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    encoder_type = args.encoder_type
    # Comma-separated layer indices to extract, e.g. "--layers 0,11".
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)
    # process data
    data, is_single_sentence = process_data(args)
    # NOTE(review): data_type is computed but unused — the matching
    # SingleTaskDataset argument is commented out below.
    data_type = DataFormat.PremiseOnly if is_single_sentence else DataFormat.PremiseAndOneHypothesis
    # Write processed samples to a temp file so the dataset can read them.
    fout_temp = '{}.tmp'.format(args.finput)
    dump_data(data, fout_temp)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(fout_temp, False, maxlen=args.max_seq_length,
                                task_def=task_def)#, data_type=data_type)
    batcher = DataLoader(dataset,
                         batch_size=args.batch_size,
                         collate_fn=collater.collate_fn,
                         pin_memory=args.cuda)
    opt = vars(args)
    # load model: checkpoint config overrides CLI options.
    if os.path.exists(args.checkpoint):
        state_dict = torch.load(args.checkpoint)
        config = state_dict['config']
        config['dump_feature'] = True
        config['local_rank'] = -1
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error(
            'Could not find the init model!\n The parameters will be initialized randomly!')
        logger.error('#' * 20)
        return
    num_all_batches = len(batcher)
    model = MTDNNModel(
        opt, state_dict=state_dict, num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()
    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta,
                                                     batch_data)
        # all_encoder_layers: per-layer hidden states from the encoder.
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [all_encoder_layers[idx].detach().cpu().numpy()
                      for idx in layer_indexes]
        #import pdb; pdb.set_trace()
        uids = batch_meta['uids']
        # presumably the attention/padding mask; slen below truncates each
        # embedding to the non-padded length — TODO confirm mask semantics.
        masks = batch_data[batch_meta['mask']].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                # Embeddings are stored as their str() representation.
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features
    # save features: one JSON line per original sample, keyed back by uid.
    with open(args.foutput, 'w', encoding='utf-8') as writer:
        for sample in data:
            uid = sample['uid']
            tokens = sample['tokens']
            feature = features_dict[uid]
            feature['tokens'] = tokens
            feature['uid'] = uid
            writer.write('{}\n'.format(json.dumps(feature)))
'freeze_layers': -1, 'embedding_opt': 0, 'lr_gamma': 0.5, 'bert_l2norm': 0.0, 'scheduler_type': 'ms', 'output_dir': 'checkpoints/scitail_tl_adamax_answer_opt0_gc0_ggc1_7_2_19', 'seed': 2018, 'task_config_path': 'configs/tasks_config.json', 'tasks_dropout_p': [0.1] } state_dict = torch.load("checkpoint/snli_model_0.pt") config = state_dict['config'] config['attention_probs_dropout_prob'] = 0.1 config['hidden_dropout_prob'] = 0.1 opt.update(config) model = MTDNNModel(opt, state_dict=state_dict, num_train_step=50) test_metrics, test_predictions, scores, golds, test_ids = eval_model( model, test_data, metric_meta=metric_meta, use_cuda=args.cuda, with_label=args.with_label) results = { 'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores } dump(args.score, results)
def main():
    """Set up an MT-DNN multi-task model for training.

    Builds per-task decoder options, dropout, loss and task-type lists from
    the training dataset prefixes, loads an initial BERT/RoBERTa checkpoint
    (or random-initializes a BertConfig), constructs the model, dumps the
    merged config, and creates the tokenizer. The tokenizer, eval collater
    and model are published as module-level globals for use elsewhere.
    """
    global tokenizer, test_collater, model
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    tasks = {}          # prefix -> task index
    tasks_class = {}    # n_class -> shared task index (when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    task_types = []
    dropout_list = []
    loss_types = []
    kd_loss_types = []
    #train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        # Each task prefix is registered only once, even if several
        # dataset splits share it.
        if prefix in tasks:
            continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        # With mtl_opt > 0, tasks with the same label count share a head.
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)
        task_type = task_defs.task_type_map[prefix]
        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        task_types.append(task_type)
        loss_types.append(task_defs.loss_map[prefix])
        kd_loss_types.append(task_defs.kd_loss_map[prefix])
        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1:
                nclass_list.append(nclass)
        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)
        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        # train_data_set = SingleTaskDataset(train_path, True, maxlen=args.max_seq_len, task_id=task_id,
        #                                    task_type=task_type, data_type=data_type)
        # train_datasets.append(train_data_set)
    #train_collater = Collater(dropout_w=args.dropout_w, encoder_type=encoder_type)
    # multi_task_train_dataset = MultiTaskDataset(train_datasets)
    # multi_task_batch_sampler = MultiTaskBatchSampler(train_datasets, args.batch_size, args.mix_opt, args.ratio)
    # multi_task_train_data = DataLoader(multi_task_train_dataset, batch_sampler=multi_task_batch_sampler,
    #                                    collate_fn=train_collater.collate_fn, pin_memory=args.cuda)
    opt['answer_opt'] = decoder_opts
    opt['task_types'] = task_types
    opt['tasks_dropout_p'] = dropout_list
    opt['loss_types'] = loss_types
    opt['kd_loss_types'] = kd_loss_types
    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)
    # NOTE(review): init checkpoint path is hard-coded; for RoBERTa it is
    # further rewritten to '<path>/model.pt' below.
    bert_model_path = 'checkpoints/my_mnli/model_0.pt'
    state_dict = None
    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            # Map to CPU so loading works without a GPU; MTDNNModel moves
            # weights later as configured.
            state_dict = torch.load(bert_model_path,
                                    map_location=torch.device('cpu'))
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error('Could not find the init model!\n The parameters will be initialized randomly!')
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            # Rename fairseq-style keys to the 'bert.model.*' namespace
            # expected by MT-DNN.
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}
    model = MTDNNModel(opt, state_dict=state_dict)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)
    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))
    # dump config (JSON line followed by the printed network architecture)
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))
    # # tensorboard
    # if args.tensorboard:
    #     args.tensorboard_logdir = os.path.join(args.output_dir, args.tensorboard_logdir)
    #     tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)
    # NOTE(review): tokenizer is hard-coded to uncased BERT regardless of
    # the encoder checkpoint loaded above — confirm this is intentional.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
assert args.task in task_defs.data_type_map assert args.task in task_defs.metric_meta_map data_type = task_defs.data_type_map[args.task] task_type = task_defs.task_type_map[args.task] metric_meta = task_defs.metric_meta_map[args.task] # load model checkpoint_path = args.checkpoint assert os.path.exists(checkpoint_path) if args.cuda: state_dict = torch.load(checkpoint_path) else: state_dict = torch.load(checkpoint_path, map_location="cpu") config = state_dict['config'] config["cuda"] = args.cuda model = MTDNNModel(config, state_dict=state_dict) model.load(checkpoint_path) encoder_type = config.get('encoder_type', EncoderModelType.BERT) # load data test_data = BatchGen(BatchGen.load(args.prep_input, False, task_type=task_type, maxlen=args.max_seq_len), batch_size=args.batch_size_eval, gpu=args.cuda, is_train=False, task_id=args.task_id, maxlen=args.max_seq_len, data_type=data_type, task_type=task_type, encoder_type=encoder_type)
def main():
    """Train MT-DNN over multiple tasks with the legacy BatchGen pipeline.

    Registers each training task (decoder options, dropout, pairwise flag),
    builds train/dev/test BatchGen loaders, loads the init checkpoint, then
    runs the epoch loop: interleave task batches (optionally down-sampling
    extra datasets via ``args.ratio`` / ``args.reduce_first_dataset_ratio``),
    update the model, evaluate on every dev/test set, and checkpoint whenever
    the mean dev F1_macro improves.
    """
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}          # prefix -> task index
    tasks_class = {}    # n_class -> shared task index (when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    dropout_list = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        # Register each task prefix only once.
        if prefix in tasks:
            continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        # With mtl_opt > 0, tasks with the same label count share a head.
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)
        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True
        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1:
                nclass_list.append(nclass)
        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)
        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)
        # train_data_ratio < 100 selects a down-sampled split file,
        # e.g. 'foo_train10p.json'.
        train_data_ratio_string = str(
            args.train_data_ratio) + "p" if args.train_data_ratio < 100 else ""
        train_path = os.path.join(
            data_dir, '{0}_train{1}.json'.format(dataset, train_data_ratio_string))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path, True, pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type)
        train_data_list.append(train_data)
    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list
    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    dev_data_list = []
    test_data_list = []
    # Build dev/test loaders for every test dataset; missing files yield a
    # None placeholder so list indices stay aligned with args.test_datasets.
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda, is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda, is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type)
        test_data_list.append(test_data)
    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)
    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]
    # Total update count (used for LR scheduling inside MTDNNModel).
    num_all_batches = args.epochs * sum(all_lens)
    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) *
                                             (1 + args.ratio)))
    model_path = args.init_checkpoint
    state_dict = None
    if os.path.exists(model_path):
        state_dict = torch.load(model_path, map_location='cpu')
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        # Without an init checkpoint, clean up the output dir and abort.
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n Exit application!')
        logger.error('#' * 20)
        try:
            shutil.rmtree(output_dir)
        except Exception as e:
            print(e)
        exit(1)
    model = MTDNNModel(opt, state_dict=state_dict,
                       num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))
    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))
    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)
    if args.cuda:
        model.cuda()
    best_F1_macro = -1.0
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()
        # Build the schedule of task indices for this epoch.
        all_indices = []
        if len(train_data_list) > 1 and (args.ratio > 0 or
                                         args.reduce_first_dataset_ratio > 0):
            # Dataset 0 is the "main" task; the rest are "extra" and may be
            # randomly down-sampled to ratio * len(main).
            main_indices = [0] * (int(args.reduce_first_dataset_ratio * len(
                train_data_list[0])) if args.reduce_first_dataset_ratio > 0
                                  else len(train_data_list[0]))
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            if args.ratio > 0:
                random_picks = int(
                    min(
                        len(train_data_list[0]) * args.ratio,
                        len(extra_indices)))
                extra_indices = np.random.choice(extra_indices,
                                                 random_picks,
                                                 replace=False).tolist()
            if args.mix_opt > 0:
                extra_indices = extra_indices
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices
            logger.info(
                "Main batches loaded (first dataset in list): {}".format(
                    len(main_indices)))
            logger.info(
                "Extra batches loaded (all except first dataset in list): {}".
                format(len(extra_indices)))
        else:
            # shuffle the index of the train sets whose batches will be trained
            # on in the order: e.g. if train_set[1] is large, it will get
            # trained on more often
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
            if args.mix_opt < 1:
                random.shuffle(all_indices)
        for i in range(len(all_indices)):
            task_id = all_indices[i]
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates
                ) % args.log_per_updates == 0 or model.updates == 1:
                # 'remaining' is an ETA extrapolated from the mean batch time.
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(
                        task_id, model.updates, model.train_loss.avg,
                        str((datetime.now() - start) / (i + 1) *
                            (len(all_indices) - i - 1)).split('.')[0]))
        temp_dev_F1s = []
        dev_dump_list = []
        test_dump_list = []
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids, premises, hypotheses = eval_model(
                    model, dev_data, dataset=prefix, use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    if not isinstance(val, dict):
                        logger.warning(
                            "Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores,
                    'golds': golds,
                    'premises': premises,
                    'hypotheses': hypotheses
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                # for checkpoint
                temp_dev_F1s.append(dev_metrics['F1_macro'])
                dev_dump_list.append({
                    "output_dir": output_dir,
                    "dev_metrics": dev_metrics,
                    "dev_predictions": dev_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })
            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids, premises, hypotheses = eval_model(
                    model, test_data, dataset=prefix, use_cuda=args.cuda,
                    with_label=True)
                score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores,
                    'golds': golds,
                    'premises': premises,
                    'hypotheses': hypotheses
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')
                # for checkpoint
                test_dump_list.append({
                    "output_dir": output_dir,
                    "test_metrics": test_metrics,
                    "test_predictions": test_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })
        # save checkpoint whenever the mean dev F1_macro improves.
        if np.average(temp_dev_F1s) > best_F1_macro:
            print("Save new model! Current best F1 macro over all dev sets: " +
                  "{0:.2f}".format(best_F1_macro) + ". New: " +
                  "{0:.2f}".format(np.average(temp_dev_F1s)))
            best_F1_macro = np.average(temp_dev_F1s)
            # override current dump file
            for l in dev_dump_list:
                dump_result_files(l['dataset'])(l['output_dir'], epoch,
                                                l['dev_metrics'],
                                                str(l['dev_predictions']),
                                                str(l['golds']), "dev",
                                                l['opt'], l['dataset'])
            for l in test_dump_list:
                dump_result_files(l['dataset'])(l['output_dir'], epoch,
                                                l['test_metrics'],
                                                str(l['test_predictions']),
                                                str(l['golds']), "test",
                                                l['opt'], l['dataset'])
            # save model
            model_file = os.path.join(output_dir, 'model.pt')
            model.save(model_file)
def load_model_for_viz_1(task_def_path, checkpoint_path, input_path,
                         model_type='bert-base-cased', do_lower_case=False,
                         use_cuda=True):
    """Collect per-example BERT attention maps for visualization.

    Loads an MT-DNN checkpoint with ``output_attentions`` enabled, runs the
    encoder over the preprocessed ``input_path`` (batch_size=1), and returns
    a list of ``(attention, tokens)`` pairs where ``attention`` is a tuple of
    per-layer tensors truncated at the last '[SEP]' token.

    NOTE(review): the first 360 examples are skipped via a hard-coded
    ``idx < 360`` guard — confirm this offset is still wanted.
    """
    # load task info
    task = os.path.splitext(os.path.basename(task_def_path))[0]
    task_defs = TaskDefs(task_def_path)
    assert task in task_defs._task_type_map
    assert task in task_defs._data_type_map
    assert task in task_defs._metric_meta_map
    prefix = task.split('_')[0]
    task_def = task_defs.get_task_def(prefix)
    # NOTE(review): data_type/task_type/metric_meta are unused below.
    data_type = task_defs._data_type_map[task]
    task_type = task_defs._task_type_map[task]
    metric_meta = task_defs._metric_meta_map[task]
    # load model
    assert os.path.exists(checkpoint_path)
    state_dict = torch.load(checkpoint_path)
    config = state_dict['config']
    config["cuda"] = use_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    task_def = task_defs.get_task_def(prefix)
    task_def_list = [task_def]
    config['task_def_list'] = task_def_list
    ## temp fix: disable training-only options for inference-time construction
    config['fp16'] = False
    config['answer_opt'] = 0
    config['adv_train'] = False
    #del state_dict['optimizer']
    # Ask the encoder to return attention tensors.
    config['output_attentions'] = True
    config['local_rank'] = -1
    model = MTDNNModel(config, device, state_dict=state_dict)
    encoder_type = config.get('encoder_type', EncoderModelType.BERT)
    # NOTE(review): root/encoder_model/mt_dnn_suffix are computed but unused.
    root = os.path.basename(task_def_path)
    literal_model_type = model_type.split('-')[0].upper()
    encoder_model = EncoderModelType[literal_model_type]
    literal_model_type = literal_model_type.lower()
    mt_dnn_suffix = literal_model_type
    if 'base' in model_type:
        mt_dnn_suffix += "_base"
    elif 'large' in model_type:
        mt_dnn_suffix += "_large"
    # load tokenizer (needed below to map ids back to wordpiece tokens)
    config_class, model_class, tokenizer_class = MODEL_CLASSES[
        literal_model_type]
    tokenizer = tokenizer_class.from_pretrained(model_type,
                                                do_lower_case=do_lower_case)
    # load data
    prep_input = input_path
    test_data_set = SingleTaskDataset(prep_input, False, maxlen=512,
                                      task_id=0, task_def=task_def)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    test_data = DataLoader(test_data_set, batch_size=1,
                           collate_fn=collater.collate_fn, pin_memory=True)
    idx = 0
    results = []
    for batch_meta, batch_data in tqdm(test_data):
        # Hard-coded skip of the first 360 examples (see docstring).
        if idx < 360:
            idx += 1
            continue
        batch_meta, batch_data = Collater.patch_data(device, batch_meta,
                                                     batch_data)
        model.network.eval()
        task_id = batch_meta['task_id']
        task_def = TaskDef.from_dict(batch_meta['task_def'])
        task_type = task_def.task_type
        # NOTE(review): task_obj is unused.
        task_obj = tasks.get_task_obj(task_def)
        # Pad the input tuple to the (ids, type_ids, mask, ..., task_id)
        # layout expected by the network.
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        input_ids = inputs[0]
        token_type_ids = inputs[1]
        # Last element of the encoder output holds the attention tensors
        # (enabled via config['output_attentions'] above).
        attention = model.mnetwork.module.bert(
            input_ids, token_type_ids=token_type_ids)[-1]
        batch_size = batch_data[0].shape[0]
        # NOTE(review): `attention` is overwritten inside this loop, so the
        # slicing is only correct for batch_size == 1 (which the DataLoader
        # above guarantees).
        for i in range(batch_size):
            attention = tuple([item[i:i + 1, :, :, :] for item in attention])
            input_id_list = input_ids[i].tolist()
            tokens = tokenizer.convert_ids_to_tokens(input_id_list)
            # Truncate at the last '[SEP]' to drop padding positions.
            idx_sep = listRightIndex(tokens, '[SEP]') + 1
            tokens = tokens[:idx_sep]
            attention = tuple(
                [item[:, :, :idx_sep, :idx_sep] for item in attention])
            results.append((attention, tokens))
        idx += batch_size
    return results
def main():
    """Evaluation-only entry point: load an MT-DNN checkpoint and score the
    dev/test/stress-test splits of every dataset in ``args.test_datasets``.

    Relies on module-level globals set up before the call: ``args``,
    ``logger``, ``data_dir``, ``output_dir``, ``tasks_config``, ``DATA_META``,
    ``DATA_TYPE``, ``TASK_TYPE``, ``GLOBAL_MAP``. Writes per-dataset score
    files into ``output_dir`` and finally dumps aggregated result files via
    ``dump_result_files``.
    """
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}          # task prefix -> sequential task id
    tasks_class = {}    # n_class -> shared task id (used when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    dropout_list = []
    # Walk the train datasets only to rebuild the per-task decoder/dropout
    # configuration that the checkpointed model was trained with; no training
    # data is actually loaded here (train_data_list stays empty).
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            # mtl_opt > 0: tasks with the same label-space size share one head
            task_id = tasks_class[nclass] if nclass in tasks_class else len(tasks_class)
        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1:
                nclass_list.append(nclass)
        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)
        # per-task dropout override from tasks_config, else the global default
        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)
    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list
    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    # Build eval data readers for each test dataset: dev / test / stress splits.
    dev_data_list = []
    test_data_list = []
    stress_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]
        pw_task = False
        if prefix in opt['pw_tasks']:
            pw_task = True
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task, maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda, is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task, maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda, is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type)
        test_data_list.append(test_data)
        stress_data = []
        if args.stress_tests != "NONE":
            for stress_test in args.stress_tests.split(','):
                stress_path = os.path.join(data_dir, '{}_test_{}.json'.format(dataset, stress_test))
                if os.path.exists(stress_path):
                    # NOTE(review): stress split uses a hard-coded maxlen=512
                    # while dev/test use args.max_seq_len — confirm intended.
                    stress_data.append(BatchGen(BatchGen.load(stress_path, False, pairwise=pw_task, maxlen=args.max_seq_len),
                                                batch_size=args.batch_size_eval,
                                                gpu=args.cuda, is_train=False,
                                                task_id=task_id,
                                                maxlen=512,
                                                pairwise=pw_task,
                                                data_type=data_type,
                                                task_type=task_type)
                                       )
        stress_data_list.append(stress_data)
    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)
    # NOTE(review): train_data_list is never populated above, so both sums
    # below evaluate to 0 batches; harmless for eval-only use but confirm.
    all_lens = [len(bg) for bg in train_data_list]
    num_all_batches = args.epochs * sum(all_lens)
    if len(train_data_list) > 1 and args.ratio > 0:
        num_all_batches = int(args.epochs * (len(train_data_list[0]) * (1 + args.ratio)))
    # Restore the checkpoint and merge its config into opt.
    model_path = args.init_checkpoint
    state_dict = None
    if os.path.exists(model_path):
        state_dict = torch.load(model_path)
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n Exit application!')
        logger.error('#' * 20)
        # NOTE(review): despite the "Exit application!" message there is no
        # return/sys.exit here, so MTDNNModel is still constructed below with
        # state_dict=None (randomly initialized) — confirm intended.
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))
    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))
    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)
    if args.cuda:
        model.cuda()
    # Single pass (range(0, 1)): evaluate every dataset's splits once.
    for epoch in range(0, 1):
        dev_dump_list = []
        test_dump_list = []
        stress_dump_list = []
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids, premises, hypotheses = eval_model(model, dev_data,
                                                                                                        dataset=prefix,
                                                                                                        use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    if not isinstance(val, dict):
                        logger.warning("Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(dataset, epoch, key, val))
                if args.dump_to_checkpoints == 1:
                    score_file = os.path.join(output_dir, '{}_dev_scores_{}_EVAL_ONLY.json'.format(dataset, epoch))
                    results = {'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids,
                               'scores': scores, 'golds': golds,
                               'premises': premises, 'hypotheses': hypotheses}
                    dump(score_file, results)
                    official_score_file = os.path.join(output_dir, '{}_dev_scores_{}_EVAL_ONLY.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                # for checkpoint
                dev_dump_list.append({
                    "output_dir": output_dir,
                    "dev_metrics": dev_metrics,
                    "dev_predictions": dev_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })
            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids, premises, hypotheses = eval_model(model, test_data,
                                                                                                           dataset=prefix,
                                                                                                           use_cuda=args.cuda,
                                                                                                           with_label=True)
                if args.dump_to_checkpoints == 1:
                    score_file = os.path.join(output_dir, '{}_test_scores_{}_EVAL_ONLY.json'.format(dataset, epoch))
                    results = {'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids,
                               'scores': scores, 'golds': golds,
                               'premises': premises, 'hypotheses': hypotheses}
                    dump(score_file, results)
                    official_score_file = os.path.join(output_dir, '{}_test_scores_{}_EVAL_ONLY.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                    logger.info('[new test scores saved.]')
                # for checkpoint
                test_dump_list.append({
                    "output_dir": output_dir,
                    "test_metrics": test_metrics,
                    "test_predictions": test_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })
            # stress test eval
            if args.stress_tests != "NONE":
                stress_data = stress_data_list[idx]
                for j, stress_test in enumerate(args.stress_tests.split(',')):
                    stress_metrics, stress_predictions, scores, golds, stress_ids, premises, hypotheses = \
                        eval_model(model, stress_data[j], dataset=prefix, use_cuda=args.cuda, with_label=True)
                    if args.dump_to_checkpoints == 1:
                        score_file = os.path.join(output_dir, '{}_test_{}_scores_{}_EVAL_ONLY.json'.format(dataset, stress_test, epoch))
                        results = {'metrics': stress_metrics, 'predictions': stress_predictions, 'uids': stress_ids,
                                   'scores': scores, 'golds': golds,
                                   'premises': premises, 'hypotheses': hypotheses}
                        dump(score_file, results)
                        official_score_file = os.path.join(output_dir, '{}_test_{}_scores_{}_EVAL_ONLY.tsv'.format(dataset, stress_test, epoch))
                        submit(official_score_file, results, label_dict)
                        logger.info('[new stress test scores for "{}" saved.]'.format(stress_test))
                    # for checkpoint
                    stress_dump_list.append({
                        "output_dir": output_dir,
                        "test_metrics": stress_metrics,
                        "test_predictions": stress_predictions,
                        "golds": golds,
                        "opt": opt,
                        "dataset": dataset,
                        "stress_test": stress_test
                    })
        # save results
        print("Save new results!")
        for l in dev_dump_list:
            dump_result_files(l['dataset'])(l['output_dir'], -1, l['dev_metrics'], str(l['dev_predictions']),
                                            str(l['golds']), "dev", l['opt'], l['dataset'])
        for l in test_dump_list:
            dump_result_files(l['dataset'])(l['output_dir'], -1, l['test_metrics'], str(l['test_predictions']),
                                            str(l['golds']), "test", l['opt'], l['dataset'])
        if args.stress_tests != "NONE":
            for l in stress_dump_list:
                dump_result_files(l['dataset'])(l['output_dir'], -1, l['test_metrics'], str(l['test_predictions']),
                                                str(l['golds']), l['stress_test'], l['opt'], l['dataset'])
def main():
    """Multi-task MT-DNN training entry point (distributed-capable variant).

    Builds one ``SingleTaskDataset`` per training task and multiplexes them
    through a (possibly distributed) multi-task batch sampler, loads the
    encoder checkpoint, then alternates training updates with periodic
    evaluation/checkpointing. Depends on module-level globals: ``args``,
    ``logger``, ``data_dir``, ``output_dir``, ``task_defs``, ``encoder_type``.
    """
    # set up dist
    device = torch.device("cuda")
    if args.local_rank > -1:
        # distributed run: one process per rank
        device = initialize_distributed(args)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    print_message(logger, 'Launching the MT-DNN training')
    #return
    tasks = {}           # task prefix -> task id
    task_def_list = []
    dropout_list = []
    # only rank -1/0 prints dataset-loading messages
    printable = args.local_rank in [-1, 0]
    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        task_id = len(tasks)
        tasks[prefix] = task_id
        task_def = task_defs.get_task_def(prefix)
        task_def_list.append(task_def)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        print_message(logger, 'Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_def=task_def,
                                           printable=printable)
        train_datasets.append(train_data_set)
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type,
                              soft_label=args.mkd_opt > 0,
                              max_seq_len=args.max_seq_len,
                              do_padding=args.do_padding)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    if args.local_rank != -1:
        multi_task_batch_sampler = DistMultiTaskBatchSampler(
            train_datasets,
            args.batch_size,
            args.mix_opt,
            args.ratio,
            rank=args.local_rank,
            world_size=args.world_size)
    else:
        multi_task_batch_sampler = MultiTaskBatchSampler(
            train_datasets,
            args.batch_size,
            args.mix_opt,
            args.ratio,
            bin_on=args.bin_on,
            bin_size=args.bin_size,
            bin_grow_ratio=args.bin_grow_ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)
    opt['task_def_list'] = task_def_list
    # Evaluation loaders: dev/test per test dataset, distributed-aware.
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False,
                             encoder_type=encoder_type,
                             max_seq_len=args.max_seq_len,
                             do_padding=args.do_padding)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_def = task_defs.get_task_def(prefix)
        task_id = tasks[prefix]
        task_type = task_def.task_type
        data_type = task_def.data_type
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_def=task_def,
                                             printable=printable)
            if args.local_rank != -1:
                dev_data_set = DistTaskDataset(dev_data_set, task_id)
                single_task_batch_sampler = DistSingleTaskBatchSampler(
                    dev_data_set,
                    args.batch_size_eval,
                    rank=args.local_rank,
                    world_size=args.world_size)
                dev_data = DataLoader(dev_data_set,
                                      batch_sampler=single_task_batch_sampler,
                                      collate_fn=test_collater.collate_fn,
                                      pin_memory=args.cuda)
            else:
                dev_data = DataLoader(dev_data_set,
                                      batch_size=args.batch_size_eval,
                                      collate_fn=test_collater.collate_fn,
                                      pin_memory=args.cuda)
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_def=task_def,
                                              printable=printable)
            if args.local_rank != -1:
                test_data_set = DistTaskDataset(test_data_set, task_id)
                single_task_batch_sampler = DistSingleTaskBatchSampler(
                    test_data_set,
                    args.batch_size_eval,
                    rank=args.local_rank,
                    world_size=args.world_size)
                test_data = DataLoader(test_data_set,
                                       batch_sampler=single_task_batch_sampler,
                                       collate_fn=test_collater.collate_fn,
                                       pin_memory=args.cuda)
            else:
                test_data = DataLoader(test_data_set,
                                       batch_size=args.batch_size_eval,
                                       collate_fn=test_collater.collate_fn,
                                       pin_memory=args.cuda)
        test_data_list.append(test_data)
    print_message(logger, '#' * 20)
    print_message(logger, opt)
    print_message(logger, '#' * 20)
    # div number of grad accumulation.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    print_message(logger, '############# Gradient Accumulation Info #############')
    print_message(
        logger,
        'number of step: {}'.format(args.epochs * len(multi_task_train_data)))
    print_message(
        logger, 'number of grad grad_accumulation step: {}'.format(
            args.grad_accumulation_step))
    print_message(logger,
                  'adjusted number of step: {}'.format(num_all_batches))
    print_message(logger, '############# Gradient Accumulation Info #############')
    # Load the initial encoder checkpoint; layout differs per encoder family.
    init_model = args.init_checkpoint
    state_dict = None
    if os.path.exists(init_model):
        if encoder_type == EncoderModelType.BERT or \
                encoder_type == EncoderModelType.DEBERTA or \
                encoder_type == EncoderModelType.ELECTRA:
            state_dict = torch.load(init_model, map_location=device)
            config = state_dict['config']
        elif encoder_type == EncoderModelType.ROBERTA or encoder_type == EncoderModelType.XLM:
            # fairseq-style checkpoint: remap parameter names into MT-DNN's
            model_path = '{}/model.pt'.format(init_model)
            state_dict = torch.load(model_path, map_location=device)
            arch = state_dict['args'].arch
            arch = arch.replace('_', '-')
            if encoder_type == EncoderModelType.XLM:
                arch = "xlm-{}".format(arch)
            # convert model arch
            from data_utils.roberta_utils import update_roberta_keys
            from data_utils.roberta_utils import patch_name_dict
            state = update_roberta_keys(
                state_dict['model'], nlayer=state_dict['args'].encoder_layers)
            state = patch_name_dict(state)
            literal_encoder_type = EncoderModelType(
                opt['encoder_type']).name.lower()
            config_class, model_class, tokenizer_class = MODEL_CLASSES[
                literal_encoder_type]
            config = config_class.from_pretrained(arch).to_dict()
            state_dict = {'state': state}
    else:
        # no local checkpoint: treat init_model as a HuggingFace model name
        if opt['encoder_type'] not in EncoderModelType._value2member_map_:
            raise ValueError("encoder_type is out of pre-defined types")
        literal_encoder_type = EncoderModelType(
            opt['encoder_type']).name.lower()
        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            literal_encoder_type]
        config = config_class.from_pretrained(init_model).to_dict()
    config['attention_probs_dropout_prob'] = args.bert_dropout_p
    config['hidden_dropout_prob'] = args.bert_dropout_p
    config['multi_gpu_on'] = opt["multi_gpu_on"]
    if args.num_hidden_layers > 0:
        # optionally truncate the encoder depth
        config['num_hidden_layers'] = args.num_hidden_layers
    opt.update(config)
    model = MTDNNModel(opt,
                       device=device,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        print_message(logger, 'loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)
    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    print_message(logger, '\n{}\n{}\n'.format(headline, model.network))
    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    print_message(logger, "Total number of params: {}".format(model.total_param))
    # tensorboard
    tensorboard = None
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)
    if args.encode_mode:
        # encode-only mode: dump encoder representations and exit
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            test_data = test_data_list[idx]
            with torch.no_grad():
                encoding = extract_encoding(model, test_data, use_cuda=args.cuda)
            torch.save(
                encoding,
                os.path.join(output_dir, '{}_encoding.pt'.format(dataset)))
        return
    # --- training loop -------------------------------------------------
    for epoch in range(0, args.epochs):
        print_message(logger, 'At epoch {}'.format(epoch), level=1)
        start = datetime.now()
        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                device, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)
            if (model.updates) % (
                    args.log_per_updates) == 0 or model.updates == 1:
                ramaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                if args.adv_train and args.debug:
                    debug_info = ' basic loss[%.5f] adv loss[%.5f] emb val[%.8f] noise val[%.8f] noise grad val[%.8f] no proj noise[%.8f] ' % (
                        model.basic_loss.avg, model.adv_loss.avg,
                        model.emb_val.avg, model.noise_val.avg,
                        model.noise_grad_val.avg, model.no_proj_noise_val.avg)
                else:
                    debug_info = ' '
                print_message(
                    logger,
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}]{3}remaining[{4}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            debug_info, ramaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)
            # periodic mid-epoch checkpoint + evaluation (rank -1/0 only)
            if args.save_per_updates_on and (
                    (model.local_updates) %
                    (args.save_per_updates * args.grad_accumulation_step) == 0) and args.local_rank in [-1, 0]:
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                evaluation(model,
                           args.test_datasets,
                           dev_data_list,
                           task_defs,
                           output_dir,
                           epoch,
                           n_updates=args.save_per_updates,
                           with_label=True,
                           tensorboard=tensorboard,
                           glue_format_on=args.glue_format_on,
                           test_on=False,
                           device=device,
                           logger=logger)
                evaluation(model,
                           args.test_datasets,
                           test_data_list,
                           task_defs,
                           output_dir,
                           epoch,
                           n_updates=args.save_per_updates,
                           with_label=False,
                           tensorboard=tensorboard,
                           glue_format_on=args.glue_format_on,
                           test_on=True,
                           device=device,
                           logger=logger)
                print_message(logger, 'Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)
        # end-of-epoch evaluation on dev (with labels) and test (without)
        evaluation(model,
                   args.test_datasets,
                   dev_data_list,
                   task_defs,
                   output_dir,
                   epoch,
                   with_label=True,
                   tensorboard=tensorboard,
                   glue_format_on=args.glue_format_on,
                   test_on=False,
                   device=device,
                   logger=logger)
        evaluation(model,
                   args.test_datasets,
                   test_data_list,
                   task_defs,
                   output_dir,
                   epoch,
                   with_label=False,
                   tensorboard=tensorboard,
                   glue_format_on=args.glue_format_on,
                   test_on=True,
                   device=device,
                   logger=logger)
        print_message(logger, '[new test scores at {} saved.]'.format(epoch))
        if args.local_rank in [-1, 0]:
            model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
            model.save(model_file)
    if args.tensorboard:
        tensorboard.close()
def main():
    """Multi-task MT-DNN training entry point (task_defs-driven variant).

    Rebuilds per-task decoder/loss/dropout configuration from ``task_defs``,
    multiplexes the training sets through ``MultiTaskBatchSampler``, loads a
    BERT or RoBERTa initial checkpoint, then trains for ``args.epochs`` with
    per-epoch dev/test evaluation. Depends on module-level globals: ``args``,
    ``logger``, ``data_dir``, ``output_dir``, ``task_defs``, ``encoder_type``.
    """
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    tasks = {}          # task prefix -> sequential task id
    tasks_class = {}    # n_class -> shared task id (used when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    task_types = []
    dropout_list = []
    loss_types = []
    kd_loss_types = []
    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            # mtl_opt > 0: tasks with the same label-space size share one head
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)
        task_type = task_defs.task_type_map[prefix]
        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)
        task_types.append(task_type)
        loss_types.append(task_defs.loss_map[prefix])
        kd_loss_types.append(task_defs.kd_loss_map[prefix])
        if prefix not in tasks:
            tasks[prefix] = len(tasks)
            if args.mtl_opt < 1:
                nclass_list.append(nclass)
        if (nclass not in tasks_class):
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)
        # per-task dropout override, else the global default
        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_type=task_type,
                                           data_type=data_type)
        train_datasets.append(train_data_set)
    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    multi_task_batch_sampler = MultiTaskBatchSampler(train_datasets,
                                                     args.batch_size,
                                                     args.mix_opt, args.ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)
    opt['answer_opt'] = decoder_opts
    opt['task_types'] = task_types
    opt['tasks_dropout_p'] = dropout_list
    opt['loss_types'] = loss_types
    opt['kd_loss_types'] = kd_loss_types
    args.label_size = ','.join([str(l) for l in nclass_list])
    logger.info(args.label_size)
    # Evaluation loaders: dev/test per test dataset.
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            task_defs.
            n_class_map[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = task_defs.task_type_map[prefix]
        pw_task = False
        if task_type == TaskType.Ranking:
            pw_task = True
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_type=task_type,
                                             data_type=data_type)
            dev_data = DataLoader(dev_data_set,
                                  batch_size=args.batch_size_eval,
                                  collate_fn=test_collater.collate_fn,
                                  pin_memory=args.cuda)
        dev_data_list.append(dev_data)
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_type=task_type,
                                              data_type=data_type)
            test_data = DataLoader(test_data_set,
                                   batch_size=args.batch_size_eval,
                                   collate_fn=test_collater.collate_fn,
                                   pin_memory=args.cuda)
        test_data_list.append(test_data)
    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)
    # div number of grad accumulation.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of step: {}'.format(args.epochs *
                                            len(multi_task_train_data)))
    logger.info('number of grad grad_accumulation step: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')
    # Load the initial encoder checkpoint; layout differs per encoder family.
    bert_model_path = args.init_checkpoint
    state_dict = None
    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            state_dict = torch.load(bert_model_path)
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error(
                'Could not find the init model!\n The parameters will be initialized randomly!'
            )
            logger.error('#' * 20)
            # fall back to a default BERT config; weights stay random
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            config['multi_gpu_on'] = opt["multi_gpu_on"]
            opt.update(config)
    elif encoder_type == EncoderModelType.ROBERTA:
        # fairseq-style checkpoint: prefix parameter names for MT-DNN
        bert_model_path = '{}/model.pt'.format(bert_model_path)
        if os.path.exists(bert_model_path):
            new_state_dict = {}
            state_dict = torch.load(bert_model_path)
            for key, val in state_dict['model'].items():
                if key.startswith('decoder.sentence_encoder'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
                elif key.startswith('classification_heads'):
                    key = 'bert.model.{}'.format(key)
                    new_state_dict[key] = val
            state_dict = {'state': new_state_dict}
    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)
    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))
    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))
    logger.info("Total number of params: {}".format(model.total_param))
    # tensorboard
    # NOTE(review): `tensorboard` is only defined when args.tensorboard is
    # set; every later use is guarded by the same flag, so this is safe.
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)
    # --- training loop -------------------------------------------------
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        start = datetime.now()
        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                args.cuda, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)
            if (model.local_updates) % (args.log_per_updates *
                                        args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                ramaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            ramaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)
            # periodic mid-epoch checkpoint
            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)
        # end-of-epoch evaluation: dev (with labels) and test (without)
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                        model,
                        dev_data,
                        metric_meta=task_defs.metric_meta_map[prefix],
                        use_cuda=args.cuda,
                        label_mapper=label_dict,
                        task_type=task_defs.task_type_map[prefix])
                for key, val in dev_metrics.items():
                    if args.tensorboard:
                        tensorboard.add_scalar('dev/{}/{}'.format(
                            dataset, key),
                                               val,
                                               global_step=epoch)
                    if isinstance(val, str):
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(
                                dataset, epoch, key, val))
                    else:
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                        model,
                        test_data,
                        metric_meta=task_defs.metric_meta_map[prefix],
                        use_cuda=args.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=task_defs.task_type_map[prefix])
                score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')
        # end-of-epoch checkpoint
        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
    if args.tensorboard:
        tensorboard.close()
def main(): logger.info('Launching the MT-DNN training') opt = vars(args) # update data dir opt['data_dir'] = data_dir batch_size = args.batch_size train_data_list = [] tasks = {} tasks_class = {} nclass_list = [] decoder_opts = [] dropout_list = [] for dataset in args.train_datasets: prefix = dataset.split('_')[0] if prefix in tasks: continue assert prefix in task_defs.n_class_map assert prefix in task_defs.data_type_map data_type = task_defs.data_type_map[prefix] nclass = task_defs.n_class_map[prefix] task_id = len(tasks) if args.mtl_opt > 0: task_id = tasks_class[nclass] if nclass in tasks_class else len( tasks_class) task_type = task_defs.task_type_map[prefix] pw_task = False if task_type == TaskType.Ranking: pw_task = True dopt = generate_decoder_opt(task_defs.enable_san_map[prefix], opt['answer_opt']) if task_id < len(decoder_opts): decoder_opts[task_id] = min(decoder_opts[task_id], dopt) else: decoder_opts.append(dopt) if prefix not in tasks: tasks[prefix] = len(tasks) if args.mtl_opt < 1: nclass_list.append(nclass) if (nclass not in tasks_class): tasks_class[nclass] = len(tasks_class) if args.mtl_opt > 0: nclass_list.append(nclass) dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p) dropout_list.append(dropout_p) train_path = os.path.join(data_dir, '{}_train.json'.format(dataset)) logger.info('Loading {} as task {}'.format(train_path, task_id)) train_data = BatchGen(BatchGen.load(train_path, True, pairwise=pw_task, maxlen=args.max_seq_len), batch_size=batch_size, dropout_w=args.dropout_w, gpu=args.cuda, task_id=task_id, maxlen=args.max_seq_len, pairwise=pw_task, data_type=data_type, task_type=task_type, encoder_type=encoder_type) train_data_list.append(train_data) opt['answer_opt'] = decoder_opts opt['tasks_dropout_p'] = dropout_list args.label_size = ','.join([str(l) for l in nclass_list]) logger.info(args.label_size) dev_data_list = [] test_data_list = [] for dataset in args.test_datasets: prefix = dataset.split('_')[0] task_id = tasks_class[ 
task_defs. n_class_map[prefix]] if args.mtl_opt > 0 else tasks[prefix] task_type = task_defs.task_type_map[prefix] pw_task = False if task_type == TaskType.Ranking: pw_task = True assert prefix in task_defs.data_type_map data_type = task_defs.data_type_map[prefix] dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset)) dev_data = None if os.path.exists(dev_path): dev_data = BatchGen(BatchGen.load(dev_path, False, pairwise=pw_task, maxlen=args.max_seq_len), batch_size=args.batch_size_eval, gpu=args.cuda, is_train=False, task_id=task_id, maxlen=args.max_seq_len, pairwise=pw_task, data_type=data_type, task_type=task_type, encoder_type=encoder_type) dev_data_list.append(dev_data) test_path = os.path.join(data_dir, '{}_test.json'.format(dataset)) test_data = None if os.path.exists(test_path): test_data = BatchGen(BatchGen.load(test_path, False, pairwise=pw_task, maxlen=args.max_seq_len), batch_size=args.batch_size_eval, gpu=args.cuda, is_train=False, task_id=task_id, maxlen=args.max_seq_len, pairwise=pw_task, data_type=data_type, task_type=task_type, encoder_type=encoder_type) test_data_list.append(test_data) logger.info('#' * 20) logger.info(opt) logger.info('#' * 20) all_iters = [iter(item) for item in train_data_list] all_lens = [len(bg) for bg in train_data_list] # div number of grad accumulation. 
num_all_batches = args.epochs * sum( all_lens) // args.grad_accumulation_step logger.info('############# Gradient Accumulation Info #############') logger.info('number of step: {}'.format(args.epochs * sum(all_lens))) logger.info('number of grad grad_accumulation step: {}'.format( args.grad_accumulation_step)) logger.info('adjusted number of step: {}'.format(num_all_batches)) logger.info('############# Gradient Accumulation Info #############') if len(train_data_list) > 1 and args.ratio > 0: num_all_batches = int(args.epochs * (len(train_data_list[0]) * (1 + args.ratio))) bert_model_path = args.init_checkpoint state_dict = None if encoder_type == EncoderModelType.BERT: if os.path.exists(bert_model_path): state_dict = torch.load(bert_model_path) config = state_dict['config'] config['attention_probs_dropout_prob'] = args.bert_dropout_p config['hidden_dropout_prob'] = args.bert_dropout_p opt.update(config) else: logger.error('#' * 20) logger.error( 'Could not find the init model!\n The parameters will be initialized randomly!' 
) logger.error('#' * 20) config = BertConfig(vocab_size_or_config_json_file=30522).to_dict() opt.update(config) elif encoder_type == EncoderModelType.ROBERTA: bert_model_path = '{}/model.pt'.format(bert_model_path) if os.path.exists(bert_model_path): new_state_dict = {} state_dict = torch.load(bert_model_path) for key, val in state_dict['model'].items(): if key.startswith('decoder.sentence_encoder'): key = 'bert.model.{}'.format(key) new_state_dict[key] = val elif key.startswith('classification_heads'): key = 'bert.model.{}'.format(key) new_state_dict[key] = val state_dict = {'state': new_state_dict} model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches) if args.resume and args.model_ckpt: logger.info('loading model from {}'.format(args.model_ckpt)) model.load(args.model_ckpt) #### model meta str headline = '############# Model Arch of MT-DNN #############' ### print network logger.info('\n{}\n{}\n'.format(headline, model.network)) # dump config config_file = os.path.join(output_dir, 'config.json') with open(config_file, 'w', encoding='utf-8') as writer: writer.write('{}\n'.format(json.dumps(opt))) writer.write('\n{}\n{}\n'.format(headline, model.network)) logger.info("Total number of params: {}".format(model.total_param)) for epoch in range(0, args.epochs): logger.warning('At epoch {}'.format(epoch)) for train_data in train_data_list: train_data.reset() start = datetime.now() all_indices = [] if len(train_data_list) > 1 and args.ratio > 0: main_indices = [0] * len(train_data_list[0]) extra_indices = [] for i in range(1, len(train_data_list)): extra_indices += [i] * len(train_data_list[i]) random_picks = int( min(len(train_data_list[0]) * args.ratio, len(extra_indices))) extra_indices = np.random.choice(extra_indices, random_picks, replace=False) if args.mix_opt > 0: extra_indices = extra_indices.tolist() random.shuffle(extra_indices) all_indices = extra_indices + main_indices else: all_indices = main_indices + extra_indices.tolist() else: 
for i in range(1, len(train_data_list)): all_indices += [i] * len(train_data_list[i]) if args.mix_opt > 0: random.shuffle(all_indices) all_indices += [0] * len(train_data_list[0]) if args.mix_opt < 1: random.shuffle(all_indices) for i in range(len(all_indices)): task_id = all_indices[i] batch_meta, batch_data = next(all_iters[task_id]) model.update(batch_meta, batch_data) if (model.local_updates) % (args.log_per_updates * args.grad_accumulation_step ) == 0 or model.local_updates == 1: ramaining_time = str((datetime.now() - start) / (i + 1) * (len(all_indices) - i - 1)).split('.')[0] logger.info( 'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]' .format(task_id, model.updates, model.train_loss.avg, ramaining_time)) if args.save_per_updates_on and ( (model.local_updates) % (args.save_per_updates * args.grad_accumulation_step) == 0): model_file = os.path.join( output_dir, 'model_{}_{}.pt'.format(epoch, model.updates)) logger.info('Saving mt-dnn model to {}'.format(model_file)) model.save(model_file) for idx, dataset in enumerate(args.test_datasets): prefix = dataset.split('_')[0] label_dict = task_defs.global_map.get(prefix, None) dev_data = dev_data_list[idx] if dev_data is not None: dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model( model, dev_data, metric_meta=task_defs.metric_meta_map[prefix], use_cuda=args.cuda) for key, val in dev_metrics.items(): logger.warning( 'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format( dataset, epoch, key, val)) score_file = os.path.join( output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch)) results = { 'metrics': dev_metrics, 'predictions': dev_predictions, 'uids': dev_ids, 'scores': scores } dump(score_file, results) official_score_file = os.path.join( output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch)) submit(official_score_file, results, label_dict) # test eval test_data = test_data_list[idx] if test_data is not None: test_metrics, test_predictions, scores, golds, test_ids = eval_model( 
model, test_data, metric_meta=task_defs.metric_meta_map[prefix], use_cuda=args.cuda, with_label=False) score_file = os.path.join( output_dir, '{}_test_scores_{}.json'.format(dataset, epoch)) results = { 'metrics': test_metrics, 'predictions': test_predictions, 'uids': test_ids, 'scores': scores } dump(score_file, results) official_score_file = os.path.join( output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch)) submit(official_score_file, results, label_dict) logger.info('[new test scores saved.]') model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch)) model.save(model_file)
def main():
    """Extract contextual embeddings from a trained MT-DNN checkpoint.

    Parses the command-line configuration, preprocesses the input file,
    restores the model from ``args.checkpoint`` and writes, for every input
    sample, the hidden states of the requested encoder layers to
    ``args.foutput`` — one JSON object per line, keyed by layer index plus
    the sample ``uid``.

    Returns early (after logging) if ``args.checkpoint`` does not exist,
    since randomly initialized weights would make the features meaningless.
    """
    parser = argparse.ArgumentParser()
    model_config(parser)
    set_config(parser)
    train_config(parser)
    args = parser.parse_args()
    encoder_type = args.encoder_type
    # Encoder layers whose hidden states will be dumped, e.g. "--layers 10,11".
    layer_indexes = [int(x) for x in args.layers.split(",")]
    set_environment(args.seed)

    # Preprocess the raw input; the processed samples are staged in a
    # temporary file so SingleTaskDataset can stream them back.
    data, is_single_sentence = process_data(args)
    fout_temp = "{}.tmp".format(args.finput)
    dump_data(data, fout_temp)
    collater = Collater(is_train=False, encoder_type=encoder_type)
    dataset = SingleTaskDataset(
        fout_temp,
        False,
        maxlen=args.max_seq_length,
    )
    batcher = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=collater.collate_fn,
        pin_memory=args.cuda,
    )
    opt = vars(args)

    # Load the trained model; without a checkpoint the extracted features
    # would come from random weights, so bail out instead.
    if os.path.exists(args.checkpoint):
        state_dict = torch.load(args.checkpoint)
        config = state_dict["config"]
        config["dump_feature"] = True
        opt.update(config)
    else:
        logger.error("#" * 20)
        logger.error(
            "Could not find the init model!\n The parameters will be initialized randomly!"
        )
        logger.error("#" * 20)
        return

    num_all_batches = len(batcher)
    model = MTDNNModel(opt, state_dict=state_dict, num_train_step=num_all_batches)
    if args.cuda:
        model.cuda()

    # uid -> {layer_index: serialized embedding} for every sample seen.
    features_dict = {}
    for batch_meta, batch_data in batcher:
        batch_meta, batch_data = Collater.patch_data(args.cuda, batch_meta, batch_data)
        all_encoder_layers, _ = model.extract(batch_meta, batch_data)
        embeddings = [
            all_encoder_layers[idx].detach().cpu().numpy() for idx in layer_indexes
        ]
        uids = batch_meta["uids"]
        masks = batch_data[batch_meta["mask"]].detach().cpu().numpy().tolist()
        for idx, uid in enumerate(uids):
            # slen = count of non-masked (non-padding) positions; truncate
            # each layer's embedding to the real sequence length.
            slen = sum(masks[idx])
            features = {}
            for yidx, layer in enumerate(layer_indexes):
                features[layer] = str(embeddings[yidx][idx][:slen].tolist())
            features_dict[uid] = features

    # All batches have been consumed; remove the staged temp file so a
    # run does not leak a "<finput>.tmp" alongside the input.
    if os.path.exists(fout_temp):
        os.remove(fout_temp)

    # Persist one JSON record per input sample, in input order.
    with open(args.foutput, "w", encoding="utf-8") as writer:
        for sample in data:
            uid = sample["uid"]
            feature = features_dict[uid]
            feature["uid"] = uid
            writer.write("{}\n".format(json.dumps(feature)))