def main():
    """Run MT-DNN multi-task training, or dump encodings in ``args.encode_mode``.

    Takes no parameters; everything is read from module-level state:
    ``args``, ``logger``, ``data_dir``, ``output_dir``, ``task_defs`` and
    ``encoder_type``.

    Flow:
      1. Pick the torch device (distributed init when ``args.local_rank > -1``,
         otherwise CUDA if available, otherwise CPU).
      2. Build one ``SingleTaskDataset`` per distinct task prefix (the text
         before the first ``_``) of ``args.train_datasets`` and wrap them in a
         multi-task ``DataLoader``.
      3. Build dev/test loaders for ``args.test_datasets``; a missing file
         yields ``None`` in the corresponding list slot.
      4. Resolve the encoder config (from a local checkpoint or via
         ``from_pretrained``), build ``MTDNNModel`` and dump the options to
         ``config.json`` in ``output_dir``.
      5. In encode mode, save one ``<dataset>_encoding.pt`` per test set and
         return. Otherwise train for ``args.epochs`` epochs, evaluating and
         checkpointing along the way.

    Raises:
        ValueError: if ``args.init_checkpoint`` exists on disk but
            ``encoder_type`` is not one of BERT/DEBERTA/ELECTRA/ROBERTA/XLM,
            or if it does not exist and ``opt['encoder_type']`` is not a
            known ``EncoderModelType`` value.
        KeyError: if a test dataset's prefix was not among the training
            prefixes (its task id is looked up in the training task table).
    """
    # set up dist
    if args.local_rank > -1:
        device = initialize_distributed(args)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # vars() returns the attribute dict of ``args``; entries written to
    # ``opt`` below are what gets logged and dumped to config.json.
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    print_message(logger, 'Launching the MT-DNN training')

    tasks = {}  # task prefix -> task id (assigned in order of first appearance)
    task_def_list = []
    printable = args.local_rank in [-1, 0]

    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            # only the first dataset of each task prefix is loaded
            continue
        task_id = len(tasks)
        tasks[prefix] = task_id
        task_def = task_defs.get_task_def(prefix)
        task_def_list.append(task_def)
        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        print_message(logger,
                      'Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_def=task_def,
                                           printable=printable)
        train_datasets.append(train_data_set)

    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type,
                              soft_label=args.mkd_opt > 0,
                              max_seq_len=args.max_seq_len,
                              do_padding=args.do_padding)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    if args.local_rank != -1:
        multi_task_batch_sampler = DistMultiTaskBatchSampler(
            train_datasets,
            args.batch_size,
            args.mix_opt,
            args.ratio,
            rank=args.local_rank,
            world_size=args.world_size)
    else:
        multi_task_batch_sampler = MultiTaskBatchSampler(
            train_datasets,
            args.batch_size,
            args.mix_opt,
            args.ratio,
            bin_on=args.bin_on,
            bin_size=args.bin_size,
            bin_grow_ratio=args.bin_grow_ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['task_def_list'] = task_def_list

    test_collater = Collater(is_train=False,
                             encoder_type=encoder_type,
                             max_seq_len=args.max_seq_len,
                             do_padding=args.do_padding)

    def _build_eval_loader(path, task_id, task_def):
        """Return an evaluation DataLoader for ``path``, or None if absent."""
        if not os.path.exists(path):
            return None
        data_set = SingleTaskDataset(path,
                                     False,
                                     maxlen=args.max_seq_len,
                                     task_id=task_id,
                                     task_def=task_def,
                                     printable=printable)
        if args.local_rank != -1:
            data_set = DistTaskDataset(data_set, task_id)
            batch_sampler = DistSingleTaskBatchSampler(
                data_set,
                args.batch_size_eval,
                rank=args.local_rank,
                world_size=args.world_size)
            return DataLoader(data_set,
                              batch_sampler=batch_sampler,
                              collate_fn=test_collater.collate_fn,
                              pin_memory=args.cuda)
        return DataLoader(data_set,
                          batch_size=args.batch_size_eval,
                          collate_fn=test_collater.collate_fn,
                          pin_memory=args.cuda)

    # Both lists stay index-aligned with args.test_datasets (None = no file).
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_def = task_defs.get_task_def(prefix)
        task_id = tasks[prefix]
        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data_list.append(_build_eval_loader(dev_path, task_id, task_def))
        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data_list.append(_build_eval_loader(test_path, task_id, task_def))

    print_message(logger, '#' * 20)
    print_message(logger, opt)
    print_message(logger, '#' * 20)

    # div number of grad accumulation.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    print_message(logger,
                  '############# Gradient Accumulation Info #############')
    print_message(
        logger,
        'number of step: {}'.format(args.epochs * len(multi_task_train_data)))
    print_message(
        logger, 'number of grad grad_accumulation step: {}'.format(
            args.grad_accumulation_step))
    print_message(logger,
                  'adjusted number of step: {}'.format(num_all_batches))
    print_message(logger,
                  '############# Gradient Accumulation Info #############')

    init_model = args.init_checkpoint
    state_dict = None

    if os.path.exists(init_model):
        if encoder_type in (EncoderModelType.BERT, EncoderModelType.DEBERTA,
                            EncoderModelType.ELECTRA):
            state_dict = torch.load(init_model, map_location=device)
            config = state_dict['config']
        elif encoder_type in (EncoderModelType.ROBERTA, EncoderModelType.XLM):
            model_path = '{}/model.pt'.format(init_model)
            state_dict = torch.load(model_path, map_location=device)
            arch = state_dict['args'].arch
            arch = arch.replace('_', '-')
            if encoder_type == EncoderModelType.XLM:
                arch = "xlm-{}".format(arch)
            # convert model arch
            from data_utils.roberta_utils import update_roberta_keys
            from data_utils.roberta_utils import patch_name_dict
            state = update_roberta_keys(
                state_dict['model'], nlayer=state_dict['args'].encoder_layers)
            state = patch_name_dict(state)
            literal_encoder_type = EncoderModelType(
                opt['encoder_type']).name.lower()
            config_class, _, _ = MODEL_CLASSES[literal_encoder_type]
            config = config_class.from_pretrained(arch).to_dict()
            state_dict = {'state': state}
        else:
            # Without this branch ``config`` would be unbound and the code
            # below would fail with an unrelated UnboundLocalError.
            raise ValueError(
                "loading a local init checkpoint is not supported for "
                "encoder_type {}".format(encoder_type))
    else:
        if opt['encoder_type'] not in EncoderModelType._value2member_map_:
            raise ValueError("encoder_type is out of pre-defined types")
        literal_encoder_type = EncoderModelType(
            opt['encoder_type']).name.lower()
        config_class, _, _ = MODEL_CLASSES[literal_encoder_type]
        config = config_class.from_pretrained(init_model).to_dict()

    config['attention_probs_dropout_prob'] = args.bert_dropout_p
    config['hidden_dropout_prob'] = args.bert_dropout_p
    config['multi_gpu_on'] = opt["multi_gpu_on"]
    if args.num_hidden_layers > 0:
        config['num_hidden_layers'] = args.num_hidden_layers
    opt.update(config)

    model = MTDNNModel(opt,
                       device=device,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        print_message(logger, 'loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    print_message(logger, '\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    print_message(logger,
                  "Total number of params: {}".format(model.total_param))

    # tensorboard
    tensorboard = None
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    if args.encode_mode:
        for idx, dataset in enumerate(args.test_datasets):
            test_data = test_data_list[idx]
            if test_data is None:
                # no <dataset>_test.json was found, so there is nothing to encode
                print_message(
                    logger,
                    'No test data for {}; skipping encoding'.format(dataset))
                continue
            with torch.no_grad():
                encoding = extract_encoding(model,
                                            test_data,
                                            use_cuda=args.cuda)
            torch.save(
                encoding,
                os.path.join(output_dir, '{}_encoding.pt'.format(dataset)))
        # the early return must not leave the summary writer open
        if tensorboard is not None:
            tensorboard.close()
        return

    for epoch in range(0, args.epochs):
        print_message(logger, 'At epoch {}'.format(epoch), level=1)
        start = datetime.now()

        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                device, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)

            if (model.updates) % (
                    args.log_per_updates) == 0 or model.updates == 1:
                # average time per batch so far * batches left in this epoch
                remaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                if args.adv_train and args.debug:
                    debug_info = ' adv loss[%.5f] emb val[%.8f] eff_perturb[%.8f] ' % (
                        model.adv_loss.avg, model.emb_val.avg,
                        model.eff_perturb.avg)
                else:
                    debug_info = ' '
                print_message(
                    logger,
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}]{3}remaining[{4}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            debug_info, remaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)

            # mid-epoch evaluation + checkpoint, single process or rank 0 only
            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step)
                    == 0) and args.local_rank in [-1, 0]:
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                evaluation(model,
                           args.test_datasets,
                           dev_data_list,
                           task_defs,
                           output_dir,
                           epoch,
                           n_updates=args.save_per_updates,
                           with_label=True,
                           tensorboard=tensorboard,
                           glue_format_on=args.glue_format_on,
                           test_on=False,
                           device=device,
                           logger=logger)
                evaluation(model,
                           args.test_datasets,
                           test_data_list,
                           task_defs,
                           output_dir,
                           epoch,
                           n_updates=args.save_per_updates,
                           with_label=False,
                           tensorboard=tensorboard,
                           glue_format_on=args.glue_format_on,
                           test_on=True,
                           device=device,
                           logger=logger)
                print_message(logger,
                              'Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        # end-of-epoch evaluation on dev, then test
        evaluation(model,
                   args.test_datasets,
                   dev_data_list,
                   task_defs,
                   output_dir,
                   epoch,
                   with_label=True,
                   tensorboard=tensorboard,
                   glue_format_on=args.glue_format_on,
                   test_on=False,
                   device=device,
                   logger=logger)
        evaluation(model,
                   args.test_datasets,
                   test_data_list,
                   task_defs,
                   output_dir,
                   epoch,
                   with_label=False,
                   tensorboard=tensorboard,
                   glue_format_on=args.glue_format_on,
                   test_on=True,
                   device=device,
                   logger=logger)
        print_message(logger,
                      '[new test scores at {} saved.]'.format(epoch))
        if args.local_rank in [-1, 0]:
            model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
            model.save(model_file)

    if args.tensorboard:
        tensorboard.close()
def main():
    """Train MT-DNN with ``BatchGen`` loaders, keeping the best dev F1-macro model.

    Takes no parameters; reads the module-level ``args``, ``logger``,
    ``data_dir``, ``output_dir``, ``tasks_config`` and the ``DATA_META`` /
    ``DATA_TYPE`` / ``TASK_TYPE`` / ``GLOBAL_MAP`` tables.

    Flow:
      1. Build one training ``BatchGen`` per distinct task prefix (the text
         before the first ``_``) of ``args.train_datasets``. With
         ``args.mtl_opt > 0`` tasks that share a class count share a task id.
      2. Build dev/test ``BatchGen`` objects for ``args.test_datasets``
         (``None`` where the file is missing).
      3. Load the init checkpoint. If it is missing, ``output_dir`` is removed
         (best effort) and the process exits with status 1.
      4. Each epoch: build the per-batch task schedule, train on it, evaluate
         every dev/test set, and when the mean dev ``F1_macro`` improves,
         dump result files and overwrite ``model.pt`` in ``output_dir``.

    Raises:
        AssertionError: if a training prefix is missing from ``DATA_META`` or
            ``DATA_TYPE``.
        SystemExit: if ``args.init_checkpoint`` does not exist.
    """
    import sys  # local import: only needed for the fatal-exit path below

    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}  # task prefix -> index in order of first appearance
    tasks_class = {}  # class count -> shared task id (used when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        assert prefix in DATA_META
        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]
        nclass = DATA_META[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = TASK_TYPE[prefix]
        pw_task = prefix in opt['pw_tasks']

        dopt = generate_decoder_opt(prefix, opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        # ``prefix`` is always new here because of the ``continue`` above.
        tasks[prefix] = len(tasks)
        if args.mtl_opt < 1:
            nclass_list.append(nclass)

        if nclass not in tasks_class:
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)

        dropout_p = args.dropout_p
        if tasks_config and prefix in tasks_config:
            dropout_p = tasks_config[prefix]
        dropout_list.append(dropout_p)

        # e.g. train_data_ratio=10 selects "<dataset>_train10p.json"
        train_data_ratio_string = (str(args.train_data_ratio) + "p"
                                   if args.train_data_ratio < 100 else "")
        train_path = os.path.join(
            data_dir,
            '{0}_train{1}.json'.format(dataset, train_data_ratio_string))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path,
                                            True,
                                            pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join(str(n) for n in nclass_list)
    logger.info(args.label_size)

    # Both lists stay index-aligned with args.test_datasets (None = no file).
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[
            DATA_META[prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = TASK_TYPE[prefix]
        pw_task = prefix in opt['pw_tasks']

        assert prefix in DATA_TYPE
        data_type = DATA_TYPE[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path,
                                              False,
                                              pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda,
                                is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path,
                                               False,
                                               pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda,
                                 is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # The iterators are created once and reused across epochs; each epoch
    # calls reset() on the BatchGen objects before drawing from them again.
    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]

    subsample = len(train_data_list) > 1 and (
        args.ratio > 0 or args.reduce_first_dataset_ratio > 0)

    def _sampled_counts():
        """Return (main, extra) batch counts used per epoch when subsampling."""
        first_len = len(train_data_list[0])
        extra_total = sum(len(bg) for bg in train_data_list[1:])
        if args.reduce_first_dataset_ratio > 0:
            n_main = int(args.reduce_first_dataset_ratio * first_len)
        else:
            n_main = first_len
        if args.ratio > 0:
            n_extra = int(min(first_len * args.ratio, extra_total))
        else:
            n_extra = extra_total
        return n_main, n_extra

    # The scheduled number of steps must equal the number of model.update()
    # calls the epoch loop will really make, so derive it from the same
    # counts (including reduce_first_dataset_ratio and the clamp on extras).
    batches_per_epoch = sum(_sampled_counts()) if subsample else sum(all_lens)
    num_all_batches = args.epochs * batches_per_epoch

    model_path = args.init_checkpoint
    state_dict = None

    if os.path.exists(model_path):
        state_dict = torch.load(model_path, map_location='cpu')
        config = state_dict['config']
        config['attention_probs_dropout_prob'] = args.bert_dropout_p
        config['hidden_dropout_prob'] = args.bert_dropout_p
        opt.update(config)
    else:
        logger.error('#' * 20)
        logger.error('Could not find the init model!\n Exit application!')
        logger.error('#' * 20)
        # Best-effort cleanup of the output directory; failure to remove it
        # must not mask the real error, so it is only reported.
        try:
            shutil.rmtree(output_dir)
        except Exception as e:
            print(e)
        # sys.exit instead of the ``exit`` helper, which only exists when the
        # ``site`` module has been loaded.
        sys.exit(1)

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    ####model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ###print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    if args.freeze_layers > 0:
        model.network.freeze_layers(args.freeze_layers)

    if args.cuda:
        model.cuda()

    best_F1_macro = -1.0
    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()

        # all_indices[k] is the index of the training set that supplies the
        # k-th batch of this epoch.
        all_indices = []
        if subsample:
            n_main, n_extra = _sampled_counts()
            main_indices = [0] * n_main
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            if args.ratio > 0:
                extra_indices = np.random.choice(extra_indices,
                                                 n_extra,
                                                 replace=False).tolist()
            if args.mix_opt > 0:
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices
            logger.info(
                "Main batches loaded (first dataset in list): {}".format(
                    len(main_indices)))
            logger.info(
                "Extra batches loaded (all except first dataset in list): {}".
                format(len(extra_indices)))
        else:
            # Every training set contributes as many entries as it has
            # batches, so larger sets are trained on more often.
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i, task_id in enumerate(all_indices):
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.updates
                ) % args.log_per_updates == 0 or model.updates == 1:
                # average time per batch so far * batches left in this epoch
                remaining_time = str((datetime.now() - start) / (i + 1) *
                                     (len(all_indices) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))

        temp_dev_F1s = []
        dev_dump_list = []
        test_dump_list = []
        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = GLOBAL_MAP.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids, premises, hypotheses = eval_model(
                    model, dev_data, dataset=prefix, use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    # dict-valued metrics cannot be formatted with {:.3f}
                    if not isinstance(val, dict):
                        logger.warning(
                            "Task {0} -- epoch {1} -- Dev {2}: {3:.3f}".format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores,
                    'golds': golds,
                    'premises': premises,
                    'hypotheses': hypotheses
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

                # for checkpoint
                temp_dev_F1s.append(dev_metrics['F1_macro'])
                dev_dump_list.append({
                    "output_dir": output_dir,
                    "dev_metrics": dev_metrics,
                    "dev_predictions": dev_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids, premises, hypotheses = eval_model(
                    model,
                    test_data,
                    dataset=prefix,
                    use_cuda=args.cuda,
                    with_label=True)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores,
                    'golds': golds,
                    'premises': premises,
                    'hypotheses': hypotheses
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

                # for checkpoint
                test_dump_list.append({
                    "output_dir": output_dir,
                    "test_metrics": test_metrics,
                    "test_predictions": test_predictions,
                    "golds": golds,
                    "opt": opt,
                    "dataset": dataset
                })

        # save checkpoint
        if temp_dev_F1s:
            avg_dev_F1 = np.average(temp_dev_F1s)
            save_now = avg_dev_F1 > best_F1_macro
            if save_now:
                print(
                    "Save new model! Current best F1 macro over all dev sets: "
                    + "{0:.2f}".format(best_F1_macro) + ". New: " +
                    "{0:.2f}".format(avg_dev_F1))
                best_F1_macro = avg_dev_F1
        else:
            # np.average([]) is NaN and NaN > x is always False, which would
            # mean no checkpoint is ever written when there is no dev set.
            logger.warning(
                'No dev F1_macro available at epoch {}; saving the model '
                'without comparison.'.format(epoch))
            save_now = True

        if save_now:
            # override current dump file
            for entry in dev_dump_list:
                dump_result_files(entry['dataset'])(
                    entry['output_dir'], epoch, entry['dev_metrics'],
                    str(entry['dev_predictions']), str(entry['golds']), "dev",
                    entry['opt'], entry['dataset'])
            for entry in test_dump_list:
                dump_result_files(entry['dataset'])(
                    entry['output_dir'], epoch, entry['test_metrics'],
                    str(entry['test_predictions']), str(entry['golds']),
                    "test", entry['opt'], entry['dataset'])

            # save model
            model_file = os.path.join(output_dir, 'model.pt')
            model.save(model_file)
def main():
    """Train MT-DNN with ``BatchGen`` loaders and gradient accumulation.

    Takes no parameters; reads the module-level ``args``, ``logger``,
    ``data_dir``, ``output_dir``, ``task_defs`` and ``encoder_type``.

    Flow:
      1. Build one training ``BatchGen`` per distinct task prefix (the text
         before the first ``_``) of ``args.train_datasets``. With
         ``args.mtl_opt > 0`` tasks that share a class count share a task id.
      2. Build dev/test ``BatchGen`` objects for ``args.test_datasets``
         (``None`` where the file is missing).
      3. For a BERT encoder, load the config and weights from
         ``args.init_checkpoint``, or fall back to a default ``BertConfig``
         with no pretrained state when the file is missing.
      4. Each epoch: build the per-batch task schedule, train on it
         (optionally checkpointing every ``save_per_updates`` updates),
         score every dev/test set and save ``model_<epoch>.pt``.

    Raises:
        AssertionError: if a prefix is missing from ``task_defs.n_class_map``
            or ``task_defs.data_type_map``.
    """
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir
    batch_size = args.batch_size
    train_data_list = []
    tasks = {}  # task prefix -> index in order of first appearance
    tasks_class = {}  # class count -> shared task id (used when mtl_opt > 0)
    nclass_list = []
    decoder_opts = []
    dropout_list = []

    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            continue
        assert prefix in task_defs.n_class_map
        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]
        nclass = task_defs.n_class_map[prefix]
        task_id = len(tasks)
        if args.mtl_opt > 0:
            task_id = tasks_class[nclass] if nclass in tasks_class else len(
                tasks_class)

        task_type = task_defs.task_type_map[prefix]
        pw_task = task_type == TaskType.Ranking

        dopt = generate_decoder_opt(task_defs.enable_san_map[prefix],
                                    opt['answer_opt'])
        if task_id < len(decoder_opts):
            decoder_opts[task_id] = min(decoder_opts[task_id], dopt)
        else:
            decoder_opts.append(dopt)

        # ``prefix`` is always new here because of the ``continue`` above.
        tasks[prefix] = len(tasks)
        if args.mtl_opt < 1:
            nclass_list.append(nclass)

        if nclass not in tasks_class:
            tasks_class[nclass] = len(tasks_class)
            if args.mtl_opt > 0:
                nclass_list.append(nclass)

        dropout_p = task_defs.dropout_p_map.get(prefix, args.dropout_p)
        dropout_list.append(dropout_p)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data = BatchGen(BatchGen.load(train_path,
                                            True,
                                            pairwise=pw_task,
                                            maxlen=args.max_seq_len),
                              batch_size=batch_size,
                              dropout_w=args.dropout_w,
                              gpu=args.cuda,
                              task_id=task_id,
                              maxlen=args.max_seq_len,
                              pairwise=pw_task,
                              data_type=data_type,
                              task_type=task_type,
                              encoder_type=encoder_type)
        train_data_list.append(train_data)

    opt['answer_opt'] = decoder_opts
    opt['tasks_dropout_p'] = dropout_list

    args.label_size = ','.join(str(n) for n in nclass_list)
    logger.info(args.label_size)

    # Both lists stay index-aligned with args.test_datasets (None = no file).
    dev_data_list = []
    test_data_list = []
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_id = tasks_class[task_defs.n_class_map[
            prefix]] if args.mtl_opt > 0 else tasks[prefix]
        task_type = task_defs.task_type_map[prefix]
        pw_task = task_type == TaskType.Ranking

        assert prefix in task_defs.data_type_map
        data_type = task_defs.data_type_map[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data = BatchGen(BatchGen.load(dev_path,
                                              False,
                                              pairwise=pw_task,
                                              maxlen=args.max_seq_len),
                                batch_size=args.batch_size_eval,
                                gpu=args.cuda,
                                is_train=False,
                                task_id=task_id,
                                maxlen=args.max_seq_len,
                                pairwise=pw_task,
                                data_type=data_type,
                                task_type=task_type,
                                encoder_type=encoder_type)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data = BatchGen(BatchGen.load(test_path,
                                               False,
                                               pairwise=pw_task,
                                               maxlen=args.max_seq_len),
                                 batch_size=args.batch_size_eval,
                                 gpu=args.cuda,
                                 is_train=False,
                                 task_id=task_id,
                                 maxlen=args.max_seq_len,
                                 pairwise=pw_task,
                                 data_type=data_type,
                                 task_type=task_type,
                                 encoder_type=encoder_type)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # The iterators are created once and reused across epochs; each epoch
    # calls reset() on the BatchGen objects before drawing from them again.
    all_iters = [iter(item) for item in train_data_list]
    all_lens = [len(bg) for bg in train_data_list]

    # Number of model.update() calls per epoch. This mirrors the schedule
    # built in the epoch loop: with ratio > 0 only a clamped sample of the
    # non-primary batches is used.
    subsample = len(train_data_list) > 1 and args.ratio > 0
    if subsample:
        batches_per_epoch = all_lens[0] + int(
            min(all_lens[0] * args.ratio, sum(all_lens[1:])))
    else:
        batches_per_epoch = sum(all_lens)
    total_batches = args.epochs * batches_per_epoch

    # div number of grad accumulation. Applied to every schedule, including
    # the ratio-subsampled one.
    num_all_batches = total_batches // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of step: {}'.format(total_batches))
    logger.info('number of grad grad_accumulation step: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    bert_model_path = args.init_checkpoint
    state_dict = None

    if encoder_type == EncoderModelType.BERT:
        if os.path.exists(bert_model_path):
            # Map onto CPU when CUDA is not requested so a GPU-saved
            # checkpoint can still be loaded; unchanged when args.cuda is set.
            state_dict = torch.load(
                bert_model_path,
                map_location=None if args.cuda else 'cpu')
            config = state_dict['config']
            config['attention_probs_dropout_prob'] = args.bert_dropout_p
            config['hidden_dropout_prob'] = args.bert_dropout_p
            opt.update(config)
        else:
            logger.error('#' * 20)
            logger.error(
                'Could not find the init model!\n The parameters will be initialized randomly!'
            )
            logger.error('#' * 20)
            config = BertConfig(vocab_size_or_config_json_file=30522).to_dict()
            opt.update(config)

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        for train_data in train_data_list:
            train_data.reset()
        start = datetime.now()

        # all_indices[k] is the index of the training set that supplies the
        # k-th batch of this epoch.
        all_indices = []
        if subsample:
            main_indices = [0] * len(train_data_list[0])
            extra_indices = []
            for i in range(1, len(train_data_list)):
                extra_indices += [i] * len(train_data_list[i])
            random_picks = int(
                min(len(train_data_list[0]) * args.ratio, len(extra_indices)))
            extra_indices = np.random.choice(extra_indices,
                                             random_picks,
                                             replace=False).tolist()
            if args.mix_opt > 0:
                random.shuffle(extra_indices)
                all_indices = extra_indices + main_indices
            else:
                all_indices = main_indices + extra_indices
        else:
            for i in range(1, len(train_data_list)):
                all_indices += [i] * len(train_data_list[i])
            if args.mix_opt > 0:
                random.shuffle(all_indices)
            all_indices += [0] * len(train_data_list[0])
        if args.mix_opt < 1:
            random.shuffle(all_indices)

        for i, task_id in enumerate(all_indices):
            batch_meta, batch_data = next(all_iters[task_id])
            model.update(batch_meta, batch_data)
            if (model.local_updates) % (args.log_per_updates *
                                        args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                # average time per batch so far * batches left in this epoch
                remaining_time = str((datetime.now() - start) / (i + 1) *
                                     (len(all_indices) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            label_dict = task_defs.global_map.get(prefix, None)
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                    model,
                    dev_data,
                    metric_meta=task_defs.metric_meta_map[prefix],
                    use_cuda=args.cuda)
                for key, val in dev_metrics.items():
                    logger.warning(
                        'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                            dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                    model,
                    test_data,
                    metric_meta=task_defs.metric_meta_map[prefix],
                    use_cuda=args.cuda,
                    with_label=False)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                official_score_file = os.path.join(
                    output_dir, '{}_test_scores_{}.tsv'.format(dataset, epoch))
                submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)
def main():
    """Train MT-DNN with ``DataLoader`` pipelines, or dump encodings in encode mode.

    Takes no parameters; reads the module-level ``args``, ``logger``,
    ``data_dir``, ``output_dir``, ``task_defs`` and ``encoder_type``.

    Flow:
      1. Build one ``SingleTaskDataset`` per distinct task prefix (the text
         before the first ``_``) of ``args.train_datasets`` and wrap them in a
         multi-task ``DataLoader``.
      2. Build dev/test loaders for ``args.test_datasets``; a missing file
         yields ``None`` in the corresponding list slot.
      3. Take the encoder config from ``args.init_checkpoint`` when that path
         exists, otherwise from ``from_pretrained``; build ``MTDNNModel`` and
         dump the options to ``config.json`` in ``output_dir``.
      4. In encode mode, save one ``<dataset>_encoding.pt`` per test set and
         return. Otherwise, each epoch: train, score every dev/test set and
         save ``model_<epoch>.pt``.

    Raises:
        ValueError: if the init checkpoint path does not exist and
            ``opt['encoder_type']`` is not a known ``EncoderModelType`` value.
        KeyError: if a test dataset's prefix was not among the training
            prefixes (its task id is looked up in the training task table).
    """
    logger.info('Launching the MT-DNN training')
    opt = vars(args)
    # update data dir
    opt['data_dir'] = data_dir

    tasks = {}  # task prefix -> task id (assigned in order of first appearance)
    task_def_list = []

    train_datasets = []
    for dataset in args.train_datasets:
        prefix = dataset.split('_')[0]
        if prefix in tasks:
            # only the first dataset of each task prefix is loaded
            continue
        task_id = len(tasks)
        tasks[prefix] = task_id
        task_def = task_defs.get_task_def(prefix)
        task_def_list.append(task_def)

        train_path = os.path.join(data_dir, '{}_train.json'.format(dataset))
        logger.info('Loading {} as task {}'.format(train_path, task_id))
        train_data_set = SingleTaskDataset(train_path,
                                           True,
                                           maxlen=args.max_seq_len,
                                           task_id=task_id,
                                           task_def=task_def)
        train_datasets.append(train_data_set)

    train_collater = Collater(dropout_w=args.dropout_w,
                              encoder_type=encoder_type,
                              soft_label=args.mkd_opt > 0)
    multi_task_train_dataset = MultiTaskDataset(train_datasets)
    multi_task_batch_sampler = MultiTaskBatchSampler(train_datasets,
                                                     args.batch_size,
                                                     args.mix_opt, args.ratio)
    multi_task_train_data = DataLoader(multi_task_train_dataset,
                                       batch_sampler=multi_task_batch_sampler,
                                       collate_fn=train_collater.collate_fn,
                                       pin_memory=args.cuda)

    opt['task_def_list'] = task_def_list

    # Both lists stay index-aligned with args.test_datasets (None = no file).
    dev_data_list = []
    test_data_list = []
    test_collater = Collater(is_train=False, encoder_type=encoder_type)
    for dataset in args.test_datasets:
        prefix = dataset.split('_')[0]
        task_def = task_defs.get_task_def(prefix)
        task_id = tasks[prefix]

        dev_path = os.path.join(data_dir, '{}_dev.json'.format(dataset))
        dev_data = None
        if os.path.exists(dev_path):
            dev_data_set = SingleTaskDataset(dev_path,
                                             False,
                                             maxlen=args.max_seq_len,
                                             task_id=task_id,
                                             task_def=task_def)
            dev_data = DataLoader(dev_data_set,
                                  batch_size=args.batch_size_eval,
                                  collate_fn=test_collater.collate_fn,
                                  pin_memory=args.cuda)
        dev_data_list.append(dev_data)

        test_path = os.path.join(data_dir, '{}_test.json'.format(dataset))
        test_data = None
        if os.path.exists(test_path):
            test_data_set = SingleTaskDataset(test_path,
                                              False,
                                              maxlen=args.max_seq_len,
                                              task_id=task_id,
                                              task_def=task_def)
            test_data = DataLoader(test_data_set,
                                   batch_size=args.batch_size_eval,
                                   collate_fn=test_collater.collate_fn,
                                   pin_memory=args.cuda)
        test_data_list.append(test_data)

    logger.info('#' * 20)
    logger.info(opt)
    logger.info('#' * 20)

    # div number of grad accumulation.
    num_all_batches = args.epochs * len(
        multi_task_train_data) // args.grad_accumulation_step
    logger.info('############# Gradient Accumulation Info #############')
    logger.info('number of step: {}'.format(args.epochs *
                                            len(multi_task_train_data)))
    logger.info('number of grad grad_accumulation step: {}'.format(
        args.grad_accumulation_step))
    logger.info('adjusted number of step: {}'.format(num_all_batches))
    logger.info('############# Gradient Accumulation Info #############')

    init_model = args.init_checkpoint
    state_dict = None

    if os.path.exists(init_model):
        # Map onto CPU when CUDA is not requested so a GPU-saved checkpoint
        # can still be loaded; unchanged when args.cuda is set.
        state_dict = torch.load(init_model,
                                map_location=None if args.cuda else 'cpu')
        config = state_dict['config']
    else:
        if opt['encoder_type'] not in EncoderModelType._value2member_map_:
            raise ValueError("encoder_type is out of pre-defined types")
        literal_encoder_type = EncoderModelType(
            opt['encoder_type']).name.lower()
        config_class, _, _ = MODEL_CLASSES[literal_encoder_type]
        config = config_class.from_pretrained(init_model).to_dict()

    config['attention_probs_dropout_prob'] = args.bert_dropout_p
    config['hidden_dropout_prob'] = args.bert_dropout_p
    config['multi_gpu_on'] = opt["multi_gpu_on"]
    if args.num_hidden_layers != -1:
        config['num_hidden_layers'] = args.num_hidden_layers
    opt.update(config)

    model = MTDNNModel(opt,
                       state_dict=state_dict,
                       num_train_step=num_all_batches)
    if args.resume and args.model_ckpt:
        logger.info('loading model from {}'.format(args.model_ckpt))
        model.load(args.model_ckpt)

    #### model meta str
    headline = '############# Model Arch of MT-DNN #############'
    ### print network
    logger.info('\n{}\n{}\n'.format(headline, model.network))

    # dump config
    config_file = os.path.join(output_dir, 'config.json')
    with open(config_file, 'w', encoding='utf-8') as writer:
        writer.write('{}\n'.format(json.dumps(opt)))
        writer.write('\n{}\n{}\n'.format(headline, model.network))

    logger.info("Total number of params: {}".format(model.total_param))

    # tensorboard
    tensorboard = None
    if args.tensorboard:
        args.tensorboard_logdir = os.path.join(args.output_dir,
                                               args.tensorboard_logdir)
        tensorboard = SummaryWriter(log_dir=args.tensorboard_logdir)

    if args.encode_mode:
        for idx, dataset in enumerate(args.test_datasets):
            test_data = test_data_list[idx]
            if test_data is None:
                # no <dataset>_test.json was found, so there is nothing to encode
                logger.warning(
                    'No test data for {}; skipping encoding'.format(dataset))
                continue
            with torch.no_grad():
                encoding = extract_encoding(model,
                                            test_data,
                                            use_cuda=args.cuda)
            torch.save(
                encoding,
                os.path.join(output_dir, '{}_encoding.pt'.format(dataset)))
        # the early return must not leave the summary writer open
        if tensorboard is not None:
            tensorboard.close()
        return

    for epoch in range(0, args.epochs):
        logger.warning('At epoch {}'.format(epoch))
        start = datetime.now()

        for i, (batch_meta, batch_data) in enumerate(multi_task_train_data):
            batch_meta, batch_data = Collater.patch_data(
                args.cuda, batch_meta, batch_data)
            task_id = batch_meta['task_id']
            model.update(batch_meta, batch_data)
            if (model.local_updates) % (args.log_per_updates *
                                        args.grad_accumulation_step
                                        ) == 0 or model.local_updates == 1:
                # average time per batch so far * batches left in this epoch
                remaining_time = str(
                    (datetime.now() - start) / (i + 1) *
                    (len(multi_task_train_data) - i - 1)).split('.')[0]
                logger.info(
                    'Task [{0:2}] updates[{1:6}] train loss[{2:.5f}] remaining[{3}]'
                    .format(task_id, model.updates, model.train_loss.avg,
                            remaining_time))
                if args.tensorboard:
                    tensorboard.add_scalar('train/loss',
                                           model.train_loss.avg,
                                           global_step=model.updates)

            if args.save_per_updates_on and (
                (model.local_updates) %
                (args.save_per_updates * args.grad_accumulation_step) == 0):
                model_file = os.path.join(
                    output_dir, 'model_{}_{}.pt'.format(epoch, model.updates))
                logger.info('Saving mt-dnn model to {}'.format(model_file))
                model.save(model_file)

        for idx, dataset in enumerate(args.test_datasets):
            prefix = dataset.split('_')[0]
            task_def = task_defs.get_task_def(prefix)
            label_dict = task_def.label_vocab
            dev_data = dev_data_list[idx]
            if dev_data is not None:
                with torch.no_grad():
                    dev_metrics, dev_predictions, scores, golds, dev_ids = eval_model(
                        model,
                        dev_data,
                        metric_meta=task_def.metric_meta,
                        use_cuda=args.cuda,
                        label_mapper=label_dict,
                        task_type=task_def.task_type)
                for key, val in dev_metrics.items():
                    if isinstance(val, str):
                        # Textual metrics are only logged: they are not
                        # scalars, so they must not reach add_scalar.
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}:\n {3}'.format(
                                dataset, epoch, key, val))
                    else:
                        if args.tensorboard:
                            tensorboard.add_scalar('dev/{}/{}'.format(
                                dataset, key),
                                                   val,
                                                   global_step=epoch)
                        logger.warning(
                            'Task {0} -- epoch {1} -- Dev {2}: {3:.3f}'.format(
                                dataset, epoch, key, val))
                score_file = os.path.join(
                    output_dir, '{}_dev_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': dev_metrics,
                    'predictions': dev_predictions,
                    'uids': dev_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_dev_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)

            # test eval
            test_data = test_data_list[idx]
            if test_data is not None:
                with torch.no_grad():
                    test_metrics, test_predictions, scores, golds, test_ids = eval_model(
                        model,
                        test_data,
                        metric_meta=task_def.metric_meta,
                        use_cuda=args.cuda,
                        with_label=False,
                        label_mapper=label_dict,
                        task_type=task_def.task_type)
                score_file = os.path.join(
                    output_dir,
                    '{}_test_scores_{}.json'.format(dataset, epoch))
                results = {
                    'metrics': test_metrics,
                    'predictions': test_predictions,
                    'uids': test_ids,
                    'scores': scores
                }
                dump(score_file, results)
                if args.glue_format_on:
                    from experiments.glue.glue_utils import submit
                    official_score_file = os.path.join(
                        output_dir,
                        '{}_test_scores_{}.tsv'.format(dataset, epoch))
                    submit(official_score_file, results, label_dict)
                logger.info('[new test scores saved.]')

        model_file = os.path.join(output_dir, 'model_{}.pt'.format(epoch))
        model.save(model_file)

    if args.tensorboard:
        tensorboard.close()