def __init__(self, config, num_classes, output_attentions=False):
    super(BERTImage, self).__init__()
    self.output_attentions = output_attentions
    bert_config = BertConfig.from_dict(config)
    num_channels_in = config['num_channels_in']
    self.hidden_size = config['hidden_size']
    # Project the input feature channels up to the transformer hidden size.
    self.features_upscale = nn.Linear(num_channels_in, self.hidden_size)
    # use the BERT encoder as the backbone
    self.encoder = BertEncoder(bert_config, output_attentions=output_attentions)
    self.register_buffer('attention_mask', torch.tensor(1.0))
    self.classifier = nn.Linear(self.hidden_size, num_classes)
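
# A minimal usage sketch (an assumption, not from the source): the config dict
# handed to BERTImage must be readable by BertConfig.from_dict and must also
# carry the 'num_channels_in' and 'hidden_size' keys used above. All values
# here are illustrative only.
example_config = {
    'num_channels_in': 3,               # e.g. RGB pixel/feature channels
    'hidden_size': 128,
    'num_hidden_layers': 2,
    'num_attention_heads': 4,
    'intermediate_size': 512,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'attention_probs_dropout_prob': 0.1,
    'max_position_embeddings': 1024,
    'type_vocab_size': 1,
    'vocab_size': 1,                    # unused for images, but BertConfig expects it
    'initializer_range': 0.02,
}
# model = BERTImage(example_config, num_classes=10, output_attentions=False)
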
def __init__(self, opt):
    super(BertMapping, self).__init__()
    bert_config = BertConfig.from_json_file(opt.bert_config_file)
    self.bert = BertModel(bert_config)
    self.bert.load_state_dict(
        torch.load(opt.init_checkpoint, map_location='cpu'))
    freeze_layers(self.bert)
    self.txt_stru = opt.txt_stru
    if opt.txt_stru == 'pooling':
        self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
        self.mapping = nn.Linear(bert_config.hidden_size, opt.final_dims)
    elif opt.txt_stru == 'cnn':
        Ks = [1, 2, 3]
        in_channel = 1
        out_channel = 512
        embedding_dim = bert_config.hidden_size
        self.convs1 = nn.ModuleList([
            nn.Conv2d(in_channel, out_channel, (K, embedding_dim)) for K in Ks
        ])
        self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
        self.mapping = nn.Linear(len(Ks) * out_channel, opt.final_dims)
    elif opt.txt_stru == 'rnn':
        embedding_dim = bert_config.hidden_size
        self.bi_gru = opt.bi_gru
        self.rnn = nn.GRU(embedding_dim, opt.embed_size, opt.num_layers,
                          batch_first=True, bidirectional=opt.bi_gru)
        self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
        self.mapping = nn.Linear(opt.embed_size, opt.final_dims)
    elif opt.txt_stru == 'trans':
        # Note: this branch re-reads the config from the image-transformer file.
        bert_config = BertConfig.from_json_file(opt.img_trans_cfg)
        self.layer = bert.BERTLayer(bert_config)
        self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
        self.mapping = nn.Linear(768, opt.final_dims)
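
# Shape sketch for the 'cnn' branch above (a hedged illustration; tensor names
# are assumptions, not from the source). Each Conv2d with kernel
# (K, embedding_dim) collapses the embedding axis, so an input of shape
# (batch, 1, seq_len, hidden) yields (batch, 512, seq_len - K + 1, 1);
# max-pooling over time and concatenating the three branches gives
# (batch, 3 * 512), which matches nn.Linear(len(Ks) * out_channel, final_dims).
import torch
import torch.nn.functional as F

x = torch.randn(8, 1, 32, 768)                     # (batch, in_channel, seq_len, hidden)
convs = [torch.nn.Conv2d(1, 512, (K, 768)) for K in [1, 2, 3]]
feats = [F.relu(c(x)).squeeze(3) for c in convs]   # each: (8, 512, 32 - K + 1)
pooled = [F.max_pool1d(f, f.size(2)).squeeze(2) for f in feats]  # each: (8, 512)
out = torch.cat(pooled, dim=1)                     # (8, 1536) -> input to mapping
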
def main(args):
    """Run prediction for the given dialogue task and optionally export an inference model."""
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    task_name = args.task_name.lower()
    paradigm_inst = define_paradigm.Paradigm(task_name)
    pred_inst = define_predict_pack.DefinePredict()
    pred_func = getattr(pred_inst, pred_inst.task_map[task_name])

    processors = {
        'udc': reader.UDCProcessor,
        'swda': reader.SWDAProcessor,
        'mrda': reader.MRDAProcessor,
        'atis_slot': reader.ATISSlotProcessor,
        'atis_intent': reader.ATISIntentProcessor,
        'dstc2': reader.DSTC2Processor,
        'dstc2_asr': reader.DSTC2Processor,
    }
    # Whether batch_size is counted in tokens (True) or in examples (False).
    in_tokens = {
        'udc': True,
        'swda': True,
        'mrda': True,
        'atis_slot': False,
        'atis_intent': True,
        'dstc2': True,
        'dstc2_asr': True,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=in_tokens[task_name],
                                      task_name=task_name,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    predict_prog = fluid.Program()
    predict_startup = fluid.Program()
    with fluid.program_guard(predict_prog, predict_startup):
        with fluid.unique_name.guard():
            pred_results = create_model(args,
                                        pyreader_name='predict_reader',
                                        bert_config=bert_config,
                                        num_labels=num_labels,
                                        paradigm_inst=paradigm_inst,
                                        is_prediction=True)
            predict_pyreader = pred_results.get('pyreader', None)
            probs = pred_results.get('probs', None)
            feed_target_names = pred_results.get('feed_targets_name', None)

    predict_prog = predict_prog.clone(for_test=True)

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    exe = fluid.Executor(place)
    exe.run(predict_startup)

    if args.init_checkpoint:
        init_pretraining_params(exe, args.init_checkpoint, predict_prog)
    else:
        raise ValueError("args 'init_checkpoint' should be set for prediction!")

    predict_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                         main_program=predict_prog)

    test_data_generator = processor.data_generator(batch_size=args.batch_size,
                                                   phase='test',
                                                   epoch=1,
                                                   shuffle=False)
    predict_pyreader.decorate_tensor_provider(test_data_generator)
    predict_pyreader.start()

    all_results = []
    time_begin = time.time()
    while True:
        try:
            results = predict_exe.run(fetch_list=[probs.name])
            all_results.extend(results[0])
        except fluid.core.EOFException:
            predict_pyreader.reset()
            break
    time_end = time.time()

    np.set_printoptions(precision=4, suppress=True)
    print("-------------- prediction results --------------")
    print("example_id\t" + ' '.join(processor.get_labels()))
    if in_tokens[task_name]:
        for index, result in enumerate(all_results):
            tags = pred_func(result)
            print("%s\t%s" % (index, tags))
    else:
        tags = pred_func(all_results, args.max_seq_len)
        for index, tag in enumerate(tags):
            print("%s\t%s" % (index, tag))

    if args.save_inference_model_path:
        _, ckpt_dir = os.path.split(args.init_checkpoint)
        dir_name = ckpt_dir + '_inference_model'
        model_path = os.path.join(args.save_inference_model_path, dir_name)
        fluid.io.save_inference_model(model_path,
                                      feed_target_names, [probs],
                                      exe,
                                      main_program=predict_prog)
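
# A hedged sketch (not from the source) of how the inference model exported
# above could be reloaded with Paddle's standard 1.x API; 'model_path' is
# whatever directory was passed to fluid.io.save_inference_model.
import paddle.fluid as fluid

def load_saved_inference_model(model_path):
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    # Returns the pruned program plus the feed names and fetch targets that
    # save_inference_model recorded.
    infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(model_path, exe)
    return exe, infer_prog, feed_names, fetch_targets
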
def main(args):
    """Train and evaluate the model for the given dialogue task."""
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    paradigm_inst = define_paradigm.Paradigm(task_name)

    processors = {
        'udc': reader.UDCProcessor,
        'swda': reader.SWDAProcessor,
        'mrda': reader.MRDAProcessor,
        'atis_slot': reader.ATISSlotProcessor,
        'atis_intent': reader.ATISIntentProcessor,
        'dstc2': reader.DSTC2Processor,
    }
    # Whether batch_size is counted in tokens (True) or in examples (False).
    in_tokens = {
        'udc': True,
        'swda': True,
        'mrda': True,
        'atis_slot': False,
        'atis_intent': True,
        'dstc2': True,
    }
    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=in_tokens[task_name],
                                      task_name=task_name,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            shuffle=True)
        num_train_examples = processor.get_num_examples(phase='train')

        # With in_tokens, batch_size counts tokens, so the effective
        # examples-per-batch is batch_size // max_seq_len.
        if in_tokens[task_name]:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                results = create_model(args,
                                       pyreader_name='train_reader',
                                       bert_config=bert_config,
                                       num_labels=num_labels,
                                       paradigm_inst=paradigm_inst)
                train_pyreader = results.get("pyreader", None)
                loss = results.get("loss", None)
                probs = results.get("probs", None)
                accuracy = results.get("accuracy", None)
                num_seqs = results.get("num_seqs", None)
                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                if accuracy is not None:
                    skip_opt_set = [loss.name, probs.name, accuracy.name, num_seqs.name]
                else:
                    skip_opt_set = [loss.name, probs.name, num_seqs.name]
                fluid.memory_optimize(input_program=train_program,
                                      skip_opt_set=skip_opt_set)

        if args.verbose:
            if in_tokens[task_name]:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_results = create_model(args,
                                            pyreader_name='test_reader',
                                            bert_config=bert_config,
                                            num_labels=num_labels,
                                            paradigm_inst=paradigm_inst)
                test_pyreader = test_results.get("pyreader", None)
                loss = test_results.get("loss", None)
                probs = test_results.get("probs", None)
                accuracy = test_results.get("accuracy", None)
                num_seqs = test_results.get("num_seqs", None)
        test_prog = test_prog.clone(for_test=True)
        # Define the evaluation fetch list here so it also exists when
        # training is disabled (it is used again in the final evaluations).
        if accuracy is not None:
            fetch_test_list = [loss.name, accuracy.name, num_seqs.name]
        else:
            fetch_test_list = [loss.name, num_seqs.name]

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print("WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                  "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)
        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        ce_info = []
        while True:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        if accuracy is not None:
                            fetch_list = [loss.name, accuracy.name, num_seqs.name]
                        else:
                            fetch_list = [loss.name, num_seqs.name]
                    else:
                        if accuracy is not None:
                            fetch_list = [
                                loss.name, accuracy.name, scheduled_lr.name,
                                num_seqs.name
                            ]
                        else:
                            fetch_list = [loss.name, scheduled_lr.name, num_seqs.name]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        if accuracy is not None:
                            np_loss, np_acc, np_num_seqs = outputs
                        else:
                            np_loss, np_num_seqs = outputs
                    else:
                        if accuracy is not None:
                            np_loss, np_acc, np_lr, np_num_seqs = outputs
                        else:
                            np_loss, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)
                    if accuracy is not None:
                        total_acc.extend(np_acc * np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size()
                        verbose += "learning rate: %f" % (
                            np_lr[0] if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = processor.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time()))
                    if accuracy is not None:
                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                              "ave acc: %f, speed: %f steps/s" %
                              (current_time, current_epoch, current_example,
                               num_train_examples, steps,
                               np.sum(total_cost) / np.sum(total_num_seqs),
                               np.sum(total_acc) / np.sum(total_num_seqs),
                               args.skip_steps / used_time))
                        ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs),
                                        np.sum(total_acc) / np.sum(total_num_seqs),
                                        args.skip_steps / used_time])
                    else:
                        print("%s epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                              "speed: %f steps/s" %
                              (current_time, current_epoch, current_example,
                               num_train_examples, steps,
                               np.sum(total_cost) / np.sum(total_num_seqs),
                               args.skip_steps / used_time))
                        ce_info.append([np.sum(total_cost) / np.sum(total_num_seqs),
                                        args.skip_steps / used_time])
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(batch_size=args.batch_size,
                                                     phase='dev',
                                                     epoch=1,
                                                     shuffle=False))
                        evaluate(test_exe, test_prog, test_pyreader,
                                 fetch_test_list, "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(batch_size=args.batch_size,
                                                     phase='test',
                                                     epoch=1,
                                                     shuffle=False))
                        evaluate(test_exe, test_prog, test_pyreader,
                                 fetch_test_list, "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_train and args.enable_ce:
        card_num = get_cards()
        print("zytest_card_num", card_num)
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except Exception:
            print("ce info error")
        print("kpis\teach_step_duration_%s_card%s\t%s" %
              (task_name, card_num, ce_time))
        print("kpis\ttrain_loss_%s_card%s\t%f" % (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" % (task_name, card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     shuffle=False))
        print("Final validation result:")
        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "dev")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     shuffle=False))
        print("Final test result:")
        evaluate(test_exe, test_prog, test_pyreader, fetch_test_list, "test")
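
# A hedged sketch (assumed values, not from the source) of the max_train_steps
# arithmetic used above: with in_tokens=True, batch_size counts tokens, so the
# effective examples-per-batch is batch_size // max_seq_len.
def estimate_max_train_steps(epoch, num_train_examples, batch_size,
                             max_seq_len, dev_count, in_tokens):
    if in_tokens:
        return epoch * num_train_examples // (batch_size // max_seq_len) // dev_count
    return epoch * num_train_examples // batch_size // dev_count

# e.g. 2 epochs, 10000 examples, 4096 tokens per batch, max_seq_len 128, 1 GPU:
# 2 * 10000 // (4096 // 128) // 1 == 625 steps.
assert estimate_max_train_steps(2, 10000, 4096, 128, 1, True) == 625
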
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--iteration', type=str, default='1')
    parser.add_argument('--fs', type=str, default='local',
                        help='must be `local`. Do not change.')

    # Data paths
    parser.add_argument('--data_dir', default='data/', type=str)
    parser.add_argument("--train_file", default='train-v1.1.json', type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default='dev-v1.1.json', type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument('--gt_file', default='dev-v1.1.json', type=str,
                        help='ground truth file needed for evaluation.')

    # Metadata paths
    parser.add_argument('--metadata_dir', default='metadata/', type=str)
    parser.add_argument("--vocab_file", default='vocab.txt', type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--bert_model_option", default='large_uncased', type=str,
                        help="model architecture option. [large_uncased] or [base_uncased]")
    parser.add_argument("--bert_config_file", default='bert_config.json', type=str,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--init_checkpoint", default='pytorch_model.bin', type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    # Output and load paths
    parser.add_argument("--output_dir", default='out/', type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--index_file", default='index.hdf5', type=str,
                        help="index output file.")
    parser.add_argument("--question_emb_file", default='question.hdf5', type=str,
                        help="question output file.")
    parser.add_argument('--load_dir', default='out/', type=str)

    # Local paths (if we want to run cmd)
    parser.add_argument('--eval_script', default='evaluate-v1.1.py', type=str)

    # Do's
    parser.add_argument("--do_load", default=False, action='store_true',
                        help='Do load. If eval, do load automatically')
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_train_filter", default=False, action='store_true',
                        help='Train filter or not.')
    parser.add_argument("--do_train_sparse", default=False, action='store_true',
                        help='Train sparse or not.')
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument('--do_eval', default=False, action='store_true')
    parser.add_argument('--do_embed_question', default=False, action='store_true')
    parser.add_argument('--do_index', default=False, action='store_true')
    parser.add_argument('--do_serve', default=False, action='store_true')

    # Model options: if you change these, you need to train again
    parser.add_argument("--do_case", default=False, action='store_true',
                        help="Keep the original case of the input text. Set this for cased "
                             "models; leave unset for uncased models.")
    parser.add_argument('--phrase_size', default=961, type=int)
    parser.add_argument('--metric', default='ip', type=str, help='ip | l2')
    parser.add_argument("--use_sparse", default=False, action='store_true')

    # GPU and memory related options
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to "
                             "take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer "
                             "than this will be truncated to this length.")
    parser.add_argument("--train_batch_size", default=12, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=16, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a "
                             "backward/update pass.")
    parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")

    # Training options: only effective during training
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_train_filter_epochs", default=1.0, type=float,
                        help="Total number of training epochs for filter to perform.")
    parser.add_argument("--num_train_sparse_epochs", default=3.0, type=float,
                        help="Total number of training epochs for sparse to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")

    # Prediction options: only effective during prediction
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the "
                             "nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is "
                             "needed because the start and end predictions are not conditioned "
                             "on one another.")

    # Index Options
    parser.add_argument('--dtype', default='float32', type=str)
    parser.add_argument('--filter_threshold', default=-1e9, type=float)
    parser.add_argument('--compression_offset', default=-2, type=float)
    parser.add_argument('--compression_scale', default=20, type=float)
    parser.add_argument('--split_by_para', default=False, action='store_true')

    # Serve Options
    parser.add_argument('--port', default=9009, type=int)

    # Others
    parser.add_argument('--parallel', default=False, action='store_true')
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be "
                             "printed. A number of warnings are expected for a normal SQuAD "
                             "evaluation.")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--draft', default=False, action='store_true')
    parser.add_argument('--draft_num_examples', type=int, default=12)

    args = parser.parse_args()

    # Filesystem routines
    if args.fs == 'local':
        class Processor(object):
            def __init__(self, path):
                self._save = None
                self._load = None
                self._path = path

            def bind(self, save, load):
                self._save = save
                self._load = load

            def save(self, checkpoint=None, save_fn=None, **kwargs):
                path = os.path.join(self._path, str(checkpoint))
                if save_fn is None:
                    self._save(path, **kwargs)
                else:
                    save_fn(path, **kwargs)

            def load(self, checkpoint, load_fn=None, session=None, **kwargs):
                assert self._path == session
                path = os.path.join(self._path, str(checkpoint), 'model.pt')
                if load_fn is None:
                    self._load(path, **kwargs)
                else:
                    load_fn(path, **kwargs)

        processor = Processor(args.load_dir)
    else:
        raise ValueError(args.fs)

    if not args.do_train:
        args.do_load = True

    # Configure paths
    args.train_file = os.path.join(args.data_dir, args.train_file)
    args.predict_file = os.path.join(args.data_dir, args.predict_file)
    args.gt_file = os.path.join(args.data_dir, args.gt_file)

    args.bert_config_file = os.path.join(
        args.metadata_dir,
        args.bert_config_file.replace(".json", "") + "_" + args.bert_model_option + ".json")
    args.init_checkpoint = os.path.join(
        args.metadata_dir,
        args.init_checkpoint.replace(".bin", "") + "_" + args.bert_model_option + ".bin")
    args.vocab_file = os.path.join(args.metadata_dir, args.vocab_file)
    args.index_file = os.path.join(args.output_dir, args.index_file)

    # Multi-GPU stuff
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r",
                device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    # Seed for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        # raise ValueError("Output directory () already exists and is not empty.")
        pass
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=not args.do_case)

    model = BertPhraseModel(bert_config,
                            phrase_size=args.phrase_size,
                            metric=args.metric,
                            use_sparse=args.use_sparse)
    print('Number of model parameters:', sum(p.numel() for p in model.parameters()))

    if not args.do_load and args.init_checkpoint is not None:
        state_dict = torch.load(args.init_checkpoint, map_location='cpu')
        # If below: for Korean BERT compatibility
        if next(iter(state_dict)).startswith('bert.'):
            state_dict = {key[len('bert.'):]: val for key, val in state_dict.items()}
            state_dict = {key: val for key, val in state_dict.items()
                          if key in model.encoder.bert.state_dict()}
        model.encoder.bert.load_state_dict(state_dict)

    if args.fp16:
        model.half()
    if not args.optimize_on_cpu:
        model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif args.parallel or n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_load:
        bind_model(processor, model)
        processor.load(args.iteration, session=args.load_dir)

    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True,
                                             draft=args.draft,
                                             draft_num_examples=args.draft_num_examples)
        num_train_steps = int(len(train_examples) / args.train_batch_size /
                              args.gradient_accumulation_steps * args.num_train_epochs)

        # Exempt biases and LayerNorm parameters from weight decay; this must
        # be a substring match, since parameter names only contain these
        # suffixes (e.g. 'encoder.layer.0.attention.output.dense.bias').
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)
        bind_model(processor, model, optimizer)

        global_step = 0
        train_features, train_features_ = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        train_features = inject_noise_to_features_list(train_features,
                                                       clamp=True,
                                                       replace=True,
                                                       shuffle=True)
        logger.info("***** Running training *****")
        logger.info(" Num orig examples = %d", len(train_examples))
        logger.info(" Num split examples = %d", len(train_features))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long)
        all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long)

        if args.fp16:
            (all_input_ids, all_input_mask, all_start_positions, all_end_positions) = tuple(
                t.half() for t in (all_input_ids, all_input_mask,
                                   all_start_positions, all_end_positions))
            all_input_ids_, all_input_mask_ = tuple(
                t.half() for t in (all_input_ids_, all_input_mask_))

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_input_ids_, all_input_mask_,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch in range(int(args.num_train_epochs)):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))):
                batch = tuple(t.to(device) for t in batch)
                (input_ids, input_mask, input_ids_, input_mask_,
                 start_positions, end_positions) = batch
                loss, _ = model(input_ids, input_mask, input_ids_, input_mask_,
                                start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1
            processor.save(epoch + 1)

    if args.do_train_filter:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True,
                                             draft=args.draft,
                                             draft_num_examples=args.draft_num_examples)
        num_train_steps = int(len(train_examples) / args.train_batch_size /
                              args.gradient_accumulation_steps * args.num_train_filter_epochs)

        if args.parallel or n_gpu > 1:
            optimizer = Adam(model.module.filter.parameters())
        else:
            optimizer = Adam(model.filter.parameters())
        bind_model(processor, model, optimizer)

        global_step = 0
        train_features, train_features_ = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running filter training *****")
        logger.info(" Num orig examples = %d", len(train_examples))
        logger.info(" Num split examples = %d", len(train_features))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long)
        all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long)

        if args.fp16:
            (all_input_ids, all_input_mask, all_start_positions, all_end_positions) = tuple(
                t.half() for t in (all_input_ids, all_input_mask,
                                   all_start_positions, all_end_positions))
            all_input_ids_, all_input_mask_ = tuple(
                t.half() for t in (all_input_ids_, all_input_mask_))

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_input_ids_, all_input_mask_,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch in range(int(args.num_train_filter_epochs)):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))):
                batch = tuple(t.to(device) for t in batch)
                (input_ids, input_mask, input_ids_, input_mask_,
                 start_positions, end_positions) = batch
                # The filter loss is the second output of the model.
                _, loss = model(input_ids, input_mask, input_ids_, input_mask_,
                                start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1
            processor.save(epoch + 1)

    if args.do_train_sparse:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True,
                                             draft=args.draft,
                                             draft_num_examples=args.draft_num_examples)
        num_train_steps = int(len(train_examples) / args.train_batch_size /
                              args.gradient_accumulation_steps * args.num_train_sparse_epochs)

        '''
        if args.parallel or n_gpu > 1:
            optimizer = Adam(model.module.sparse_layer.parameters())
        else:
            optimizer = Adam(model.sparse_layer.parameters())
        '''
        # As above: substring match for the no-decay groups, and skip the
        # filter parameters entirely.
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay) and 'filter' not in n],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay) and 'filter' not in n],
             'weight_decay_rate': 0.0}
        ]
        optimizer = BERTAdam(optimizer_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_steps)
        bind_model(processor, model, optimizer)

        global_step = 0
        train_features, train_features_ = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running sparse training *****")
        logger.info(" Num orig examples = %d", len(train_examples))
        logger.info(" Num split examples = %d", len(train_features))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_input_ids_ = torch.tensor([f.input_ids for f in train_features_], dtype=torch.long)
        all_input_mask_ = torch.tensor([f.input_mask for f in train_features_], dtype=torch.long)

        if args.fp16:
            (all_input_ids, all_input_mask, all_start_positions, all_end_positions) = tuple(
                t.half() for t in (all_input_ids, all_input_mask,
                                   all_start_positions, all_end_positions))
            all_input_ids_, all_input_mask_ = tuple(
                t.half() for t in (all_input_ids_, all_input_mask_))

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_input_ids_, all_input_mask_,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch in range(int(args.num_train_sparse_epochs)):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Epoch %d" % (epoch + 1))):
                batch = tuple(t.to(device) for t in batch)
                (input_ids, input_mask, input_ids_, input_mask_,
                 start_positions, end_positions) = batch
                loss, _ = model(input_ids, input_mask, input_ids_, input_mask_,
                                start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.optimize_on_cpu:
                        model.to('cpu')
                    optimizer.step()  # We have accumulated enough gradients
                    model.zero_grad()
                    if args.optimize_on_cpu:
                        model.to(device)
                    global_step += 1
            processor.save(epoch + 1)

    if args.do_predict:
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False,
                                            draft=args.draft,
                                            draft_num_examples=args.draft_num_examples)
        eval_features, query_eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info(" Num orig examples = %d", len(eval_examples))
        logger.info(" Num split examples = %d", len(eval_features))
        logger.info(" Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_input_ids_ = torch.tensor([f.input_ids for f in query_eval_features], dtype=torch.long)
        all_input_mask_ = torch.tensor([f.input_mask for f in query_eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        if args.fp16:
            (all_input_ids, all_input_mask, all_example_index) = tuple(
                t.half() for t in (all_input_ids, all_input_mask, all_example_index))
            all_input_ids_, all_input_mask_ = tuple(
                t.half() for t in (all_input_ids_, all_input_mask_))

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_input_ids_, all_input_mask_, all_example_index)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        logger.info("Start evaluating")

        def get_results():
            for (input_ids, input_mask, input_ids_, input_mask_,
                 example_indices) in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                input_ids_ = input_ids_.to(device)
                input_mask_ = input_mask_.to(device)
                with torch.no_grad():
                    batch_all_logits, bs, be = model(input_ids, input_mask,
                                                     input_ids_, input_mask_)
                for i, example_index in enumerate(example_indices):
                    all_logits = batch_all_logits[i].detach().cpu().numpy()
                    filter_start_logits = bs[i].detach().cpu().numpy()
                    filter_end_logits = be[i].detach().cpu().numpy()
                    eval_feature = eval_features[example_index.item()]
                    unique_id = int(eval_feature.unique_id)
                    yield RawResult(unique_id=unique_id,
                                    all_logits=all_logits,
                                    filter_start_logits=filter_start_logits,
                                    filter_end_logits=filter_end_logits)

        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        write_predictions(eval_examples, eval_features, get_results(),
                          args.max_answer_length, not args.do_case,
                          output_prediction_file, args.verbose_logging,
                          args.filter_threshold)

    if args.do_eval:
        # Recomputed here so --do_eval also works without --do_predict in the
        # same run.
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        command = "python %s %s %s" % (args.eval_script, args.gt_file, output_prediction_file)
        import subprocess
        process = subprocess.Popen(command.split(), stdout=subprocess.PIPE)
        output, error = process.communicate()

    if args.do_embed_question:
        question_examples = read_squad_examples(question_only=True,
                                                input_file=args.predict_file,
                                                is_training=False,
                                                draft=args.draft,
                                                draft_num_examples=args.draft_num_examples)
        query_eval_features = convert_questions_to_features(
            examples=question_examples,
            tokenizer=tokenizer,
            max_query_length=args.max_query_length)
        question_dataloader = convert_question_features_to_dataloader(
            query_eval_features, args.fp16, args.local_rank, args.predict_batch_size)

        model.eval()
        logger.info("Start embedding")
        question_results = get_question_results_(question_examples,
                                                 query_eval_features,
                                                 question_dataloader,
                                                 device, model)
        path = os.path.join(args.output_dir, args.question_emb_file)
        print('Writing %s' % path)
        write_question_results(question_results, query_eval_features, path)

    if args.do_index:
        if ':' not in args.predict_file:
            predict_files = [args.predict_file]
            offsets = [0]
        else:
            dirname = os.path.dirname(args.predict_file)
            basename = os.path.basename(args.predict_file)
            start, end = list(map(int, basename.split(':')))

            # skip files if possible
            if os.path.exists(args.index_file):
                with h5py.File(args.index_file, 'r') as f:
                    dids = list(map(int, f.keys()))
                start = int(max(dids) / 1000)
                print('%s exists; starting from %d' % (args.index_file, start))

            names = [str(i).zfill(4) for i in range(start, end)]
            predict_files = [os.path.join(dirname, name) for name in names]
            offsets = [int(each) * 1000 for each in names]

        for offset, predict_file in zip(offsets, predict_files):
            try:
                context_examples = read_squad_examples(
                    context_only=True,
                    input_file=predict_file,
                    is_training=False,
                    draft=args.draft,
                    draft_num_examples=args.draft_num_examples)
                for example in context_examples:
                    example.doc_idx += offset

                context_features = convert_documents_to_features(
                    examples=context_examples,
                    tokenizer=tokenizer,
                    max_seq_length=args.max_seq_length,
                    doc_stride=args.doc_stride)

                logger.info("***** Running indexing on %s *****" % predict_file)
                logger.info(" Num orig examples = %d", len(context_examples))
                logger.info(" Num split examples = %d", len(context_features))
                logger.info(" Batch size = %d", args.predict_batch_size)

                all_input_ids = torch.tensor([f.input_ids for f in context_features], dtype=torch.long)
                all_input_mask = torch.tensor([f.input_mask for f in context_features], dtype=torch.long)
                all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
                if args.fp16:
                    all_input_ids, all_input_mask, all_example_index = tuple(
                        t.half() for t in (all_input_ids, all_input_mask, all_example_index))

                context_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
                if args.local_rank == -1:
                    context_sampler = SequentialSampler(context_data)
                else:
                    context_sampler = DistributedSampler(context_data)
                context_dataloader = DataLoader(context_data,
                                                sampler=context_sampler,
                                                batch_size=args.predict_batch_size)

                model.eval()
                logger.info("Start indexing")

                def get_context_results():
                    for (input_ids, input_mask, example_indices) in context_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        with torch.no_grad():
                            (batch_start, batch_end, batch_span_logits,
                             bs, be, batch_sparse) = model(input_ids, input_mask)
                        for i, example_index in enumerate(example_indices):
                            start = batch_start[i].detach().cpu().numpy().astype(args.dtype)
                            end = batch_end[i].detach().cpu().numpy().astype(args.dtype)
                            sparse = None
                            if batch_sparse is not None:
                                sparse = batch_sparse[i].detach().cpu().numpy().astype(args.dtype)
                            span_logits = batch_span_logits[i].detach().cpu().numpy().astype(args.dtype)
                            filter_start_logits = bs[i].detach().cpu().numpy().astype(args.dtype)
                            filter_end_logits = be[i].detach().cpu().numpy().astype(args.dtype)
                            context_feature = context_features[example_index.item()]
                            unique_id = int(context_feature.unique_id)
                            yield ContextResult(unique_id=unique_id,
                                                start=start,
                                                end=end,
                                                span_logits=span_logits,
                                                filter_start_logits=filter_start_logits,
                                                filter_end_logits=filter_end_logits,
                                                sparse=sparse)

                t0 = time()
                write_hdf5(context_examples, context_features, get_context_results(),
                           args.max_answer_length, not args.do_case, args.index_file,
                           args.filter_threshold, args.verbose_logging,
                           offset=args.compression_offset,
                           scale=args.compression_scale,
                           split_by_para=args.split_by_para,
                           use_sparse=args.use_sparse)
                print('%s: %.1f mins' % (predict_file, (time() - t0) / 60))
            except Exception as e:
                with open(os.path.join(args.output_dir, 'error_files.txt'), 'a') as fp:
                    fp.write('error file: %s\n' % predict_file)
                    fp.write('error message: %s\n' % str(e))

    if args.do_serve:
        def get(text):
            question_examples = [SquadExample(qas_id='serve', question_text=text)]
            query_eval_features = convert_questions_to_features(
                examples=question_examples,
                tokenizer=tokenizer,
                max_query_length=16)
            question_dataloader = convert_question_features_to_dataloader(
                query_eval_features, args.fp16, args.local_rank, args.predict_batch_size)

            model.eval()
            question_results = get_question_results_(question_examples,
                                                     query_eval_features,
                                                     question_dataloader,
                                                     device, model)
            question_result = next(iter(question_results))
            out = (question_result.start.tolist(),
                   question_result.end.tolist(),
                   question_result.span_logit.tolist())
            return out

        serve(get, args.port)
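
# A minimal, self-contained sketch (illustrative, not from the source) of the
# gradient-accumulation pattern used by all three training loops above: scale
# each micro-batch loss by 1/K, accumulate gradients for K micro-batches, then
# apply a single optimizer update.
import torch

def accumulate_and_step(model, optimizer, batches, accumulation_steps):
    model.train()
    for step, (x, y) in enumerate(batches):
        loss = torch.nn.functional.mse_loss(model(x), y)
        loss = loss / accumulation_steps   # keep the effective loss scale constant
        loss.backward()                    # gradients add up across micro-batches
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()               # one update per accumulation window
            model.zero_grad()
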
def run():
    bert_config = BertConfig.from_json_file(HP.bert_config)

    inputs = tf.placeholder(dtype=tf.int32, shape=[None, None])
    segments = tf.placeholder(dtype=tf.int32, shape=[None, None])
    inputs_length = tf.placeholder(dtype=tf.int32, shape=[None])
    answers = tf.placeholder(dtype=tf.int32, shape=[None, None, 2])
    answers_length = tf.placeholder(dtype=tf.int32, shape=[None])

    mod = model.Model(bert_config, HP.is_training, HP.num_units, inputs,
                      segments, inputs_length, answers, answers_length,
                      layers=HP.layers)

    train_data = DataGenerator(HP.train_file, HP.max_seq_length, HP.batch_size)
    dev_data = DataGenerator(HP.dev_file, HP.max_seq_length, HP.batch_size)

    long_train_loss = {i: Average() for i in HP.train_layers}
    long_train_accuracy = {i: Average() for i in HP.train_layers}
    train_loss = {i: Average() for i in HP.train_layers}
    train_accuracy = {i: Average() for i in HP.train_layers}
    dev_loss = {i: Average() for i in HP.train_layers}
    dev_accuracy = {i: Average() for i in HP.train_layers}
    epoch_dev_loss = {i: Average() for i in HP.train_layers}
    epoch_dev_accuracy = {i: Average() for i in HP.train_layers}

    sess = tf.Session()

    var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='bert')
    var = {v.name: v for v in var}
    start_saver = tf.train.Saver(var)

    # Initialize all variables first, then restore the pretrained BERT
    # weights; restoring before running the initializer would let the
    # initializer overwrite the restored values.
    init = tf.global_variables_initializer()
    sess.run(init)
    start_saver.restore(sess, HP.start1_checkpoint)

    logger = Logger(HP.log_files)
    loss = mod.losses(HP.train_layers)
    accuracy = mod.accuracy(HP.train_layers)
    train = mod.train(HP.learning_rate, HP.bert_train, HP.train_layers)

    var = mod.weights
    var = {v.name: v for v in var}
    end_saver = tf.train.Saver(var)
    saver = tf.train.Saver()

    step = 0
    for epoch in range(1, HP.epochs + 1):
        while not train_data.has_ended():
            step = step + 1
            train_inputs, train_segments, train_inputs_length, train_answers, train_answers_length = \
                train_data.get_next()
            feed_dict = {
                inputs: train_inputs,
                segments: train_segments,
                inputs_length: train_inputs_length,
                answers: train_answers,
                answers_length: train_answers_length
            }
            _loss, _accuracy, _ = sess.run([loss, accuracy, train], feed_dict=feed_dict)
            _batch_size = len(train_inputs_length)
            for i in HP.train_layers:
                train_loss[i].add(_loss[i], _batch_size)
                train_accuracy[i].add(_accuracy[i], _batch_size)
                long_train_loss[i].add(_loss[i], _batch_size)
                long_train_accuracy[i].add(_accuracy[i], _batch_size)

            # Evaluate one dev batch per training step, cycling through the
            # dev set.
            if dev_data.has_ended():
                dev_data.reset(False)
            dev_inputs, dev_segments, dev_inputs_length, dev_answers, dev_answers_length = \
                dev_data.get_next()
            feed_dict = {
                inputs: dev_inputs,
                segments: dev_segments,
                inputs_length: dev_inputs_length,
                answers: dev_answers,
                answers_length: dev_answers_length
            }
            _loss, _accuracy = sess.run([loss, accuracy], feed_dict=feed_dict)
            _batch_size = len(dev_inputs_length)
            for i in HP.train_layers:
                dev_loss[i].add(_loss[i], _batch_size)
                dev_accuracy[i].add(_accuracy[i], _batch_size)

            if step % 500 == 0:
                for i in HP.train_layers:
                    logger.log("train " + str(i), train_loss[i], train_accuracy[i], step)
                    logger.log("dev " + str(i), dev_loss[i], dev_accuracy[i], step)
                    train_loss[i].reset()
                    train_accuracy[i].reset()
                    dev_loss[i].reset()
                    dev_accuracy[i].reset()
            if step % 1000 == 0:
                saver.save(sess, HP.save1_checkpoint)
                logger.log_text("saving checkpoint")

        for i in HP.train_layers:
            logger.log("epoch train " + str(i), long_train_loss[i],
                       long_train_accuracy[i], step)
            long_train_loss[i].reset()
            long_train_accuracy[i].reset()
        train_data.reset(True)

        # Full-pass validation at the end of each epoch.
        dev_data.reset(True)
        while not dev_data.has_ended():
            dev_inputs, dev_segments, dev_inputs_length, dev_answers, dev_answers_length = \
                dev_data.get_next()
            feed_dict = {
                inputs: dev_inputs,
                segments: dev_segments,
                inputs_length: dev_inputs_length,
                answers: dev_answers,
                answers_length: dev_answers_length
            }
            _loss, _accuracy = sess.run([loss, accuracy], feed_dict=feed_dict)
            _batch_size = len(dev_inputs_length)
            for i in HP.train_layers:
                epoch_dev_loss[i].add(_loss[i], _batch_size)
                epoch_dev_accuracy[i].add(_accuracy[i], _batch_size)
        for i in HP.train_layers:
            logger.log("epoch validation " + str(i), epoch_dev_loss[i],
                       epoch_dev_accuracy[i], epoch)
        dev_data.reset(True)

    end_saver.save(sess, HP.end1_checkpoint)
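
# The run() above relies on an Average helper that is not shown. This is a
# hedged sketch of the assumed interface (add(value, count), reset(), and some
# way for Logger to read the mean); the actual implementation may differ.
class Average(object):
    """Running weighted mean of per-batch values."""

    def __init__(self):
        self.reset()

    def add(self, value, count):
        # 'value' is a per-batch mean; weight it by the batch size.
        self.total += value * count
        self.count += count

    def mean(self):
        return self.total / self.count if self.count else 0.0

    def reset(self):
        self.total = 0.0
        self.count = 0
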
def run():
    bert_config = BertConfig.from_json_file(HP.bert_config)

    inputs = tf.placeholder(dtype=tf.int32, shape=[1, None])
    segments = tf.placeholder(dtype=tf.int32, shape=[1, None])
    inputs_length = tf.placeholder(dtype=tf.int32, shape=[1])
    answers = tf.placeholder(dtype=tf.int32, shape=[1, None, 2])
    answers_length = tf.placeholder(dtype=tf.int32, shape=[1])

    mod = model.Model(bert_config, HP.is_training, HP.num_units, inputs,
                      segments, inputs_length, answers, answers_length,
                      layers=HP.layers)
    loss = mod.losses(HP.train_layers)
    loss = [loss[i] for i in HP.layers]
    accuracy = mod.accuracy(HP.train_layers)
    accuracy = list(accuracy[i] for i in HP.layers)

    # One expert per layer; one learner per eta in the grid.
    num_experts = len(HP.layers)
    eta = np.arange(HP.eta[0], HP.eta[1], HP.eta[2], dtype=np.float32)
    num_eta = len(eta)

    log_probs = tf.get_variable(name='log_prob', shape=[num_eta, num_experts],
                                dtype=tf.float32, trainable=False)
    probs = tf.nn.softmax(log_probs, 1)
    init = tf.assign(log_probs, tf.zeros_like(log_probs))

    choice = tf.random.multinomial(log_probs, 1, output_dtype=tf.int32)
    choice = tf.squeeze(choice, -1)

    loss = tf.stack(loss)
    accuracy = tf.stack(accuracy)
    my_loss = tf.gather(loss, choice)
    my_accuracy = tf.gather(accuracy, choice)

    # Exponential-weights update: move probability mass toward experts whose
    # instantaneous regret (learner's mean loss minus the expert's loss,
    # with loss = 1 - accuracy) is positive, with a second-order correction.
    mean_loss = tf.reduce_sum((1 - accuracy) * probs, -1)
    regret = tf.expand_dims(mean_loss, 1) - (1 - accuracy)
    log_update = tf.expand_dims(eta, 1) * regret
    log_update = log_update - log_update**2
    with tf.control_dependencies([my_loss, my_accuracy]):
        update = tf.assign(log_probs, log_probs + log_update)

    dev_data = DataGenerator(HP.dev_file, HP.max_seq_length, 1)

    weights = {v.name: v for v in mod.weights}
    starter = tf.train.Saver(weights)
    saver = tf.train.Saver({log_probs.name: log_probs})

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)
    starter.restore(sess, HP.start2_checkpoint)
    sess.run(init)

    keys = ["loss", "accuracy", "my_loss", "my_accuracy", "probs"]
    step = 0
    saved_items = [[] for _ in keys]
    with open(HP.log_files, 'w') as f:
        f.write("***START***\n\n\n")
    print("***START***\n\n\n")

    while not dev_data.has_ended():
        step = step + 1
        dev_inputs, dev_segments, dev_inputs_length, dev_answers, dev_answers_length = \
            dev_data.get_next()
        feed_dict = {inputs: dev_inputs,
                     segments: dev_segments,
                     inputs_length: dev_inputs_length,
                     answers: dev_answers,
                     answers_length: dev_answers_length}
        items = sess.run([loss, accuracy, my_loss, my_accuracy, probs, update],
                         feed_dict=feed_dict)
        items = items[:-1]  # drop the update op's output
        for saved_item, item in zip(saved_items, items):
            saved_item.append(item.tolist())
        if step % 1000 == 0:
            text = "step: %d\nsaving checkpoint\n\n" % step
            print(text)
            with open(HP.log_files, 'a') as f:
                f.write(text)
            dictionary = dict(zip(keys, saved_items))
            with open(HP.weights_file, 'w') as f:
                json.dump(dictionary, f)
            saver.save(sess, HP.save2_checkpoint)

    dictionary = dict(zip(keys, saved_items))
    with open(HP.weights_file, 'w') as f:
        json.dump(dictionary, f)
    print("\n\n\nEND")
    # Append, so the final marker does not truncate the log written above.
    with open(HP.log_files, 'a') as f:
        f.write("***END***\n\n\n")
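
# A hedged NumPy sketch (names assumed, not from the source) of the update
# rule implemented with TF ops above: exponential weights over experts with a
# second-order correction, run in parallel for a grid of learning rates eta.
import numpy as np

def hedge_update(log_probs, expert_losses, etas):
    """log_probs: (num_eta, num_experts); expert_losses: (num_experts,)."""
    probs = np.exp(log_probs - log_probs.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)          # row-wise softmax
    mean_loss = (probs * expert_losses).sum(axis=1)    # learner's expected loss
    regret = mean_loss[:, None] - expert_losses[None, :]
    step = etas[:, None] * regret
    return log_probs + step - step**2                  # second-order correction
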
def load_model(cls, model_path, bert_config, init_spec, *inputs, **kwargs):
    """
    Instantiate an NNModule from a pre-trained model file.

    Params:
        model_path: path to a PyTorch state-dict dump of the model
            (e.g. `pytorch_model.bin`). If None, the freshly initialized
            model is returned.
        bert_config: path to the json configuration file for the model
            (e.g. `bert_config.json`), which specifies the architecture.
        init_spec: optional per-variable spec controlling which variables
            use the pretrained weights and how their names are remapped.
        *inputs, **kwargs: additional inputs for the specific Bert class
            (ex: num_labels for BertForSequenceClassification)
    """
    # Load config
    config = BertConfig.from_json_file(bert_config)
    logger.info("Model config {}".format(config))

    # Instantiate model.
    model = cls(config, *inputs, **kwargs)
    if model_path is None:
        return model

    logger.info("loading pretrained local model file {}".format(model_path))
    state_dict = torch.load(model_path, map_location='cpu')

    missing_keys = []
    unexpected_keys = []
    error_msgs = []

    # copy state_dict so _load_from_state_dict can modify it
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()

    # Rename old-style LayerNorm parameters (gamma/beta) to weight/bias.
    for k in list(state_dict.keys()):
        if 'LayerNorm.gamma' in k:
            nk = k.replace('LayerNorm.gamma', 'LayerNorm.weight')
            state_dict[nk] = state_dict[k]
            del state_dict[k]
        if 'LayerNorm.beta' in k:
            nk = k.replace('LayerNorm.beta', 'LayerNorm.bias')
            state_dict[nk] = state_dict[k]
            del state_dict[k]

    ignore_init = []
    if init_spec:
        remap_dict = type(state_dict)()
        for var in init_spec:
            mapping = init_spec[var].mapping
            name = init_spec[var].name
            if name.startswith('module.'):
                name = name[len('module.'):]
            if not init_spec[var].use_pretrain:
                ignore_init += [name]
            elif mapping:
                if mapping.startswith('module.'):
                    mapping = mapping[len('module.'):]
                if mapping in state_dict:
                    remap_dict[name] = state_dict[mapping]
        logger.info('Variables not using pretraining: {}'.format(ignore_init))
        for ig in ignore_init:
            if ig in state_dict:
                del state_dict[ig]
        for key in state_dict:
            if key not in remap_dict:
                remap_dict[key] = state_dict[key]
        state_dict = remap_dict

    if metadata is not None:
        state_dict._metadata = metadata

    def load(module, prefix=''):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     missing_keys, unexpected_keys, error_msgs)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    load(model)
    if len(missing_keys) > 0:
        logger.warning("Weights of {} not initialized from pretrained model: {}".format(
            model.__class__.__name__, '\n   '.join(missing_keys)))
    if len(unexpected_keys) > 0:
        logger.warning("Weights from pretrained model not used in {}: {}".format(
            model.__class__.__name__, '\n   '.join(unexpected_keys)))
    return model
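
# A hedged usage sketch: 'MyBertClassifier' is a hypothetical NNModule
# subclass exposing load_model as a classmethod, and the paths are
# placeholders; none of these names come from the source.
def load_classifier_example():
    return MyBertClassifier.load_model(
        model_path='pytorch_model.bin',
        bert_config='bert_config.json',
        init_spec=None,   # falsy: keep checkpoint names and load all matches
        num_labels=2)
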
def __init__(self, opt):
    super(TransformerMapping, self).__init__()
    self.opt = opt
    bert_config = BertConfig.from_json_file(opt.trans_cfg)
    self.layer = bert.BERTLayer(bert_config)
    # Project image features down to the joint embedding size.
    self.mapping = nn.Linear(opt.img_dim, opt.final_dims)
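
# A hedged sketch (not from the source) of how a forward pass for
# TransformerMapping might look, assuming BERTLayer follows the usual
# (hidden_states, attention_mask) interface and that opt.img_dim matches the
# transformer hidden size configured in opt.trans_cfg.
import torch

def transformer_mapping_forward(layer, mapping, features):
    """Push region features through one BERT layer, then map them."""
    # features: (batch, n_regions, img_dim); an all-zero extended mask means
    # every region may attend to every other region.
    attn_mask = torch.zeros(features.size(0), 1, 1, features.size(1),
                            device=features.device)
    hidden = layer(features, attn_mask)  # assumed BERTLayer signature
    return mapping(hidden)               # (batch, n_regions, final_dims)
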