def __init__(self):
    """Create the decoder, its loss, optimizer and LR scheduler, and reset training state."""
    vocab = data_utils.vocab
    device_id = configs.decoder_device_id

    self.decoder = SelfAttendedDecoder(vocab=vocab, device_id=device_id)

    # Padding positions are excluded from the cross-entropy.
    self.xe_loss = nn.CrossEntropyLoss(ignore_index=vocab.padding_id)

    self.optimizer = DenseSparseAdam(
        self.decoder.parameters(),
        lr=configs.adam_lr,
    )

    # 'min' mode: the scheduler is stepped with a metric that should decrease.
    self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(
        self.optimizer,
        'min',
        patience=configs.lr_scheduler_patience,
        factor=configs.lr_scheduler_factor,
        verbose=True,
    )

    # Bookkeeping for resuming and for best-checkpoint selection.
    self.epoch_idx = 0
    self.min_ppl = 1000.
    self.ckpt_path = 'decoder.pretrained.params'
def main():
    """Train the multi-task tagging model, or load a saved one and run inference.

    Behaviour is driven by module-level state rather than parameters: ``args``
    (``optim``, ``decay``, ``momentum``, ``mode``), ``pkl_path``, ``num_task``,
    ``BATCH_SIZE``, ``NUM_EPOCH``, ``model_para``, ``model_path`` and ``path``.

    * ``args.mode == "train"``: runs ``NUM_EPOCH`` epochs via ``run_epoch`` and
      saves the model every time the F1 it returns improves.
    * any other mode: rebuilds the model from the pickled parameters in
      ``<path>/para.pkl``, restores its weights and calls ``infer`` on "test".
    """
    (data_holder, task2id, id2task, num_feat, num_voc, num_char, tgt_dict,
     embeddings) = Dataloader_onto.multitask_dataloader(
        pkl_path, num_task=num_task, batch_size=BATCH_SIZE)

    # NOTE: this aliases (does not copy) model_para, so the keys added below
    # are also visible through the module-level dict.
    para = model_para
    task2label = {"conll2000": "chunk", "wsjpos": "POS", "ontonotes": "NER"}
    # Alternative mapping kept from the original source:
    # task2label = {"conll2000": "chunk", "wsjpos": "POS", "conll2003": "NER"}
    para["id2task"] = id2task
    para["n_feats"] = num_feat
    para["n_vocs"] = num_voc
    para["n_tasks"] = num_task
    # One output size per task: the size of that task's label dictionary.
    para["out_size"] = [
        len(tgt_dict[task2label[id2task[task_idx]]])
        for task_idx in range(num_task)
    ]
    para["n_chars"] = num_char

    model = Model_s.build_model_cnn(para)
    model.Word_embeddings.apply_weights(embeddings)

    # Only trainable parameters go to the optimizer; the count covers all.
    params = [p for p in model.parameters() if p.requires_grad]
    num_params = sum(p.numel() for p in model.parameters())
    print(model)
    print("Num of paras:", num_params)
    print(model.concat_flag)

    def lr_decay(optimizer, epoch, decay_rate=0.05, init_lr=0.015):
        """Set lr to init_lr / (1 + decay_rate * epoch) on every param group."""
        lr = init_lr / (1 + decay_rate * epoch)
        print(" Learning rate is set as:", lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer

    def exp_lr_decay(optimizer, epoch, decay_rate=0.05, init_lr=0.015):
        """Set lr to init_lr * decay_rate ** epoch on every param group."""
        lr = init_lr * decay_rate**epoch
        print(" Learning rate is set as:", lr)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        return optimizer

    if args.optim == "noam":
        model_optim = optim_custorm.NoamOpt(
            para["d_hid"], 1, 1000,
            DenseSparseAdam(params, lr=0.0015, betas=(0.9, 0.98), eps=1e-9))
        # The manual decay schedules below only apply to SGD.
        args.decay = None
    elif args.optim == "sgd":
        model_optim = optim.SGD(params, lr=0.015, momentum=args.momentum,
                                weight_decay=1e-8)
    elif args.optim == "adam":
        # NOTE(review): lr=0.0 is what the original passes; nothing in this
        # function raises it afterwards -- confirm this is intended.
        model_optim = optim.Adam(params, lr=0.0, betas=(0.9, 0.98), eps=1e-9,
                                 weight_decay=1e-8)

    if args.mode == "train":
        best_F1 = 0
        # Initialised so the summary print below cannot hit an unbound name
        # when NUM_EPOCH is 0.
        save_idx = None
        # With para["crf"] set, no loss object is built here and run_epoch
        # receives None (the CRF loss construction is disabled in the source).
        calculate_loss = nn.NLLLoss() if not para["crf"] else None

        print("Start training...")
        print('-' * 60)
        KLLoss = None  # nn.KLDivLoss()
        start_point = time.time()
        for epoch_idx in range(NUM_EPOCH):
            if args.optim == "sgd":
                if args.decay == "exp":
                    model_optim = exp_lr_decay(model_optim, epoch_idx)
                elif args.decay == "normal":
                    model_optim = lr_decay(model_optim, epoch_idx)

            Pre, Rec, F1, loss_list = run_epoch(
                model, data_holder, model_optim, calculate_loss, KLLoss, para,
                epoch_idx, id2task)
            use_time = time.time() - start_point
            print("Time using: %f mins" % (use_time / 60))

            # `not best_F1` makes the first epoch always save, even if F1 == 0.
            if not best_F1 or best_F1 < F1:
                best_F1 = F1
                Model_s.save_model(model_path, model, para)
                print('*' * 60)
                print(
                    "Save model with average Pre: %f, Rec: %f, F1: %f on dev set."
                    % (Pre, Rec, F1))
                save_idx = epoch_idx
                print('*' * 60)
        # NOTE(review): placed after the loop as a final summary; the
        # flattened source does not show the original nesting.
        print("save model at epoch:", save_idx)
    else:
        para_path = os.path.join(path, 'para.pkl')
        # Fixed: the file must be opened for reading. The original used "wb",
        # which truncates para.pkl and makes pickle.load fail.
        # NOTE: pickle.load can execute arbitrary code; only load parameter
        # files produced by a trusted training run.
        with open(para_path, "rb") as f:
            para_save = pickle.load(f)
        model = Model_s.build_model(para_save)
        model = Model_s.read_model(model_path, model)
        prec_list, rec_list, f1_list = infer(model, data_holder, "test")
def get_optimizer(self):
    """Return a ``DenseSparseAdam`` optimizer (lr 5e-4) over this object's named parameters."""
    # Commented-out alternative kept from the original source:
    # AdamWOptimizer(self.named_parameters(), lr=5e-4, weight_decay=0.0001)
    named_params = self.named_parameters()
    optimizer = DenseSparseAdam(named_params, lr=5e-4)
    return optimizer
class DecoderTrainer:
    """Trains a ``SelfAttendedDecoder`` as a language model and keeps the best checkpoint.

    Every epoch makes one pass over the 'train' report batches with gradient
    updates, followed by a gradient-free pass over the 'test' batches that
    measures perplexity. That perplexity is fed to the LR scheduler and
    decides whether a checkpoint is written to ``self.ckpt_path``.
    """

    def __init__(self):
        """Create the decoder, its loss, optimizer and LR scheduler, and reset training state."""
        vocab = data_utils.vocab

        self.decoder = SelfAttendedDecoder(vocab=vocab, device_id=configs.decoder_device_id)
        # Padding positions are excluded from the cross-entropy.
        self.xe_loss = nn.CrossEntropyLoss(ignore_index=vocab.padding_id)
        self.optimizer = DenseSparseAdam(self.decoder.parameters(), lr=configs.adam_lr)
        # 'min' mode: stepped with the validation perplexity in train().
        self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            'min',
            patience=configs.lr_scheduler_patience,
            factor=configs.lr_scheduler_factor,
            verbose=True,
        )

        self.epoch_idx = 0
        # A checkpoint is only written once validation perplexity drops below this.
        self.min_ppl = 1000.
        self.ckpt_path = 'decoder.pretrained.params'

    def _lang_model_loss(self, report_batch, report_len_batch):
        """Return the cross-entropy between the decoder's logits and the report shifted by one.

        Shape notes below are carried over from the original inline comments.
        """
        # Unpacking is kept so a batch that is not 2-D still fails here, as before.
        _max_report_len, _batch_size = report_batch.shape

        # [max_report_len - 1, batch_size, vocab_size]
        logits = self.decoder.run_lang_model(report_batch, report_len_batch)

        # [(max_report_len - 1) * batch_size, vocab_size]
        flat_logits = logits.view(-1, data_utils.vocab.size)
        # [(max_report_len - 1) * batch_size]; targets drop the first position.
        flat_targets = (
            report_batch[1:, :]
            .contiguous()
            .view(-1)
            .to(torch.device(configs.decoder_device_id))
        )
        return self.xe_loss(flat_logits, flat_targets)

    def train(self):
        """Run epochs from ``self.epoch_idx`` up to ``configs.epoch_num``.

        Call ``load_ckpt()`` beforehand to resume: ``set_ckpt`` stores the
        saved epoch index plus one, so training continues with the next epoch.
        """
        for epoch_idx in range(self.epoch_idx, configs.epoch_num):
            self.epoch_idx = epoch_idx
            log(f'starting epoch {epoch_idx}')

            # ---- training pass ----
            log('training')
            self.decoder.train()
            loss_total = 0.
            seen = 0
            log_at = .5
            began = time.time()

            for pct, (report_batch, report_len_batch) in data_utils.gen_report_batches('train'):
                seen += 1
                self.optimizer.zero_grad()
                loss = self._lang_model_loss(report_batch, report_len_batch)
                loss.backward()
                self.optimizer.step()
                loss_total += loss.item()

                if pct >= log_at:
                    log(
                        f'{int(pct)}%, avg_train_loss: {loss_total / seen}, '
                        f'time: {time.time() - began}'
                    )
                    log_at += 10.

            loss_total /= seen
            log(
                f'avg_train_loss: {loss_total}\n'
                f'avg_train_time: {time.time() - began}'
            )

            # ---- validation pass (no gradients) ----
            with torch.no_grad():
                log('validating')
                self.decoder.eval()
                seen = 0
                ppl_total = 0
                log_at = 10.
                began = time.time()

                for pct, (report_batch, report_len_batch) in data_utils.gen_report_batches('test'):
                    seen += 1
                    loss = self._lang_model_loss(report_batch, report_len_batch)
                    # Perplexity is averaged per batch, not per token.
                    ppl_total += math.exp(loss.item())

                    if pct >= log_at:
                        log(
                            f'{int(pct)}%, avg_dev_ppl: {ppl_total / seen}, '
                            f'time: {time.time() - began}'
                        )
                        log_at += 10.

                ppl_total /= seen
                self.lr_scheduler.step(ppl_total)
                log(
                    f'avg_dev_time: {time.time() - began}\n'
                    f'avg_dev_ppl: {ppl_total}'
                )

                # Keep only the best-so-far model on disk.
                if ppl_total < self.min_ppl:
                    self.min_ppl = ppl_total
                    self.save_ckpt()

    def get_ckpt(self):
        """Collect everything needed to resume training into one dict."""
        snapshot = {}
        snapshot['epoch_idx'] = self.epoch_idx
        snapshot['min_ppl'] = self.min_ppl
        snapshot['decoder'] = self.decoder.state_dict()
        snapshot['optimizer'] = self.optimizer.state_dict()
        snapshot['lr_scheduler'] = self.lr_scheduler.state_dict()
        return snapshot

    def set_ckpt(self, ckpt_dict):
        """Restore trainer state from a dict produced by ``get_ckpt``."""
        # Resume with the epoch after the one that was saved.
        self.epoch_idx = ckpt_dict['epoch_idx'] + 1
        self.min_ppl = ckpt_dict['min_ppl']

        # The embedding weight is moved to the decoder device or to the CPU
        # depending on the gumbel-softmax setting, before loading.
        decoder_state = ckpt_dict['decoder']
        embedder_weight = decoder_state['embedder.weight']
        if configs.uses_gumbel_softmax:
            embedder_weight = embedder_weight.to(torch.device(configs.decoder_device_id))
        else:
            embedder_weight = embedder_weight.cpu()
        decoder_state['embedder.weight'] = embedder_weight

        self.decoder.load_state_dict(decoder_state)

        # NOTE(review): the flag name suggests the saved optimizer state would be
        # skipped when set, yet it gates loading it. The flattened source also
        # leaves it unclear whether the scheduler load belongs under this `if`;
        # it is kept with the optimizer here -- confirm against the original.
        if configs.uses_new_optimizer:
            self.optimizer.load_state_dict(ckpt_dict['optimizer'])
            self.lr_scheduler.load_state_dict(ckpt_dict['lr_scheduler'])

        # Drop the loaded tensors and release cached GPU memory.
        del decoder_state, embedder_weight
        del ckpt_dict
        torch.cuda.empty_cache()

    # Reading `ckpt` snapshots the state; assigning to it restores the state.
    ckpt = property(get_ckpt, set_ckpt)

    def save_ckpt(self):
        """Write the current checkpoint to ``self.ckpt_path``."""
        torch.save(self.ckpt, f=self.ckpt_path)
        print(f'saved checkpoint to {self.ckpt_path}')

    def load_ckpt(self):
        """Load the checkpoint at ``self.ckpt_path`` and apply it via the ``ckpt`` setter."""
        self.ckpt = torch.load(self.ckpt_path)
        print(f'loaded checkpoint from {self.ckpt_path}')
def _load_raw_data(path, data_format, max_sent_len, max_chars):
    """Load one corpus split: parse plain text with ``read_corpus`` or unpickle it."""
    if data_format == 'plain':
        return read_corpus(path, max_sent_len, max_chars)
    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    # The context manager closes the handle the original left open.
    with open(path, 'rb') as fin:
        return pickle.load(fin)


def _dump_mapping(mapping, path):
    """Write a ``{token: index}`` mapping to ``path`` as tab-separated UTF-8 lines."""
    with codecs.open(path, 'w', encoding='utf-8') as fpo:
        for token, index in mapping.items():
            print('{0}\t{1}'.format(token, index), file=fpo)


def train():
    """Command-line entry point that trains the model described by a JSON config.

    Arguments are parsed from ``sys.argv[2:]`` (the first two entries are
    skipped). The function loads the train/valid/test data, builds the
    vocabularies, model, batchers, optimizer and LR scheduler, writes the
    dictionaries and configuration into the ``--model`` directory, then calls
    ``train_model`` once per epoch and logs the best perplexities.

    Raises:
        ValueError: if the config names an unknown optimizer type.
    """
    cmd = argparse.ArgumentParser(sys.argv[0], conflict_handler='resolve')
    cmd.add_argument('--seed', default=1, type=int, help='The random seed.')
    cmd.add_argument('--gpu', default=-1, type=int, help='Use id of gpu, -1 if cpu.')
    cmd.add_argument('--format', default='plain', choices=('plain', 'pickle'),
                     help='the input format.')
    cmd.add_argument('--train_path', required=True, help='The path to the training file.')
    cmd.add_argument('--vocab_path', required=True, help='The path to the vocabulary.')
    cmd.add_argument('--valid_path', help='The path to the development file.')
    cmd.add_argument('--test_path', help='The path to the testing file.')
    cmd.add_argument('--config_path', required=True, help='the path to the config file.')
    cmd.add_argument("--model", required=True, help="path to save model")
    cmd.add_argument("--batch_size", "--batch", type=int, default=32, help='the batch size.')
    cmd.add_argument("--max_epoch", type=int, default=100,
                     help='the maximum number of iteration.')
    cmd.add_argument('--max_sent_len', type=int, default=20, help='maximum sentence length.')
    cmd.add_argument('--bucket', action='store_true', default=False,
                     help='do bucket batching.')
    cmd.add_argument('--save_classify_layer', default=False, action='store_true',
                     help="whether to save the classify layer")
    cmd.add_argument('--always_save', default=False, action='store_true',
                     help="always saving the model, appending global epoch id.")
    cmd.add_argument('--valid_size', type=int, default=0,
                     help="size of validation dataset when there's no valid.")
    cmd.add_argument('--eval_steps', type=int, help='evaluate every xx batches.')
    cmd.add_argument('--report_steps', type=int, default=32, help='report every xx batches.')

    opt = cmd.parse_args(sys.argv[2:])

    with open(opt.config_path, 'r') as fin:
        conf = json.load(fin)

    # Dump configurations
    logger.info('hostname: {}'.format(socket.gethostname()))
    logger.info('cmd opt: {}'.format(opt))
    logger.info('model configuration: {}'.format(conf))

    # Set seed.
    torch.manual_seed(opt.seed)
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    if opt.gpu >= 0:
        torch.cuda.set_device(opt.gpu)
        if opt.seed > 0:
            torch.cuda.manual_seed(opt.seed)

    use_cuda = opt.gpu >= 0 and torch.cuda.is_available()

    # fp16 is only honoured when CUDA is in use and apex can be imported;
    # otherwise the option is switched off in the config as well.
    use_fp16 = False
    if conf['optimizer'].get('fp16', False):
        use_fp16 = True
        if not use_cuda:
            logger.warning('WARNING: fp16 requires --gpu, ignoring fp16 option')
            conf['optimizer']['fp16'] = False
            use_fp16 = False
        else:
            try:
                from apex.fp16_utils import FP16_Optimizer
            except ImportError:
                # Narrowed from a bare `except:` so that KeyboardInterrupt /
                # SystemExit are no longer swallowed here.
                print('WARNING: apex not installed, ignoring --fp16 option')
                conf['optimizer']['fp16'] = False
                use_fp16 = False

    embedder_conf = conf['token_embedder']
    token_embedder_max_chars = embedder_conf.get('max_characters_per_token', None)

    # Load training data.
    raw_training_data = _load_raw_data(opt.train_path, opt.format, opt.max_sent_len,
                                       token_embedder_max_chars)
    logger.info('training instance: {}, training tokens: {}.'.format(
        len(raw_training_data), count_tokens(raw_training_data)))

    # Load valid data if a path is provided; otherwise, when --valid_size is
    # positive, carve the validation set out of the training data.
    if opt.valid_path is not None:
        raw_valid_data = _load_raw_data(opt.valid_path, opt.format, opt.max_sent_len,
                                        token_embedder_max_chars)
        logger.info('valid instance: {}, valid tokens: {}.'.format(
            len(raw_valid_data), count_tokens(raw_valid_data)))
    elif opt.valid_size > 0:
        raw_training_data, raw_valid_data = split_train_and_valid(
            raw_training_data, opt.valid_size)
        logger.info(
            'training instance: {}, training tokens after division: {}.'.
            format(len(raw_training_data), count_tokens(raw_training_data)))
        logger.info('valid instance: {}, valid tokens: {}.'.format(
            len(raw_valid_data), count_tokens(raw_valid_data)))
    else:
        raw_valid_data = None

    # Load test data if path is provided.
    if opt.test_path is not None:
        raw_test_data = _load_raw_data(opt.test_path, opt.format, opt.max_sent_len,
                                       token_embedder_max_chars)
        logger.info('testing instance: {}, testing tokens: {}.'.format(
            len(raw_test_data), count_tokens(raw_test_data)))
    else:
        raw_test_data = None

    # Initialize vocab_batch (the output vocabulary, read from --vocab_path).
    vocab_batch = VocabBatch(
        lower=not conf['classifier'].get('vocab_cased', True),
        normalize_digits=conf['classifier'].get('vocab_digit_normalized', False),
        use_cuda=use_cuda)
    vocab_batch.create_dict_from_file(opt.vocab_path)

    # Word
    if embedder_conf.get('word_dim', 0) > 0:
        word_batch = WordBatch(
            min_cut=embedder_conf.get('word_min_cut', 0),
            lower=not embedder_conf.get('word_cased', True),
            add_sentence_boundary=embedder_conf.get('add_sentence_boundary_ids', False),
            use_cuda=use_cuda)
        word_batch.create_dict_from_dataset(raw_training_data)
    else:
        word_batch = None

    # SubToken
    if embedder_conf.get('char_dim', 0) > 0 or embedder_conf.get('wordpiece_dim', 0) > 0:
        # For the CNN embedder the widest filter sets the minimum length.
        if embedder_conf['name'] == 'cnn':
            min_char = max(width for width, _ in embedder_conf['filters'])
        else:
            min_char = 1

        if embedder_conf.get('char_dim', 0) > 0:
            char_batch = CharacterBatch(
                min_char=min_char,
                lower=not embedder_conf.get('char_cased', True),
                add_sentence_boundary=embedder_conf.get('add_sentence_boundary_ids', False),
                add_word_boundary=embedder_conf.get('add_word_boundary_ids', False),
                use_cuda=use_cuda)
        else:
            char_batch = WordPieceBatch(
                min_char=min_char,
                vocab_file=embedder_conf['wordpiece_vocab'],
                lower=not embedder_conf.get('word_cased', True),
                add_sentence_boundary=embedder_conf.get('add_sentence_boundary_ids', False),
                add_word_boundary=embedder_conf.get('add_word_boundary_ids', False),
                use_cuda=use_cuda)
        char_batch.create_dict_from_dataset(raw_training_data)
    else:
        char_batch = None

    logger.info('vocab size: {0}'.format(len(vocab_batch.mapping)))
    n_classes = len(vocab_batch.mapping)

    model = Model(conf, word_batch, char_batch, n_classes)
    logger.info(str(model))
    if use_cuda:
        model = model.cuda()
        if use_fp16:
            model = model.half()

    # Create training batch
    if opt.bucket:
        training_batcher = BucketBatcher(raw_training_data, word_batch, char_batch,
                                         vocab_batch, opt.batch_size)
    else:
        training_batcher = Batcher(raw_training_data, word_batch, char_batch,
                                   vocab_batch, opt.batch_size,
                                   keep_full=False, sorting=True, shuffle=True)

    # Set up evaluation steps: by default evaluate once per pass over the data.
    if opt.eval_steps is None:
        opt.eval_steps = training_batcher.num_batches()
    logger.info('Evaluate every {0} batches.'.format(opt.eval_steps))

    # If there is valid, create valid batch.
    if raw_valid_data is not None:
        valid_batcher = Batcher(raw_valid_data, word_batch, char_batch,
                                vocab_batch, opt.batch_size,
                                keep_full=True, sorting=True, shuffle=False)
    else:
        valid_batcher = None

    # If there is test, create test batch.
    if raw_test_data is not None:
        test_batcher = Batcher(raw_test_data, word_batch, char_batch,
                               vocab_batch, opt.batch_size,
                               keep_full=True, sorting=True, shuffle=False)
    else:
        test_batcher = None

    # Save metadata (dictionaries and configuration) next to the model.
    try:
        os.makedirs(opt.model)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise

    if char_batch is not None:
        _dump_mapping(char_batch.mapping, os.path.join(opt.model, 'char.dic'))
    if word_batch is not None:
        _dump_mapping(word_batch.mapping, os.path.join(opt.model, 'word.dic'))
    _dump_mapping(vocab_batch.mapping, os.path.join(opt.model, 'vocab.dic'))

    # Copy the model config into the model directory and record the new
    # location in the saved command-line options.
    new_config_path = os.path.join(opt.model, os.path.basename(opt.config_path))
    shutil.copy(opt.config_path, new_config_path)
    opt.config_path = new_config_path
    # Closed explicitly so config.json is flushed before training starts.
    with codecs.open(os.path.join(opt.model, 'config.json'), 'w',
                     encoding='utf-8') as fpo:
        json.dump(vars(opt), fpo)

    optim_conf = conf['optimizer']
    optimizer_name = optim_conf['type'].lower()
    params = filter(lambda param: param.requires_grad, model.parameters())
    if optimizer_name == 'adamax':
        optimizer = torch.optim.Adamax(params,
                                       lr=optim_conf.get('lr', 2e-3),
                                       betas=optim_conf.get('betas', (0.9, 0.999)),
                                       eps=optim_conf.get('eps', 1e-8))
    elif optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(params,
                                    lr=optim_conf.get('lr', 0.01),
                                    momentum=optim_conf.get('momentum', 0),
                                    nesterov=optim_conf.get('nesterov', False))
    elif optimizer_name == 'dense_sparse_adam' or optimizer_name == 'adam':
        # 'adam' is deliberately served by DenseSparseAdam as well.
        optimizer = DenseSparseAdam(params,
                                    lr=optim_conf.get('lr', 1e-3),
                                    betas=optim_conf.get('betas', (0.9, 0.999)),
                                    eps=optim_conf.get('eps', 1e-8))
    else:
        raise ValueError('Unknown optimizer name: {0}'.format(optimizer_name))

    if use_fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=1.,
                                   dynamic_loss_scale=True,
                                   dynamic_loss_args={'init_scale': 2**16})

    # The scheduler defaults to 'noam'; any unrecognised name disables it.
    scheduler_name = optim_conf.get('scheduler', 'noam')
    if scheduler_name == 'cosine':
        scheduler = CosineWithRestarts(optimizer, optim_conf['max_step'],
                                       eta_min=optim_conf.get('eta_min', 0.0))
    elif scheduler_name == 'dev_perf':
        scheduler = LearningRateScheduler.by_name('reduce_on_plateau')(
            optimizer,
            factor=optim_conf.get('decay_rate', 0.5),
            patience=optim_conf.get('patience', 5),
            min_lr=optim_conf.get('lr_min', 1e-6))
    elif scheduler_name == 'noam':
        scheduler = NoamLR(optimizer,
                           model_size=optim_conf.get('model_size', 512),
                           warmup_steps=optim_conf.get('warmup_step', 6000))
    else:
        scheduler = None

    best_train, best_valid, test_result = 1e8, 1e8, 1e8
    for epoch in range(opt.max_epoch):
        # The fourth return value (whether this epoch improved) is not used here.
        best_train, best_valid, test_result, _ = train_model(
            epoch, conf, opt, model, optimizer, scheduler, training_batcher,
            valid_batcher, test_batcher, best_train, best_valid, test_result)

    if raw_valid_data is None:
        logger.info("best train ppl: {:.6f}.".format(best_train))
    elif raw_test_data is None:
        logger.info("best train ppl: {:.6f}, best valid ppl: {:.6f}.".format(
            best_train, best_valid))
    else:
        logger.info(
            "best train ppl: {:.6f}, best valid ppl: {:.6f}, test ppl: {:.6f}."
            .format(best_train, best_valid, test_result))
def get_optimizer(self):
    """Return a ``DenseSparseAdam`` optimizer (lr 5e-4) over this object's named parameters."""
    # Commented-out alternative kept from the original source:
    # AdadeltaOptimizer(self.named_parameters())
    named_params = self.named_parameters()
    optimizer = DenseSparseAdam(named_params, lr=5e-4)
    return optimizer