def __init__(self, opt):
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch

    checkpoint = torch.load(opt.model, map_location=lambda storage, loc: storage)
    model_opt = checkpoint['settings']
    # Older checkpoints were saved without the context flag.
    if 'use_ctx' not in model_opt.__dict__:
        model_opt.use_ctx = False
    self.model_opt = model_opt

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        proj_share_weight=model_opt.proj_share_weight,
        embs_share_weight=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner_hid=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout,
        use_ctx=model_opt.use_ctx)

    prob_projection = nn.LogSoftmax()

    model.load_state_dict(checkpoint['model'])

    # New max_token_seq_len for position encoding
    model = self.change_position_embedings(
        model, opt.max_token_seq_len, model_opt.d_word_vec, model_opt.use_ctx)
    model_opt.max_token_seq_len = opt.max_token_seq_len
    print('[Info] Trained model state loaded.')

    if opt.cuda:
        model.cuda()
        prob_projection.cuda()
    else:
        model.cpu()
        prob_projection.cpu()

    model.prob_projection = prob_projection

    self.model = model
    self.model.eval()
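# The loader above calls self.change_position_embedings(...) to stretch the
# model's position encodings to a new max_token_seq_len. That helper is not
# shown here; below is a minimal sketch of what it could look like, assuming
# the standard sinusoidal table from "Attention Is All You Need". The
# attribute names (encoder.position_enc, decoder.position_enc, ctx_encoder)
# are assumptions, not a confirmed API.
import numpy as np
import torch


def get_sinusoid_table(n_position, d_word_vec):
    # PE(pos, 2i) = sin(pos / 10000^(2i/d)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d))
    pos = np.arange(n_position)[:, None].astype(float)
    div = np.power(10000.0, 2 * (np.arange(d_word_vec) // 2) / d_word_vec)
    table = pos / div
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    table[0] = 0.  # row 0 assumed reserved for padding
    return torch.FloatTensor(table)


def change_position_embedings(model, max_token_seq_len, d_word_vec, use_ctx):
    # Sinusoidal tables are deterministic, so a longer table can simply
    # replace the stored weights without any retraining.
    table = get_sinusoid_table(max_token_seq_len + 1, d_word_vec)
    model.encoder.position_enc.weight.data = table
    model.decoder.position_enc.weight.data = table.clone()
    if use_ctx:
        model.ctx_encoder.position_enc.weight.data = table.clone()
    return model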
def __init__(self, opt):
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch

    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    self.model_opt = model_opt

    model = Transformer(
        model_opt.src_vocab_size,
        model_opt.tgt_vocab_size,
        model_opt.max_token_seq_len,
        proj_share_weight=model_opt.proj_share_weight,
        embs_share_weight=model_opt.embs_share_weight,
        d_k=model_opt.d_k,
        d_v=model_opt.d_v,
        d_model=model_opt.d_model,
        d_word_vec=model_opt.d_word_vec,
        d_inner_hid=model_opt.d_inner_hid,
        n_layers=model_opt.n_layers,
        n_head=model_opt.n_head,
        dropout=model_opt.dropout)

    prob_projection = nn.LogSoftmax()

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    if opt.cuda:
        model.cuda()
        prob_projection.cuda()
    else:
        model.cpu()
        prob_projection.cpu()

    model.prob_projection = prob_projection

    self.model = model
    self.model.eval()
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)

    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='best')

    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    #========= Preparing DataLoader =========#
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        cuda=opt.cuda)

    validation_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['valid']['src'],
        tgt_insts=data['valid']['tgt'],
        batch_size=opt.batch_size,
        shuffle=False,
        test=True,
        cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx:
        print('[Warning]',
              'The src/tgt word2idx tables are different but asked to share word embeddings.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout)

    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(training_data.tgt_vocab_size)

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()

    print("===>TRAIN\n")
    train(transformer, training_data, validation_data, crit, optimizer, opt)
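# ScheduledOptim above wraps Adam with the warm-up schedule from "Attention
# Is All You Need": lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5).
# A minimal sketch of such a wrapper (the class and method names here are
# assumptions for illustration, not the repository's actual API):
class ScheduledOptimSketch:
    def __init__(self, optimizer, d_model, n_warmup_steps):
        self.optimizer = optimizer
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step_and_update_lr(self):
        # lr grows linearly for n_warmup_steps, then decays as step^-0.5.
        self.n_steps += 1
        lr = (self.d_model ** -0.5) * min(
            self.n_steps ** -0.5,
            self.n_steps * self.n_warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()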
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()

    parser.add_argument('-data', required=True)

    parser.add_argument('-epoch', type=int, default=100)
    parser.add_argument('-batch_size', type=int, default=64)

    #parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=1024)

    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'], default='all')

    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-multi_gpu', action='store_true')
    parser.add_argument('-use_ctx', action='store_true')
    parser.add_argument(
        '-external_validation_script', type=str, default=None, metavar='PATH', nargs='*',
        help='location of validation script (to run your favorite metric for validation) '
             '(default: %(default)s)')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_token_seq_len

    #========= Preparing DataLoader =========#
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        ctx_insts=(data['train']['ctx'] if opt.use_ctx else None),
        batch_size=opt.batch_size,
        cuda=opt.cuda,
        is_train=True,
        sort_by_length=True)

    validation_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['valid']['src'],
        tgt_insts=data['valid']['tgt'],
        ctx_insts=(data['valid']['ctx'] if opt.use_ctx else None),
        batch_size=opt.batch_size,
        shuffle=False,
        cuda=opt.cuda,
        is_train=False,
        sort_by_length=True)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size

    #========= Preparing Model =========#
    if opt.embs_share_weight and training_data.src_word2idx != training_data.tgt_word2idx:
        print('[Warning]',
              'The src/tgt word2idx tables are different but asked to share word embeddings.')

    print(opt)

    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout,
        use_ctx=opt.use_ctx)

    #print(transformer)

    optimizer = ScheduledOptim(
        optim.Adam(transformer.get_trainable_parameters(),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(vocab_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(vocab_size)
        weight[Constants.PAD] = 0
        # LogSoftmax is applied separately below, so score log-probs with
        # NLLLoss rather than raw logits with CrossEntropyLoss.
        #return nn.CrossEntropyLoss(weight, size_average=False)
        return nn.NLLLoss(weight, size_average=False)

    crit = get_criterion(training_data.tgt_vocab_size)
    logsoftmax = nn.LogSoftmax()

    if opt.cuda:
        transformer = transformer.cuda()
        crit = crit.cuda()
        logsoftmax = logsoftmax.cuda()
        if opt.multi_gpu:
            transformer = nn.DataParallel(transformer)
            crit = nn.DataParallel(crit)
            logsoftmax = nn.DataParallel(logsoftmax)

    train(transformer, training_data, validation_data, crit, logsoftmax, optimizer, opt)
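# The variant above applies nn.LogSoftmax separately and scores with NLLLoss;
# on raw logits this is numerically identical to CrossEntropyLoss, but
# splitting the softmax out lets it run under nn.DataParallel alongside the
# model. A quick self-contained check of the equivalence:
import torch
import torch.nn as nn

logits = torch.randn(4, 10)            # (batch, vocab)
target = torch.tensor([1, 0, 3, 9])

ce = nn.CrossEntropyLoss()(logits, target)
nll = nn.NLLLoss()(nn.LogSoftmax(dim=1)(logits), target)
assert torch.allclose(ce, nll)         # same value up to float rounding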
# (tail of a checkpoint-saving helper; the enclosing def is not shown)
        save_name = os.path.join(
            hp.checkpoint_path, hp.model_path_pre + "_" + str(num_step) + ".pth")
        print("save model at steps of {}".format(num_step))
        torch.save(save_model, save_name)


if __name__ == "__main__":
    torch.cuda.set_device(hp.gpu)

    # ResNet feature extractor feeding a Transformer decoder.
    net1 = resnet.resnet34()
    net2 = Transformer(
        len_encoder=hp.enc_input_len,
        n_tgt_vocab=hp.num_classes,
        len_max_seq=hp.MAX_LEN,
        n_layers=hp.n_layers)
    net1 = net1.cuda()
    net2 = net2.cuda()

    trainLoader = dataset.getDataLoader(
        is_train=True, batch_size=hp.BATCH_SIZE, shuffle=True)
    iter_one_epoch = len(trainLoader)
    print("iteration_every_epoch: ", iter_one_epoch)
    #testloader = dataset.getDataLoader(is_train=False, batch_size=BATCH_SIZE, shuffle=False)

    lossFunction = nn.CrossEntropyLoss(ignore_index=Constants.PAD)
    optimizer_ = optim.Adam(
        [{'params': net1.parameters()},
         {'params': filter(lambda x: x.requires_grad, net2.parameters())}],
        betas=[0.9, 0.98],
        eps=1e-9)  # the snippet breaks off mid-call; this eps value is an assumption
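# Passing two parameter groups to one Adam instance trains the CNN backbone
# and the Transformer head jointly; each group dict may also carry its own
# 'lr' to tune the sub-networks at different speeds. A minimal illustration
# (the modules and learning rates below are made up for the example):
import torch.nn as nn
import torch.optim as optim

backbone = nn.Linear(8, 8)   # stand-in for net1
head = nn.Linear(8, 8)       # stand-in for net2

joint_opt = optim.Adam(
    [{'params': backbone.parameters(), 'lr': 1e-4},  # slower backbone
     {'params': head.parameters(), 'lr': 3e-4}],     # faster head
    betas=(0.9, 0.98), eps=1e-9)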
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', type=str, default='config/rnnt.yaml')
    parser.add_argument('-load_model', type=str, default=None)
    parser.add_argument('-num_workers', type=int, default=0,
                        help='how many subprocesses to use for data loading. '
                        '0 means that the data will be loaded in the main process')
    parser.add_argument('-log', type=str, default='train.log')
    opt = parser.parse_args()

    configfile = open(opt.config)
    config = AttrDict(yaml.load(configfile))

    exp_name = config.data.name
    if not os.path.isdir(exp_name):
        os.mkdir(exp_name)
    logger = init_logger(exp_name + '/' + opt.log)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(config.training.seed)
        torch.backends.cudnn.deterministic = True
    else:
        raise NotImplementedError

    #========= Build DataLoader =========#
    train_dataset = AudioDateset(config.data, 'train')
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.data.train.batch_size,
        shuffle=True, num_workers=opt.num_workers)

    assert train_dataset.vocab_size == config.model.vocab_size

    #========= Build A Model Or Load Pre-trained Model =========#
    model = Transformer(config.model)

    n_params, enc_params, dec_params = count_parameters(model)
    logger.info('# the number of parameters in the whole model: %d' % n_params)
    logger.info('# the number of parameters in encoder: %d' % enc_params)
    logger.info('# the number of parameters in decoder: %d' % dec_params)

    if torch.cuda.is_available():
        model.cuda()

    global global_step
    global_step = 0

    # define an optimizer
    optimizer = ScheduledOptim(model, config.model.d_model, config.optimizer)

    # load pretrained model
    if opt.load_model is not None:
        checkpoint = torch.load(opt.load_model)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info('Loaded pretrained model and previous optimizer!')
    else:
        init_parameters(model)
        logger.info('Initialized all parameters!')

    # define loss function
    crit = nn.CrossEntropyLoss(ignore_index=0)

    # create a visualizer
    if config.training.visualization:
        visualizer = SummaryWriter(exp_name + '/log')
        logger.info('Created a visualizer.')
    else:
        visualizer = None

    for epoch in range(config.training.epoches):
        train(epoch, model, crit, optimizer, train_loader, logger, visualizer, config)
        save_model(epoch, model, optimizer, config, logger)

    logger.info('Training process finished')
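# count_parameters above is not defined in this snippet; a minimal sketch,
# assuming the model exposes .encoder and .decoder submodules:
def count_parameters(model):
    # Trainable parameter counts for the whole model and each half.
    n_all = sum(p.numel() for p in model.parameters() if p.requires_grad)
    n_enc = sum(p.numel() for p in model.encoder.parameters() if p.requires_grad)
    n_dec = sum(p.numel() for p in model.decoder.parameters() if p.requires_grad)
    return n_all, n_enc, n_dec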
def get_embedding():
    import transformer.Constants as Constants
    from transformer.Models import Transformer
    from transformer.Optim import ScheduledOptim
    from transformer.Modules import LabelSmoothing
    from transformer.Beam import Beam
    from transformer.Translator import translate
    from preprocess import read_instances_from_file, convert_instance_to_idx_seq
    import evals
    from evals import Logger
    from DataLoader import DataLoader

    data = torch.load(opt.data)
    opt.max_token_seq_len_e = data['settings'].max_seq_len
    opt.max_token_seq_len_d = 30
    opt.proj_share_weight = True
    opt.d_word_vec = opt.d_model

    # The loader is needed below for the vocabulary sizes.
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        shuffle=True,
        cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size
    opt.tgt_vocab_size = opt.tgt_vocab_size - 4

    opt.d_v = int(opt.d_model / opt.n_head)
    opt.d_k = int(opt.d_model / opt.n_head)

    model = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len_e,
        opt.max_token_seq_len_d,
        proj_share_weight=opt.proj_share_weight,
        embs_share_weight=False,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner_hid=opt.d_inner_hid,
        n_layers_enc=opt.n_layers_enc,
        n_layers_dec=opt.n_layers_dec,
        n_head=opt.n_head,
        dropout=opt.dropout,
        dec_dropout=opt.dec_dropout,
        encoder=opt.encoder,
        decoder=opt.decoder,
        enc_transform=opt.enc_transform,
        onehot=opt.onehot,
        no_enc_pos_embedding=opt.no_enc_pos_embedding,
        dec_reverse=opt.dec_reverse,
        no_residual=opt.no_residual)

    state_dict = torch.load(opt.results_dir + '/' + opt.mname + '/model.chkpt')
    model.load_state_dict(state_dict['model'])
    model = model.cuda()
    model.eval()

    # Dump the target-side word embedding matrix.
    W = model.decoder.tgt_word_emb.weight.data.cpu().numpy()
    numpy.save('Embedding', W)  # numpy.save(file, arr); writes Embedding.npy
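# numpy.save appends the .npy suffix, so the matrix written above can be
# read back for analysis like this:
import numpy

W = numpy.load('Embedding.npy')  # shape: (tgt_vocab_size, d_word_vec)
print(W.shape)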
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', type=str, default='config/rnnt.yaml')
    parser.add_argument('-load_model', type=str, default=None)
    parser.add_argument('-fp16_allreduce', action='store_true', default=False,
                        help='use fp16 compression during allreduce')
    parser.add_argument('-batches_per_allreduce', type=int, default=1,
                        help='number of batches processed locally before '
                        'executing allreduce across workers; it multiplies '
                        'total batch size.')
    parser.add_argument('-num_workers', type=int, default=0,
                        help='how many subprocesses to use for data loading. '
                        '0 means that the data will be loaded in the main process')
    parser.add_argument('-log', type=str, default='train.log')
    opt = parser.parse_args()

    configfile = open(opt.config)
    config = AttrDict(yaml.load(configfile))

    global global_step
    global_step = 0

    # Only rank 0 writes logs and checkpoints.
    if hvd.rank() == 0:
        exp_name = config.data.name
        if not os.path.isdir(exp_name):
            os.mkdir(exp_name)
        logger = init_logger(exp_name + '/' + opt.log)
    else:
        logger = None

    if torch.cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(config.training.seed)
        torch.backends.cudnn.deterministic = True
    else:
        raise NotImplementedError

    #========= Build DataLoader =========#
    train_dataset = AudioDateset(config.data, 'train')
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.data.train.batch_size,
        sampler=train_sampler)

    assert train_dataset.vocab_size == config.model.vocab_size

    #========= Build A Model Or Load Pre-trained Model =========#
    model = Transformer(config.model)

    if hvd.rank() == 0:
        n_params, enc_params, dec_params = count_parameters(model)
        logger.info('# the number of parameters in the whole model: %d' % n_params)
        logger.info('# the number of parameters in encoder: %d' % enc_params)
        logger.info('# the number of parameters in decoder: %d' % dec_params)

    model.cuda()

    # define an optimizer
    lr = config.optimizer.lr  # assumption: base learning rate read from the config
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 betas=(0.9, 0.98), eps=1e-9)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if opt.fp16_allreduce else hvd.Compression.none
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters(),
        compression=compression)

    # load pretrained model on rank 0; parameters are broadcast below
    if opt.load_model is not None and hvd.rank() == 0:
        checkpoint = torch.load(opt.load_model)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger.info('Loaded pretrained model and previous optimizer!')
    elif hvd.rank() == 0:
        init_parameters(model)
        logger.info('Initialized all parameters!')

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # define loss function
    crit = nn.CrossEntropyLoss(ignore_index=0)

    # create a visualizer
    if config.training.visualization and hvd.rank() == 0:
        visualizer = SummaryWriter(exp_name + '/log')
        logger.info('Created a visualizer.')
    else:
        visualizer = None

    for epoch in range(config.training.epoches):
        train(epoch, model, crit, optimizer, train_loader, train_sampler,
              logger, visualizer, config)

        if hvd.rank() == 0:
            save_model(epoch, model, optimizer, config, logger)

    if hvd.rank() == 0:
        logger.info('Training process finished')
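# The Horovod calls above assume hvd.init() already ran (it is not shown in
# this snippet) and that the script is launched with one process per GPU,
# e.g. via horovodrun. The script name below is illustrative:
#
#   horovodrun -np 4 python train.py -config config/rnnt.yaml
#
import horovod.torch as hvd

hvd.init()  # must be called before hvd.rank()/hvd.local_rank()/hvd.size()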