def __init__(self, args, train_loader, test_loader, tokenizer_src, tokenizer_tgt):
    self.args = args
    self.train_loader = train_loader
    self.test_loader = test_loader
    self.src_vocab_size = tokenizer_src.vocab_size
    self.tgt_vocab_size = tokenizer_tgt.vocab_size
    # tokenizer_tgt.pad_token_id is expected to be the same value.
    self.pad_id = tokenizer_src.pad_token_id
    self.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

    self.model = Transformer(src_vocab_size=self.src_vocab_size,
                             tgt_vocab_size=self.tgt_vocab_size,
                             seq_len=args.max_seq_len,
                             d_model=args.hidden,
                             n_layers=args.n_layers,
                             n_heads=args.n_attn_heads,
                             p_drop=args.dropout,
                             d_ff=args.ffn_hidden,
                             pad_id=self.pad_id)
    if args.multi_gpu:
        self.model = nn.DataParallel(self.model)
    self.model.to(self.device)

    self.optimizer = ScheduledOptim(optim.Adam(self.model.parameters(), betas=(0.9, 0.98), eps=1e-9),
                                    init_lr=2.0,
                                    d_model=args.hidden)
    self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_id)
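# ScheduledOptim itself is not shown above. A minimal sketch (assumption, not the project's
# implementation) of the inverse-square-root warmup schedule from "Attention Is All You Need",
# where init_lr is assumed to act as a constant scale and warmup_steps is a hyperparameter:
def noam_lr(step, d_model, init_lr=2.0, warmup_steps=4000):
    # lr = init_lr * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    step = max(step, 1)
    return init_lr * (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)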
def __init__(self, hparams, **kwargs):
    super(Transformer_pl, self).__init__()
    self.hparams = hparams
    self.transformer = Transformer(self.hparams)
    self.sp_kor = korean_tokenizer_load()
    self.sp_eng = english_tokenizer_load()
def build_train_model(self):
    self.train_mode = None
    print("# Select train mode [{}]".format("/".join([i[:3] for i in TRAIN_MODE_LIST])))
    for mode in TRAIN_MODE_LIST:
        if mode.startswith(self.hparams.train_mode):
            self.train_mode = mode
    assert self.train_mode

    self.data_loader = DataLoader(hparams=self.hparams, training=self.training, mode=self.train_mode)

    with tf.variable_scope('Network_Operator'):
        self.dataset_handler = tf.placeholder(tf.string, shape=[], name='dataset_handler')
        self.train_batch_iter = self.data_loader.get_training_batch(self.data_loader.train_dataset)
        self.test_batch_iter = self.data_loader.get_training_batch(self.data_loader.test_dataset)
        self.train_dataset_count, self.test_dataset_count = \
            self.data_loader.train_dataset_count, self.data_loader.test_dataset_count
        input_batch = self.data_loader.multiple_batch(self.dataset_handler, self.train_batch_iter.batched_dataset)

    print("# Build model =", self.train_mode)
    self.model = Transformer(mode=self.train_mode,
                             graph=self.graph,
                             hparams=self.hparams,
                             data_loader=self.data_loader,
                             batch_input=input_batch)
    self.global_step = self.model.global_step
    self.epoch_num = self.model.train_epoch
def train():
    inputs, src_vocab_size, tgt_vocab_size, idx2word = create_data()
    enc_inputs, dec_inputs, dec_outputs = make_data(*inputs)
    data_loader = Data.DataLoader(dataset=MyDataSet(enc_inputs, dec_inputs, dec_outputs),
                                  batch_size=2, shuffle=True)

    model = Transformer(src_vocab_size, tgt_vocab_size).cuda()
    # PAD carries no meaning; its token index is 0, so ignore_index=0 keeps PAD positions out of the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.09)

    for epoch in range(30):
        for enc_inputs, dec_inputs, dec_outputs in data_loader:
            """
            enc_inputs: [batch_size, src_len]
            dec_inputs: [batch_size, tgt_len]
            dec_outputs: [batch_size, tgt_len]
            """
            enc_inputs, dec_inputs, dec_outputs = enc_inputs.cuda(), dec_inputs.cuda(), dec_outputs.cuda()
            outputs, enc_self_attns, dec_self_attns, dec_enc_attns = model(enc_inputs, dec_inputs)
            loss = criterion(outputs, dec_outputs.view(-1))
            print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(loss))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
def main():
    # Generate 10,000 samples so that every character of the vocabulary appears.
    dataset = Dataset(transform=transform, n_datas=10000, seed=None)
    model = Transformer(n_head=2)
    try:
        trained_epoch = sl.find_last_checkpoint('./checkpoint')
        print('load model %d' % (trained_epoch))
    except Exception as e:
        print('no trained model found, {}'.format(e))
        return
    model = sl.load_model('./checkpoint', -1, model)
    model.eval()

    # Only the 0th feature vector of y is used, i.e. the one-hot vector for <pad>.
    x, y, extra = dataset[0]
    # print(x.shape, y.shape)
    # pred = model(torch.from_numpy(x).unsqueeze(0), torch.from_numpy(y).unsqueeze(0)).squeeze()
    # For date-format conversion the whole input sequence is known, but of the output
    # sequence only the leading <pad> token is known at decoding time.
    pred = translate(model, x, y[0])
    # print(pred.shape)
    pred = np.argmax(pred.detach().numpy(), axis=1)[1:]
    # print(extra['machine_readable'])
    pred = [dataset.inv_machine_vocab[p] for p in pred]
    pred_str = ''.join(pred)
    human_readable = extra['human_readable']
    machine_readable = extra['machine_readable']
    print('[%s] --> [%s], answer: [%s]' % (human_readable, pred_str, list(machine_readable)))

    dec_scores = model.decoder.scores_for_paint
    # print(dec_scores.shape)
    # [0] takes the first sample in the batch.
    paint_score(dec_scores[0], human_readable, pred)
def generate(
    x: str,
    beam_width: int,
    device: torch.device,
    max_seq_len: int,
    model: Transformer,
    tokenizer: Tokenizer
) -> None:
    model.eval()
    seq = torch.LongTensor([tokenizer.bos_id]).to(device)
    x = torch.LongTensor([tokenizer.encode(x, max_len=-1)]).to(device)
    # Accumulated negative log-likelihood of each beam.
    accum_prob = torch.zeros(beam_width).to(device)

    for _ in range(max_seq_len):
        pred_y = model.predict(x, seq)
        top_k_in_all_beams = []
        for out_beams in range(seq.size(0)):
            top_k_prob_in_beam, top_k_index_in_beam = \
                pred_y[out_beams, -1].topk(k=beam_width, dim=-1)

            for in_beam in range(beam_width):
                prob = accum_prob[out_beams] - \
                    top_k_prob_in_beam[in_beam].log()
                prob = prob.unsqueeze(0)

                temp_seq = torch.cat([
                    seq[out_beams],
                    top_k_index_in_beam[in_beam].unsqueeze(0)
                ], dim=-1).unsqueeze(0)

                top_k_in_all_beams.append({
                    'prob': prob,
                    'seq': temp_seq
                })

        # `prob` accumulates negative log-likelihoods, so keep the k *lowest* values.
        _, top_k_index_in_all_beams = torch.cat([
            beam['prob'] for beam in top_k_in_all_beams
        ]).topk(k=beam_width, dim=0, largest=False)

        seq = torch.cat([
            top_k_in_all_beams[index]['seq']
            for index in top_k_index_in_all_beams
        ], dim=0)
        accum_prob = torch.cat([
            top_k_in_all_beams[index]['prob']
            for index in top_k_index_in_all_beams
        ], dim=0)

        # Repeat the encoder input once the number of beams grows.
        if x.size(0) != seq.size(0):
            x = x.repeat(seq.size(0) // x.size(0), 1)

    for i in tokenizer.batch_decode(seq.tolist()):
        print(i)
def main():
    device = config.device
    p = Preprocess("data/europarl-v7.fr-en.en", "data/europarl-v7.fr-en.fr")
    transformer = Transformer(p.src_word2ind, p.trg_word2ind)
    transformer.to(device)
    train(p, transformer)
def test_transformer_with_convolution(self):
    train_dataset = StocksDataset(files=FILES[:10], min_length=30)
    train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=False)
    model = Transformer(use_convolutions=True).double()
    for batch in train_dataloader:
        model.training_step(batch.double(), 0)
        break
def test_vanilla_transformer(self):
    train_dataset = StocksDataset(files=FILES[:10], min_length=30)
    train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=False)
    model = Transformer().double()
    for batch in train_dataloader:
        model.training_step(batch.double(), 0)
        break
def build_model(self, args):
    encoder_embed_tokens = nn.Embedding(
        self.src_dict.token_num, args.encoder_embed_dim,
        padding_idx=self.src_dict.padding_idx)
    if args.share_all_embeddings:
        decoder_embed_tokens = encoder_embed_tokens
    else:
        decoder_embed_tokens = nn.Embedding(
            self.trg_dict.token_num, args.decoder_embed_dim,
            padding_idx=self.trg_dict.padding_idx)
    self.model = Transformer(args, self.src_dict, self.trg_dict)
def init_from_config(self, config):
    # self.model = Model(config)
    self.model = Transformer(config, config.test.devices)
    self.model.build_test_model()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)

    # Restore model.
    self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.train.logdir))
    self.data_reader = DataReader(config)
def gen_soft_labels(c):
    c.setdefault(hebbian=False, distributed=False)
    net = Transformer(c)
    net, step = c.init_model(net, step='max', train=False)

    print('generating soft labels...')
    data_gen_tr = SequentialIterator(c, 1, 'train')
    net.eval()
    with torch.no_grad():
        i = 0
        for batch in tqdm(data_gen_tr):
            x = to_torch(batch, c.device).t()
            inputs, labels = x[:-1], x[1:]
            probs, _ = net(inputs, labels)
            values, indices = torch.topk(probs, c.topk, dim=1)

            indices_ = indices.cpu().numpy()
            values_ = values.cpu().numpy()
            labels_ = labels.cpu().numpy()
            if probs.size(0) != inputs.size(0):
                indices_ = indices_[-inputs.size(0):, :]
                values_ = values_[-inputs.size(0):, :]

            if i == 0:
                all_soft_indices = indices_
                all_soft_values = values_
            else:
                all_soft_indices = np.concatenate((all_soft_indices, indices_), axis=0)
                all_soft_values = np.concatenate((all_soft_values, values_), axis=0)
            i += 1

    all_soft_indices = np.concatenate((all_soft_indices[0:1, :], all_soft_indices), axis=0)
    all_soft_values = np.concatenate((all_soft_values[0:1, :], all_soft_values), axis=0)

    np.save(Cache / 'wikitext-103' / 'train_soft_labels.npy', all_soft_indices)
    np.save(Cache / 'wikitext-103' / 'train_soft_probs.npy', all_soft_values)
    print('Saved %s' % (Cache / 'wikitext-103' / 'train_soft_labels.npy'))
    print('Saved %s' % (Cache / 'wikitext-103' / 'train_soft_probs.npy'))

    cnt = 0.
    for k in range(len(data_gen_tr.tokens)):
        if data_gen_tr.tokens[k] in all_soft_indices[k]:
            cnt += 1
    print('%s%% of the tokens are predicted within the top %s logits'
          % (100 * cnt / len(data_gen_tr.tokens), c.topk))
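# A sketch (assumption, not taken from the source) of how the saved top-k soft labels could be
# consumed for distillation: cross entropy of a student model against the teacher's renormalized
# top-k distribution. soft_indices/soft_values correspond to the arrays saved above, converted
# with torch.from_numpy (indices as int64).
import torch
import torch.nn.functional as F

def topk_distillation_loss(student_logits, soft_indices, soft_values):
    # student_logits: [N, vocab]; soft_indices: [N, k] int64; soft_values: [N, k] probabilities.
    log_probs = F.log_softmax(student_logits, dim=-1)
    teacher = soft_values / soft_values.sum(dim=-1, keepdim=True)
    picked = log_probs.gather(dim=-1, index=soft_indices)
    return -(teacher * picked).sum(dim=-1).mean()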
def __init__(self, model_source, rewrite_len=30, beam_size=4, debug=False):
    self.beam_size = beam_size
    self.rewrite_len = rewrite_len
    self.debug = debug

    model_source = torch.load(model_source, map_location=lambda storage, loc: storage)
    self.dict = model_source["word2idx"]
    self.idx2word = {v: k for k, v in model_source["word2idx"].items()}
    self.args = args = model_source["settings"]
    torch.manual_seed(args.seed)

    model = Transformer(args)
    model.load_state_dict(model_source['model'])
    self.model = model.eval()
def build_predict_model(self):
    self.src_placeholder = tf.placeholder(shape=[None], dtype=tf.string, name='Inputs')
    self.src_length_placeholder = tf.placeholder(shape=[None], dtype=tf.int32, name='Inputs_length')
    src_dataset = tf.data.Dataset.from_tensor_slices((self.src_placeholder, self.src_length_placeholder))
    self.infer_batch = self.data_loader.get_inference_batch(src_dataset)

    print("# Build inference model ...")
    self.model = Transformer(mode='inference',
                             graph=self.graph,
                             hparams=self.hparams,
                             data_loader=self.data_loader,
                             batch_input=self.infer_batch)

    print("# Restoring model weights ...")
    self.saver, self.restore = variable_loader(self.session, RESULT_DIR)
    assert self.restore
    self.session.run(tf.tables_initializer())
def main():
    global D_MODEL, N_LAYERS, N_HEADS, DROPOUT, N_EPOCHS, B_SIZE, LR
    D_MODEL = args.modeldim
    N_LAYERS = args.nlayers
    N_HEADS = args.nheads
    DROPOUT = args.dropout
    N_EPOCHS = args.epochs
    B_SIZE = args.batchsize
    LR = args.lr

    train_iter, val_iter, TEXT, LABEL = get_dataiter(args.datapath, batch_size=B_SIZE)

    if args.predict:
        model = Transformer(len(TEXT.vocab), len(LABEL.vocab), D_MODEL, N_LAYERS, N_HEADS, dropout=DROPOUT)
        # The freshly built model is immediately replaced by the pickled model loaded from args.predmodel.
        model = torch.load(args.predmodel, map_location=torch.device('cpu'))
        predict(model, args.predict, TEXT, LABEL, custom_sent=True)
        exit(0)

    print(f'Training start time: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
    if args.linear:
        el_train(train_iter, val_iter, TEXT, LABEL)
    else:
        ed_train(train_iter, val_iter, TEXT, LABEL)
    print(f'Training completion time: {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')
class Translation(object):
    def __init__(self, args):
        super(Translation, self).__init__()
        self.args = args  # kept so that build_optimizer can read lr/betas later
        self.datasets = {}
        self.data_dir = args.data_dir
        self.src_lang, self.trg_lang = dataset_utils.infer_language_pair(args.data_dir)
        src_dict_path = os.path.join(args.data_dir, dict_path.format(self.src_lang))
        trg_dict_path = os.path.join(args.data_dir, dict_path.format(self.trg_lang))
        self.src_dict = Dictionary.build_from_dict_file(src_dict_path)
        self.trg_dict = Dictionary.build_from_dict_file(trg_dict_path)
        self.model = None
        self.criterion = None
        self.optimizer = None

    def load_dataset(self, split):
        # Locate the source/target files for this split.
        src_split_path = os.path.join(
            self.data_dir,
            subset_path.format(split, self.src_lang, self.trg_lang, self.src_lang))
        trg_split_path = os.path.join(
            self.data_dir,
            subset_path.format(split, self.src_lang, self.trg_lang, self.trg_lang))
        src_dataset = SingleDataset(src_split_path)
        trg_dataset = SingleDataset(trg_split_path)
        pair_dataset = PairDataset(src_dataset, trg_dataset)
        self.datasets[split] = pair_dataset

    def build_model(self, args):
        encoder_embed_tokens = nn.Embedding(
            self.src_dict.token_num, args.encoder_embed_dim,
            padding_idx=self.src_dict.padding_idx)
        if args.share_all_embeddings:
            decoder_embed_tokens = encoder_embed_tokens
        else:
            decoder_embed_tokens = nn.Embedding(
                self.trg_dict.token_num, args.decoder_embed_dim,
                padding_idx=self.trg_dict.padding_idx)
        self.model = Transformer(args, self.src_dict, self.trg_dict)

    def build_criterion(self, label_smooth):
        self.criterion = LabelSmoothedCrossEntropyCriterion(label_smooth)

    def build_optimizer(self):
        if self.model is None:
            print("should build model first!")
        else:
            self.optimizer = CustomAdam(self.model.parameters(),
                                        lr=self.args.lr,
                                        betas=self.args.betas)
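# LabelSmoothedCrossEntropyCriterion is not defined in this snippet. A minimal sketch
# (assumption, not the project's implementation) of label-smoothed cross entropy in its
# usual formulation, with padding positions (ignore_index) zeroed out of the loss:
import torch
import torch.nn.functional as F

def label_smoothed_nll_loss(logits, target, epsilon=0.1, ignore_index=0):
    # logits: [N, vocab], target: [N] int64.
    log_probs = F.log_softmax(logits, dim=-1)
    nll = -log_probs.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
    smooth = -log_probs.mean(dim=-1)
    loss = (1.0 - epsilon) * nll + epsilon * smooth
    loss = loss.masked_fill(target == ignore_index, 0.0)
    return loss.sum()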
def instantiate_model(self, english_vocab_size, norwegian_vocab_size,
                      embedding_dim=256, num_heads=8, num_encoders=6, ff_dim=256):
    model = Transformer(english_vocab_size, norwegian_vocab_size, embedding_dim,
                        num_heads, num_encoders, ff_dim, self.cuda).to(self.cuda)
    for p in model.parameters():
        if p.dim() > 1:
            # In-place Xavier init; xavier_uniform (without the trailing underscore) is deprecated.
            torch.nn.init.xavier_uniform_(p)
    return model
def model_testing(test_dataset, parameters):
    loc_to = '/home/preetham/Documents/Preetham/masters-thesis/results/gloss-to-grapheme/transformer/'
    global val_loss, val_accuracy, loss_object, transformer

    val_loss = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')
    val_loss.reset_states()
    val_accuracy.reset_states()

    checkpoint_dir = loc_to + 'model_' + str(parameters['model']) + '/training_checkpoints'
    if parameters['n_layers'] <= 6:
        n_layers = parameters['n_layers']
    else:
        n_layers = parameters['n_layers'] - 6

    transformer = Transformer(n_layers, parameters['d_model'], parameters['n_heads'],
                              parameters['dff'], parameters['inp_vocab_size'],
                              parameters['tar_vocab_size'],
                              pe_input=parameters['inp_vocab_size'],
                              pe_target=parameters['tar_vocab_size'],
                              rate=parameters['dropout'])
    checkpoint = tf.train.Checkpoint(transformer=transformer)
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    for (batch, (inp, tar)) in enumerate(test_dataset.take(parameters['test_steps'])):
        val_step(inp, tar)

    print('Test Loss=', round(val_loss.result().numpy(), 3))
    print('Test Accuracy=', round(val_accuracy.result().numpy(), 3))
    print()
def greedy_test(args):
    """ Greedy decoding test function """
    # load vocabulary
    vocab = torch.load(args.vocab)

    # build model
    translator = Transformer(args, vocab)
    translator.eval()

    # load parameters
    translator.load_state_dict(torch.load(args.decode_model_path))
    if args.cuda:
        translator = translator.cuda()

    test_data = read_corpus(args.decode_from_file, source="src")
    # Each prediction starts as ['<BOS>', '<PAD>', '<PAD>', ..., '<PAD>'].
    pred_data = len(test_data) * [[
        constants.PAD_WORD if i else constants.BOS_WORD
        for i in range(args.decode_max_steps)
    ]]

    output_file = codecs.open(args.decode_output_file, "w", encoding="utf-8")
    for test, pred in zip(test_data, pred_data):
        pred_output = [constants.PAD_WORD] * args.decode_max_steps
        test_var = to_input_variable([test], vocab.src, cuda=args.cuda)

        # The source only needs to be encoded once.
        enc_output = translator.encode(test_var[0], test_var[1])
        for i in range(args.decode_max_steps):
            pred_var = to_input_variable([pred[:i + 1]], vocab.tgt, cuda=args.cuda)
            scores = translator.translate(enc_output, test_var[0], pred_var)

            _, argmax_idxs = torch.max(scores, dim=-1)
            one_step_idx = argmax_idxs[-1].item()

            pred_output[i] = vocab.tgt.id2word[one_step_idx]
            if (one_step_idx == constants.EOS) or (i == args.decode_max_steps - 1):
                print("[Source] %s" % " ".join(test))
                print("[Predict] %s" % " ".join(pred_output[:i]))
                print()

                output_file.write(" ".join(pred_output[:i]) + "\n")
                output_file.flush()
                break
            pred[i + 1] = vocab.tgt.id2word[one_step_idx]

    output_file.close()
def test(hp):
    # Loading hyper params
    load_hparams(hp, hp.ckpt)

    logging.info("# Prepare test batches")
    test_batches, num_test_batches, num_test_samples = get_batch(
        hp.test1, hp.test1, 100000, 100000, hp.vocab, hp.test_batch_size, shuffle=False)
    iter = tf.data.Iterator.from_structure(test_batches.output_types, test_batches.output_shapes)
    xs, ys = iter.get_next()
    test_init_op = iter.make_initializer(test_batches)

    logging.info("# Load model")
    model = Transformer(hp)

    logging.info("# Session")
    with tf.Session() as sess:
        ckpt_ = tf.train.latest_checkpoint(hp.ckpt)
        ckpt = ckpt_ if ckpt_ else hp.ckpt
        saver = tf.train.Saver()
        saver.restore(sess, ckpt)

        y_hat, mean_loss = model.eval(sess, test_init_op, xs, ys, num_test_batches)

        logging.info("# get hypotheses")
        hypotheses = get_hypotheses(num_test_samples, y_hat, model.idx2token)

        logging.info("# write results")
        model_output = os.path.split(ckpt)[-1]
        if not os.path.exists(hp.testdir):
            os.makedirs(hp.testdir)
        translation = os.path.join(hp.testdir, model_output)
        with open(translation, 'w', encoding="utf-8") as fout:
            fout.write("\n".join(hypotheses))

        logging.info("# calc bleu score and append it to translation")
        calc_bleu_nltk(hp.test2, translation)
def my_model_fn(features, labels, mode, params):
    warmup_steps = min(params['warmup_steps'], params['train_steps'] * 0.1)
    config = params['config']
    x, y = features
    y_label = labels

    if FLAGS.model_type == 'transformer':
        transformer = Transformer(config=config, mode=mode)
    else:
        transformer = RNNTransformer(config=config, mode=mode)

    logits, predicts = transformer.create_model(x_input=x, y_input=y)
    loss = transformer.calculate_loss(logits=logits, y_labels=y_label)

    for v in tf.trainable_variables():
        tf.logging.info(v.name)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Gradient clipping is the recommended approach when training RNN-style models.
        train_op, learning_rate = create_train_opt_with_clip(
            loss=loss,
            step_num_in_epoch=params['train_steps'] / params['num_epoches'])
        hook_dict = {
            'loss': loss,
            'learning_rate': learning_rate,
        }
        hook = tf.train.LoggingTensorHook(hook_dict, every_n_iter=10)
        return tf.estimator.EstimatorSpec(
            mode=mode, training_hooks=[hook], loss=loss, train_op=train_op)
    elif mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={'prediction': predicts})
    else:
        raise NotImplementedError('not implemented')
def padding_for_trs(batch):
    items = zip(*batch)
    padded_src, padded_trg, src_pos, trg_pos = list(
        map(lambda x: torch.nn.utils.rnn.pad_sequence(x, padding_value=C.PAD), items))
    trg_mask, src_key_padding_mask, trg_key_padding_mask, memory_key_padding_mask = \
        Transformer.get_masks(padded_src, padded_trg[:-1], PAD=C.PAD)
    return (padded_src, padded_trg, src_pos, trg_pos, trg_mask,
            src_key_padding_mask, trg_key_padding_mask, memory_key_padding_mask)
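# Transformer.get_masks is not shown here. A minimal sketch (assumption) of the usual mask
# construction, with time-major [seq_len, batch] tensors (the pad_sequence default) and the
# mask conventions of torch.nn.Transformer; make_masks and PAD are illustrative names only.
import torch

PAD = 0  # assumed padding id; the snippet above takes it from C.PAD

def make_masks(padded_src, padded_trg):
    trg_len = padded_trg.size(0)
    # Causal target mask: position i may attend only to positions <= i.
    trg_mask = torch.triu(torch.full((trg_len, trg_len), float('-inf')), diagonal=1)
    # Key-padding masks are batch-major [batch, seq_len], True where the token is PAD.
    src_key_padding_mask = (padded_src == PAD).transpose(0, 1)
    trg_key_padding_mask = (padded_trg == PAD).transpose(0, 1)
    memory_key_padding_mask = src_key_padding_mask
    return trg_mask, src_key_padding_mask, trg_key_padding_mask, memory_key_padding_mask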
def __init__(self, model_dir, vocab_file):
    """
    :param model_dir: model dir path
    :param vocab_file: vocab file path
    """
    self.tf = import_tf(0)

    self.model_dir = model_dir
    self.vocab_file = vocab_file
    self.token2idx, self.idx2token = _load_vocab(vocab_file)

    hparams = Hparams()
    parser = hparams.parser
    self.hp = parser.parse_args()

    self.model = Transformer(self.hp)
    self._add_placeholder()
    self._init_graph()
def __init__(self, cfg):
    super(LightningTransformer, self).__init__()
    self.model_cfg = cfg.model
    self.data_cfg = cfg.data
    self.train_cfg = cfg.train_cfg
    self.lr_cfg = cfg.lr_cfg
    self._update_model_cfg_by_data()
    self.transformer = Transformer(**self.model_cfg)
def create_src_masks(src, SRC_SEQ_LEN, TEXT, use_srcmask=False):
    if use_srcmask:
        src_mask = Transformer.generate_square_subsequent_mask(SRC_SEQ_LEN).to(device)
    else:
        src_mask = None
    src_key_padding_mask = (src == TEXT.vocab.stoi['<pad>']).bool().to(device)
    memory_key_padding_mask = (src == TEXT.vocab.stoi['<pad>']).bool().to(device)
    return src_mask, src_key_padding_mask, memory_key_padding_mask
def load_model(checkpoint, device):
    model_args = checkpoint["settings"]
    model = Transformer(
        model_args["embedding_size"],
        model_args["src_vocab_size"],
        model_args["tgt_vocab_size"],
        model_args["src_pad_idx"],
        model_args["num_heads"],
        model_args["num_encoder_layers"],
        model_args["num_decoder_layers"],
        model_args["forward_expansion"],
        model_args["dropout"],
        model_args["max_len"],
        model_args["device"],
    ).to(device)
    model.load_state_dict(checkpoint["state_dict"])
    print("[Info] Trained model state loaded.")
    return model
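# Hypothetical usage sketch; the path 'checkpoint.pt' is an assumption, and the checkpoint is
# assumed to be a dict with the 'settings' and 'state_dict' keys read above.
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint = torch.load('checkpoint.pt', map_location=device)
model = load_model(checkpoint, device)
model.eval()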
def create_model():
    transformer = Transformer(
        opt.num_layers,
        opt.d_model,
        opt.num_heads,
        opt.dff,
        encoder_vocab_size,
        decoder_vocab_size,
        pe_input=encoder_vocab_size,
        pe_target=decoder_vocab_size,
    )
    return transformer
def main():
    parser = argparse.ArgumentParser(description="Train the model")
    parser.add_argument('-data', required=True)
    parser.add_argument('-epoch', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=64)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    # Load data
    data = torch.load(opt.data)
    opt.max_token_seq_len = data['settings'].max_word_seq_len + 2

    training_data, validation_data = prepare_dataloaders(data, opt)
    opt.src_vocab_size = training_data.dataset.src_vocab_size
    opt.tgt_vocab_size = training_data.dataset.tgt_vocab_size
    print(opt)

    # opt.cuda = True
    device = torch.device('cuda' if opt.cuda else 'cpu')

    # TODO: Fill the code
    transformer = Transformer(d_word_embedding=opt.d_word_vec,
                              d_h=opt.d_model,
                              d_s=opt.d_model,
                              src_vocab_size=opt.src_vocab_size,
                              tgt_vocab_size=opt.tgt_vocab_size,
                              max_sent_len=opt.max_token_seq_len).to(device)
    optimizer = optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                           betas=(0.9, 0.98), eps=1e-09)

    train(transformer, training_data, validation_data, optimizer, device, opt)
def main(gpu_id=None):
    dataset = Dataset(transform=transform, n_datas=10000)
    pad_vec = np.zeros(len(dataset.human_vocab))
    pad_vec[dataset.human_vocab['<pad>']] = 1
    dataloader = torch.utils.data.DataLoader(dataset=dataset,
                                             batch_size=6,
                                             shuffle=True,
                                             num_workers=6,
                                             collate_fn=partial(collate_fn, pad_vec))
    model = Transformer(n_head=2)
    if gpu_id is not None:
        print('use gpu')
        os.environ["CUDA_VISIBLE_DEVICES"] = gpu_id
        n_gpus = torch.cuda.device_count()
        # print('use %d gpu [%s]' % (n_gpus, gpu_id))
        model = model.cuda()
        # model = torch.nn.DataParallel(model, device_ids=[i for i in range(n_gpus)])

    # loss_fn = torch.nn.CrossEntropyLoss()
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters())

    model = sl.load_model('./checkpoint', -1, model)
    optimizer = sl.load_optimizer('./checkpoint', -1, optimizer)
    try:
        trained_epoch = sl.find_last_checkpoint('./checkpoint')
        print('train from epoch %d' % (trained_epoch + 1))
    except Exception as e:
        print('train from the very beginning, {}'.format(e))
        trained_epoch = -1

    for epoch in range(trained_epoch + 1, 20):
        train(model, loss_fn, optimizer, dataloader, epoch,
              use_gpu=True if gpu_id is not None else False)
def init_training(args):
    """ Initialize training process """
    # load vocabulary
    vocab = torch.load(args.vocab)

    # build model
    transformer = Transformer(args, vocab)

    # if finetuning, start from a previously trained model
    if args.finetune:
        print("[Finetune] %s" % args.finetune_model_path)
        transformer.load_state_dict(torch.load(args.finetune_model_path))

    # vocab_mask for masking padding
    vocab_mask = torch.ones(len(vocab.tgt))
    vocab_mask[vocab.tgt[constants.PAD_WORD]] = 0

    # loss object
    cross_entropy_loss = nn.CrossEntropyLoss(weight=vocab_mask, size_average=False)

    if args.cuda:
        transformer = transformer.cuda()
        cross_entropy_loss = cross_entropy_loss.cuda()

    if args.optimizer == "Warmup_Adam":
        optimizer = ScheduledOptim(
            torch.optim.Adam(transformer.get_trainable_parameters(),
                             betas=(0.9, 0.98), eps=1e-09),
            args.d_model, args.n_warmup_steps)
    if args.optimizer == "Adam":
        optimizer = torch.optim.Adam(
            params=transformer.get_trainable_parameters(),
            lr=args.lr, betas=(0.9, 0.98), eps=1e-8)
    if args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(
            params=transformer.get_trainable_parameters(), lr=args.lr)

    # multi gpus
    if torch.cuda.device_count() > 1:
        print("[Multi GPU] using", torch.cuda.device_count(), "GPUs\n")
        transformer = nn.DataParallel(transformer)

    return vocab, transformer, optimizer, cross_entropy_loss