def test_running():
    epochs = 100000
    seq_batch_size = 100
    print_yes = 100
    loss_func = torch.nn.functional.nll_loss

    # create network and optimizer
    net = RNN(100, 120, 150, 2)
    net.to(device)  # add cuda to device
    optim = torch.optim.Adam(net.parameters(), lr=3e-5)

    # main training loop:
    for epoch in range(epochs):
        dat = get_batch(train_data, seq_batch_size)
        dat = torch.LongTensor([vocab.find(item) for item in dat])

        # pull x and y
        x_t = dat[:-1]
        y_t = dat[1:]
        hidden = net.init_hidden()

        # turn all into cuda
        x_t, y_t, hidden = x_t.to(device), y_t.to(device), hidden.to(device)

        # initialize hidden state and forward pass
        logprob, hidden = net.forward(x_t, hidden)
        loss = loss_func(logprob, y_t)

        # update
        optim.zero_grad()
        loss.backward()
        optim.step()

        # print the loss for every kth iteration
        if epoch % print_yes == 0:
            print('*' * 100)
            print('\n epoch {}, loss:{} \n'.format(epoch, loss))
            # make sure to pass True flag for running on cuda
            print('sample speech:\n', run_words(net, vocab, 500, True))
def train():
    iterators, dataset = data_loaders(batch_size)
    model = RNN(input_size, hidden_size, num_layers, dropout, n_classes,
                dataset.get_class_weights())
    optimizer = optim.SGD(model.parameters(), lr)
    log = train_procedure(model, iterators, n_epochs, optimizer)
    save_log(log)
def load_model(args):
    if args.model == 'rnn':
        model = RNN(25, args.dim_hidden, 25)
    elif args.model == 'ffnn':
        model = FFNN(25, args.dim_hidden, memory=args.memory,
                     num_hidden_layers=args.num_hidden_layers)
    elif args.model == 'cnn3d':
        # TODO: un-hard-code these CNN3D dimensions
        model = CNN3D((10, 10), ((4, 4, 2), (7, 12, 2)), 10, 7, 10, 7, 2, 100)
    else:
        raise Exception("Only rnn, ffnn, cnn3d model types currently supported")

    if args.loss_fn == 'mse':
        loss_fn = torch.nn.MSELoss()
    else:
        raise Exception("Only mse loss function currently supported")

    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    else:
        raise Exception("Only adam optimizer currently supported")

    return model, loss_fn, optimizer
def attention_test(args):
    chosen_params = dict(params)
    results = []
    for attention in [False, True]:
        chosen_params['attention'] = attention
        runs = 5
        acc_d = {}
        f1_d = {}
        for i in tqdm(range(runs)):
            train_dataset, valid_dataset, test_dataset = data.load_dataset(
                args.train_batch_size, args.test_batch_size,
                min_freq=chosen_params['min_freq'])
            embedding = data.generate_embedding_matrix(
                train_dataset.dataset.text_vocab,
                rand=chosen_params['rand_emb'],
                freeze=chosen_params['freeze'])
            model = RNN(embedding, chosen_params)
            criterion = nn.BCEWithLogitsLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=chosen_params['lr'])
            for epoch in range(args.epochs):
                print(f'******* epoch: {epoch+1} *******')
                train(model, train_dataset, optimizer, criterion, args)
                evaluate(model, valid_dataset, criterion, 'Validation')
            acc, f1 = evaluate(model, test_dataset, criterion, 'Test')
            acc_d['acc_' + 'run' + str(i)] = acc
            f1_d['f1_' + 'run' + str(i)] = f1

        mean = np.mean(list(acc_d.values()))
        std = np.std(list(acc_d.values()))
        acc_d['mean'] = mean
        acc_d['std'] = std

        mean = np.mean(list(f1_d.values()))
        std = np.std(list(f1_d.values()))
        f1_d['mean'] = mean
        f1_d['std'] = std

        results.append((acc_d, f1_d))

    with open(os.path.join(SAVE_DIR, 'attention.txt'), 'a') as f:
        print(f'', file=f)
        for idx, (acc, f1) in enumerate(results):
            print('[no attention]' if idx == 0 else '[attention]', file=f)
            print(acc, file=f)
            print(f1, file=f)
def hyperparam_optim_test(args):
    var_params = {
        'cell_name': ['lstm'],
        'hidden_size': [50, 150, 300],
        'num_layers': [2, 4, 5],
        'min_freq': [0, 100, 500],
        'lr': [1e-3, 1e-4, 1e-7],
        'dropout': [0, 0.4, 0.7],
        'freeze': [False, True],
        'rand_emb': [False, True],
        'attention': [False]
    }
    results = []
    for i in tqdm(range(10)):
        chosen_params = {k: random.choice(v) for (k, v) in var_params.items()}
        train_dataset, valid_dataset, test_dataset = data.load_dataset(
            args.train_batch_size, args.test_batch_size,
            min_freq=chosen_params['min_freq'])
        embedding = data.generate_embedding_matrix(
            train_dataset.dataset.text_vocab,
            rand=chosen_params['rand_emb'],
            freeze=chosen_params['freeze'])
        model = RNN(embedding, chosen_params)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=chosen_params['lr'])
        for epoch in range(args.epochs):
            print(f'******* epoch: {epoch+1} *******')
            train(model, train_dataset, optimizer, criterion, args)
            evaluate(model, valid_dataset, criterion, 'Validation')
        acc, f1 = evaluate(model, test_dataset, criterion, 'Test')

        result = dict(chosen_params)
        result['acc'] = acc
        result['f1'] = f1
        results.append(result)

    with open(os.path.join(SAVE_DIR, 'params_search.txt'), 'a') as f:
        for result in results:
            print(result, file=f)
def cell_comparison_test(args):
    train_dataset, valid_dataset, test_dataset = data.load_dataset(
        args.train_batch_size, args.test_batch_size)
    embedding = data.generate_embedding_matrix(train_dataset.dataset.text_vocab)
    var_params = {
        'hidden_size': [50, 150, 300],
        'num_layers': [1, 2, 4],
        'dropout': [0.1, 0.4, 0.7],
        'bidirectional': [True, False],
        'attention': [False]
    }
    for idx, (key, values) in enumerate(var_params.items()):
        fig, ax = plt.subplots(nrows=1, ncols=1)
        ax.set_title('Variable ' + key)
        for cell_name in tqdm(['rnn', 'lstm', 'gru']):
            results = []
            for i in range(len(values)):
                current_params = {
                    k: v[i] if k == key else v[1]
                    for (k, v) in var_params.items()
                }
                current_params['cell_name'] = cell_name
                model = RNN(embedding, current_params)
                criterion = nn.BCEWithLogitsLoss()
                optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
                for epoch in range(args.epochs):
                    print(f'******* epoch: {epoch+1} *******')
                    train(model, train_dataset, optimizer, criterion, args)
                    evaluate(model, valid_dataset, criterion, 'Validation')
                result, _ = evaluate(model, test_dataset, criterion, 'Test')
                results.append(result)
            ax.plot(values, results, marker='o', label=cell_name)
        plt.legend(loc='best')
        plt.xlabel(key)
        plt.ylabel('accuracy')
        fig.savefig(os.path.join(SAVE_DIR, key + '.png'))
        plt.close(fig)
def main():
    epochs = 301
    seq_batch_size = 200
    print_yes = 100
    iscuda = False

    # create our network, optimizer and loss function
    net = RNN(len(chars), 100, 150, 2)  # instantiate a RNN object
    optim = torch.optim.Adam(net.parameters(), lr=6e-4)
    loss_func = torch.nn.functional.nll_loss
    if iscuda:
        net = net.cuda()

    # main training loop:
    for epoch in range(epochs):
        dat = getSequence(book, seq_batch_size)
        # find the corresponding char index for each character and store it in a tensor
        dat = torch.LongTensor([chars.find(item) for item in dat])

        # pull x, y and initialize hidden state
        if iscuda:
            x_t = dat[:-1].cuda()
            y_t = dat[1:].cuda()
            hidden = net.init_hidden().cuda()
        else:
            x_t = dat[:-1]
            y_t = dat[1:]
            hidden = net.init_hidden()

        # forward pass
        logprob, hidden = net.forward(x_t, hidden)
        loss = loss_func(logprob, y_t)

        # update
        optim.zero_grad()
        loss.backward()
        optim.step()

        # print the loss for every kth iteration
        if epoch % print_yes == 0:
            print('*' * 60)
            print('\n epoch {}, loss:{} \n'.format(epoch, loss))
            print('sample speech:\n', test_words(net, chars, seq_batch_size))

    torch.save(net.state_dict(), 'trainedBook_v2.pt')
def main(args):
    chosen_params = dict(params)
    chosen_params['attention'] = True
    train_dataset, valid_dataset, test_dataset = data.load_dataset(
        args.train_batch_size, args.test_batch_size)
    embedding = data.generate_embedding_matrix(
        train_dataset.dataset.text_vocab, freeze=False)
    model = RNN(embedding, chosen_params)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    for epoch in range(args.epochs):
        print(f'******* epoch: {epoch} *******')
        train(model, train_dataset, optimizer, criterion, args)
        evaluate(model, valid_dataset, criterion, 'Validation')
    evaluate(model, test_dataset, criterion, 'Test')
def no_test_forward():
    loss_func = torch.nn.functional.nll_loss
    net = RNN(100, 100, 100)
    net.to(device)  # add cuda to device
    optim = torch.optim.Adam(net.parameters(), lr=1e-4)

    # step 2: create a training batch of data, size 101, format this data and
    # convert it to pytorch long tensors
    dat = get_batch(train_data, 100)
    dat = torch.LongTensor([vocab.find(item) for item in dat])

    # step 3: convert our dat into input/output
    x_t = dat[:-1]
    y_t = dat[1:]
    ho = net.init_hidden()

    # remember to load all variables used by the model to the device;
    # this means the i/o as well as the hidden state
    x_t, y_t, ho = x_t.to(device), y_t.to(device), ho.to(device)

    # test forward pass
    log_prob, hidden = net.forward(x_t, ho)
    # let's see if the forward pass of the next hidden state is already cuda
    # log_prob2, hidden2 = net.forward(x_t, hidden)
    loss = loss_func(log_prob, y_t)
    optim.zero_grad()
    loss.backward()
    optim.step()
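# A minimal sketch of the `get_batch` helper assumed by the two snippets above
# (its real implementation is not shown in this excerpt): sample a random
# contiguous slice of n + 1 characters from the training text so that the
# shifted x/y pair (dat[:-1] / dat[1:]) gives sequences of length n.
import random

def get_batch(text, n):
    start = random.randint(0, len(text) - n - 1)
    return text[start:start + n + 1]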
def embedding_baseline_test(args):
    chosen_params = dict(params)
    results = []
    for rand_emb in [True, False]:
        chosen_params['rand_emb'] = rand_emb
        train_dataset, valid_dataset, test_dataset = data.load_dataset(
            args.train_batch_size, args.test_batch_size,
            min_freq=chosen_params['min_freq'])
        embedding = data.generate_embedding_matrix(
            train_dataset.dataset.text_vocab,
            rand=chosen_params['rand_emb'],
            freeze=chosen_params['freeze'])
        result = {}
        for m in ['baseline', 'rnn']:
            if m == 'rnn':
                model = RNN(embedding, chosen_params)
            else:
                model = Baseline(embedding)
            criterion = nn.BCEWithLogitsLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
            for epoch in range(args.epochs):
                print(f'******* epoch: {epoch} *******')
                train(model, train_dataset, optimizer, criterion, args)
                evaluate(model, valid_dataset, criterion, 'Validation')
            acc, f1 = evaluate(model, test_dataset, criterion, 'Test')
            result[m + '_acc_rand_emb' + str(rand_emb)] = acc
            result[m + '_f1_rand_emb' + str(rand_emb)] = f1
        results.append(result)

    with open(os.path.join(SAVE_DIR, 'embedding_baseline.txt'), 'a') as f:
        for res in results:
            print(res, file=f)
          'pretrained_word_embeddings_file': pretrained_word_embeddings_file,
          'transform_train': transform_train,
          'transform_val': transform_val,
          'WEIGHT_DECAY': WEIGHT_DECAY,
          'ADAM_FLAG': ADAM_FLAG,
          'RNN_DROPOUT': RNN_DROPOUT,
          'CNN_DROPOUT': CNN_DROPOUT,
          'GRAD_CLIP': GRAD_CLIP}

print('Initializing models...')
encoder = CNN(NO_WORD_EMBEDDINGS, pretrained_cnn_dir, freeze=True,
              dropout_prob=CNN_DROPOUT, model_name='resnet152')
decoder = RNN(VOCAB_SIZE, NO_WORD_EMBEDDINGS, hidden_size=HIDDEN_SIZE,
              num_layers=NUM_LAYERS,
              pre_trained_file=pretrained_word_embeddings_file,
              freeze=False, dropout_prob=RNN_DROPOUT)
params['encoder'] = encoder
params['decoder'] = decoder
encoder.cuda()
decoder.cuda()

print('Initializing optimizer...')
model_paras = list(encoder.parameters()) + list(decoder.parameters())
optimizer = optim.Adam(model_paras, lr=LR, weight_decay=WEIGHT_DECAY)
params['optimizer'] = optimizer
pickle.dump(params, open(init_params_file, 'wb'))

# initialize accumulators.
current_epoch = 1
batch_step_count = 1
time_used_global = 0.0
checkpoint = 1

# load latest model to resume training
class Trainer:
    """Training."""

    def __init__(self, _hparams):
        utils.set_seed(_hparams.fixed_seed)

        self.train_loader = get_train_loader(_hparams)
        self.val_loader = get_val_loader(_hparams)
        self.encoder = CNN().to(DEVICE)
        self.decoder = RNN(fea_dim=_hparams.fea_dim,
                           embed_dim=_hparams.embed_dim,
                           hid_dim=_hparams.hid_dim,
                           max_sen_len=_hparams.max_sen_len,
                           vocab_pkl=_hparams.vocab_pkl).to(DEVICE)
        self.loss_fn = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.get_params(), lr=_hparams.lr)
        self.writer = SummaryWriter()

        self.max_sen_len = _hparams.max_sen_len
        self.val_cap = _hparams.val_cap
        self.ft_encoder_lr = _hparams.ft_encoder_lr
        self.ft_decoder_lr = _hparams.ft_decoder_lr
        self.best_CIDEr = 0

    def fine_tune_encoder(self, fine_tune_epochs, val_interval, save_path, val_path):
        print('*' * 20, 'fine tune encoder for', fine_tune_epochs, 'epochs', '*' * 20)
        self.encoder.fine_tune()
        self.optimizer = torch.optim.Adam([
            {'params': self.encoder.parameters(), 'lr': self.ft_encoder_lr},
            {'params': self.decoder.parameters(), 'lr': self.ft_decoder_lr},
        ])
        self.training(fine_tune_epochs, val_interval, save_path, val_path)
        self.encoder.froze()
        print('*' * 20, 'fine tune encoder complete', '*' * 20)

    def get_params(self):
        """
        All parameters the model needs to optimize. The encoder is currently
        designed not to be trained, so its parameters are not included.
        :return:
        """
        return list(self.decoder.parameters())

    def training(self, max_epochs, val_interval, save_path, val_path):
        """
        Train.
        :param val_path: path for saving the sentences generated during validation
        :param save_path: path for saving the model
        :param val_interval: validation interval
        :param max_epochs: maximum number of training epochs
        :return:
        """
        print('*' * 20, 'train', '*' * 20)
        for epoch in range(max_epochs):
            self.set_train()

            epoch_loss = 0
            epoch_steps = len(self.train_loader)
            for step, (img, cap, cap_len) in tqdm(enumerate(self.train_loader)):
                # batch_size * 3 * 224 * 224
                img = img.to(DEVICE)
                cap = cap.to(DEVICE)

                self.optimizer.zero_grad()

                features = self.encoder.forward(img)
                outputs = self.decoder.forward(features, cap)

                outputs = pack_padded_sequence(outputs, cap_len - 1, batch_first=True)[0]
                targets = pack_padded_sequence(cap[:, 1:], cap_len - 1, batch_first=True)[0]

                train_loss = self.loss_fn(outputs, targets)
                epoch_loss += train_loss.item()
                train_loss.backward()
                self.optimizer.step()

            epoch_loss /= epoch_steps
            self.writer.add_scalar('epoch_loss', epoch_loss, epoch)
            print('epoch_loss: {}, epoch: {}'.format(epoch_loss, epoch))

            if (epoch + 1) % val_interval == 0:
                CIDEr = self.validating(epoch, val_path)
                if self.best_CIDEr <= CIDEr:
                    self.best_CIDEr = CIDEr
                    self.save_model(save_path, epoch)

    def save_model(self, save_path, train_epoch):
        """
        Save the best model.
        :param save_path: path of the saved model file
        :param train_epoch: current training epoch
        :return:
        """
        model_state_dict = {
            'encoder_state_dict': self.encoder.state_dict(),
            'decoder_state_dict': self.decoder.state_dict(),
            'tran_epoch': train_epoch,
        }
        print('*' * 20, 'save model to: ', save_path, '*' * 20)
        torch.save(model_state_dict, save_path)

    def validating(self, train_epoch, val_path):
        """
        Validate.
        :param val_path: path for saving the sentences generated during validation
        :param train_epoch: current training epoch
        :return:
        """
        print('*' * 20, 'validate', '*' * 20)
        self.set_eval()

        sen_json = []
        with torch.no_grad():
            for val_step, (img, img_id) in tqdm(enumerate(self.val_loader)):
                img = img.to(DEVICE)
                features = self.encoder.forward(img)
                sens, _ = self.decoder.sample(features)
                sen_json.append({'image_id': int(img_id), 'caption': sens[0]})

        with open(val_path, 'w') as f:
            json.dump(sen_json, f)

        result = coco_eval(self.val_cap, val_path)
        scores = {}
        for metric, score in result:
            scores[metric] = score
            self.writer.add_scalar(metric, score, train_epoch)

        return scores['CIDEr']

    def set_train(self):
        self.encoder.train()
        self.decoder.train()

    def set_eval(self):
        self.encoder.eval()
        self.decoder.eval()
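# Hypothetical driver for the Trainer above (not part of the original source).
# `load_hparams` and the file paths are placeholders; the calls themselves only
# use the public methods defined by the class.
if __name__ == '__main__':
    hparams = load_hparams()  # assumed config object with the fields read in __init__
    trainer = Trainer(hparams)
    trainer.training(max_epochs=20, val_interval=1,
                     save_path='best_model.pth', val_path='val_captions.json')
    # optionally unfreeze and fine-tune the CNN encoder afterwards
    trainer.fine_tune_encoder(fine_tune_epochs=5, val_interval=1,
                              save_path='best_model_ft.pth',
                              val_path='val_captions_ft.json')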
train_X, train_y = util.create_dataset(training_set_scaled, input_size=INPUT_SIZE)
# print(train_X.shape)
'''
(672, 60)
'''

# Reshape to (batch, time_step, input_size); this is the shape the LSTM expects
train_X = train_X.reshape(train_X.shape[0], 1, train_X.shape[1])
# print(train_X.shape)
'''
(672, 1, 60)
'''

# Part 2 - Building the RNN
rnn = RNN(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, OUTPUT_SIZE)
optimiser = torch.optim.Adam(rnn.parameters(), lr=args.lr)
loss_func = nn.MSELoss()
if CUDA:
    loss_func.cuda()

plt.figure(1, figsize=(12, 5))
plt.ion()

hidden_state = None
for epoch in range(args.epochs):
    inputs = Variable(torch.from_numpy(train_X).float())
    labels = Variable(torch.from_numpy(train_y).float())
    if CUDA:
        inputs, labels = inputs.cuda(), labels.cuda()
class TextClassifier:
    def __init__(self, batch_size, iterations, initial_lr, hidden_size, dropout,
                 kernel_sz, num_layers):
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if self.use_cuda else 'cpu')

        self.data = DataReader()
        train_iter, val_iter, test_iter = self.data.init_dataset(
            batch_size, ('cuda:0' if self.use_cuda else 'cpu'))
        self.train_batch_loader = BatchGenerator(train_iter, 'text', 'label')
        self.val_batch_loader = BatchGenerator(val_iter, 'text', 'label')
        self.test_batch_loader = BatchGenerator(test_iter, 'text', 'label')

        # Store hyperparameters
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr

        # Create Model
        emb_size, emb_dim = self.data.TEXT.vocab.vectors.size()

        # padding = (math.floor(kernel_sz / 2), 0)
        # self.model = CNN(emb_size=emb_size, emb_dimension=emb_dim,
        #                  output_size=len(self.data.LABEL.vocab),
        #                  dropout=dropout, kernel_sz=kernel_sz, stride=1, padding=padding,
        #                  out_filters=hidden_size, pretrained_emb=self.data.TEXT.vocab.vectors)

        self.model = RNN(emb_size=emb_size, emb_dimension=emb_dim,
                         pretrained_emb=self.data.TEXT.vocab.vectors,
                         output_size=len(self.data.LABEL.vocab),
                         num_layers=num_layers, hidden_size=hidden_size,
                         dropout=dropout)

        if self.use_cuda:
            self.model.cuda()

    def train(self, min_stride=3):
        train_loss_hist = []
        val_loss_hist = []
        train_acc_hist = []
        val_acc_hist = []
        test_acc_hist = []
        best_score = 0.0
        loss = 0.0

        for itr in range(self.iterations):
            print("\nIteration: " + str(itr + 1))
            optimizer = optim.SGD(self.model.parameters(), lr=self.initial_lr)

            self.model.train()
            total_loss = 0.0
            total_acc = 0.0
            steps = 0

            data_iter = iter(self.train_batch_loader)
            for i in range(len(self.train_batch_loader)):
                ((x_batch, x_len_batch), y_batch) = next(data_iter)
                # if torch.min(x_len_batch) > min_stride:
                optimizer.zero_grad()

                loss, logits = self.model.forward(x_batch, y_batch)
                acc = torch.sum(torch.argmax(logits, dim=1) == y_batch)

                total_loss += loss.item()
                total_acc += acc.item()
                steps += 1

                loss.backward()
                optimizer.step()

            train_loss_hist.append(total_loss / steps)
            train_acc_hist.append(total_acc / len(self.data.train_data))

            val_loss, val_acc = self.eval_model(self.val_batch_loader,
                                                len(self.data.val_data))
            val_loss_hist.append(val_loss)
            val_acc_hist.append(val_acc)

            if val_acc > best_score:
                best_score = val_acc
                test_loss, test_acc = self.eval_model(self.test_batch_loader,
                                                      len(self.data.test_data))

            print("Train: {Loss: " + str(total_loss / steps) + ", Acc: " +
                  str(total_acc / len(self.data.train_data)) + " }")
            print("Val: {Loss: " + str(val_loss) + ", Acc: " + str(val_acc) + " }")

            test_acc_hist.append(test_acc)

        return train_loss_hist, train_acc_hist, val_loss_hist, val_acc_hist, test_acc_hist

    def eval_model(self, batch_loader, N, min_stride=3):
        self.model.eval()

        total_loss = 0.0
        total_acc = 0.0
        steps = 0

        batch_iterator = iter(batch_loader)
        with torch.no_grad():
            for i in range(len(batch_loader)):
                ((x_batch, x_len_batch), y_batch) = next(batch_iterator)
                # if torch.min(x_len_batch) > min_stride:
                loss, logits = self.model(x_batch, y_batch)
                acc = torch.sum(torch.argmax(logits, dim=1) == y_batch)

                total_loss += loss.item()
                total_acc += acc.item()

        return (total_loss / N), (total_acc / N)
    # different things here than in the RNNs.
    # Also, the Transformer also has other hyperparameters
    # (such as the number of attention heads) which can change its behavior.
    model = TRANSFORMER(vocab_size=vocab_size, n_units=args.hidden_size,
                        n_blocks=args.num_layers, dropout=1. - args.dp_keep_prob)
    # these 3 attributes don't affect the Transformer's computations;
    # they are only used in run_epoch
    model.batch_size = args.batch_size
    model.seq_len = args.seq_len
    model.vocab_size = vocab_size
else:
    print("Model type not recognized.")

model = model.to(device)

total_params = sum(p.numel() for p in model.parameters())
print("===> Total Model Parameters: ", total_params)
with open(os.path.join(experiment_path, 'exp_config.txt'), 'a') as f:
    f.write('Total Model Parameters:' + ' ' + str(total_params) + '\n')

# LOSS FUNCTION
loss_fn = nn.CrossEntropyLoss()
if args.optimizer == 'ADAM':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.initial_lr)

# LEARNING RATE SCHEDULE
lr = args.initial_lr
lr_decay_base = 1 / 1.15
m_flat_lr = 14.0  # we will not touch lr for the first m_flat_lr epochs
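# Sketch of how the schedule constants above are typically used later in this
# script (the epoch loop itself is outside this excerpt, so treat this as an
# assumption): the LR stays flat for the first m_flat_lr epochs and is then
# decayed geometrically by lr_decay_base each subsequent epoch.
# lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
# lr = lr * lr_decay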
args = get_args()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if not os.path.exists(args.prepro_root):
    os.makedirs(args.prepro_root)

train_dataset = torch.load(args.prepro_root + args.train_dataset_path)
valid_dataset = torch.load(args.prepro_root + args.valid_dataset_path)
train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
valid_loader = data.DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False)
valid_tensor = torch.load(args.prepro_root + 'valid_writer_keywd.pkl').to(device)

model = RNN(args.num_readers, args.num_writers, args.num_keywords,
            args.num_items, args.num_magazines, args.hid_dim,
            valid_tensor).to(device)
optimizer = optim.Adam(model.parameters(), lr=args.lr)
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(model)
print('# of params : ', params)

if args.start_epoch:
    model.load_state_dict(torch.load(args.save_path + '%d_rnn_attention.pkl' % args.start_epoch))

best_loss = 9999999
for epoch in range(args.num_epochs):
    model.train()
    for i, data in enumerate(tqdm.tqdm(train_loader, desc='Train')):
# chunk into sequences of length seq_length + 1
batches = list(chunks(batches, seq_length + 1))

# chunk sequences into batches
batches = list(chunks(batches, minibatch_size))

# convert batches to tensors and transpose
batches = [torch.LongTensor(batch).transpose_(0, 1) for batch in batches]

loss_function = nn.CrossEntropyLoss()

model = RNN(minibatch_size, chars_len, hidden_size, chars_len, n_layers,
            minibatch_size).to(DEVICE)
hidden = Variable(model.create_hidden()).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


def loop(itr, extras, stateful):
    batch_tensor = next(itr)

    if use_cuda:
        batch_tensor = batch_tensor.cuda()

    # reset the model
    model.zero_grad()

    # everything except the last
    input_variable = Variable(batch_tensor[:-1]).to(DEVICE)

    # everything except the first, flattened
    target_variable = Variable(
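# A minimal sketch of the `chunks` helper assumed above (its definition is not
# shown in this excerpt): split a sequence into consecutive pieces of length n,
# which is how the text is grouped into (seq_length + 1)-long samples and then
# into minibatches of minibatch_size sequences.
def chunks(seq, n):
    for i in range(0, len(seq), n):
        yield seq[i:i + n]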
def main():
    args = get_args()
    log.info(f'Parsed arguments: \n{pformat(args.__dict__)}')

    assert args.cond_type.lower() in ['none', 'platanios', 'oestling']

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    log.info('Using device {}.'.format(device))

    use_apex = False
    if torch.cuda.is_available() and args.fp16:
        log.info('Loading Nvidia Apex and using AMP')
        from apex import amp, optimizers
        use_apex = True
    else:
        log.info('Using FP32')
        amp = None

    log.info(f'Using time stamp {timestamp} to save models and logs.')

    if not args.no_seed:
        log.info(f'Setting random seed to {args.seed} for reproducibility.')
        torch.manual_seed(args.seed)
        random.seed(args.seed)

    data = Corpus(args.datadir)

    data_splits = [
        {
            'split': 'train',
            'languages': args.dev_langs + args.target_langs,
            'invert_include': True,
        },
        {
            'split': 'valid',
            'languages': args.dev_langs,
        },
        {
            'split': 'test',
            'languages': args.target_langs,
        },
    ]
    if args.refine:
        data_splits.append({
            'split': 'train_100',
            'languages': args.target_langs,
            'ignore_missing': True,
        })

    data_splits = data.make_datasets(data_splits, force_rebuild=args.rebuild)
    train_set, val_set, test_set = data_splits['train'], data_splits['valid'], data_splits['test']
    dictionary = data_splits['dictionary']

    train_language_distr = get_sampling_probabilities(train_set, 1.0)
    train_set = Dataset(train_set, batchsize=args.batchsize, bptt=args.bptt,
                        reset_on_iter=True,
                        language_probabilities=train_language_distr)
    val_set = Dataset(val_set, make_config=True, batchsize=args.valid_batchsize,
                      bptt=args.bptt, eval=True)
    test_set = Dataset(test_set, make_config=True, batchsize=args.test_batchsize,
                       bptt=args.bptt, eval=True)

    train_loader = DataLoader(train_set, num_workers=args.workers)
    val_loader = DataLoader(val_set, num_workers=args.workers)
    test_loader = DataLoader(test_set, num_workers=args.workers)

    if args.refine:
        refine_set = dict()
        for lang, lang_d in data_splits['train_100'].items():
            refine_set[lang] = Dataset({lang: lang_d},
                                       batchsize=args.valid_batchsize,
                                       bptt=args.bptt, make_config=True)

    n_token = len(dictionary.idx2tkn)

    # Load and preprocess matrix of typological features
    # TODO: implement this, the OEST
    # prior_matrix = load_prior(args.prior, corpus.dictionary.lang2idx)
    # n_components = min(50, *prior_matrix.shape)
    # pca = PCA(n_components=n_components, whiten=True)
    # prior_matrix = pca.fit_transform(prior_matrix)
    prior = None

    model = RNN(args.cond_type, prior, n_token, n_input=args.emsize,
                n_hidden=args.nhidden, n_layers=args.nlayers,
                dropout=args.dropouto, dropoute=args.dropoute,
                dropouth=args.dropouth, dropouti=args.dropouti,
                wdrop=args.wdrop, wdrop_layers=[0, 1, 2],
                tie_weights=True).to(device)

    if args.opt_level != 'O2':
        loss_function = SplitCrossEntropyLoss(args.emsize, splits=[]).to(device)
    else:
        # Should be ok to use with a vocabulary of this small size
        loss_function = CrossEntropyLoss().to(device)

    if use_apex:
        optimizer = optimizers.FusedAdam(model.parameters(), lr=args.lr,
                                         weight_decay=args.wdecay)
    else:
        params = list(filter(lambda p: p.requires_grad, model.parameters())) + \
            list(loss_function.parameters())
        optimizer = Adam(params, lr=args.lr, weight_decay=args.wdecay)

    if use_apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

    parameters = {
        'model': model,
        'optimizer': optimizer,
        'loss_function': loss_function,
        'use_apex': use_apex,
        'amp': amp if use_apex else None,
        'clip': args.clip,
        'alpha': args.alpha,
        'beta': args.beta,
        'bptt': args.bptt,
        'device': device,
        'prior': args.prior,
    }

    # Add backward hook for gradient clipping
    if args.clip:
        if use_apex:
            for p in amp.master_params(optimizer):
                p.register_hook(lambda grad: torch.clamp(grad, -args.clip, args.clip))
        else:
            for p in model.parameters():
                p.register_hook(lambda grad: torch.clamp(grad, -args.clip, args.clip))

    if args.prior == 'vi':
        prior = VIPrior(model, device=device)
        parameters['prior'] = prior

        def sample_weights(module: torch.nn.Module, input: torch.Tensor):
            prior.sample_weights(module)

        sample_weights_hook = model.register_forward_pre_hook(sample_weights)

    # Load model checkpoint if available
    start_epoch = 1
    if args.resume:
        if args.checkpoint is None:
            log.error('No checkpoint passed. Specify it using the --checkpoint flag')
            checkpoint = None
        else:
            log.info('Loading the checkpoint at {}'.format(args.checkpoint))
            checkpoint = load_model(args.checkpoint, **parameters)

            start_epoch = checkpoint['epoch']

    if args.wdrop:
        for rnn in model.rnns:
            if isinstance(rnn, WeightDrop):
                rnn.dropout = args.wdrop
            elif rnn.zoneout > 0:
                rnn.zoneout = args.wdrop

    saved_models = list()

    result_str = '| Language {} | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'

    def test():
        log.info('=' * 89)
        log.info('Running test set (zero-shot results)...')
        test_loss, avg_loss = evaluate(test_loader, **parameters)
        log.info('Test set finished | test loss {} | test bpc {}'.format(
            test_loss, test_loss / math.log(2)))

        for lang, avg_l_loss in avg_loss.items():
            langstr = dictionary.idx2lang[lang]
            log.info(result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss),
                                       avg_l_loss / math.log(2)))

        log.info('=' * 89)

    if args.train:
        f = 1.
        stored_loss = 1e32
        epochs_no_improve = 0
        val_losses = list()

        # calculate specific language lr
        data_spec_count = sum([len(ds) for l, ds in train_set.data.items()])
        data_spec_avg = data_spec_count / len(train_set.data.items())
        data_spec_lrweights = dict([(l, data_spec_avg / len(ds))
                                    for l, ds in train_set.data.items()])

        # estimate total number of steps
        total_steps = sum([len(ds) // args.bptt
                           for l, ds in train_set.data.items()]) * args.no_epochs
        steps = 0

        try:
            pbar = tqdm.trange(start_epoch, args.no_epochs + 1, position=1,
                               dynamic_ncols=True)
            for epoch in pbar:
                steps = train(train_loader, lr_weights=data_spec_lrweights,
                              **parameters, total_steps=total_steps, steps=steps,
                              scaling=args.scaling, n_samples=args.n_samples,
                              tb_writer=tb_writer)

                val_loss, _ = evaluate(val_loader, **parameters)
                pbar.set_description('Epoch {} | Val loss {}'.format(epoch, val_loss))

                # Save model
                if args.prior == 'vi':
                    sample_weights_hook.remove()

                filename = path.join(
                    args.checkpoint_dir,
                    '{}_epoch{}{}_{}.pth'.format(timestamp, epoch,
                                                 '_with_apex' if use_apex else '',
                                                 args.prior))
                torch.save(make_checkpoint(epoch + 1, **parameters), filename)
                saved_models.append(filename)

                if args.prior == 'vi':
                    sample_weights_hook = model.register_forward_pre_hook(sample_weights)

                # Early stopping
                if val_loss < stored_loss:
                    epochs_no_improve = 0
                    stored_loss = val_loss
                else:
                    epochs_no_improve += 1

                if epochs_no_improve == args.patience:
                    log.info('Early stopping at epoch {}'.format(epoch))
                    break

                val_losses.append(val_loss)

                # Reduce lr every 1/3 total epochs
                if epoch - 1 > f / 3 * args.no_epochs:
                    log.info('Epoch {}/{}. Dividing LR by 10'.format(epoch, args.no_epochs))
                    for g in optimizer.param_groups:
                        g['lr'] = g['lr'] / 10

                    f += 1.

            test()
        except KeyboardInterrupt:
            log.info('Registered KeyboardInterrupt. Stopping training.')
            log.info('Saving last model to disk')

            if args.prior == 'vi':
                sample_weights_hook.remove()

            torch.save(make_checkpoint(epoch, **parameters),
                       path.join(args.checkpoint_dir,
                                 '{}_epoch{}{}_{}.pth'.format(
                                     timestamp, epoch,
                                     '_with_apex' if use_apex else '',
                                     args.prior)))
            return
    elif args.test:
        test()

    # Only test on existing languages if there are no held out languages
    if not args.target_langs:
        exit(0)

    importance = 1e-5

    # If use UNIV, calculate informed prior, else use boring prior
    if args.prior == 'laplace':
        if not isinstance(prior, LaplacePrior):  # only calculate matrix if it is not supplied.
            log.info('Creating laplace approximation dataset')
            laplace_set = Dataset(data_splits['train'], batchsize=args.batchsize,
                                  bptt=100, reset_on_iter=True)
            laplace_loader = DataLoader(laplace_set, num_workers=args.workers)
            log.info('Creating Laplacian prior')
            prior = LaplacePrior(model, loss_function, laplace_loader,
                                 use_apex=use_apex, amp=amp, device=device)
            parameters['prior'] = prior

            torch.save(make_checkpoint('fisher_matrix', **parameters),
                       path.join(args.checkpoint_dir,
                                 '{}_fishers_matrix{}_{}.pth'.format(
                                     timestamp, '_with_apex' if use_apex else '',
                                     args.prior)))
        importance = 1e5
    elif args.prior == 'ninf':
        log.info('Creating non-informative Gaussian prior')
        parameters['prior'] = GaussianPrior()
    elif args.prior == 'vi':
        importance = 1e-5
    elif args.prior == 'hmc':
        raise NotImplementedError
    else:
        raise ValueError(
            f'Passed prior {args.prior} is not an implemented inference technique.')

    best_model = saved_models[-1] if not len(saved_models) == 0 else args.checkpoint

    # Remove sampling hook from model
    if args.prior == 'vi':
        sample_weights_hook.remove()

    # Refine on 100 samples on each target
    if args.refine:
        # reset learning rate
        optimizer.param_groups[0]['lr'] = args.lr
        loss = 0
        results = dict()

        # Create individual tests sets
        test_sets = dict()
        for lang, lang_d in data_splits['test'].items():
            test_sets[lang] = DataLoader(
                Dataset({lang: lang_d}, make_config=True,
                        batchsize=args.test_batchsize, bptt=args.bptt, eval=True),
                num_workers=args.workers)

        for lang, lang_data in tqdm.tqdm(refine_set.items()):
            final_loss = False
            refine_dataloader = DataLoader(lang_data, num_workers=args.workers)
            load_model(best_model, **parameters)

            log.info(f'Refining for language {dictionary.idx2lang[lang]}')
            for epoch in range(1, args.refine_epochs + 1):
                refine(refine_dataloader, **parameters, importance=importance)
                if epoch % 5 == 0:
                    final_loss = True
                    loss, avg_loss = evaluate(test_sets[lang], model, loss_function,
                                              only_l=lang, report_all=True,
                                              device=device)

                    for lang, avg_l_loss in avg_loss.items():
                        langstr = dictionary.idx2lang[lang]
                        log.debug(result_str.format(langstr, avg_l_loss,
                                                    math.exp(avg_l_loss),
                                                    avg_l_loss / math.log(2)))

            if not final_loss:
                loss, avg_loss = evaluate(test_sets[lang], model, loss_function,
                                          only_l=lang, report_all=True, device=device)

            for lang, avg_l_loss in avg_loss.items():
                langstr = dictionary.idx2lang[lang]
                log.info(result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss),
                                           avg_l_loss / math.log(2)))
                results[lang] = avg_l_loss

        log.info('=' * 89)
        log.info('FINAL FEW SHOT RESULTS: ')
        log.info('=' * 89)
        for lang, avg_l_loss in results.items():
            langstr = dictionary.idx2lang[lang]
            log.info(result_str.format(langstr, avg_l_loss, math.exp(avg_l_loss),
                                       avg_l_loss / math.log(2)))
        log.info('=' * 89)
def main(args):
    this_dir = osp.join(osp.dirname(__file__), '.')
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    datasets = {
        phase: DataLayer(
            data_root=osp.join(args.data_root, phase),
            phase=phase,
        )
        for phase in args.phases
    }

    data_loaders = {
        phase: data.DataLoader(
            datasets[phase],
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
        )
        for phase in args.phases
    }

    model = Model(
        input_size=args.input_size,
        hidden_size=args.hidden_size,
        bidirectional=args.bidirectional,
        num_classes=args.num_classes,
    ).apply(utl.weights_init).to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    softmax = nn.Softmax(dim=1).to(device)
    optimizer = optim.RMSprop(model.parameters(), lr=args.lr)

    for epoch in range(args.start_epoch, args.start_epoch + args.epochs):
        losses = {phase: 0.0 for phase in args.phases}
        corrects = {phase: 0.0 for phase in args.phases}

        start = time.time()
        for phase in args.phases:
            training = 'Test' not in phase
            if training:
                model.train(True)
            else:
                if epoch in args.test_intervals:
                    model.train(False)
                else:
                    continue

            with torch.set_grad_enabled(training):
                for batch_idx, (spatial, temporal, length, target) in enumerate(data_loaders[phase]):
                    spatial_input = torch.zeros(*spatial.shape)
                    temporal_input = torch.zeros(*temporal.shape)
                    target_input = []
                    length_input = []

                    index = utl.argsort(length)[::-1]
                    for i, idx in enumerate(index):
                        spatial_input[i] = spatial[idx]
                        temporal_input[i] = temporal[idx]
                        target_input.append(target[idx])
                        length_input.append(length[idx])

                    spatial_input = spatial_input.to(device)
                    temporal_input = temporal_input.to(device)
                    target_input = torch.LongTensor(target_input).to(device)
                    pack1 = pack_padded_sequence(spatial_input, length_input, batch_first=True)
                    pack2 = pack_padded_sequence(temporal_input, length_input, batch_first=True)

                    score = model(pack1, pack2)
                    loss = criterion(score, target_input)
                    losses[phase] += loss.item() * target_input.shape[0]
                    if args.debug:
                        print(loss.item())

                    if training:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                    else:
                        pred = torch.max(softmax(score), 1)[1].cpu()
                        corrects[phase] += torch.sum(pred == target_input.cpu()).item()
        end = time.time()

        print('Epoch {:2} | '
              'Train loss: {:.5f} Val loss: {:.5f} | '
              'Test loss: {:.5f} accuracy: {:.5f} | '
              'running time: {:.2f} sec'.format(
                  epoch,
                  losses['Train'] / len(data_loaders['Train'].dataset),
                  losses['Validation'] / len(data_loaders['Validation'].dataset),
                  losses['Test'] / len(data_loaders['Test'].dataset),
                  corrects['Test'] / len(data_loaders['Test'].dataset),
                  end - start,
              ))

        if epoch in args.test_intervals:
            torch.save(model.state_dict(),
                       osp.join(this_dir, './state_dict-epoch-' + str(epoch) + '.pth'))
# init model
if arch == "rnn":
    model = RNN(**params_model, output_size=num_classes).to(device=device)
elif arch == "gru":
    model = GRU(**params_model, output_size=num_classes).to(device=device)
elif arch == "lstm":
    model = LSTM(**params_model, output_size=num_classes).to(device=device)
else:
    raise Exception("Only 'rnn', 'gru', and 'lstm' are available model options.")

print("Model size: {0}".format(count_parameters(model)))

criterion = nn.NLLLoss()
op = torch.optim.SGD(model.parameters(), **params_op)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(op, patience=4, factor=0.5,
                                                       verbose=True)

# resume from checkpoint
if args.resume:
    if os.path.isfile(args.resume):
        print("=> loading checkpoint '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        args.start_epoch = checkpoint['epoch']
        best_acc1 = checkpoint['best_acc1']
        model.load_state_dict(checkpoint['state_dict'])
        op.load_state_dict(checkpoint['optimizer'])  # the optimizer here is named `op`
        print("=> loaded checkpoint '{}' (epoch {})".format(
def main(argv):
    global args
    args = parser.parse_args(argv)
    if args.threads == -1:
        args.threads = torch.multiprocessing.cpu_count() - 1 or 1

    print('===> Configuration')
    print(args)

    cuda = args.cuda
    if cuda:
        if torch.cuda.is_available():
            print('===> {} GPUs are available'.format(torch.cuda.device_count()))
        else:
            raise Exception("No GPU found, please run with --no-cuda")

    # Fix the random seed for reproducibility
    # random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if cuda:
        torch.cuda.manual_seed(args.seed)

    # Data loading
    print('===> Loading entire datasets')
    with open(args.data_path + 'train.seqs', 'rb') as f:
        train_seqs = pickle.load(f)
    with open(args.data_path + 'train.labels', 'rb') as f:
        train_labels = pickle.load(f)
    with open(args.data_path + 'valid.seqs', 'rb') as f:
        valid_seqs = pickle.load(f)
    with open(args.data_path + 'valid.labels', 'rb') as f:
        valid_labels = pickle.load(f)
    with open(args.data_path + 'test.seqs', 'rb') as f:
        test_seqs = pickle.load(f)
    with open(args.data_path + 'test.labels', 'rb') as f:
        test_labels = pickle.load(f)

    max_code = max(map(lambda p: max(map(lambda v: max(v), p)),
                       train_seqs + valid_seqs + test_seqs))
    num_features = max_code + 1

    print(" ===> Construct train set")
    train_set = VisitSequenceWithLabelDataset(train_seqs, train_labels, num_features, reverse=False)
    print(" ===> Construct validation set")
    valid_set = VisitSequenceWithLabelDataset(valid_seqs, valid_labels, num_features, reverse=False)
    print(" ===> Construct test set")
    test_set = VisitSequenceWithLabelDataset(test_seqs, test_labels, num_features, reverse=False)

    train_loader = DataLoader(dataset=train_set, batch_size=args.batch_size, shuffle=True,
                              collate_fn=visit_collate_fn, num_workers=args.threads)
    valid_loader = DataLoader(dataset=valid_set, batch_size=args.eval_batch_size, shuffle=False,
                              collate_fn=visit_collate_fn, num_workers=args.threads)
    test_loader = DataLoader(dataset=test_set, batch_size=args.eval_batch_size, shuffle=False,
                             collate_fn=visit_collate_fn, num_workers=args.threads)
    print('===> Dataset loaded!')

    # Create model
    print('===> Building a Model')
    model = RNN(dim_input=num_features, dim_emb=128, dim_hidden=128)
    if cuda:
        model = model.cuda()
    print(model)
    print('===> Model built!')

    weight_class0 = torch.mean(torch.FloatTensor(train_set.labels))
    weight_class1 = 1.0 - weight_class0
    weight = torch.FloatTensor([weight_class0, weight_class1])

    criterion = nn.CrossEntropyLoss(weight=weight)
    if args.cuda:
        criterion = criterion.cuda()

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                                nesterov=False, weight_decay=args.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, 'min')

    best_valid_epoch = 0
    best_valid_loss = sys.float_info.max

    train_losses = []
    valid_losses = []

    if not os.path.exists(args.save):
        os.makedirs(args.save)

    for ei in trange(args.epochs, desc="Epochs"):
        # Train
        _, _, train_loss = rnn_epoch(train_loader, model, criterion=criterion,
                                     optimizer=optimizer, train=True)
        train_losses.append(train_loss)

        # Eval
        _, _, valid_loss = rnn_epoch(valid_loader, model, criterion=criterion)
        valid_losses.append(valid_loss)

        scheduler.step(valid_loss)

        is_best = valid_loss < best_valid_loss
        if is_best:
            best_valid_epoch = ei
            best_valid_loss = valid_loss

            # evaluate on the test set
            test_y_true, test_y_pred, test_loss = rnn_epoch(test_loader, model,
                                                            criterion=criterion)

            if args.cuda:
                test_y_true = test_y_true.cpu()
                test_y_pred = test_y_pred.cpu()

            test_auc = roc_auc_score(test_y_true.numpy(), test_y_pred.numpy()[:, 1],
                                     average="weighted")
            test_aupr = average_precision_score(test_y_true.numpy(),
                                                test_y_pred.numpy()[:, 1],
                                                average="weighted")

            with open(args.save + 'train_result.txt', 'w') as f:
                f.write('Best Validation Epoch: {}\n'.format(ei))
                f.write('Best Validation Loss: {}\n'.format(valid_loss))
                f.write('Train Loss: {}\n'.format(train_loss))
                f.write('Test Loss: {}\n'.format(test_loss))
                f.write('Test AUROC: {}\n'.format(test_auc))
                f.write('Test AUPR: {}\n'.format(test_aupr))

            torch.save(model, args.save + 'best_model.pth')
            torch.save(model.state_dict(), args.save + 'best_model_params.pth')

    # plot
    if args.plot:
        plt.figure(figsize=(12, 9))
        plt.plot(np.arange(len(train_losses)), np.array(train_losses),
                 label='Training Loss')
        plt.plot(np.arange(len(valid_losses)), np.array(valid_losses),
                 label='Validation Loss')
        plt.xlabel('epoch')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.tight_layout()
        plt.savefig(args.save + 'loss_plot.eps', format='eps')
        plt.close()

    print('Best Validation Epoch: {}\n'.format(best_valid_epoch))
    print('Best Validation Loss: {}\n'.format(best_valid_loss))
    print('Train Loss: {}\n'.format(train_loss))
    print('Test Loss: {}\n'.format(test_loss))
    print('Test AUROC: {}\n'.format(test_auc))
    print('Test AUPR: {}\n'.format(test_aupr))
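# Hypothetical entry point for the main(argv) function above (not shown in this
# excerpt): forward the command-line arguments to the parser inside main().
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])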
if __name__ == "__main__":
    train_iter, dev_iter, test_iter, TEXT, LABEL = load_iters(batch_size, device,
                                                              data_path, vectors)
    vocab_size = len(TEXT.vocab.itos)

    # build model
    if use_rnn:
        model = RNN(vocab_size, embed_size, hidden_size, num_layers, num_classes,
                    bidirectional, dropout_rate)
    else:
        model = CNN(vocab_size, embed_size, num_classes, num_filters, kernel_sizes,
                    dropout_rate)
    if vectors is not None:
        model.embed.from_pretrained(TEXT.vocab.vectors, freeze=freeze)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()

    writer = SummaryWriter('logs', comment="rnn")
    for epoch in trange(train_epochs, desc="Epoch"):
        model.train()
        ep_loss = 0
        for step, batch in enumerate(tqdm(train_iter, desc="Iteration")):
            (inputs, lens), labels = batch.text, batch.label
            outputs = model(inputs, lens)
            loss = loss_func(outputs, labels)
            ep_loss += loss.item()

            model.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
def train():
    global_step = 0

    # Load pretrained VAE
    vae = VAE(hp.vsize).to(DEVICE)
    ckpt = sorted(glob.glob(os.path.join(hp.ckpt_dir, 'vae', '*k.pth.tar')))[-1]
    vae_state = torch.load(ckpt)
    vae.load_state_dict(vae_state['model'])
    vae.eval()
    print('Loaded vae ckpt {}'.format(ckpt))

    rnn = RNN(hp.vsize, hp.asize, hp.rnn_hunits).to(DEVICE)
    ckpts = sorted(glob.glob(os.path.join(hp.ckpt_dir, 'rnn', '*k.pth.tar')))
    if ckpts:
        ckpt = ckpts[-1]
        rnn_state = torch.load(ckpt)
        rnn.load_state_dict(rnn_state['model'])
        global_step = int(os.path.basename(ckpt).split('.')[0][:-1]) * 1000
        print('Loaded rnn ckpt {}'.format(ckpt))

    data_path = hp.data_dir if not hp.extra else hp.extra_dir
    # optimizer = torch.optim.RMSprop(rnn.parameters(), lr=1e-3)
    optimizer = torch.optim.Adam(rnn.parameters(), lr=1e-4)

    dataset = GameEpisodeDataset(data_path, seq_len=hp.seq_len)
    loader = DataLoader(dataset, batch_size=1, shuffle=True, drop_last=True,
                        num_workers=hp.n_workers, collate_fn=collate_fn)
    testset = GameEpisodeDataset(data_path, seq_len=hp.seq_len, training=False)
    test_loader = DataLoader(testset, batch_size=1, shuffle=False, drop_last=False,
                             collate_fn=collate_fn)

    ckpt_dir = os.path.join(hp.ckpt_dir, 'rnn')
    sample_dir = os.path.join(ckpt_dir, 'samples')
    os.makedirs(sample_dir, exist_ok=True)

    l1 = nn.L1Loss()

    while global_step < hp.max_step:
        # GO_states = torch.zeros([hp.batch_size, 1, hp.vsize+hp.asize]).to(DEVICE)
        with tqdm(enumerate(loader), total=len(loader), ncols=70, leave=False) as t:
            t.set_description('Step {}'.format(global_step))
            for idx, (obs, actions) in t:
                obs, actions = obs.to(DEVICE), actions.to(DEVICE)
                with torch.no_grad():
                    latent_mu, latent_var = vae.encoder(obs)  # (B*T, vsize)
                    z = latent_mu
                    # z = vae.reparam(latent_mu, latent_var)  # (B*T, vsize)
                    z = z.view(-1, hp.seq_len, hp.vsize)  # (B*n_seq, T, vsize)

                # import pdb; pdb.set_trace()
                next_z = z[:, 1:, :]
                z, actions = z[:, :-1, :], actions[:, :-1, :]
                states = torch.cat([z, actions], dim=-1)  # (B, T, vsize+asize)
                # states = torch.cat([GO_states, next_states[:,:-1,:]], dim=1)
                x, _, _ = rnn(states)

                loss = l1(x, next_z)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                global_step += 1

                if global_step % hp.log_interval == 0:
                    eval_loss = evaluate(test_loader, vae, rnn, global_step)
                    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    with open(os.path.join(ckpt_dir, 'train.log'), 'a') as f:
                        log = '{} || Step: {}, train_loss: {:.4f}, loss: {:.4f}\n'.format(
                            now, global_step, loss.item(), eval_loss)
                        f.write(log)

                    S = 2
                    y = vae.decoder(x[S, :, :])
                    v = vae.decoder(next_z[S, :, :])
                    save_image(y, os.path.join(sample_dir,
                                               '{:04d}-rnn.png'.format(global_step)))
                    save_image(v, os.path.join(sample_dir,
                                               '{:04d}-vae.png'.format(global_step)))
                    save_image(obs[S:S + hp.seq_len - 1],
                               os.path.join(sample_dir,
                                            '{:04d}-obs.png'.format(global_step)))

                if global_step % hp.save_interval == 0:
                    d = {
                        'model': rnn.state_dict(),
                        'optimizer': optimizer.state_dict(),
                    }
                    torch.save(d, os.path.join(ckpt_dir,
                                               '{:03d}k.pth.tar'.format(global_step // 1000)))
# All 14 years of data
X_test_dep_std = X

X_train_dep_std = np.expand_dims(X_train_dep_std, axis=0)
y_train_dep_std = np.expand_dims(y_train_dep_std, axis=0)
X_test_dep_std = np.expand_dims(X_test_dep_std, axis=0)

# Transfer to Pytorch Variable
X_train_dep_std = Variable(torch.from_numpy(X_train_dep_std).float())
y_train_dep_std = Variable(torch.from_numpy(y_train_dep_std).float())
X_test_dep_std = Variable(torch.from_numpy(X_test_dep_std).float())

# Define rnn model
model = RNN(input_size=5, hidden_size=40, num_layers=2, class_size=1, dropout=0.5,
            rnn_type='lstm', dropout_bool=True)

# Define optimization function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # optimize all rnn parameters

# Define loss function
loss_func = nn.MSELoss()

# Start training
for iter in range(10000):
    model.train()
    prediction = model(X_train_dep_std)
    loss = loss_func(prediction, y_train_dep_std)
    optimizer.zero_grad()  # clear gradients for this training step
    loss.backward()        # back propagation, compute gradients
    optimizer.step()
    if iter % 100 == 0:
        print("iteration: %s, loss: %s" % (iter, loss.item()))

# Save model
if args.freeze_embedding:
    model.embedding.weight.requires_grad = False

model = model.to(device)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


myprint(f'The model has {count_parameters(model):,} trainable parameters')

if args.optimizer == "SGD":
    print("Using SGD")
    optimizer = optim.SGD(model.parameters(), weight_decay=args.weight_decay,
                          lr=args.lr, momentum=args.momentum, nesterov=args.nesterov)
else:
    print("Using Adam")
    optimizer = optim.Adam(model.parameters(), weight_decay=args.weight_decay,
                           lr=args.lr, amsgrad=args.amsgrad)

criterion = nn.CrossEntropyLoss().to(device)

accuracy = categorical_accuracy
if args.task == "MIMIC-D":
    accuracy = f1_score
train_set = PCDataset(DATA_ROOT, exclude_patterns=val_signer)
val_set = PCDataset(DATA_ROOT, exclude_patterns=signerlist)

print('-' * 80)
print(f'[INFO] Training on {len(train_set)} samples from {len(signerlist)} signers')
print(f'[INFO] Validating on {len(val_set)} samples from {len(val_signer)} signers')

train_loader = PCDataLoader(train_set)
val_loader = PCDataLoader(val_set)

model = RNN(60, 100, 30)
criterion = torch.nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

_, history = train(model, train_loader, val_loader, criterion, optimizer,
                   scheduler, NUM_EPOCH, None, SAVE_PATH)

print(f'[INFO] Training finished with best validation accuracy: {max(history["val_acc"]):.4f}')
print('-' * 80)

plot_history(history, SAVE_PATH, 'svg')
                        n_blocks=args.num_layers, dropout=1. - args.dp_keep_prob)
    # these 3 attributes don't affect the Transformer's computations;
    # they are only used in run_epoch
    model.batch_size = args.batch_size
    model.seq_len = args.seq_len
    model.vocab_size = vocab_size
else:
    print("Model type not recognized.")

model = model.to(device)

# LOSS FUNCTION
loss_fn = nn.CrossEntropyLoss()
if args.optimizer == 'ADAM':
    optimizer = torch.optim.Adam(model.parameters(), lr=args.initial_lr)

# LEARNING RATE SCHEDULE
lr = args.initial_lr
lr_decay_base = 1 / 1.15
m_flat_lr = 14.0  # we will not touch lr for the first m_flat_lr epochs


###############################################################################
#
# DEFINE COMPUTATIONS FOR PROCESSING ONE EPOCH
#
###############################################################################

def repackage_hidden(h):
    """
else:
    # no embedding layer (one-hot encoding)
    model = OneHotRNN(vocabulary=dataset.vocabulary,
                      rnn_type=args.rnn_type,
                      hidden_size=args.hidden_size,
                      n_layers=args.n_layers,
                      dropout=args.dropout,
                      bidirectional=args.bidirectional,
                      nonlinearity=args.nonlinearity)

# optionally, load model parameters from file
if args.pretrain_model is not None:
    model.load_state_dict(torch.load(args.pretrain_model))

# set up optimizer
optimizer = optim.Adam(model.parameters(),
                       betas=(0.9, 0.999),  ## default
                       eps=1e-08,  ## default
                       lr=args.learning_rate)

# set up early stopping
early_stop = EarlyStopping(patience=args.patience)

# set up training schedule file
sched_filename = "training_schedule-" + str(args.sample_idx + 1) + ".csv"
sched_file = os.path.join(args.output_dir, sched_filename)

# iterate over epochs
counter = 0
for epoch in range(args.max_epochs):
    # iterate over batches
print("Train Size: ", len(loader_train.sampler.indices)) print("Validation Size: ", len(loader_val.sampler.indices)) num_classes = len(train_set.label_encoder.classes_) model = RNN(embeddings, num_classes=num_classes, **_hparams) weights = class_weigths(train_set.labels) if torch.cuda.is_available(): model.cuda() weights = weights.cuda() criterion = torch.nn.CrossEntropyLoss(weight=weights) parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = torch.optim.Adam(parameters, lr=lr) ############################################################# # Train ############################################################# best_val_loss = None colms_l = ['Train_Loss', 'Val_Loss'] colms_acc = ['Train_Acc', 'Val_Acc'] colms_f1 = ['Train_F1', 'Val_F1'] df_l = pd.DataFrame(columns=colms_l, index=range(1, EPOCHS + 1)) df_acc = pd.DataFrame(columns=colms_acc, index=range(1, EPOCHS + 1)) df_f1 = pd.DataFrame(columns=colms_f1, index=range(1, EPOCHS + 1))
def main(args):
    # hyperparameters
    batch_size = args.batch_size
    num_workers = 1

    # Image Preprocessing
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # load COCOs dataset
    IMAGES_PATH = 'data/train2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_train2014.json'

    vocab = load_vocab()
    train_loader = get_coco_data_loader(path=IMAGES_PATH,
                                        json=CAPTION_FILE_PATH,
                                        vocab=vocab,
                                        transform=transform,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=num_workers)

    IMAGES_PATH = 'data/val2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_val2014.json'
    val_loader = get_coco_data_loader(path=IMAGES_PATH,
                                      json=CAPTION_FILE_PATH,
                                      vocab=vocab,
                                      transform=transform,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=num_workers)

    losses_val = []
    losses_train = []

    # Build the models
    ngpu = 1
    initial_step = initial_epoch = 0
    embed_size = args.embed_size
    num_hiddens = args.num_hidden
    learning_rate = 1e-3
    num_epochs = 3
    log_step = args.log_step
    save_step = 500
    checkpoint_dir = args.checkpoint_dir

    encoder = CNN(embed_size)
    decoder = RNN(embed_size, num_hiddens, len(vocab), 1, rec_unit=args.rec_unit)

    # Loss
    criterion = nn.CrossEntropyLoss()

    if args.checkpoint_file:
        encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
            args.checkpoint_file, args.sample)
        initial_step, initial_epoch, losses_train, losses_val = meta
        encoder.load_state_dict(encoder_state_dict)
        decoder.load_state_dict(decoder_state_dict)
    else:
        params = list(decoder.parameters()) + list(encoder.linear.parameters()) + \
            list(encoder.batchnorm.parameters())
        optimizer = torch.optim.Adam(params, lr=learning_rate)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    if args.sample:
        return utils.sample(encoder, decoder, vocab, val_loader)

    # Train the Models
    total_step = len(train_loader)
    try:
        for epoch in range(initial_epoch, num_epochs):

            for step, (images, captions, lengths) in enumerate(train_loader,
                                                               start=initial_step):
                # Set mini-batch dataset
                images = utils.to_var(images, volatile=True)
                captions = utils.to_var(captions)
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, Backward and Optimize
                decoder.zero_grad()
                encoder.zero_grad()

                if ngpu > 1:
                    # run on multiple GPU
                    features = nn.parallel.data_parallel(encoder, images, range(ngpu))
                    outputs = nn.parallel.data_parallel(decoder, features, range(ngpu))
                else:
                    # run on single GPU
                    features = encoder(images)
                    outputs = decoder(features, captions, lengths)

                train_loss = criterion(outputs, targets)
                losses_train.append(train_loss.data[0])
                train_loss.backward()
                optimizer.step()

                # Run validation set and predict
                if step % log_step == 0:
                    encoder.batchnorm.eval()

                    # run validation set
                    batch_loss_val = []
                    for val_step, (images, captions, lengths) in enumerate(val_loader):
                        images = utils.to_var(images, volatile=True)
                        captions = utils.to_var(captions, volatile=True)
                        targets = pack_padded_sequence(captions, lengths,
                                                       batch_first=True)[0]
                        features = encoder(images)
                        outputs = decoder(features, captions, lengths)
                        val_loss = criterion(outputs, targets)
                        batch_loss_val.append(val_loss.data[0])

                    losses_val.append(np.mean(batch_loss_val))

                    # predict
                    sampled_ids = decoder.sample(features)
                    sampled_ids = sampled_ids.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(sampled_ids, vocab)
                    print('Sample:', sentence)

                    true_ids = captions.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(true_ids, vocab)
                    print('Target:', sentence)

                    print('Epoch: {} - Step: {} - Train Loss: {} - Eval Loss: {}'.format(
                        epoch, step, losses_train[-1], losses_val[-1]))

                    encoder.batchnorm.train()

                # Save the models
                if (step + 1) % save_step == 0:
                    utils.save_models(encoder, decoder, optimizer, step, epoch,
                                      losses_train, losses_val, checkpoint_dir)
                    utils.dump_losses(losses_train, losses_val,
                                      os.path.join(checkpoint_dir, 'losses.pkl'))

    except KeyboardInterrupt:
        pass
    finally:
        # Do final save
        utils.save_models(encoder, decoder, optimizer, step, epoch,
                          losses_train, losses_val, checkpoint_dir)
        utils.dump_losses(losses_train, losses_val,
                          os.path.join(checkpoint_dir, 'losses.pkl'))