def train(args):
    common.make_dir(args.checkout_dir)
    # nnet
    nnet = RNN((args.left_context + args.right_context + 1) * args.feat_dim,
               hidden_layer, hidden_size, args.num_classes, dropout=dropout)
    print(nnet)
    nnet.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = th.optim.Adam(nnet.parameters(), lr=args.learning_rate)

    train_dataset = THCHS30(root=args.data_dir, data_type='train')
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=args.min_batch,
                                   shuffle=True)

    test_dataset = THCHS30(root=args.data_dir, data_type='test')
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=args.min_batch,
                                  shuffle=True)

    cross_validate(-1, nnet, test_loader, test_dataset.num_frames)
    for epoch in range(args.num_epochs):
        common.train_one_epoch(nnet,
                               criterion,
                               optimizer,
                               train_loader,
                               is_rnn=True)
        cross_validate(epoch, nnet, test_loader, test_dataset.num_frames)
        th.save(
            nnet,
            common.join_path(args.checkout_dir,
                             'rnn.{}.pkl'.format(epoch + 1)))
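A minimal driver sketch for the train() above, assuming the argument names it reads from args; the option defaults below are illustrative placeholders, not values from the original project:

import argparse

if __name__ == '__main__':
    # Hypothetical argparse wiring; attribute names mirror what train() accesses.
    parser = argparse.ArgumentParser(description='Train an RNN acoustic model on THCHS30')
    parser.add_argument('--data-dir', dest='data_dir', required=True)
    parser.add_argument('--checkout-dir', dest='checkout_dir', default='checkpoints')
    parser.add_argument('--left-context', dest='left_context', type=int, default=5)
    parser.add_argument('--right-context', dest='right_context', type=int, default=5)
    parser.add_argument('--feat-dim', dest='feat_dim', type=int, default=40)
    parser.add_argument('--num-classes', dest='num_classes', type=int, default=1000)
    parser.add_argument('--min-batch', dest='min_batch', type=int, default=256)
    parser.add_argument('--num-epochs', dest='num_epochs', type=int, default=10)
    parser.add_argument('--learning-rate', dest='learning_rate', type=float, default=1e-3)
    train(parser.parse_args())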
Example #2
class Model():
    def __init__(self, input_size, hidden_size, output_size, n_layers=1, gpu=-1):
        self.decoder = RNN(input_size, hidden_size, output_size, n_layers, gpu)
        if gpu >= 0:
            print("Use GPU %d" % torch.cuda.current_device())
            self.decoder.cuda()

        self.optimizer = torch.optim.Adam(self.decoder.parameters(), lr=0.01)
        self.criterion = nn.CrossEntropyLoss()

    def train(self, inp, target, chunk_len=200):
        hidden = self.decoder.init_hidden()
        self.decoder.zero_grad()
        loss = 0

        for c in range(chunk_len):
            out, hidden = self.decoder(inp[c], hidden)
            loss += self.criterion(out, target[c])

        loss.backward()
        self.optimizer.step()

        return loss.item() / chunk_len

    def generate(self, prime_str, predict_len=100, temperature=0.8):
        predicted = prime_str

        hidden = self.decoder.init_hidden()
        prime_input = char_tensor(prime_str, self.decoder.gpu)

        # Use prime string to build up hidden state
        for p in range(len(prime_str) - 1):
            _, hidden = self.decoder(prime_input[p], hidden)

        inp  = prime_input[-1]
        for p in range(predict_len):
            out, hidden = self.decoder(inp, hidden)

            # Sample the next character from the output, treated as a multinomial distribution
            out_dist = out.data.view(-1).div(temperature).exp()
            top_i = torch.multinomial(out_dist, 1)[0]

            # Add predicted character to string and use as next input
            predicted_char = all_characters[top_i]
            predicted += predicted_char
            inp = char_tensor(predicted_char, self.decoder.gpu)

        return predicted

    def save(self):
        model_name = "char-rnn-gru.pt"

        if not os.path.exists("save"):
            os.mkdir("save")
        torch.save(self.decoder, "save/%s" % model_name)
        print("--------------> [Checkpoint] Save model into save/%s" % model_name)

    def load(self, model_path="save/char-rnn-gru.pt"):
        self.decoder = torch.load(model_path)
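A rough usage sketch for the Model class above, assuming all_characters, char_tensor and RNN are defined in the surrounding script; the corpus path and hyperparameters are placeholders:

import random

# Hypothetical training/sampling loop built on the script's own helpers.
corpus = open('input.txt').read()  # assumed plain-text training corpus
model = Model(input_size=len(all_characters), hidden_size=128,
              output_size=len(all_characters), n_layers=2, gpu=-1)

for step in range(2000):
    start = random.randint(0, len(corpus) - 201)
    chunk = corpus[start:start + 201]          # 200 input/target pairs per chunk
    inp = char_tensor(chunk[:-1], model.decoder.gpu)
    target = char_tensor(chunk[1:], model.decoder.gpu)
    loss = model.train(inp, target, chunk_len=200)
    if step % 200 == 0:
        print('step %d  loss %.4f' % (step, loss))

print(model.generate('Th', predict_len=200, temperature=0.8))
model.save()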
Example #3
def load_model(args, train_len):
    model = RNN(args.emb_dim, args.hidden_dim)
    if torch.cuda.is_available():
        model.cuda()
    loss_fnc = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    return model, loss_fnc, optimizer
Example #4
class Sampler:
    """
    Samples all the detections for a given video and query
    """
    def __init__(self,
                 input_size=600,
                 hidden_size=256,
                 weights_path='models/best/model-epoch-last.pth',
                 num_descriptors=10):
        self.model = RNN(num_descriptors=num_descriptors,
                         hidden_size=hidden_size,
                         lstm_in_size=input_size)
        self.model.load_state_dict(torch.load(weights_path))
        self.num_descriptors = num_descriptors

        if torch.cuda.is_available():
            self.model.cuda()
        self.model.eval()

    def sample_video(self,
                     query,
                     video_name,
                     descriptors_path='extracted_descriptors_100',
                     print_sorted_files=False):
        self.model.eval()

        files = glob(
            os.path.join(
                descriptors_path,
                'descriptors_top' + str(self.num_descriptors) + '_' +
                video_name + '_' + query + '_*'))
        files = sorted(files)
        if print_sorted_files:
            print(
                os.path.join(
                    descriptors_path,
                    'descriptors_top' + str(self.num_descriptors) + '_' +
                    video_name + '_' + query + '_*'))
            print(files)

        predictions = None
        for desc_file in files:
            descriptors = np.load(desc_file)
            descriptors = torch.from_numpy(descriptors).type(torch.FloatTensor)\
                .reshape((1, descriptors.shape[1], int(descriptors.shape[2]/6), 6))
            if torch.cuda.is_available():
                descriptors = descriptors.cuda()
            preds = self.model(descriptors)
            if predictions is None:
                predictions = preds
            else:
                predictions = torch.cat((predictions, preds), 1)

        return predictions
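A short usage sketch for the Sampler above; the query, video name and descriptor directory are hypothetical placeholders that only need to match the file-naming scheme glob'ed for in sample_video():

# Hypothetical invocation; paths and names are illustrative only.
sampler = Sampler(input_size=600,
                  hidden_size=256,
                  weights_path='models/best/model-epoch-last.pth',
                  num_descriptors=10)
predictions = sampler.sample_video(query='person',
                                   video_name='video_0001',
                                   descriptors_path='extracted_descriptors_100')
print(predictions.shape)  # scores concatenated along dim 1, one block per descriptor file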
Example #5
def run():
    category_lines, all_categories, n_categories = init_cate_dict()
    rnn = RNN(n_letters, n_categories)
    rnn.cuda()

    train_set, test_set = get_data_set(category_lines)
    random.shuffle(train_set)
    for e in range(EPOCH):
        batch_train(rnn, train_set, all_categories)
        model_testing(rnn, test_set, all_categories)

    model_path = os.path.join(os.getcwd(), 'rnn3.pkl')
    torch.save(rnn, model_path)  # save the entire network
Example #6
def main():
    prepare()
    print(print_str.format("Begin to load data"))

    net = RNN(90, 256, 2, 2, 0.1)
    if use_cuda():
        net = net.cuda()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.1)
    cross_entropy = nn.CrossEntropyLoss()

    if mode == "train":
        train_data, train_label, train_wav_ids, train_lengths = load_rnn_data(
            "train", train_protocol, mode=mode, feature_type=feature_type)
        train_dataset = ASVDataSet(train_data,
                                   train_label,
                                   wav_ids=train_wav_ids,
                                   mode=mode,
                                   lengths=train_lengths)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=batch_size,
                                      num_workers=4,
                                      shuffle=True)

    for epoch in range(num_epochs):
        correct = 0
        total = 0
        total_loss = 0
        for tmp in tqdm(train_dataloader, desc="Epoch {}".format(epoch + 1)):
            data = tmp['data']
            label = tmp['label']
            length = tmp['length']

            max_len = int(torch.max(length))
            data = data[:, :max_len, :]
            label = label[:, :max_len]

            sorted_length, indices = torch.sort(length.view(-1),
                                                dim=0,
                                                descending=True)
            sorted_length = sorted_length.long().numpy()

            data, label = data[indices], label[indices]

            data, label = Variable(data), Variable(label).view(-1)
            if use_cuda():
                data, label = data.cuda(), label.cuda()

            optimizer.zero_grad()
            outputs, out_length = net(data, sorted_length)
            loss = cross_entropy(outputs, label)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predict = torch.max(outputs, 1)
            correct += (predict.data == label.data).sum()
            total += label.size(0)

        print("Loss: {} \t Acc: {}".format(total_loss / len(train_dataloader),
                                           correct / total))
Example #7
def test_model(args):
    # Hyper Parameters
    sequence_length = args.seq_len
    input_size = args.input_size
    hidden_size = args.hidden_size
    num_layers = args.num_layers
    num_classes = args.num_classes
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    dropout = args.dropout

    # Load back the best performing model
    rnn = RNN('LSTM', input_size, hidden_size, num_layers, num_classes,
              dropout)
    if args.cuda:
        rnn = rnn.cuda()
    rnn.load_state_dict(torch.load(args.model_path))

    # train_dataset = create_dataset('data/train/', timesteps=sequence_length)
    # train_loader = dataloader(train_dataset, batch_size=batch_size)
    test_dataset = create_dataset('data/test/', timesteps=sequence_length)
    test_loader = dataloader(test_dataset, batch_size=batch_size)

    print('-' * 50)
    # print('training accuracy = %.4f, test accuracy = %.4f' % (eval_model(rnn, train_loader), eval_model(rnn, test_loader)))
    # print('training accuracy = %.4f' % eval_model(rnn, train_loader))
    print('test accuracy = %.4f' % eval_model(rnn, test_loader))
    # print('test f1-score = %.4f' % get_f1score(rnn, test_loader))
    print_confusion_matrix(rnn, test_loader)
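A hedged sketch of the argparse wiring test_model() expects; the defaults are illustrative placeholders rather than the original project's values:

import argparse

# Hypothetical entry point; option names mirror the attributes read from args above.
parser = argparse.ArgumentParser(description='Evaluate a saved LSTM classifier')
parser.add_argument('--seq_len', type=int, default=100)
parser.add_argument('--input_size', type=int, default=3)
parser.add_argument('--hidden_size', type=int, default=128)
parser.add_argument('--num_layers', type=int, default=2)
parser.add_argument('--num_classes', type=int, default=10)
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--num_epochs', type=int, default=50)
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--dropout', type=float, default=0.5)
parser.add_argument('--cuda', action='store_true')
parser.add_argument('--model_path', default='rnn_best.pkl')
test_model(parser.parse_args())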
Example #8
def run():
    start = time.time()
    category_lines, all_categories, n_categories = init_cate_dict()
    rnn = RNN(n_letters, n_categories)
    rnn.cuda()

    line_tensors, category_tensors = get_batch_train_data(category_lines, all_categories, 100)
    line_tensors = line_tensors.cuda()
    category_tensors = category_tensors.cuda()
    for it in range(1, n_iters + 1):
        output, loss = train(rnn, category_tensors, line_tensors)

        # Print iter number, loss, name and guess
        if it % print_every == 0:
            print('%d %d%% (%s) %.4f' % (it, it / n_iters * 100, time_since(start), loss))

    model_path = os.path.join(os.getcwd(), 'rnn1.pkl')
    torch.save(rnn, model_path)  # save the entire network
Example #9
class TrainModel():
    def __init__(self):

        self.model_2048 = RNN(rnn_size)

    def trainModel(self):

        trainDataset = DealDataset_enhanced(
            root=trainfilertoread,
            transform=transforms.Compose(transforms=[transforms.ToTensor()]))
        train_loader = DataLoader(dataset=trainDataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=0)
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model_2048.parameters(), lr=LR)
        for epoch in range(NUM_EPOCHS):

            for index, (board, direc) in enumerate(train_loader):
                board, direc = Variable(board), Variable(direc)

                if torch.cuda.is_available():
                    board, direc = board.cuda(), direc.cuda()
                    self.model_2048.cuda()

                board = board.view(-1, 4, 4)
                out = self.model_2048(board)
                loss = criterion(out, direc)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if index % 50 == 0:
                    out = self.model_2048(board)
                    pred = torch.max(out, 1)[1]
                    train_correct = (pred == direc).sum().item()
                    print(
                        'Epoch: ', epoch, '| train loss: %.4f' % loss,
                        '| train accuracy: %.4f' % (train_correct /
                                                    (BATCH_SIZE * 1.0)))
            torch.save(self.model_2048, 'rnn_model_' + str(epoch) + '.pkl')
        torch.save(self.model_2048, 'rnn_model_final.pkl')
Example #10
    dataloader_train = data.DataLoader(dataset,
                                       batch_size=8,
                                       shuffle=True,
                                       num_workers=4)

    dataloader_val = data.DataLoader(dataset_val,
                                     batch_size=1,
                                     shuffle=False,
                                     num_workers=4)

    print(dataset.n_categories)
    categories = dataset.all_categories

    # Initialize the network. Hidden size: 1024.
    # 57 is the length of the one-hot-encoded input at each timestep
    model = RNN(57, 1024, dataset.n_categories)
    # criterion = nn.NLLLoss()
    criterion = nn.CrossEntropyLoss()

    # comment if not using a gpu
    model = model.cuda()
    criterion = criterion.cuda()
    optimizer = torch.optim.SGD(model.parameters(), 0.005)  # momentum=0.9 left disabled
    n_epochs = 10
    for i in range(n_epochs):
        train(i, dataloader_train, model, criterion, optimizer, categories,
              'train')
        if i % 2 == 1:
            train(i, dataloader_val, model, criterion, optimizer, categories,
                  'val')
Example #11
                                             labels, hidden)
            batch_losses.append(loss)

            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    return rnn


rnn = RNN(vocab_size,
          output_size,
          opt.embedding_dim,
          opt.hidden_dim,
          opt.n_layers,
          dropout=0.5)
if train_on_gpu:
    rnn.cuda()

optimizer = torch.optim.Adam(rnn.parameters(), lr=opt.learning_rate)
criterion = nn.CrossEntropyLoss()

# training the model
trained_rnn = train_rnn(rnn, opt.batch_size, optimizer, criterion,
                        opt.num_epochs, opt.show_every_n_batches)

# saving the trained model
save_model('./save/trained_rnn', trained_rnn)
print('Model Trained and Saved')
Example #12
print("===> creating dataloaders ...")
end = time.time()
train_loader = ClassDataLoader(args.train_path, word_to_index, fasttext_word_to_index,
                               char_to_index, pos_to_index, xpos_to_index, rel_to_index,
                               args.batch_size, predict_flag=0, train=1)
val_loader = ClassDataLoader(args.dev_path, word_to_index, fasttext_word_to_index,
                             char_to_index, pos_to_index, xpos_to_index, rel_to_index,
                             args.batch_size, predict_flag=0, train=0)
print('===> dataloaders created in: {t:.3f}s'.format(t=time.time() - end))


#create model
print("===> creating rnn model ...")
model = RNN(word_to_index, fasttext_word_to_index, char_to_index, args.cembedding_size,
            args.posembedding_size, args.char_hidden_size, args.wembedding_size,
            fasttext_embed, args.layers, args.hidden_size, args.dropout, args.var_dropout,
            args.mlp_arc_size, args.mlp_label_size, pos_to_index, xpos_to_index,
            rel_to_index, args.cuda, batch_first=True)
print(model)

if args.cuda:
    model.cuda()


#optimizer and losses
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.param_group_dense), lr=args.lr, betas=(0.9, 0.9), eps=1e-12)
optimizer_sparse = torch.optim.SparseAdam(filter(lambda p: p.requires_grad, model.param_group_sparse), lr=args.lr,  betas=(0.9, 0.9), eps=1e-12)
criterion_arc = nn.CrossEntropyLoss(ignore_index=-1) # ignore PADDED targets
criterion_label = nn.CrossEntropyLoss(ignore_index=model.rel_to_index['__PADDING__']) # ignore PADDED targets


def test(val_loader, model):

    # switch to evaluate mode
    model.eval()
    gold_arcs = np.array([])
    pred_arcs = np.array([])
Example #13
def main(args):
    print(sys.argv)

    if not os.path.exists('models'):
        os.mkdir('models')

    num_epochs = args.ne
    lr_decay = args.decay
    learning_rate = args.lr

    data_loader = get_data_loader(args.gt_path, args.tensors_path,
                                  args.json_labels_path, args.bs)
    model = RNN(lstm_hidden_size=args.hidden_size)
    if torch.cuda.is_available():
        model.cuda()
    model.train()

    #optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.mm)
    if args.rms:
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=args.lr,
                                  momentum=args.mm)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    model_loss = torch.nn.BCEWithLogitsLoss()
    # model_loss = Loss()

    losses = []
    p = 1
    try:
        for epoch in range(num_epochs):
            if epoch % args.decay_epoch == 0 and epoch > 0:
                learning_rate = learning_rate * lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            if epoch in (3, 7, 15):
                if epoch == 3:
                    p = 2 / 3
                if epoch == 7:
                    p = 1 / 3
                if epoch == 15:
                    p = 0

            loss_epoch = []
            loss1_epoch = []
            loss2_epoch = []
            for step, (tensors, masks, gt) in enumerate(data_loader):
                if torch.cuda.is_available():
                    tensors = tensors.cuda()
                    masks = masks.cuda()
                    gt = gt.cuda()
                model.zero_grad()

                out, att = model(tensors, masks, gt, p)
                loss1 = model_loss(out, gt)
                # att[:, :-1, :] -> attention produced (location in the next frame) until the last frame -1 (49)
                # gt[:, 1:, :] -> gt from the second frame until the last frame (49)
                loss2 = model_loss(att[:, :-1, :], gt[:, 1:, :])
                loss = loss1 + loss2
                loss.backward()
                optimizer.step()

                loss_epoch.append(loss.cpu().detach().numpy())
                loss1_epoch.append(loss1.cpu().detach().numpy())
                loss2_epoch.append(loss2.cpu().detach().numpy())

                #print('Epoch ' + str(epoch + 1) + '/' + str(num_epochs) + ' - Step ' + str(step + 1) + '/' +
                #      str(len(data_loader)) + ' - Loss: ' + str(float(loss)) + " (Loss1: " + str(float(loss1))
                #       + ", Loss2: " + str(float(loss2)) + ")")
            loss_epoch_mean = np.mean(np.array(loss_epoch))
            loss1_epoch_mean = np.mean(np.array(loss1_epoch))
            loss2_epoch_mean = np.mean(np.array(loss2_epoch))
            losses.append(loss_epoch_mean)
            print('Total epoch loss: ' + str(loss_epoch_mean) + " (loss1: " +
                  str(loss1_epoch_mean) + ", loss2: " + str(loss2_epoch_mean) +
                  ")")
            if (epoch + 1) % args.save_epoch == 0 and epoch > 0:
                filename = 'model-epoch-' + str(epoch + 1) + '.pth'
                model_path = os.path.join('models/', filename)
                torch.save(model.state_dict(), model_path)
    except KeyboardInterrupt:
        pass

    filename = 'model-epoch-last.pth'
    model_path = os.path.join('models', filename)
    torch.save(model.state_dict(), model_path)
    plt.plot(losses)
    plt.show()
Example #14
else:
    torch.set_default_tensor_type('torch.FloatTensor')

weights = args.weights

n_hidden = 128

batch_size = args.batch_size
num_workers = args.num_workers
log_iters = args.log_iters
weights = PROJECT_DIR + args.weights

print('Loading weights...')
rnn = RNN(N_LETTERS, n_hidden, N_GENDERS)
if args.cuda:
    rnn = rnn.cuda()
rnn.load_state_dict(torch.load(weights))
rnn.eval()


def _evaluate(name_tensor):
    hidden = rnn.init_hidden()

    for letter_tensor in name_tensor:
        letter_tensor.data.unsqueeze_(0)
        output, hidden = rnn(letter_tensor, hidden)

    return output


def predict(name, n_predictions=2):
Example #15
class dl_model():

	def __init__(self, mode):

		# Read config file which contains parameters
		self.config = config
		self.mode = mode

		# Architecture name decides prefix for storing models and plots
		feature_dim = self.config['vocab_size']
		self.arch_name = '_'.join(
			[self.config['rnn'], str(self.config['num_layers']), str(self.config['hidden_dim']), str(feature_dim)])

		print("Architecture:", self.arch_name)
		# Change paths for storing models
		self.config['models'] = self.config['models'].split('/')[0] + '_' + self.arch_name + '/'
		self.config['plots'] = self.config['plots'].split('/')[0] + '_' + self.arch_name + '/'

		# Make folders if they don't exist
		if not os.path.exists(self.config['models']):
			os.mkdir(self.config['models'])
		if not os.path.exists(self.config['plots']):
			os.mkdir(self.config['plots'])
		if not os.path.exists(self.config['pickle']):
			os.mkdir(self.config['pickle'])

		self.cuda = (self.config['cuda'] and torch.cuda.is_available())

		# load/initialise metrics to be stored and load model
		if mode == 'train' or mode == 'test':

			self.plots_dir = self.config['plots']
			# store hyperparameters
			self.total_epochs = self.config['epochs']
			self.test_every = self.config['test_every_epoch']
			self.test_per = self.config['test_per_epoch']
			self.print_per = self.config['print_per_epoch']
			self.save_every = self.config['save_every']
			self.plot_every = self.config['plot_every']

			# dataloader which returns batches of data
			self.train_loader = dataloader('train', self.config)
			self.test_loader = dataloader('test', self.config)
			#declare model
			self.model = RNN(self.config)

			self.start_epoch = 1
			self.edit_dist = []
			self.train_losses, self.test_losses = [], []

		else:

			self.model = RNN(self.config)

		if self.cuda:
			self.model.cuda()

		# resume training from some stored model
		if self.mode == 'train' and self.config['resume']:
			self.start_epoch, self.train_losses, self.test_losses = self.model.load_model(mode, self.model.rnn_name, self.model.num_layers, self.model.hidden_dim)
			self.start_epoch += 1

		# load best model for testing/inference
		elif self.mode == 'test' or mode == 'test_one':
			self.model.load_model(mode, self.config['rnn'], self.model.num_layers, self.model.hidden_dim)

		#whether using embeddings
		if self.config['use_embedding']:
			self.use_embedding = True
		else:
			self.use_embedding = False

	# Train the model
	def train(self):

		print("Starting training at t =", datetime.datetime.now())
		print('Batches per epoch:', len(self.train_loader))
		self.model.train()

		# when to print losses during the epoch
		print_range = list(np.linspace(0, len(self.train_loader), self.print_per + 2, dtype=np.uint32)[1:-1])
		if self.test_per == 0:
			test_range = []
		else:
			test_range = list(np.linspace(0, len(self.train_loader), self.test_per + 2, dtype=np.uint32)[1:-1])

		for epoch in range(self.start_epoch, self.total_epochs + 1):

			try:

				print("Epoch:", str(epoch))
				epoch_loss = 0.0
				# i used for monitoring batch and printing loss, etc.
				i = 0

				while True:

					i += 1

					# Get batch of inputs, labels, missed_chars and lengths along with status (when to end epoch)
					inputs, labels, miss_chars, input_lens, status = self.train_loader.return_batch()

					if self.use_embedding:
						inputs = torch.from_numpy(inputs).long() #embeddings should be of dtype long
					else:
						inputs = torch.from_numpy(inputs).float()

					#convert to torch tensors
					labels = torch.from_numpy(labels).float()
					miss_chars = torch.from_numpy(miss_chars).float()
					input_lens = torch.from_numpy(input_lens).long()

					if self.cuda:
						inputs = inputs.cuda()
						labels = labels.cuda()
						miss_chars = miss_chars.cuda()
						input_lens = input_lens.cuda()

					# zero the parameter gradients
					self.model.optimizer.zero_grad()
					# forward + backward + optimize
					outputs = self.model(inputs, input_lens, miss_chars)
					loss, miss_penalty = self.model.calculate_loss(outputs, labels, input_lens, miss_chars, self.cuda)
					loss.backward()

					# clip gradient
					# torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config['grad_clip'])
					self.model.optimizer.step()

					# store loss
					epoch_loss += loss.item()

					# print loss
					if i in print_range and epoch == 1:
						print('After %i batches, Current Loss = %.7f' % (i, epoch_loss / i))
					elif i in print_range and epoch > 1:
						print('After %i batches, Current Loss = %.7f, Avg. Loss = %.7f, Miss Loss = %.7f' % (
								i, epoch_loss / i, np.mean(np.array([x[0] for x in self.train_losses])), miss_penalty))

					# test model periodically
					if i in test_range:
						self.test(epoch)
						self.model.train()

					# Reached end of dataset
					if status == 1:
						break

				# refresh dataset, i.e. generate a new dataset from the corpus
				if epoch % self.config['reset_after'] == 0:
					self.train_loader.refresh_data(epoch)

				#take the last example from the epoch and print the incomplete word, target characters and missed characters
				random_eg = min(np.random.randint(self.train_loader.batch_size), inputs.shape[0]-1)
				encoded_to_string(inputs.cpu().numpy()[random_eg], labels.cpu().numpy()[random_eg], miss_chars.cpu().numpy()[random_eg],
								  input_lens.cpu().numpy()[random_eg], self.train_loader.char_to_id, self.use_embedding)

				# Store tuple of training loss and epoch number
				self.train_losses.append((epoch_loss / len(self.train_loader), epoch))

				# save model
				if epoch % self.save_every == 0:
					self.model.save_model(False, epoch, self.train_losses, self.test_losses,
										  self.model.rnn_name, self.model.num_layers, self.model.hidden_dim)

				# test every 5 epochs in the beginning and then every fixed no of epochs specified in config file
				# useful to see how loss stabilises in the beginning
				if epoch % 5 == 0 and epoch < self.test_every:
					self.test(epoch)
					self.model.train()
				elif epoch % self.test_every == 0:
					self.test(epoch)
					self.model.train()
				# plot loss and accuracy
				if epoch % self.plot_every == 0:
					self.plot_loss_acc(epoch)

			except KeyboardInterrupt:
				#save model before exiting
				print("Saving model before quitting")
				self.model.save_model(False, epoch-1, self.train_losses, self.test_losses,
									  self.model.rnn_name, self.model.num_layers, self.model.hidden_dim)
				exit(0)


	# test model
	def test(self, epoch=None):

		self.model.eval()

		print("Testing...")
		print('Total batches:', len(self.test_loader))
		test_loss = 0

		# generate a new dataset from the corpus
		self.test_loader.refresh_data(epoch)

		with torch.no_grad():

			while True:

				# Get batch of input, labels, missed characters and lengths along with status (when to end epoch)
				inputs, labels, miss_chars, input_lens, status = self.test_loader.return_batch()
				
				if self.use_embedding:
					inputs = torch.from_numpy(inputs).long()
				else:
					inputs = torch.from_numpy(inputs).float()

				labels = torch.from_numpy(labels).float()
				miss_chars = torch.from_numpy(miss_chars).float()
				input_lens= torch.from_numpy(input_lens).long()

				if self.cuda:
					inputs = inputs.cuda()
					labels = labels.cuda()
					miss_chars = miss_chars.cuda()
					input_lens = input_lens.cuda()

				# zero the parameter gradients
				self.model.optimizer.zero_grad()
				# forward + backward + optimize
				outputs = self.model(inputs, input_lens, miss_chars)
				loss, miss_penalty = self.model.calculate_loss(outputs, labels, input_lens, miss_chars, self.cuda)
				test_loss += loss.item()

				# Reached end of dataset
				if status == 1:
					break

		#take a random example from the epoch and print the incomplete word, target characters and missed characters
		#min since the last batch may not be of length batch_size
		random_eg = min(np.random.randint(self.train_loader.batch_size), inputs.shape[0]-1)
		encoded_to_string(inputs.cpu().numpy()[random_eg], labels.cpu().numpy()[random_eg], miss_chars.cpu().numpy()[random_eg],
			input_lens.cpu().numpy()[random_eg], self.train_loader.char_to_id, self.use_embedding)

		# Average out the losses and edit distance
		test_loss /= len(self.test_loader)

		print("Test Loss: %.7f, Miss Penalty: %.7f" % (test_loss, miss_penalty))

		# Store in lists for keeping track of model performance
		self.test_losses.append((test_loss, epoch))

		# if testing loss is minimum, store it as the 'best.pth' model, which is used during inference
		# store only when doing train/test together i.e. mode is train
		if test_loss == min([x[0] for x in self.test_losses]) and self.mode == 'train':
			print("Best new model found!")
			self.model.save_model(True, epoch, self.train_losses, self.test_losses,
								  self.model.rnn_name, self.model.num_layers, self.model.hidden_dim)

		return test_loss

	def predict(self, string, misses, char_to_id):
		"""
		called during inference
		:param string: word with predicted characters and blanks at remaining places
		:param misses: list of characters which were predicted but game feedback indicated that they are not present
		:param char_to_id: mapping from characters to id
		"""

		id_to_char = {v:k for k,v in char_to_id.items()}

		#convert string into desired input tensor
		if self.use_embedding:
			encoded = np.zeros((len(char_to_id)))
			for i, c in enumerate(string):
				if c == '*':
					encoded[i] = len(id_to_char) - 1 
				else:
					encoded[i] = char_to_id[c]

			inputs = np.array(encoded)[None, :]
			inputs = torch.from_numpy(inputs).long()

		else:

			encoded = np.zeros((len(string), len(char_to_id)))
			for i, c in enumerate(string):
				if c == '*':
					encoded[i][len(id_to_char) - 1] = 1
				else:
					encoded[i][char_to_id[c]] = 1

			inputs = np.array(encoded)[None, :, :]
			inputs = torch.from_numpy(inputs).float()

		#encode the missed characters
		miss_encoded = np.zeros((len(char_to_id) - 1))
		for c in misses:
			miss_encoded[char_to_id[c]] = 1
		miss_encoded = np.array(miss_encoded)[None, :]
		miss_encoded = torch.from_numpy(miss_encoded).float()

		input_lens = np.array([len(string)])
		input_lens= torch.from_numpy(input_lens).long()	

		#pass through model
		output = self.model(inputs, input_lens, miss_encoded).detach().cpu().numpy()[0]

		#sort predictions
		sorted_predictions = np.argsort(output)[::-1]
		
		# we cannot consider only the argmax since a missed character may also get assigned a high probability
		# for a well-trained model, we shouldn't observe this
		return [id_to_char[x] for x in sorted_predictions]

	def plot_loss_acc(self, epoch):
		"""
		take train/test loss and test accuracy input and plot it over time
		:param epoch: to track performance across epochs
		"""

		plt.clf()
		fig, ax1 = plt.subplots()

		ax1.set_xlabel('Epoch')
		ax1.set_ylabel('Loss')
		ax1.plot([x[1] for x in self.train_losses], [x[0] for x in self.train_losses], color='r', label='Train Loss')
		ax1.plot([x[1] for x in self.test_losses], [x[0] for x in self.test_losses], color='b', label='Test Loss')
		ax1.tick_params(axis='y')
		ax1.legend(loc='upper left')

		fig.tight_layout()  # otherwise the right y-label is slightly clipped
		plt.grid(True)
		plt.legend()
		plt.title(self.arch_name)

		filename = self.plots_dir + 'plot_' + self.arch_name + '_' + str(epoch) + '.png'
		plt.savefig(filename)

		print("Saved plots")
Example #16
# Print train-dataset statistics
text_len = len(train_data)
all_characters = tuple(sorted(set(train_data)))
n_characters = len(all_characters)
print('Total characters: {} - Total vocab: {}'.format(text_len, n_characters))

decoder = RNN(n_characters, hidden_size, n_characters, n_layers)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
decoder_criterion = torch.nn.CrossEntropyLoss()

if not torch.cuda.is_available():
    print('CUDA is not available.  Training on CPU ...\n')
else:
    print('CUDA is available!  Training on GPU ...\n')
    decoder.cuda()

start = time.time()
for epoch in range(1, n_epochs):

    # Get a random training chunk for each epoch
    input_chunk, target_chunk = random_chunk(chunk_len, batch_size,
                                             'train', train_data,
                                             len(train_data), all_characters)
    # Train and calculate the loss
    loss = net_train(input_chunk, target_chunk, chunk_len, decoder, batch_size,
                     decoder_criterion, decoder_optimizer)
    # Print every 100 epochs and calculate the validation loss
    if epoch % 100 == 0:
        input_valid_chunk, target_valid_chunk = random_chunk(
            chunk_len, batch_size, 'valid', validation_data,
Example #17
def train_model(args):
    # Hyper Parameters
    sequence_length = args.seq_len
    input_size = args.input_size
    hidden_size = args.hidden_size
    num_layers = args.num_layers
    num_classes = args.num_classes
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    dropout = args.dropout

    # Create the dataset
    train_dataset = create_dataset('data/train/', timesteps=sequence_length)
    train_loader = dataloader(train_dataset, batch_size=batch_size)
    test_dataset = create_dataset('data/test/', timesteps=sequence_length)
    test_loader = dataloader(test_dataset, batch_size=batch_size)

    # Define model and loss
    rnn = RNN('LSTM', input_size, hidden_size, num_layers, num_classes,
              dropout)
    criterion = nn.CrossEntropyLoss()
    if args.cuda:  # switch to cuda
        rnn, criterion = rnn.cuda(), criterion.cuda()

    # Adam Optimizer
    optimizer = torch.optim.Adam(rnn.parameters(), learning_rate)

    # Train the Model
    i = 0  # updates
    best_test_acc = 0.0
    for epoch in range(num_epochs):
        # Generate random batches every epoch
        train_loader = dataloader(train_dataset, batch_size)
        for batch_X, batch_y in train_loader:
            # points = pack_padded_sequence(Variable(torch.from_numpy(batch_X)), batch_seq_lens)
            points = Variable(torch.from_numpy(batch_X))
            labels = Variable(torch.from_numpy(batch_y))

            if args.cuda:
                points, labels = points.cuda(), labels.cuda()

            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = rnn(points)  # final hidden state
            # outputs = pad_packed_sequence(outputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            print('Epoch [%d/%d], Loss: %.4f' %
                  (epoch + 1, num_epochs, loss.item()))
            if i % 100 == 0:  # every 100 updates, evaluate on test set
                # print("training accuracy = %.4f" % eval_model(rnn, train_loader))
                test_acc = eval_model(rnn, test_loader)
                print("test accuracy = %.4f" % test_acc)
                if test_acc > best_test_acc:
                    print("best test accuracy found")
                    best_test_acc = test_acc
                    torch.save(rnn.state_dict(), 'rnn_best.pkl')
            i += 1
Example #18
def train(args):
    if args.create_dataset:
        df = pd.read_csv("../data/endpoints_calculated_std.csv")
        smiles = df["smiles"].to_list()
        data = df[df.columns[3:]].to_numpy()
        print("Building LegoModel")
        legoModel = LegoGram(smiles = smiles, nworkers=8)
        torch.save(legoModel, "legoModel.pk")
        print("Building sampler")
        sampler = LegoGramRNNSampler(legoModel)
        torch.save(sampler, "sampler.pk")
        print("Constracting dataset")
        dataset = MolecularNotationDataset(smiles,sampler,data)
        torch.save(dataset,'lg.bin')
    else:
        dataset = torch.load('lg.bin')

    train_loader = DataLoader(dataset, batch_size=args.batch_size, collate_fn=collect)
    device = torch.device('cpu')
    if args.cuda:
        device = torch.device('cuda')
    model = RNN(voc_size=dataset.vocsize, device=device)
    model.train()
    model.cuda()
    print(f"Model has been created on device {device}")
    smiles_dataset = dataset.smiles
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    loss_f = nn.CrossEntropyLoss(reduction='mean', ignore_index=0)
    writer = SummaryWriter(comment = args.name_task)
    losses = []
    out_counter = 0
    cnt = 0
    for epoch in range(args.num_epochs):
        loss_list =[]
        for iteration, (batch, lengths) in enumerate(tqdm(train_loader)):
            batch = batch.cuda()
            logits, endp_model = model(batch, lengths)
            print(logits.shape)
            print(batch.shape)
            loss = loss_f(logits[:, :, :-1], batch[:, 1:])

            loss_list.append(loss.item())
            writer.add_scalar("CrossEntropyLoss", loss_list[-1], iteration+epoch*len(train_loader))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if iteration % args.print_every == 0 and iteration > 0:
                model.eval()
                number_generate = 100

                res = model.sample(number_generate, dataset.model)
                writer.add_text("Molecules after generator", json.dumps([res]))
                valid = len(res) * 100 / number_generate
                print(res)
                print("valid : {} %".format(valid))
                writer.add_scalar("Valid", valid, cnt)
                res = [robust_standardizer(mol) for mol in res]
                res = list(filter(lambda x: x is not None, res))
                unique = len([elem for elem in res if elem not in smiles_dataset])

                writer.add_text("Unique mols", json.dumps([res]))
                print(f"There are unique mols {unique}")
                print(res)
                writer.add_scalar("Unique", unique, cnt)
                cnt += 1
                model.train()
        writer.flush()
        epoch_loss = np.mean(loss_list)
        print(f"Loss on epoch {epoch } is {epoch_loss}")
        if out_counter < args.stop_after and epoch>0:
            if losses[-1] <= epoch_loss:
                out_counter += 1
            else:
                out_counter = 0
                torch.save(model, "experiments/" + args.name_task + "/model.pt")
        if epoch == 0:
            torch.save(model, "experiments/" + args.name_task + "/model.pt")
        losses.append(epoch_loss)
    return losses
Example #19
def main(args):
    if not os.path.exists('models'):
        os.mkdir('models')

    num_epochs = args.ne
    lr_decay = args.decay
    learning_rate = args.lr

    data_loader = get_data_loader(args.gt_path, args.descriptors_path,
                                  args.json_labels_path, args.bs)
    model = RNN(num_descriptors=args.num_descriptors,
                hidden_size=args.hidden_size,
                lstm_in_size=args.input_size)
    if torch.cuda.is_available():
        model.cuda()
    model.train()

    # optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.mm)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # model_loss = torch.nn.BCEWithLogitsLoss()
    model_loss = Loss()

    losses = []
    try:
        for epoch in range(num_epochs):
            if epoch % args.decay_epoch == 0 and epoch > 0:
                learning_rate = learning_rate * lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            loss_epoch = []
            for step, (descriptors, labels) in enumerate(data_loader):
                if torch.cuda.is_available():
                    descriptors = descriptors.cuda()
                    labels = labels.cuda()
                model.zero_grad()

                attention = model(descriptors)
                loss = model_loss(attention, labels)
                loss.backward()
                optimizer.step()

                loss_epoch.append(loss.cpu().detach().numpy())

                print('Epoch ' + str(epoch + 1) + '/' + str(num_epochs) +
                      ' - Step ' + str(step + 1) + '/' +
                      str(len(data_loader)) + ' - Loss: ' + str(float(loss)))
            loss_epoch_mean = np.mean(np.array(loss_epoch))
            losses.append(loss_epoch_mean)
            print('Total epoch loss: ' + str(loss_epoch_mean))
            if (epoch + 1) % args.save_epoch == 0 and epoch > 0:
                filename = 'model-epoch-' + str(epoch + 1) + '.pth'
                model_path = os.path.join('models/models_361_dropout',
                                          filename)
                torch.save(model.state_dict(), model_path)
    except KeyboardInterrupt:
        pass

    filename = 'model-epoch-last.pth'
    model_path = os.path.join('models', filename)
    torch.save(model.state_dict(), model_path)
    plt.plot(losses)
    plt.show()
Example #20
def main(args):
    print(sys.argv)

    if not os.path.exists('models'):
        os.mkdir('models')

    num_epochs = args.ne
    lr_decay = args.decay
    learning_rate = args.lr

    data_loader = get_data_loader(args.gt_path,
                                  args.tensors_path,
                                  args.bs,
                                  args.json_labels_path,
                                  num_workers=8)
    model = RNN()
    if torch.cuda.is_available():
        model.cuda()
    model.train()

    #optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.mm)
    if args.rms:
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=args.lr,
                                  momentum=args.mm)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
    model_loss = torch.nn.BCEWithLogitsLoss()

    losses = []
    p = 1
    try:
        for epoch in range(num_epochs):
            if epoch % args.decay_epoch == 0 and epoch > 0:
                learning_rate = learning_rate * lr_decay
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            if epoch < 3:
                p = 1.0
            elif epoch >= 3 and epoch < 6:
                p = 0.5
            elif epoch >= 6 and epoch < 9:
                p = 0.25
            else:
                p = 0.0

            loss_epoch = []
            for step, (feat_maps, gt) in enumerate(data_loader):
                if torch.cuda.is_available():
                    feat_maps = feat_maps.cuda()
                    gt = gt.cuda()
                model.zero_grad()

                out = model(feat_maps, gt, p)
                loss = model_loss(out, gt)
                loss.backward()
                optimizer.step()

                loss_step = loss.cpu().detach().numpy()
                loss_epoch.append(loss_step)

                print('Epoch ' + str(epoch + 1) + '/' + str(num_epochs) +
                      ' - Step ' + str(step + 1) + '/' +
                      str(len(data_loader)) + " - Loss: " + str(loss_step))

            loss_epoch_mean = np.mean(np.array(loss_epoch))
            losses.append(loss_epoch_mean)
            print('Total epoch loss: ' + str(loss_epoch_mean))

            if (epoch + 1) % args.save_epoch == 0 and epoch > 0:
                filename = 'model-epoch-' + str(epoch + 1) + '.pth'
                model_path = os.path.join('models/', filename)
                torch.save(model.state_dict(), model_path)
    except KeyboardInterrupt:
        pass

    filename = 'model-epoch-last.pth'
    model_path = os.path.join('models', filename)
    torch.save(model.state_dict(), model_path)
    plt.plot(losses)
    plt.show()
Example #21
def main(arguments=sys.argv[1:]):
    parser = argparse.ArgumentParser(prog="Sparse Evaluation on MNIST dataset")
    parser.add_argument('--nhid',
                        type=int,
                        default=128,
                        help='number of hidden units per layer')
    parser.add_argument('-ws',
                        '--w_sp',
                        type=float,
                        nargs='+',
                        default=[0, 0, 0, 0],
                        help="Weight sparsity setting.")
    parser.add_argument('-wt',
                        '--w_th',
                        type=float,
                        default=0,
                        help="Weight threshold setting.")
    parser.add_argument('-hs',
                        '--h_sp',
                        type=float,
                        nargs='+',
                        default=[0., 0.],
                        help="Hidden state sparsity setting.")
    parser.add_argument('-ht',
                        '--h_th',
                        type=float,
                        nargs='+',
                        default=[0., 0.],
                        help="Hidden state threshold setting.")
    parser.add_argument('-b',
                        '--size_block',
                        type=int,
                        default=-1,
                        help="Block size for hidden state sparsification.")
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help="Verbose mode.")
    parser.add_argument('-model',
                        '--model_path',
                        default='MNIST/models/nhid:128-nlayer:2-epoch:10.ckpt',
                        help="Model path.")

    args = parser.parse_args(arguments)

    # load model for GPU / CPU
    if torch.cuda.is_available():
        state_dict = torch.load(args.model_path)
    else:
        state_dict = torch.load(args.model_path, map_location='cpu')

    #sparsity_dict = {}

    for k, v in state_dict.items():
        if 'lstm1' in k:
            if 'weight_x' in k:
                state_dict[k] = set_to_zero_sparsity(v, sparsity=args.w_sp[0])
            if 'weight_h' in k:
                state_dict[k] = set_to_zero_sparsity(v, sparsity=args.w_sp[1])
        if 'lstm2' in k:
            if 'weight_x' in k:
                state_dict[k] = set_to_zero_sparsity(v, sparsity=args.w_sp[2])
            if 'weight_h' in k:
                state_dict[k] = set_to_zero_sparsity(v, sparsity=args.w_sp[3])

    # save weight and bias for RISCV simulation
    # l1_w = np.hstack((state_dict['lstm1.cell_f.weight_xf'].numpy(), state_dict['lstm1.cell_f.weight_xi'].numpy(), state_dict['lstm1.cell_f.weight_xu'].numpy(), state_dict['lstm1.cell_f.weight_xo'].numpy()))
    # l1_u = np.hstack((state_dict['lstm1.cell_f.weight_hf'].numpy(), state_dict['lstm1.cell_f.weight_hi'].numpy(), state_dict['lstm1.cell_f.weight_hu'].numpy(), state_dict['lstm1.cell_f.weight_ho'].numpy()))
    #
    # l2_w = np.hstack((state_dict['lstm2.cell_f.weight_xf'].numpy(), state_dict['lstm2.cell_f.weight_xi'].numpy(), state_dict['lstm2.cell_f.weight_xu'].numpy(), state_dict['lstm2.cell_f.weight_xo'].numpy()))
    # l2_u = np.hstack((state_dict['lstm2.cell_f.weight_hf'].numpy(), state_dict['lstm2.cell_f.weight_hi'].numpy(), state_dict['lstm2.cell_f.weight_hu'].numpy(), state_dict['lstm2.cell_f.weight_ho'].numpy()))
    #
    # l1_w = np.transpose(l1_w)
    # l1_u = np.transpose(l1_u)
    # l2_w = np.transpose(l2_w)
    # l2_u = np.transpose(l2_u)
    #
    # l1_b = np.hstack((state_dict['lstm1.cell_f.bias_f'].numpy(), state_dict['lstm1.cell_f.bias_i'].numpy(), state_dict['lstm1.cell_f.bias_u'].numpy(), state_dict['lstm1.cell_f.bias_o'].numpy()))
    # l2_b = np.hstack((state_dict['lstm2.cell_f.bias_f'].numpy(), state_dict['lstm2.cell_f.bias_i'].numpy(), state_dict['lstm2.cell_f.bias_u'].numpy(), state_dict['lstm2.cell_f.bias_o'].numpy()))
    #
    # np.savez('mnist-l1.npz', w=l1_w, u=l1_u, b=l1_b)
    # np.savez('mnist-l2.npz', w=l2_w, u=l2_u, b=l2_b)

    test_dataset = torchvision.datasets.MNIST(root='./MNIST/data/',
                                              train=False,
                                              transform=transforms.ToTensor())
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=batch_size,
                                              shuffle=False)
    model = RNN(input_size, args.nhid, num_layers, num_classes)
    model.load_state_dict(state_dict)
    if torch.cuda.is_available():
        model.cuda()
        # model.half()
    # TODO: trans the model to half-precision float

    with torch.no_grad():
        correct = 0
        total = 0
        #hidden = model.init_hidden(batch_size)
        sparse_dict = {"LSTM1": 0., "LSTM2": 0.}
        iteration = 0
        for images, labels in test_loader:

            # output vec_x for RISC-V simulation
            # f = open('mnist_x.txt', 'w')
            # images_flat = images.view(-1)
            # for elem in images_flat:
            #     f.write('{:f},'.format(elem))
            # f.close()

            hidden = (torch.zeros(num_layers, batch_size,
                                  args.nhid).to(device),
                      torch.zeros(num_layers, batch_size,
                                  args.nhid).to(device))
            images = images.reshape(-1, sequence_length, input_size).to(device)
            labels = labels.to(device)
            outputs, hidden, cur_dict = model(images,
                                              hidden,
                                              sparse=True,
                                              h_th=args.h_th,
                                              h_sp=args.h_sp,
                                              block=args.size_block)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            iteration += 1
            for k, v in sparse_dict.items():
                sparse_dict[k] += cur_dict[k]

        accuracy = 100.0 * correct / total
        print(
            '|| Test Accuracy : {:.5f} || LSTM1 sparsity: {:.5f} || LSTM2 Sparsity: {:.5f} ||'
            .format(accuracy, sparse_dict['LSTM1'] / iteration,
                    sparse_dict['LSTM2'] / iteration))
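A brief invocation sketch for the sparsity-evaluation main() above; the flag values are illustrative, and the module-level globals it relies on (batch_size, input_size, num_layers, num_classes, sequence_length, device) are assumed to be defined elsewhere in the script:

# Hypothetical call: 50% weight sparsity on every LSTM matrix and 30%
# hidden-state sparsity, using the default checkpoint path.
main(['--nhid', '128',
      '-ws', '0.5', '0.5', '0.5', '0.5',
      '-hs', '0.3', '0.3',
      '-ht', '0.0', '0.0',
      '-b', '16'])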
Example #22
def main(batch_size,
         embed_size,
         num_hiddens,
         num_layers,
         ln_hidden,
         ln_output,
         rec_unit,
         learning_rate=1e-4,
         log_step=10,
         num_epochs=50,
         save_step=100,
         ngpu=1):
    # hyperparameters
    num_workers = 0
    checkpoint_dir = 'checkpoint'
    # Image Preprocessing

    transform = {
        'train':
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ]),
        'val':
        transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ]),
    }
    # load data
    vocab = build_vocab(path='relative_captions_shoes.json')
    train_data, train_loader = data_and_loader(
        path='relative_captions_shoes.json',
        mode='train',
        vocab=vocab,
        transform=transform['train'],
        batch_size=batch_size)

    val_data, val_loader = data_and_loader(path='relative_captions_shoes.json',
                                           mode='valid',
                                           vocab=vocab,
                                           transform=transform['val'],
                                           batch_size=batch_size)

    losses_val = []
    losses_train = []

    # Build the models
    initial_step = initial_epoch = 0

    encoder = CNN(embed_size)  ### embed_size: power of 2
    middle = fcNet(embed_size, ln_hidden, ln_output)
    decoder = RNN(ln_output,
                  num_hiddens,
                  len(vocab),
                  num_layers,
                  rec_unit=rec_unit,
                  drop_out=0.1)

    # Loss, parameters & optimizer
    loss_fun = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.batchnorm.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Train the Models
    total_step = len(train_loader)
    try:
        for epoch in range(initial_epoch, num_epochs):
            print('Epoch: {}'.format(epoch))
            for step, (images, captions,
                       lengths) in enumerate(train_loader, start=initial_step):

                # Set mini-batch dataset
                images = Variable(images)
                captions = Variable(captions)
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]

                # Forward, Backward and Optimize
                decoder.zero_grad()
                middle.zero_grad()
                encoder.zero_grad()

                if ngpu > 1:
                    # run on multiple GPUs
                    features = nn.parallel.data_parallel(
                        encoder, images, range(ngpu))
                    rnn_input = nn.parallel.data_parallel(
                        middle, features, range(ngpu))
                    outputs = nn.parallel.data_parallel(
                        decoder, (rnn_input, captions, lengths), range(ngpu))
                else:
                    # run on single GPU
                    features = encoder(images)
                    rnn_input = middle(features)
                    outputs = decoder(rnn_input, captions, lengths)

                train_loss = loss_fun(outputs, targets)
                losses_train.append(train_loss.item())
                train_loss.backward()
                optimizer.step()

                # Run validation set and predict
                if step % log_step == 0:
                    encoder.batchnorm.eval()
                    # run validation set
                    batch_loss_val = []
                    for val_step, (images, captions,
                                   lengths) in enumerate(val_loader):
                        images = Variable(images)
                        captions = Variable(captions)
                        targets = pack_padded_sequence(captions,
                                                       lengths,
                                                       batch_first=True)[0]
                        #features = encoder(target_images) - encoder(refer_images)
                        features = encoder(images)
                        rnn_input = middle(features)
                        outputs = decoder(rnn_input, captions, lengths)
                        val_loss = loss_fun(outputs, targets)
                        batch_loss_val.append(val_loss.item())

                    losses_val.append(np.mean(batch_loss_val))

                    # predict
                    sampled_ids = decoder.sample(rnn_input)
                    sampled_ids = sampled_ids.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(sampled_ids, vocab)
                    print('Sample:', sentence)

                    true_ids = captions.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(true_ids, vocab)
                    print('Target:', sentence)

                    print(
                        'Epoch: {} - Step: {} - Train Loss: {} - Eval Loss: {}'
                        .format(epoch, step, losses_train[-1], losses_val[-1]))
                    encoder.batchnorm.train()

                # Save the models
                if (step + 1) % save_step == 0:
                    save_models(encoder, middle, decoder, optimizer, step,
                                epoch, losses_train, losses_val,
                                checkpoint_dir)
                    dump_losses(losses_train, losses_val,
                                os.path.join(checkpoint_dir, 'losses.pkl'))

    except KeyboardInterrupt:
        pass
    finally:
        # Do final save
        utils.save_models(encoder, middle, decoder, optimizer, step, epoch,
                          losses_train, losses_val, checkpoint_dir)
        utils.dump_losses(losses_train, losses_val,
                          os.path.join(checkpoint_dir, 'losses.pkl'))