def find_bb():
    # Forward slashes avoid accidental escape sequences ('\t' in the original Windows-style path became a tab).
    im_path = 'EM/data/Other data/EM CA1 hippocampus region of brain/training_groundtruth.tif'
    im_path = root_path + im_path
    mask = read_image(im_path)
    find_boundingbox(mask[0])
    return mask
def colorize(img_path, model, img_size, show_original=False):
    '''
    Colorize an image with the specified model.

    Since the human eye is much more sensitive to luminance (lightness changes) than to
    chrominance (color changes), we resize the image to the input size required by the
    model (e.g. 192x192), colorize it, and then assemble the final image in CIE LAB color
    space by combining the original image's lightness channel with the colorized image's
    AB channels resized back to the original size.

    Args:
        img_path (string): JPEG image full path.
        model (Keras model): Model instance to use.
        img_size (tuple): Model input image size. The output image keeps the original size.
        show_original (bool): Concatenate the original image with the colorized image.
    '''
    orig_rgb = read_image(img_path)
    orig_lab = skimage.color.rgb2lab((orig_rgb + 1) / 2)
    input_rgb = skimage.transform.resize(orig_rgb, img_size)
    input_gray = rgb_to_lab(input_rgb)[:, :, :1]
    input_gray = np.repeat(input_gray, 3, axis=-1)  # Repeat channel to keep input 3-dimensional
    output_rgb = model.predict(input_gray.reshape((1, *img_size, 3)))[0] / 2 + 0.5  # Colorize
    output_lab = skimage.color.rgb2lab(output_rgb)  # Convert colorized image to LAB
    output_lab = skimage.transform.resize(output_lab, (orig_rgb.shape[0], orig_rgb.shape[1]))  # Resize LAB to original size
    final_lab = np.zeros((orig_rgb.shape[0], orig_rgb.shape[1], 3))  # Final image in LAB space
    final_lab[:, :, 0] = orig_lab[:, :, 0]  # Original image lightness channel
    final_lab[:, :, 1:] = output_lab[:, :, 1:]  # Colorized image AB channels
    final_rgb = skimage.color.lab2rgb(final_lab)
    if show_original:
        final_rgb = np.concatenate(((orig_rgb + 1) / 2, final_rgb), axis=1)
    return np.rint(final_rgb * 255).astype(np.uint8)  # Rescale to (0, 255)
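
# Hedged usage sketch for colorize(), not part of the original source: 'colorizer.h5' and
# 'samples/photo.jpg' are placeholder names, and the (192, 192) input size mirrors the
# example mentioned in the docstring above.
import skimage.io
from tensorflow import keras

colorizer = keras.models.load_model('colorizer.h5')  # assumed trained Keras colorization model
result = colorize('samples/photo.jpg', colorizer, img_size=(192, 192), show_original=True)
skimage.io.imsave('colorized_vs_original.jpg', result)  # uint8 RGB, original and colorized side by side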
def _parse_data(self, gt, pd):
    """ gt: ground truth, pd: predicted data """
    # Read data
    if isinstance(gt, str):
        gt = read_image(gt)
    if isinstance(pd, str):
        pd = read_image(pd)
    # Unify data shapes
    if gt.shape != pd.shape:
        if self.shape_constrains:
            raise ValueError("the shape of the ground truth is not consistent with the prediction's")
        else:
            pd = augmentation.resize(pd, gt.shape)
    return gt, pd
def log_colorized_images(self, iter):
    ground_truth = []
    batch = []
    # Read images
    for path in self.paths:
        img_rgb = read_image(path, self.img_size)
        ground_truth.append(img_rgb)
        img_gray = rgb2gray(img_rgb).reshape(self.img_size + (1, ))
        batch.append(img_gray)
    # Predict color using the trained model
    colorized = self.model.predict(np.repeat(batch, 3, axis=-1))
    # Concatenate ground truth and predicted images and log them to Comet
    for i in range(len(colorized)):
        final = np.concatenate((ground_truth[i], colorized[i]), axis=1)
        final = np.rint(final * 127.5 + 127.5).astype(np.uint8)
        self.experiment.log_image(final, name=f'iter_{iter:06d}_image_{i:02d}')
def get_features(dir, read=True, download=True):
    if read:
        if download:
            vgg_net = vis.models.vgg16(pretrained=True, progress=True)
        else:
            ## Load model parameters from a local path
            vgg_net = vis.models.vgg16()
            vgg_net.load_state_dict(torch.load('./models/vgg16-397923af.pth'))
    jpg_files = ds.images_info(dir)
    ## Freeze parameters so no memory is spent on gradients
    for p in vgg_net.parameters():
        p.requires_grad = False
    ## Net architecture
    print(vgg_net)
    # summary(vgg_net, input_size=(3, 224, 224))
    ## Remove the last classifier layer: Softmax
    print("Removing softmax layer of VGG16 ... ")
    vgg_net.classifier = vgg_net.classifier[:-1]
    print(vgg_net)
    # summary(vgg_net, input_size=(3, 224, 224))
    ## Read images with the specified transforms
    print("Reading images ... ", end='')
    images = ds.read_image(dir, normalize=True, resize=224, tensor=True)
    print("done.")
    # print(images.keys())
    ## Get the feature map for each image tensor through VGG-16
    img_featrs = OD()
    print("Gathering images' features from last conv layer ... ", end='')
    for i, jpg_name in enumerate(images.keys()):
        with torch.no_grad():
            print(i, jpg_name)
            img_featrs[jpg_name] = vgg_net(images[jpg_name].unsqueeze(0))
    print("done.")
    return img_featrs
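
# Hedged usage sketch for get_features(), not from the original source: it reuses the
# Flickr8k photo directory and the locally cached VGG16 weights referenced elsewhere in
# this project. With 224x224 inputs and the truncated classifier, each stored feature
# should be a (1, 4096) tensor.
features = get_features("./data/flickr8k/Flicker8k_photos/", read=True, download=False)
for name, feat in list(features.items())[:3]:
    print(name, feat.shape)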
def main():
    print("Loading the image")
    dir_path = os.path.dirname(os.path.realpath(__file__))
    image_path = sys.argv[1]
    if image_path[0] != "/":
        image_path = dir_path + '/' + image_path
    image = np.array(
        [dataset.read_image(filename=image_path, image_size=IMG_SIZE)],
        dtype=np.uint8)

    print("Shaping the image for the model input")
    # The input to the network is of shape [None, image_size, image_size, num_channels]. Hence we reshape.
    x_batch = image.reshape(1, IMG_SIZE, IMG_SIZE, NUM_CHANNELS)

    print("Please choose the model to use : ")
    les_meta_path = locate_files(extension=".meta", path=os.getcwd(), dbName="meta")
    for i, meta_path in enumerate(les_meta_path):
        print("\n\n" + str(i) + " : " + str(meta_path))
        info_txt_path = str('/'.join(meta_path.split("/")[:-1]) + "/info.txt")
        try:
            with open(info_txt_path, 'r') as f:
                for line in f:
                    print("\t" + str(line.replace("\n", "")))
            print("")
        except FileNotFoundError:
            print("// No info.txt \n")

    model_num = int(input(">> "))
    try:
        meta_path = les_meta_path[model_num]
        model_dir_path = '/'.join(meta_path.split("/")[:-1]) + "/"
    except (IndexError, TypeError):  # 'except IndexError or TypeError' in the original only caught IndexError
        print("Wrong input")
        return -1

    print("Restoring the model", end="")
    sys.stdout.flush()
    sess = tf.Session()
    # Step 1: Recreate the network graph. At this step only the graph is created.
    saver = tf.train.import_meta_graph(meta_path)
    # Step 2: Load the saved weights with the restore method.
    saver.restore(sess, tf.train.latest_checkpoint(model_dir_path))
    graph = tf.get_default_graph()
    y_pred = graph.get_tensor_by_name("y_pred:0")
    print(" - Done")

    print("Feeding the image to the input")
    x = graph.get_tensor_by_name("x:0")
    y_true = graph.get_tensor_by_name("y_true:0")
    les_labels = []
    try:
        with open(model_dir_path + "labels.txt", 'r') as f:
            for line in f:
                label = line.replace("\n", "")
                if label != "":
                    les_labels.append(label)
    except Exception as e:
        les_labels = ['Bathroom', 'Bedroom', 'Kitchen', 'Living Room']
        print("Error opening labels.txt. We are going to use default values : " + str(les_labels))
        print("***\n" + str(e) + "\n***")
    print("Using labels : " + str(les_labels))

    y_test_images = np.zeros((1, len(les_labels)))
    ### Create the feed_dict required to compute y_pred
    feed_dict_testing = {x: x_batch, y_true: y_test_images}
    result = sess.run(y_pred, feed_dict=feed_dict_testing)
    print(result[0])
    # result holds one probability per label, in the order of les_labels
    print("Prediction : ")
    for i in range(len(result[0])):
        print("\t" + les_labels[i] + " : " + str('{0:f}'.format(round(result[0][i] * 100, 5))) + "%")
# Drop the first dot-separated component of each checkpoint key (e.g. a 'module.' prefix) before loading.
for k, v in org_dict['net'].items():
    temp['.'.join(k.split('.')[1:])] = v
pretrained_model.load_state_dict(temp)

val_transform = data_transform(False)
val_data = TextImageSet(data_root, transform=val_transform, is_train=False)
val_loader = DataLoader(val_data, batch_size=1, shuffle=False, num_workers=6)

for i, (data, labels_pro, img_path) in enumerate(val_loader):
    if i > 10:
        break
    img_path = img_path[0]
    original_image = read_image(img_path)
    original_image = original_image.resize((192, 64))
    file_name_to_export = img_path[img_path.rfind('/') + 1:img_path.rfind('.')]
    # Grad-CAM
    grad_cam = GradCam(pretrained_model)
    # Generate a single CAM mask
    # cam = grad_cam.generate_cam(data, labels_pro, num_classes=len(cfg.alphabets))
    for l in range(3, 26):
        cam = grad_cam.generate_cam(data, labels_pro, num_classes=len(cfg.alphabets), nl=l)
        # Save mask (one possible approach is sketched below)
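
# Hedged sketch, not from the original source, of one way the per-layer CAM mask could be
# written to disk inside the loop above. It assumes `cam` is a float array in [0, 1] and a
# 'results/' output directory; both the path and the scaling are assumptions.
import os
from PIL import Image

os.makedirs('results', exist_ok=True)
Image.fromarray((cam * 255).astype('uint8')).save(
    f'results/{file_name_to_export}_cam_layer{l}.png')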
def main():
    dir_photos = "./data/flickr8k/Flicker8k_photos/"
    file_annot = "./data/flickr8k/Flickr8k_text/Flickr8k.token.txt"
    jpg_files = ds.images_info(dir_photos)
    ann_dframe = ds.annots_info(file_annot, df=True)
    print("Dataset overview\n" + "-" * 109 + "\n")
    print(ann_dframe)
    print("\n" + "-" * 109 + "\n")
    ## Prepare captions
    print("Preparing caption data for images")
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)
    ## Clean text
    print("Cleaning text ... ", end="")
    for i, cpt in enumerate(ann_dframe.caption.values):
        ann_dframe["caption"].iloc[i] = ds.clean_text(cpt)
    print("done.")
    print(ann_dframe)
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)
    ## Add start and end sequence tokens
    ann_dframe_orig = copy(ann_dframe)
    ann_dfrm = ds.add_start_end_tokens(ann_dframe)
    print(ann_dfrm)

    vgg_net = vis.models.vgg16(pretrained=True, progress=True)
    for p in vgg_net.parameters():
        p.requires_grad = False
    ## Load model parameters from a local path
    # vgg_net.load_state_dict(torch.load('./models/vgg16-397923af.pth'))
    ## Features in the last layer
    num_ftrs = vgg_net.classifier[-1].in_features
    print(num_ftrs)
    print(vgg_net)
    ## Remove the last classifier layer: Softmax, ReLU, Dropout
    vgg_net.classifier = vgg_net.classifier[:-1]
    # ## Net architecture
    # summary(vgg_net, input_size=(3, 224, 224))
    print(vgg_net)
    # ## Features in the last layer
    # num_ftrs = vgg_net.classifier[-1].in_features
    # print(num_ftrs)

    ## Read images with the specified transforms
    print("Reading images ... ", end='')
    images = ds.read_image(jpg_files, dir_photos, normalize=True, resize=224, tensor=True)
    print("done.")
    # print(images.keys())
    ## Get the feature map for each image tensor through VGG-16
    img_featrs = OD()
    print("Gathering images' features from last conv layer ... ", end='')
    for i, jpg_name in enumerate(images.keys()):
        with torch.no_grad():
            print(i, jpg_name)
            img_featrs[jpg_name] = vgg_net(images[jpg_name].unsqueeze(0))
    print("done.")
    # print(img_featrs, img_featrs[jpg_name].size(), sep='\n')
    print(img_featrs.keys())

    # Get features for images in our dataset from pretrained VGG-16
    features = mdl.get_features(dir_photos, read=True, download=False)
    print(features)

    ## Prep image tensor
    print("Prepping image tensor ... ", end="")
    fnames = []
    img_tns_list = []
    cap_list = []
    for i, jpg_name in enumerate(ann_dfrm.filename.values):
        if (i % 5) == 0:
            if jpg_name in img_featrs.keys():
                fnames.append(jpg_name)
                img_tns_list.append(img_featrs[jpg_name])
                cap_list.append(ann_dfrm.iloc[i]["caption"])
    print("done.")
    print(len(img_tns_list), len(cap_list))
    img_tns = torch.cat(img_tns_list)
    print(img_tns.shape)

    print("Saving filenames list, image tensor list, captions tensor list ... ", end="")
    torch.save(fnames, 'fnames.pkl')
    torch.save(img_tns_list, 'image_tns_list.pkl')
    torch.save(cap_list, 'captions_list.pkl')
    print("done.")

    print("Loading fnames, image tensor list and captions tensor list ... ", end="")
    fnames = torch.load('fnames.pkl')
    img_tns_list = torch.load('image_tns_list.pkl')
    img_tns = torch.cat(img_tns_list)
    cap_list = torch.load('captions_list.pkl')
    # print(len(fnames), cap_list)
    print("done.")

    cap_seq, vocab_size, cap_max_len, tokens = ds.tokenizer(cap_list)
    n_cap = len(cap_seq)
    vald_prop, test_prop = 0.2, 0.2
    n_vald = int(n_cap * vald_prop)
    n_test = int(n_cap * test_prop)
    train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test)
    train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test)
    # train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test)
    print(len(train_cap), len(valid_cap), len(evaln_cap))
    print(len(train_ims), len(valid_ims), len(evaln_ims))
    # print(len(train_fnm), len(valid_fnm), len(evaln_fnm))
    images_train, captions_train, target_caps_train = ds.prep_data(
        train_ims, train_cap, vocab_size, cap_max_len)
    images_valid, captions_valid, target_caps_valid = ds.prep_data(
        valid_ims, valid_cap, vocab_size, cap_max_len)

    ## Dataloader
    bs = 64
    trainset = ds.Flickr8k(images_train, captions_train, target_caps_train)
    validset = ds.Flickr8k(images_valid, captions_valid, target_caps_valid)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=bs, shuffle=True)
    validloader = torch.utils.data.DataLoader(validset, batch_size=bs)

    ## Device: CPU or GPU?
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("Using " + device)

    ## Model
    model = mdl.CapNet(vocab_size, cap_max_len).to(device)
    criterion = nn.CrossEntropyLoss()
    ## Optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
    max_n_epochs = 5

    ## Training
    print("Starting training ... ")
    epoch_train_loss, epoch_valid_loss = [], []
    min_val_loss = 100
    for epoch in range(1, max_n_epochs + 1):
        print("-------------------- Epoch: [%d / %d] ----------------------" % (epoch, max_n_epochs))
        training_loss, validation_loss = 0.0, 0.0
        ## Batch training
        for i, data in enumerate(trainloader):
            images, captions, target_caps = data[0].to(device), data[1].to(device), data[2].to(device)
            optimizer.zero_grad()
            out = model(images, captions.t())
            loss = criterion(out, target_caps)
            loss.backward()
            optimizer.step()
            training_loss += loss.item()
        epoch_train_loss.append(training_loss / len(trainloader))
        print("Training loss: %f" % (epoch_train_loss[-1]), end=" ")
        for i, data in enumerate(validloader):
            with torch.set_grad_enabled(False):
                images, captions, target_caps = data[0].to(device), data[1].to(device), data[2].to(device)
                out = model(images, captions.t())
                loss = criterion(out, target_caps)
                validation_loss += loss.item()
        epoch_valid_loss.append(validation_loss / len(validloader))
        print("Validation loss: %f" % (epoch_valid_loss[-1]))
        scheduler.step()
        if epoch_valid_loss[-1] < min_val_loss:
            print("Found best model.")
            best_model = deepcopy(model)
            min_val_loss = epoch_valid_loss[-1]  # keep track of the best validation loss so far

    plt.plot(list(range(max_n_epochs)), epoch_train_loss, label="Training loss")
    plt.plot(list(range(max_n_epochs)), epoch_valid_loss, label="Validation loss")
    plt.xlabel("Number of epochs")
    plt.ylabel("Loss")
    plt.title("Number of epochs vs loss")
    plt.legend()
    plt.show()

    ## Save model
    print("Saving best model ... ")
    torch.save(best_model, 'best_model.pkl')
def main():
    dir_photos = "./data/flickr8k/Flicker8k_photos/"
    file_annot = "./data/flickr8k/Flickr8k_text/Flickr8k.token.txt"
    jpg_files = ds.images_info(dir_photos)
    ann_dframe = ds.annots_info(file_annot, df=True)
    print("Dataset overview\n" + "-" * 109 + "\n")
    print(ann_dframe)
    print("\n" + "-" * 109 + "\n")
    ## Prepare captions
    print("Preparing caption data for images")
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)
    ## Clean text
    print("Cleaning text ... ", end="")
    for i, cpt in enumerate(ann_dframe.caption.values):
        ann_dframe["caption"].iloc[i] = ds.clean_text(cpt)
    print("done.")
    print(ann_dframe)
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)
    ## Add start and end sequence tokens
    ann_dframe_orig = copy(ann_dframe)
    ann_dfrm = ds.add_start_end_tokens(ann_dframe)
    print(ann_dfrm)

    vgg_net = vis.models.vgg16(pretrained=True, progress=True)
    for p in vgg_net.parameters():
        p.requires_grad = False
    ## Load model parameters from a local path
    # vgg_net.load_state_dict(torch.load('./models/vgg16-397923af.pth'))
    ## Features in the last layer
    num_ftrs = vgg_net.classifier[-1].in_features
    print(num_ftrs)
    print(vgg_net)
    ## Remove the last classifier layer: Softmax, ReLU, Dropout
    vgg_net.classifier = vgg_net.classifier[:-1]
    # ## Net architecture
    # summary(vgg_net, input_size=(3, 224, 224))
    print(vgg_net)
    # ## Features in the last layer
    # num_ftrs = vgg_net.classifier[-1].in_features
    # print(num_ftrs)

    ## Read images with the specified transforms
    print("Reading images ... ", end='')
    images = ds.read_image(jpg_files, dir_photos, normalize=True, resize=224, tensor=True)
    print("done.")
    # print(images.keys())
    ## Get the feature map for each image tensor through VGG-16
    img_featrs = OD()
    print("Gathering images' features from last conv layer ... ", end='')
    for i, jpg_name in enumerate(images.keys()):
        with torch.no_grad():
            print(i, jpg_name)
            img_featrs[jpg_name] = vgg_net(images[jpg_name].unsqueeze(0))
    print("done.")
    # print(img_featrs, img_featrs[jpg_name].size(), sep='\n')
    print(img_featrs.keys())

    # Get features for images in our dataset from pretrained VGG-16
    features = mdl.get_features(dir_photos, read=True, download=False)
    print(features)

    ## Prep image tensor
    print("Prepping image tensor ... ", end="")
    fnames = []
    img_tns_list = []
    cap_list = []
    for i, jpg_name in enumerate(ann_dfrm.filename.values):
        if (i % 5) == 0:
            if jpg_name in img_featrs.keys():
                fnames.append(jpg_name)
                img_tns_list.append(img_featrs[jpg_name])
                cap_list.append(ann_dfrm.iloc[i]["caption"])
    print("done.")
    print(len(img_tns_list), len(cap_list))
    img_tns = torch.cat(img_tns_list)
    print(img_tns.shape)

    print("Saving filenames list, image tensor list, captions tensor list ... ", end="")
    torch.save(fnames, 'fnames.pkl')
    torch.save(img_tns_list, 'image_tns_list.pkl')
    torch.save(cap_list, 'captions_list.pkl')
    print("done.")

    print("Loading fnames, image tensor list and captions tensor list ... ", end="")
    fnames = torch.load('fnames.pkl')
    img_tns_list = torch.load('image_tns_list.pkl')
    img_tns = torch.cat(img_tns_list)
    cap_list = torch.load('captions_list.pkl')
    # print(len(fnames), cap_list)
    print("done.")

    cap_seq, vocab_size, cap_max_len, tokens = ds.tokenizer(cap_list)
    n_cap = len(cap_seq)
    vald_prop, test_prop = 0.2, 0.2
    n_vald = int(n_cap * vald_prop)
    n_test = int(n_cap * test_prop)
    train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test)
    train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test)
    # train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test)
    print(len(train_cap), len(valid_cap), len(evaln_cap))
    print(len(train_ims), len(valid_ims), len(evaln_ims))

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("Using " + device)

    print("Loading model ...")
    model = torch.load('best_model.pkl')
    print(model)
    model.eval()
    # print(fnames)

    preds = []
    for feat in evaln_ims:
        preds.append(predict_caption(model, feat, cap_max_len, tokens, device))

    best_targets = []
    for p, t in zip(preds, cap_list[:n_test]):
        pred = p.split(" ")
        targ = [t.split(" ")]
        z = sentence_bleu(targ, pred, weights=(1, 0, 0, 0))
        if z > 0.50:
            print(p, t, z, sep='\n')
            print("\n")
            best_targets.append(t)
    print(best_targets)

    for cap in best_targets:
        rows = ann_dfrm.loc[ann_dfrm["caption"] == cap, "filename"]
        print(rows)
def main():
    dir_photos = "./data/Flickr8k/Flickr8k_Dataset/Flicker8k_Dataset/"
    file_annot = "./data/Flickr8k/Flickr8k_text/Flickr8k.token.txt"
    hr = "-" * 105  # console section separator

    print("\n" + hr + "\n")
    ## Get basic dataset info
    print("DATASET INFO")
    print(hr + "\n")
    jpg_files = ds.images_info(dir_photos)
    print("Number of photos in Flickr8k: %d" % (len(jpg_files)))
    ann_dframe = ds.annots_info(file_annot, df=True)
    print("\n" + hr + "\n")

    ## Visualize data overview
    print("DATASET OVERVIEW")
    print(hr + "\n")
    print(ann_dframe)
    print("\n" + hr + "\n")

    ## Prepare captions
    print("CURATE CAPTIONS")
    print(hr + "\n")
    word_count = ds.word_freq(ann_dframe)
    # print(word_count)
    ## Clean text
    start = time.time()
    print("Cleaning text ... ", end="")
    for i, cpt in enumerate(ann_dframe.caption.values):
        ann_dframe["caption"].iloc[i] = ds.clean_text(cpt)
    print("done.")
    # print(ann_dframe)
    # word_count = ds.word_freq(ann_dframe)
    # print(word_count)
    ## Add start and end sequence tokens
    ann_dframe_orig = copy(ann_dframe)
    print("Adding start and end tokens ... ", end="")
    ann_dfrm = ds.add_start_end_tokens(ann_dframe)
    print("done.")
    elapsed = time.time() - start
    print("\nTime to preprocess {} captions: {:.2f} seconds".format(i, elapsed))
    # print(ann_dfrm)
    print("\n" + hr + "\n")

    ## Read images with the specified transforms
    print("READ IMAGES & EXTRACT FEATURES")
    print(hr + "\n")
    mean = [0.485, 0.456, 0.406]
    stdv = [0.229, 0.224, 0.225]
    transforms = vis.transforms.Compose([
        vis.transforms.Resize(256),
        vis.transforms.CenterCrop(224),
        vis.transforms.ToTensor(),
        vis.transforms.Normalize(mean=mean, std=stdv)
    ])
    print("Reading images ... ", end='')
    images = ds.read_image(dir_photos, transforms)
    print("done.")
    # Get feature maps for the image tensors through VGG-16
    features_dict, features_fname = mdl.get_features(images,
                                                     download_wts=False,
                                                     save=True,
                                                     cuda=True)
    # print(features_dict)
    ## Load feature maps
    features_dict = torch.load(features_fname)
    print("\n" + hr + "\n")

    ## Prep image tensor
    print("PREP IMAGE TENSOR")
    print(hr + "\n")
    ann_dfrm = ann_dfrm.loc[ann_dfrm["idx"].values == "0", :]
    print(ann_dfrm)
    ds.word_freq(ann_dfrm)
    fnames = []
    img_tns_list = []
    cap_list = []
    for i, jpg_name in enumerate(ann_dfrm.filename.values):
        if jpg_name in features_dict.keys():
            fnames.append(jpg_name)
            img_tns_list.append(features_dict[jpg_name])
            cap_list.append(ann_dfrm.iloc[i]["caption"])
    print(len(img_tns_list), len(cap_list))
    img_tns = torch.cat(img_tns_list)
    print(img_tns.shape)
    print("\n" + hr + "\n")

    ## Text tokenize
    print("TEXT TOKENIZE")
    print(hr + "\n")
    tokens, cap_seq, vocab_size, cap_max_len = ds.tokenizer(cap_list)
    print("Vocab size: ", vocab_size)
    print("\n" + hr + "\n")

    ## Dataset splits
    print("DATASET SPLIT")
    print(hr + "\n")
    n_cap = len(cap_seq)
    vald_prop, test_prop = 0.2, 0.2
    n_vald = int(n_cap * vald_prop)
    n_test = int(n_cap * test_prop)
    train_cap, valid_cap, evaln_cap = ds.split_dset(cap_seq, n_vald, n_test)
    train_ims, valid_ims, evaln_ims = ds.split_dset(img_tns, n_vald, n_test)
    train_fnm, valid_fnm, evaln_fnm = ds.split_dset(fnames, n_vald, n_test)
    print(len(train_cap), len(valid_cap), len(evaln_cap))
    print(len(train_ims), len(valid_ims), len(evaln_ims))
    print(len(train_fnm), len(valid_fnm), len(evaln_fnm))
    print("\n" + hr + "\n")

    ## Prep data for training and validation
    print("FINAL PREP FOR TRAINING & VALIDATION")
    print(hr + "\n")
    images_train, captions_train, target_caps_train = ds.prep_data(
        train_ims, train_cap, cap_max_len)
    images_valid, captions_valid, target_caps_valid = ds.prep_data(
        valid_ims, valid_cap, cap_max_len)
    print("\n" + hr + "\n")

    ## TRAINING
    print("TRAINING")
    print(hr + "\n")
    ## Hyperparameters
    bs = 64
    lr = 0.001
    lr_steps = 20
    gamma = 0.1
    max_n_epochs = 5

    ## Dataloader
    print("DATALOADERS")
    trainset = ds.Flickr8k(images_train, captions_train, target_caps_train)
    validset = ds.Flickr8k(images_valid, captions_valid, target_caps_valid)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=bs, shuffle=True)
    validloader = torch.utils.data.DataLoader(validset, batch_size=bs)

    ## Device: CPU or GPU?
    print("DEVICE:", end=" ")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("Using " + device)

    ## Model
    print("MODEL:")
    model = mdl.CapNet(vocab_size, cap_max_len).to(device)
    ## Criterion
    criterion = nn.CrossEntropyLoss()
    ## Optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_steps, gamma=gamma)

    ## Training
    print("\nStarting training ... ")
    epoch_train_loss, epoch_valid_loss = [], []
    min_val_loss = 100
    for epoch in range(1, max_n_epochs + 1):
        print("-------------------- Epoch: [%d / %d] ----------------------" % (epoch, max_n_epochs))
        training_loss, validation_loss = 0.0, 0.0
        ## Batch training
        for i, data in enumerate(trainloader):
            tr_images, tr_captions, tr_target_caps = data[0].to(device), data[1].to(device), data[2].to(device)
            optimizer.zero_grad()
            tr_out = model(tr_images, tr_captions.t())
            tr_loss = criterion(tr_out, tr_target_caps)
            tr_loss.backward()
            optimizer.step()
            training_loss += tr_loss.item()
        epoch_train_loss.append(training_loss / len(trainloader))
        print("Training loss: %f" % (epoch_train_loss[-1]), end=" || ")
        for i, data in enumerate(validloader):
            with torch.set_grad_enabled(False):
                vl_images, vl_captions, vl_target_caps = data[0].to(device), data[1].to(device), data[2].to(device)
                vl_out = model(vl_images, vl_captions.t())
                vl_loss = criterion(vl_out, vl_target_caps)
                validation_loss += vl_loss.item()
        epoch_valid_loss.append(validation_loss / len(validloader))
        print("Validation loss: %f" % (epoch_valid_loss[-1]))
        scheduler.step(epoch=epoch)
        if epoch_valid_loss[-1] < min_val_loss:
            print("Found best model.")
            best_model = deepcopy(model)
            min_val_loss = epoch_valid_loss[-1]

    plt.plot(list(range(max_n_epochs)), epoch_train_loss, label="Training loss")
    plt.plot(list(range(max_n_epochs)), epoch_valid_loss, label="Validation loss")
    plt.xlabel("Number of epochs")
    plt.ylabel("Loss")
    plt.title("Number of epochs vs loss")
    plt.legend()
    plt.show()

    ## Save model
    print("Saving best model ... ")
    torch.save(best_model, 'best_model.pkl')
    print("\n" + hr + "\n")

    ## Check output
    print("Loading model ...")
    model = torch.load('best_model.pkl')
    print(model)
    model.eval()
    preds = []
    for feat in evaln_ims:
        preds.append(model.prediction(feat, tokens, device))
    best_targets = []
    bleu_scores = []
    for p, t in zip(preds, cap_list[:n_test]):
        pred = p.split(" ")
        targ = [t.split(" ")]
        z = sentence_bleu(targ, pred, weights=(1, 0, 0, 0))
        bleu_scores.append(z)
        if z > 0.50:
            print(p, t, z, sep='\n')
            print("\n")
            best_targets.append((p, t, z))
    for i, tgt in enumerate(best_targets):
        print("{}: {}".format(i, tgt))
    print("MEAN BLEU SCORE: %.3f" % np.mean(bleu_scores))
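
# Minimal illustration, not from the original source, of the BLEU-1 scoring used in the
# evaluation loop above. The caption strings are made up; nltk's sentence_bleu expects
# tokenized references and a tokenized hypothesis.
from nltk.translate.bleu_score import sentence_bleu

reference = [["a", "dog", "runs", "through", "the", "grass"]]
hypothesis = ["a", "dog", "runs", "in", "the", "grass"]
# weights=(1, 0, 0, 0) scores unigram precision only; here 5 of 6 tokens match, so ~0.83.
print(sentence_bleu(reference, hypothesis, weights=(1, 0, 0, 0)))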