def infer(test_path, output_path, config):
    '''Run inference on the test data.

    :param test_path: Test data directory path
    :param output_path: Output directory path
    :param config: config parser
    '''
    for name in os.listdir(test_path):
        # Load and preprocess the image
        image = preprocess.load_image(os.path.join(test_path, name))
        net_input = preprocess.preinference(image, config)

        # Create the model (the input shape may differ per image) and load its weights
        model = get_model(config, net_input[0].shape)
        load_model(model, config)

        # Make prediction
        prediction = model.predict(net_input, verbose=1)[0]

        # Clip result
        prediction = preprocess.postinference(config, prediction, image)

        # Save images
        preprocess.save_image(name, prediction, output_path)
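# A minimal usage sketch for infer(), assuming the config object is a standard
# configparser.ConfigParser; the file name and directory names below are
# assumptions, not paths taken from this project.
import configparser

config = configparser.ConfigParser()
config.read('config.ini')
infer(test_path='data/test', output_path='results', config=config)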
def predict(image_name,
            data_dir="/home/shagun/projects/Image-Caption-Generator/data/",
            weights_path=None,
            mode="test"):
    '''Predict the caption for a given image.

    weights_path is the path to the .h5 file (model).
    '''
    image_path = data_dir + "images/" + image_name

    # embed the image with VGG16
    vgg_model = load_vgg16()
    vgg_embedding = vgg_model.predict(load_image(image_path))
    image_embeddings = [vgg_embedding]

    # build the caption model and load its weights
    config_dict = generate_config(data_dir=data_dir, mode=mode)
    print(config_dict)
    model = create_model(config_dict=config_dict, compile_model=False)
    model.load_weights(data_dir + "model/" + weights_path)

    # map token indices back to words
    tokenizer = get_tokenizer(config_dict=config_dict, data_dir=data_dir)
    index_to_word = {v: k for k, v in tokenizer.word_index.items()}

    # generate captions for each image embedding
    for image_embedding in image_embeddings:
        gen_captions(config=config_dict,
                     model=model,
                     image_embedding=image_embedding,
                     tokenizer=tokenizer,
                     num_captions=2,
                     index_to_word=index_to_word)
def visualize_bbox(img, bounding_box, segm=None, img_size=(1280, 720)):
    """ Visualize a bounding box in an image with optional segmentation

    # Params:
    - img: string of the filepath or numpy array
    - bounding_box: (x, y, width, height) or None
    - segm: numpy array of segmentation, same size as image
    - img_size: if a path is given for the image, the image will be resized
      to this (width, height)
    """
    if isinstance(img, str):
        img = load_image(img)
        img = cv2.resize(img, img_size, interpolation=cv2.INTER_LINEAR)

    plt.figure(figsize=(10, 7))
    plt.imshow(img)
    if segm is not None:
        plt.imshow(segm, alpha=0.3)

    if bounding_box is not None:
        (x, y, width, height) = bounding_box
        ax = plt.gca()
        ax.add_patch(
            Rectangle((x, y), width, height,
                      fill=False, edgecolor='red', linewidth=3))
    else:
        print('No bounding box')
    plt.show()
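# Usage sketch for visualize_bbox() with a hypothetical image path and box;
# assumes matplotlib.pyplot is imported as plt and Rectangle comes from
# matplotlib.patches, as the function body implies.
visualize_bbox('example.jpg', (120, 80, 200, 150), img_size=(1280, 720))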
def extract_features(directory):
    # load the model
    model = Encoder()
    # model.to(device)
    model.eval()

    # extract features from each photo
    features = dict()
    for i, name in enumerate(listdir(directory)):
        # load an image from file
        filename = directory + '/' + name
        image = load_image(filename, size=224)
        # convert the image pixels to a tensor
        image = transforms.ToTensor()(image)
        # add a batch dimension for the model
        image = image.unsqueeze(0)
        # prepare the image for the VGG model
        image = normalize_batch(image)
        # get features
        feature = model(image)
        # get image id
        image_id = name.split('.')[0]
        # store feature
        features[image_id] = feature
        # print('>%s' % name)
        if i % 50 == 0:
            print("{} images done.".format(i))
    return features
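# Usage sketch for extract_features(): the folder and output file names are
# assumptions. The returned dict maps image ids to feature tensors, which can
# be persisted with torch.save.
import torch

features = extract_features('data/images')
torch.save(features, 'features.pt')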
def get_bounding_boxes(bbox_folder, resize=None, data_folder='',
                       include_class=False):
    """ Read bounding boxes from json files created by Sloth

    # Params
    - bbox_folder : folder containing json files with bounding boxes
    - resize : None or tuple (height, width). If set, the bounding box will
      be rescaled to this size.
    - data_folder : If resize is not None, this folder will be used for
      getting the size of each image. This should be the original training
      data.
    - include_class : If true, the fifth number returned will be the class

    # Returns
    - A dictionary mapping filename to a list of bounding boxes of the form
      (x, y, width, height)
    """
    bboxes = defaultdict(list)
    file_paths = glob.glob(os.path.join(bbox_folder, '*.json'))
    if len(file_paths) == 0:
        raise ValueError('No bounding boxes found in %s' % bbox_folder)

    for file_path in file_paths:
        with open(file_path) as file:
            data = json.load(file)
        for image in data:
            img_name = os.path.basename(image['filename'])
            for annot in image['annotations']:
                x, y, width, height = annot['x'], annot['y'], annot[
                    'width'], annot['height']
                label = annot['class']
                if resize is not None:
                    img = load_image(
                        os.path.join(data_folder, label, img_name))
                    size = np.array(img.shape[:2])
                    aspect = size.astype(
                        np.float32) / np.array(resize).astype(np.float32)
                    # aspect[1] scales the horizontal axis, aspect[0] the
                    # vertical axis
                    x, width = x / aspect[1], width / aspect[1]
                    y, height = y / aspect[0], height / aspect[0]
                # make sure that coordinates are valid
                bbox = (max(0, int(round(x))),
                        max(0, int(round(y))),
                        int(round(width)),
                        int(round(height)),
                        label)
                if not include_class:
                    bbox = bbox[:4]
                bboxes[img_name].append(bbox)
    return bboxes
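# Usage sketch for get_bounding_boxes() with hypothetical folder names:
# rescale the Sloth annotations to 720x1280 images and iterate over the result.
boxes = get_bounding_boxes('annotations', resize=(720, 1280),
                           data_folder='data/train', include_class=True)
for img_name, img_boxes in boxes.items():
    for x, y, w, h, cls in img_boxes:
        print(img_name, cls, x, y, w, h)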
def main():
    # check command-line arguments
    if len(sys.argv) != 3:
        print('usage: {} image_path output_path'.format(sys.argv[0]))
        return

    # constants
    image_path = sys.argv[1]
    output_path = sys.argv[2]
    num_channels = cfg.config['num_channels']
    classes = cfg.config['classes']
    num_classes = len(classes)
    height = cfg.config['height']
    width = cfg.config['width']
    anchors = cfg.config['anchors']
    num_anchors = len(anchors[0])
    nms_iou = cfg.config['NMS_IoU']
    confidency = cfg.config['confidency']
    cuda = cfg.config['CUDA']
    weight_path = cfg.config['path']['detect_weight']

    # network
    net = model.YOLOv3(num_channels, num_classes, num_anchors)
    if cuda:
        net = net.cuda()
    net.load_state_dict(torch.load(weight_path))
    print('Loaded weights from {}.'.format(weight_path))

    # detection
    image = pre.load_image(image_path, height, width, cuda)
    prediction = detect(net, image, anchors, confidency, nms_iou, cuda)

    # write prediction into output
    write_prediction(prediction, output_path, height, width)
    return
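# The usual entry-point guard, assuming this detection script is run directly,
# e.g. `python detect.py input.jpg output.txt` (script and file names are
# assumptions).
if __name__ == '__main__':
    main()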
def main():
    # constants
    num_channels = cfg.config['num_channels']
    classes = cfg.config['classes']
    num_classes = len(classes)
    height = cfg.config['height']
    width = cfg.config['width']
    anchors = cfg.config['anchors']
    num_anchors = len(anchors[0])
    nms_iou = cfg.config['NMS_IoU']
    confidency = cfg.config['confidency']
    cuda = cfg.config['CUDA']
    weight_path = cfg.config['path']['detect_weight']

    # unloaded net
    net = None

    # main loop
    while True:
        # print usage
        # print('Available commands:')
        # print('  detect image_path output_path')
        # print('  train')
        # print('  test')
        # print('  quit | q | exit')
        print('The Detector is Ready.')

        # get user input
        with open('pipe', 'r') as pipe:
            command = pipe.read().replace('\n', '')

        # condition to quit
        if command in ('q', 'quit', 'exit'):
            break

        command = command.split(' ')

        # detect
        if command[0] == 'detect':
            if len(command) != 3:
                print('usage: detect image_path output_path')
            else:
                image_path = command[1]
                output_path = command[2]

                # load net if it is not loaded
                if net is None:
                    net = model.YOLOv3(num_channels, num_classes, num_anchors)
                    if cuda:
                        net = net.cuda()
                    net.load_state_dict(torch.load(weight_path))
                    print('Loaded weights from {}'.format(weight_path))

                # load image
                image = pre.load_image(image_path, height, width, cuda)

                # predict and write bbox
                prediction = detect.detect(net, image, anchors,
                                           confidency, nms_iou, cuda)
                detect.write_prediction(prediction, output_path,
                                        height, width)

        # train
        elif command[0] == 'train':
            if len(command) != 1:
                print('usage: train')
            else:
                train.main()

        # test
        elif command[0] == 'test':
            if len(command) != 1:
                print('usage: test')
            else:
                test.main()

        # show usage
        else:
            print('{}: Unknown command.'.format(command[0]))
os.mkdir(mouth_usr_path)

# get the user's images
imgList = os.listdir(user_path)
for img in imgList:
    # check the file extension
    if img[-3:] != image_extension:
        continue

    # define the input image path
    image_path = os.path.join(user_path, img)
    print(datetime.now().strftime('%d/%m/%Y %H:%M:%S')
          + " - Current image " + image_path)

    if processed_before(eyebrows_usr_path, eyes_usr_path, nose_usr_path,
                        mouth_usr_path, img):
        continue

    # load the input image
    image = preprocess.load_image(image_path)

    coords_path = os.path.join(coords_usr_path, img.replace(".jpg", ".csv"))
    if not os.path.exists(coords_path):
        # if openface did not find landmarks, fall back to Dlib
        print("openface did not find landmarks")

        # detect face using Dlib
        dets = detector(image, 1)
        if len(dets) <= 0:
            print("face not found " + str(len(dets)))
            d = dlib.rectangle(0, 0, image.shape[1], image.shape[0])
        else:
            d = find_biggest_face(dets)
        shape = predictor(image, d)
def main():
    # constants
    num_channels = cfg.config['num_channels']
    classes = cfg.config['classes']
    num_classes = len(classes)
    height = cfg.config['height']
    width = cfg.config['width']
    anchors = cfg.config['anchors']
    num_anchors = len(anchors[0])
    confidency = cfg.config['confidency']
    tp_iou = cfg.config['TP_IoU']
    nms_iou = cfg.config['NMS_IoU']
    cuda = cfg.config['CUDA']
    target_dir = cfg.config['path']['test']
    image_dir = cfg.config['path']['image']
    weight_dir = cfg.config['path']['weight_test']
    image_paths = pre.load_image_paths(image_dir, target_dir)
    target_paths = pre.load_dir_paths(target_dir)
    weight_paths = pre.load_dir_paths(weight_dir)
    num_images = len(image_paths)

    # network
    net = model.YOLOv3(num_channels, num_classes, num_anchors)
    if cuda:
        net = net.cuda()

    # calculate loss and mAP for each weight file
    for weight_path in weight_paths:
        net.load_state_dict(torch.load(weight_path))
        net.eval()
        loss_giou = 0.0
        loss_obj = 0.0
        loss_cls = 0.0
        loss_blc = 0.0
        predictions = []
        targets = []
        t0 = time.time()

        for i in range(num_images):
            # load image and target
            image = pre.load_image(image_paths[i], height, width, cuda)
            image = image.unsqueeze(0)
            target = pre.load_targets(target_paths[i:i + 1], num_classes,
                                      height, width, cuda)

            # predict bbox
            prediction = [pred.detach() for pred in net(image)]

            # calculate loss
            loss = post.calculate_loss(prediction, target, anchors,
                                       height, width, cuda).detach()
            loss_giou += float(loss[0])
            loss_obj += float(loss[1])
            loss_cls += float(loss[2])
            loss_blc += float(loss[3])

            # keep detached predictions and targets for the AP computation
            prediction = post.postprocess(prediction, anchors, height, width,
                                          confidency, nms_iou, cuda)
            predictions.extend([pred.detach() for pred in prediction])
            targets.extend([tagt.detach() for tagt in target])

        # normalize loss
        loss_giou /= num_images
        loss_obj /= num_images
        loss_cls /= num_images
        loss_blc /= num_images

        # calculate AP
        AP = post.calculate_AP(predictions, targets, tp_iou, cuda)
        elapsed_time = time.time() - t0

        # report total loss, per-class AP and mAP; the GIoU, objectness,
        # class and balance components are kept separately above if a finer
        # breakdown is needed
        total_loss = loss_giou + loss_obj + loss_cls + loss_blc
        ap_str = ', '.join('{:.2f}'.format(ap * 100) for ap in AP)
        print('Weight: {}, Elapsed Time: {:.2f}s, Loss: {:.2f}, '
              'AP: {}, mAP: {:.2f}'.format(weight_path, elapsed_time,
                                           total_loss, ap_str,
                                           100 * sum(AP) / len(AP)))
eyebrows_dir, eyes_dir, nose_dir, mouth_dir = \
    preprocess.create_facial_parts_dir(output_dir)

# start at the second line because the first one is the file header
for user_idx in range(1, 3756):
    line = lines[user_idx].rstrip()
    fields = line.split(',')
    user = fields[0] + image_extension  # first column has the image file

    # define the input image path
    image_path = os.path.join(image_dir, user)
    # define the output image path
    user = user.replace(image_extension, output_extension)

    # load the input image
    image = preprocess.load_image(image_path)
    print(datetime.now().strftime('%d/%m/%Y %H:%M:%S')
          + " - Current image " + image_path)

    # read landmark annotations
    coordinates = numpy.zeros((76, 2), numpy.float64)
    for coord_idx in range(1, 77):
        coordinates[coord_idx - 1, 0] = float(fields[coord_idx * 2])
        coordinates[coord_idx - 1, 1] = float(fields[coord_idx * 2 + 1])

    # compute the rotation angle between two landmarks and align the image
    angle = preprocess.get_angle(coordinates[29, 0], coordinates[29, 1],
                                 coordinates[34, 0], coordinates[34, 1])
    image, coordinates = preprocess.rotate_image_and_coordinates(
        image, coordinates, angle)

    # segment the eyes region
    coords = coordinates[27:37, 0:2]
    coords = numpy.concatenate((coords, coordinates[68:76, 0:2]), axis=0)
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text


device = 'cuda' if torch.cuda.is_available() else 'cpu'
encoder_model = Encoder()
decoder_model = CaptionModel(vocab_size).to(device)
decoder_model.load_state_dict(torch.load(args.checkpoint))
decoder_model.eval()

for image_name in os.listdir("evaluate/images"):
    print(image_name)
    image = load_image(os.path.join("evaluate/images/", image_name), size=224)
    # convert the image pixels to a tensor
    image = transforms.ToTensor()(image)
    # add a batch dimension for the model
    image = image.unsqueeze(0)
    # prepare the image for the VGG model
    image = normalize_batch(image)
    features = encoder_model(image)

    # generate a caption and save the image with the caption as its title
    predicted_sentence = generate_desc(decoder_model, tokenizer, features,
                                       max_length)
    img = plt.imread(os.path.join("evaluate/images/", image_name))
    plt.imshow(img)
    plt.axis('off')
    plt.title(predicted_sentence)
    plt.savefig(os.path.join("evaluate/results/", image_name + '_result.jpg'))