Example No. 1
def infer(test_path, output_path, config):
    '''Run inference on every image in the test directory.
    :param test_path: Test data directory path
    :param output_path: Output directory path
    :param config: config parser
    '''
    for name in os.listdir(test_path):
        # Load image
        image = preprocess.load_image(os.path.join(test_path, name))
        net_input = preprocess.preinference(image, config)

        # Create model
        model = get_model(config, net_input[0].shape)
            
        # Load model
        load_model(model, config)

        # Make prediction
        prediction = model.predict(net_input, verbose=1)[0]
        
        # Clip result
        prediction = preprocess.postinference(config, prediction, image)

        # Save images
        preprocess.save_image(name, prediction, output_path)
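A minimal call sketch for the function above, assuming infer is importable and that a standard configparser object carries whatever keys get_model and preprocess expect; the config file and directories here are hypothetical:

import configparser

config = configparser.ConfigParser()
config.read('config.ini')                  # hypothetical config file
infer('data/test', 'data/output', config)  # hypothetical directories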
Example No. 2
def predict(image_name,
            data_dir="/home/shagun/projects/Image-Caption-Generator/data/",
            weights_path=None,
            mode="test"):
    '''Method to predict the caption for a given image.
    weights_path is the path to the .h5 file (model)'''

    image_path = data_dir + "images/" + image_name
    vgg_model = load_vgg16()
    vgg_embedding = vgg_model.predict(load_image(image_path))
    image_embeddings = [vgg_embedding]

    config_dict = generate_config(data_dir=data_dir, mode=mode)
    print(config_dict)

    model = create_model(config_dict=config_dict, compile_model=False)

    model.load_weights(data_dir + "model/" + weights_path)

    tokenizer = get_tokenizer(config_dict=config_dict, data_dir=data_dir)

    index_to_word = {v: k for k, v in tokenizer.word_index.items()}

    for image_embedding in image_embeddings:
        gen_captions(config=config_dict,
                     model=model,
                     image_embedding=image_embedding,
                     tokenizer=tokenizer,
                     num_captions=2,
                     index_to_word=index_to_word)
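A hedged usage sketch, assuming predict is importable and that a trained Keras .h5 weights file sits under data/model/; the image and weights file names are invented for illustration:

predict("example.jpg",
        data_dir="/home/shagun/projects/Image-Caption-Generator/data/",
        weights_path="caption_model.h5",
        mode="test")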
Example No. 3
def visualize_bbox(img, bounding_box, segm=None, img_size=(1280, 720)):
    """
    Visualize a bounding box in an image with optional segmentation

    # Params:
    - img: string of the filepath or numpy array
    - bounding_box: (x, y, width, height) or None
    - segm: numpy array of segmentation, same size as image
    - img_size: if a path is given for the image, the image will
                be resized to this (width, height)
    """
    if isinstance(img, str):
        img = load_image(img)
        img = cv2.resize(img, img_size, interpolation=cv2.INTER_LINEAR)

    plt.figure(figsize=(10, 7))
    plt.imshow(img)
    if segm is not None:
        plt.imshow(segm, alpha=0.3)

    if bounding_box is not None:
        (x, y, width, height) = bounding_box
        ax = plt.gca()
        ax.add_patch(
            Rectangle((x, y),
                      width,
                      height,
                      fill=False,
                      edgecolor='red',
                      linewidth=3))
    else:
        print('No bounding box')
    plt.show()
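A short usage sketch, assuming visualize_bbox and its helpers (load_image, cv2, matplotlib) are in scope; the file path and box coordinates are invented for illustration:

# Draw a 200x150 box anchored at (50, 80) on the resized image, with no mask
visualize_bbox('images/sample.jpg', (50, 80, 200, 150), segm=None,
               img_size=(1280, 720))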
Example No. 4
def extract_features(directory):
    # load the model
    model = Encoder()
    # model.to(device)
    model.eval()
    # extract features from each photo
    features = dict()
    for i, name in enumerate(listdir(directory)):
        # load an image from file
        filename = directory + '/' + name
        image = load_image(filename, size=224)
        # convert the image pixels to a numpy array
        image = transforms.ToTensor()(image)
        # reshape data for the model
        image = image.unsqueeze(0)
        # prepare the image for the VGG model
        image = normalize_batch(image)
        # get features
        feature = model(image)
        # get image id
        image_id = name.split('.')[0]
        # store feature
        features[image_id] = feature
        #         print('>%s' % name)
        if i % 50 == 0:
            print("{} image done.".format(i))
    return features
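One way the returned dictionary might be persisted for later training, assuming extract_features is importable; the image directory and pickle path are hypothetical:

import pickle

features = extract_features('data/images')   # hypothetical image directory
with open('data/features.pkl', 'wb') as handle:
    pickle.dump(features, handle)             # save features to disk
print('Extracted features for {} images.'.format(len(features)))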
Example No. 5
def get_bounding_boxes(bbox_folder,
                       resize=None,
                       data_folder='',
                       include_class=False):
    """
    Read bounding boxes from json files created by Sloth

    # Params
    - bbox_folder : folder containing json files with bounding boxes
    - resize : None or tuple (height, width). If set, the bounding box
               will be rescaled to this size.
    - data_folder: If resize is not None, this folder will be used for
                   getting the size of each image. This should be the
                   original training data.
    - include_class : If true, the fifth number returned will be the class

    # Returns
    - A dictionary mapping filename to a list of bounding boxes
      of the form (x, y, width, height)
    """
    bboxes = defaultdict(list)
    file_paths = glob.glob(os.path.join(bbox_folder, '*.json'))
    if len(file_paths) == 0:
        raise ValueError('No bounding boxes found in %s' % bbox_folder)
    for file_path in file_paths:
        with open(file_path) as file:
            data = json.load(file)
            for image in data:
                img_name = os.path.basename(image['filename'])
                for annot in image['annotations']:
                    x, y = annot['x'], annot['y']
                    width, height = annot['width'], annot['height']
                    label = annot['class']
                    if resize is not None:
                        img = load_image(
                            os.path.join(data_folder, label, img_name))
                        size = np.array(img.shape[:2])
                        aspect = size.astype(
                            np.float32) / np.array(resize).astype(np.float32)
                        x, width = x / aspect[1], width / aspect[1]
                        y, height = y / aspect[0], height / aspect[0]

                    # make sure that coordinates are valid
                    bbox = (max(0, int(round(x))),
                            max(0, int(round(y))),
                            int(round(width)),
                            int(round(height)), label)
                    if not include_class:
                        bbox = bbox[:4]
                    bboxes[img_name].append(bbox)
    return bboxes
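A hedged usage sketch, assuming the Sloth JSON annotations and the original training images live in the hypothetical folders below:

bboxes = get_bounding_boxes('annotations/',
                            resize=(360, 640),
                            data_folder='train/',
                            include_class=True)
for img_name, boxes in bboxes.items():
    print(img_name, boxes)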
Example No. 6
def main():
    # assertion
    if len(sys.argv) != 3:
        print('usage: {} image_path output_path'.format(sys.argv[0]))
        return

    # constants
    image_path = sys.argv[1]
    output_path = sys.argv[2]
    num_channels = cfg.config['num_channels']
    classes = cfg.config['classes']
    num_classes = len(classes)
    height = cfg.config['height']
    width = cfg.config['width']
    anchors = cfg.config['anchors']
    num_anchors = len(anchors[0])
    nms_iou = cfg.config['NMS_IoU']
    confidency = cfg.config['confidency']
    cuda = cfg.config['CUDA']
    weight_path = cfg.config['path']['detect_weight']

    # network
    net = model.YOLOv3(num_channels, num_classes, num_anchors)
    if cuda:
        net = net.cuda()
    net.load_state_dict(torch.load(weight_path))
    print('Loaded weights from {}.'.format(weight_path))

    # detection
    image = pre.load_image(image_path, height, width, cuda)
    prediction = detect(net, image, anchors, confidency, nms_iou, cuda)

    # write prediction into output
    write_prediction(prediction, output_path, height, width)

    return
Example No. 7
def main():
    # constants
    num_channels = cfg.config['num_channels']
    classes = cfg.config['classes']
    num_classes = len(classes)
    height = cfg.config['height']
    width = cfg.config['width']
    anchors = cfg.config['anchors']
    num_anchors = len(anchors[0])
    nms_iou = cfg.config['NMS_IoU']
    confidency = cfg.config['confidency']
    cuda = cfg.config['CUDA']
    weight_path = cfg.config['path']['detect_weight']

    # unloaded net
    net = None

    # main loop
    while True:
        # print usage
        # print('Available commands:')
        # print('  detect image_path output_path')
        # print('  train')
        # print('  test')
        # print('  quit | q | exit')
        print('The Detector is Ready.')

        # get user input
        with open('pipe', 'r') as pipe:
            command = pipe.read().replace('\n', '')

        # condition to quit
        if command == 'q':
            break
        elif command == 'quit':
            break
        elif command == 'exit':
            break

        command = command.split(' ')

        # detect
        if command[0] == 'detect':
            if len(command) != 3:
                print('usage: detect image_path output_path')
            else:
                image_path = command[1]
                output_path = command[2]

                # load net if it is not loaded
                if net is None:
                    net = model.YOLOv3(num_channels, num_classes, num_anchors)
                    if cuda:
                        net = net.cuda()
                    net.load_state_dict(torch.load(weight_path))
                    print('Loaded weights from {}'.format(weight_path))

                # load image
                image = pre.load_image(image_path, height, width, cuda)

                # predict and write bbox
                prediction = detect.detect(net, image, anchors, confidency,
                                           nms_iou, cuda)
                detect.write_prediction(prediction, output_path, height, width)

        # train
        elif command[0] == 'train':
            if len(command) != 1:
                print('usage: train')
            else:
                train.main()

        # test
        elif command[0] == 'test':
            if len(command) != 1:
                print('usage: test')
            else:
                test.main()

        # show usage
        else:
            print('{}: Unknown command.'.format(command[0]))
Example No. 8
        os.mkdir(mouth_usr_path)

    imgList = os.listdir(user_path) # get the user's images

    for img in imgList:
        if not img.endswith(image_extension): # check the file extension
            continue

        image_path = os.path.join(user_path, img) # define the input image path
        print(datetime.now().strftime('%d/%m/%Y %H:%M:%S') + " - Current image " + image_path)

        if processed_before(eyebrows_usr_path, eyes_usr_path, nose_usr_path, mouth_usr_path, img):
            continue

        # load the input image
        image = preprocess.load_image(image_path)
        coords_path = os.path.join(coords_usr_path, img.replace(".jpg", ".csv"))

        if not os.path.exists(coords_path): # if openface did not find landmarks
            print("openface did not find landmarks")
            
            # Detect face using Dlib
            dets = detector(image, 1)

            if (len(dets) <= 0):
                print("face not found " + str(len(dets)))
                d = dlib.rectangle(0,0,image.shape[1], image.shape[0])
            else:
                d = find_biggest_face(dets)

            shape = predictor(image, d)
Example No. 9
def main():
    # constants
    num_channels = cfg.config['num_channels']
    classes = cfg.config['classes']
    num_classes = len(classes)
    height = cfg.config['height']
    width = cfg.config['width']
    anchors = cfg.config['anchors']
    num_anchors = len(anchors[0])
    confidency = cfg.config['confidency']
    tp_iou = cfg.config['TP_IoU']
    nms_iou = cfg.config['NMS_IoU']
    cuda = cfg.config['CUDA']
    target_dir = cfg.config['path']['test']
    image_dir = cfg.config['path']['image']
    weight_dir = cfg.config['path']['weight_test']
    image_paths = pre.load_image_paths(image_dir, target_dir)
    target_paths = pre.load_dir_paths(target_dir)
    weight_paths = pre.load_dir_paths(weight_dir)
    num_images = len(image_paths)

    # network
    net = model.YOLOv3(num_channels, num_classes, num_anchors)
    if cuda:
        net = net.cuda()

    # calculate loss or mAP for each weights
    for weight_path in weight_paths:
        net.load_state_dict(torch.load(weight_path))
        net.eval()
        loss_giou = 0.0
        loss_obj = 0.0
        loss_cls = 0.0
        loss_blc = 0.0
        predictions = []
        targets = []
        t0 = time.time()

        for i in range(num_images):
            # load image and target
            image = pre.load_image(image_paths[i], height, width, cuda)
            image = image.unsqueeze(0)
            target = pre.load_targets(target_paths[i:i + 1], num_classes,
                                      height, width, cuda)

            # predict bbox
            prediction = [pred.detach() for pred in net(image)]

            # calculate loss
            loss = post.calculate_loss(prediction, target, anchors, height,
                                       width, cuda).detach()
            loss_giou += float(loss[0])
            loss_obj += float(loss[1])
            loss_cls += float(loss[2])
            loss_blc += float(loss[3])

            # save prediction and target as numpy array
            prediction = post.postprocess(prediction, anchors, height, width,
                                          confidency, nms_iou, cuda)
            predictions.extend([pred.detach() for pred in prediction])
            targets.extend([tagt.detach() for tagt in target])

        # normalize loss
        loss_giou /= num_images
        loss_obj /= num_images
        loss_cls /= num_images
        loss_blc /= num_images

        # calculate AP
        AP = post.calculate_AP(predictions, targets, tp_iou, cuda)

        elapsed_time = time.time() - t0

        print(('Weight: {}, Elapsed Time: {:.2f}s, '
               'Loss: {:.2f}, AP: ' +
               ', '.join(['{:.2f}'.format(ap * 100) for ap in AP]) +
               ', mAP: {:.2f}'.format(100 * sum(AP) / len(AP))).format(
                   weight_path, elapsed_time,
                   loss_giou + loss_obj + loss_cls + loss_blc))
Example No. 10
eyebrows_dir, eyes_dir, nose_dir, mouth_dir = preprocess.create_facial_parts_dir(output_dir)

# start at the second line because the first one is the file header
for user_idx in range(1, 3756):
    line = lines[user_idx].rstrip()
    fields = line.split(',')
    user = fields[0] + image_extension # first column has the image file
    
    # define the input image path
    image_path = os.path.join(image_dir, user)

    # define the output image path
    user = user.replace(image_extension, output_extension)
    
    # load the input image
    image = preprocess.load_image(image_path)
    print(datetime.now().strftime('%d/%m/%Y %H:%M:%S') + " - Current image " + image_path)

    # read landmark annotations
    coordinates = numpy.zeros((76, 2), numpy.float64)

    for coord_idx in range(1, 77):
        coordinates[coord_idx - 1, 0] = float(fields[coord_idx * 2])
        coordinates[coord_idx - 1, 1] = float(fields[coord_idx * 2 + 1])
   
    angle = preprocess.get_angle(coordinates[29, 0], coordinates[29, 1], coordinates[34, 0], coordinates[34, 1])
    image,coordinates = preprocess.rotate_image_and_coordinates(image, coordinates, angle)

    # segment the eyes region
    coords = coordinates[27:37, 0:2]
    coords = numpy.concatenate((coords, coordinates[68:76, 0:2]), axis=0)
Example No. 11
        # append as input for generating the next word
        in_text += ' ' + word
        # stop if we predict the end of the sequence
        if word == 'endseq':
            break
    return in_text


device = 'cuda' if torch.cuda.is_available() else 'cpu'
encoder_model = Encoder()
encoder_model.eval()
decoder_model = CaptionModel(vocab_size).to(device)
decoder_model.load_state_dict(torch.load(args.checkpoint))
decoder_model.eval()
for image_name in os.listdir("evaluate/images"):
    print(image_name)
    image = load_image(os.path.join("evaluate/images/", image_name), size=224)
    # convert the image pixels to a numpy array
    image = transforms.ToTensor()(image)
    # reshape data for the model
    image = image.unsqueeze(0)
    # prepare the image for the VGG model
    image = normalize_batch(image)
    features = encoder_model(image)
    predicted_sentence = generate_desc(decoder_model, tokenizer, features,
                                       max_length)
    img = plt.imread(os.path.join("evaluate/images/", image_name))
    plt.imshow(img)
    plt.axis('off')
    plt.title(predicted_sentence)
    plt.savefig(os.path.join("evaluate/results/", image_name + '_result.jpg'))