Example #1
def preprocess(img):
    h, w = img.shape[:2]

    resize = 256
    if h > w:
        h = resize * h // w
        w = resize
    else:
        w = resize * w // h
        h = resize

    img = np.array(Image.fromarray(img).resize((w, h), Image.BILINEAR))

    if h > IMAGE_SIZE:
        pad = (h - IMAGE_SIZE) // 2
        img = img[pad:pad + IMAGE_SIZE, :]
    if w > IMAGE_SIZE:
        pad = (w - IMAGE_SIZE) // 2
        img = img[:, pad:pad + IMAGE_SIZE]

    img = normalize_image(img, normalize_type='ImageNet')

    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, axis=0)

    return img
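A quick, hedged usage sketch for Example #1, assuming the module-level constant IMAGE_SIZE is 224; the random array stands in for a decoded RGB image.

import numpy as np

dummy = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # fake RGB frame
batch = preprocess(dummy)       # shorter side -> 256, center crop -> IMAGE_SIZE
print(batch.shape)              # expected: (1, 3, 224, 224) when IMAGE_SIZE == 224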
Example #2
def preprocess(img, mask=False):
    h, w = img.shape[:2]
    size = IMAGE_RESIZE
    crop_size = IMAGE_SIZE

    # resize
    if h > w:
        size = (size, int(size * h / w))
    else:
        size = (int(size * w / h), size)
    img = np.array(
        Image.fromarray(img).resize(
            size, resample=Image.ANTIALIAS if not mask else Image.NEAREST))

    # center crop
    h, w = img.shape[:2]
    pad_h = (h - crop_size) // 2
    pad_w = (w - crop_size) // 2
    img = img[pad_h:pad_h + crop_size, pad_w:pad_w + crop_size, :]

    # normalize
    if not mask:
        img = normalize_image(img.astype(np.float32), 'ImageNet')
    else:
        img = img / 255

    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = np.expand_dims(img, axis=0)

    return img
Example #3
def preprocess(img):
    h, w = (IMAGE_HEIGHT, IMAGE_WIDTH)
    im_h, im_w, _ = img.shape

    max_orig_size = max(im_h, im_w)
    min_orig_size = min(im_h, im_w)
    if max_orig_size / min_orig_size * h > w:
        size = int(round(w * min_orig_size / max_orig_size))
    else:
        size = h

    if im_h > im_w:
        scale = size / im_w
        ow = size
        oh = (size * im_h) // im_w
    else:
        scale = size / im_h
        oh = size
        ow = (size * im_w) // im_h
    if ow != im_w or oh != im_h:
        img = np.array(Image.fromarray(img).resize((ow, oh), Image.BILINEAR))

    img = normalize_image(img, normalize_type='ImageNet')

    # padding
    new_img = np.zeros((h, w, 3))
    x = (w - ow) // 2
    y = (h - oh) // 2
    new_img[y:y + oh, x:x + ow, :] = img
    img = new_img

    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = img.astype(np.float32)

    return img, (x, y), scale
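The (x, y) offset and scale returned by Example #3 allow predictions made on the padded model input to be mapped back onto the source image; a minimal sketch (names here are illustrative only):

def to_original_coords(px, py, pad_xy, scale):
    # undo the padding offset first, then the resize
    x0, y0 = pad_xy
    return (px - x0) / scale, (py - y0) / scale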
Example #4
def preprocess(img):
    img = img.astype(np.float32)
    img = normalize_image(img, normalize_type='ImageNet')
    img = img.transpose((2, 0, 1))  # HWC -> CHW
    img = np.expand_dims(img, axis=0)

    return img
Example #5
def predict(landmark_detector, face_detector, img):
    if face_detector is not None:
        bboxes = detect_faces(img, face_detector)
    else:
        h, w = img.shape[:2]
        bboxes = [np.array([0, 0, w - 1, h - 1, 1])]
    bboxes = np.array(bboxes)

    pose_results = []
    if len(bboxes) == 0:
        return pose_results

    bboxes_xywh = xyxy2xywh(bboxes)

    img_size = (256, 256)
    batch_data = []
    img_metas = []
    for bbox in bboxes_xywh:
        c, s = box2cs(bbox)
        r = 0
        img_metas.append({
            "center": c,
            "scale": s,
        })
        trans = get_affine_transform(c, s, r, img_size)
        _img = cv2.warpAffine(img,
                              trans, (img_size[0], img_size[1]),
                              flags=cv2.INTER_LINEAR)

        _img = normalize_image(_img[:, :, ::-1], 'ImageNet')
        batch_data.append(_img)

    batch_data = np.asarray(batch_data)
    batch_data = batch_data.transpose((0, 3, 1, 2))

    output = landmark_detector.predict([batch_data])
    heatmap = output[0]
    if 1:  # do flip
        batch_data = batch_data[:, :, :, ::-1]  # horizontal flip
        output = landmark_detector.predict([batch_data])
        flipped_heatmap = output[0]

        flip_pairs = [[0, 4], [1, 3], [5, 10], [6, 9], [7, 8], [11, 19],
                      [12, 18], [13, 17], [14, 22], [15, 21], [16, 20],
                      [24, 26]]
        flipped_heatmap = flip_back(flipped_heatmap, flip_pairs)

        # feature is not aligned, shift flipped heatmap for higher accuracy
        flipped_heatmap[:, :, :, 1:] = flipped_heatmap[:, :, :, :-1]

        heatmap = (heatmap + flipped_heatmap) * 0.5

    keypoint_result = keypoint_decode(heatmap, img_metas)

    return keypoint_result, bboxes
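predict() above relies on an xyxy2xywh helper that is not shown. A plausible sketch follows, assuming boxes are stored as [x1, y1, x2, y2, score] rows and that extra columns should be kept untouched; the repository's own helper may use slightly different conventions.

import numpy as np

def xyxy2xywh_sketch(bboxes):
    bboxes = np.asarray(bboxes, dtype=np.float32).copy()
    bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 0]  # width  = x2 - x1
    bboxes[:, 3] = bboxes[:, 3] - bboxes[:, 1]  # height = y2 - y1
    return bboxes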
Example #6
def midas_imread(image_path):
    if not os.path.isfile(image_path):
        print(f'[ERROR] {image_path} not found.')
        sys.exit()
    image = cv2.imread(image_path)
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = normalize_image(image, 'ImageNet')

    return midas_resize(image, IMAGE_HEIGHT, IMAGE_WIDTH)
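midas_resize is not shown in this example. The sketch below assumes MiDaS-style behaviour: scale the image to fit the target size while keeping the aspect ratio, then snap each side to a multiple of 32; the actual helper may differ.

import cv2

def midas_resize_sketch(image, target_h, target_w, multiple=32):
    h, w = image.shape[:2]
    scale = min(target_h / h, target_w / w)
    new_h = max(multiple, int(round(h * scale / multiple)) * multiple)
    new_w = max(multiple, int(round(w * scale / multiple)) * multiple)
    return cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_CUBIC)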
Example #7
def midas_imread(image_path):
    if not os.path.isfile(image_path):
        logger.error(f'{image_path} not found.')
        sys.exit()
    image = cv2.imread(image_path)
    if image.ndim == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = normalize_image(image, 'ImageNet')

    h, w = (IMAGE_HEIGHT, IMAGE_WIDTH) if not args.v21 or args.model_type == 'large' \
               else (IMAGE_HEIGHT_SMALL, IMAGE_WIDTH_SMALL)
    return midas_resize(image, h, w)
Example #8
def preprocess(img, gray=False):
    if gray:
        img = img / 255
        img = (img - 0.5) / 0.5
        img = img[:, :, None]
    else:
        img = normalize_image(img, normalize_type='ImageNet')

    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = np.expand_dims(img, axis=0)
    img = img.astype(np.float32)

    return img
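In the gray branch of Example #8, (img / 255 - 0.5) / 0.5 is the same mapping as the '127.5' normalization mode (output range -1 to 1); a tiny check:

import numpy as np

x = np.array([0.0, 127.5, 255.0])
print((x / 255 - 0.5) / 0.5)  # [-1.  0.  1.]
print(x / 127.5 - 1.0)        # [-1.  0.  1.]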
Example #9
def preprocess_aug(img, mask=False, angle_range=[-10, 10], return_refs=False):
    h, w = img.shape[:2]
    size = IMAGE_RESIZE
    crop_size = IMAGE_SIZE

    # resize
    if h > w:
        size = (size, int(size * h / w))
    else:
        size = (int(size * w / h), size)
    img = np.array(
        Image.fromarray(img).resize(
            size, resample=Image.ANTIALIAS if not mask else Image.NEAREST))

    # for visualize
    img_resized = img.copy()

    # random rotate
    if not mask:
        h, w = img.shape[:2]
        angle = np.random.randint(angle_range[0], angle_range[1] + 1)
        rot_mat = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1)
        img = cv2.warpAffine(src=img,
                             M=rot_mat,
                             dsize=(w, h),
                             borderMode=cv2.BORDER_REPLICATE,
                             flags=cv2.INTER_LINEAR)

    # random crop
    if not mask:
        h, w = img.shape[:2]
        pad_h = np.random.randint(0, (h - crop_size))
        pad_w = np.random.randint(0, (w - crop_size))
        img = img[pad_h:pad_h + crop_size, pad_w:pad_w + crop_size, :]

    # normalize
    if not mask:
        img = normalize_image(img.astype(np.float32), 'ImageNet')
    else:
        img = img / 255

    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = np.expand_dims(img, axis=0)

    if return_refs:
        return img, img_resized, angle, pad_h, pad_w
    else:
        return img
Example #10
def preprocess_frame(frame,
                     input_height,
                     input_width,
                     data_rgb=True,
                     normalize_type='255'):
    """
    Pre-process the frames taken from the webcam to input to ailia.

    Parameters
    ----------
    frame: numpy array
    input_height: int
        ailia model input height
    input_width: int
        ailia model input width
    data_rgb: bool (default: True)
        Convert as rgb image when True, as gray scale image when False.
        Only `data` will be influenced by this configuration.
    normalize_type: string (default: '255')
        Normalize type should be chosen from the types below.
        - '255': simply divide by 255.0
        - '127.5': output range: -1 to 1
        - 'ImageNet': normalize by mean and std of ImageNet
        - 'None': no normalization

    Returns
    -------
    img: numpy array
        Image with the proportions of height and width
        adjusted by padding for ailia model input.
    data: numpy array
        Input data for ailia
    """
    img, resized_img = adjust_frame_size(frame, input_height, input_width)

    if data_rgb:
        resized_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB)

    data = normalize_image(resized_img, normalize_type)

    if data_rgb:
        data = np.rollaxis(data, 2, 0)
        data = np.expand_dims(data, axis=0).astype(np.float32)
    else:
        data = cv2.cvtColor(data.astype(np.float32), cv2.COLOR_BGR2GRAY)
        data = data[np.newaxis, np.newaxis, :, :]
    return img, data
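The normalize_image helper itself is not included in these examples. A minimal sketch of what it might do, assuming the conventional ImageNet statistics and the four modes listed in the docstring above (the real implementation may differ):

import numpy as np

def normalize_image_sketch(img, normalize_type='255'):
    if normalize_type == 'None':
        return img
    if normalize_type == '255':
        return img / 255.0                      # output range: 0 to 1
    if normalize_type == '127.5':
        return img / 127.5 - 1.0                # output range: -1 to 1
    if normalize_type == 'ImageNet':
        mean = np.array([0.485, 0.456, 0.406])  # assumed ImageNet mean (RGB, 0-1 scale)
        std = np.array([0.229, 0.224, 0.225])   # assumed ImageNet std (RGB, 0-1 scale)
        return (img / 255.0 - mean) / std
    raise ValueError(f'unknown normalize_type: {normalize_type}')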
Example #11
def face_detect(img, face_net):
    IMAGE_BLAZE_SIZE = 128

    img_0 = img

    img = normalize_image(img, normalize_type='127.5')
    img = cv2.resize(img, (IMAGE_BLAZE_SIZE, IMAGE_BLAZE_SIZE))
    img = img.transpose((2, 0, 1))
    img = np.expand_dims(img, axis=0)

    output = face_net.predict([img])
    detections = but.postprocess(output)
    detections = detections[0]

    # sort by confidence
    detections = sorted(detections, key=lambda x: x[16], reverse=True)
    if len(detections) == 0:
        return None, (0, 0)

    detection = detections[0]

    h, w = img_0.shape[:2]
    ymin = int(detection[0] * h)
    xmin = int(detection[1] * w)
    ymax = int(detection[2] * h)
    xmax = int(detection[3] * w)

    h = ymax - ymin
    w = xmax - xmin
    if h > w:
        p = (h - w) // 2
        w = h
        xmin -= p
    else:
        p = (w - h) // 2
        h = w
        ymin -= p

    img = img_0[ymin:ymin + h, xmin:xmin + w]

    h2, w2 = img.shape[:2]
    if h != h2 or w != w2:
        return None, (0, 0)

    return img, (ymin, xmin)
Example #12
def preprocess(img, image_shape):
    h, w = image_shape
    im_h, im_w, _ = img.shape

    # keep_aspect
    scale = min(h / im_h, w / im_w)
    ow, oh = int(im_w * scale + 0.5), int(im_h * scale + 0.5)
    if ow != im_w or oh != im_h:
        img = cv2.resize(img, (ow, oh), interpolation=cv2.INTER_LINEAR)

    img = normalize_image(img, normalize_type='ImageNet')

    pad_img = np.zeros((h, w, 3), dtype=img.dtype)
    pad_img[:oh, :ow, ...] = img
    img = pad_img

    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = np.expand_dims(img, axis=0)
    img = img.astype(np.float32)

    return img, scale
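A hedged usage note for Example #12: the resized image is pasted at the top-left corner of the padded canvas, so coordinates predicted on the model input map back to the source image by dividing by the returned scale.

def box_to_source(box_xyxy, scale):
    # undo the keep-aspect resize from Example #12 (padding is top-left aligned)
    return tuple(v / scale for v in box_xyxy)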
Example #13
def transform(img, pp_net):
    img_0 = img

    img = cv2.resize(img, (U2NET_IMAGE_SIZE, U2NET_IMAGE_SIZE))

    # ToTensorLab part in original repo
    img = img / np.max(img) * 255
    img = normalize_image(img, normalize_type='ImageNet')
    input_data = img.transpose((2, 0, 1))[np.newaxis, :, :, :]

    output = pp_net.predict(input_data)
    pred = output[0, 0, :, :]

    h, w = img_0.shape[:2]
    mask = cv2.resize(pred, (w, h))
    mask = np.clip(mask, 0, 1)
    mask = np.expand_dims(mask, axis=2)

    back = np.ones((h, w, 3)) * 255
    img = img_0 * mask + back * (1 - mask)

    return img
Example #14
def preprocess(img, bbox):
    image_size = (IMAGE_SIZE, IMAGE_SIZE)

    c, s = _box2cs(bbox)
    r = 0

    trans = get_affine_transform(c, s, r, image_size)
    img = cv2.warpAffine(img,
                         trans, (int(image_size[0]), int(image_size[1])),
                         flags=cv2.INTER_LINEAR)

    # normalize
    img = normalize_image(img, normalize_type='ImageNet')

    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = np.expand_dims(img, axis=0)

    img_metas = [{
        'center': c,
        'scale': s,
    }]

    return img, img_metas
Example #15
def preprocess(img, bboxs, num_pos=2):
    IMAGE_SIZE = (288, 384)

    inputs = []
    centers = []
    scales = []
    for bbox in bboxs[:num_pos]:
        c, s = box_to_center_scale(bbox, img.shape[0], img.shape[1])
        centers.append(c)
        scales.append(s)
        r = 0

        trans = get_affine_transform(c, s, r, IMAGE_SIZE)
        input = cv2.warpAffine(img,
                               trans, (IMAGE_SIZE[0], IMAGE_SIZE[1]),
                               flags=cv2.INTER_LINEAR)

        input = normalize_image(input.astype(np.float32), 'ImageNet')
        input = input.transpose(2, 0, 1)  # HWC -> CHW
        input = np.expand_dims(input, axis=0)
        inputs.append(input)

    inputs = np.vstack(inputs)
    return inputs, img, centers, scales
Example #16
def preprocess(img, image_shape):
    h, w = image_shape
    im_h, im_w, _ = img.shape

    r = min(h / im_h, w / im_w)
    oh, ow = int(im_h * r), int(im_w * r)

    resized_img = cv2.resize(
        img,
        (ow, oh),
        interpolation=cv2.INTER_LINEAR,
    )

    data = np.zeros((h, w, 3), dtype=np.uint8)
    ph, pw = (h - oh) // 2, (w - ow) // 2
    data[ph:ph + oh, pw:pw + ow] = resized_img

    data = normalize_image(data, '127.5')

    data = data.transpose((2, 0, 1))
    data = np.expand_dims(data, axis=0)
    data = data.astype(np.float32)

    return data, (ph, pw), (oh, ow)
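Example #16 also returns the pad offsets and the resized size, so dense outputs such as masks can be un-letterboxed; a small sketch, with names chosen here for illustration:

import cv2

def unletterbox(output_hw, pads, sizes, orig_hw):
    ph, pw = pads
    oh, ow = sizes
    crop = output_hw[ph:ph + oh, pw:pw + ow]           # drop the centered padding
    return cv2.resize(crop, (orig_hw[1], orig_hw[0]))  # back to the original resolution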
Example #17
def preprocess(img):
    im_h, im_w, _ = img.shape

    ow, oh = im_w, im_h
    if im_w % (1 << 7) != 0:
        ow = (((im_w >> 7) + 1) << 7)
    if im_h % (1 << 7) != 0:
        oh = (((im_h >> 7) + 1) << 7)

    pad = np.zeros((oh, ow, 3))
    pad_h = (oh - im_h) // 2
    pad_w = (ow - im_w) // 2

    # reflection padding
    pad[pad_h:pad_h + im_h, pad_w:pad_w + im_w, :] = img
    if 0 < pad_w:
        ref = img[:, ::-1, :]
        pad[pad_h:pad_h + im_h, :pad_w, :] = ref[:, -pad_w:, :]
        rem = ow - pad_w - im_w
        pad[pad_h:pad_h + im_h, -rem:, :] = ref[:, :rem, :]
    if 0 < pad_h:
        ref = pad[pad_h:pad_h + im_h, :, :][::-1]
        pad[:pad_h, ...] = ref[-pad_h:, ...]
        rem = oh - pad_h - im_h
        pad[-rem:, ...] = ref[:rem, ...]

    img = pad

    img = normalize_image(img, normalize_type='255')

    img = img[:, :, ::-1]  # BGR -> RGB
    img = img.transpose(2, 0, 1)  # HWC -> CHW
    img = np.expand_dims(img, axis=0)
    img = img.astype(np.float32)

    return img, (pad_h, pad_w)
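Because Example #17 pads the input up to a multiple of 128, the model output usually needs to be cropped back; a hedged one-liner, assuming an HWC output out and the original size (im_h, im_w):

out_cropped = out[pad_h:pad_h + im_h, pad_w:pad_w + im_w]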
Example #18
def run_training(continue_run):

    logging.info('EXPERIMENT NAME: %s' % config.experiment_name)

    init_step = 0

    if continue_run:
        logging.info(
            '!!!!!!!!!!!!!!!!!!!!!!!!!!!! Continuing previous run !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
        )
        try:
            init_checkpoint_path = utils.get_latest_model_checkpoint_path(
                log_dir, 'model.ckpt')
            logging.info('Checkpoint path: %s' % init_checkpoint_path)
            init_step = int(
                init_checkpoint_path.split('/')[-1].split('-')
                [-1]) + 1  # plus 1 b/c otherwise starts with eval
            logging.info('Latest step was: %d' % init_step)
        except:
            logging.warning(
                '!!! Did not find init checkpoint. Maybe the first run failed. Disabling continue mode...'
            )
            continue_run = False
            init_step = 0

        logging.info(
            '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
        )

    train_on_all_data = config.train_on_all_data

    # Load data
    data = acdc_data.load_and_maybe_process_data(
        input_folder=config.input_folder,
        preprocessing_folder=config.preprocessing_folder,
        mode=config.data_mode,
        size=config.image_size,
        target_resolution=config.target_resolution,
        force_overwrite=False,
        split_test_train=config.split_test_train)

    # the following are HDF5 datasets, not numpy arrays
    images_train = data['images_train']
    labels_train = data['masks_train']
    id_train = data['id_images_train']

    if not train_on_all_data:
        images_val = data['images_test']
        labels_val = data['masks_test']
        id_val = data['id_images_test']

    if config.use_data_fraction:
        num_images = images_train.shape[0]
        new_last_index = int(float(num_images) * config.use_data_fraction)

        logging.warning('USING ONLY FRACTION OF DATA!')
        logging.warning(' - Number of imgs orig: %d, Number of imgs new: %d' %
                        (num_images, new_last_index))
        images_train = images_train[0:new_last_index, ...]
        labels_train = labels_train[0:new_last_index, ...]

    logging.info('Data summary:')
    logging.info(' - Images:')
    logging.info(images_train.shape)
    logging.info(images_train.dtype)
    logging.info(' - Labels:')
    logging.info(labels_train.shape)
    logging.info(labels_train.dtype)

    #pre-process
    for img in images_train:
        if config.equalize:
            img = image_utils.equalization_image(img)
        if config.clahe:
            img = image_utils.CLAHE(img)
        if config.standardize:
            img = image_utils.standardize_image(img)
        if config.normalize:
            img = image_utils.normalize_image(img)

    if not train_on_all_data:
        for img in images_val:
            if config.equalize:
                img = image_utils.equalization_image(img)
            if config.clahe:
                img = image_utils.CLAHE(img)
            if config.standardize:
                img = image_utils.standardize_image(img)
            if config.normalize:
                img = image_utils.normalize_image(img)

    if config.prob:  #if prob is not 0
        logging.info(
            'Before data_augmentation the number of training images is:')
        logging.info(images_train.shape[0])
        #augmentation
        image_aug, label_aug = aug.augmentation_function(
            images_train, labels_train)

        #num_aug = image_aug.shape[0]
        # id images augmented will be b'0.0'
        #id_aug = np.zeros([num_aug,]).astype('|S9')
        #concatenate
        #id_train = np.concatenate((id__train,id_aug))
        images_train = np.concatenate((images_train, image_aug))
        labels_train = np.concatenate((labels_train, label_aug))

        logging.info(
            'After data_augmentation the number of training images is:')
        logging.info(images_train.shape[0])
    else:
        logging.info('No data_augmentation. Number of training images is:')
        logging.info(images_train.shape[0])

    # Tell TensorFlow that the model will be built into the default Graph.

    with tf.Graph().as_default():

        # Generate placeholders for the images and labels.

        image_tensor_shape = [config.batch_size] + list(
            config.image_size) + [1]
        mask_tensor_shape = [config.batch_size] + list(config.image_size)

        images_pl = tf.placeholder(tf.float32,
                                   shape=image_tensor_shape,
                                   name='images')
        labels_pl = tf.placeholder(tf.uint8,
                                   shape=mask_tensor_shape,
                                   name='labels')

        learning_rate_pl = tf.placeholder(tf.float32, shape=[])
        training_pl = tf.placeholder(tf.bool, shape=[])

        tf.summary.scalar('learning_rate', learning_rate_pl)

        # Build a Graph that computes predictions from the inference model.
        if (config.experiment_name == 'unet2D_valid'
                or config.experiment_name == 'unet2D_same'
                or config.experiment_name == 'unet2D_same_mod'):
            logits = model.inference(images_pl, config, training=training_pl)
        elif config.experiment_name == 'ENet':
            with slim.arg_scope(
                    model_structure.ENet_arg_scope(weight_decay=2e-4)):
                logits = model_structure.ENet(
                    images_pl,
                    num_classes=config.nlabels,
                    batch_size=config.batch_size,
                    is_training=True,
                    reuse=None,
                    num_initial_blocks=1,
                    stage_two_repeat=2,
                    skip_connections=config.skip_connections)
        else:
            logging.warning('invalid experiment_name!')

        logging.info('images_pl shape')
        logging.info(images_pl.shape)
        logging.info('labels_pl shape')
        logging.info(labels_pl.shape)
        logging.info('logits shape:')
        logging.info(logits.shape)
        # Add to the Graph the Ops for loss calculation.
        [loss, _,
         weights_norm] = model.loss(logits,
                                    labels_pl,
                                    nlabels=config.nlabels,
                                    loss_type=config.loss_type,
                                    weight_decay=config.weight_decay
                                    )  # second output is unregularised loss

        # record how Total loss and weight decay change over time
        tf.summary.scalar('loss', loss)
        tf.summary.scalar('weights_norm_term', weights_norm)

        # Add to the Graph the Ops that calculate and apply gradients.
        if config.momentum is not None:
            train_op = model.training_step(loss,
                                           config.optimizer_handle,
                                           learning_rate_pl,
                                           momentum=config.momentum)
        else:
            train_op = model.training_step(loss, config.optimizer_handle,
                                           learning_rate_pl)

        # Add the Op to compare the logits to the labels during evaluation.
        # loss and dice on a minibatch
        eval_loss = model.evaluation(logits,
                                     labels_pl,
                                     images_pl,
                                     nlabels=config.nlabels,
                                     loss_type=config.loss_type)

        # Build the summary Tensor based on the TF collection of Summaries.
        summary = tf.summary.merge_all()

        # Add the variable initializer Op.
        init = tf.global_variables_initializer()

        # Create a saver for writing training checkpoints.

        if train_on_all_data:
            max_to_keep = None
        else:
            max_to_keep = 5

        saver = tf.train.Saver(max_to_keep=max_to_keep)
        saver_best_dice = tf.train.Saver()
        saver_best_xent = tf.train.Saver()

        # Create a session for running Ops on the Graph.
        configP = tf.ConfigProto()
        configP.gpu_options.allow_growth = True  # Do not assign whole gpu memory, just use it on the go
        configP.allow_soft_placement = True  # If an operation is not defined on the default device, let it execute on another.
        sess = tf.Session(config=configP)

        # Instantiate a SummaryWriter to output summaries and the Graph.
        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)

        # with tf.name_scope('monitoring'):

        val_error_ = tf.placeholder(tf.float32, shape=[], name='val_error')
        val_error_summary = tf.summary.scalar('validation_loss', val_error_)

        val_dice_ = tf.placeholder(tf.float32, shape=[], name='val_dice')
        val_dice_summary = tf.summary.scalar('validation_dice', val_dice_)

        val_summary = tf.summary.merge([val_error_summary, val_dice_summary])

        train_error_ = tf.placeholder(tf.float32, shape=[], name='train_error')
        train_error_summary = tf.summary.scalar('training_loss', train_error_)

        train_dice_ = tf.placeholder(tf.float32, shape=[], name='train_dice')
        train_dice_summary = tf.summary.scalar('training_dice', train_dice_)

        train_summary = tf.summary.merge(
            [train_error_summary, train_dice_summary])

        # Run the Op to initialize the variables.
        sess.run(init)

        if continue_run:
            # Restore session
            saver.restore(sess, init_checkpoint_path)

        step = init_step
        curr_lr = config.learning_rate

        no_improvement_counter = 0
        best_val = np.inf
        last_train = np.inf
        loss_history = []
        loss_gradient = np.inf
        best_dice = 0

        for epoch in range(config.max_epochs):

            logging.info('EPOCH %d' % epoch)

            for batch in iterate_minibatches(images_train,
                                             labels_train,
                                             batch_size=config.batch_size):

                start_time = time.time()

                # batch = bgn_train.retrieve()
                x, y = batch

                # TEMPORARY HACK (to avoid incomplete batches)
                if y.shape[0] < config.batch_size:
                    step += 1
                    continue

                feed_dict = {
                    images_pl: x,
                    labels_pl: y,
                    learning_rate_pl: curr_lr,
                    training_pl: True
                }

                _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)

                duration = time.time() - start_time

                # Write the summaries and print an overview fairly often.
                if step % 10 == 0:
                    # Print status to stdout.
                    logging.info('Step %d: loss = %.2f (%.3f sec)' %
                                 (step, loss_value, duration))
                    # Update the events file.

                    summary_str = sess.run(summary, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, step)
                    summary_writer.flush()

                step += 1

            # end epoch
            logging.info('Training Data Eval:')
            [train_loss,
             train_dice] = do_eval(sess, eval_loss, images_pl, labels_pl,
                                   training_pl, images_train, labels_train,
                                   config.batch_size)

            train_summary_msg = sess.run(train_summary,
                                         feed_dict={
                                             train_error_: train_loss,
                                             train_dice_: train_dice
                                         })
            summary_writer.add_summary(train_summary_msg, step)

            loss_history.append(train_loss)
            if len(loss_history) > 5:
                loss_history.pop(0)
                loss_gradient = (loss_history[-5] - loss_history[-1]) / 2

            logging.info('loss gradient is currently %f' % loss_gradient)

            if train_loss <= last_train:  # best_train:
                no_improvement_counter = 0
                logging.info('Decrease in training error!')
            else:
                no_improvement_counter = no_improvement_counter + 1
                logging.info('No improvement in training error for %d steps' %
                             no_improvement_counter)

            last_train = train_loss

            # Save a checkpoint and evaluate the model periodically.
            checkpoint_file = os.path.join(log_dir, 'model.ckpt')
            saver.save(sess, checkpoint_file, global_step=step)
            # Evaluate against the training set.

            if not train_on_all_data:

                # Evaluate against the validation set.
                logging.info('Validation Data Eval:')
                [val_loss,
                 val_dice] = do_eval(sess, eval_loss, images_pl, labels_pl,
                                     training_pl, images_val, labels_val,
                                     config.batch_size)

                val_summary_msg = sess.run(val_summary,
                                           feed_dict={
                                               val_error_: val_loss,
                                               val_dice_: val_dice
                                           })
                summary_writer.add_summary(val_summary_msg, step)

                if val_dice > best_dice:
                    best_dice = val_dice
                    best_file = os.path.join(log_dir, 'model_best_dice.ckpt')
                    saver_best_dice.save(sess, best_file, global_step=step)
                    logging.info(
                        'Found new best dice on validation set! - %f -  Saving model_best_dice.ckpt'
                        % val_dice)

                if val_loss < best_val:
                    best_val = val_loss
                    best_file = os.path.join(log_dir, 'model_best_xent.ckpt')
                    saver_best_xent.save(sess, best_file, global_step=step)
                    logging.info(
                        'Found new best crossentropy on validation set! - %f -  Saving model_best_xent.ckpt'
                        % val_loss)
        sess.close()
    data.close()
Example #19
def pred(dataURL):
    """
    Render prediction result.
    """

    # decode base64  '._-' -> '+/='
    dataURL = dataURL.replace('.', '+')
    dataURL = dataURL.replace('_', '/')
    dataURL = dataURL.replace('-', '=')

    # get the base64 string
    image_b64_str = dataURL
    # convert string to bytes
    byte_data = base64.b64decode(image_b64_str)
    image_data = BytesIO(byte_data)
    # open Image with PIL
    img = Image.open(image_data)

    # save original image as png (for debugging)
    ts = time.time()
    #img.save('image' + str(ts) + '.png', 'PNG')

    # convert image to RGBA
    img = img.convert("RGBA")

    # preprocess the image for the model
    image_cropped = crop_image(img) # crop the image and resize to 28x28
    image_normalized = normalize_image(image_cropped) # normalize color after crop

    # convert image from RGBA to RGB
    img_rgb = convert_to_rgb(image_normalized)

    # convert image to numpy
    image_np = convert_to_np(img_rgb)

    # apply model and print prediction
    label, label_num, preds = get_prediction(model, image_np)
    print("This is a {}".format(label_num))

    # save classification results as a diagram
    view_classify(image_np, preds)

    # create plotly visualization
    graphs = [
        #plot with probabilities for each class of images
        {
            'data': [
                go.Bar(
                        x = preds.ravel().tolist(),
                        y = list(label_dict.values()),
                        orientation = 'h')
            ],

            'layout': {
                'title': 'Class Probabilities',
                'yaxis': {
                    'title': "Classes"
                },
                'xaxis': {
                    'title': "Probability",
                }
            }
        }]

    # encode plotly graphs in JSON
    ids = ["graph-{}".format(i) for i, _ in enumerate(graphs)]
    graphJSON = json.dumps(graphs, cls=plotly.utils.PlotlyJSONEncoder)

    # render hook.html, passing the prediction results
    return render_template(
        'hook.html',
        result = label_num, # predicted class label
        ids=ids, # plotly graph ids
        graphJSON=graphJSON, # json plotly graphs
        dataURL = dataURL # image to display with result
    )
Example #20
def prepare_data(input_folder, output_file, mode, size, target_resolution):
    '''
    Main function that prepares a dataset from the raw challenge data into an HDF5 dataset
    '''

    assert (mode in ['2D', '3D']), 'Unknown mode: %s' % mode
    if mode == '2D' and not len(size) == 2:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '3D' and not len(size) == 3:
        raise AssertionError('Inadequate number of size parameters')
    if mode == '2D' and not len(target_resolution) == 2:
        raise AssertionError(
            'Inadequate number of target resolution parameters')
    if mode == '3D' and not len(target_resolution) == 3:
        raise AssertionError(
            'Inadequate number of target resolution parameters')

    hdf5_file = h5py.File(output_file, "w")

    nx, ny = size
    # scale_vector = [config.pixel_size[0] / target_resolution[0], config.pixel_size[1] / target_resolution[1]]
    count = 1
    train_addrs = []
    val_addrs = []
    masktrain_addrs = []
    maskval_addrs = []

    # If split_test_train is True, patients are split between train and validation. When testing,
    # split_test_train must be False. `split` controls how many patients go to validation: with 2, 50% of them do;
    # with 5, for example, one out of every 5 patients ends up in validation, and so on.
    split_test_train = config.split_test_train
    if split_test_train:
        split = config.split
    else:
        split = 99999

    path_img = os.path.join(input_folder, 'img')
    path_mask = os.path.join(input_folder, 'mask')
    for folders_img, folders_mask in zip(sorted(os.listdir(path_img)),
                                         sorted(os.listdir(path_mask))):
        folder_path_img = os.path.join(path_img, folders_img)
        folder_path_mask = os.path.join(path_mask, folders_mask)
        if count % split == 0:
            #validation
            path = os.path.join(folder_path_img, '*.png')
            for file in sorted(glob.glob(path)):
                val_addrs.append(file)
            path = os.path.join(folder_path_mask, '*.png')
            for file in sorted(glob.glob(path)):
                maskval_addrs.append(file)
        else:
            #training
            path = os.path.join(folder_path_img, '*.png')
            for file in sorted(glob.glob(path)):
                train_addrs.append(file)
            path = os.path.join(folder_path_mask, '*.png')
            for file in sorted(glob.glob(path)):
                masktrain_addrs.append(file)

        count = count + 1

    train_shape = (len(train_addrs), nx, ny)
    val_shape = (len(val_addrs), nx, ny)

    if config.split_test_train:
        if len(train_addrs) != len(masktrain_addrs) or len(val_addrs) != len(
                maskval_addrs):
            raise AssertionError(
                'Error: Masks and Images do not have the same number!')

    hdf5_file.create_dataset("images_train", train_shape, np.float32)
    hdf5_file.create_dataset("masks_train", train_shape, np.uint8)
    if config.split_test_train:
        hdf5_file.create_dataset("images_val", val_shape, np.float32)
        hdf5_file.create_dataset("masks_val", val_shape, np.uint8)

    for i in range(len(train_addrs)):
        addr_img = train_addrs[i]
        addr_mask = masktrain_addrs[i]
        img = cv2.imread(addr_img, 0)  #0 for grayscale
        mask = cv2.imread(addr_mask, 0)

        if config.standardize:
            img = image_utils.standardize_image(img)
        if config.normalize:
            img = image_utils.normalize_image(img)
        img = cv2.resize(img, (nx, ny), interpolation=cv2.INTER_AREA)
        mask = cv2.resize(mask, (nx, ny), interpolation=cv2.INTER_NEAREST)

        #img = crop_or_pad_slice_to_size(img, nx, ny)
        #mask = crop_or_pad_slice_to_size(mask, nx, ny)
        hdf5_file["images_train"][i, ...] = img[None]
        hdf5_file["masks_train"][i, ...] = mask[None]

    if config.split_test_train:
        for i in range(len(val_addrs)):
            addr_img = val_addrs[i]
            addr_mask = maskval_addrs[i]
            img = cv2.imread(addr_img, 0)
            mask = cv2.imread(addr_mask, 0)

            if config.standardize:
                img = image_utils.standardize_image(img)
            if config.normalize:
                img = image_utils.normalize_image(img)
            img = cv2.resize(img, (nx, ny), interpolation=cv2.INTER_AREA)
            mask = cv2.resize(mask, (nx, ny), interpolation=cv2.INTER_NEAREST)

            #img = crop_or_pad_slice_to_size(img, nx, ny)
            #mask = crop_or_pad_slice_to_size(mask, nx, ny)
            hdf5_file["images_val"][i, ...] = img[None]
            hdf5_file["masks_val"][i, ...] = mask[None]

    # After test train loop:
    hdf5_file.close()
Example #21
def recognize_from_image(net):
    mask_paths = glob.glob('masks/*.jpg')
    N_mask = len(mask_paths)

    # input image loop
    for image_path in args.input:
        logger.info(image_path)

        # prepare ground truth
        gt_img = load_image(image_path)
        gt_img = cv2.cvtColor(gt_img, cv2.COLOR_BGRA2RGB)
        gt_img = np.array(
            Image.fromarray(gt_img).resize((IMAGE_WIDTH, IMAGE_HEIGHT),
                                           Image.BILINEAR))
        gt_img = normalize_image(gt_img, 'ImageNet')
        gt_img = gt_img.transpose((2, 0, 1))  # channel first

        # prepare mask
        if args.mask_index is not None:
            mask_path = mask_paths[args.mask_index % N_mask]
        else:
            mask_path = mask_paths[random.randint(0, N_mask - 1)]
        mask = load_image(mask_path)
        mask = cv2.cvtColor(mask, cv2.COLOR_BGRA2RGB)
        mask = np.array(
            Image.fromarray(mask).resize((IMAGE_WIDTH, IMAGE_HEIGHT),
                                         Image.BILINEAR))
        mask = mask.transpose((2, 0, 1)) / 255  # channel first

        # prepare input data
        img = gt_img * mask
        img = np.expand_dims(img, axis=0)
        mask = np.expand_dims(mask, axis=0)
        gt_img = np.expand_dims(gt_img, axis=0)

        logger.debug(f'input image shape: {img.shape}')

        # inference
        logger.info('Start inference...')
        if args.benchmark:
            logger.info('BENCHMARK mode')
            total_time = 0
            for i in range(args.benchmark_count):
                start = int(round(time.time() * 1000))
                output = net.predict({'image': img, 'mask': mask})
                end = int(round(time.time() * 1000))
                logger.info(f'\tailia processing time {end - start} ms')
                if i != 0:
                    total_time = total_time + (end - start)
            logger.info(
                f'\taverage time {total_time / (args.benchmark_count - 1)} ms')
        else:
            output = net.predict({'image': img, 'mask': mask})

        output, _ = output

        img = postprocess(img[0])
        mask = mask[0].transpose(1, 2, 0) * 255
        output = postprocess(output[0])
        gt_img = postprocess(gt_img[0])
        res_img = np.hstack((img, mask, output, gt_img))

        savepath = get_savepath(args.savepath, image_path, ext='.png')
        logger.info(f'saved at : {savepath}')
        cv2.imwrite(savepath, res_img)

    logger.info('Script finished successfully.')
Example #22
def recognize_from_video(net):
    capture = get_capture(args.video)

    # allocate output buffer
    f_h = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    f_w = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))

    h, w = (IMAGE_HEIGHT, IMAGE_WIDTH) if not args.v21 or args.model_type == 'large' \
               else (IMAGE_HEIGHT_SMALL, IMAGE_WIDTH_SMALL)

    zero_frame = np.zeros((f_h, f_w, 3))
    resized_img = midas_resize(zero_frame, h, w)
    save_h, save_w = resized_img.shape[0], resized_img.shape[1]

    output_frame = np.zeros((save_h, save_w * 2, 3))

    # create video writer if savepath is specified as video format
    if args.savepath != SAVE_IMAGE_PATH:
        logger.warning(
            'currently, video results cannot be output correctly...')
        writer = get_writer(args.savepath, save_h, save_w * 2)
    else:
        writer = None

    input_shape_set = False
    while (True):
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        # resize to midas input size
        frame = midas_resize(frame, h, w)
        resized_img = normalize_image(frame, 'ImageNet')
        resized_img = resized_img.transpose((2, 0, 1))  # channel first
        resized_img = resized_img[np.newaxis, :, :, :]

        # predict
        if (not input_shape_set):
            net.set_input_shape(resized_img.shape)
            input_shape_set = True
        result = net.predict(resized_img)

        # normalize to 16bit
        depth_min = result.min()
        depth_max = result.max()
        max_val = (2**16) - 1
        if depth_max - depth_min > np.finfo("float").eps:
            out = max_val * (result - depth_min) / (depth_max - depth_min)
        else:
            out = 0

        # convert to 8bit
        res_img = (out.transpose(1, 2, 0) / 256).astype("uint8")
        res_img = cv2.cvtColor(res_img, cv2.COLOR_GRAY2BGR)

        output_frame[:, save_w:save_w * 2, :] = res_img
        output_frame[:, 0:save_w, :] = frame
        output_frame = output_frame.astype("uint8")

        cv2.imshow('depth', output_frame)

        # save results
        if writer is not None:
            writer.write(output_frame)

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()
    logger.info('Script finished successfully.')
Example #23
def compare_images():
    """
    This is a mode to determine whether two input images contain the same
    person, using the CNN model that DeepSORT uses to track people.
    It is assumed that there is always only one person in each image.
    We have not verified, and do not assume, the behavior in the case of
    multiple people. (Future work)
    """

    # net initialize
    detector = init_detector(args.env_id)
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=args.env_id)

    # prepare input data
    input_data = []
    for i in range(len(args.pairimage)):
        input_data.append(load_image(args.pairimage[i]))

    # inference
    print('Start inference...')
    features = []
    for i in range(len(input_data)):
        # do detection
        detector.compute(input_data[i], THRESHOLD, IOU)
        h, w = input_data[i].shape[0], input_data[i].shape[1]
        bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        # select person class
        mask = cls_ids == 0
        if mask.sum() == 0:
            print('Detector could not detect any person '
                  f'in the input image: {args.pairimage[i]}')
            print('Program finished.')
            sys.exit(0)

        bbox_xywh = bbox_xywh[mask]

        # bbox dilation just in case bbox too small,
        # delete this line if using a better pedestrian detector
        bbox_xywh[:, 3:] *= 1.2
        cls_conf = cls_conf[mask]

        # image crop
        """
        [INFO] If more than one bounding box is detected,
        the one with the highest confidence is used as the correct box.
        It should be noted that this works because we assume that
        the input image has only one person.
        """
        x1, y1, x2, y2 = xywh_to_xyxy(bbox_xywh[np.argmax(cls_conf)], h, w)
        src_img = cv2.cvtColor(input_data[i], cv2.COLOR_BGRA2RGB)
        img_crop = src_img[y1:y2, x1:x2]

        # preprocess
        img_crop = normalize_image(
            resize(img_crop), 'ImageNet'
        )[np.newaxis, :, :, :].transpose(0, 3, 1, 2)

        if args.benchmark:
            print('BENCHMARK mode')
            for i in range(5):
                start = int(round(time.time() * 1000))
                feature = extractor.predict(img_crop)
                end = int(round(time.time() * 1000))
                print(f'\tailia processing time {end - start} ms')
        else:
            feature = extractor.predict(img_crop)

        features.append(feature[0])

    sim = cosin_metric(features[0], features[1])
    if sim >= (1 - MAX_COSINE_DISTANCE):
        print(f'{args.pairimage}: SAME person (confidence: {sim})')
    else:
        print(f'{args.pairimage}: Different person (confidence: {sim})')
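cosin_metric is not shown above; a plausible sketch of the cosine similarity it computes (the repository's version may first L2-normalize the feature vectors):

import numpy as np

def cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))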
Example #24
def recognize_from_video():
    results = []
    idx_frame = 0

    # net initialize
    detector = init_detector(args.env_id)
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=args.env_id)

    # tracker class instance
    metric = NearestNeighborDistanceMetric(
        "cosine", MAX_COSINE_DISTANCE, NN_BUDGET
    )
    tracker = Tracker(
        metric,
        max_iou_distance=0.7,
        max_age=70,
        n_init=3
    )

    capture = webcamera_utils.get_capture(args.video)

    # create video writer
    if args.savepath is not None:
        writer = webcamera_utils.get_writer(
            args.savepath,
            int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
        )
    else:
        writer = None

    print('Start Inference...')
    while(True):
        idx_frame += 1
        ret, frame = capture.read()
        if (cv2.waitKey(1) & 0xFF == ord('q')) or not ret:
            break

        # In order to use ailia.Detector, the input should have 4 channels.
        input_img = cv2.cvtColor(frame, cv2.COLOR_BGR2BGRA)
        h, w = frame.shape[0], frame.shape[1]

        # do detection
        detector.compute(input_img, THRESHOLD, IOU)
        bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        # select person class
        mask = cls_ids == 0
        bbox_xywh = bbox_xywh[mask]

        # bbox dilation just in case bbox too small,
        # delete this line if using a better pedestrian detector
        bbox_xywh[:, 3:] *= 1.2
        cls_conf = cls_conf[mask]

        # do tracking
        img_crops = []
        for box in bbox_xywh:
            x1, y1, x2, y2 = xywh_to_xyxy(box, h, w)
            img_crops.append(frame[y1:y2, x1:x2])

        if img_crops:
            # preprocess
            img_batch = np.concatenate([
                normalize_image(resize(img), 'ImageNet')[np.newaxis, :, :, :]
                for img in img_crops
            ], axis=0).transpose(0, 3, 1, 2)

            # TODO better to pass a batch at once
            # features = extractor.predict(img_batch)
            features = []
            for img in img_batch:
                features.append(extractor.predict(img[np.newaxis, :, :, :])[0])
            features = np.array(features)
        else:
            features = np.array([])

        bbox_tlwh = xywh_to_tlwh(bbox_xywh)
        detections = [
            Detection(bbox_tlwh[i], conf, features[i])
            for i, conf in enumerate(cls_conf) if conf > MIN_CONFIDENCE
        ]

        # run non-maximum suppression
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        nms_max_overlap = 1.0
        indices = non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # update tracker
        tracker.predict()
        tracker.update(detections)

        # update bbox identities
        outputs = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            box = track.to_tlwh()
            x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w)
            track_id = track.track_id
            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=int))
        if len(outputs) > 0:
            outputs = np.stack(outputs, axis=0)

        # draw box for visualization
        if len(outputs) > 0:
            bbox_tlwh = []
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            frame = draw_boxes(frame, bbox_xyxy, identities)

            for bb_xyxy in bbox_xyxy:
                bbox_tlwh.append(xyxy_to_tlwh(bb_xyxy))

            results.append((idx_frame - 1, bbox_tlwh, identities))

        cv2.imshow('frame', frame)

        if writer is not None:
            writer.write(frame)

        if args.savepath is not None:
            write_results(args.savepath.split('.')[0] + '.txt', results, 'mot')
        else:
            write_results('result.txt', results, 'mot')

    capture.release()
    cv2.destroyAllWindows()
    if writer is not None:
        writer.release()

    print(f'Save results to {args.savepath}')
    print('Script finished successfully.')
def recognize_from_video():
    try:
        print('[INFO] Webcam mode is activated')
        RECORD_TIME = 80
        capture = cv2.VideoCapture(int(args.video))
        if not capture.isOpened():
            print("[ERROR] webcamera not found")
            sys.exit(1)
    except ValueError:
        if check_file_existance(args.video):
            capture = cv2.VideoCapture(args.video)

    frame_rate = capture.get(cv2.CAP_PROP_FPS)
    if FRAME_SKIP:
        action_recognize_fps = int(args.fps)
    else:
        action_recognize_fps = frame_rate

    if args.savepath != "":
        size = (int(capture.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        fmt = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
        writer = cv2.VideoWriter(args.savepath, fmt, action_recognize_fps,
                                 size)
    else:
        writer = None

    # pose estimation
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    if args.arch == "lw_human_pose":
        pose = ailia.PoseEstimator(MODEL_PATH,
                                   WEIGHT_PATH,
                                   env_id=env_id,
                                   algorithm=ALGORITHM)

        detector = None
    else:
        detector = ailia.Detector(DETECTOR_MODEL_PATH,
                                  DETECTOR_WEIGHT_PATH,
                                  len(COCO_CATEGORY),
                                  format=ailia.NETWORK_IMAGE_FORMAT_RGB,
                                  channel=ailia.NETWORK_IMAGE_CHANNEL_FIRST,
                                  range=ailia.NETWORK_IMAGE_RANGE_U_FP32,
                                  algorithm=ailia.DETECTOR_ALGORITHM_YOLOV3,
                                  env_id=env_id)

        pose = ailia.Net(POSE_MODEL_PATH, POSE_WEIGHT_PATH, env_id=env_id)

    # tracker class instance
    extractor = ailia.Net(EX_MODEL_PATH, EX_WEIGHT_PATH, env_id=env_id)
    metric = NearestNeighborDistanceMetric("cosine", MAX_COSINE_DISTANCE,
                                           NN_BUDGET)
    tracker = Tracker(metric, max_iou_distance=0.7, max_age=70, n_init=3)

    # action recognition
    env_id = ailia.get_gpu_environment_id()
    print(f'env_id: {env_id}')
    model = ailia.Net(ACTION_MODEL_PATH, ACTION_WEIGHT_PATH, env_id=env_id)

    action_data = {}

    frame_nb = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    idx_frame = 0

    time_start = time.time()
    while (True):
        time_curr = time.time()
        if args.video == '0' and time_curr - time_start > RECORD_TIME:
            break
        ret, frame = capture.read()

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        if (not ret) or (frame_nb >= 1 and idx_frame >= frame_nb):
            break

        if FRAME_SKIP:
            mod = round(frame_rate / action_recognize_fps)
            if mod >= 1:
                if idx_frame % mod != 0:
                    idx_frame = idx_frame + 1
                    continue

        input_image, input_data = adjust_frame_size(
            frame,
            frame.shape[0],
            frame.shape[1],
        )
        input_data = cv2.cvtColor(input_data, cv2.COLOR_BGR2BGRA)

        # inference
        if args.arch == "lw_human_pose":
            _ = pose.compute(input_data)
        else:
            detector.compute(input_data, THRESHOLD, IOU)

        # deepsort format
        h, w = input_image.shape[0], input_image.shape[1]
        if args.arch == "lw_human_pose":
            bbox_xywh, cls_conf, cls_ids = get_detector_result_lw_human_pose(
                pose, h, w)
        else:
            bbox_xywh, cls_conf, cls_ids = get_detector_result(detector, h, w)

        mask = cls_ids == 0
        bbox_xywh = bbox_xywh[mask]

        # bbox dilation just in case bbox too small,
        # delete this line if using a better pedestrian detector
        if args.arch == "pose_resnet":
            # bbox_xywh[:, 3:] *= 1.2   #May need to be removed in the future
            cls_conf = cls_conf[mask]

        # do tracking
        img_crops = []
        for box in bbox_xywh:
            x1, y1, x2, y2 = xywh_to_xyxy(box, h, w)
            img_crops.append(input_image[y1:y2, x1:x2])

        if img_crops:
            # preprocess
            img_batch = np.concatenate([
                normalize_image(resize(img), 'ImageNet')[np.newaxis, :, :, :]
                for img in img_crops
            ],
                                       axis=0).transpose(0, 3, 1, 2)

            # TODO better to pass a batch at once
            # features = extractor.predict(img_batch)
            features = []
            for img in img_batch:
                features.append(extractor.predict(img[np.newaxis, :, :, :])[0])
            features = np.array(features)
        else:
            features = np.array([])

        bbox_tlwh = xywh_to_tlwh(bbox_xywh)
        detections = [
            Detection(bbox_tlwh[i], conf, features[i])
            for i, conf in enumerate(cls_conf) if conf > MIN_CONFIDENCE
        ]

        # run non-maximum suppression
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        nms_max_overlap = 1.0
        indices = non_max_suppression(boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        # update tracker
        tracker.predict()
        tracker.update(detections)

        # update bbox identities
        outputs = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            box = track.to_tlwh()
            x1, y1, x2, y2 = tlwh_to_xyxy(box, h, w)
            track_id = track.track_id
            outputs.append(np.array([x1, y1, x2, y2, track_id], dtype=int))
        if len(outputs) > 0:
            outputs = np.stack(outputs, axis=0)

        # action detection
        actions = []
        persons = []
        if len(outputs) > 0:
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            for i, box in enumerate(bbox_xyxy):
                id = identities[i]

                if not (id in action_data):
                    action_data[id] = np.zeros(
                        (ailia.POSE_KEYPOINT_CNT - 1, TIME_RANGE, 3))

                # action recognition
                action, person = action_recognition(box, input_image, pose,
                                                    detector, model,
                                                    action_data[id])
                actions.append(action)
                persons.append(person)

        # draw box for visualization
        if len(outputs) > 0:
            bbox_tlwh = []
            bbox_xyxy = outputs[:, :4]
            identities = outputs[:, -1]
            frame = draw_boxes(input_image, bbox_xyxy, identities, actions,
                               action_data, (0, 0))

            for bb_xyxy in bbox_xyxy:
                bbox_tlwh.append(xyxy_to_tlwh(bb_xyxy))

        # draw skeleton
        for person in persons:
            if person is not None:
                display_result(input_image, person)

        if writer is not None:
            writer.write(input_image)

            # show progress
            if idx_frame == 0:
                print()
            print("\r" + str(idx_frame + 1) + " / " + str(frame_nb), end="")
            if idx_frame == frame_nb - 1:
                print()

        cv2.imshow('frame', input_image)

        idx_frame = idx_frame + 1

    if writer is not None:
        writer.release()

    capture.release()
    cv2.destroyAllWindows()
    print('Script finished successfully.')