Example #1
    def one2one_identity(self, im1, im2):
        normalized_im1 = T.normalize(im1, mean=self.mean, std=self.std)
        scale_im1, scale_ratio1 = T.scale(normalized_im1, short_size=self.base_size)
        input_im1 = T.center_crop(scale_im1, crop_size=self.crop_size)

        normalized_im2 = T.normalize(im2, mean=self.mean, std=self.std)
        scale_im2, scale_ratio2 = T.scale(normalized_im2, short_size=self.base_size)
        input_im2 = T.center_crop(scale_im2, crop_size=self.crop_size)

        batch = np.asarray([input_im1, input_im2], dtype=np.float32)
        scores = self.inference(batch, output_layer=self.prob_layer)

        return M.cosine_similarity(scores[0], scores[1])
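
For reference, a minimal NumPy sketch of the cosine similarity that M.cosine_similarity presumably computes over the two score vectors above (an assumption about the helper, not its actual source):

import numpy as np

def cosine_similarity(a, b, eps=1e-12):
    # flatten both feature vectors and compute dot(a, b) / (|a| * |b|)
    a = np.asarray(a, dtype=np.float32).ravel()
    b = np.asarray(b, dtype=np.float32).ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + eps))

print(cosine_similarity([1, 0, 1], [1, 0, 0]))  # ~0.707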
Example #2
    def __getitem__(self, index):
        img_path = os.path.join(self.img_dir, self.json_data[index]['img_fn'])
        img = np.array(cv2.imread(img_path), dtype=np.float32)
        keypoints = self.json_data[index]['keypoints']
        if 'bodysize' in self.json_data[index]:
            norm = self.json_data[index]['bodysize']
        elif 'headsize' in self.json_data[index]:
            norm = self.json_data[index]['headsize']
        else:
            norm = self.json_data[index]['normalize']

        img, keypoints, ratio = self.trans(img, keypoints)

        label = np.zeros((self.s, self.s, self.num_kpt), dtype=np.float32)
        offset = np.zeros((self.s, self.s, self.num_kpt * 2), dtype=np.float32)

        for px in range(self.num_kpt):
            x = keypoints[px * 3 + 0]
            y = keypoints[px * 3 + 1]
            vis = keypoints[px * 3 + 2]
            if vis == 0 or x <= 0 or x >= self.size or y <= 0 or y >= self.size:
                continue
            grid_loc_x = math.floor(x // self.grid_size)
            grid_loc_y = math.floor(y // self.grid_size)
            label[grid_loc_y][grid_loc_x][px] = 1
            offset[grid_loc_y][grid_loc_x][px] = (x % self.grid_size) / self.grid_size
            offset[grid_loc_y][grid_loc_x][self.num_kpt + px] = (y % self.grid_size) / self.grid_size

        img1 = self._enhance(img.copy(), 1.0)
        img2 = self._enhance(img.copy(), 1.5)
        img3 = self._enhance(img.copy(), 2.0)
        img0 = normalize(to_tensor(img)).unsqueeze(dim=0)
        img1 = normalize(to_tensor(img1)).unsqueeze(dim=0)
        img2 = normalize(to_tensor(img2)).unsqueeze(dim=0)
        img3 = normalize(to_tensor(img3)).unsqueeze(dim=0)
        img = img0
        label = to_tensor(label)
        offset = to_tensor(offset)
        norm = norm * ratio

        return img, label, offset, norm
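
A standalone sketch of the grid-cell encoding performed in the loop above, using hypothetical values for size, grid_size and num_kpt: each visible keypoint marks the cell that contains it in label and stores its fractional position inside that cell in offset.

import numpy as np

size, grid_size, num_kpt = 256, 16, 17
s = size // grid_size                                   # cells per side

label = np.zeros((s, s, num_kpt), dtype=np.float32)
offset = np.zeros((s, s, num_kpt * 2), dtype=np.float32)

x, y, vis, px = 100.0, 40.0, 2, 0                       # one visible keypoint
if vis != 0 and 0 < x < size and 0 < y < size:
    gx, gy = int(x // grid_size), int(y // grid_size)
    label[gy, gx, px] = 1                               # cell that owns the keypoint
    offset[gy, gx, px] = (x % grid_size) / grid_size            # fractional x inside the cell
    offset[gy, gx, num_kpt + px] = (y % grid_size) / grid_size  # fractional y inside the cell

print(np.argwhere(label[:, :, px]))                     # -> [[2 6]]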
Example #3
    def __getitem__(self, index):

        img_path = os.path.join(self.data_root, self.image_set, self.img_list[index])
        img = np.array(cv2.imread(img_path), dtype=np.float32)
        mask_path = os.path.join(self.info_root, 'pose_mask', self.img_list[index].replace('.jpg', '.npy'))
        mask = np.load(mask_path)
        mask = np.array(mask, dtype=np.float32)

        kpt = self.kpt_list[index]
        center = self.center_list[index]
        scale = self.scale_list[index]

        img, mask, kpt, center = self.transformer(img, mask, kpt, center, scale)

        height, width, _ = img.shape

        mask = cv2.resize(mask, (width // self.stride, height // self.stride)).reshape((height // self.stride, width // self.stride, 1))

        heatmap = np.zeros((height // self.stride, width // self.stride, len(kpt[0]) + 1), dtype=np.float32)
        heatmap = generate_heatmap(heatmap, kpt, self.stride, self.sigma)
        heatmap[:,:,0] = 1.0 - np.max(heatmap[:,:,1:], axis=2) # for background
        heatmap = heatmap * mask

        vecmap = np.zeros((height // self.stride, width // self.stride, len(self.vec_pair[0]) * 2), dtype=np.float32)
        cnt = np.zeros((height // self.stride, width // self.stride, len(self.vec_pair[0])), dtype=np.int32)

        vecmap = generate_vector(vecmap, cnt, kpt, self.vec_pair, self.stride, self.theta)
        vecmap = vecmap * mask

        img = transforms.normalize(transforms.to_tensor(img), [128.0, 128.0, 128.0], [256.0, 256.0, 256.0]) # mean, std
        mask = transforms.to_tensor(mask)
        heatmap = transforms.to_tensor(heatmap)
        vecmap = transforms.to_tensor(vecmap)

        return img, heatmap, vecmap, mask
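
A minimal illustration of the background-channel convention used above (assuming, as generate_heatmap suggests, per-keypoint responses in [0, 1]): channel 0 is one minus the per-pixel maximum over all keypoint channels, so foreground and background stay complementary.

import numpy as np

heatmap = np.random.rand(46, 46, 19).astype(np.float32)      # channel 0 = background
heatmap[:, :, 0] = 1.0 - np.max(heatmap[:, :, 1:], axis=2)
assert np.allclose(heatmap[:, :, 0] + heatmap[:, :, 1:].max(axis=2), 1.0)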
Example #4
    def eval_im(self, im):
        im = im.astype(np.float32, copy=True)
        h, w = im.shape[:2]

        normalized_im = T.normalize(im, mean=self.mean, std=self.std)
        scale_ims, scale_ratios = T.multi_scale_by_max(normalized_im, scales=self.scales, image_flip=self.image_flip)

        score_map = np.zeros((h, w, self.class_num), dtype=np.float32)
        for _im, _ratio in zip(scale_ims, scale_ratios):
            if _ratio > 0:
                score_map += cv2.resize(self.scale_process(_im), (w, h))
            else:
                score_map += cv2.resize(self.scale_process(_im), (w, h))[:, ::-1]
        score_map /= len(self.scales)

        if self.crf:
            tmp_data = np.asarray([im.transpose(2, 0, 1)], dtype=np.float32)
            tmp_score = np.asarray([score_map.transpose(2, 0, 1)], dtype=np.float32)
            self.crf.blobs['data'].reshape(*tmp_data.shape)
            self.crf.blobs['data'].data[...] = tmp_data
            self.crf.blobs['data_dim'].data[...] = [[[h, w]]]
            self.crf.blobs['score'].reshape(*tmp_score.shape)
            self.crf.blobs['score'].data[...] = tmp_score * self.crf_factor
            self.crf.forward()
            score_map = self.crf.blobs[self.prob_layer].data[0].transpose(1, 2, 0)

        return score_map.argmax(2)
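
The per-scale fusion above can be read as the standalone helper below, written as a hedged sketch: each prediction is resized back to the original resolution, horizontally flipped inputs (negative ratio) are un-flipped, and the maps are averaged.

import cv2
import numpy as np

def fuse_score_maps(per_scale_scores, ratios, out_hw):
    h, w = out_hw
    fused = np.zeros((h, w, per_scale_scores[0].shape[2]), dtype=np.float32)
    for score, ratio in zip(per_scale_scores, ratios):
        score = cv2.resize(score, (w, h))
        if ratio <= 0:                     # this scale was a mirrored copy
            score = score[:, ::-1]         # flip the prediction back
        fused += score
    return fused / len(per_scale_scores)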
Example #5
    def __getitem__(self, index):
        img_path = os.path.join(self.img_dir, self.json_data[index]['img_fn'])
        img = np.array(cv2.imread(img_path), dtype=np.float32)
        keypoints = self.json_data[index]['keypoints']

        if self.trans is not None:
            img, keypoints = self.trans(img, keypoints)

        label = np.zeros((self.s, self.s, self.num_kpt), dtype=np.float32)
        offset = np.zeros((self.s, self.s, self.num_kpt * 2), dtype=np.float32)
        for px in range(self.num_kpt):
            x = keypoints[px * 3 + 0]
            y = keypoints[px * 3 + 1]
            vis = keypoints[px * 3 + 2]
            if vis == 0 or x <= 0 or x >= self.size or y <= 0 or y >= self.size:
                continue
            grid_loc_x = math.floor(x // self.grid_size)
            grid_loc_y = math.floor(y // self.grid_size)
            label[grid_loc_y][grid_loc_x][px] = 1
            offset[grid_loc_y][grid_loc_x][px] = (x % self.grid_size) / self.grid_size
            offset[grid_loc_y][grid_loc_x][self.num_kpt + px] = (y % self.grid_size) / self.grid_size

        img = normalize(to_tensor(img))
        label = to_tensor(label)
        offset = to_tensor(offset)

        return img, label, offset
Example #6
 def process(filename):
     image, label, filename = read_tfrecord(filename)
     image = transforms.resize_and_crop_image(image, target_size=image_size)
     image = transforms.normalize(image, dtype=dtype)
     result = (image, label)
     if not drop_filename:
         result += (filename, )
     return result
Example #7
    def det_im(self, im):
        im = im.astype(np.float32, copy=True)
        normalized_im = T.normalize(im, mean=self.mean, std=self.std)
        scale_im, scale_ratio = T.scale(normalized_im, short_size=self.scales[0], max_size=self.max_sizes[0])

        input_data = scale_im.transpose(2, 0, 1)
        input_data = input_data.reshape((1,) + input_data.shape)
        self.net.blobs['data'].reshape(*input_data.shape)
        input_blob = {'data': input_data, 'rois': None}

        input_blob['im_info'] = np.array([[scale_im.shape[0], scale_im.shape[1], 1.0]], dtype=np.float32)
        self.net.blobs['im_info'].reshape(*input_blob['im_info'].shape)

        # do forward
        forward_kwargs = {'data': input_blob['data'].astype(np.float32, copy=False)}
        forward_kwargs['im_info'] = input_blob['im_info'].astype(np.float32, copy=False)
        output_blob = self.net.forward(**forward_kwargs)

        rois = self.net.blobs['rois'].data.copy()
        boxes = rois[:, 1:5]

        scores = output_blob['cls_prob']
        scores = scores.reshape(*scores.shape[:2])

        # Apply bounding-box regression deltas
        box_deltas = output_blob['bbox_pred']
        box_deltas = box_deltas.reshape(*box_deltas.shape[:2])
        pred_boxes = bbox_transform_inv(boxes, box_deltas)
        pred_boxes = clip_boxes(pred_boxes, scale_im.shape)

        objs = []
        for cls_ind, cls in enumerate(self.class_map[1:]):
            cls_ind += 1  # because we skipped background
            if cfg.TEST.AGNOSTIC:
                cls_boxes = pred_boxes[:, 4:8]
            else:
                cls_boxes = pred_boxes[:, cls_ind * 4:(cls_ind + 1) * 4]
            cls_scores = scores[:, cls_ind]
            dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32)
            inds = np.where(dets[:, 4] > self.conf_thresh)
            cls_dets = dets[inds]

            keep = nms(cls_dets, self.nms_thresh)
            dets_NMSed = cls_dets[keep, :]
            if self.box_vote:
                VOTEed = bbox_vote(dets_NMSed, cls_dets)
            else:
                VOTEed = dets_NMSed

            _obj = boxes_filter(VOTEed, bbox_id=cls_ind, class_name=cls, color=self.color_map[cls_ind],
                                scale=scale_ratio, thresh=self.conf_thresh)
            objs.extend(_obj)

        return objs
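
For context, a sketch of the standard Faster R-CNN box decoding that bbox_transform_inv presumably performs (the names and the "+ 1" width convention follow the well-known py-faster-rcnn code; treat this as an assumption, not this repo's exact implementation):

import numpy as np

def decode_boxes(boxes, deltas):
    # boxes: (N, 4) proposals as x1, y1, x2, y2; deltas: (N, 4 * num_classes)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    dx, dy = deltas[:, 0::4], deltas[:, 1::4]
    dw, dh = deltas[:, 2::4], deltas[:, 3::4]

    pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
    pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
    pred_w = np.exp(dw) * widths[:, None]
    pred_h = np.exp(dh) * heights[:, None]

    pred = np.zeros_like(deltas)
    pred[:, 0::4] = pred_ctr_x - 0.5 * pred_w   # x1
    pred[:, 1::4] = pred_ctr_y - 0.5 * pred_h   # y1
    pred[:, 2::4] = pred_ctr_x + 0.5 * pred_w   # x2
    pred[:, 3::4] = pred_ctr_y + 0.5 * pred_h   # y2
    return pred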
Example #8
    def cls_batch(self, batch_ims):
        input_ims = []
        for im in batch_ims:
            im = im.astype(np.float32, copy=True)
            normalized_im = T.normalize(im, mean=self.mean, std=self.std)
            scale_im, scale_ratio = T.scale(normalized_im, short_size=self.base_size)
            input_ims.append(T.center_crop(scale_im, crop_size=self.crop_size))

        scores = self.inference(np.asarray(input_ims, dtype=np.float32), output_layer=self.prob_layer)

        return scores
Example #9
    def seg_im(self, im):
        """ Ignore self.scales; """
        im = im.astype(np.float32, copy=True)
        h, w = im.shape[:2]
        normalized_im = T.normalize(im, mean=self.mean, std=self.std)
        scale_im, scale_ratio = T.scale_by_max(normalized_im, long_size=self.crop_size)
        input_im = T.padding_im(scale_im, target_size=(self.crop_size, self.crop_size),
                                borderType=cv2.BORDER_CONSTANT)
        output = self.inference(np.asarray([input_im], dtype=np.float32))
        score = output[0].transpose(1, 2, 0)
        score_map = cv2.resize(score, None, None, fx=1. / scale_ratio, fy=1. / scale_ratio)[:h, :w, :]

        return score_map.argmax(2)
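
To make the geometry explicit, here is a small self-contained sketch (with hypothetical sizes) of the scale-by-long-side / pad / undo sequence used above; T.scale_by_max and T.padding_im are assumed to behave like the cv2 calls below.

import cv2
import numpy as np

h, w, crop_size = 375, 500, 512
ratio = crop_size / float(max(h, w))                 # presumed scale factor from T.scale_by_max
scaled = cv2.resize(np.zeros((h, w, 3), np.float32), None, fx=ratio, fy=ratio)
padded = cv2.copyMakeBorder(scaled, 0, crop_size - scaled.shape[0],
                            0, crop_size - scaled.shape[1],
                            cv2.BORDER_CONSTANT, value=0.0)
# ... run the network on `padded`, obtaining `score` of shape (crop_size, crop_size, C)
score = np.zeros((crop_size, crop_size, 21), np.float32)
score_map = cv2.resize(score, None, fx=1.0 / ratio, fy=1.0 / ratio)[:h, :w, :]
assert score_map.shape[:2] == (h, w)                 # back at the original resolution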
Example #10
    def cls_im(self, im):
        im = im.astype(np.float32, copy=True)
        normalized_im = T.normalize(im, mean=self.mean, std=self.std)
        scale_im, scale_ratio = T.scale(normalized_im, short_size=self.base_size)
        crop_ims = []
        if self.crop_type == 'center' or self.crop_type == 'single':  # for single crop
            crop_ims.append(T.center_crop(scale_im, crop_size=self.crop_size))
        elif self.crop_type == 'mirror' or self.crop_type == 'multi':  # for 10 crops
            crop_ims.extend(T.mirror_crop(scale_im, crop_size=self.crop_size))
        else:
            crop_ims.append(scale_im)

        scores = self.inference(np.asarray(crop_ims, dtype=np.float32), output_layer=self.prob_layer)

        return np.sum(scores, axis=0)
Example #11
def get_contacts(oracle, body_name, direction): # NOTE - Other objects will only have a fixed set of contacts and directions
  #print 'direction', direction[2]
  assert direction[2] == 0
  contacts = []
  direction = normalize(direction)
  aabb = oracle.get_aabb(body_name)
  radius = sqrt(oracle.get_radius2D2(body_name))
  #radius = body.GetLinks()[0].GetGeometries()[0].GetCylinderRadius()
  height = 2*aabb.extents()[2]
  #height = body.GetLinks()[0].GetGeometries()[0].GetCylinderHeight()

  distance = radius + PUSH_SEPERATION
  z = -height/2 + PUSH_HEIGHT
  tool_quat = quat_from_trans(get_tool_trans(oracle))
  manip_point = -distance*direction + np.array([0, 0, z]) + aabb.pos()
  for rotation in [0, PI]: # NOTE - 2 hand trans can push in a given direction
    manip_quat = quat_dot(quat_look_at(-direction), quat_look_at(-unit_z()), quat_from_angle_vector(rotation, unit_x()), tool_quat) # Grip * Tool = Manip
    contacts.append(Contact(compute_grasp(trans_from_quat_point(manip_quat, manip_point), unit_trans()), direction))
  return contacts
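
The contact placement above reduces to simple vector arithmetic; a toy sketch with hypothetical numbers (PUSH_SEPERATION, PUSH_HEIGHT and the object dimensions are made up here): the tool is placed one radius plus a separation away from the object centre, opposite the push direction, at a fixed height offset.

import numpy as np

PUSH_SEPERATION, PUSH_HEIGHT = 0.02, 0.01            # assumed constants
direction = np.array([1.0, 0.0, 0.0])                # normalized push direction
radius, height = 0.05, 0.20                          # roughly cylindrical object
aabb_pos = np.array([0.5, 0.3, 0.10])                # object centre

distance = radius + PUSH_SEPERATION
z = -height / 2 + PUSH_HEIGHT
manip_point = -distance * direction + np.array([0.0, 0.0, z]) + aabb_pos
print(manip_point)                                   # where the tool frame is placed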
Example #12
    def seg_im(self, im):
        """ Ignore self.scales; """
        im = im.astype(np.float32, copy=True)
        h, w = im.shape[:2]
        normalized_im = T.normalize(im, mean=self.mean, std=self.std)
        scale_im, scale_ratio = T.scale_by_max(normalized_im,
                                               long_size=self.crop_size)
        input_im = T.padding_im(scale_im,
                                target_size=(self.crop_size, self.crop_size),
                                borderType=cv2.BORDER_CONSTANT,
                                borderValue=(0.0, 0.0, 0.0))
        score = self.caffe_process(input_im)
        score_map = cv2.resize(score,
                               None,
                               None,
                               fx=1. / scale_ratio,
                               fy=1. / scale_ratio)[:h, :w, :]

        return score_map.argmax(2)
Example #13
def get_contacts(oracle, body_name, direction): # NOTE - Other objects will only have a fixed set of contacts and directions
  #print 'direction', direction[2]
  assert direction[2] == 0
  contacts = []
  direction = normalize(direction)
  aabb = oracle.get_aabb(body_name)
  radius = sqrt(oracle.get_radius2D2(body_name))
  #radius = body.GetLinks()[0].GetGeometries()[0].GetCylinderRadius()
  height = 2*aabb.extents()[2]
  #height = body.GetLinks()[0].GetGeometries()[0].GetCylinderHeight()

  distance = radius + PUSH_SEPERATION
  z = -height/2 + PUSH_HEIGHT
  tool_quat = quat_from_trans(get_tool_trans(oracle))
  manip_point = -distance*direction + np.array([0, 0, z]) + aabb.pos()
  for rotation in [0, PI]: # NOTE - 2 hand trans can push in a given direction
    manip_quat = quat_dot(quat_look_at(-direction), quat_look_at(-unit_z()), quat_from_angle_vector(rotation, unit_x()), tool_quat) # Grip * Tool = Manip
    contacts.append(Contact(compute_grasp(trans_from_quat_point(manip_quat, manip_point), unit_trans()), direction))
  return contacts

# def box_contacts(oracle, body_name): # TODO - push boxes along their faces
Example #14
def eval_batch():
    # shuffle_conv1_channel()
    eval_len = len(SET_DICT)
    # eval_len = 1000
    accuracy = np.zeros(len(args.top_k))
    start_time = datetime.datetime.now()

    for i in xrange(eval_len - args.skip_num):
        im = cv2.imread(SET_DICT[i + args.skip_num]['path'])
        if (PIXEL_MEANS == np.array([103.52, 116.28, 123.675])).all() and \
                (PIXEL_STDS == np.array([57.375, 57.12, 58.395])).all():
            scale_im = T.pil_scale(Image.fromarray(im), args.base_size)
            scale_im = np.asarray(scale_im)
        else:
            scale_im, _ = T.scale(im, short_size=args.base_size) 
        input_im = T.normalize(scale_im, mean=PIXEL_MEANS, std=PIXEL_STDS)
        crop_ims = []
        if args.crop_type == 'center':  # for single crop
            crop_ims.append(T.center_crop(input_im, crop_size=args.crop_size))
        elif args.crop_type == 'multi':  # for 10 crops
            crop_ims.extend(T.mirror_crop(input_im, crop_size=args.crop_size))
        else:
            crop_ims.append(input_im)

        score_vec = np.zeros(args.class_num, dtype=np.float32)
        iter_num = int(len(crop_ims) / args.batch_size)
        timer_pt1 = datetime.datetime.now()
        for j in xrange(iter_num):
            scores = CLS.inference(
                np.asarray(crop_ims, dtype=np.float32)[j * args.batch_size:(j + 1) * args.batch_size],
                output_layer=args.prob_layer
            )
            score_vec += np.sum(scores, axis=0)
        score_index = (-score_vec / len(crop_ims)).argsort()
        timer_pt2 = datetime.datetime.now()

        SET_DICT[i + args.skip_num]['evaluated'] = True
        SET_DICT[i + args.skip_num]['score_vec'] = score_vec / len(crop_ims)

        print 'Testing image: {}/{} {} {}/{} {}s' \
            .format(str(i + 1), str(eval_len - args.skip_num), str(SET_DICT[i + args.skip_num]['path'].split('/')[-1]),
                    str(score_index[0]), str(SET_DICT[i + args.skip_num]['gt']),
                    str((timer_pt2 - timer_pt1).microseconds / 1e6 + (timer_pt2 - timer_pt1).seconds)),

        for j in xrange(len(args.top_k)):
            if SET_DICT[i + args.skip_num]['gt'] in score_index[:args.top_k[j]]:
                accuracy[j] += 1
            tmp_acc = float(accuracy[j]) / float(i + 1)
            if args.top_k[j] == 1:
                print '\ttop_' + str(args.top_k[j]) + ':' + str(tmp_acc),
            else:
                print 'top_' + str(args.top_k[j]) + ':' + str(tmp_acc)
    end_time = datetime.datetime.now()

    w = open(LOG_PTH, 'w')
    s1 = 'Evaluation process ends at: {}. \nTime cost is: {}. '.format(str(end_time), str(end_time - start_time))
    s2 = '\nThe model is: {}. \nThe val file is: {}. \n{} images has been tested, crop_type is: {}, base_size is: {}, ' \
         'crop_size is: {}.'.format(args.model_weights, args.val_file, str(eval_len - args.skip_num),
                                    args.crop_type, str(args.base_size), str(args.crop_size))
    s3 = '\nThe PIXEL_MEANS is: ({}, {}, {}), PIXEL_STDS is : ({}, {}, {}).' \
        .format(str(PIXEL_MEANS[0]), str(PIXEL_MEANS[1]), str(PIXEL_MEANS[2]), str(PIXEL_STDS[0]), str(PIXEL_STDS[1]),
                str(PIXEL_STDS[2]))
    s4 = ''
    for i in xrange(len(args.top_k)):
        _acc = float(accuracy[i]) / float(eval_len - args.skip_num)
        s4 += '\nAccuracy of top_{} is: {}; correct num is {}.'.format(str(args.top_k[i]), str(_acc),
                                                                       str(int(accuracy[i])))
    print s1, s2, s3, s4
    w.write(s1 + s2 + s3 + s4)
    w.close()

    if args.save_score_vec:
        w = open(LOG_PTH.replace('.txt', 'scorevec.txt'), 'w')
        for i in xrange(eval_len - args.skip_num):
            w.write(SET_DICT[i + args.skip_num]['score_vec'])
    w.close()
    print('DONE!')
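
The accuracy bookkeeping above relies on sorting negated scores so that the best class comes first; a tiny standalone illustration of that top-k check:

import numpy as np

score_vec = np.array([0.1, 0.5, 0.2, 0.2], dtype=np.float32)
gt = 2                                       # ground-truth class index
score_index = (-score_vec).argsort()         # class indices, highest score first
for k in (1, 3):
    correct = gt in score_index[:k]
    print('top_%d correct: %s' % (k, correct))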
Example #15
def main():
    a = get_args()

    prev_enc = 0

    def train(i):
        loss = 0

        noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4],
                                      1).cuda() if a.noise > 0 else None
        img_out = image_f(noise)

        if a.sharp != 0:
            lx = torch.mean(
                torch.abs(img_out[0, :, :, 1:] - img_out[0, :, :, :-1]))
            ly = torch.mean(
                torch.abs(img_out[0, :, 1:, :] - img_out[0, :, :-1, :]))
            loss -= a.sharp * (ly + lx)

        micro = 1 - a.macro if a.in_txt2 is None else False
        imgs_sliced = slice_imgs([img_out],
                                 a.samples,
                                 a.modsize,
                                 trform_f,
                                 a.align,
                                 micro=micro)
        out_enc = model_clip.encode_image(imgs_sliced[-1])
        if a.diverse != 0:
            imgs_sliced = slice_imgs([image_f(noise)],
                                     a.samples,
                                     a.modsize,
                                     trform_f,
                                     a.align,
                                     micro=micro)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += a.diverse * torch.cosine_similarity(
                out_enc, out_enc2, dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.in_img is not None and os.path.isfile(a.in_img):  # input image
            loss += sign * 0.5 * torch.cosine_similarity(
                img_enc, out_enc, dim=-1).mean()
        if a.in_txt is not None:  # input text
            loss += sign * torch.cosine_similarity(txt_enc, out_enc,
                                                   dim=-1).mean()
            if a.notext > 0:
                loss -= sign * a.notext * torch.cosine_similarity(
                    txt_plot_enc, out_enc, dim=-1).mean()
        if a.in_txt0 is not None:  # subtract text
            loss += -sign * torch.cosine_similarity(txt_enc0, out_enc,
                                                    dim=-1).mean()
        if a.sync > 0 and a.in_img is not None and os.path.isfile(
                a.in_img):  # image composition
            prog_sync = (a.steps // a.fstep - i) / (a.steps // a.fstep)
            loss += prog_sync * a.sync * sim_loss(F.interpolate(
                img_out, sim_size).float(),
                                                  img_in,
                                                  normalize=True).squeeze()
        if a.in_txt2 is not None:  # input text for micro details
            imgs_sliced = slice_imgs([img_out],
                                     a.samples,
                                     a.modsize,
                                     trform_f,
                                     a.align,
                                     micro=True)
            out_enc2 = model_clip.encode_image(imgs_sliced[-1])
            loss += sign * torch.cosine_similarity(txt_enc2, out_enc2,
                                                   dim=-1).mean()
            del out_enc2
            torch.cuda.empty_cache()
        if a.expand > 0:
            global prev_enc
            if i > 0:
                loss += a.expand * torch.cosine_similarity(
                    out_enc, prev_enc, dim=-1).mean()
            prev_enc = out_enc.detach()

        del img_out, imgs_sliced, out_enc
        torch.cuda.empty_cache()
        assert not isinstance(loss, int), ' Loss not defined, check the inputs'

        if a.prog is True:
            lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
            for g in optimizer.param_groups:
                g['lr'] = lr_cur

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % a.fstep == 0:
            with torch.no_grad():
                img = image_f(contrast=a.contrast).cpu().numpy()[0]
            if (a.sync > 0 and a.in_img is not None) or a.sharp != 0:
                img = img**1.3  # empirical tone mapping
            checkout(img,
                     os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)),
                     verbose=a.verbose)
            pbar.upd()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50': 0.5, 'RN50x4': 0.16, 'RN101': 0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])

    if a.multilang is True:
        model_lang = SentenceTransformer(
            'clip-ViT-B-32-multilingual-v1').cuda()

    def enc_text(txt):
        if a.multilang is True:
            emb = model_lang.encode([txt],
                                    convert_to_tensor=True,
                                    show_progress_bar=False)
        else:
            emb = model_clip.encode_text(clip.tokenize(txt).cuda())
        return emb.detach().clone()

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)
    if a.sync > 0:
        a.samples = int(a.samples * 0.5)

    if a.transform is True:
        trform_f = transforms.transforms_custom
        a.samples = int(a.samples * 0.95)
    else:
        trform_f = transforms.normalize()

    out_name = []
    if a.in_txt is not None:
        if a.verbose is True: print(' ref text: ', basename(a.in_txt))
        if a.translate:
            translator = Translator()
            a.in_txt = translator.translate(a.in_txt, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt)
        txt_enc = enc_text(a.in_txt)
        out_name.append(txt_clean(a.in_txt))

        if a.notext > 0:
            txt_plot = torch.from_numpy(plot_text(a.in_txt, a.modsize) /
                                        255.).unsqueeze(0).permute(0, 3, 1,
                                                                   2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()

    if a.in_txt2 is not None:
        if a.verbose is True: print(' micro text:', basename(a.in_txt2))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt2 = translator.translate(a.in_txt2, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt2)
        txt_enc2 = enc_text(a.in_txt2)
        out_name.append(txt_clean(a.in_txt2))

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        a.samples = int(a.samples * 0.75)
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0)
        txt_enc0 = enc_text(a.in_txt0)
        out_name.append('off-' + txt_clean(a.in_txt0))

    if a.multilang is True: del model_lang

    if a.in_img is not None and os.path.isfile(a.in_img):
        if a.verbose is True: print(' ref image:', basename(a.in_img))
        img_in = torch.from_numpy(
            img_read(a.in_img) / 255.).unsqueeze(0).permute(0, 3, 1, 2).cuda()
        img_in = img_in[:, :3, :, :]  # fix rgb channels
        in_sliced = slice_imgs([img_in],
                               a.samples,
                               a.modsize,
                               transforms.normalize(),
                               a.align,
                               micro=False)[0]
        img_enc = model_clip.encode_image(in_sliced).detach().clone()
        if a.sync > 0:
            sim_loss = lpips.LPIPS(net='vgg', verbose=False).cuda()
            sim_size = [s // 2 for s in a.size]
            img_in = F.interpolate(img_in, sim_size).float()
        else:
            del img_in
        del in_sliced
        torch.cuda.empty_cache()
        out_name.append(basename(a.in_img).replace(' ', '_'))

    params, image_f = fft_image([1, 3, *a.size],
                                resume=a.resume,
                                decay_power=a.decay)
    image_f = to_valid_rgb(image_f, colors=a.colors)

    if a.prog is True:
        lr1 = a.lrate * 2
        lr0 = lr1 * 0.01
    else:
        lr0 = a.lrate
    optimizer = torch.optim.Adam(params, lr0)
    sign = 1. if a.invert is True else -1.

    if a.verbose is True: print(' samples:', a.samples)
    out_name = '-'.join(out_name)
    out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
    tempdir = os.path.join(a.out_dir, out_name)
    os.makedirs(tempdir, exist_ok=True)

    pbar = ProgressBar(a.steps // a.fstep)
    for i in range(a.steps):
        train(i)

    os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' %
              (tempdir, os.path.join(a.out_dir, out_name)))
    shutil.copy(
        img_list(tempdir)[-1],
        os.path.join(a.out_dir, '%s-%d.jpg' % (out_name, a.steps)))
    if a.save_pt is True:
        torch.save(params, '%s.pt' % os.path.join(a.out_dir, out_name))
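
At its core, the training step above maximises CLIP-space agreement between the generated image crops and the text prompt; a hedged, minimal restatement of that objective, with random tensors standing in for real encodings:

import torch
import torch.nn.functional as F

txt_enc = F.normalize(torch.randn(1, 512), dim=-1)      # stand-in text embedding
out_enc = F.normalize(torch.randn(8, 512), dim=-1)      # stand-in embeddings of 8 image crops
loss = -torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
print(loss.item())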
Example #16
def eval_batch():
    eval_len = len(SET_DICT)
    accuracy = np.zeros(len(args.top_k))
    start_time = datetime.datetime.now()

    for i in xrange(eval_len - args.skip_num):
        im = cv2.imread(SET_DICT[i + args.skip_num]['path'])
        im = T.bgr2rgb(im)
        scale_im = T.pil_resize(Image.fromarray(im), args.base_size)
        normalized_im = T.normalize(np.asarray(scale_im) / 255.0, mean=PIXEL_MEANS, std=PIXEL_STDS)
        crop_ims = []
        if args.crop_type == 'center':  # for single crop
            crop_ims.append(T.center_crop(normalized_im, crop_size=args.crop_size))
        elif args.crop_type == 'multi':  # for 10 crops
            crop_ims.extend(T.mirror_crop(normalized_im, crop_size=args.crop_size))
        else:
            crop_ims.append(normalized_im)

        score_vec = np.zeros(args.class_num, dtype=np.float32)
        iter_num = int(len(crop_ims) / args.batch_size)
        timer_pt1 = datetime.datetime.now()
        for j in xrange(iter_num):
            input_data = np.asarray(crop_ims, dtype=np.float32)[j * args.batch_size:(j + 1) * args.batch_size]
            input_data = input_data.transpose(0, 3, 1, 2)
            input_data = torch.autograd.Variable(torch.from_numpy(input_data).cuda(), volatile=True)
            outputs = MODEL(input_data)
            scores = outputs.data.cpu().numpy()
            score_vec += np.sum(scores, axis=0)
        score_index = (-score_vec / len(crop_ims)).argsort() - 1
        timer_pt2 = datetime.datetime.now()

        SET_DICT[i + args.skip_num]['evaluated'] = True
        SET_DICT[i + args.skip_num]['score_vec'] = score_vec / len(crop_ims)

        print 'Testing image: {}/{} {} {}/{} {}s' \
            .format(str(i + 1), str(eval_len - args.skip_num), str(SET_DICT[i + args.skip_num]['path'].split('/')[-1]),
                    str(score_index[0]), str(SET_DICT[i + args.skip_num]['gt']),
                    str((timer_pt2 - timer_pt1).microseconds / 1e6 + (timer_pt2 - timer_pt1).seconds)),

        for j in xrange(len(args.top_k)):
            if SET_DICT[i + args.skip_num]['gt'] in score_index[:args.top_k[j]]:
                accuracy[j] += 1
            tmp_acc = float(accuracy[j]) / float(i + 1)
            if args.top_k[j] == 1:
                print '\ttop_' + str(args.top_k[j]) + ':' + str(tmp_acc),
            else:
                print 'top_' + str(args.top_k[j]) + ':' + str(tmp_acc)
    end_time = datetime.datetime.now()

    w = open(LOG_PTH, 'w')
    s1 = 'Evaluation process ends at: {}. \nTime cost is: {}. '.format(str(end_time), str(end_time - start_time))
    s2 = '\nThe model is: {}. \nThe val file is: {}. \n{} images has been tested, crop_type is: {}, base_size is: {}, ' \
         'crop_size is: {}.'.format(args.model_weights, args.val_file, str(eval_len - args.skip_num),
                                    args.crop_type, str(args.base_size), str(args.crop_size))
    s3 = '\nThe PIXEL_MEANS is: ({}, {}, {}), PIXEL_STDS is : ({}, {}, {}).' \
        .format(str(PIXEL_MEANS[0]), str(PIXEL_MEANS[1]), str(PIXEL_MEANS[2]), str(PIXEL_STDS[0]), str(PIXEL_STDS[1]),
                str(PIXEL_STDS[2]))
    s4 = ''
    for i in xrange(len(args.top_k)):
        _acc = float(accuracy[i]) / float(eval_len - args.skip_num)
        s4 += '\nAccuracy of top_{} is: {}; correct num is {}.'.format(str(args.top_k[i]), str(_acc),
                                                                       str(int(accuracy[i])))
    print s1, s2, s3, s4
    w.write(s1 + s2 + s3 + s4)
    w.close()

    if args.save_score_vec:
        w = open(LOG_PTH.replace('.txt', 'scorevec.txt'), 'w')
        for i in xrange(eval_len - args.skip_num):
            w.write(SET_DICT[i + args.skip_num]['score_vec'])
        w.close()
    print('DONE!')
Example #17
import transforms as t

import numpy as np
import matplotlib.pyplot as plt
from hcat.unet import Unet_Constructor as GUnet
from hcat.loss import dice_loss
import torch

data = dataloader.stack(
    path='/home/chris/Desktop/ColorImages',
    joint_transforms=[
        t.to_float(),
        t.reshape(),
        t.random_crop([512, 512, 30]),
    ],
    image_transforms=[t.normalize([0.5, 0.5, 0.5, 0.5], [0.5, 0.5, 0.5, 0.5])])

if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

print('Initalizing Unet:  ', end='')
unet = GUnet(image_dimensions=3,
             in_channels=4,
             out_channels=1,
             feature_sizes=[16, 32, 64, 128],
             kernel={
                 'conv1': (3, 3, 2),
                 'conv2': (3, 3, 1)
             },
Example #18
from math import sqrt

import numpy as np

from transforms import quat_dot, normalize, unit_x, quat_look_at, compute_grasp, trans_from_quat_point, \
  quat_from_angle_vector, unit_trans, quat_from_trans, unit_z, manip_trans_from_object_trans, trans_from_pose, \
  quat_transform_point
from manipulation.bodies.robot import get_tool_trans
from tools.numerical import PI
from tools.objects import str_object
from manipulation.constants import APPROACH_DISTANCE

APPROACH_VECTOR = APPROACH_DISTANCE*normalize(np.array([1, 0, -1])) # TODO - move this elsewhere

class Contact(object):
  def __init__(self, contact_trans, direction, gripper_config=None, gripper_traj=None): # TODO - fill in
    self.direction = direction
    self.grasp_trans = contact_trans # TODO - rename self.contact_trans
    self.gripper_config = gripper_config
    self.gripper_traj = gripper_traj
  def __repr__(self):
    return self.__class__.__name__ + str_object(self.grasp_trans[:3, 3])

def manip_trans_from_pose_contact(pose, contact):
  return manip_trans_from_object_trans(trans_from_pose(pose.value), contact.grasp_trans)

# NOTE - cannot use center of object to infer approach vector because gripper might not be tangent
def approach_vector_from_pose_contact(pose, contact): # TODO - universal way of inferring approach_vector from manip_trans (probably not possible)
  approach_vector = quat_transform_point(quat_from_trans(manip_trans_from_pose_contact(pose, contact)), APPROACH_VECTOR)
  if contact.grasp_trans[0, 3] > 0: approach_vector[:2] *= -1
  return approach_vector
Example #19
from math import sqrt

import numpy as np

from transforms import quat_dot, normalize, unit_x, quat_look_at, compute_grasp, trans_from_quat_point, \
  quat_from_angle_vector, unit_trans, quat_from_trans, unit_z, manip_trans_from_object_trans, trans_from_pose, \
  quat_transform_point
from manipulation.bodies.robot import get_tool_trans
from misc.numerical import PI
from misc.objects import str_object
from manipulation.constants import APPROACH_DISTANCE
import operator

APPROACH_VECTOR = APPROACH_DISTANCE*normalize(np.array([1, 0, -1])) # TODO - move this elsewhere

class Contact(object):
  def __init__(self, contact_trans, direction, gripper_config=None, gripper_traj=None): # TODO - fill in
    self.direction = direction
    self.grasp_trans = contact_trans # TODO - rename self.contact_trans
    self.gripper_config = gripper_config
    self.gripper_traj = gripper_traj
  def __repr__(self):
    return self.__class__.__name__ + str_object(self.grasp_trans[:3, 3])

def manip_trans_from_pose_contact(pose, contact):
  return manip_trans_from_object_trans(trans_from_pose(pose.value), contact.grasp_trans)

# NOTE - cannot use center of object to infer approach vector because gripper might not be tangent
def approach_vector_from_pose_contact(pose, contact): # TODO - universal way of inferring approach_vector from manip_trans (probably not possible)
  approach_vector = quat_transform_point(quat_from_trans(manip_trans_from_pose_contact(pose, contact)), APPROACH_VECTOR)
  if contact.grasp_trans[0, 3] > 0: approach_vector[:2] *= -1
Example #20
def main():
    a = get_args()

    # Load CLIP models
    model_clip, _ = clip.load(a.model)
    if a.verbose is True: print(' using model', a.model)
    xmem = {'RN50':0.5, 'RN50x4':0.16, 'RN101':0.33}
    if 'RN' in a.model:
        a.samples = int(a.samples * xmem[a.model])
    workdir = os.path.join(a.out_dir, basename(a.in_txt))
    workdir += '-%s' % a.model if 'RN' in a.model.upper() else ''
    os.makedirs(workdir, exist_ok=True)

    if a.diverse != 0:
        a.samples = int(a.samples * 0.5)
            
    if a.transform is True:
        trform_f = transforms.transforms_custom  
        a.samples = int(a.samples * 0.95)
    else:
        trform_f = transforms.normalize()

    if a.in_txt0 is not None:
        if a.verbose is True: print(' subtract text:', basename(a.in_txt0))
        if a.translate:
            translator = Translator()
            a.in_txt0 = translator.translate(a.in_txt0, dest='en').text
            if a.verbose is True: print(' translated to:', a.in_txt0) 
        if a.multilang is True:
            model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()
            txt_enc0 = model_lang.encode([a.in_txt0], convert_to_tensor=True, show_progress_bar=False).detach().clone()
            del model_lang
        else:
            txt_enc0 = model_clip.encode_text(clip.tokenize(a.in_txt0).cuda()).detach().clone()

    # make init
    global params_start, params_ema
    params_shape = [1, 3, a.size[0], a.size[1]//2+1, 2]
    params_start = torch.randn(*params_shape).cuda() # random init
    params_ema = 0.
    if a.resume is not None and os.path.isfile(a.resume):
        if a.verbose is True: print(' resuming from', a.resume)
        params_start = load_params(a.resume).cuda()
        if a.keep > 0:
            params_ema = params_start[0].detach().clone()
    else:
        a.resume = 'init.pt'

    torch.save(params_start, 'init.pt') # final init
    shutil.copy(a.resume, os.path.join(workdir, '000-%s.pt' % basename(a.resume)))
    
    prev_enc = 0
    def process(txt, num):

        sd = 0.01
        if a.keep > 0: sd = a.keep + (1-a.keep) * sd
        params, image_f = fft_image([1, 3, *a.size], resume='init.pt', sd=sd, decay_power=a.decay)
        image_f = to_valid_rgb(image_f, colors = a.colors)

        if a.prog is True:
            lr1 = a.lrate * 2
            lr0 = a.lrate * 0.1
        else:
            lr0 = a.lrate
        optimizer = torch.optim.Adam(params, lr0)
    
        if a.verbose is True: print(' ref text: ', txt)
        if a.translate:
            translator = Translator()
            txt = translator.translate(txt, dest='en').text
            if a.verbose is True: print(' translated to:', txt)
        if a.multilang is True:
            model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()
            txt_enc = model_lang.encode([txt], convert_to_tensor=True, show_progress_bar=False).detach().clone()
            del model_lang
        else:
            txt_enc = model_clip.encode_text(clip.tokenize(txt).cuda()).detach().clone()
        if a.notext > 0:
            txt_plot = torch.from_numpy(plot_text(txt, a.modsize)/255.).unsqueeze(0).permute(0,3,1,2).cuda()
            txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()
        else: txt_plot_enc = None

        out_name = '%03d-%s' % (num+1, txt_clean(txt))
        out_name += '-%s' % a.model if 'RN' in a.model.upper() else ''
        tempdir = os.path.join(workdir, out_name)
        os.makedirs(tempdir, exist_ok=True)
        
        pbar = ProgressBar(a.steps // a.fstep)
        for i in range(a.steps):
            loss = 0

            noise = a.noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if a.noise > 0 else None
            img_out = image_f(noise)
            
            if a.sharp != 0:
                lx = torch.mean(torch.abs(img_out[0,:,:,1:] - img_out[0,:,:,:-1]))
                ly = torch.mean(torch.abs(img_out[0,:,1:,:] - img_out[0,:,:-1,:]))
                loss -= a.sharp * (ly+lx)

            imgs_sliced = slice_imgs([img_out], a.samples, a.modsize, trform_f, a.align, micro=1.)
            out_enc = model_clip.encode_image(imgs_sliced[-1])
            loss -= torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
            if a.notext > 0:
                loss += a.notext * torch.cosine_similarity(txt_plot_enc, out_enc, dim=-1).mean()
            if a.diverse != 0:
                imgs_sliced = slice_imgs([image_f(noise)], a.samples, a.modsize, trform_f, a.align, micro=1.)
                out_enc2 = model_clip.encode_image(imgs_sliced[-1])
                loss += a.diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
                del out_enc2; torch.cuda.empty_cache()
            if a.expand > 0:
                global prev_enc
                if i > 0:
                    loss += a.expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
                prev_enc = out_enc.detach().clone()
            if a.in_txt0 is not None: # subtract text
                loss += torch.cosine_similarity(txt_enc0, out_enc, dim=-1).mean()
            del img_out, imgs_sliced, out_enc; torch.cuda.empty_cache()

            if a.prog is True:
                lr_cur = lr0 + (i / a.steps) * (lr1 - lr0)
                for g in optimizer.param_groups: 
                    g['lr'] = lr_cur
        
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % a.fstep == 0:
                with torch.no_grad():
                    img = image_f(contrast=a.contrast).cpu().numpy()[0]
                if a.sharp != 0:
                    img = img **1.3 # empirical tone mapping
                checkout(img, os.path.join(tempdir, '%04d.jpg' % (i // a.fstep)), verbose=a.verbose)
                pbar.upd()
                del img

        if a.keep > 0:
            global params_start, params_ema
            params_ema = ema(params_ema, params[0].detach().clone(), num+1)
            torch.save((1-a.keep) * params_start + a.keep * params_ema, 'init.pt')
        
        torch.save(params[0], '%s.pt' % os.path.join(workdir, out_name))
        shutil.copy(img_list(tempdir)[-1], os.path.join(workdir, '%s-%d.jpg' % (out_name, a.steps)))
        os.system('ffmpeg -v warning -y -i %s\%%04d.jpg "%s.mp4"' % (tempdir, os.path.join(workdir, out_name)))

    with open(a.in_txt, 'r', encoding="utf-8") as f:
        texts = f.readlines()
        texts = [tt.strip() for tt in texts if len(tt.strip()) > 0 and tt[0] != '#']
    if a.verbose is True: 
        print(' total lines:', len(texts))
        print(' samples:', a.samples)

    for i, txt in enumerate(texts):
        process(txt, i)

    vsteps = int(a.length * 25 / len(texts)) # 25 fps
    tempdir = os.path.join(workdir, '_final')
    os.makedirs(tempdir, exist_ok=True)
    
    def read_pt(file):
        return torch.load(file).cuda()

    if a.verbose is True: print(' rendering complete piece')
    ptfiles = file_list(workdir, 'pt')
    pbar = ProgressBar(vsteps * len(ptfiles))
    for px in range(len(ptfiles)):
        params1 = read_pt(ptfiles[px])
        params2 = read_pt(ptfiles[(px+1) % len(ptfiles)])

        params, image_f = fft_image([1, 3, *a.size], resume=params1, sd=1., decay_power=a.decay)
        image_f = to_valid_rgb(image_f, colors = a.colors)

        for i in range(vsteps):
            with torch.no_grad():
                img = image_f((params2 - params1) * math.sin(1.5708 * i/vsteps)**2)[0].permute(1,2,0)
                img = torch.clip(img*255, 0, 255).cpu().numpy().astype(np.uint8)
            imsave(os.path.join(tempdir, '%05d.jpg' % (px * vsteps + i)), img)
            if a.verbose is True: cvshow(img)
            pbar.upd()

    os.system('ffmpeg -v warning -y -i %s\%%05d.jpg "%s.mp4"' % (tempdir, os.path.join(a.out_dir, basename(a.in_txt))))
    if a.keep > 0: os.remove('init.pt')
Example #21
try:
    d_feats = np.load(d_feats_file)
except OSError as e:
    print(
        'ERROR: File {} not found. Please follow the instructions to download the pre-computed features.'
        .format(d_feats_file))
    sys.exit()

# Load the query image
img = Image.open(dataset.get_query_filename(args.qidx))
# Crop the query ROI
img = img.crop(tuple(dataset.get_query_roi(args.qidx)))
# Apply transformations
img = trf.resize_image(img, 800)
I = trf.to_tensor(img)
I = trf.normalize(
    I, dict(rgb_means=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
I = I.unsqueeze(0).to(device)
# Forward pass to extract the features
with torch.no_grad():
    print('Extracting the representation of the query...')
    q_feat = model(I).numpy()
print('Done\n')

# Rank the database and visualize the top-k most similar images in the database
dataset.vis_top(d_feats,
                args.qidx,
                q_feat=q_feat,
                topk=args.topk,
                out_image_file='out.png')
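
For reference, the ranking inside dataset.vis_top presumably amounts to a dot product between the (L2-normalized) query descriptor and the database matrix; a self-contained sketch with random stand-ins for the features:

import numpy as np

d_feats_demo = np.random.randn(100, 2048).astype(np.float32)
d_feats_demo /= np.linalg.norm(d_feats_demo, axis=1, keepdims=True)   # unit-length descriptors
q_feat_demo = d_feats_demo[:1]                       # pretend the first entry is the query
sims = (d_feats_demo @ q_feat_demo.T).ravel()        # cosine similarities
top5 = np.argsort(-sims)[:5]                         # best matches first
print(top5)                                          # index 0 (the query itself) ranks first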