Code Example #1
def detect():
    config.batch_size = 1
    imgs = tf.placeholder(shape=(1, 512, 512, 3), dtype=tf.float32)
    #ig = AddCoords(x_dim=512, y_dim=512)(imgs)
    pred_loc, pred_confs, vbs = retinanet.model(imgs, config)
    box, score, pp = predict(imgs, pred_loc, pred_confs, vbs, config.Config)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, '/home/dsl/all_check/face_detect/resnet50_pasc/model.ckpt-199863')
        for ip in glob.glob('/media/dsl/20d6b919-92e1-4489-b2be-a092290668e4/VOCdevkit/VOCdevkit/VOC2012/JPEGImages/*.jpg'):
            print(ip)
            img = cv2.imread(ip)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            org, window, scale, padding, crop = utils.resize_image(img, min_dim=512, max_dim=512)

            #img = (org/ 255.0-0.5)*2
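            # Subtract the per-channel colour mean (assumed to match the mean used at training time, RGB order)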
            img = org - [123.15, 115.90, 103.06]
            img = np.expand_dims(img, axis=0)
            t = time.time()
            bx, sc, p = sess.run([box, score, pp], feed_dict={imgs: img})
            print(time.time()-t)
            bxx = []
            cls = []
            scores = []
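            # Keep only detections whose confidence exceeds 0.3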
            for s in range(len(p)):
                if sc[s]>0.3:
                    bxx.append(bx[s])
                    cls.append(p[s])
                    scores.append(sc[s])
            if len(bxx) > 0:
                #visual.display_instances(org,np.asarray(bxx)*300)
                visual.display_instances_title(org, np.asarray(bxx) * 512, class_ids=np.asarray(cls),
                                               class_names=config.VOC_CLASSES, scores=scores)
Code Example #2
def video():
    config.batch_size = 1
    ig = tf.placeholder(shape=(1, 512, 512, 3), dtype=tf.float32)
    pred_loc, pred_confs, vbs = retinanet.model(ig, config)
    box, score, pp = predict(ig, pred_loc, pred_confs, vbs, config.Config)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, '/home/dsl/all_check/face_detect/resnet50/model.ckpt-18756')
        cap = cv2.VideoCapture('/media/dsl/20d6b919-92e1-4489-b2be-a092290668e4/face_detect/jijing.mp4')
        #cap = cv2.VideoCapture(0)
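        # OpenCV capture property ids 3 and 4 are the frame width and height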
        cap.set(3, 320 * 3)
        cap.set(4, 320 * 3)
        t1 = time.time()
        while True:
            ret, frame = cap.read()

            if not ret:
                continue

            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            org, window, scale, padding, crop = utils.resize_image(img, min_dim=config.Config['min_dim'],
                                                                   max_dim=config.Config['min_dim'])

            img = org - [123.15, 115.90, 103.06]
            img = np.expand_dims(img, axis=0)
            t = time.time()

            bx, sc, p = sess.run([box, score, pp], feed_dict={ig: img})

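            # Rough FPS estimate from the single-frame inference time, kept to one decimal place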
            fps = int(1 / (time.time() - t) * 10) / 10.0

            cv2.putText(frame, 'fps:' + str(fps), (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), lineType=cv2.LINE_AA)

            bxx = []
            cls = []
            scores = []
            for s in range(len(p)):
                if sc[s] > 0.4:
                    bxx.append(bx[s])
                    cls.append(p[s])
                    scores.append(sc[s])
            if len(bxx) > 0:
                finbox = utils.revert_image(scale, padding, config.Config['min_dim'], np.asarray(bxx))
                for ix, s in enumerate(finbox):
                    cv2.rectangle(frame, pt1=(s[0], s[1]), pt2=(s[2], s[3]), color=(0, 255, 0), thickness=2)
                    cv2.putText(frame, config.VOC_CLASSES[cls[ix]] + '_' + str(scores[ix])[0:4], (s[0], s[1]),
                                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), lineType=cv2.LINE_AA)

            cv2.imshow('frame', frame)

            if cv2.waitKeyEx(1) & 0xFF == ord('q'):
                break
        print('ss')

        cap.release()
        cv2.destroyAllWindows()
Code Example #3
def train():
    img = tf.placeholder(shape=[config.batch_size, config.Config['min_dim'], config.Config['min_dim'], 3], dtype=tf.float32)
    #ig = AddCoords(x_dim=512,y_dim=512)(img)
    anchors_num = sum(
        [config.Config['feature_maps'][s] ** 2 * config.Config['aspect_num'][s] for s in range(5)])
    loc = tf.placeholder(shape=[config.batch_size, anchors_num, 4], dtype=tf.float32)
    conf = tf.placeholder(shape=[config.batch_size, anchors_num], dtype=tf.float32)
    pred_loc, pred_confs, vbs = retinanet.model(img,config)
    train_tensors = get_loss(conf, loc, pred_loc, pred_confs,config)
    gen = data_gen.get_batch_inception(batch_size=config.batch_size,image_size=config.Config['min_dim'],max_detect=50)

    global_step = slim.get_or_create_global_step()
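    # Staircase exponential decay: the learning rate is multiplied by 0.7 every 40,000 steps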
    lr = tf.train.exponential_decay(
        learning_rate=0.001,
        global_step=global_step,
        decay_steps=40000,
        decay_rate=0.7,
        staircase=True)

    tf.summary.scalar('lr', lr)
    sum_op = tf.summary.merge_all()

    optimizer = tf.train.MomentumOptimizer(learning_rate=lr,momentum=0.9)
    train_op = slim.learning.create_train_op(train_tensors, optimizer)
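    # Collect only the ResNet-50 backbone variables (skipping Momentum slots) so the pretrained checkpoint can be restored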
    vbs = []
    for s in slim.get_variables():
        print(s.name)
        if 'resnet_v2_50' in s.name and 'Momentum' not in s.name:
            print(s.name)
            vbs.append(s)

    saver = tf.train.Saver(vbs)

    def restore(sess):
        saver.restore(sess, config.check_dir)


    sv = tf.train.Supervisor(logdir=config.save_dir, summary_op=None, init_fn=restore)

    with sv.managed_session() as sess:
        for step in range(200000):
            print('       '+' '.join(['*']*(step%10)))
            images, true_box, true_label = gen.get()  # pull the next batch from the loader created above

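            # Encode the ground-truth boxes/labels into per-anchor localization and classification targets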
            loct, conft = np_utils.get_loc_conf(true_box, true_label, batch_size=config.batch_size,cfg=config.Config)
            feed_dict = {img: images, loc: loct,
                         conf: conft}

            ls, step = sess.run([train_op, global_step], feed_dict=feed_dict)

            if step % 10 == 0:
                print('step:' + str(step) +
                      ' ' + 'class_loss:' + str(ls[0]) +
                      ' ' + 'loc_loss:' + str(ls[1])
                      )
                summaries = sess.run(sum_op, feed_dict=feed_dict)
                sv.summary_computed(sess, summaries)
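
For reference, the schedule configured above starts from a base rate of 0.001 and multiplies it by 0.7 at every 40,000-step boundary. A minimal standalone sketch of the resulting values (plain Python, independent of the training code; decayed_lr is a hypothetical helper, not part of the repository):

def decayed_lr(step, base_lr=0.001, decay_rate=0.7, decay_steps=40000):
    # Mirrors tf.train.exponential_decay(..., staircase=True)
    return base_lr * decay_rate ** (step // decay_steps)

for step in (0, 40000, 80000, 120000, 160000, 200000):
    print(step, decayed_lr(step))  # 0.001, 0.0007, 0.00049, ...
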
Code Example #4
def main(args=None):
    global thres
    global rel_thresh
    global attr_thresh

    parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.')
    parser.add_argument('--data_path', help='Path to COCO directory')
    parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)')
    parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)')
    parser.add_argument('--net', help='Network to use', default='fasterrcnn')
    parser.add_argument('--set', help='Set on which evaluation will be performed', default='validation')
    parser.add_argument('--store_detections', action='store_true', default=False,
                        help='Cache all detections with very low threshold in order to enable filtering after extraction')
    parser.add_argument('--load_detections', action='store_true', default=False,
                        help='Load cached detections')

    parser.add_argument('--model_rel', help='Path to model (.pt) file for relationships.', default=None)
    parser.add_argument('--model_attr', help='Path to model (.pt) file for attributes.', default=None)
    parser.add_argument('--model_detector', help='Path to model (.pt) file for the detector.')
    parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50)

    parser = parser.parse_args(args)

    assert parser.model_rel is not None and parser.model_attr is not None and parser.model_detector is not None, \
           'Model snapshots have to be specified!'
    assert not (parser.load_detections and parser.store_detections), \
        'load_detections and store_detections are mutually exclusive'

    det_output_path = os.path.split(parser.model_rel)[0]

    if parser.dataset == 'openimages':
        dataset_val = OidDatasetVRD(parser.data_path, subset=parser.set,
                                    transform=Compose([ToTensor()]))
    else:
        raise ValueError('Dataset type not understood (must be csv or coco), exiting.')

    #sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
    dataloader_val = DataLoader(dataset_val, num_workers=1, collate_fn=collate_fn, batch_size=1)

    # Create the model
    detector = create_detection_model(dataset_val.num_classes(), parser, box_score_thresh=thres)
    model = VRD(detector, dataset=dataset_val, train_relationships=parser.model_rel is not None,
                train_attributes=parser.model_attr is not None, max_objects=max_objects)

    # Load the detector
    checkpoint = torch.load(parser.model_detector, map_location=lambda storage, loc: storage)
    weights = checkpoint['model']
    weights = {k.replace('module.', ''): v for k, v in weights.items()}
    model.detector.load_state_dict(weights)
    print('Detector correctly loaded!')

    # Load the attributes, if needed
    if parser.model_rel:
        checkpoint = torch.load(parser.model_rel, map_location=lambda storage, loc: storage)
        weights = checkpoint['model_rel']
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.relationships_net.load_state_dict(weights)
        print('Relationships correctly loaded!')

    if parser.model_attr:
        checkpoint = torch.load(parser.model_attr, map_location=lambda storage, loc: storage)
        weights = checkpoint['model_attr']
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.attributes_net.load_state_dict(weights)
        print('Attributes correctly loaded!')

    if use_gpu:
        model = model.cuda()

    model.eval()

    all_detections = []
    if parser.load_detections or parser.store_detections:
        print('Opening detections database file...')
        flag = 'r' if parser.load_detections else 'c'
        loaded_detections = shelve.open(os.path.join(det_output_path, 'cached_detections_detthr{}.db'.format(thres)), flag=flag)

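    # For each image, either read cached detections from the shelve database or run the full VRD model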
    for idx, data in enumerate(tqdm.tqdm(dataloader_val)):
        if parser.load_detections:
            loaded_det = loaded_detections[str(idx)]
            scores = loaded_det[0]
            classification = loaded_det[1]
            boxes = loaded_det[2]
            relationships = loaded_det[3]
            rel_scores = loaded_det[4]
            attributes = loaded_det[5]
            attr_scores = loaded_det[6]
        else:
            with torch.no_grad():
                st = time.time()

                images, targets = data

                # targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
                if use_gpu:
                    input_images = list(image.cuda().float() for image in images)
                else:
                    input_images = list(image.float() for image in images)
                # TODO: adapt retinanet output to the one by torchvision 0.3
                # scores, classification, transformed_anchors = model(data_img.float())
                outputs = model(input_images)
                outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]

            output = outputs[0]  # take the only batch
            scores = output['scores']
            classification = output['labels']
            boxes = output['boxes']
            relationships = output['relationships']
            rel_scores = output['relationships_scores']
            attributes = output['attributes']
            attr_scores = output['attributes_scores']

        if parser.store_detections:
            loaded_detections[str(idx)] = [scores, classification, boxes, relationships, rel_scores, attributes, attr_scores]
        else:
            '''if parser.load_detections:
                pdb.set_trace()
                # filter objects, relationships and attributes
                filtered_idxs = np.where(scores > thres)[0]
                scores = scores[filtered_idxs]
                classification = classification[filtered_idxs]
                boxes = boxes[filtered_idxs]
                relationships = relationships[np.ix_(filtered_idxs, filtered_idxs)]
                rel_scores = rel_scores[np.ix_(filtered_idxs, filtered_idxs)]
                attributes = attributes[filtered_idxs]
                attr_scores = attr_scores[filtered_idxs]
            '''
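            # Accumulate (subject, relation, object) triplets for this image; attribute predictions are encoded as 'is' relations with label -1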
            subj_boxes_out = []
            subj_labels_out = []
            obj_boxes_out = []
            obj_labels_out = []
            rel_labels_out = []
            rel_scores_out = []

            if len(boxes) != 0:
                # num_objects = min(boxes.shape[0], max_objects)

                # Collect objects and attributes
                for j in range(attributes.shape[0]):
                    bbox = boxes[j, :4]
                    attr = attributes[j, 0].item() if parser.model_attr is not None and attr_scores[j, 0] > attr_thresh else 0      # TODO: only the top rank attribute is considered, generalize better!
                    # We add an 'is' relation. 'is' relation is mapped to relation index of -1.
                    if attr != 0:
                        subj_boxes_out.append(bbox)
                        obj_boxes_out.append(bbox)
                        rel_labels_out.append(-1)
                        rel_scores_out.append(attr_scores[j, 0])
                        subj_labels_out.append(int(classification[j]))
                        obj_labels_out.append(attr)
                # Collect relationships
                for s_ind in range(relationships.shape[0]):
                    for o_ind in range(relationships.shape[1]):
                        subj = boxes[s_ind, :4]
                        obj = boxes[o_ind, :4]
                        rel = relationships[s_ind, o_ind].item() if rel_scores[s_ind, o_ind] > rel_thresh else 0
                        if rel != 0:
                            subj_boxes_out.append(subj)
                            obj_boxes_out.append(obj)
                            rel_labels_out.append(rel)
                            rel_scores_out.append(rel_scores[s_ind, o_ind])
                            subj_labels_out.append(int(classification[s_ind]))
                            obj_labels_out.append(int(classification[o_ind]))

            all_detections.append([idx, subj_boxes_out, subj_labels_out, obj_boxes_out, obj_labels_out, rel_labels_out, rel_scores_out])
            # if idx == 400:
            #    break

    if not parser.store_detections:
        print('Evaluating...')
        # TODO: add identification parameter to evaluate so that detections from different checkpoints are not overwritten
        dataset_val.evaluate(all_detections, det_output_path, file_identifier='{}_relthr{}_attrthr{}_detthr{}'.format(parser.set, rel_thresh, attr_thresh, thres))
        print('DONE!')
Code Example #5
def main(args=None):
    parser = argparse.ArgumentParser(
        description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--dataset',
                        help='Dataset type, must be one of csv or coco.')
    parser.add_argument('--data_path', help='Path to COCO directory')
    parser.add_argument('--csv_classes',
                        help='Path to file containing class list (see readme)')
    parser.add_argument(
        '--csv_val',
        help=
        'Path to file containing validation annotations (optional, see readme)'
    )
    parser.add_argument('--net', help='Network to use', default='fasterrcnn')

    parser.add_argument('--model', help='Path to model (.pt) file.')
    parser.add_argument(
        '--depth',
        help='Resnet depth, must be one of 18, 34, 50, 101, 152',
        type=int,
        default=50)

    parser = parser.parse_args(args)

    if parser.dataset == 'coco':
        raise NotImplementedError()
        dataset_val = CocoDataset(parser.data_path,
                                  set_name='val2017',
                                  transform=Compose([Normalizer(),
                                                     Resizer()]))
    elif parser.dataset == 'openimages':
        dataset_val = OidDataset(parser.data_path,
                                 subset='validation',
                                 transform=Compose([ToTensor()]))
    elif parser.dataset == 'csv':
        raise NotImplementedError()
        dataset_val = CSVDataset(train_file=parser.csv_train,
                                 class_list=parser.csv_classes,
                                 transform=Compose([Normalizer(),
                                                    Resizer()]))
    else:
        raise ValueError(
            'Dataset type not understood (must be csv or coco), exiting.')

    sampler_val = AspectRatioBasedSampler(dataset_val,
                                          batch_size=1,
                                          drop_last=False)
    dataloader_val = DataLoader(dataset_val,
                                num_workers=1,
                                collate_fn=collate_fn,
                                batch_sampler=sampler_val)

    # Create the model
    model = create_detection_model(dataset_val.num_classes(), parser)

    checkpoint = torch.load(parser.model,
                            map_location=lambda storage, loc: storage)
    weights = checkpoint['model']
    weights = {k.replace('module.', ''): v for k, v in weights.items()}
    model.load_state_dict(weights)

    if use_gpu:
        model = model.cuda()

    model.eval()

    def draw_caption(image, box, caption):

        b = np.array(box).astype(int)
        cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN,
                    1, (0, 0, 0), 2)
        cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN,
                    1, (255, 255, 255), 1)

    for idx, data in enumerate(dataloader_val):
        with torch.no_grad():
            st = time.time()

            images, targets = data

            # targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
            if use_gpu:
                input_images = list(image.cuda().float() for image in images)
            else:
                input_images = list(image.float() for image in images)
            # TODO: adapt retinanet output to the one by torchvision 0.3
            # scores, classification, transformed_anchors = model(data_img.float())
            outputs = model(input_images)
            outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]

            output = outputs[0]  # take the only batch
            scores = output['scores']
            classification = output['labels']
            transformed_anchors = output['boxes']
            # from here, interface to the code already written in the original repo

            print('Elapsed time: {}'.format(time.time() - st))
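            # Keep detections scoring above 'thres' (assumed to be a module-level threshold) and convert the tensor back to an 8-bit image for drawing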
            idxs = np.where(scores > thres)
            img = np.array(255 * images[0]).copy()

            img[img < 0] = 0
            img[img > 255] = 255

            img = np.transpose(img, (1, 2, 0))

            img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
            '''
            # Visualize ground truth bounding boxes
            for bbox, label in zip(targets[0]['boxes'], targets[0]['labels']):
                # bbox = transformed_anchors[idxs[0][j], :]
                x1 = int(bbox[0])
                y1 = int(bbox[1])
                x2 = int(bbox[2])
                y2 = int(bbox[3])
                label_name = dataset_val.labels[int(label)]
                draw_caption(img, (x1, y1, x2, y2), label_name)

                cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=1)
                print('GT: '+label_name)
            '''

            for j in range(idxs[0].shape[0]):
                bbox = transformed_anchors[idxs[0][j], :]
                x1 = int(bbox[0])
                y1 = int(bbox[1])
                x2 = int(bbox[2])
                y2 = int(bbox[3])
                label_name = dataset_val.labels[int(
                    classification[idxs[0][j]])]
                draw_caption(img, (x1, y1, x2, y2), label_name)

                cv2.rectangle(img, (x1, y1), (x2, y2),
                              color=(0, 0, 255),
                              thickness=2)
                print('Detection: ' + label_name)

            cv2.imshow('img', img)
            cv2.waitKey(0)
Code Example #6
def detect():
    rt = '/media/dsl/20d6b919-92e1-4489-b2be-a092290668e4/VOCdevkit/VOCdevkit/VOC2007/'
    dts = '/media/dsl/20d6b919-92e1-4489-b2be-a092290668e4/VOCdevkit/VOCdevkit/VOC2007/ImageSets/Main/test.txt'
    config.batch_size = 1
    ig = tf.placeholder(shape=(1, 512, 512, 3), dtype=tf.float32)
    pred_loc, pred_confs, vbs = retinanet.model(ig, config)
    box, score, pp = predict(ig, pred_loc, pred_confs, vbs, config)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(
            sess,
            '/home/dsl/all_check/face_detect/resnet50_pasc/model.ckpt-199863')
        with open(dts) as f:
            ct = 1
            total_aps = []
            for s in f.readlines():
                img_id = s.replace('\n', '')

                img_path = os.path.join(rt, 'JPEGImages', img_id + '.jpg')
                img = cv2.imread(img_path)

                height, width, channels = img.shape

                gt_box, gt_cls = parse_rec(
                    os.path.join(rt, 'Annotations', img_id + '.xml'), height,
                    width)

                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                org, window, scale, padding, crop = utils.resize_image(
                    img, min_dim=512, max_dim=512)

                img = (org / 255.0 - 0.5) * 2
                img = np.expand_dims(img, axis=0)
                t = time.time()
                bx, sc, p = sess.run([box, score, pp], feed_dict={ig: img})

                bxx = []
                cls = []
                scores = []
                for kk in range(len(p)):
                    if sc[kk] > 0.4:
                        bxx.append(bx[kk])
                        cls.append(p[kk])
                        scores.append(sc[kk])

                if len(cls) > 0:
                    finbox = utils.revert_image(scale, padding,
                                                config.Config['min_dim'],
                                                np.asarray(bxx))
                    finbox = np.asarray(finbox, np.float32)
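                    # Scale the predicted boxes to [0, 1] relative coordinates before computing AP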
                    finbox[:, 0] = finbox[:, 0] * 1.0 / width
                    finbox[:, 1] = finbox[:, 1] * 1.0 / height
                    finbox[:, 2] = finbox[:, 2] * 1.0 / width
                    finbox[:, 3] = finbox[:, 3] * 1.0 / height

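                    # compute_ap yields this image's AP; total_aps tracks the running mean over the test set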
                    mAP, precisions, recalls, overlaps = eval_utils.compute_ap(
                        gt_boxes=np.asarray(gt_box),
                        gt_class_ids=np.asarray(gt_cls),
                        pred_boxes=finbox,
                        pred_class_ids=np.asarray(cls),
                        pred_scores=np.asarray(scores))
                    print(mAP)
                    print(precisions)
                    total_aps.append(mAP)

                    print(sum(total_aps) / len(total_aps))
                ct = ct + 1
                visual.display_instances_title(org,
                                               np.asarray(bxx) * 512,
                                               class_ids=np.asarray(cls),
                                               class_names=config.VOC_CLASSES,
                                               scores=scores)
Code Example #7
def main(args=None):
    parser = argparse.ArgumentParser(
        description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--dataset',
                        help='Dataset type, must be one of csv or coco.')
    parser.add_argument('--data_path', help='Path to COCO directory')
    parser.add_argument('--csv_classes',
                        help='Path to file containing class list (see readme)')
    parser.add_argument(
        '--csv_val',
        help=
        'Path to file containing validation annotations (optional, see readme)'
    )
    parser.add_argument('--net', help='Network to use', default='fasterrcnn')
    parser.add_argument('--set',
                        help='Set on which evaluation will be performed',
                        default='validation')

    parser.add_argument('--model_rel',
                        help='Path to model (.pt) file for relationships.',
                        default=None)
    parser.add_argument('--model_attr',
                        help='Path to model (.pt) file for attributes.',
                        default=None)
    parser.add_argument('--model_detector',
                        help='Path to model (.pt) file for the detector.')
    parser.add_argument(
        '--depth',
        help='Resnet depth, must be one of 18, 34, 50, 101, 152',
        type=int,
        default=50)

    parser = parser.parse_args(args)

    if parser.dataset == 'openimages':
        dataset_val = OidDatasetVRD(parser.data_path,
                                    subset=parser.set,
                                    transform=Compose([ToTensor()]))
    else:
        raise ValueError(
            'Dataset type not understood (must be csv or coco), exiting.')

    #sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
    dataloader_val = DataLoader(dataset_val,
                                num_workers=1,
                                collate_fn=collate_fn,
                                batch_size=1,
                                shuffle=True)

    # Create the model
    detector = create_detection_model(dataset_val.num_classes(),
                                      parser,
                                      box_score_thresh=thres)
    model = VRD(detector,
                dataset=dataset_val,
                train_relationships=parser.model_rel is not None,
                train_attributes=parser.model_attr is not None,
                max_objects=max_objects)

    # Load the detector
    checkpoint = torch.load(parser.model_detector,
                            map_location=lambda storage, loc: storage)
    weights = checkpoint['model']
    weights = {k.replace('module.', ''): v for k, v in weights.items()}
    model.detector.load_state_dict(weights)
    print('Detector correctly loaded!')

    # Load the attributes, if needed
    if parser.model_rel:
        checkpoint = torch.load(parser.model_rel,
                                map_location=lambda storage, loc: storage)
        weights = checkpoint['model_rel']
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.relationships_net.load_state_dict(weights)
        print('Relationships correctly loaded!')

    if parser.model_attr:
        checkpoint = torch.load(parser.model_attr,
                                map_location=lambda storage, loc: storage)
        weights = checkpoint['model_attr']
        weights = {k.replace('module.', ''): v for k, v in weights.items()}
        model.attributes_net.load_state_dict(weights)
        print('Attributes correctly loaded!')

    if use_gpu:
        model = model.cuda()

    model.eval()

    def draw_object_bb(image, box, caption):
        b = np.array(box).astype(int)
        cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN,
                    1, (0, 0, 0), 2)
        cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN,
                    1, (255, 255, 255), 1)
        cv2.rectangle(image, (b[0], b[1]), (b[2], b[3]),
                      color=(0, 0, 255),
                      thickness=2)

    def draw_relationship(image, subj, obj, rel_name):
        # OpenCV needs integer pixel coordinates, so cast the endpoints and use an integer midpoint
        cv2.arrowedLine(image, (int(subj[0]), int(subj[1])), (int(obj[0]), int(obj[1])),
                        (255, 0, 0), 2, tipLength=0.02)
        cv2.putText(image, rel_name,
                    (int((subj[0] + obj[0]) // 2), int((subj[1] + obj[1]) // 2)),
                    cv2.FONT_HERSHEY_PLAIN, 1, (0, 255, 0), 2)

    for idx, data in enumerate(dataloader_val):
        with torch.no_grad():
            st = time.time()

            images, targets = data

            # targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
            if use_gpu:
                input_images = list(image.cuda().float() for image in images)
            else:
                input_images = list(image.float() for image in images)
            # TODO: adapt retinanet output to the one by torchvision 0.3
            # scores, classification, transformed_anchors = model(data_img.float())
            outputs = model(input_images)
            outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]

            output = outputs[0]  # take the only batch
            scores = output['scores']
            classification = output['labels']
            boxes = output['boxes']
            if parser.model_rel:
                relationships = output['relationships']
                rel_scores = output['relationships_scores']
            if parser.model_attr:
                attributes = output['attributes']
                attr_scores = output['attributes_scores']
            # from here, interface to the code already written in the original repo

            print('Elapsed time: {}'.format(time.time() - st))
            img = np.array(255 * images[0]).copy()

            img[img < 0] = 0
            img[img > 255] = 255

            img = np.transpose(img, (1, 2, 0))

            img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
            '''
            # Visualize ground truth bounding boxes
            for bbox, label in zip(targets[0]['boxes'], targets[0]['labels']):
                # bbox = transformed_anchors[idxs[0][j], :]
                x1 = int(bbox[0])
                y1 = int(bbox[1])
                x2 = int(bbox[2])
                y2 = int(bbox[3])
                label_name = dataset_val.labels[int(label)]
                draw_caption(img, (x1, y1, x2, y2), label_name)

                cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=1)
                print('GT: '+label_name)
            '''
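            # Draw the detected objects (with their top-scoring attribute, if any) and the predicted relationships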
            if len(boxes) != 0:

                # Draw objects
                # Iterate over detected boxes (attributes may be absent when --model_attr is not given)
                for j in range(boxes.shape[0]):
                    bbox = boxes[j, :4].int()
                    # TODO: only the top rank attribute is considered, generalize better!
                    attr = attributes[j, 0].item() if parser.model_attr is not None and attr_scores[j, 0] > attr_thresh else 0
                    label_name = dataset_val.labels[int(classification[j])]
                    attr_name = ': ' + dataset_val.attr_id_to_labels[attr] if attr != 0 else ''
                    draw_object_bb(img, bbox, label_name + attr_name)
                    print('Detection: ' + label_name)

                # Draw relationships
                if parser.model_rel:
                    for s_ind in range(relationships.shape[0]):
                        for o_ind in range(relationships.shape[1]):
                            subj = boxes[s_ind, :4].int()
                            obj = boxes[o_ind, :4].int()
                            rel = relationships[s_ind, o_ind].item() if rel_scores[s_ind, o_ind] > rel_thresh else 0
                            if rel != 0:
                                rel_name = dataset_val.rel_id_to_labels[rel]
                                draw_relationship(img, subj, obj, rel_name)

            cv2.imshow('img', img)
            cv2.waitKey(0)
Code Example #8
def main(args=None):
    parser = argparse.ArgumentParser(
        description='Simple training script for training a RetinaNet network.')

    parser.add_argument('--dataset',
                        help='Dataset type, must be one of csv or coco.')
    parser.add_argument('--data_path', help='Path to COCO directory')
    parser.add_argument('--csv_classes',
                        help='Path to file containing class list (see readme)')
    parser.add_argument(
        '--csv_val',
        help=
        'Path to file containing validation annotations (optional, see readme)'
    )
    parser.add_argument('--net', help='Network to use', default='fasterrcnn')
    parser.add_argument('--set',
                        help='Set on which evaluation will be performed',
                        default='validation')

    parser.add_argument('--model', help='Path to model (.pt) file.')
    parser.add_argument(
        '--depth',
        help='Resnet depth, must be one of 18, 34, 50, 101, 152',
        type=int,
        default=50)

    parser = parser.parse_args(args)

    if parser.dataset == 'coco':
        raise NotImplementedError()
        dataset = CocoDataset(parser.data_path,
                              set_name='val2017',
                              transform=Compose([Normalizer(),
                                                 Resizer()]))
    elif parser.dataset == 'openimages':
        dataset = OidDataset(parser.data_path,
                             subset=parser.set,
                             transform=Compose([ToTensor()]))
    elif parser.dataset == 'csv':
        raise NotImplementedError()
        dataset = CSVDataset(train_file=parser.csv_train,
                             class_list=parser.csv_classes,
                             transform=Compose([Normalizer(),
                                                Resizer()]))
    else:
        raise ValueError(
            'Dataset type not understood (must be csv or coco), exiting.')

    # sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False)
    dataloader = DataLoader(dataset,
                            num_workers=1,
                            collate_fn=collate_fn,
                            shuffle=False)

    # Create the model
    model = create_detection_model(dataset.num_classes(), parser)

    checkpoint = torch.load(parser.model,
                            map_location=lambda storage, loc: storage)
    weights = checkpoint['model']
    weights = {k.replace('module.', ''): v for k, v in weights.items()}
    model.load_state_dict(weights)

    if use_gpu:
        model = model.cuda()

    model.eval()

    all_detections = []
    det_output_path = os.path.split(parser.model)[0]

    for idx, data in enumerate(tqdm.tqdm(dataloader)):
        with torch.no_grad():
            st = time.time()

            images, targets = data

            # targets = [{k: v.cuda() for k, v in t.items()} for t in targets]
            if use_gpu:
                input_images = list(image.cuda().float() for image in images)
            else:
                input_images = list(image.float() for image in images)

            outputs = model(input_images)
            outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]

            output = outputs[0]  # take the only batch
            scores = output['scores']
            classification = output['labels']
            transformed_anchors = output['boxes']
            # from here, interface to the code already written in the original repo

            # TODO: 0.5 should be a parameter in a configuration file.. that hopefully should be created and handled..
            det_idxs = np.where(scores > det_thres)

            # Index directly with the array of above-threshold indices
            bboxes = transformed_anchors[det_idxs[0], :].cpu().numpy()
            labels = classification[det_idxs[0]].cpu().numpy()
            scores = scores[det_idxs[0]].cpu().numpy()

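            # Pack this image's detections as [image index, boxes, labels, scores] for the dataset evaluator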
            packed_detections = [idx, bboxes, labels, scores]
            all_detections.append(packed_detections)

            #if idx == 3:
            #    break

    print('Evaluating...')
    # TODO: add identification parameter to evaluate so that detections from different checkpoints are not overwritten
    dataset.evaluate(all_detections,
                     det_output_path,
                     file_identifier='{}_IoU{}'.format(parser.set, det_thres))
    print('DONE!')