Example #1
# Imports assumed from tensorpack's ImageNet example utilities;
# fbresnet_augmentor is defined alongside this function there.
import multiprocessing

import cv2
from tensorpack.dataflow import (AugmentImageComponent, BatchData,
                                 MultiThreadMapData, PrefetchDataZMQ,
                                 dataset, imgaug)
from tensorpack.utils import logger


def get_imagenet_dataflow(datadir,
                          name,
                          batch_size,
                          augmentors=None,
                          parallel=None):
    """
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns: A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    isTrain = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(isTrain)
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # decode as BGR, matching the docstring's contract
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
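
For context, any tensorpack DataFlow returned here is consumed by calling reset_state() once and then iterating; a minimal usage sketch, with an illustrative dataset path and batch size:

ds = get_imagenet_dataflow('/path/to/ILSVRC12', 'train', batch_size=64)
ds.reset_state()               # required once before iterating any DataFlow
for images, labels in ds:      # a batch of BGR images and integer labels
    break                      # feed each batch to the trainer instead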
Example #2
# Imports assumed from tensorpack; MoCoMapper is defined in the MOCO
# example this snippet comes from and is not shown here.
import multiprocessing as mp

from tensorpack.dataflow import MultiProcessMapAndBatchDataZMQ, dataset, imgaug


def get_moco_dataflow(datadir, batch_size, augmentors):
    """
    Dataflow for training MOCO.
    """
    augmentors = imgaug.AugmentorList(augmentors)
    parallel = min(30, mp.cpu_count())  # tuned on a 40-core / 80-thread machine
    ds = dataset.ILSVRC12Files(datadir, 'train', shuffle=True)
    ds = MultiProcessMapAndBatchDataZMQ(ds,
                                        parallel,
                                        MoCoMapper(augmentors),
                                        batch_size,
                                        buffer_size=5000)
    return ds
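
Unlike Example #1, which maps and batches in separate stages, MultiProcessMapAndBatchDataZMQ runs the mapper and the batching inside one pool of worker processes, saving a serialization hop. A hedged usage sketch; build_moco_augmentors() is a hypothetical stand-in for the augmentor list constructed elsewhere in the MOCO example:

augs = build_moco_augmentors()   # hypothetical helper, not in this listing
ds = get_moco_dataflow('/path/to/ILSVRC12', batch_size=256, augmentors=augs)
ds.reset_state()
for batch in ds:                 # batch structure is defined by MoCoMapper
    break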
Example #3
# Same import context as Example #1, plus:
import logging

import numpy as np


def get_imagenet_dataflow(datadir,
                          is_train,
                          batch_size,
                          augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading
    if is_train:
        ds = dataset.ILSVRC12(datadir, "train", shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logging.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, "val", shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = np.flip(im, axis=2)  # reverse the channel axis: BGR -> RGB
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds,
                                parallel,
                                mapf,
                                buffer_size=2000,
                                strict=True)
        # ds = MapData(ds, mapf)  # single-threaded alternative for debugging
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
        # ds = PrefetchData(ds, 1)  # non-ZMQ alternative
    return ds
Example #4
    def get_data(self, name, num_gpu):
        gpu_batch = self.batch_size // num_gpu

        assert name in ['train', 'val', 'test']
        isTrain = name == 'train'

        augmentors = fbresnet_augmentor(isTrain)
        assert isinstance(augmentors, list)

        parallel = min(40,
                       multiprocessing.cpu_count() //
                       2)  # assuming hyperthreading

        if isTrain:
            ds = dataset.ILSVRC12(self.datadir,
                                  name,
                                  shuffle=True,
                                  dir_structure='train')
            ds = AugmentImageComponent(ds, augmentors, copy=False)
            ds = MultiProcessRunnerZMQ(ds, parallel)
            ds = BatchData(ds, gpu_batch, remainder=False)
            # ds = QueueInput(ds)
        else:
            ds = dataset.ILSVRC12Files(self.datadir,
                                       name,
                                       shuffle=False,
                                       dir_structure='train')
            aug = imgaug.AugmentorList(augmentors)

            def mapf(dp):
                fname, cls = dp
                im = cv2.imread(fname, cv2.IMREAD_COLOR)
                im = aug.augment(im)
                return im, cls

            ds = MultiThreadMapData(ds,
                                    parallel,
                                    mapf,
                                    buffer_size=2000,
                                    strict=True)
            ds = BatchData(ds, gpu_batch, remainder=True)
            ds = MultiProcessRunnerZMQ(ds, 1)

            if num_gpu == 1:
                ds = QueueInput(ds)  # wrap the DataFlow into an InputSource
        return ds
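
The division at the top of the method sets the per-tower batch: with a total batch of 512 on 8 GPUs, each BatchData above produces batches of 512 // 8 = 64. A hedged usage sketch; ImageNetData is a hypothetical owner class, since only self.batch_size and self.datadir are referenced above:

loader = ImageNetData(batch_size=512, datadir='/path/to/ILSVRC12')  # hypothetical class
train_ds = loader.get_data('train', num_gpu=8)   # per-GPU batch: 512 // 8 = 64
val_ds = loader.get_data('val', num_gpu=8)       # remainder=True keeps the final short batch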
Example #5
# Import context as in Example #1, plus JoinData from tensorpack.dataflow;
# ilsvrcsemi is a local module from the semi-supervised example, not shown here.
def get_imagenet_dataflow(
        datadir, name, batch_size,
        augmentors, parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, 16)  # effectively hard-codes 16 worker processes
    if isTrain:
        ds1 = ilsvrcsemi.ILSVRC12(datadir, name, shuffle=True, labeled=True)
        ds2 = ilsvrcsemi.ILSVRC12(datadir, name, shuffle=True, labeled=False)
        ds1 = AugmentImageComponent(ds1, augmentors, copy=False)
        ds2 = AugmentImageComponent(ds2, augmentors, copy=False)
        ds = JoinData([ds1, ds2])

        if parallel < 16:
            logger.warn("DataFlow may become the bottleneck when too few processes are used.")
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            # duplicate the pair so val datapoints carry the same four
            # components as the JoinData([labeled, unlabeled]) train stream
            return im, cls, im, cls
        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
Example #6
    parser.add_argument('--data', help='imagenet data dir')
    parser.add_argument('--batch',
                        default=512,
                        type=int,
                        help='total batch size')
    parser.add_argument('--load',
                        required=True,
                        help='file or directory to evaluate')
    parser.add_argument('--top-k', type=int, default=200, help='top-k in KNN')
    parser.add_argument('--v2', action='store_true', help='use mocov2')
    args = parser.parse_args()

    hvd.init()
    local_batch_size = args.batch // hvd.size()

    train_files = dataset.ILSVRC12Files(args.data, 'train', shuffle=True)
    train_files.reset_state()
    all_train_files = list(train_files)
    all_train_files = all_train_files[:len(all_train_files) // args.batch *
                                      args.batch]  # truncate to a multiple of the batch size
    num_train_images = len(all_train_files)
    logger.info(
        f"Creating graph for KNN of {num_train_images} training images ...")
    local_train_files = [(idx, fname, label)
                         for idx, (fname, label) in enumerate(all_train_files)
                         if idx % hvd.size() == hvd.rank()]

    image_input = tf.placeholder(tf.uint8, [None, 224, 224, 3], "image")
    idx_input = tf.placeholder(tf.int64, [None], "image_idx")

    feat_buffer = tf.get_variable("feature_buffer",
Example #7
    parser.add_argument('--out_dir',
                        type=str,
                        default="/home/sherry/datasets/ilsvrc-lmdb111")
    parser.add_argument('--procs', type=int, default=20)

    args = parser.parse_args()

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    if args.dataset == "ILSVRC":

        # Alternative (commented out): store raw JPEG bytes in the LMDB
        # instead of file paths.
        # class BinaryILSVRC12(dataset.ILSVRC12Files):
        #     def __iter__(self):
        #         for fname, label in super(BinaryILSVRC12, self).__iter__():
        #             with open(fname, 'rb') as f:
        #                 bytes = f.read()
        #             bytes = np.asarray(bytearray(bytes), dtype='uint8')
        #             yield [bytes, label]
        # ds = BinaryILSVRC12(args.data_dir, args.split)
        # ds = MultiProcessRunnerZMQ(ds, nr_proc=args.procs)
        # LMDBSerializer.save(ds, os.path.join(args.out_dir, '%s-%s.lmdb' % (args.dataset, args.split)))

        ds = dataset.ILSVRC12Files(args.data_dir, args.split)
    else:
        ds = BinaryFolder(args.data_dir, args.split, IMG_EXTENSIONS)

    output_path = os.path.join(args.out_dir,
                               '{}-{}.lmdb'.format(args.dataset, args.split))
    dump_imdb(ds, output_path, parallel=args.procs)
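
Once dumped, the LMDB can be read back with tensorpack's LMDBSerializer; a minimal sketch with an illustrative path (what each datapoint holds depends on dump_imdb, which is not shown in this listing):

from tensorpack.dataflow import LMDBSerializer, LocallyShuffleData

ds = LMDBSerializer.load('/path/to/ILSVRC-train.lmdb', shuffle=False)  # stream in write order
ds = LocallyShuffleData(ds, 50000)  # approximate shuffle within a 50k-datapoint buffer
ds.reset_state()
for dp in ds:
    break  # dp is whatever dump_imdb serialized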