def main():
    """ create RecordFileFormat files """

    do_it()
    """im2rec = utils.download('https://raw.githubusercontent.com/apache/incubator-mxnet/' +
                        '6843914f642c8343aaa9a09db803b6af6f5d94a2/tools/im2rec.py', 'im2rec.py')"""
    subprocess.check_output([
        sys.executable, 'im2rec.py',
        '../data_set_files/record_format_files/filtered_data_set/val.lst', '.',
        '--no-shuffle', '--pass-through', '--pack-label'
    ])
    subprocess.check_output([
        sys.executable, 'im2rec.py',
        '../data_set_files/record_format_files/filtered_data_set/train.lst',
        '.', '--no-shuffle', '--pass-through', '--pack-label'
    ])
    # val_dataset is loaded as a sanity check; the prints below inspect
    # the train split
    val_dataset = RecordFileDetection(
        '../data_set_files/record_format_files/filtered_data_set/val.rec',
        coord_normalized=True)
    record_dataset = RecordFileDetection(
        '../data_set_files/record_format_files/filtered_data_set/train.rec',
        coord_normalized=True)

    print('length:', len(record_dataset))
    first_img = record_dataset[0][0]
    print('image shape:', first_img.shape)
    print('Label example:')
    print(record_dataset[0][1])
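# The im2rec calls above consume .lst files. As a hedged sketch (not part of
# this project; it follows the GluonCV custom-dataset tutorial layout of a
# 4-field header `A, B, width, height` plus 5-wide labels), one .lst line
# that `--pack-label` can consume could be produced like this:
import numpy as np

def write_line(img_path, im_shape, boxes, ids, idx):
    """Illustrative helper: format one .lst line for im2rec --pack-label."""
    h, w, _ = im_shape
    # columns: class_id, xmin, ymin, xmax, ymax (corners normalized to 0-1)
    labels = np.hstack((ids.reshape(-1, 1), boxes)).astype('float')
    labels[:, (1, 3)] /= float(w)
    labels[:, (2, 4)] /= float(h)
    A = 4  # header length: A, B, width, height
    B = 5  # label width: class, xmin, ymin, xmax, ymax
    str_idx = [str(idx)]
    str_header = [str(x) for x in [A, B, w, h]]
    str_labels = [str(x) for x in labels.ravel()]
    return '\t'.join(str_idx + str_header + str_labels + [img_path]) + '\n'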
Example #2
def get_dataset(train_dir, test_dir, args):
    train_dataset = RecordFileDetection(os.path.join(
        train_dir, 'birds_ssd_sample_train.rec'),
                                        coord_normalized=True)
    val_dataset = RecordFileDetection(os.path.join(test_dir,
                                                   'birds_ssd_sample_val.rec'),
                                      coord_normalized=True)
    val_metric = VOC07MApMetric(iou_thresh=0.5)

    if args.num_samples < 0:
        args.num_samples = len(train_dataset)
    if args.mixup:
        from gluoncv.data import MixupDetection
        train_dataset = MixupDetection(train_dataset)
    return train_dataset, val_dataset, val_metric
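# A minimal usage sketch for get_dataset (the directory paths and argument
# names below are assumptions, not this project's real CLI; the module is
# assumed to import os, RecordFileDetection and VOC07MApMetric as above):
import argparse

def _demo_get_dataset():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num-samples', type=int, default=-1)
    parser.add_argument('--mixup', action='store_true')
    args = parser.parse_args([])  # use defaults: full dataset, no mixup
    return get_dataset('data/train', 'data/test', args)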
def main():
    """ create RecordFileFormat files """
    """im2rec = utils.download('https://raw.githubusercontent.com/apache/incubator-mxnet/' +
                        '6843914f642c8343aaa9a09db803b6af6f5d94a2/tools/im2rec.py', 'im2rec.py')"""
    if TEST:
        # subprocess.check_output([sys.executable, 'im2rec.py', '--list', '--no-shuffle',
        #                         '../data_set_files/record_format_files/data_set_test/test', '../data_set_files/test'])
        subprocess.check_output([
            sys.executable, 'im2rec.py',
            '../data_set_files/record_format_files/data_set_test/test_min.lst',
            '.', '--no-shuffle', '--pass-through', '--pack-label'
        ])
        record_dataset = RecordFileDetection(
            '../data_set_files/record_format_files/data_set_test/test_min.rec',
            coord_normalized=True)
    else:
        do_it()
        subprocess.check_output([
            sys.executable, 'im2rec.py',
            '../data_set_files/record_format_files/data-set_min/val.lst', '.',
            '--no-shuffle', '--pass-through', '--pack-label'
        ])
        subprocess.check_output([
            sys.executable, 'im2rec.py',
            '../data_set_files/record_format_files/data-set_min/train.lst',
            '.', '--no-shuffle', '--pass-through', '--pack-label'
        ])

        # val_dataset is loaded as a sanity check; the print below inspects
        # the train split
        val_dataset = RecordFileDetection(
            '../data_set_files/record_format_files/data-set_min/val.rec',
            coord_normalized=True)
        record_dataset = RecordFileDetection(
            '../data_set_files/record_format_files/data-set_min/train.rec',
            coord_normalized=True)

    print('length:', len(record_dataset))
from gluoncv.data import RecordFileDetection
record_dataset = RecordFileDetection('voc2007.rec', coord_normalized=True)

# we expect the same results as from LstDetection
print('length:', len(record_dataset))
first_img = record_dataset[3][0]
print('image shape:', first_img.shape)
print('Label example:')
print(record_dataset[3][1])

# without coord_normalized, the coordinates stored in the record are returned as-is
record_dataset = RecordFileDetection('voc2007.rec')
img, label = record_dataset[6]
print('imgShape: ' + str(img.shape), 'labelShape: ' + str(label.shape))
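# A hand-rolled sketch of the rescaling that, per the GluonCV docs,
# coord_normalized=True performs internally; `to_pixel_coords` is a
# hypothetical helper, not a GluonCV API:
import numpy as np

def to_pixel_coords(label, width, height):
    label = label.astype('float32')  # copy; columns: xmin, ymin, xmax, ymax, class
    label[:, (0, 2)] *= width        # xmin, xmax
    label[:, (1, 3)] *= height       # ymin, ymax
    return label

h, w = img.shape[0], img.shape[1]
print(to_pixel_coords(label, w, h))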
Example #5
im2rec = utils.download(
    'https://raw.githubusercontent.com/apache/incubator-mxnet/' +
    '6843914f642c8343aaa9a09db803b6af6f5d94a2/tools/im2rec.py', 'im2rec.py')
# In this tutorial we skip generating it in a subprocess and instead download a prepared val.rec
# subprocess.check_output([sys.executable, 'im2rec.py', 'val', '.', '--no-shuffle', '--pass-through', '--pack-label'])
utils.download(
    'https://gist.github.com/zhreshold/599999eab290e951fcfb26cdd59885e2/raw/0d945eeea2a71ba7bd3e39d463f39921acb786d1/val.rec',
    'val.rec')
utils.download(
    'https://gist.github.com/zhreshold/599999eab290e951fcfb26cdd59885e2/raw/0d945eeea2a71ba7bd3e39d463f39921acb786d1/val.idx',
    'val.idx')

##############################################################################
# Now, similarly, we can create a dataset from the binary file we just downloaded with one line of code:
from gluoncv.data import RecordFileDetection
record_dataset = RecordFileDetection('val.rec', coord_normalized=True)

# we expect the same results as from LstDetection
print('length:', len(record_dataset))
first_img = record_dataset[0][0]
print('image shape:', first_img.shape)
print('Label example:')
print(record_dataset[0][1])

##############################################################################
#
# .. _pascal_voc_like:
#
# 2. Derive from PASCAL VOC format
# --------------------------------
# If you have a custom dataset that fully complies with the `Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ object detection format,
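##############################################################################
# (The sentence above is cut off in this excerpt.) As a sketch in the spirit
# of this tutorial, deriving from VOCDetection can be as simple as overriding
# the class list; 'person'/'dog' and the 'VOCtemplate' root are placeholders:
from gluoncv.data import VOCDetection

class VOCLike(VOCDetection):
    CLASSES = ['person', 'dog']

    def __init__(self, root, splits, transform=None, index_map=None,
                 preload_label=True):
        super(VOCLike, self).__init__(root, splits, transform, index_map,
                                      preload_label)

dataset = VOCLike(root='VOCtemplate', splits=((2018, 'train'),))
print('length of dataset:', len(dataset))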
Example #6
import argparse
from gluoncv import utils as gcv_utils
import numpy as np
from gluoncv.data import RecordFileDetection
from matplotlib import pyplot as plt

parser = argparse.ArgumentParser()
parser.add_argument('record_file')
args = parser.parse_args()

CLASS_NAMES = ['rook', 'knight', 'bishop', 'king', 'queen', 'pawn']

# USAGE: python greenseer/synthetic_generator/spotcheck.py ./data/syn-gen-images/chess_train.rec

# Load record file from ".rec" and ".idx" files
record_dataset = RecordFileDetection(args.record_file, coord_normalized=True)
num_records = len(record_dataset)
print('number of records:', num_records)

# Focus on a random record
index = np.random.randint(0, num_records)
random_record = record_dataset[index]
img, label = random_record
print('Type of image', type(img))
print('Type of label', type(label))

# Display bounding boxes together with labels
ax = gcv_utils.viz.plot_bbox(img,
                             label[:, :-1],
                             labels=label[:, -1],
                             class_names=CLASS_NAMES)
plt.show()
Example #7
def pipe_detection_minibatch(
    epoch:int,
    batch_size:int=50,
    channel:str="/opt/ml/input/data/train",
    discard_partial_final:bool=False
):
    """Legacy generator method for batching RecordFileDetectors from SageMaker Pipe Mode stream

    This procedural method was explored before the cleaner approach of overriding dataset classes.
    The generator reads batches of records from a RecordIO stream, and converts each "stream-batch"
    into a GluonCV RecordFileDetection by buffering the records to an indexed local RecordIO file.

    Pros:

    - Doesn't require a length parameter up-front, because it really iterates through the stream
    - Can be used with shuffling transforms (which isn't necessary as SM can shuffle for you)
    - Total GluonCV compatibility, as it instantiates genuine GluonCV RecordFileDetection class

    Cons:

    - Need to set the stream batch size as a multiple of the minibatch size, which is fiddly
    - Stream is read in batches, which can still block processing on I/O
    - Introduces an outer loop in the training script - not consistent with standard patterns

    Example SageMaker input channel configuration:

    ```
    train_channel = sagemaker.session.s3_input(
        f"s3://{BUCKET_NAME}/{DATA_PREFIX}/train.manifest", # SM Ground Truth output manifest
        content_type="application/x-recordio",
        s3_data_type="AugmentedManifestFile",
        record_wrapping="RecordIO",
        attribute_names=["source-ref", "annotations"],  # To guarantee only 2 attributes fed in
        shuffle_config=sagemaker.session.ShuffleConfig(seed=1337)
    )
    ```

    ...SageMaker will produce a RecordIO stream with alternating records of image and annotation.
    """
    ixbatch = -1
    epoch_end = False
    epoch_file = f"{channel}_{epoch}"
    epoch_records = recordio.MXRecordIO(epoch_file, "r")
    with TemporaryDirectory() as tmpdirname:
        batch_records_file = os.path.join(tmpdirname, "data.rec")
        batch_idx_file = os.path.join(tmpdirname, "data.idx")
        while not epoch_end:
            ixbatch += 1
            logger.info(f"Epoch {epoch}, stream-batch {ixbatch}, channel {channel}")

            # TODO: Wish we could use with statements for file contexts, but I think MXNet can't?
            try:
                os.remove(batch_records_file)
                os.remove(batch_idx_file)
            except OSError:
                pass
            try:
                os.mknod(batch_idx_file)
            except OSError:
                pass

            # Stream batch of data in to temporary batch_records file (pair):
            batch_records = recordio.MXIndexedRecordIO(batch_idx_file, batch_records_file, "w")
            image_raw = None
            image_meta = None
            ixdatum = 0
            invalid = False
            while (ixdatum < batch_size):
                # Read from the SageMaker stream:
                raw = epoch_records.read()
                # Determine whether this object is the image or the annotation:
                if (not raw):
                    if (image_meta or image_raw):
                        raise ValueError(
                            f"Bad stream {epoch_file}: Finished with partial record {ixdatum}...\n"
                            f"{'Had' if image_raw else 'Did not have'} image; "
                            f"{'Had' if image_raw else 'Did not have'} annotations."
                        )
                    epoch_end = True
                    break
                elif (raw[0] == b"{"[0]): # Binary in Python is weird...
                    logger.debug(f"Record {ixdatum} got metadata: {raw[:20]}...")
                    if (image_meta):
                        raise ValueError(
                            f"Bad stream {epoch_file}: Already got annotations for record {ixdatum}...\n"
                            f"Existing: {image_meta}\n"
                            f"New: {raw}"
                        )
                    else:
                        image_meta = json.loads(raw)
                else:
                    logger.debug(f"Record {ixdatum} got image: {raw[:20]}...")
                    if (image_raw):
                        raise ValueError(
                            f"Bad stream {epoch_file}: Missing annotations for record {ixdatum}...\n"
                        )
                    else:
                        image_raw = raw
                        # Since a stream-batch becomes an iterable GluonCV dataset, to which
                        # downstream transformations are applied in bulk, it's best to weed out any
                        # corrupted files here if possible rather than risk a whole mini-batch or
                        # stream-batch getting discarded:
                        try:
                            img = image.imdecode(bytearray(raw))
                            logger.debug(f"Loaded image shape {img.shape}")
                        except ValueError as e:
                            logger.exception("Failed to load image data - skipping...")
                            invalid = True
                        # TODO: Since we already parse images, try to buffer the tensors not JPG

                # If both image and annotation are collected, we're ready to pack for GluonCV:
                if (image_raw is not None and len(image_raw) and image_meta):
                    if invalid:
                        image_raw = None
                        image_meta = None
                        invalid = False
                        continue

                    if (image_meta.get("image_size")):
                        image_width = image_meta["image_size"][0]["width"]
                        image_height = image_meta["image_size"][0]["height"]
                        boxes = [[
                            ann["class_id"],
                            ann["left"] / image_width,
                            ann["top"] / image_height,
                            (ann["left"] + ann["width"]) / image_width,
                            (ann["top"] + ann["height"]) / image_height
                        ] for ann in image_meta["annotations"]]
                    else:
                        logger.debug(
                            "Writing non-normalized bounding box (no image_size in manifest)"
                        )
                        boxes = [[
                            ann["class_id"],
                            ann["left"],
                            ann["top"],
                            ann["left"] + ann["width"],
                            ann["top"] + ann["height"]
                        ] for ann in image_meta["annotations"]]

                    boxes_flat = [ val for box in boxes for val in box ]
                    header_data = [2, 5] + boxes_flat
                    logger.debug(f"Annotation header data {header_data}")
                    header = recordio.IRHeader(
                        0,            # flag (recordio.pack recomputes it for list labels)
                        header_data,  # label: [header_len, label_width, *flattened boxes]
                        ixdatum,      # record id
                        0             # id2 (unused here)
                    )
                    batch_records.write_idx(ixdatum, recordio.pack(header, image_raw))
                    image_raw = None
                    image_meta = None
                    ixdatum += 1

            # Close the write stream (we'll re-open the file-pair to read):
            batch_records.close()

            if ixdatum == 0:
                logger.debug("Reached end of stream with no valid records - discarding")
                break

            if (epoch_end and discard_partial_final):
                logger.debug("Discarding final partial batch")
                break # (Don't yield the part-completed batch)

            dataset = RecordFileDetection(batch_records_file)
            logger.debug(f"Stream batch ready with {len(dataset)} records")
            if not len(dataset):
                raise ValueError(
                    "Why is the dataset empty after loading as RecordFileDetection!?!?"
                )
            yield dataset
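# A hedged consumption sketch: each yielded stream-batch is a genuine
# RecordFileDetection, so per-batch transforms and DataLoaders apply as
# usual (the epoch count here is an arbitrary assumption):
def demo_consume():
    for epoch in range(2):
        for stream_batch in pipe_detection_minibatch(epoch, batch_size=50):
            for img, label in stream_batch:
                pass  # e.g. apply a train transform and feed a detection net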