Example #1
0
 def _info(self):
     """Describe the dataset schema for the active configuration.

     The ``"full_numbers"`` configuration pairs each image with a sequence of
     digits (a 4-value ``int32`` bounding box and a 10-class label each); any
     other configuration pairs each image with a single 10-class label. An
     image-classification task template is attached only for the
     ``"cropped_digits"`` configuration.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     config_name = self.config.name

     if config_name == "full_numbers":
         per_digit = {
             "bbox": datasets.Sequence(datasets.Value("int32"), length=4),
             "label": datasets.ClassLabel(num_classes=10),
         }
         schema = {
             "image": datasets.Image(),
             "digits": datasets.Sequence(per_digit),
         }
     else:
         schema = {
             "image": datasets.Image(),
             "label": datasets.ClassLabel(num_classes=10),
         }
     features = datasets.Features(schema)

     # Only the cropped-digit configuration has a flat "label" column to classify on.
     templates = None
     if config_name == "cropped_digits":
         templates = [
             ImageClassification(image_column="image", label_column="label")
         ]

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
         task_templates=templates,
     )
 def _info(self):
     """Describe an image dataset annotated with per-object records.

     Every example carries an id, the image, its width and height, and a
     sequence of objects (id, area, 4-value ``float32`` bounding box and a
     category drawn from ``_CATEGORIES``).

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     object_schema = {
         "id": datasets.Value("int64"),
         "area": datasets.Value("int64"),
         "bbox": datasets.Sequence(datasets.Value("float32"), length=4),
         "category": datasets.ClassLabel(names=_CATEGORIES),
     }
     example_schema = {
         "image_id": datasets.Value("int64"),
         "image": datasets.Image(),
         "width": datasets.Value("int32"),
         "height": datasets.Value("int32"),
         "objects": datasets.Sequence(object_schema),
     }
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(example_schema),
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
 def _info(self):
     """Describe a question dataset with an image and a list of answers.

     Each example holds string metadata (``name``, ``year``, ``category``),
     the question id and text, the ``ra`` integer field, the image, and a
     list of answers that each have an id and a text.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     answer_schema = {
         "aid": datasets.Value("int32"),
         "atext": datasets.Value("string"),
     }
     schema = {
         "name": datasets.Value("string"),
         "year": datasets.Value("string"),
         "category": datasets.Value("string"),
         "qid": datasets.Value("int32"),
         "qtext": datasets.Value("string"),
         "ra": datasets.Value("int32"),
         "image": datasets.Image(),
         # A one-element list declares "list of answer records".
         "answers": [answer_schema],
     }
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(schema),
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
 def _info(self):
     """Describe an image dataset annotated with per-face attributes.

     Each example is an image plus a sequence of faces; every face has a
     4-value ``float32`` bounding box, five categorical attributes and a
     boolean ``invalid`` flag.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     face_schema = {
         "bbox": datasets.Sequence(datasets.Value("float32"), length=4),
         "blur": datasets.ClassLabel(names=["clear", "normal", "heavy"]),
         "expression": datasets.ClassLabel(names=["typical", "exaggerate"]),
         # NOTE(review): "exaggerate " carries a trailing space in the label
         # name; kept byte-for-byte here — confirm whether it is intentional.
         "illumination": datasets.ClassLabel(names=["normal", "exaggerate "]),
         "occlusion": datasets.ClassLabel(names=["no", "partial", "heavy"]),
         "pose": datasets.ClassLabel(names=["typical", "atypical"]),
         "invalid": datasets.Value("bool"),
     }
     features = datasets.Features(
         {
             "image": datasets.Image(),
             "faces": datasets.Sequence(face_schema),
         }
     )
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Example #5
0
 def features(self):
     """Assemble the feature schema for this configuration.

     The schema is built in order from an optional ``"image"`` column
     (present only when ``self.with_image`` is truthy), the shared
     ``_BASE_IMAGE_METADATA_FEATURES`` and ``self.annotations_features``.
     Later sources overwrite earlier ones on key collisions.

     Returns:
         datasets.Features: the combined schema.
     """
     schema = {}
     if self.with_image:
         schema["image"] = datasets.Image()
     schema.update(_BASE_IMAGE_METADATA_FEATURES)
     schema.update(self.annotations_features)
     return datasets.Features(schema)
Example #6
0
    def _get_feature_types(self):
        """Build the plain feature dictionary for the current configuration.

        The result always contains ``image_id`` and ``image``. The ``label``
        column uses ``config.attributes`` as class names when
        ``config.attributes_dataset`` is set; otherwise it uses
        ``config.selected_classes`` and a string ``context`` column is added.
        When ``config.with_image_metadata`` is set, image-level metadata and
        a nested ``objects`` sequence are appended.

        Returns:
            dict: mapping of column name to ``datasets`` feature type.
        """
        config = self.config

        features = {
            "image_id": datasets.Value("string"),
            "image": datasets.Image(),
        }

        if config.attributes_dataset:
            features["label"] = datasets.ClassLabel(names=config.attributes)
        else:
            features["label"] = datasets.ClassLabel(
                names=config.selected_classes)
            features["context"] = datasets.Value("string")

        if not config.with_image_metadata:
            return features

        relation_schema = {
            "name": datasets.Value("string"),
            "object": datasets.Value("string"),
        }
        object_schema = {
            "object_id": datasets.Value("string"),
            "name": datasets.Value("string"),
            "x": datasets.Value("int64"),
            "y": datasets.Value("int64"),
            "w": datasets.Value("int64"),
            "h": datasets.Value("int64"),
            "attributes": datasets.Sequence(datasets.Value("string")),
            "relations": datasets.Sequence(relation_schema),
        }
        features["width"] = datasets.Value("int64")
        features["height"] = datasets.Value("int64")
        features["location"] = datasets.Value("string")
        features["weather"] = datasets.Value("string")
        features["objects"] = datasets.Sequence(object_schema)

        return features
 def _info(self):
     """Describe an image/annotation-image dataset.

     Both configurations expose ``image`` and ``annotation`` image columns;
     the ``"scene_parsing"`` configuration additionally exposes a
     ``scene_category`` label drawn from ``_SCENE_CATEGORIES``.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     schema = {
         "image": datasets.Image(),
         "annotation": datasets.Image(),
     }
     if self.config.name == "scene_parsing":
         schema["scene_category"] = datasets.ClassLabel(
             names=_SCENE_CATEGORIES)

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(schema),
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Example #8
0
 def _info(self):
     """Describe an image-classification dataset with ``img``/``label`` columns.

     Description, citation, homepage and license are left empty. Class names
     come from ``_NAMES``.

     Returns:
         ds.DatasetInfo: metadata and feature schema for the builder.
     """
     return ds.DatasetInfo(
         description="",
         citation="",
         homepage="",
         license="",
         features=ds.Features({
             "img": ds.Image(),
             "label": ds.features.ClassLabel(names=_NAMES),
         }),
         supervised_keys=("img", "label"),
         # `task_templates` is a list-valued field; pass the single template
         # wrapped in a list instead of as a bare object.
         task_templates=[
             ImageClassification(image_column="img", label_column="label")
         ],
     )
 def _info(self):
     """Describe an image-classification dataset that keeps the file path.

     Columns are the source ``image_file_path``, the decoded ``image`` and a
     ``labels`` class label whose names come from ``_NAMES``.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     schema = {
         "image_file_path": datasets.Value("string"),
         "image": datasets.Image(),
         "labels": datasets.features.ClassLabel(names=_NAMES),
     }
     features = datasets.Features(schema)
     classification = ImageClassification(
         image_column="image", label_column="labels")
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=("image", "labels"),
         homepage=_HOMEPAGE,
         citation=_CITATION,
         task_templates=[classification],
     )
Example #10
0
 def _info(self):
     """Describe an image-classification dataset with ``img``/``label`` columns.

     Class names come from ``_NAMES``.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features({
             "img":
             datasets.Image(),
             "label":
             datasets.features.ClassLabel(names=_NAMES),
         }),
         supervised_keys=("img", "label"),
         homepage="https://www.cs.toronto.edu/~kriz/cifar.html",
         citation=_CITATION,
         # `task_templates` is a list-valued field; pass the single template
         # wrapped in a list instead of as a bare object.
         task_templates=[
             ImageClassification(image_column="img", label_column="label")
         ],
     )
Example #11
0
 def _info(self):
     """Describe an image-classification dataset labelled from ``IMAGENET2012_CLASSES``.

     Label names are the values of ``IMAGENET2012_CLASSES``, in mapping
     order. The mapping is asserted to contain exactly 1000 entries.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     # Sanity check on the class table before building the label feature.
     assert len(IMAGENET2012_CLASSES) == 1000

     class_names = list(IMAGENET2012_CLASSES.values())
     features = datasets.Features(
         {
             "image": datasets.Image(),
             "label": datasets.ClassLabel(names=class_names),
         }
     )
     classification = ImageClassification(
         image_column="image", label_column="label")
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         homepage=_HOMEPAGE,
         citation=_CITATION,
         task_templates=[classification],
     )
Example #12
0
 def _info(self):
     """Describe a sequence dataset with RGB/depth frames and head-pose labels.

     Each example holds a sequence number and subject id, a sequence of RGB
     images, a sequence of depth entries stored as strings, one calibration
     record per camera, a sequence of ground-truth head poses (3-value
     centre and 3x3 rotation) and a ``head_template`` string.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """

     def _calibration():
         # Fresh schema per camera: 3x3 intrinsics, plus extrinsics made of a
         # 3x3 rotation and a 3-value translation.
         return {
             "intrisic_mat": datasets.Array2D(shape=(3, 3), dtype="float64"),
             "extrinsic_mat": {
                 "rotation":
                 datasets.Array2D(shape=(3, 3), dtype="float64"),
                 "translation":
                 datasets.Sequence(datasets.Value("float64"), length=3),
             },
         }

     pose_schema = {
         "center": datasets.Sequence(datasets.Value("float64"), length=3),
         "rotation": datasets.Array2D(shape=(3, 3), dtype="float64"),
     }
     schema = {
         "sequence_number": datasets.Value("string"),
         "subject_id": datasets.Value("string"),
         "rgb": datasets.Sequence(datasets.Image()),
         "rgb_cal": _calibration(),
         "depth": datasets.Sequence(datasets.Value("string")),
         "depth_cal": _calibration(),
         "head_pose_gt": datasets.Sequence(pose_schema),
         "head_template": datasets.Value("string"),
     }
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(schema),
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Example #13
0
 def _info(self):
     """Describe an image-classification dataset with ``image``/``label`` columns.

     Class names come from ``_NAMES``; the same two columns are declared as
     the supervised keys and as the image-classification task columns.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     schema = {
         "image": datasets.Image(),
         "label": datasets.ClassLabel(names=_NAMES),
     }
     features = datasets.Features(schema)
     classification = ImageClassification(
         image_column="image", label_column="label")
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=("image", "label"),
         homepage=_HOMEPAGE,
         citation=_CITATION,
         license=_LICENSE,
         task_templates=[classification],
     )
Example #14
0
 def _info(self):
     """Describe an image dataset with fine and coarse class labels.

     The image-classification task template targets ``fine_label`` only.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     fine = datasets.features.ClassLabel(names=_FINE_LABEL_NAMES)
     coarse = datasets.features.ClassLabel(names=_COARSE_LABEL_NAMES)
     features = datasets.Features(
         {
             "img": datasets.Image(),
             "fine_label": fine,
             "coarse_label": coarse,
         }
     )
     classification = ImageClassification(
         image_column="img",
         label_column="fine_label",
         labels=_FINE_LABEL_NAMES,
     )
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=None,  # Probably needs to be fixed.
         homepage="https://www.cs.toronto.edu/~kriz/cifar.html",
         citation=_CITATION,
         task_templates=[classification],
     )
Example #15
0
 def _info(self):
     """Describe an image dataset with creator, hash, GPS and timestamp columns.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     schema = {
         "image": datasets.Image(),
         "creator_username": datasets.Value("string"),
         "hash": datasets.Value("string"),
         "gps_latitude": datasets.Value("float32"),
         "gps_longitude": datasets.Value("float32"),
         # Microsecond-resolution timestamp.
         "date_taken": datasets.Value("timestamp[us]"),
     }
     features = datasets.Features(schema)
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
     )
Example #16
0
 def _info(self):
     """Describe a digit image-classification dataset.

     The ``label`` column has ten classes named ``"0"`` through ``"9"``.

     Returns:
         datasets.DatasetInfo: metadata and feature schema for the builder.
     """
     digit_names = [str(digit) for digit in range(10)]
     features = datasets.Features(
         {
             "image": datasets.Image(),
             "label": datasets.features.ClassLabel(names=digit_names),
         }
     )
     classification = ImageClassification(
         image_column="image", label_column="label")
     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=features,
         supervised_keys=("image", "label"),
         homepage="http://yann.lecun.com/exdb/mnist/",
         citation=_CITATION,
         task_templates=[classification],
     )
Example #17
0
    def _info(self):
        """Describe a visual question-answering dataset.

        Each example holds the question (id, text, tokens), the image with
        its id, size and two URL strings, the list of answers, the list of
        image classes and the name of the set it belongs to.

        Returns:
            datasets.DatasetInfo: metadata and feature schema for the builder.
        """

        def _text():
            return datasets.Value("string")

        def _text_list():
            return datasets.Sequence(datasets.Value("string"))

        schema = {
            "image_id": _text(),
            "question_id": datasets.Value("int32"),
            "question": _text(),
            "question_tokens": _text_list(),
            "image": datasets.Image(),
            "image_width": datasets.Value("int32"),
            "image_height": datasets.Value("int32"),
            "flickr_original_url": _text(),
            "flickr_300k_url": _text(),
            "answers": _text_list(),
            "image_classes": _text_list(),
            "set_name": _text(),
        }

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(schema),
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )
Example #18
0
    def _split_generators(self, dl_manager):
        """Download the configured data files and build one generator per split.

        Besides downloading, this method optionally makes an early pass over
        the files to (a) infer class labels from the parent directory name of
        every image and (b) collect metadata files named
        ``self.METADATA_FILENAME``. The result of that pass is used to set
        ``self.info.features`` (and the image-classification task template)
        when the configuration does not provide features explicitly.

        Args:
            dl_manager: download manager; this method uses its ``download``,
                ``download_and_extract`` and ``iter_files`` methods.

        Returns:
            list: one ``datasets.SplitGenerator`` per entry of
            ``self.config.data_files``.

        Raises:
            ValueError: if no data files are configured, if metadata files
                disagree on their features, if a metadata file lacks a string
                ``file_name`` key, or if a metadata key collides with an
                existing feature name.
        """
        if not self.config.data_files:
            raise ValueError(
                f"At least one data file must be specified, but got data_files={self.config.data_files}"
            )

        # Do an early pass if:
        # * `features` are not specified, to infer the class labels
        # * `drop_metadata` is False, to find the metadata files
        do_analyze = (
            self.config.features is None
            and not self.config.drop_labels) or not self.config.drop_metadata
        if do_analyze:
            labels = set()
            metadata_files = collections.defaultdict(list)

            def analyze(files_or_archives, downloaded_files_or_dirs, split):
                """Record labels and metadata files found in one batch of downloads."""
                if len(downloaded_files_or_dirs) == 0:
                    return
                # The files are separated from the archives at this point, so check the first sample
                # to see if it's a file or a directory and iterate accordingly
                if os.path.isfile(downloaded_files_or_dirs[0]):
                    original_files, downloaded_files = files_or_archives, downloaded_files_or_dirs
                    for original_file, downloaded_file in zip(
                            original_files, downloaded_files):
                        original_file, downloaded_file = str(
                            original_file), str(downloaded_file)
                        _, original_file_ext = os.path.splitext(original_file)
                        if original_file_ext.lower() in self.IMAGE_EXTENSIONS:
                            # The label is the name of the directory holding the image.
                            labels.add(
                                os.path.basename(
                                    os.path.dirname(original_file)))
                        elif os.path.basename(
                                original_file) == self.METADATA_FILENAME:
                            metadata_files[split].append(
                                (original_file, downloaded_file))
                        else:
                            original_file_name = os.path.basename(
                                original_file)
                            logger.debug(
                                f"The file '{original_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either."
                            )
                else:
                    archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs
                    for archive, downloaded_dir in zip(archives,
                                                       downloaded_dirs):
                        archive, downloaded_dir = str(archive), str(
                            downloaded_dir)
                        for downloaded_dir_file in dl_manager.iter_files(
                                downloaded_dir):
                            _, downloaded_dir_file_ext = os.path.splitext(
                                downloaded_dir_file)
                            # Compare case-insensitively, exactly like the plain-file
                            # branch above, so e.g. ".JPG" inside an archive is
                            # still treated as an image when inferring labels.
                            if downloaded_dir_file_ext.lower(
                            ) in self.IMAGE_EXTENSIONS:
                                labels.add(
                                    os.path.basename(
                                        os.path.dirname(downloaded_dir_file)))
                            elif os.path.basename(downloaded_dir_file
                                                  ) == self.METADATA_FILENAME:
                                # Files inside archives have no original path.
                                metadata_files[split].append(
                                    (None, downloaded_dir_file))
                            else:
                                archive_file_name = os.path.basename(archive)
                                original_file_name = os.path.basename(
                                    downloaded_dir_file)
                                logger.debug(
                                    f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either."
                                )

            if not self.config.drop_labels:
                logger.info("Inferring labels from data files...")
            if not self.config.drop_metadata:
                logger.info("Analyzing metadata files...")

        data_files = self.config.data_files
        splits = []
        for split_name, files in data_files.items():
            if isinstance(files, str):
                files = [files]
            files, archives = self._split_files_and_archives(files)
            downloaded_files = dl_manager.download(files)
            downloaded_dirs = dl_manager.download_and_extract(archives)
            if do_analyze:
                analyze(files, downloaded_files, split_name)
                analyze(archives, downloaded_dirs, split_name)
            splits.append(
                datasets.SplitGenerator(
                    name=split_name,
                    gen_kwargs={
                        "files": [(file, downloaded_file)
                                  for file, downloaded_file in zip(
                                      files, downloaded_files)] +
                        [(None, dl_manager.iter_files(downloaded_dir))
                         for downloaded_dir in downloaded_dirs],
                        # `metadata_files` only exists when the early pass ran;
                        # the early pass always runs when `drop_metadata` is
                        # False, so the name is never read while undefined.
                        "metadata_files":
                        metadata_files
                        if not self.config.drop_metadata else None,
                        "split_name":
                        split_name,
                    },
                ))

        if not self.config.drop_metadata and metadata_files:
            # Verify that:
            # * all metadata files have the same set of features
            # * the `file_name` key is one of the metadata keys and is of type string
            features_per_metadata_file: List[Tuple[str,
                                                   datasets.Features]] = []
            for _, downloaded_metadata_file in itertools.chain.from_iterable(
                    metadata_files.values()):
                with open(downloaded_metadata_file, "rb") as f:
                    pa_metadata_table = paj.read_json(f)
                features_per_metadata_file.append(
                    (downloaded_metadata_file,
                     datasets.Features.from_arrow_schema(
                         pa_metadata_table.schema)))
            for downloaded_metadata_file, metadata_features in features_per_metadata_file:
                if metadata_features != features_per_metadata_file[0][1]:
                    # Report the reference *features* (index 1), not the whole
                    # (file, features) tuple.
                    raise ValueError(
                        f"Metadata files {downloaded_metadata_file} and {features_per_metadata_file[0][0]} have different features: {features_per_metadata_file[0][1]} != {metadata_features}"
                    )
            metadata_features = features_per_metadata_file[0][1]
            if "file_name" not in metadata_features:
                raise ValueError(
                    "`file_name` must be present as dictionary key in metadata files"
                )
            if metadata_features["file_name"] != datasets.Value("string"):
                raise ValueError("`file_name` key must be a string")
            # `file_name` only links a metadata row to its image; it is not
            # merged into the dataset features below.
            del metadata_features["file_name"]
        else:
            metadata_features = None

        # Normally, we would do this in _info, but we need to know the labels and/or metadata
        # before building the features
        if self.config.features is None:
            if not self.config.drop_labels and not metadata_files:
                self.info.features = datasets.Features({
                    "image":
                    datasets.Image(),
                    "label":
                    datasets.ClassLabel(names=sorted(labels))
                })
                task_template = ImageClassification(image_column="image",
                                                    label_column="label")
                task_template = task_template.align_with_features(
                    self.info.features)
                self.info.task_templates = [task_template]
            else:
                self.info.features = datasets.Features(
                    {"image": datasets.Image()})

            if not self.config.drop_metadata and metadata_files:
                # Verify that there are no duplicated keys when compared to the existing features ("image", optionally "label")
                duplicated_keys = set(
                    self.info.features) & set(metadata_features)
                if duplicated_keys:
                    raise ValueError(
                        f"Metadata feature keys {list(duplicated_keys)} are already present as the image features"
                    )
                self.info.features.update(metadata_features)

        return splits
Example #19
0
def release2dataset(release, download_images=True):
    """Convert a release into a flattened Hugging Face ``datasets.Dataset``.

    The release JSON is fetched from ``release['attributes']['url']``. Every
    sample becomes one row holding its name, uuid, label status, image URL
    or text, and label annotations. For image task types the images (and,
    for segmentation task types, the segmentation bitmaps) are optionally
    downloaded and stored as image columns.

    The returned dataset also gets two extra attributes: ``id2label`` (the
    category id to name mapping, with id 0 set to ``"unlabeled"``) and
    ``readme`` (a dataset card rendered from
    ``data/dataset_card_template.md`` next to this file).

    Args:
        release: mapping that provides ``release['attributes']['url']``.
        download_images: when true (default), replace the image URL columns
            with downloaded images for image task types.

    Returns:
        datasets.Dataset: the flattened dataset with a ``train`` split.

    Raises:
        AssertionError: if the release's task type is not supported.
    """
    content = requests.get(release['attributes']['url'])
    release_dict = json.loads(content.content)

    task_type = release_dict['dataset']['task_type']

    if task_type in ['vector', 'bboxes', 'keypoint']:
        features = datasets.Features({
            'name': datasets.Value('string'),
            'uuid': datasets.Value('string'),
            'image': {
                'url': datasets.Value('string')
            },
            'status': datasets.Value('string'),
            'label': {
                'annotations': [{
                    'id': datasets.Value('int32'),
                    'category_id': datasets.Value('int32'),
                    'type': datasets.Value('string'),
                    'points': [[datasets.Value('float32')]],
                }],
            }
        })

    elif task_type in ['segmentation-bitmap', 'segmentation-bitmap-highres']:
        features = datasets.Features({
            'name': datasets.Value('string'),
            'uuid': datasets.Value('string'),
            'image': {
                'url': datasets.Value('string')
            },
            'status': datasets.Value('string'),
            'label': {
                'annotations': [{
                    'id': datasets.Value('int32'),
                    'category_id': datasets.Value('int32'),
                }],
                'segmentation_bitmap': {
                    'url': datasets.Value('string')
                },
            }
        })

    elif task_type in ['text-named-entities', 'text-span-categorization']:
        features = datasets.Features({
            'name': datasets.Value('string'),
            'uuid': datasets.Value('string'),
            'text': datasets.Value('string'),
            'status': datasets.Value('string'),
            'label': {
                'annotations': [{
                    'start': datasets.Value('int32'),
                    'end': datasets.Value('int32'),
                    'category_id': datasets.Value('int32'),
                }],
            }
        })

    else:
        print("This type of dataset is not yet supported.")
        # Raise explicitly instead of `assert False`: asserts are stripped
        # under `python -O`, which would let execution continue with
        # `features` undefined. The exception type is unchanged.
        raise AssertionError("This type of dataset is not yet supported.")

    samples = release_dict['dataset']['samples']

    data_rows = []
    for sample in samples:
        # Best-effort removal of the format version; unlabeled samples
        # simply do not have this nested key.
        try:
            del sample['labels']['ground-truth']['attributes'][
                'format_version']
        except (KeyError, TypeError):
            pass

        data_row = {}

        # Name
        data_row['name'] = sample['name']

        # Uuid
        data_row['uuid'] = sample['uuid']

        # Status
        try:
            data_row['status'] = sample['labels']['ground-truth'][
                'label_status']
        except (KeyError, TypeError):
            data_row['status'] = 'UNLABELED'

        # Image or text
        if task_type in [
                'vector', 'bboxes', 'keypoint', 'segmentation-bitmap',
                'segmentation-bitmap-highres'
        ]:
            try:
                data_row['image'] = sample['attributes']['image']
            except KeyError:
                data_row['image'] = {'url': None}
        elif task_type in ['text-named-entities', 'text-span-categorization']:
            try:
                data_row['text'] = sample['attributes']['text']
            except KeyError:
                data_row['text'] = None

        # Label
        try:
            label = sample['labels']['ground-truth']['attributes']

            # Remove the image-level attributes
            if 'attributes' in label:
                del label['attributes']

            # Remove the object-level attributes
            for annotation in label['annotations']:
                if 'attributes' in annotation:
                    del annotation['attributes']

            data_row['label'] = label

        except (KeyError, TypeError):
            # No usable label: fall back to an empty one matching the schema.
            label = {'annotations': []}
            if task_type in [
                    'segmentation-bitmap', 'segmentation-bitmap-highres'
            ]:
                label['segmentation_bitmap'] = {'url': None}
            data_row['label'] = label

        data_rows.append(data_row)

    # Now transform to column format
    dataset_dict = {key: [] for key in features.keys()}
    for data_row in data_rows:
        for key in dataset_dict.keys():
            dataset_dict[key].append(data_row[key])

    # Create the HF Dataset and flatten it
    dataset = datasets.Dataset.from_dict(dataset_dict, features, split='train')
    dataset = dataset.flatten()

    # Optionally download the images
    if task_type in [
            'vector', 'bboxes', 'keypoint', 'segmentation-bitmap',
            'segmentation-bitmap-highres'
    ] and download_images:

        def download_image(data_row):
            """Replace the flattened image URL with the downloaded image (or None)."""
            try:
                data_row['image'] = load_image_from_url(data_row['image.url'])
            except Exception:
                # Deliberately best-effort, but do not swallow
                # KeyboardInterrupt/SystemExit during long downloads.
                data_row['image'] = None
            return data_row

        def download_segmentation_bitmap(data_row):
            """Replace the flattened bitmap URL with the downloaded bitmap image."""
            try:
                segmentation_bitmap = load_label_bitmap_from_url(
                    data_row['label.segmentation_bitmap.url'])
                data_row['label.segmentation_bitmap'] = Image.fromarray(
                    segmentation_bitmap)
            except Exception:
                # Same best-effort policy as download_image.
                data_row['label.segmentation_bitmap'] = Image.new(
                    'RGB', (1, 1))  # TODO: replace with None
            return data_row

        dataset = dataset.map(download_image, remove_columns=['image.url'])
        if task_type in ['segmentation-bitmap', 'segmentation-bitmap-highres']:
            dataset = dataset.map(
                download_segmentation_bitmap,
                remove_columns=['label.segmentation_bitmap.url'])
            # Reorder the features
            features = datasets.Features({
                'name':
                dataset.features['name'],
                'uuid':
                dataset.features['uuid'],
                'status':
                dataset.features['status'],
                'image':
                datasets.Image(),
                'label.annotations':
                dataset.features['label.annotations'],
                'label.segmentation_bitmap':
                datasets.Image()
            })
            dataset.info.features = features
        else:
            # Reorder the features
            features = datasets.Features({
                'name':
                dataset.features['name'],
                'uuid':
                dataset.features['uuid'],
                'status':
                dataset.features['status'],
                'image':
                datasets.Image(),
                'label.annotations':
                dataset.features['label.annotations'],
            })
            dataset.info.features = features

    # Create id2label
    id2label = {}
    for category in release_dict['dataset']['task_attributes']['categories']:
        id2label[category['id']] = category['name']
    # Id 0 always maps to "unlabeled", overriding any category with id 0.
    id2label[0] = "unlabeled"
    dataset.id2label = id2label

    # Create readme.md and update DatasetInfo
    # https://stackoverflow.com/questions/6385686/is-there-a-native-templating-system-for-plain-text-files-in-python

    if task_type in ['segmentation-bitmap', 'segmentation-bitmap-highres']:
        task_category = 'image-segmentation'
    elif task_type in ['vector', 'bboxes']:
        task_category = 'object-detection'
    elif task_type in ['text-named-entities', 'text-span-categorization']:
        task_category = 'named-entity-recognition'
    else:
        task_category = 'other'

    info = {
        'name':
        release_dict['dataset']['name'],
        'segments_url':
        f'https://segments.ai/{release_dict["dataset"]["owner"]}/{release_dict["dataset"]["name"]}',
        'short_description':
        release_dict['dataset']['description'],
        'release':
        release_dict['name'],
        'taxonomy_table':
        get_taxonomy_table(release_dict['dataset']['task_attributes']),
        'task_category':
        task_category
    }

    ## Create readme.md
    with open(
            os.path.join(os.path.dirname(__file__), 'data',
                         'dataset_card_template.md'), 'r') as f:
        template = Template(f.read())
        readme = template.substitute(info)
        dataset.readme = readme

    ## Update DatasetInfo
    dataset.info.description = info['short_description']
    dataset.info.homepage = info['segments_url']

    return dataset
Example #20
0
    def _info(self):
        """Build the ``DatasetInfo`` describing this configuration's schema.

        Every configuration exposes the image filename and the image id. When
        ``self.config.has_panoptic`` is set, the panoptic image, its filename
        and the per-object panoptic annotations are added. Panoptic
        configurations whose name contains "2014", "2015" or "2017"
        additionally get the ``coarse_label`` and ``fine_label`` class-label
        features.

        Returns:
            datasets.DatasetInfo: description, features, homepage and citation
            for this configuration.
        """
        features = datasets.Features({
            "image": {
                "filename": datasets.Value("string")
            },
            "image/id": datasets.Value("int64"),
        })

        if self.config.has_panoptic:
            features.update({
                "panoptic_image":
                datasets.Image(),
                "panoptic_image/filename":
                datasets.Value("string"),
                "panoptic_objects":
                datasets.Sequence({
                    "id": datasets.Value("int64"),
                    "area": datasets.Value("int64"),
                    # NOTE(review): `dtype` is not defined inside this method;
                    # presumably a module-level name -- confirm it exists.
                    "bbox": datasets.Sequence(dtype, length=4),
                    "label": datasets.ClassLabel(num_classes=133),
                    "is_crowd": datasets.Value("bool"),
                }),
            })

            # The 2014, 2015 and 2017 configurations share exactly the same
            # label vocabularies, so the names are spelled out only once.
            labelled_years = ("2014", "2015", "2017")
            if any(year in self.config.name for year in labelled_years):
                coarse_names = [
                    "person",
                    "vehicle",
                    "outdoor",
                    "animal",
                    "accessory",
                    "sports",
                    "kitchen",
                    "food",
                    "electronic",
                    "appliance",
                    "indoor",
                ]
                fine_names = [
                    "person",
                    "bicycle",
                    "car",
                    "motorcycle",
                    "airplane",
                    "bus",
                    "train",
                    "truck",
                    "boat",
                    "traffic light",
                    "fire hydrant",
                    "stop sign",
                    "parking meter",
                    "bench",
                    "bird",
                    "cat",
                    "dog",
                    "horse",
                    "sheep",
                    "cow",
                    "elephant",
                    "bear",
                    "zebra",
                    "giraffe",
                    "backpack",
                    "umbrella",
                    "handbag",
                    "tie",
                    "suitcase",
                    "frisbee",
                    "skis",
                    "snowboard",
                    "sports ball",
                    "kite",
                    "baseball bat",
                    "baseball glove",
                    "skateboard",
                    "surfboard",
                    "tennis racket",
                    "bottle",
                    "wine glass",
                    "cup",
                    "fork",
                    "knife",
                    "spoon",
                    "bowl",
                    "banana",
                    "apple",
                    "sandwich",
                    "orange",
                    "broccoli",
                    "carrot",
                    "hot dog",
                    "pizza",
                    "donut",
                    "cake",
                    "chair",
                    "couch",
                    "potted plant",
                    "bed",
                    "dining table",
                    "toilet",
                    "tv",
                    "laptop",
                    "mouse",
                    "remote",
                    "keyboard",
                    "cell phone",
                    "microwave",
                    "oven",
                    "toaster",
                    "sink",
                    "refrigerator",
                    "book",
                    "clock",
                    "vase",
                    "scissors",
                    "teddy bear",
                    "hair drier",
                    "toothbrush",
                ]
                features.update({
                    "coarse_label":
                    datasets.features.ClassLabel(names=coarse_names),
                    "fine_label":
                    datasets.features.ClassLabel(names=fine_names),
                })

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage="https://cocodataset.org/",
            citation=_CITATION,
        )
Example #21
0
 def _info(self):
     """Return the ``DatasetInfo`` for the selected configuration.

     The schema depends on ``self.config.name``:

     * ``"raw"``: drawing metadata plus ``x``/``y`` float32 and ``t`` int32
       stroke sequences.
     * ``"preprocessed_simplified_drawings"``: the same metadata with
       ``x``/``y`` uint8 stroke sequences.
     * ``"preprocessed_bitmaps"``: an image and its class label; this is
       the only configuration that gets an image-classification task
       template.
     * any other name (sketch_rnn, sketch_rnn_full): the word label and an
       int16 ``(None, 3)`` array.
     """
     config_name = self.config.name
     is_bitmap_config = config_name == "preprocessed_bitmaps"

     if config_name in ("raw", "preprocessed_simplified_drawings"):
         # Both stroke-based configurations share the metadata columns and
         # differ only in how the strokes themselves are stored.
         if config_name == "raw":
             strokes = {
                 "x": datasets.Sequence(datasets.Value("float32")),
                 "y": datasets.Sequence(datasets.Value("float32")),
                 "t": datasets.Sequence(datasets.Value("int32")),
             }
         else:
             strokes = {
                 "x": datasets.Sequence(datasets.Value("uint8")),
                 "y": datasets.Sequence(datasets.Value("uint8")),
             }
         schema = {
             "key_id": datasets.Value("string"),
             "word": datasets.ClassLabel(names=_NAMES),
             "recognized": datasets.Value("bool"),
             "timestamp": datasets.Value("timestamp[us, tz=UTC]"),
             "countrycode": datasets.Value("string"),
             "drawing": datasets.Sequence(strokes),
         }
     elif is_bitmap_config:
         schema = {
             "image": datasets.Image(),
             "label": datasets.ClassLabel(names=_NAMES),
         }
     else:  # sketch_rnn, sketch_rnn_full
         schema = {
             "word": datasets.ClassLabel(names=_NAMES),
             "drawing": datasets.Array2D(shape=(None, 3), dtype="int16"),
         }

     if is_bitmap_config:
         templates = [
             ImageClassification(image_column="image", label_column="label")
         ]
     else:
         templates = None

     return datasets.DatasetInfo(
         description=_DESCRIPTION,
         features=datasets.Features(schema),
         homepage=_HOMEPAGE,
         license=_LICENSE,
         citation=_CITATION,
         task_templates=templates,
     )
Example #22
0
    def _split_generators(self, dl_manager):
        """Download the configured data files and build one generator per split.

        ``self.config.data_files`` may be a single path, a list/tuple of
        paths (all assigned to the train split), or a mapping from split name
        to path(s). Each split's inputs are separated into plain files and
        archives; files are downloaded, archives are downloaded and
        extracted.

        When ``self.config.features`` is None the features are set here
        rather than in ``_info``: with labels enabled, the class names are the
        parent-directory names of every image file found; with
        ``drop_labels`` set, only the ``image`` column is declared.

        Args:
            dl_manager: download manager providing ``download``,
                ``download_and_extract`` and ``iter_files``.

        Returns:
            list: one ``datasets.SplitGenerator`` per split. Its ``files``
            kwarg holds ``(original_file, downloaded_file)`` pairs followed by
            ``(None, dl_manager.iter_files(extracted_dir))`` pairs.

        Raises:
            ValueError: if no data files are configured.
        """
        if not self.config.data_files:
            raise ValueError(
                f"At least one data file must be specified, but got data_files={self.config.data_files}"
            )

        infer_features = self.config.features is None
        capture_labels = infer_features and not self.config.drop_labels
        labels = set()

        def capture_labels_for_split(files_or_archives,
                                     downloaded_files_or_dirs):
            """Add the parent-directory name of every image to ``labels``."""
            if len(downloaded_files_or_dirs) == 0:
                return
            # The files are separated from the archives at this point, so check the first sample
            # to see if it's a file or a directory and iterate accordingly
            if os.path.isfile(downloaded_files_or_dirs[0]):
                for original_file, _ in zip(files_or_archives,
                                            downloaded_files_or_dirs):
                    # The label comes from the original path, not from the
                    # downloaded location.
                    original_file = str(original_file)
                    _, file_ext = os.path.splitext(original_file)
                    if file_ext.lower() in self.IMAGE_EXTENSIONS:
                        labels.add(
                            os.path.basename(os.path.dirname(original_file)))
            else:
                for _, downloaded_dir in zip(files_or_archives,
                                             downloaded_files_or_dirs):
                    for extracted_file in dl_manager.iter_files(
                            downloaded_dir):
                        _, extracted_ext = os.path.splitext(extracted_file)
                        # Lower-case the extension exactly as the plain-file
                        # branch does, so e.g. ".JPG" inside an archive
                        # still contributes its label.
                        if extracted_ext.lower() in self.IMAGE_EXTENSIONS:
                            labels.add(
                                os.path.basename(
                                    os.path.dirname(extracted_file)))

        if capture_labels:
            logger.info("Inferring labels from data files...")

        # Normalize every accepted ``data_files`` shape to (split, files)
        # pairs so that a single loop builds all the split generators.
        data_files = self.config.data_files
        if isinstance(data_files, (str, list, tuple)):
            files_per_split = [(datasets.Split.TRAIN, data_files)]
        else:
            files_per_split = data_files.items()

        splits = []
        for split_name, files in files_per_split:
            if isinstance(files, str):
                files = [files]
            files, archives = self._split_files_and_archives(files)
            downloaded_files = dl_manager.download(files)
            downloaded_dirs = dl_manager.download_and_extract(archives)
            if capture_labels:
                capture_labels_for_split(files, downloaded_files)
                capture_labels_for_split(archives, downloaded_dirs)
            file_entries = list(zip(files, downloaded_files))
            archive_entries = [(None, dl_manager.iter_files(downloaded_dir))
                               for downloaded_dir in downloaded_dirs]
            splits.append(
                datasets.SplitGenerator(
                    name=split_name,
                    gen_kwargs={"files": file_entries + archive_entries},
                ))

        # Normally we would do this in _info, but we need to know the labels before building the features
        if infer_features:
            if self.config.drop_labels:
                # Previously this branch was nested under a condition that
                # already required labels to be kept, so it could never run.
                self.info.features = datasets.Features(
                    {"image": datasets.Image()})
            else:
                self.info.features = datasets.Features({
                    "image":
                    datasets.Image(),
                    "label":
                    datasets.ClassLabel(names=sorted(labels))
                })
                task_template = ImageClassification(image_column="image",
                                                    label_column="label")
                task_template = task_template.align_with_features(
                    self.info.features)
                self.info.task_templates = [task_template]

        return splits