def _info(self):
    """Build the DatasetInfo for the selected configuration.

    The "full_numbers" config exposes per-image digit bounding boxes; every
    other config is a plain single-label classification schema.
    """
    if self.config.name == "full_numbers":
        # Detection-style schema: each image carries a sequence of digit boxes.
        features = datasets.Features({
            "image": datasets.Image(),
            "digits": datasets.Sequence({
                "bbox": datasets.Sequence(datasets.Value("int32"), length=4),
                "label": datasets.ClassLabel(num_classes=10),
            }),
        })
    else:
        # Classification-style schema: one digit label per image.
        features = datasets.Features({
            "image": datasets.Image(),
            "label": datasets.ClassLabel(num_classes=10),
        })
    # Only the cropped-digits config maps onto the image-classification task.
    templates = None
    if self.config.name == "cropped_digits":
        templates = [ImageClassification(image_column="image", label_column="label")]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
        task_templates=templates,
    )
def _info(self):
    """DatasetInfo for an object-detection dataset with COCO-style annotations."""
    # Per-object annotation schema: id, pixel area, xywh box, and class label.
    object_features = datasets.Sequence({
        "id": datasets.Value("int64"),
        "area": datasets.Value("int64"),
        "bbox": datasets.Sequence(datasets.Value("float32"), length=4),
        "category": datasets.ClassLabel(names=_CATEGORIES),
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "image_id": datasets.Value("int64"),
            "image": datasets.Image(),
            "width": datasets.Value("int32"),
            "height": datasets.Value("int32"),
            "objects": object_features,
        }),
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo for an image-based multiple-choice question dataset."""
    # Each example is one question about an image plus its candidate answers.
    features = datasets.Features({
        "name": datasets.Value("string"),
        "year": datasets.Value("string"),
        "category": datasets.Value("string"),
        "qid": datasets.Value("int32"),
        "qtext": datasets.Value("string"),
        "ra": datasets.Value("int32"),
        "image": datasets.Image(),
        # A list-of-dicts feature: every answer has an id and its text.
        "answers": [
            {
                "aid": datasets.Value("int32"),
                "atext": datasets.Value("string"),
            }
        ],
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo for a face-detection dataset with per-face attributes."""
    # Per-face annotation schema.
    # NOTE(review): the illumination class "exaggerate " carries a trailing
    # space. It is a label string, so renaming it would change the schema —
    # kept byte-identical; confirm against the upstream annotation files.
    face_features = datasets.Sequence({
        "bbox": datasets.Sequence(datasets.Value("float32"), length=4),
        "blur": datasets.ClassLabel(names=["clear", "normal", "heavy"]),
        "expression": datasets.ClassLabel(names=["typical", "exaggerate"]),
        "illumination": datasets.ClassLabel(names=["normal", "exaggerate "]),
        "occlusion": datasets.ClassLabel(names=["no", "partial", "heavy"]),
        "pose": datasets.ClassLabel(names=["typical", "atypical"]),
        "invalid": datasets.Value("bool"),
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "image": datasets.Image(),
            "faces": face_features,
        }),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def features(self):
    """Assemble the full feature schema: optional image, base metadata, annotations."""
    feature_dict = {}
    # The raw image column is only present when images are requested.
    if self.with_image:
        feature_dict["image"] = datasets.Image()
    feature_dict.update(_BASE_IMAGE_METADATA_FEATURES)
    feature_dict.update(self.annotations_features)
    return datasets.Features(feature_dict)
def _get_feature_types(self):
    """Return the feature-type dict for the active configuration.

    Always includes the image id and image; the label schema depends on
    whether this is an attributes dataset, and rich image metadata is only
    added when the config asks for it.
    """
    features = {
        "image_id": datasets.Value("string"),
        "image": datasets.Image(),
    }
    if self.config.attributes_dataset:
        # Attribute datasets are labeled with the configured attribute names.
        features["label"] = datasets.ClassLabel(names=self.config.attributes)
    else:
        features["label"] = datasets.ClassLabel(names=self.config.selected_classes)
        features["context"] = datasets.Value("string")
    if self.config.with_image_metadata:
        # Per-object relations: relation name plus the related object's id.
        relation_features = datasets.Sequence({
            "name": datasets.Value("string"),
            "object": datasets.Value("string"),
        })
        object_features = datasets.Sequence({
            "object_id": datasets.Value("string"),
            "name": datasets.Value("string"),
            "x": datasets.Value("int64"),
            "y": datasets.Value("int64"),
            "w": datasets.Value("int64"),
            "h": datasets.Value("int64"),
            "attributes": datasets.Sequence(datasets.Value("string")),
            "relations": relation_features,
        })
        features["width"] = datasets.Value("int64")
        features["height"] = datasets.Value("int64")
        features["location"] = datasets.Value("string")
        features["weather"] = datasets.Value("string")
        features["objects"] = object_features
    return features
def _info(self):
    """DatasetInfo for a segmentation dataset.

    Every config has an image and its annotation mask; the scene-parsing
    config additionally carries a scene-category label.
    """
    feature_dict = {
        "image": datasets.Image(),
        "annotation": datasets.Image(),
    }
    if self.config.name == "scene_parsing":
        feature_dict["scene_category"] = datasets.ClassLabel(names=_SCENE_CATEGORIES)
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features(feature_dict),
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo for this image-classification dataset.

    Fix: ``task_templates`` must be a *list* of task templates, not a bare
    ``ImageClassification`` instance — ``DatasetInfo`` post-processing casts
    it with ``list(...)``, and a single template object is not iterable.
    This also matches the list form used by the other loaders in this file.
    """
    return ds.DatasetInfo(
        description="",
        citation="",
        homepage="",
        license="",
        features=ds.Features({
            "img": ds.Image(),
            "label": ds.features.ClassLabel(names=_NAMES),
        }),
        supervised_keys=("img", "label"),
        task_templates=[
            ImageClassification(image_column="img", label_column="label")
        ],
    )
def _info(self):
    """DatasetInfo for a folder-based image-classification dataset."""
    features = datasets.Features({
        "image_file_path": datasets.Value("string"),
        "image": datasets.Image(),
        "labels": datasets.features.ClassLabel(names=_NAMES),
    })
    # Note the plural "labels" column name — the task template maps it.
    templates = [ImageClassification(image_column="image", label_column="labels")]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=("image", "labels"),
        homepage=_HOMEPAGE,
        citation=_CITATION,
        task_templates=templates,
    )
def _info(self):
    """DatasetInfo for CIFAR-10.

    Fix: ``task_templates`` must be a *list* of task templates, not a bare
    ``ImageClassification`` instance — ``DatasetInfo`` post-processing casts
    it with ``list(...)``, and a single template object is not iterable.
    This also matches the list form used by the other loaders in this file.
    """
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "img": datasets.Image(),
            "label": datasets.features.ClassLabel(names=_NAMES),
        }),
        supervised_keys=("img", "label"),
        homepage="https://www.cs.toronto.edu/~kriz/cifar.html",
        citation=_CITATION,
        task_templates=[
            ImageClassification(image_column="img", label_column="label")
        ],
    )
def _info(self):
    """DatasetInfo for ImageNet-1k (1000 classes).

    Fix: the sanity check on the class count used ``assert``, which is
    silently stripped when Python runs with ``-O``; raise explicitly instead.
    """
    if len(IMAGENET2012_CLASSES) != 1000:
        raise ValueError(
            f"Expected 1000 ImageNet classes, got {len(IMAGENET2012_CLASSES)}"
        )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "image": datasets.Image(),
            # Labels are the human-readable class names, in dict order.
            "label": datasets.ClassLabel(names=list(IMAGENET2012_CLASSES.values())),
        }),
        homepage=_HOMEPAGE,
        citation=_CITATION,
        task_templates=[
            ImageClassification(image_column="image", label_column="label")
        ],
    )
def _info(self):
    """DatasetInfo for an RGB-D head-pose sequence dataset."""

    def _camera_calibration():
        # Calibration block shared by the RGB and depth cameras; built fresh
        # per call so the two columns do not share feature objects.
        # NOTE(review): "intrisic_mat" looks like a typo for "intrinsic_mat",
        # but it is a schema key — renaming it would break existing consumers,
        # so it is kept byte-identical.
        return {
            "intrisic_mat": datasets.Array2D(shape=(3, 3), dtype="float64"),
            "extrinsic_mat": {
                "rotation": datasets.Array2D(shape=(3, 3), dtype="float64"),
                "translation": datasets.Sequence(datasets.Value("float64"), length=3),
            },
        }

    features = datasets.Features({
        "sequence_number": datasets.Value("string"),
        "subject_id": datasets.Value("string"),
        "rgb": datasets.Sequence(datasets.Image()),
        "rgb_cal": _camera_calibration(),
        # Depth frames are stored as file paths (strings), not decoded images.
        "depth": datasets.Sequence(datasets.Value("string")),
        "depth_cal": _camera_calibration(),
        # Ground-truth head pose per frame: 3-D center plus rotation matrix.
        "head_pose_gt": datasets.Sequence({
            "center": datasets.Sequence(datasets.Value("float64"), length=3),
            "rotation": datasets.Array2D(shape=(3, 3), dtype="float64"),
        }),
        "head_template": datasets.Value("string"),
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo for a single-label image-classification dataset."""
    features = datasets.Features({
        "image": datasets.Image(),
        "label": datasets.ClassLabel(names=_NAMES),
    })
    templates = [ImageClassification(image_column="image", label_column="label")]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=("image", "label"),
        homepage=_HOMEPAGE,
        citation=_CITATION,
        license=_LICENSE,
        task_templates=templates,
    )
def _info(self):
    """DatasetInfo for CIFAR-100 (fine + coarse label hierarchy)."""
    features = datasets.Features({
        "img": datasets.Image(),
        # 100 fine-grained classes and their 20 coarse superclasses.
        "fine_label": datasets.features.ClassLabel(names=_FINE_LABEL_NAMES),
        "coarse_label": datasets.features.ClassLabel(names=_COARSE_LABEL_NAMES),
    })
    # The classification task targets the fine-grained labels.
    templates = [
        ImageClassification(
            image_column="img",
            label_column="fine_label",
            labels=_FINE_LABEL_NAMES,
        )
    ]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,  # Probably needs to be fixed.
        homepage="https://www.cs.toronto.edu/~kriz/cifar.html",
        citation=_CITATION,
        task_templates=templates,
    )
def _info(self):
    """DatasetInfo for a geotagged photo dataset (image + capture metadata)."""
    features = datasets.Features({
        "image": datasets.Image(),
        "creator_username": datasets.Value("string"),
        "hash": datasets.Value("string"),
        # GPS coordinates of the capture location.
        "gps_latitude": datasets.Value("float32"),
        "gps_longitude": datasets.Value("float32"),
        "date_taken": datasets.Value("timestamp[us]"),
    })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo for MNIST handwritten digits."""
    # Digit class names "0".."9".
    digit_names = [str(digit) for digit in range(10)]
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "image": datasets.Image(),
            "label": datasets.features.ClassLabel(names=digit_names),
        }),
        supervised_keys=("image", "label"),
        homepage="http://yann.lecun.com/exdb/mnist/",
        citation=_CITATION,
        task_templates=[
            ImageClassification(image_column="image", label_column="label")
        ],
    )
def _info(self):
    """DatasetInfo for a visual question answering dataset."""
    # One row per question: the question (raw and tokenized), the image and
    # its source URLs/dimensions, the answer list, and split bookkeeping.
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=datasets.Features({
            "image_id": datasets.Value("string"),
            "question_id": datasets.Value("int32"),
            "question": datasets.Value("string"),
            "question_tokens": datasets.Sequence(datasets.Value("string")),
            "image": datasets.Image(),
            "image_width": datasets.Value("int32"),
            "image_height": datasets.Value("int32"),
            "flickr_original_url": datasets.Value("string"),
            "flickr_300k_url": datasets.Value("string"),
            "answers": datasets.Sequence(datasets.Value("string")),
            "image_classes": datasets.Sequence(datasets.Value("string")),
            "set_name": datasets.Value("string"),
        }),
        supervised_keys=None,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
    )
def _split_generators(self, dl_manager):
    """Download data files/archives, optionally infer labels and collect
    metadata files, and build one SplitGenerator per configured split.

    Also finalizes ``self.info.features`` here (rather than in ``_info``)
    because the label set and metadata schema are only known after the
    data files have been inspected.
    """
    if not self.config.data_files:
        raise ValueError(
            f"At least one data file must be specified, but got data_files={self.config.data_files}"
        )
    # Do an early pass if:
    # * `features` are not specified, to infer the class labels
    # * `drop_metadata` is False, to find the metadata files
    do_analyze = (
        self.config.features is None and not self.config.drop_labels
    ) or not self.config.drop_metadata
    if do_analyze:
        labels = set()
        metadata_files = collections.defaultdict(list)

        def analyze(files_or_archives, downloaded_files_or_dirs, split):
            # Closure over `labels` / `metadata_files`: classifies each
            # downloaded path as image (folder name -> label), metadata
            # file, or ignored.
            if len(downloaded_files_or_dirs) == 0:
                return
            # The files are separated from the archives at this point, so check the first sample
            # to see if it's a file or a directory and iterate accordingly
            if os.path.isfile(downloaded_files_or_dirs[0]):
                original_files, downloaded_files = files_or_archives, downloaded_files_or_dirs
                for original_file, downloaded_file in zip(original_files, downloaded_files):
                    original_file, downloaded_file = str(original_file), str(downloaded_file)
                    _, original_file_ext = os.path.splitext(original_file)
                    if original_file_ext.lower() in self.IMAGE_EXTENSIONS:
                        # The label is the name of the directory containing the image.
                        labels.add(os.path.basename(os.path.dirname(original_file)))
                    elif os.path.basename(original_file) == self.METADATA_FILENAME:
                        metadata_files[split].append((original_file, downloaded_file))
                    else:
                        original_file_name = os.path.basename(original_file)
                        logger.debug(
                            f"The file '{original_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either."
                        )
            else:
                archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs
                for archive, downloaded_dir in zip(archives, downloaded_dirs):
                    archive, downloaded_dir = str(archive), str(downloaded_dir)
                    for downloaded_dir_file in dl_manager.iter_files(downloaded_dir):
                        _, downloaded_dir_file_ext = os.path.splitext(downloaded_dir_file)
                        # NOTE(review): unlike the file branch above, the
                        # extension is not lower-cased here — uppercase
                        # extensions inside archives would be skipped; confirm.
                        if downloaded_dir_file_ext in self.IMAGE_EXTENSIONS:
                            labels.add(os.path.basename(os.path.dirname(downloaded_dir_file)))
                        elif os.path.basename(downloaded_dir_file) == self.METADATA_FILENAME:
                            # No "original" path exists for files extracted from archives.
                            metadata_files[split].append((None, downloaded_dir_file))
                        else:
                            archive_file_name = os.path.basename(archive)
                            original_file_name = os.path.basename(downloaded_dir_file)
                            logger.debug(
                                f"The file '{original_file_name}' from the archive '{archive_file_name}' was ignored: it is not an image, and is not {self.METADATA_FILENAME} either."
                            )

        if not self.config.drop_labels:
            logger.info("Inferring labels from data files...")
        if not self.config.drop_metadata:
            logger.info("Analyzing metadata files...")
    data_files = self.config.data_files
    splits = []
    for split_name, files in data_files.items():
        if isinstance(files, str):
            files = [files]
        # Plain files are downloaded as-is; archives are also extracted.
        files, archives = self._split_files_and_archives(files)
        downloaded_files = dl_manager.download(files)
        downloaded_dirs = dl_manager.download_and_extract(archives)
        if do_analyze:
            analyze(files, downloaded_files, split_name)
            analyze(archives, downloaded_dirs, split_name)
        splits.append(
            datasets.SplitGenerator(
                name=split_name,
                gen_kwargs={
                    # Pairs of (original path, downloaded path); archive
                    # contents carry None as the original path.
                    "files": [
                        (file, downloaded_file)
                        for file, downloaded_file in zip(files, downloaded_files)
                    ]
                    + [
                        (None, dl_manager.iter_files(downloaded_dir))
                        for downloaded_dir in downloaded_dirs
                    ],
                    "metadata_files": metadata_files if not self.config.drop_metadata else None,
                    "split_name": split_name,
                },
            )
        )
    if not self.config.drop_metadata and metadata_files:
        # Verify that:
        # * all metadata files have the same set of features
        # * the `file_name` key is one of the metadata keys and is of type string
        features_per_metadata_file: List[Tuple[str, datasets.Features]] = []
        for _, downloaded_metadata_file in itertools.chain.from_iterable(metadata_files.values()):
            with open(downloaded_metadata_file, "rb") as f:
                pa_metadata_table = paj.read_json(f)
            features_per_metadata_file.append(
                (downloaded_metadata_file, datasets.Features.from_arrow_schema(pa_metadata_table.schema))
            )
        for downloaded_metadata_file, metadata_features in features_per_metadata_file:
            # Every metadata file must agree with the first one's schema.
            if metadata_features != features_per_metadata_file[0][1]:
                raise ValueError(
                    f"Metadata files {downloaded_metadata_file} and {features_per_metadata_file[0][0]} have different features: {features_per_metadata_file[0]} != {metadata_features}"
                )
        metadata_features = features_per_metadata_file[0][1]
        if "file_name" not in metadata_features:
            raise ValueError("`file_name` must be present as dictionary key in metadata files")
        if metadata_features["file_name"] != datasets.Value("string"):
            raise ValueError("`file_name` key must be a string")
        # `file_name` is only used for joining, not exposed as a column.
        del metadata_features["file_name"]
    else:
        metadata_features = None
    # Normally, we would do this in _info, but we need to know the labels and/or metadata
    # before building the features
    if self.config.features is None:
        if not self.config.drop_labels and not metadata_files:
            self.info.features = datasets.Features({
                "image": datasets.Image(),
                "label": datasets.ClassLabel(names=sorted(labels))
            })
            task_template = ImageClassification(image_column="image", label_column="label")
            task_template = task_template.align_with_features(self.info.features)
            self.info.task_templates = [task_template]
        else:
            self.info.features = datasets.Features({"image": datasets.Image()})
        if not self.config.drop_metadata and metadata_files:
            # Verify that there are no duplicated keys when compared to the existing features ("image", optionally "label")
            duplicated_keys = set(self.info.features) & set(metadata_features)
            if duplicated_keys:
                raise ValueError(
                    f"Metadata feature keys {list(duplicated_keys)} are already present as the image features"
                )
            self.info.features.update(metadata_features)
    return splits
def release2dataset(release, download_images=True):
    """Convert a Segments.ai release into a Hugging Face ``datasets.Dataset``.

    Fetches the release JSON, builds a task-type-specific feature schema,
    flattens the samples into columns, optionally downloads images and
    segmentation bitmaps, attaches an ``id2label`` mapping, and renders a
    dataset card from a template.

    Args:
        release: Segments.ai release object with an ``attributes.url`` field.
        download_images: If True, replace image URLs with decoded images.

    Returns:
        A flattened ``datasets.Dataset`` with ``readme`` and ``id2label``
        attributes attached.
    """
    content = requests.get(release['attributes']['url'])
    release_dict = json.loads(content.content)
    task_type = release_dict['dataset']['task_type']
    # Feature schema depends on the labeling task type.
    if task_type in ['vector', 'bboxes', 'keypoint']:
        features = datasets.Features({
            'name': datasets.Value('string'),
            'uuid': datasets.Value('string'),
            'image': {
                'url': datasets.Value('string')
            },
            'status': datasets.Value('string'),
            'label': {
                'annotations': [{
                    'id': datasets.Value('int32'),
                    'category_id': datasets.Value('int32'),
                    'type': datasets.Value('string'),
                    'points': [[datasets.Value('float32')]],
                }],
            }
        })
    elif task_type in ['segmentation-bitmap', 'segmentation-bitmap-highres']:
        features = datasets.Features({
            'name': datasets.Value('string'),
            'uuid': datasets.Value('string'),
            'image': {
                'url': datasets.Value('string')
            },
            'status': datasets.Value('string'),
            'label': {
                'annotations': [{
                    'id': datasets.Value('int32'),
                    'category_id': datasets.Value('int32'),
                }],
                'segmentation_bitmap': {
                    'url': datasets.Value('string')
                },
            }
        })
    elif task_type in ['text-named-entities', 'text-span-categorization']:
        features = datasets.Features({
            'name': datasets.Value('string'),
            'uuid': datasets.Value('string'),
            'text': datasets.Value('string'),
            'status': datasets.Value('string'),
            'label': {
                'annotations': [{
                    'start': datasets.Value('int32'),
                    'end': datasets.Value('int32'),
                    'category_id': datasets.Value('int32'),
                }],
            }
        })
    else:
        # NOTE(review): `assert False` is stripped under `python -O`;
        # an explicit `raise ValueError(...)` would be safer here.
        print("This type of dataset is not yet supported.")
        assert False
    samples = release_dict['dataset']['samples']
    data_rows = []
    for sample in samples:
        # Drop the schema version marker if present.
        # NOTE(review): bare `except:` also swallows KeyboardInterrupt etc.;
        # `except (KeyError, TypeError):` would be narrower.
        try:
            del sample['labels']['ground-truth']['attributes']['format_version']
        except:
            pass
        data_row = {}
        # Name
        data_row['name'] = sample['name']
        # Uuid
        data_row['uuid'] = sample['uuid']
        # Status
        try:
            data_row['status'] = sample['labels']['ground-truth']['label_status']
        except (KeyError, TypeError):
            data_row['status'] = 'UNLABELED'
        # Image or text
        if task_type in [
                'vector', 'bboxes', 'keypoint', 'segmentation-bitmap',
                'segmentation-bitmap-highres'
        ]:
            try:
                data_row['image'] = sample['attributes']['image']
            except KeyError:
                data_row['image'] = {'url': None}
        elif task_type in ['text-named-entities', 'text-span-categorization']:
            try:
                data_row['text'] = sample['attributes']['text']
            except KeyError:
                data_row['text'] = None
        # Label
        try:
            label = sample['labels']['ground-truth']['attributes']
            # Remove the image-level attributes
            if 'attributes' in label:
                del label['attributes']
            # Remove the object-level attributes
            for annotation in label['annotations']:
                if 'attributes' in annotation:
                    del annotation['attributes']
            data_row['label'] = label
        except (KeyError, TypeError):
            # Unlabeled sample: fall back to an empty annotation list.
            label = {'annotations': []}
            if task_type in [
                    'segmentation-bitmap', 'segmentation-bitmap-highres'
            ]:
                label['segmentation_bitmap'] = {'url': None}
            data_row['label'] = label
        data_rows.append(data_row)
    # print(data_rows)
    # Now transform to column format
    dataset_dict = {key: [] for key in features.keys()}
    for data_row in data_rows:
        for key in dataset_dict.keys():
            dataset_dict[key].append(data_row[key])
    # Create the HF Dataset and flatten it
    dataset = datasets.Dataset.from_dict(dataset_dict, features, split='train')
    dataset = dataset.flatten()
    # Optionally download the images
    if task_type in [
            'vector', 'bboxes', 'keypoint', 'segmentation-bitmap',
            'segmentation-bitmap-highres'
    ] and download_images:

        def download_image(data_row):
            # `map` callback: replace the flattened URL column with the image.
            try:
                data_row['image'] = load_image_from_url(data_row['image.url'])
            except:
                data_row['image'] = None
            return data_row

        def download_segmentation_bitmap(data_row):
            # `map` callback: decode the label bitmap from its URL.
            try:
                segmentation_bitmap = load_label_bitmap_from_url(
                    data_row['label.segmentation_bitmap.url'])
                data_row['label.segmentation_bitmap'] = Image.fromarray(
                    segmentation_bitmap)
            except:
                data_row['label.segmentation_bitmap'] = Image.new(
                    'RGB', (1, 1))  # TODO: replace with None
            return data_row

        dataset = dataset.map(download_image, remove_columns=['image.url'])
        if task_type in ['segmentation-bitmap', 'segmentation-bitmap-highres']:
            dataset = dataset.map(
                download_segmentation_bitmap,
                remove_columns=['label.segmentation_bitmap.url'])
            # Reorder the features
            features = datasets.Features({
                'name': dataset.features['name'],
                'uuid': dataset.features['uuid'],
                'status': dataset.features['status'],
                'image': datasets.Image(),
                'label.annotations': dataset.features['label.annotations'],
                'label.segmentation_bitmap': datasets.Image()
            })
            dataset.info.features = features
        else:
            # Reorder the features
            features = datasets.Features({
                'name': dataset.features['name'],
                'uuid': dataset.features['uuid'],
                'status': dataset.features['status'],
                'image': datasets.Image(),
                'label.annotations': dataset.features['label.annotations'],
            })
            dataset.info.features = features
    # Create id2label
    id2label = {}
    for category in release_dict['dataset']['task_attributes']['categories']:
        id2label[category['id']] = category['name']
    id2label[0] = "unlabeled"
    dataset.id2label = id2label
    # Create readme.md and update DatasetInfo
    # https://stackoverflow.com/questions/6385686/is-there-a-native-templating-system-for-plain-text-files-in-python
    task_type = release_dict['dataset']['task_type']
    if task_type in ['segmentation-bitmap', 'segmentation-bitmap-highres']:
        task_category = 'image-segmentation'
    elif task_type in ['vector', 'bboxes']:
        task_category = 'object-detection'
    elif task_type in ['text-named-entities', 'text-span-categorization']:
        task_category = 'named-entity-recognition'
    else:
        task_category = 'other'
    # Values substituted into the dataset-card template.
    info = {
        'name': release_dict['dataset']['name'],
        'segments_url': f'https://segments.ai/{release_dict["dataset"]["owner"]}/{release_dict["dataset"]["name"]}',
        'short_description': release_dict['dataset']['description'],
        'release': release_dict['name'],
        'taxonomy_table': get_taxonomy_table(release_dict['dataset']['task_attributes']),
        'task_category': task_category
    }
    ## Create readme.md
    with open(
            os.path.join(os.path.dirname(__file__), 'data',
                         'dataset_card_template.md'), 'r') as f:
        template = Template(f.read())
        readme = template.substitute(info)
    dataset.readme = readme
    ## Update DatasetInfo
    dataset.info.description = info['short_description']
    dataset.info.homepage = info['segments_url']
    return dataset
def _info(self):
    """DatasetInfo for COCO-style configurations.

    Fix: the 11-name coarse-label list and the 80-name fine-label list were
    duplicated verbatim in the ``"2017"`` branch and the ``"2014"/"2015"``
    branch; they are now defined once and the two branches are merged, since
    both added identical features.
    """
    features = datasets.Features({
        "image": {
            "filename": datasets.Value("string")
        },
        "image/id": datasets.Value("int64"),
    })
    if self.config.has_panoptic:
        features.update({
            "panoptic_image": datasets.Image(),
            "panoptic_image/filename": datasets.Value("string"),
            "panoptic_objects": datasets.Sequence({
                "id": datasets.Value("int64"),
                "area": datasets.Value("int64"),
                # NOTE(review): `dtype` is not defined inside this function —
                # presumably a module-level name; confirm, otherwise this
                # raises NameError for panoptic configs.
                "bbox": datasets.Sequence(dtype, length=4),
                "label": datasets.ClassLabel(num_classes=133),
                "is_crowd": datasets.Value("bool"),
            }),
        })
    # The 2014/2015 and 2017 releases share the same category taxonomy.
    coarse_label_names = [
        "person", "vehicle", "outdoor", "animal", "accessory", "sports",
        "kitchen", "food", "electronic", "appliance", "indoor",
    ]
    fine_label_names = [
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train",
        "truck", "boat", "traffic light", "fire hydrant", "stop sign",
        "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",
        "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
        "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
        "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
        "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork",
        "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
        "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
        "couch", "potted plant", "bed", "dining table", "toilet", "tv",
        "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
        "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
        "scissors", "teddy bear", "hair drier", "toothbrush",
    ]
    if any(year in self.config.name for year in ("2014", "2015", "2017")):
        features.update({
            "coarse_label": datasets.features.ClassLabel(names=coarse_label_names),
            "fine_label": datasets.features.ClassLabel(names=fine_label_names),
        })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        homepage="https://cocodataset.org/",
        citation=_CITATION,
    )
def _info(self):
    """DatasetInfo for QuickDraw; the schema depends on the configuration."""
    config_name = self.config.name
    templates = None
    if config_name == "preprocessed_bitmaps":
        # Rendered 28x28 bitmaps: the only config usable for image classification.
        features = datasets.Features({
            "image": datasets.Image(),
            "label": datasets.ClassLabel(names=_NAMES),
        })
        templates = [ImageClassification(image_column="image", label_column="label")]
    elif config_name == "raw":
        # Full stroke data, including per-point timestamps.
        features = datasets.Features({
            "key_id": datasets.Value("string"),
            "word": datasets.ClassLabel(names=_NAMES),
            "recognized": datasets.Value("bool"),
            "timestamp": datasets.Value("timestamp[us, tz=UTC]"),
            "countrycode": datasets.Value("string"),
            "drawing": datasets.Sequence({
                "x": datasets.Sequence(datasets.Value("float32")),
                "y": datasets.Sequence(datasets.Value("float32")),
                "t": datasets.Sequence(datasets.Value("int32")),
            }),
        })
    elif config_name == "preprocessed_simplified_drawings":
        # Simplified strokes: coordinates quantized to uint8, no timestamps.
        features = datasets.Features({
            "key_id": datasets.Value("string"),
            "word": datasets.ClassLabel(names=_NAMES),
            "recognized": datasets.Value("bool"),
            "timestamp": datasets.Value("timestamp[us, tz=UTC]"),
            "countrycode": datasets.Value("string"),
            "drawing": datasets.Sequence({
                "x": datasets.Sequence(datasets.Value("uint8")),
                "y": datasets.Sequence(datasets.Value("uint8")),
            }),
        })
    else:  # sketch_rnn, sketch_rnn_full
        features = datasets.Features({
            "word": datasets.ClassLabel(names=_NAMES),
            "drawing": datasets.Array2D(shape=(None, 3), dtype="int16"),
        })
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        homepage=_HOMEPAGE,
        license=_LICENSE,
        citation=_CITATION,
        task_templates=templates,
    )
def _split_generators(self, dl_manager):
    """Download the configured data files/archives and build SplitGenerators.

    When no explicit features were given and labels are wanted, the class
    labels are inferred from the parent-directory names of the image files,
    and ``self.info.features`` is finalized here (it cannot be done in
    ``_info`` because the label set is unknown until the files are seen).
    """
    if not self.config.data_files:
        raise ValueError(
            f"At least one data file must be specified, but got data_files={self.config.data_files}"
        )
    # Only scan for labels when features were not supplied explicitly.
    capture_labels = not self.config.drop_labels and self.config.features is None
    if capture_labels:
        labels = set()

        def capture_labels_for_split(files_or_archives, downloaded_files_or_dirs):
            # Closure over `labels`: records the parent-directory name of
            # every image file as a class label.
            if len(downloaded_files_or_dirs) == 0:
                return
            # The files are separated from the archives at this point, so check the first sample
            # to see if it's a file or a directory and iterate accordingly
            if os.path.isfile(downloaded_files_or_dirs[0]):
                files, downloaded_files = files_or_archives, downloaded_files_or_dirs
                for file, downloaded_file in zip(files, downloaded_files):
                    file, downloaded_file = str(file), str(downloaded_file)
                    _, file_ext = os.path.splitext(file)
                    if file_ext.lower() in self.IMAGE_EXTENSIONS:
                        labels.add(os.path.basename(os.path.dirname(file)))
            else:
                archives, downloaded_dirs = files_or_archives, downloaded_files_or_dirs
                for archive, downloaded_dir in zip(archives, downloaded_dirs):
                    # NOTE(review): this rebinds `downloaded_file`, not
                    # `downloaded_dir` — looks like a typo for
                    # `downloaded_dir = str(downloaded_dir)`, so the dir
                    # passed to iter_files below is never str-converted.
                    # Harmless only if iter_files accepts path-like values;
                    # confirm.
                    archive, downloaded_file = str(archive), str(
                        downloaded_dir)
                    for downloaded_dir_file in dl_manager.iter_files(
                            downloaded_dir):
                        _, downloaded_dir_file_ext = os.path.splitext(
                            downloaded_dir_file)
                        if downloaded_dir_file_ext in self.IMAGE_EXTENSIONS:
                            labels.add(
                                os.path.basename(
                                    os.path.dirname(downloaded_dir_file)))

        logger.info("Inferring labels from data files...")
    data_files = self.config.data_files
    splits = []
    if isinstance(data_files, (str, list, tuple)):
        # Un-split data: everything becomes the train split.
        files = data_files
        if isinstance(files, str):
            files = [files]
        files, archives = self._split_files_and_archives(files)
        downloaded_files = dl_manager.download(files)
        downloaded_dirs = dl_manager.download_and_extract(archives)
        if capture_labels:
            capture_labels_for_split(files, downloaded_files)
            capture_labels_for_split(archives, downloaded_dirs)
        splits.append(
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    # Pairs of (original path, downloaded path); archive
                    # contents carry None as the original path.
                    "files": [(file, downloaded_file)
                              for file, downloaded_file in zip(
                                  files, downloaded_files)] +
                    [(None, dl_manager.iter_files(downloaded_dir))
                     for downloaded_dir in downloaded_dirs]
                },
            ))
    else:
        # Mapping of split name -> file list.
        for split_name, files in data_files.items():
            if isinstance(files, str):
                files = [files]
            files, archives = self._split_files_and_archives(files)
            downloaded_files = dl_manager.download(files)
            downloaded_dirs = dl_manager.download_and_extract(archives)
            if capture_labels:
                capture_labels_for_split(files, downloaded_files)
                capture_labels_for_split(archives, downloaded_dirs)
            splits.append(
                datasets.SplitGenerator(
                    name=split_name,
                    gen_kwargs={
                        "files": [(file, downloaded_file)
                                  for file, downloaded_file in zip(
                                      files, downloaded_files)] +
                        [(None, dl_manager.iter_files(downloaded_dir))
                         for downloaded_dir in downloaded_dirs]
                    },
                ))
    # Normally we would do this in _info, but we need to know the labels before building the features
    if capture_labels:
        if not self.config.drop_labels:
            self.info.features = datasets.Features({
                "image": datasets.Image(),
                "label": datasets.ClassLabel(names=sorted(labels))
            })
            task_template = ImageClassification(image_column="image",
                                                label_column="label")
            task_template = task_template.align_with_features(
                self.info.features)
            self.info.task_templates = [task_template]
        else:
            self.info.features = datasets.Features(
                {"image": datasets.Image()})
    return splits