Example #1
def build_tfrecord(dataset_root_dir, tfrecord_save_path):
    """build the TF record files.

  Args:
    dataset_root_dir: The dataset directory where the dataset is stored.
    tfrecord_save_path: The directory to save the tfrecord files
  """
    if not tf.gfile.Exists(tfrecord_save_path):
        tf.gfile.MakeDirs(tfrecord_save_path)

    photo_filenames, class_names = _get_filenames_and_classes(dataset_root_dir)

    # Shuffle and divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    valid_num = int((1 - _TRAIN_RATIO) * len(photo_filenames))
    training_filenames = photo_filenames[valid_num:]
    validation_filenames = photo_filenames[:valid_num]

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids)
    _convert_dataset('validation', validation_filenames, class_names_to_ids)

    # Finally, write the labels file:
    dataset_utils.write_label_file(labels_to_class_names, tfrecord_save_path)

    print('\nFinished converting the dataset!')
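
The helper _get_filenames_and_classes is not shown in any of these examples. A minimal sketch of what such a helper usually does in these conversion scripts, assuming one subdirectory per class under the dataset root (the exact behaviour in each example may differ):

import os

def _get_filenames_and_classes(dataset_root_dir):
    """Returns all image paths and the class (subdirectory) names.

    Sketch only: assumes the layout <root>/<class_name>/<image file>.
    """
    photo_filenames = []
    class_names = []
    for entry in sorted(os.listdir(dataset_root_dir)):
        class_dir = os.path.join(dataset_root_dir, entry)
        if not os.path.isdir(class_dir):
            continue
        class_names.append(entry)
        for filename in os.listdir(class_dir):
            photo_filenames.append(os.path.join(class_dir, filename))
    return photo_filenames, class_names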
def run(dataset_dir):

    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)

    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the Flowers dataset!')
def run(dataset_dir):
    """Runs the download and conversion operation.
  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    training_filename = _get_output_filename(dataset_dir, 'train')
    testing_filename = _get_output_filename(dataset_dir, 'val')

    classes_train = sorted(
        list(
            filter(lambda x: os.path.isdir(join(dataset_dir, 'train', x)),
                   os.listdir(join(dataset_dir, 'train')))))
    classes_map = {}
    for idx, cls_train in enumerate(classes_train):
        classes_map[cls_train] = idx

    with contextlib2.ExitStack() as tf_record_close_stack:
        train_writer = dataset_utils.open_sharded_output_tfrecords(
            tf_record_close_stack, training_filename, _NUM_TRAIN_FILES)
        _create_tfrecord_train(dataset_dir, train_writer, classes_map)

    with contextlib2.ExitStack() as tf_record_close_stack:
        test_writer = dataset_utils.open_sharded_output_tfrecords(
            tf_record_close_stack, testing_filename, _NUM_TRAIN_FILES)
        _create_tfrecord_test(dataset_dir, test_writer, classes_map)

    labels_to_class_names = dict(zip(range(len(classes_train)), classes_train))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
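
dataset_utils.open_sharded_output_tfrecords is used here the way the TensorFlow Object Detection utilities use it: it opens one TFRecordWriter per shard and registers each writer on the ExitStack so they are all closed together. A hedged sketch of such a helper (the shard naming convention is an assumption):

import tensorflow as tf

def open_sharded_output_tfrecords(exit_stack, base_path, num_shards):
    """Opens num_shards TFRecord writers, all registered on exit_stack (sketch)."""
    shard_filenames = [
        '{}-{:05d}-of-{:05d}'.format(base_path, idx, num_shards)
        for idx in range(num_shards)
    ]
    return [
        exit_stack.enter_context(tf.python_io.TFRecordWriter(name))
        for name in shard_filenames
    ]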
Example #4
def run(dataset_dir):

    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    print('\nFinished converting the Flowers dataset!')
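
Almost every example finishes with dataset_utils.write_label_file. In the TF-Slim scripts this just writes a labels.txt file mapping each integer label to its class name; a minimal sketch, assuming that format:

import os
import tensorflow as tf

def write_label_file(labels_to_class_names, dataset_dir, filename='labels.txt'):
    """Writes one 'label:class_name' line per class (sketch of the usual format)."""
    labels_filename = os.path.join(dataset_dir, filename)
    with tf.gfile.Open(labels_filename, 'w') as f:
        for label, class_name in labels_to_class_names.items():
            f.write('%d:%s\n' % (label, class_name))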
Example #5
    def convertToTfrecord(self, nValidations=None):
        if not nValidations:
            nValidations = self.nValidations
        self.nValidations = nValidations
        #convert to Tfrecords
        photo_filenames, class_names = self._get_filenames_and_classes()
        class_names_to_ids = dict(zip(class_names, range(len(class_names))))

        # Divide into train and test:
        random.seed(self.randomSeed)
        random.shuffle(photo_filenames)
        training_filenames = photo_filenames[nValidations:]
        validation_filenames = photo_filenames[:nValidations]
        self.nTrains = len(photo_filenames) - nValidations
        
        # First, convert the training and validation sets.
        self._convert_dataset('train', training_filenames, class_names_to_ids)
        self._convert_dataset('validation', validation_filenames, class_names_to_ids)

        # Finally, write the labels file:
        labels_to_class_names = dict(zip(range(len(class_names)), class_names))
        dataset_utils.write_label_file(labels_to_class_names, self.datasetDir)
        self.labels_to_names = labels_to_class_names
        self.nClasses = len(labels_to_class_names)
        print('\nFinished converting the Matify dataset!')
def make_tfrecord(dataset_name, dataset_dir, train_fraction=0.9, num_channels=3, num_shards=4):
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_name, dataset_dir, num_shards):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None, None

    random.seed(_RANDOM_SEED)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)

    # Divide into train and test:
    print("Now let's start converting the Koreans dataset!")
    random.shuffle(photo_filenames)
    num_train = int(len(photo_filenames) * train_fraction)
    num_validation = int(len(photo_filenames) * (1 - train_fraction))
    num_dataset = len(photo_filenames)
    training_filenames = photo_filenames[:num_train]
    validation_filenames = photo_filenames[num_train:]

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    # First, convert the training and validation sets.
    _convert_dataset(dataset_name, 'train', training_filenames, class_names_to_ids, dataset_dir, num_shards,
                     num_channels)
    _convert_dataset(dataset_name, 'validation', validation_filenames, class_names_to_ids, dataset_dir, num_shards,
                     num_channels)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    json.dump({"num_train": num_train, "num_validation": num_validation, "num_classes": len(class_names)},
              open(os.path.join(dataset_dir, "metadata"), mode="w+"))

    return num_dataset, len(class_names)
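
A short usage sketch for make_tfrecord above, including reading back the metadata JSON it writes; the dataset name and directory are placeholders:

import json
import os

dataset_dir = '/path/to/dataset'  # placeholder path
num_images, num_classes = make_tfrecord('my_dataset', dataset_dir,
                                        train_fraction=0.9,
                                        num_channels=3,
                                        num_shards=4)

if num_images is not None:
    with open(os.path.join(dataset_dir, 'metadata')) as f:
        metadata = json.load(f)
    print(metadata['num_train'], metadata['num_validation'],
          metadata['num_classes'])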
Example #7
def run():
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(_OUTPUT_PATH):
        tf.gfile.MakeDirs(_OUTPUT_PATH)

    if _dataset_exists(_OUTPUT_PATH):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(_INPUT_PATH)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids)
    _convert_dataset('validation', validation_filenames, class_names_to_ids)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, _OUTPUT_PATH)

    # _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the Flowers dataset!')
Example #8
def run(dataset_dir):
    """Runs the download and conversion operation.

    Parameters
    ----------
    dataset_dir : str
        The directory where the temporary files are stored.

    """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    training_filename = os.path.join(dataset_dir, 'train-{:05d}-of-{:05d}')
    testing_filename = os.path.join(dataset_dir, 'test-{:05d}-of-{:05d}')

    _download_and_uncompress_dataset(dataset_dir)

    # First, process the training data:
    filenames = [os.path.join(dataset_dir, 'cifar-10-batches-py',
                              'data_batch_{}'.format(i + 1))
                 for i in range(_NUM_TRAIN_FILES)]
    _build_shards(filenames, training_filename)

    # Next, process the testing data:
    filename = os.path.join(dataset_dir, 'cifar-10-batches-py', 'test_batch')
    _build_shards([filename], testing_filename)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the CIFAR10 dataset!')
Example #9
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names, test_set = permutate(FLAGS.dataset_dir,
                                                       10000, 400)

    #Refer each of the class name to a specific integer number for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    #num_validation = int(FLAGS.validation_size * len(photo_filenames))

    #Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    random.shuffle(test_set)
    #training_filenames = photo_filenames[num_validation:]
    #validation_filenames = photo_filenames[:num_validation]
    training_filenames = photo_filenames
    validation_filenames = test_set

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % FLAGS.tfrecord_filename)
Example #10
def run(dataset_dir, output_dir):
    """Runs the download and conversion operation.

    Args:
      dataset_dir: The dataset directory where the dataset is stored.
      output_dir: The output directory where the TFRecord files are written.
    """
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Shuffle the list of image filenames using a fixed random seed.
    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     output_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     output_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, output_dir)

    print('\nFinished converting the dataset!')
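
_convert_dataset itself is not included in any of the examples. In the TF-Slim flowers script it splits the file list into shards, reads each image, and writes one tf.train.Example per image; a condensed sketch along those lines (the shard count, output naming, and feature keys are assumptions, and the height/width decoding step of the original script is omitted):

import math
import os
import sys
import tensorflow as tf

_NUM_SHARDS = 5  # assumed; each script defines its own value

def _convert_dataset(split_name, filenames, class_names_to_ids, output_dir):
    """Writes the given filenames into sharded TFRecord files (sketch)."""
    num_per_shard = int(math.ceil(len(filenames) / float(_NUM_SHARDS)))
    for shard_id in range(_NUM_SHARDS):
        output_filename = os.path.join(
            output_dir,
            '%s_%05d-of-%05d.tfrecord' % (split_name, shard_id, _NUM_SHARDS))
        with tf.python_io.TFRecordWriter(output_filename) as writer:
            start = shard_id * num_per_shard
            end = min((shard_id + 1) * num_per_shard, len(filenames))
            for i in range(start, end):
                sys.stdout.write('\r>> Converting image %d/%d shard %d' %
                                 (i + 1, len(filenames), shard_id))
                sys.stdout.flush()
                image_data = tf.gfile.GFile(filenames[i], 'rb').read()
                class_name = os.path.basename(os.path.dirname(filenames[i]))
                class_id = class_names_to_ids[class_name]
                example = tf.train.Example(features=tf.train.Features(feature={
                    'image/encoded': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[image_data])),
                    'image/format': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[b'jpg'])),
                    'image/class/label': tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[class_id])),
                }))
                writer.write(example.SerializeToString())
    sys.stdout.write('\n')
    sys.stdout.flush()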
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    dataset_dir_depth = "tmp/scenes/sc0013/depth"
    dataset_dir_rgb = "tmp/scenes/sc0013/rgb"

    photo_filenames_rgb, class_names_rgb = _get_filenames_and_classes(
        dataset_dir_rgb)
    photo_filenames_depth, class_names_depth = _get_filenames_and_classes(
        dataset_dir_depth)

    class_names_to_ids_rgb = dict(
        zip(class_names_rgb, range(len(class_names_rgb))))
    class_names_to_ids_depth = dict(
        zip(class_names_depth, range(len(class_names_depth))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    photo_filenames_rgb, photo_filenames_depth = shuffle(
        photo_filenames_rgb, photo_filenames_depth, random_state=_RANDOM_SEED)
    training_filenames_rgb = photo_filenames_rgb[_NUM_VALIDATION:]
    validation_filenames_rgb = photo_filenames_rgb[:_NUM_VALIDATION]
    training_filenames_depth = photo_filenames_depth[_NUM_VALIDATION:]
    validation_filenames_depth = photo_filenames_depth[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames_rgb, class_names_to_ids_rgb,
                     dataset_dir_rgb)
    # _convert_dataset('validation', validation_filenames_rgb, class_names_to_ids_rgb,
    # dataset_dir_rgb)
    _convert_dataset('train', training_filenames_depth,
                     class_names_to_ids_depth, dataset_dir_depth)
    # _convert_dataset('validation', validation_filenames_depth, class_names_to_ids_depth,
    # dataset_dir_depth)

    # Finally, write the labels file:
    labels_to_class_names_rgb = dict(
        zip(range(len(class_names_rgb)), class_names_rgb))
    dataset_utils.write_label_file(labels_to_class_names_rgb, dataset_dir_rgb)
    labels_to_class_names_depth = dict(
        zip(range(len(class_names_depth)), class_names_depth))
    dataset_utils.write_label_file(labels_to_class_names_depth,
                                   dataset_dir_depth)

    print('\nFinished converting the Objects dataset!')
Example #12
def main():

    #==============================================================CHECKS==========================================================================
    # Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError('tfrecord_filename is empty. Please state a tfrecord_filename argument.')

    # Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError('dataset_dir is empty. Please state a dataset_dir argument.')

    # Check if there is a dataset directory entered
    if not FLAGS.tfrecord_dir:
        raise ValueError('tfrecord_dir is empty. Please state a tfrecord_dir argument.')

    if not os.path.exists(FLAGS.tfrecord_dir):
        os.makedirs(FLAGS.tfrecord_dir)

    # If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.tfrecord_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    # Get a list of photo_filenames and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_ids = _get_filenames_and_classes(FLAGS.dataset_dir)

    # Refer each of the class name to a specific integer number for predictions later
    class_ids_to_serial = dict(zip(class_ids, range(len(class_ids))))

    # Write the labels file:
    serial_to_class_ids = dict(zip(range(len(class_ids)), class_ids))
    write_label_file(serial_to_class_ids, FLAGS.dataset_dir)

    # Find the number of validation examples we need
    num_validation = int(float(FLAGS.validation_size) * len(photo_filenames))

    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_ids_to_serial,
                     dataset_dir=FLAGS.tfrecord_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     simulate=FLAGS.simulate)
    _convert_dataset('validation', validation_filenames, class_ids_to_serial,
                     dataset_dir=FLAGS.tfrecord_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards,
                     simulate=FLAGS.simulate)

    print('\nFinished converting the %s dataset!' % FLAGS.tfrecord_filename)
Example #13
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo__train_filenames, photo__val_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    #Refer each of the class name to a specific integer number for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    training_filenames = photo__train_filenames
    validation_filenames = photo__val_filenames

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
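
_dataset_exists in the flag-driven examples is also left out; it presumably just checks whether every expected shard file is already on disk. A sketch under that assumption (the shard naming is a guess and must match whatever _convert_dataset writes):

import os

def _dataset_exists(dataset_dir, _NUM_SHARDS, output_filename):
    """Returns True only if every train/validation shard already exists (sketch)."""
    for split_name in ['train', 'validation']:
        for shard_id in range(_NUM_SHARDS):
            tfrecord_path = os.path.join(
                dataset_dir,
                '%s_%s_%05d-of-%05d.tfrecord' %
                (output_filename, split_name, shard_id, _NUM_SHARDS))
            if not os.path.exists(tfrecord_path):
                return False
    return True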
Example #14
def main(argv):

    if not FLAGS.tfrecord_filename:
        raise ValueError(
            "tfrecord_filename is empty. Please state a tfrecord_filename argument."
        )

    if not FLAGS.dataset_dir:
        raise ValueError(
            "dataset_dir is empty. Please state a dataset_dir argument.")

    if _dataset_exists(
            dataset_dir=FLAGS.dataset_dir,
            _NUM_SHARDS=FLAGS.num_shards,
            output_filename=FLAGS.tfrecord_filename,
    ):
        return None
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    num_validation = int(FLAGS.validation_size * len(photo_filenames))
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    _convert_dataset(
        "train",
        training_filenames,
        class_names_to_ids,
        dataset_dir=FLAGS.dataset_dir,
        tfrecord_filename=FLAGS.tfrecord_filename,
        _NUM_SHARDS=FLAGS.num_shards,
    )
    _convert_dataset(
        "validation",
        validation_filenames,
        class_names_to_ids,
        dataset_dir=FLAGS.dataset_dir,
        tfrecord_filename=FLAGS.tfrecord_filename,
        _NUM_SHARDS=FLAGS.num_shards,
    )

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)
def main():
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)
    write_data_summary(num_validation, len(photo_filenames), FLAGS.dataset_dir)
def run(dataset_dir, train_file, val_file):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    # First, convert the training and validation sets.
    _convert_dataset('train', train_file, dataset_dir)
    _convert_dataset('validation', val_file, dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = {0: 'background', 1: 'foreground'}
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    print('\nFinished converting the cityscapes dataset!')
Example #17
def run(dataset_dir):
  training_filenames, class_names = _get_filenames_and_classes(os.path.join(dataset_dir, 'train'))
  validation_filenames, class_names = _get_filenames_and_classes(os.path.join(dataset_dir, 'validation'))
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  random.seed(0)
  random.shuffle(training_filenames)
  random.shuffle(validation_filenames)

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
Example #18
def build_tfrecord(dataset_root_dir, tfrecord_save_path):
    """build the TF record files.

  Args:
    dataset_root_dir: The dataset directory where the dataset is stored.
    tfrecord_save_path: The directory to save the tfrecord files
  """
    print('\nStart...')

    if not tf.gfile.Exists(tfrecord_save_path):
        tf.gfile.MakeDirs(tfrecord_save_path)
    else:
        print("tfrecord_save_path has exist, please check!")
        print("stop")
        return

    print("\nLoading the list of all image filenames...")
    photo_filenames, class_names = _get_filenames_and_classes(dataset_root_dir)

    # Shuffle and divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    # Float division so the ratio is correct even under Python 2.
    train_ratio = _TRAIN_NUM_SHARDS / float(_TRAIN_NUM_SHARDS + _VALID_NUM_SHARDS)
    valid_num = int((1 - train_ratio) * len(photo_filenames))
    training_filenames = photo_filenames[valid_num:]
    validation_filenames = photo_filenames[:valid_num]
    print('Total size of the training set: %d' % len(training_filenames))
    print('Total size of the validation set: %d' % len(validation_filenames))

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))

    # First, convert the training and validation sets.
    print('\nStart converting the training dataset...')
    _convert_dataset('train', training_filenames, class_names_to_ids)
    print('\nStart converting the validation dataset...')
    _convert_dataset('validation', validation_filenames, class_names_to_ids)

    # Finally, write the labels file:
    dataset_utils.write_label_file(labels_to_class_names, tfrecord_save_path)

    print('\nFinished converting the dataset!')
Example #19
def buildTfRecordFile(input_images_path, out_path):
    file_directory = input_images_path
    dataset_dir = file_directory

    if not os.path.exists(dataset_dir):
        print('The directory for dataset (i.e., "' + dataset_dir +
              '") does not exist.')
        exit()

    output_dir = out_path
    if output_dir[-1] != "/":
        output_dir += "/"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # STORK sets the validation percentage elsewhere in the code and
    # recommends setting it to zero during the conversion.
    validation_percentage = 0  # (float(0) / 100.0)
    if not os.path.exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)
    image_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    _NUM_VALIDATION = int(len(image_filenames) * validation_percentage)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(image_filenames)
    training_filenames = image_filenames[_NUM_VALIDATION:]
    validation_filenames = image_filenames[:_NUM_VALIDATION]

    #First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir, output_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir, output_dir)

    #Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, output_dir)

    print('\nFinished converting dataset!')
    print('The converted data is stored in the directory: "' + output_dir +
          '"')
def run(dataset_dir):
    """Runs the download and conversion operation.

    Args:
      dataset_dir: The dataset directory where the dataset is stored.
    """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    if len(photo_filenames) == 0:
        print("No image files detected in %s" % dataset_dir)
        exit(-1)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)

    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    # Recreate OUTPUT_PATH from scratch.
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)
    dataset_utils.write_label_file(labels_to_class_names, OUTPUT_PATH)

    print("all files:%d classes: %d " % (len(photo_filenames), len(class_names)))

    _NUM_VALIDATION = math.ceil(len(photo_filenames)/10)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    num_shards = math.ceil(len(training_filenames) / 1000)
    g_process.set_all_step(num_shards)
    _convert_dataset("train", training_filenames, class_names_to_ids,
                     dataset_dir, num_shards)

    num_shards = math.ceil(len(validation_filenames) / 1000)
    g_process.reset()
    g_process.set_all_step(num_shards)
    _convert_dataset("validation", validation_filenames, class_names_to_ids,
                     dataset_dir, num_shards)
Example #21
def main(_):
  """Runs the download and conversion operation.

  The dataset directory is taken from FLAGS.dataset_dir.
  """
  dataset_dir = FLAGS.dataset_dir

  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  training_filename = _get_output_filename(dataset_dir, 'train')
  testing_filename = _get_output_filename(dataset_dir, 'test')

  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)

  # First, process the training data:
  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
    offset = 0
    for i in range(_NUM_TRAIN_FILES):
      filename = os.path.join(dataset_dir,
                              'cifar-10-batches-py',
                              'data_batch_%d' % (i + 1))  # 1-indexed.
      offset = _add_to_tfrecord(filename, tfrecord_writer, offset)

  # Next, process the testing data:
  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
    filename = os.path.join(dataset_dir,
                            'cifar-10-batches-py',
                            'test_batch')
    _add_to_tfrecord(filename, tfrecord_writer)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Cifar10 dataset!')
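
The dataset_utils helpers that the CIFAR-10 converter above and the MNIST converter that follows rely on ultimately serialize each image with something like image_to_tfexample. A hedged sketch of that proto-building step, assuming the feature keys commonly used by the TF-Slim readers:

import tensorflow as tf

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def image_to_tfexample(image_data, image_format, height, width, class_id):
    """Builds one tf.train.Example per image (sketch; key names assumed)."""
    return tf.train.Example(features=tf.train.Features(feature={
        'image/encoded': _bytes_feature(image_data),
        'image/format': _bytes_feature(image_format),
        'image/class/label': _int64_feature(class_id),
        'image/height': _int64_feature(height),
        'image/width': _int64_feature(width),
    }))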
def run(args):
    """Runs the download and conversion operation.

    The dataset directory is taken from FLAGS.dataset_dir; the args parameter
    is unused.
    """
    dataset_dir = FLAGS.dataset_dir

    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    training_filename = _get_output_filename(dataset_dir, 'train')
    testing_filename = _get_output_filename(dataset_dir, 'test')

    if tf.gfile.Exists(training_filename) and tf.gfile.Exists(
            testing_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    # _download_dataset(dataset_dir)

    # First, process the training data:
    with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
        data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME)
        labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME)
        _add_to_tfrecord(data_filename, labels_filename, 60000,
                         tfrecord_writer)

    # Next, process the testing data:
    with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
        data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME)
        labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME)
        _add_to_tfrecord(data_filename, labels_filename, 10000,
                         tfrecord_writer)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the MNIST dataset!')
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  # if not tf.gfile.Exists(dataset_dir):
  #   tf.gfile.MakeDirs(dataset_dir)

  # if _dataset_exists(dataset_dir):
  #   print('Dataset files already exist. Exiting without re-creating them.')
  #   return

  # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[_NUM_VALIDATION:]
  d = {'burgers': 0, 'notburgers': 0}
  for fn in training_filenames:
    if 'all/burgers' in fn:
      d['burgers'] += 1
    else:
      d['notburgers'] += 1
  print(d)
  validation_filenames = photo_filenames[:_NUM_VALIDATION]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
Example #24
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    # if not tf.gfile.Exists(dataset_dir):
    #   tf.gfile.MakeDirs(dataset_dir)

    # if _dataset_exists(dataset_dir):
    #   print('Dataset files already exist. Exiting without re-creating them.')
    #   return

    # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    d = {'burgers': 0, 'notburgers': 0}
    for fn in training_filenames:
        if 'all/burgers' in fn:
            d['burgers'] += 1
        else:
            d['notburgers'] += 1
    print(d)
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)

    # # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
def run(argv):
  """Runs the download and conversion operation.

  The dataset directory is taken from FLAGS.dataset_dir; argv is unused.
  """
  dataset_dir = FLAGS.dataset_dir

  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  training_filename = _get_output_filename(dataset_dir, 'train')
  testing_filename = _get_output_filename(dataset_dir, 'test')

  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  _download_dataset(dataset_dir)

  # First, process the training data:
  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
    data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME)
    labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME)
    _add_to_tfrecord(data_filename, labels_filename, 60000, tfrecord_writer)

  # Next, process the testing data:
  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
    data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME)
    labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME)
    _add_to_tfrecord(data_filename, labels_filename, 10000, tfrecord_writer)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the MNIST dataset!')
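
Once any of these converters has run, the resulting records can be read back for a quick sanity check. A short tf.data sketch, assuming the feature keys from the image_to_tfexample sketch above and a placeholder shard path:

import tensorflow as tf

def _parse_record(serialized):
    features = tf.parse_single_example(serialized, {
        'image/encoded': tf.FixedLenFeature([], tf.string),
        'image/format': tf.FixedLenFeature([], tf.string, default_value='png'),
        'image/class/label': tf.FixedLenFeature([], tf.int64),
    })
    image = tf.image.decode_image(features['image/encoded'], channels=1)
    return image, features['image/class/label']

dataset = tf.data.TFRecordDataset(['/path/to/mnist_train.tfrecord'])  # placeholder
dataset = dataset.map(_parse_record).batch(32)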
Example #26
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  training_filename = _get_output_filename(dataset_dir, 'train')
  testing_filename = _get_output_filename(dataset_dir, 'test')

  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return
#
#  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)

  # First, process the training data:
#  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
#    offset = 0
#
#    filename = 'D:/pig_recognize/pig_slim1/pig_data_all'  # 1-indexed.
#    offset = _add_to_tfrecord(filename, tfrecord_writer, offset)

#'''
  # Next, process the testing data:
  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
    filename = 'D:/pig_recognize/pig_slim1/pig_test_b_body_face' 
    _add_to_tfrecord(filename, tfrecord_writer)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

#  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Cifar10 dataset!')
Example #27
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """

    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, landmarks = _get_filenames_and_landmarks(dataset_dir)
    filenames_to_landmarks = dict(zip(photo_filenames, landmarks))
    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, filenames_to_landmarks,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames,
                     filenames_to_landmarks, dataset_dir)

    # Finally, write the labels file:
    landmark_names = ['left_eye', 'right_eye', 'nose']
    labels_landmark_names = dict(
        list(zip(list(range(len(landmark_names))), landmark_names)))
    dataset_utils.write_label_file(labels_landmark_names, dataset_dir)

    print('\nFinished converting the dataset!')
Example #28
def main():

    #==============================================================CHECKS==========================================================================
    #Check if there is a tfrecord_filename entered
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    #Check if there is a dataset directory entered
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    #If the TFRecord files already exist in the directory, then exit without creating the files again
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    #==============================================================END OF CHECKS===================================================================

    #Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir)

    print('{}; {}'.format(len(photo_filenames), len(class_names)))
    #Refer each of the class name to a specific integer number for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    #Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Divide the training datasets into train and test:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # Save validation images list to CSV file
    with open("./data/validate.csv", 'w') as f:
        f.write('IMAGE_NAME,CLASS_NAME\n')  # header matches the two columns written below
        for file in validation_filenames:
            head, filename = os.path.split(file)
            class_name = os.path.basename(os.path.dirname(file))
            f.write(str(filename) + ',' + str(class_name) + '\n')

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
Example #29
def run(dataset_dir, max_class_size=1000):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    max_class_size: Maximum number of training images kept per ISIC class.
  """
    if not tf.gfile.Exists(dataset_dir + "dataset_test/"):
        tf.gfile.MakeDirs(dataset_dir + "dataset_test/")

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    ##########################
    #          ISIC          #
    ##########################
    # Shuffle the data
    filepaths_isic, fileclasses_isic = _get_filenames_and_classes_isic(
        dataset_dir)

    assert (len(filepaths_isic) == len(fileclasses_isic))
    random.seed(_RANDOM_SEED)
    random.shuffle(filepaths_isic)
    random.seed(_RANDOM_SEED)
    random.shuffle(fileclasses_isic)

    ##########################
    #        TEST ISIC       #
    ##########################
    # Construct the test set
    test_filepaths_isic = []
    test_fileclasses_isic = []
    index_isic_ = []
    count_isic = 0
    count_isic_ = {}
    count_isic_["pigmented-lesions-benign/melanocytic-nevi/dermoscopy/"] = 0
    count_isic_["pigmented-lesions-malignant/melanoma/dermoscopy/"] = 0

    for j, i in enumerate(filepaths_isic):
        if fileclasses_isic[j] == "pigmented-lesions-benign/melanocytic-nevi/dermoscopy/" and \
         count_isic_["pigmented-lesions-benign/melanocytic-nevi/dermoscopy/"] < _NUM_TEST_ISIC_PER_CLASS:
            test_filepaths_isic.append(i)
            test_fileclasses_isic.append(fileclasses_isic[j])
            count_isic_[
                "pigmented-lesions-benign/melanocytic-nevi/dermoscopy/"] += 1
            index_isic_.append(j)
            count_isic += 1

        elif fileclasses_isic[j] == "pigmented-lesions-malignant/melanoma/dermoscopy/" and \
         count_isic_["pigmented-lesions-malignant/melanoma/dermoscopy/"] < _NUM_TEST_ISIC_PER_CLASS:
            test_filepaths_isic.append(i)
            test_fileclasses_isic.append(fileclasses_isic[j])
            count_isic_[
                "pigmented-lesions-malignant/melanoma/dermoscopy/"] += 1
            index_isic_.append(j)
            count_isic += 1

        if count_isic >= _NUM_TEST_ISIC:
            break

    val_dico_isic = [i for i in list(dico_isic.values()) if i is not None]
    count_training_isic = {}
    for i in val_dico_isic:
        count_training_isic[i] = 0
    training_filepaths_isic = []
    training_fileclasses_isic = []
    for i, j in enumerate(filepaths_isic):
        if i not in index_isic_ and count_training_isic[
                fileclasses_isic[i]] < max_class_size:
            training_filepaths_isic.append(j)
            training_fileclasses_isic.append(fileclasses_isic[i])
            count_training_isic[fileclasses_isic[i]] += 1

    ##########################
    #          URLS          #
    ##########################
    filepaths, fileclasses = _get_filenames_and_classes(dataset_dir)

    ##########################
    #          MERGE         #
    ##########################
    filepaths = filepaths + training_filepaths_isic
    fileclasses = fileclasses + training_fileclasses_isic

    # Shuffle the data
    assert (len(filepaths) == len(fileclasses))

    random.seed(_RANDOM_SEED)
    random.shuffle(filepaths)
    random.seed(_RANDOM_SEED)
    random.shuffle(fileclasses)

    ##########################
    #          VAL           #
    ##########################

    validation_filepaths = []
    validation_fileclasses = []
    index_ = []
    count = 0
    count_ = {}
    count_["benign-dermal-tumors-cysts-sinuses"] = 0
    count_["cutaneous-lymphoma-and-lymphoid-infiltrates"] = 0
    count_["epidermal-tumors-hamartomas-milia-and-growths-benign"] = 0
    count_["epidermal-tumors-pre-malignant-and-malignant"] = 0
    count_["genodermatoses-and-supernumerary-growths"] = 0
    count_["inflammatory"] = 0
    count_["malignant-dermal-tumor"] = 0
    count_["pigmented-lesions-benign"] = 0
    count_["pigmented-lesions-malignant"] = 0

    for j, i in enumerate(filepaths):
        name_ = fileclasses[j].split('/')[0]
        if name_ == "benign-dermal-tumors-cysts-sinuses" and \
         count_["benign-dermal-tumors-cysts-sinuses"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["benign-dermal-tumors-cysts-sinuses"] += 1
            index_.append(j)
            count += 1

        elif name_ == "cutaneous-lymphoma-and-lymphoid-infiltrates" and \
         count_["cutaneous-lymphoma-and-lymphoid-infiltrates"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["cutaneous-lymphoma-and-lymphoid-infiltrates"] += 1
            index_.append(j)
            count += 1

        elif name_ == "epidermal-tumors-hamartomas-milia-and-growths-benign" and \
         count_["epidermal-tumors-hamartomas-milia-and-growths-benign"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["epidermal-tumors-hamartomas-milia-and-growths-benign"] += 1
            index_.append(j)
            count += 1

        elif name_ == "epidermal-tumors-pre-malignant-and-malignant" and \
         count_["epidermal-tumors-pre-malignant-and-malignant"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["epidermal-tumors-pre-malignant-and-malignant"] += 1
            index_.append(j)
            count += 1

        elif name_ == "genodermatoses-and-supernumerary-growths" and \
         count_["genodermatoses-and-supernumerary-growths"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["genodermatoses-and-supernumerary-growths"] += 1
            index_.append(j)
            count += 1

        elif name_ == "inflammatory" and \
         count_["inflammatory"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["inflammatory"] += 1
            index_.append(j)
            count += 1

        elif name_ == "malignant-dermal-tumor" and \
         count_["malignant-dermal-tumor"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["malignant-dermal-tumor"] += 1
            index_.append(j)
            count += 1

        elif name_ == "pigmented-lesions-benign" and \
         count_["pigmented-lesions-benign"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["pigmented-lesions-benign"] += 1
            index_.append(j)
            count += 1

        elif name_ == "pigmented-lesions-malignant" and \
         count_["pigmented-lesions-malignant"] < _NUM_VALIDATION_PER_CLASS:
            validation_filepaths.append(i)
            validation_fileclasses.append(fileclasses[j])
            count_["pigmented-lesions-malignant"] += 1
            index_.append(j)
            count += 1

        if count >= _NUM_VALIDATION:
            break

    ##########################
    #          TRAIN         #
    ##########################
    training_filepaths = [
        filepaths[i] for i in range(len(filepaths)) if i not in index_
    ]
    training_fileclasses = [
        fileclasses[i] for i in range(len(fileclasses)) if i not in index_
    ]

    ##########################
    #          CLASS         #
    ##########################
    class_names = sorted(list(set(fileclasses)))
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    ## First, convert the training and validation sets.
    _convert_dataset('test_isic', test_filepaths_isic, test_fileclasses_isic,
                     class_names_to_ids, dataset_dir)
    _convert_dataset('validation', validation_filepaths,
                     validation_fileclasses, class_names_to_ids, dataset_dir)
    _convert_dataset('train', training_filepaths, training_fileclasses,
                     class_names_to_ids, dataset_dir)

    # Finally, write the labels file:

    # We associate a number to finest level classes
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))

    # We associate a number to level nine classes
    class_names_9 = sorted(
        list(set([name.split('/')[0] for name in class_names])))
    labels_to_class_names_9 = dict(
        zip(range(len(class_names_9)), class_names_9))

    # Mapping from finest level classes to level nine classes
    labels_to_labels_9 = {}
    for k, v in labels_to_class_names.items():
        name = v.split('/')[0]
        label_9 = list(labels_to_class_names_9.values()).index(name)
        labels_to_labels_9[k] = label_9

    # Write the corresponding files
    dataset_utils.write_label_file(labels_to_class_names,
                                   dataset_dir + "dataset_test/",
                                   filename='labels.txt')
    dataset_utils.write_label_file(labels_to_class_names_9,
                                   dataset_dir + "dataset_test/",
                                   filename='labels_9.txt')
    dataset_utils.write_label_file(labels_to_labels_9,
                                   dataset_dir + "dataset_test/",
                                   filename='labels_to_labels_9.txt')

    #_clean_up_temporary_files(dataset_dir)
    print('\nFinished!')
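
The long chain of elif branches above only caps each top-level class at _NUM_VALIDATION_PER_CLASS. A more compact equivalent is sketched below with collections.defaultdict; it assumes every top-level class should be capped the same way, which is what the nine explicit branches do for their specific classes:

from collections import defaultdict

def pick_validation(filepaths, fileclasses, per_class_cap, total_cap):
    """Greedily selects up to per_class_cap files per top-level class (sketch)."""
    counts = defaultdict(int)
    val_paths, val_classes, picked_indices = [], [], []
    for j, path in enumerate(filepaths):
        top_class = fileclasses[j].split('/')[0]
        if counts[top_class] < per_class_cap:
            val_paths.append(path)
            val_classes.append(fileclasses[j])
            counts[top_class] += 1
            picked_indices.append(j)
        if len(picked_indices) >= total_cap:
            break
    return val_paths, val_classes, picked_indices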
def main():
    args = get_args()

    # State your dataset directory
    flags.DEFINE_string('dataset_dir', os.path.expanduser(args.dataset_dir),
                        'String: Your dataset directory')

    # The number of images in the validation set. You would have to know the total number of examples in advance.
    # This is essentially your evaluation dataset.
    flags.DEFINE_float(
        'validation_size', 0.3,
        'Float: The proportion of examples in the dataset to be used for validation'
    )

    # The number of shards to split the dataset into
    flags.DEFINE_integer('num_shards', 2,
                         'Int: Number of shards to split the TFRecord files')

    # Seed for repeatability.
    flags.DEFINE_integer('random_seed', 0,
                         'Int: Random seed to use for repeatability.')

    # Output filename for the naming the TFRecord file
    flags.DEFINE_string(
        'tfrecord_filename', args.tfrecord_filename,
        'String: The output filename to name your TFRecord file')

    FLAGS = flags.FLAGS

    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')
    if dataset_exists(dataset_dir=FLAGS.dataset_dir,
                      _NUM_SHARDS=FLAGS.num_shards,
                      output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None

    # Get a list of photo_filenames like ['123.jpg', '456.jpg'...] and a list of sorted
    # class names from parsing the subdirectories.
    photo_filenames, class_names = get_filenames_and_classes(FLAGS.dataset_dir)

    # Map each class name to a specific integer ID for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Divide the dataset into train and validation sets:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]
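
    # For illustration (hypothetical numbers): with 1,000 photos and
    # validation_size = 0.3, num_validation = 300, so the first 300 shuffled
    # filenames form the validation set and the remaining 700 the training set.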

    # First, convert the training and validation sets.
    convert_dataset('train',
                    training_filenames,
                    class_names_to_ids,
                    dataset_dir=FLAGS.dataset_dir,
                    tfrecord_filename=FLAGS.tfrecord_filename,
                    _NUM_SHARDS=FLAGS.num_shards)
    convert_dataset('validation',
                    validation_filenames,
                    class_names_to_ids,
                    dataset_dir=FLAGS.dataset_dir,
                    tfrecord_filename=FLAGS.tfrecord_filename,
                    _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % FLAGS.tfrecord_filename)
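
# NOTE: get_args() is not shown in this example. A minimal argparse-based
# sketch, consistent with how main() uses its result (an assumption, not the
# original implementation), could look like this:
import argparse

def get_args():
    """Parses the command-line arguments that main() expects (hypothetical)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_dir', required=True,
                        help='Root directory of the image dataset.')
    parser.add_argument('--tfrecord_filename', required=True,
                        help='Base name for the generated TFRecord files.')
    return parser.parse_args()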
Example #31
0
def get_split(split_name, dataset_dir, labels_dir=None, file_pattern=None):
    """Retrieves a InputData object with the parameters for reading ImageNet data.

  Args:
    split_name: A train/test split name.
    dataset_dir: The base directory of the dataset sources.
    labels_dir: The folder where the labels file is located, and where it will
      be eventually written if missing.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted.

  Returns:
    An `InputData` object.

  Raises:
    ValueError: if `split_name` is not a valid train/test split.
  """
    if split_name not in _SPLITS_TO_SIZES:
        raise ValueError('split name %s was not recognized.' % split_name)
    if not labels_dir:
        labels_dir = dataset_dir
    if not file_pattern:
        file_pattern = _FILE_PATTERN
    file_pattern = file_pattern % split_name
    files = []
    # Allow for filename expansion without using Glob().
    # Example: 'train-[0,1023,05d]-of-01024' to generate:
    #   train-00000-of-01024
    #   train-00001-of-01024
    #   ...
    #   train-01023-of-01024
    m = re.match(r'(.*)\[(\d+),(\d+),([a-zA-Z0-9]+)\](.*)', file_pattern)
    if m:
        format_string = '%' + m.group(4)
        for n in range(int(m.group(2)), int(m.group(3)) + 1):
            seqstr = format_string % n
            files.append(
                os.path.join(dataset_dir,
                             m.group(1) + seqstr + m.group(5)))
    else:
        path = os.path.join(dataset_dir, file_pattern)
        # If the file_pattern ends with '.list', then the file is supposed to be a
        # file which lists the input files one per line.
        if path.endswith('.list'):
            with gfile.Open(path, 'r') as list_file:
                for fpath in list_file:
                    fpath = fpath.strip()
                    if fpath:
                        files.append(fpath)
        elif path.find('*') < 0:
            # If the path does not contain any glob pattern, assume it is a single
            # input file. Detecting glob patterns could be more complex, but all
            # of the examples seen so far use '*' only.
            files.append(path)
        else:
            # Otherwise we assume it is a glob-able path.
            files = gfile.Glob(path)
    keys_to_features = {
        'image/encoded':
        tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format':
        tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/class/label':
        tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
        'image/class/text':
        tf.FixedLenFeature([], dtype=tf.string, default_value=''),
        'image/object/bbox/xmin':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/class/label':
        tf.VarLenFeature(dtype=tf.int64),
    }
    items_to_handlers = {
        'image':
        slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'label':
        slim.tfexample_decoder.Tensor('image/class/label'),
        'label_text':
        slim.tfexample_decoder.Tensor('image/class/text'),
        'object/bbox':
        slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                           'image/object/bbox/'),
        'object/label':
        slim.tfexample_decoder.Tensor('image/object/class/label'),
    }
    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                      items_to_handlers)
    labels_to_names = None
    if dataset_utils.has_labels(labels_dir):
        labels_to_names = dataset_utils.read_label_file(labels_dir)
    else:
        labels_to_names = create_readable_names_for_imagenet_labels()
        dataset_utils.write_label_file(labels_to_names, labels_dir)
    return InputData(data_sources=files,
                     decoder=decoder,
                     num_samples=_SPLITS_TO_SIZES[split_name],
                     items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
                     num_classes=_NUM_CLASSES,
                     labels_to_names=labels_to_names)
def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
    """Gets a dataset tuple with instructions for reading ImageNet.

  Args:
    split_name: A train/test split name.
    dataset_dir: The base directory of the dataset sources.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted.
    reader: The TensorFlow reader type.

  Returns:
    A `Dataset` namedtuple.

  Raises:
    ValueError: if `split_name` is not a valid train/test split.
  """
    if split_name not in _SPLITS_TO_SIZES:
        raise ValueError('split name %s was not recognized.' % split_name)

    if not file_pattern:
        file_pattern = _FILE_PATTERN
    file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

    # Allowing None in the signature so that dataset_factory can use the default.
    if reader is None:
        reader = tf.TFRecordReader

    keys_to_features = {
        'image/encoded':
        tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format':
        tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/class/label':
        tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
        'image/class/text':
        tf.FixedLenFeature([], dtype=tf.string, default_value=''),
        'image/object/bbox/xmin':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/class/label':
        tf.VarLenFeature(dtype=tf.int64),
    }

    items_to_handlers = {
        'image':
        slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'label':
        slim.tfexample_decoder.Tensor('image/class/label'),
        'label_text':
        slim.tfexample_decoder.Tensor('image/class/text'),
        'object/bbox':
        slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                           'image/object/bbox/'),
        'object/label':
        slim.tfexample_decoder.Tensor('image/object/class/label'),
    }

    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                      items_to_handlers)

    labels_to_names = None
    if dataset_utils.has_labels(dataset_dir):
        labels_to_names = dataset_utils.read_label_file(dataset_dir)
    else:
        labels_to_names = create_readable_names_for_imagenet_labels()
        dataset_utils.write_label_file(labels_to_names, dataset_dir)

    return slim.dataset.Dataset(data_sources=file_pattern,
                                reader=reader,
                                decoder=decoder,
                                num_samples=_SPLITS_TO_SIZES[split_name],
                                items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
                                num_classes=_NUM_CLASSES,
                                labels_to_names=labels_to_names)
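
# Usage sketch (an assumption, not part of the original example): the slim
# Dataset returned by get_split() is typically consumed through a
# DatasetDataProvider, roughly as follows.
def _example_read_split(dataset_dir):
    """Illustrative only: returns an image/label tensor pair for the 'train' split."""
    dataset = get_split('train', dataset_dir)
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset, num_readers=4, shuffle=True)
    image, label = provider.get(['image', 'label'])
    return image, label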
Example #33
0
def main():
    # ============= CHECKS =============
    # Check that a tfrecord_filename argument was provided
    if not FLAGS.tfrecord_filename:
        raise ValueError(
            'tfrecord_filename is empty. Please state a tfrecord_filename argument.'
        )

    # Check that a dataset_dir argument was provided
    if not FLAGS.dataset_dir:
        raise ValueError(
            'dataset_dir is empty. Please state a dataset_dir argument.')

    # If the TFRecord files already exist in the directory, exit without re-creating them
    if _dataset_exists(dataset_dir=FLAGS.dataset_dir,
                       _NUM_SHARDS=FLAGS.num_shards,
                       output_filename=FLAGS.tfrecord_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return None
    # ========== END OF CHECKS ==========

    # Get a list of photo_filenames like ['123.jpg', '456.jpg', ...] and a sorted
    # list of class names from parsing the subdirectories.
    photo_filenames, class_names = _get_filenames_and_classes(
        FLAGS.dataset_dir, output_filename=FLAGS.tfrecord_filename)

    # Some datasets, such as UECFOOD256, include a category file with the actual
    # names that these photo directories correspond to. The commented-out block
    # below would map them.
    # if os.path.exists(os.path.join(FLAGS.dataset_dir, 'category.txt')):
    #     with open(os.path.join(FLAGS.dataset_dir, 'category.txt')) as cat_file:
    #         replacement_dict = [cat_name.split('\t') for cat_name in cat_file]
    #         class_names = [replacement_dict[int(class_name)][1].replace('\n','') for class_name in class_names]
    #         import pdb; pdb.set_trace()

    # Map each class name to a specific integer ID for predictions later
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
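
    # E.g. (hypothetical): class_names ['cats', 'dogs', 'horses'] yields
    # class_names_to_ids = {'cats': 0, 'dogs': 1, 'horses': 2}.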

    # Find the number of validation examples we need
    num_validation = int(FLAGS.validation_size * len(photo_filenames))

    # Divide the dataset into train and validation sets:
    random.seed(FLAGS.random_seed)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # First, convert the training and validation sets.
    _convert_dataset('train',
                     training_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)
    _convert_dataset('validation',
                     validation_filenames,
                     class_names_to_ids,
                     dataset_dir=FLAGS.dataset_dir,
                     tfrecord_filename=FLAGS.tfrecord_filename,
                     _NUM_SHARDS=FLAGS.num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    write_label_file(labels_to_class_names, FLAGS.dataset_dir)

    print('\nFinished converting the %s dataset!' % (FLAGS.tfrecord_filename))
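
# The flag definitions themselves are not shown in this example. A sketch of
# module-level definitions consistent with how main() reads FLAGS (an
# assumption, not the original code) might look like this, with
# `import tensorflow as tf` assumed earlier in the module:
flags = tf.app.flags

flags.DEFINE_string('dataset_dir', None, 'Root directory of the image dataset.')
flags.DEFINE_string('tfrecord_filename', None, 'Base name for the TFRecord files.')
flags.DEFINE_float('validation_size', 0.3, 'Proportion of images used for validation.')
flags.DEFINE_integer('num_shards', 2, 'Number of TFRecord shards per split.')
flags.DEFINE_integer('random_seed', 0, 'Random seed for repeatable shuffling.')

FLAGS = flags.FLAGS

if __name__ == '__main__':
    main()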