Example #1
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[_NUM_VALIDATION:]
  validation_filenames = photo_filenames[:_NUM_VALIDATION]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Flowers dataset!')
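Note: the flowers-style examples in this listing call _get_filenames_and_classes and _dataset_exists without showing them. Below is a minimal sketch of what those helpers usually look like, assuming one sub-directory per class and the sharded '<name>_<split>_*.tfrecord' naming used by the TF-Slim conversion scripts; _NUM_SHARDS and the file pattern are assumptions, and the exact implementation in each example's repository may differ.

import os

import tensorflow as tf

_NUM_SHARDS = 5  # assumption: number of TFRecord shards per split


def _get_filenames_and_classes(dataset_dir):
  """Returns every image path and the list of class (sub-directory) names."""
  photo_filenames = []
  class_names = []
  for entry in sorted(os.listdir(dataset_dir)):
    class_dir = os.path.join(dataset_dir, entry)
    if os.path.isdir(class_dir):
      class_names.append(entry)
      for filename in os.listdir(class_dir):
        photo_filenames.append(os.path.join(class_dir, filename))
  return photo_filenames, class_names


def _dataset_exists(dataset_dir):
  """Checks whether all expected TFRecord shards are already present."""
  for split_name in ['train', 'validation']:
    for shard_id in range(_NUM_SHARDS):
      output_filename = os.path.join(
          dataset_dir, 'flowers_%s_%05d-of-%05d.tfrecord' %
          (split_name, shard_id, _NUM_SHARDS))
      if not tf.gfile.Exists(output_filename):
        return False
  return True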
Example #2
def run(dataset_dir, dataset):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    dataset: The name of the dataset to convert, e.g. 'cifar100'.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  training_filename = _get_output_filename(dataset_dir, 'train',
                                           dataset=dataset)
  testing_filename = _get_output_filename(dataset_dir, 'test',
                                          dataset=dataset)

  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  dataset_utils.download_and_uncompress_tarball(_DATA_URL[dataset], dataset_dir)

  # First, process the training data:
  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
    offset = 0
    for i in range(_NUM_TRAIN_FILES[dataset]):
      filename = os.path.join(dataset_dir,
                              _DATA_DIR[dataset],
                              _batch_name('train', offset=i, dataset=dataset))
      offset = _add_to_tfrecord(filename, tfrecord_writer, dataset, offset)

  # Next, process the testing data:
  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
    filename = os.path.join(dataset_dir,
                            _DATA_DIR[dataset],
                            _batch_name('test', offset=0, dataset=dataset))
    _add_to_tfrecord(filename, tfrecord_writer, dataset)

  # Finally, write the labels file:
  labels_to_class_names = dict(enumerate(_CLASS_NAMES[dataset]))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  if dataset == 'cifar100':
    coarse_labels_to_class_names = dict(enumerate(_COARSE_CLASS_NAMES))
    dataset_utils.write_label_file(coarse_labels_to_class_names, dataset_dir,
                                   filename=_COARSE_LABELS_FILENAME)
    

  _clean_up_temporary_files(dataset_dir, dataset)
  print('\nFinished converting the %s dataset!' % dataset)
Example #3
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  training_filename = _get_output_filename(dataset_dir, 'train')
  testing_filename = _get_output_filename(dataset_dir, 'test')

  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)

  # First, process the training data:
  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
    offset = 0
    for i in range(_NUM_TRAIN_FILES):
      filename = os.path.join(dataset_dir,
                              'cifar-10-batches-py',
                              'data_batch_%d' % (i + 1))  # 1-indexed.
      offset = _add_to_tfrecord(filename, tfrecord_writer, offset)

  # Next, process the testing data:
  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
    filename = os.path.join(dataset_dir,
                            'cifar-10-batches-py',
                            'test_batch')
    _add_to_tfrecord(filename, tfrecord_writer)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  #_clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Cifar10 dataset!')
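The CIFAR-10 examples rely on an _add_to_tfrecord(filename, tfrecord_writer, offset) helper that is not shown here. The following is a minimal sketch of the idea, assuming the standard pickled 'data_batch_*' layout (a dict with 'data' and 'labels' keys); it stores the raw 32x32x3 bytes, whereas the original helper may PNG-encode each image instead.

import pickle

import numpy as np
import tensorflow as tf


def _add_to_tfrecord(filename, tfrecord_writer, offset=0):
  """Appends one pickled CIFAR-10 batch to the given TFRecord writer."""
  with open(filename, 'rb') as f:
    batch = pickle.load(f, encoding='bytes')  # Python 3: dict keys are bytes

  images = batch[b'data']        # uint8, shape (num_images, 3072), CHW order
  labels = batch[b'labels']
  num_images = images.shape[0]
  images = images.reshape((num_images, 3, 32, 32))

  for i in range(num_images):
    image = np.transpose(images[i], (1, 2, 0))  # CHW -> HWC
    example = tf.train.Example(features=tf.train.Features(feature={
        'image/encoded': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[image.tobytes()])),
        'image/format': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[b'raw'])),
        'image/class/label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[int(labels[i])])),
    }))
    tfrecord_writer.write(example.SerializeToString())

  return offset + num_images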
Example #4
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  print(dataset_dir)
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes('/home/hoangtrunghieu/Medico2018/imdb/Medico_2018_development_set')
  # photo_filenames, class_names = _get_filenames_and_classes('/home/hoangtrunghieu/Medico2018/imdb/medico_full/images/kvasir-dataset-v2')
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  _NUM_VALIDATION = int(len(photo_filenames) * _SPLIT_VALIDATION)
  training_filenames = photo_filenames[_NUM_VALIDATION:]
  validation_filenames = photo_filenames[:_NUM_VALIDATION]
  print(class_names)
  # # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  # _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Medico dataset!')

  
Example #5
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  '''
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return
  '''
#  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
#  print(len(photo_filenames))
#  print(class_names)

  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
#  random.seed(_RANDOM_SEED)
#  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[_NUM_VALIDATION:]
  validation_filenames = photo_filenames[:_NUM_VALIDATION]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
  print(labels_to_class_names)

#  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Flowers dataset!')
Example #6
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    print('running')
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    training_filename = _get_output_filename(dataset_dir, 'train')
    testing_filename = _get_output_filename(dataset_dir, 'test')

    if tf.gfile.Exists(training_filename) and tf.gfile.Exists(
            testing_filename):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    _download_dataset(dataset_dir)

    # First, process the training data:
    with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
        data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME)
        labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME)
        _add_to_tfrecord(data_filename, labels_filename, 60000,
                         tfrecord_writer)

    # Next, process the testing data:
    with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
        data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME)
        labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME)
        _add_to_tfrecord(data_filename, labels_filename, 10000,
                         tfrecord_writer)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the MNIST dataset!')
Example #7
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  training_filename = _get_output_filename(dataset_dir, 'train')
  testing_filename = _get_output_filename(dataset_dir, 'test')

  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)

  # First, process the training data:
  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
    offset = 0
    for i in range(_NUM_TRAIN_FILES):
      filename = os.path.join(dataset_dir,
                              'cifar-10-batches-py',
                              'data_batch_%d' % (i + 1))  # 1-indexed.
      offset = _add_to_tfrecord(filename, tfrecord_writer, offset)

  # Next, process the testing data:
  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
    filename = os.path.join(dataset_dir,
                            'cifar-10-batches-py',
                            'test_batch')
    _add_to_tfrecord(filename, tfrecord_writer)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Cifar10 dataset!')
Example #8
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

#  if _dataset_exists(dataset_dir):
#    print('Dataset files already exist. Exiting without re-creating them.')
#    return

#  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes("/datashare/ImageCLEF/IRMA/ImageCLEF2008/sp/03/T/train")
  valid_photo_filenames, class_names_v = _get_filenames_and_classes("/datashare/ImageCLEF/IRMA/ImageCLEF2008/sp/03/T/validation")
  #valid_photo_filenames, class_names_v = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames
  #training_filenames = photo_filenames[_NUM_VALIDATION:]
  print(photo_filenames)
  #validation_filenames = photo_filenames[:_NUM_VALIDATION]
  validation_filenames = valid_photo_filenames

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   "/datashare/ImageCLEF/IRMA/ImageCLEF2008/sp/03/T/train")
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   "/datashare/ImageCLEF/IRMA/ImageCLEF2008/sp/03/T/validation")

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

#  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the 2T IRMA dataset!')
Example #9
def run(dataset_dir, train_name_list, test_name_list):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    train_name_list: The list of filenames to convert into the training split.
    test_name_list: The list of filenames to convert into the validation split.
  """
    #if not tf.gfile.Exists(dataset_dir):
    #  tf.gfile.MakeDirs(dataset_dir)

    #if _dataset_exists(dataset_dir):
    #  print('Dataset files already exist. Exiting without re-creating them.')
    #  return

    print(dataset_dir)
    #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    print(photo_filenames[0])

    print('training size : ' + str(len(train_name_list)))
    print('testing size : ' + str(len(test_name_list)))
    #training_filenames = photo_filenames[_NUM_VALIDATION:]
    #validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', train_name_list, class_names_to_ids, dataset_dir)

    print('finished converting training')

    _convert_dataset('validation', test_name_list, class_names_to_ids,
                     dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the nuclei dataset!')
Example #10
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.shuffle(photo_filenames)
    validation_filenames = photo_filenames[:_NUM_VALIDATION]
    training_filenames = photo_filenames[_NUM_VALIDATION:]

    #f = open("tmp/colon/test.txt", "w")
    #for i in test_filenames:
    #    f.write(i + "\n")
    #f.close()

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    #_clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the colon dataset!')
Example #11
def run(dataset_dir):
    """Runs the conversion operation
	Args:
	dataset_dir: the dataset directory where the dataset is stored
	"""

    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[:_NUM_TRAIN]
    validation_filenames = photo_filenames[_NUM_TRAIN:]

    path = os.path.join(dataset_dir, 'pathology_splits.txt')
    print('Creating file: ' + path)
    with open(path, 'w+') as f:
        f.write('Training Files:\n')
        for training_filename in training_filenames:
            f.write(training_filename + '\n')

        f.write('Validation Files:\n')
        for validation_filename in validation_filenames:
            f.write(validation_filename + '\n')

        # f.write('Testing Files:\n')
        # for testing_filename in testing_filenames:
        #     f.write(testing_filename + '\n')

    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)
    #	_convert_dataset('test', testing_filenames, class_names_to_ids, dataset_dir)

    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    print('\nFinished converting the Pathology dataset!')
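All of the directory-based examples hand their file lists to a _convert_dataset(split_name, filenames, class_names_to_ids, dataset_dir) helper. Below is a minimal sketch of the usual sharded-TFRecord pattern it implements, assuming the class name is the image's parent directory and that the images are JPEG files whose bytes can be stored as-is; real helpers typically also decode each image to record its height and width, and several variants in this listing take extra arguments (shard counts, output directories, image sizes).

import math
import os

import tensorflow as tf

_NUM_SHARDS = 5  # assumption


def _convert_dataset(split_name, filenames, class_names_to_ids, dataset_dir):
  """Writes the given images into _NUM_SHARDS TFRecord shards."""
  num_per_shard = int(math.ceil(len(filenames) / float(_NUM_SHARDS)))
  for shard_id in range(_NUM_SHARDS):
    output_filename = os.path.join(
        dataset_dir,
        '%s_%05d-of-%05d.tfrecord' % (split_name, shard_id, _NUM_SHARDS))
    with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer:
      start = shard_id * num_per_shard
      end = min((shard_id + 1) * num_per_shard, len(filenames))
      for filename in filenames[start:end]:
        with tf.gfile.FastGFile(filename, 'rb') as f:
          image_data = f.read()
        class_name = os.path.basename(os.path.dirname(filename))
        class_id = class_names_to_ids[class_name]
        example = tf.train.Example(features=tf.train.Features(feature={
            'image/encoded': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[image_data])),
            'image/format': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[b'jpg'])),
            'image/class/label': tf.train.Feature(
                int64_list=tf.train.Int64List(value=[class_id])),
        }))
        tfrecord_writer.write(example.SerializeToString())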
Example #12
def run(dataset_dir, tf_record_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    tf_record_dir: The directory where the TFRecord files are written.
  """
    if not tf.gfile.Exists(tf_record_dir):
        tf.gfile.MakeDirs(tf_record_dir)

    if _dataset_exists(tf_record_dir):
        print(
            'TF Record Dataset files already exist. Exiting without re-creating them.'
        )
        return

    # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    #print("\n\n",photo_filenames,"\n\n")
    print("class_names_to_ids : ", class_names_to_ids)

    # Find the number of validation examples we need
    num_validation = int(_VALIDATION_SIZE * len(photo_filenames))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    #First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     tf_record_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     tf_record_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, tf_record_dir)
Example #13
def run(dataset_name, dataset_dir, num_shards, ratio_val):
    """Runs the download and conversion operation.

    Args:
      dataset_name: The name used for the generated TFRecord files.
      dataset_dir: The dataset directory where the dataset is stored.
      num_shards: The number of TFRecord shards to write per split.
      ratio_val: The fraction of images used for validation.
    """
    tf_record_dir = os.path.join(dataset_dir, 'tfrecord')
    if not tf.gfile.Exists(tf_record_dir):
        tf.gfile.MakeDirs(tf_record_dir)

    if _dataset_exists(tf_record_dir, dataset_name, num_shards):
        print(
            'TFRecord files already exist. Exiting without re-creating them.')
        return

    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Calculate the number of validation images proportional to ratio_val
    num_validation = int(len(photo_filenames) * ratio_val)

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_validation:]
    validation_filenames = photo_filenames[:num_validation]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir, dataset_name, tf_record_dir, num_shards)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir, dataset_name, tf_record_dir, num_shards)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, tf_record_dir)

    print('\nFinished converting the %s dataset!' % dataset_name)
Example #14
def main(_):
    if not FLAGS.dataset_name:
        raise ValueError(
            'You must supply the dataset name with --dataset_name')
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    dataset_dir = FLAGS.dataset_dir
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    #_clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the %s dataset!' % FLAGS.dataset_name)
Example #15
def main(_):
    dataset_dir = FLAGS.dataset_dir
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if dataset_exists(dataset_dir):
        print('error: the dataset already exists')
        exit()

    photo_filenames, class_names = get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)

    # Convert the dataset
    convert_dataset(photo_filenames, class_names_to_ids, dataset_dir)

    # Save the class label file
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    print('\nDone')
Example #16
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    validation_filenames, training_filenames, class_names = (
        _get_filenames_and_classes())

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
Example #17
def run(dataset_dir):
    """Runs the download and conversion operation.

    Args:
      dataset_dir: The dataset directory where the dataset is stored.
    """
    photo_filenames = get_filenames(dataset_dir, jpg_or_tiff='jpg')
    labels_csv = read_labels(dataset_dir)
    class_names = get_classnames(labels_csv)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    output_dir = os.path.join(dataset_dir, 'tensorflow')

    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]
    test_filenames = get_filenames(dataset_dir,
                                   jpg_or_tiff='jpg',
                                   test_or_train='test')

    fnames_to_class_ids = {}
    for fname, tags in zip(labels_csv.image_name, labels_csv.tags):
        fnames_to_class_ids[fname] = [
            class_names_to_ids[i] for i in tags.split()
        ]

    convert_dataset('test', test_filenames, None, output_dir)
    convert_dataset('train', training_filenames, fnames_to_class_ids,
                    output_dir)
    convert_dataset('validation', validation_filenames, fnames_to_class_ids,
                    output_dir)

    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, output_dir)

    print('\nFinished converting!')
Example #18
def run(dataset_dir, output_dir, filename, data_type, num_tfrecords):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The directory where the dataset is stored.
    output_dir: The directory where the tfrecords should be stored.
    filename: Name of a txt file that stores all the training data details.
  """

    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)
        print('\nNeed to download dataset first!')
        return

    # the name of the converted tfrecord
    TF_filename = _get_output_filename(output_dir, data_type, 0, num_tfrecords)

    if tf.gfile.Exists(TF_filename):
        print(
            '\nDataset files already exist. Remove them and recreate a new directory.'
        )
        shutil.rmtree(output_dir)
        os.makedirs(output_dir)

    # process the training data:
    filenames, tracklet_ids, cam_ids \
        = _get_image_filenames_and_labels(filename, data_type)

    _write_to_tfrecord(filenames, tracklet_ids, cam_ids, dataset_dir,
                       output_dir, data_type, num_tfrecords)

    unique_labels = list(set(tracklet_ids))
    unique_labels.sort()
    labels_to_write = dict(zip(range(len(unique_labels)), unique_labels))
    dataset_utils.write_label_file(labels_to_write, output_dir)

    print('\nFinished converting the training data!')
Example #19
def run(dataset_dir):
    """Runs the download and conversion operation.

    Args:
      dataset_dir: The dataset directory where the dataset is stored.
    """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)

    # Finally, write the labels file:
    print("len(class_names)", len(class_names))
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    # Uncomment if cleaning up the data files is desired:
    # _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the CASIA_NDIRIS dataset!')
Example #20
def run(training_dataset_dir, testing_dataset_dir, convert_dataset_dir):
    """Runs the conversion operation.

    Args:
      training_dataset_dir: The directory where the training photos are stored.
      testing_dataset_dir: The directory where the testing photos are stored.
      convert_dataset_dir: The directory where the converted dataset is stored.
    """

    training_photo_filenames, training_class_names = _get_filenames_and_classes(
        training_dataset_dir)
    testing_photo_filenames, testing_class_names = _get_filenames_and_classes(
        testing_dataset_dir)

    class_names_to_ids = dict(
        zip(training_class_names, range(len(training_class_names))))
    if len(training_class_names) != len(testing_class_names):
        raise ValueError(
            'The training and testing datasets must contain the same classes')

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(training_photo_filenames)
    random.shuffle(testing_photo_filenames)

    # First, convert the training and validation sets.
    _convert_dataset('train', training_photo_filenames, class_names_to_ids,
                     convert_dataset_dir)
    _convert_dataset('validation', testing_photo_filenames, class_names_to_ids,
                     convert_dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(
        zip(range(len(training_class_names)), training_class_names))
    dataset_utils.write_label_file(labels_to_class_names, convert_dataset_dir)

    print('\nFinished converting the ICWT dataset!')
Example #21
def run(dataset_dirs):
    """Runs the download and conversion operation.

  Args:
    dataset_dirs: A list containing:
      - train_dir: The dataset directory where the train dataset is stored.
      - test_dir: The dataset directory where the test dataset is stored.
  """

    if not os.path.isdir('tf_data'):
        os.makedirs('tf_data')

    for dir_index in range(len(dataset_dirs)):
        dataset_dir = dataset_dirs[dir_index]
        dataset_type = 'train'
        if dir_index == 1:
            dataset_type = 'validation'

        if not tf.gfile.Exists(dataset_dir):
            raise ValueError('dataset directory does not exist: %s' % dataset_dir)

        photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
        myfile = open('tf_data/' + dataset_type + '.txt', 'w')
        myfile.write(str(len(photo_filenames)))
        myfile.close()
        class_names_to_ids = dict(zip(class_names, range(len(class_names))))

        # First, convert the training and validation sets.
        _convert_dataset(dataset_type, photo_filenames, class_names_to_ids,
                         dataset_dir)

        # Finally, write the labels file:
        labels_to_class_names = dict(zip(range(len(class_names)), class_names))
        dataset_utils.write_label_file(labels_to_class_names, 'tf_data')

    print('\nFinished converting the Mushroom dataset!')
Example #22
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  training_filename = _get_output_filename(dataset_dir, 'train')
  testing_filename = _get_output_filename(dataset_dir, 'test')

  if tf.gfile.Exists(training_filename) and tf.gfile.Exists(testing_filename):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  _download_dataset(dataset_dir)

  # First, process the training data:
  with tf.python_io.TFRecordWriter(training_filename) as tfrecord_writer:
    data_filename = os.path.join(dataset_dir, _TRAIN_DATA_FILENAME)
    labels_filename = os.path.join(dataset_dir, _TRAIN_LABELS_FILENAME)
    _add_to_tfrecord(data_filename, labels_filename, 60000, tfrecord_writer)

  # Next, process the testing data:
  with tf.python_io.TFRecordWriter(testing_filename) as tfrecord_writer:
    data_filename = os.path.join(dataset_dir, _TEST_DATA_FILENAME)
    labels_filename = os.path.join(dataset_dir, _TEST_LABELS_FILENAME)
    _add_to_tfrecord(data_filename, labels_filename, 10000, tfrecord_writer)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(_CLASS_NAMES)), _CLASS_NAMES))
  dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

  _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the MNIST dataset!')
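The two MNIST examples call _add_to_tfrecord(data_filename, labels_filename, num_images, tfrecord_writer). Here is a minimal sketch of that helper, assuming the standard gzipped IDX files (16-byte image header, 8-byte label header) and storing raw 28x28 grayscale bytes; the original helper may encode the images differently.

import gzip

import numpy as np
import tensorflow as tf

_IMAGE_SIZE = 28  # assumption: standard MNIST image size


def _add_to_tfrecord(data_filename, labels_filename, num_images,
                     tfrecord_writer):
  """Reads the IDX image/label files and writes them as tf.train.Examples."""
  with gzip.open(data_filename, 'rb') as f:
    f.read(16)  # skip the 16-byte IDX image header
    buf = f.read(num_images * _IMAGE_SIZE * _IMAGE_SIZE)
    images = np.frombuffer(buf, dtype=np.uint8).reshape(
        num_images, _IMAGE_SIZE, _IMAGE_SIZE)
  with gzip.open(labels_filename, 'rb') as f:
    f.read(8)  # skip the 8-byte IDX label header
    labels = np.frombuffer(f.read(num_images), dtype=np.uint8)

  for i in range(num_images):
    example = tf.train.Example(features=tf.train.Features(feature={
        'image/encoded': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[images[i].tobytes()])),
        'image/format': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[b'raw'])),
        'image/class/label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[int(labels[i])])),
    }))
    tfrecord_writer.write(example.SerializeToString())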
Example #23
def run(np_dir, tf_dir):
    """Runs the download and conversion operation.

    """
    if not tf.gfile.Exists(tf_dir):
        tf.gfile.MakeDirs(tf_dir)

    if _dataset_exists(tf_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    photo_filenames, class_names = _get_filenames_and_classes(np_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids, tf_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     tf_dir)

    image_count = os.path.join(tf_dir, 'image_count.txt')
    with tf.gfile.Open(image_count, 'w') as f:
        f.write('%d\n' % len(training_filenames))
        f.write('%d\n' % len(validation_filenames))

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, tf_dir)

    _clean_up_temporary_files(tf_dir)
    print('\nFinished converting the dataset!')
Example #24
def main(_):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  dataset_dir=FLAGS.dataset_dir
  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return
  
  if FLAGS.grey:
    _change_grow(dataset_dir)  # transform to grayscale
#  dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)

  # First, convert the training and validation sets.
  _convert_dataset(photo_filenames, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, "./data_train")

  #_clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the customized dataset at directory: {0}'.format(dataset_dir))
Example #25
def run(dataset_dir):
    """Runs the download and conversion operation.
  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(dataset_dir):  # the path does not exist
        tf.gfile.MakeDirs(dataset_dir)  # create it

    if _dataset_exists(dataset_dir):  # check whether the TFRecord data already exists
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names,
                                  range(len(class_names))))  # map class names to ids

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)  # shuffle the file paths to shuffle the data
    training_filenames = photo_filenames[_NUM_VALIDATION:]  # the rest is used for training
    validation_filenames = photo_filenames[:_NUM_VALIDATION]  # 350 images for validation

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)  # convert to TFRecord data
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)  # convert to TFRecord data

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)),
                                     class_names))  # map ids to the class names
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the Flowers dataset!')
Example #26
def run(dataset_dir, create_label_dict):
    """Runs the conversion operation.

  Args:
    dataset_dir: The root directory.
  """

    if not tf.gfile.Exists(dataset_dir):
        raise ValueError('The dataset directory must exist.')

    # Process for each of the data splits:
    for idx, split in enumerate(_SPLIT_NAMES):
        data_filename = os.path.join(dataset_dir, _DATA_FILENAMES[idx])
        label_filename = os.path.join(dataset_dir, _LABEL_FILENAMES[idx])
        _convert_to_tfrecord(dataset_dir, split, data_filename, label_filename)

    if create_label_dict:
        class_filename = os.path.join(dataset_dir, _CLASSNAMES_FILENAME)
        with open(class_filename) as fClassNames:
            class_names = fClassNames.read().splitlines()
        labels_to_class_names = dict(zip(range(len(class_names)), class_names))
        dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    print('\nFinished converting the NUSWIDE dataset!')
Example #27
def run(dataset_dir):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
  tf_record_path = os.path.join(dataset_dir, 'tf_record')
  if not tf.gfile.Exists(tf_record_path):
    tf.gfile.MakeDirs(tf_record_path)

  photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
  class_names_to_ids = dict(zip(class_names, range(len(class_names))))

  # Divide into train and test:
  total = len(photo_filenames)
  # 20% for validation
  num_val = int(0.2 * total)
  num_train = total - num_val
  random.seed(_RANDOM_SEED)
  random.shuffle(photo_filenames)
  training_filenames = photo_filenames[num_val:]
  validation_filenames = photo_filenames[:num_val]

  # First, convert the training and validation sets.
  _convert_dataset('train', training_filenames, class_names_to_ids,
                   dataset_dir)
  _convert_dataset('validation', validation_filenames, class_names_to_ids,
                   dataset_dir)

  # Finally, write the labels file:
  labels_to_class_names = dict(zip(range(len(class_names)), class_names))
  dataset_utils.write_label_file(labels_to_class_names, tf_record_path)
  dataset_utils.write_split_file(num_train, num_val, tf_record_path)

  # _clean_up_temporary_files(dataset_dir)
  print('\nFinished converting the Flowers dataset!')
Example #28
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    # TODO: train with our own dataset modified by sdukaka
    # if not tf.gfile.Exists(dataset_dir):
    #   tf.gfile.MakeDirs(dataset_dir)
    #
    # if _dataset_exists(dataset_dir):
    #   print('Dataset files already exist. Exiting without re-creating them.')
    #   return
    #

    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    # TODO: only generate train data modified by sdukaka
    training_filenames = photo_filenames[_NUM_VALIDATION:]
    validation_filenames = photo_filenames[:_NUM_VALIDATION]

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    print('\nFinished converting the Chairs dataset!')
Example #29
def run(dataset_dir,
        custom_binary_validation=None,
        custom_binary_validation_label=None,
        custom_binary_validation_ratio=None,
        output_suffix=None,
        is_other_dir=None):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if is_other_dir:
        run_other_dir(dataset_dir, output_suffix)

    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return
    random.seed(_RANDOM_SEED)
    # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    if custom_binary_validation:
        tmp_photo_filenames, class_names = _get_filenames_and_classes_by_label(
            dataset_dir, 'apparelv_binary_without_dummy')
        if not custom_binary_validation_ratio:
            custom_binary_validation_ratio = 0.
        if custom_binary_validation_ratio > 1:
            custom_binary_validation_ratio = 1.
        validation_filenames = []
        training_filenames = []
        for key in tmp_photo_filenames:
            if key == custom_binary_validation_label:
                ratio = custom_binary_validation_ratio
            else:
                ratio = 1. - custom_binary_validation_ratio

            random.shuffle(tmp_photo_filenames[key])
            training_filenames += tmp_photo_filenames[key][
                int(_NUM_VALIDATION * ratio):]
            print(key,
                  len(tmp_photo_filenames[key][:int(_NUM_VALIDATION * ratio)]))
            validation_filenames += tmp_photo_filenames[
                key][:int(_NUM_VALIDATION * ratio)]
    else:
        photo_filenames, class_names = _get_filenames_and_classes(
            dataset_dir, 'apparelv_binary_without_dummy')

        # Divide into train and test:
        print("Now let's start converting the Koreans dataset!")
        random.shuffle(photo_filenames)
        training_filenames = photo_filenames[_NUM_VALIDATION:]
        validation_filenames = photo_filenames[:_NUM_VALIDATION]

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir, output_suffix)
    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir, output_suffix)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    if output_suffix:
        dataset_utils.write_label_file(labels_to_class_names, dataset_dir,
                                       'labels_' + output_suffix + '.txt')
    else:
        dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    # _clean_up_temporary_files(dataset_dir, 'apparel')
    print('\nFinished converting the Koreans dataset!')
Example #30
def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
  """Gets a dataset tuple with instructions for reading ImageNet.

  Args:
    split_name: A train/test split name.
    dataset_dir: The base directory of the dataset sources.
    file_pattern: The file pattern to use when matching the dataset sources.
      It is assumed that the pattern contains a '%s' string so that the split
      name can be inserted.
    reader: The TensorFlow reader type.

  Returns:
    A `Dataset` namedtuple.

  Raises:
    ValueError: if `split_name` is not a valid train/test split.
  """
  if split_name not in _SPLITS_TO_SIZES:
    raise ValueError('split name %s was not recognized.' % split_name)

  if not file_pattern:
    file_pattern = _FILE_PATTERN
  file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

  # Allowing None in the signature so that dataset_factory can use the default.
  if reader is None:
    reader = tf.TFRecordReader

  keys_to_features = {
      'image/encoded': tf.FixedLenFeature(
          (), tf.string, default_value=''),
      'image/format': tf.FixedLenFeature(
          (), tf.string, default_value='jpeg'),
      'image/class/label': tf.FixedLenFeature(
          [], dtype=tf.int64, default_value=-1),
      'image/class/text': tf.FixedLenFeature(
          [], dtype=tf.string, default_value=''),
      'image/object/bbox/xmin': tf.VarLenFeature(
          dtype=tf.float32),
      'image/object/bbox/ymin': tf.VarLenFeature(
          dtype=tf.float32),
      'image/object/bbox/xmax': tf.VarLenFeature(
          dtype=tf.float32),
      'image/object/bbox/ymax': tf.VarLenFeature(
          dtype=tf.float32),
      'image/object/class/label': tf.VarLenFeature(
          dtype=tf.int64),
  }

  items_to_handlers = {
      'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'),
      'label': slim.tfexample_decoder.Tensor('image/class/label'),
      'label_text': slim.tfexample_decoder.Tensor('image/class/text'),
      'object/bbox': slim.tfexample_decoder.BoundingBox(
          ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
      'object/label': slim.tfexample_decoder.Tensor('image/object/class/label'),
  }

  decoder = slim.tfexample_decoder.TFExampleDecoder(
      keys_to_features, items_to_handlers)

  labels_to_names = None
  if dataset_utils.has_labels(dataset_dir):
    labels_to_names = dataset_utils.read_label_file(dataset_dir)
  else:
    labels_to_names = create_readable_names_for_imagenet_labels()
    dataset_utils.write_label_file(labels_to_names, dataset_dir)

  return slim.dataset.Dataset(
      data_sources=file_pattern,
      reader=reader,
      decoder=decoder,
      num_samples=_SPLITS_TO_SIZES[split_name],
      items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
      num_classes=_NUM_CLASSES,
      labels_to_names=labels_to_names)
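The get_split examples return a slim.dataset.Dataset rather than tensors. A short sketch of how such a dataset is typically consumed through TF-Slim's DatasetDataProvider follows; the split name, directory, image size and batch size are illustrative only.

import tensorflow as tf

slim = tf.contrib.slim

# Illustrative values; substitute your own split and dataset directory.
dataset = get_split('train', '/tmp/mydataset')
provider = slim.dataset_data_provider.DatasetDataProvider(
    dataset, num_readers=4, shuffle=True)
[image, label] = provider.get(['image', 'label'])

# Resize so the tensors have a static shape, then batch them.
image = tf.image.resize_images(tf.to_float(image), [224, 224])
images, labels = tf.train.batch(
    [image, label], batch_size=32, num_threads=4, capacity=128)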
Example #31
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
    else:
        # dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)  # no need to download
        photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
        # Map each class name to an id
        class_names_to_ids = dict(zip(class_names, range(len(class_names))))

        # Divide into train and test:
        random.seed(_RANDOM_SEED)
        random.shuffle(photo_filenames)  # the order is now shuffled
        training_filenames = photo_filenames[_NUM_VALIDATION:]
        validation_filenames = photo_filenames[:_NUM_VALIDATION]

        # First, convert the training and validation sets.
        _convert_dataset('train', training_filenames, class_names_to_ids,
                         dataset_dir)
        _convert_dataset('validation', validation_filenames,
                         class_names_to_ids, dataset_dir)

        # Finally, write the labels file:
        labels_to_class_names = dict(zip(range(len(class_names)), class_names))
        dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

        # _clean_up_temporary_files(dataset_dir+"flowers_photos")
        print('\nFinished converting the Cottons dataset!')
Example #32
def convert_img_to_tfrecord(project_dir,
        dataset_name,
        dataset_dir,
        image_dir,
        train_percentage,
        validation_percentage,
        test_percentage,
        image_height,
        image_width,
        **kwargs):
  """Runs conversion operation.

    Args:
      project_dir: (Default) Directory where the newly created dataset with tfrecord will be stored.
      dataset_name: The name of dataset that is created from input dataset.
      dataset_dir: (optional) Directory where the newly created dataset with tfrecord will be stored.
      image_dir: The dataset directory where the dataset is stored.
      train_percentage: train dataset
      validation_percentage: validation dataset
      test_percentage: test dataset
      image_height: Target image height for tfrecord.
      image_width: Target image width for tfrecord.
  """
  # print(dataset_dir)

  # if not os.listdir(image_dir):
  #   raise ValueError('No label folders found in image directory --image_dir')
  if not image_dir:
    raise ValueError('You must supply a image directory with --image_dir')

  if dataset_dir:
    dataset_dir = os.path.join(dataset_dir, dataset_name)
  else:
    # initialize default directories
    dataset_dir = os.path.join(os.path.join(project_dir, 'datasets'), dataset_name)
  # delete dataset directory if it exists
  if os.path.exists(dataset_dir):
    shutil.rmtree(dataset_dir)
  # call convert dataset function
  if len(os.listdir(image_dir)):
    # create new dataset directory
    if not tf.gfile.Exists(dataset_dir):
      tf.gfile.MakeDirs(dataset_dir)

    if train_percentage + validation_percentage + test_percentage > 100:
      raise ValueError('The sum of train, validation, and test percentages can not be greater than 100')

    photo_filenames, class_names = _get_filenames_and_classes(image_dir)

    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train, validation and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    # Compute the class id of every (shuffled) file so it stays aligned with
    # photo_filenames for the stratified split below.
    class_id = [class_names_to_ids[x.split('/')[-2]] for x in photo_filenames]
    # print('############', len(class_id))
    dataset_split = dict()
    training_filenames = photo_filenames[:]

    if test_percentage > 0:
      training_filenames, test_filenames = train_test_split(training_filenames, test_size=test_percentage/100, random_state=_RANDOM_SEED, stratify=class_id)
      test_size = len(test_filenames)
      print('Number of test images: ', test_size)
      num_samples_per_class = _convert_dataset('test', test_filenames, class_names_to_ids,
                     dataset_dir, dataset_name, image_height, image_width)
      dataset_split['test'] = test_size
      dataset_split['test_per_class'] = num_samples_per_class

    if validation_percentage > 0:
      training_filenames, validation_filenames = train_test_split(training_filenames, test_size=validation_percentage/100, random_state=_RANDOM_SEED)
      validation_size = len(validation_filenames)
      print('Number of validation images: ', validation_size)
      num_samples_per_class = _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir, dataset_name, image_height, image_width)
      dataset_split['validation'] = validation_size
      dataset_split['validation_per_class'] = num_samples_per_class

    if train_percentage > 0:
      training_filenames, train_filenames = train_test_split(training_filenames, test_size=train_percentage/100, random_state=_RANDOM_SEED)
      train_size = len(train_filenames)
      print('Number of training images: ', train_size)
      num_samples_per_class = _convert_dataset('train', train_filenames, class_names_to_ids, dataset_dir, dataset_name, image_height, image_width)
      dataset_split['train'] = train_size
      dataset_split['train_per_class'] = num_samples_per_class

    # Finally, write the label and dataset json files:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)
    dataset_utils.write_dataset_config_json(dataset_name,
                       dataset_dir, class_names,
                       dataset_split)

    print('\nFinished converting the {} dataset! Output directory: {}'.format(
        dataset_name, dataset_dir))
    return dataset_dir

  else:
    raise ValueError(
        'image directory --image_dir=[{}] is empty'.format(image_dir))
Example #33
def run(dataset_dir):
    """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
  """
    if not tf.gfile.Exists(dataset_dir):
        tf.gfile.MakeDirs(dataset_dir)

    if _dataset_exists(dataset_dir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
    photo_filenames, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    plant_filenames = photo_filenames['plants']
    no_plant_filenames = photo_filenames['no_plants']

    # check no_plant_filenames for corrupted images
    corrupted_images = 0
    no_plant_save_filenames = []
    print(len(no_plant_filenames))

    # Divide into train and test:

    random.seed(_RANDOM_SEED)
    random.shuffle(plant_filenames)
    random.shuffle(no_plant_save_filenames)

    training_filenames = plant_filenames[_NUM_PLANT_VALIDATION:]
    training_filenames.extend(
        no_plant_save_filenames[_NUM_NO_PLANT_VALIDATION:91758])

    random.seed(_RANDOM_SEED)
    random.shuffle(training_filenames)

    validation_plant_filenames = plant_filenames[:_NUM_PLANT_VALIDATION]
    validation_no_plant_filenames = no_plant_save_filenames[:
                                                            _NUM_NO_PLANT_VALIDATION]

    validation_filenames = validation_plant_filenames
    validation_filenames.extend(validation_no_plant_filenames)

    random.seed(_RANDOM_SEED)
    random.shuffle(validation_filenames)

    # First, convert the training and validation sets.
    _convert_dataset('train', training_filenames, class_names_to_ids,
                     dataset_dir, _NUM_SHARDS_TRAINING)

    _convert_dataset('validation', validation_filenames, class_names_to_ids,
                     dataset_dir, _NUM_SHARDS_VALIDATION)

    # Second, convert the validation sets for plant and no_plant separately
    #_convert_dataset('validation_plant', validation_plant_filenames, class_names_to_ids,
    #dataset_dir, _NUM_SHARDS_SPLIT)
    #_convert_dataset('validation_no_plant', validation_no_plant_filenames, class_names_to_ids,
    #dataset_dir, _NUM_SHARDS_SPLIT)

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)

    #_clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the PlantVsNoplant dataset!')
Example #34
def get_split(split_name, dataset_dir, file_pattern=None, reader=None):
    """Gets a dataset tuple with instructions for reading ImageNet.
  
    Args:
      split_name: A train/test split name.
      dataset_dir: The base directory of the dataset sources.
      file_pattern: The file pattern to use when matching the dataset sources.
        It is assumed that the pattern contains a '%s' string so that the split
        name can be inserted.
      reader: The TensorFlow reader type.
  
    Returns:
      A `Dataset` namedtuple.
  
    Raises:
      ValueError: if `split_name` is not a valid train/test split.
    """
    if split_name not in _SPLITS_TO_SIZES:
        raise ValueError('split name %s was not recognized.' % split_name)

    if not file_pattern:
        file_pattern = _FILE_PATTERN
    file_pattern = os.path.join(dataset_dir, file_pattern % split_name)

    # Allowing None in the signature so that dataset_factory can use the default.
    if reader is None:
        reader = tf.TFRecordReader

    keys_to_features = {
        'image/encoded':
        tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format':
        tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/class/label':
        tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
        'image/class/text':
        tf.FixedLenFeature([], dtype=tf.string, default_value=''),
        'image/object/bbox/xmin':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymin':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/xmax':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/bbox/ymax':
        tf.VarLenFeature(dtype=tf.float32),
        'image/object/class/label':
        tf.VarLenFeature(dtype=tf.int64),
    }

    items_to_handlers = {
        'image':
        slim.tfexample_decoder.Image('image/encoded', 'image/format'),
        'label':
        slim.tfexample_decoder.Tensor('image/class/label'),
        'label_text':
        slim.tfexample_decoder.Tensor('image/class/text'),
        'object/bbox':
        slim.tfexample_decoder.BoundingBox(['ymin', 'xmin', 'ymax', 'xmax'],
                                           'image/object/bbox/'),
        'object/label':
        slim.tfexample_decoder.Tensor('image/object/class/label'),
    }

    decoder = slim.tfexample_decoder.TFExampleDecoder(keys_to_features,
                                                      items_to_handlers)

    labels_to_names = None
    if dataset_utils.has_labels(dataset_dir):
        labels_to_names = dataset_utils.read_label_file(dataset_dir)
    else:
        labels_to_names = create_readable_names_for_imagenet_labels()
        dataset_utils.write_label_file(labels_to_names, dataset_dir)

    return slim.dataset.Dataset(data_sources=file_pattern,
                                reader=reader,
                                decoder=decoder,
                                num_samples=_SPLITS_TO_SIZES[split_name],
                                items_to_descriptions=_ITEMS_TO_DESCRIPTIONS,
                                num_classes=_NUM_CLASSES,
                                labels_to_names=labels_to_names)
Example #35
def run(dataset_dir, dataset_type):
  """Runs the download and conversion operation.

  Args:
    dataset_dir: The dataset directory where the dataset is stored.
    dataset_type: Which set should be downloaded and converted; one of 'train2014', 'test2014', 'validation2014', 'train2016', 'test2016', 'train2016_2'.
  """
  if(dataset_type == "train2014"):
    _DATA_URL = url_train_2014
  elif(dataset_type == "test2014"):
    _DATA_URL = url_test_2014_1
  elif(dataset_type == "validation2014"):
    _DATA_URL = url_test_2014_2
  elif(dataset_type == "train2016"):
    _DATA_URL = url_train_2016
  elif(dataset_type == "test2016"):
    _DATA_URL = url_test_2016_1
  elif(dataset_type == "train2016_2"):
    _DATA_URL = url_train_2016_2
  else:
    print("There exists no such dataset %s, please choose one of 'train2014', 'test2014', 'validation2014', 'train2016', 'test2016', 'train2016_2'." % dataset_type)
    return

  if not tf.gfile.Exists(dataset_dir):
    tf.gfile.MakeDirs(dataset_dir)

  if _dataset_exists(dataset_dir):
    print('Dataset files already exist. Exiting without re-creating them.')
    return

  # TODO Downloading
  #dataset_utils.download_and_uncompress_tarball(_DATA_URL, dataset_dir)
  if dataset_type != "test2016":  
    print("Getting filenames and class_id's ")  
    # Extract and save pictures and class names in dictionary
    photo_filenames, image_to_id, id_to_class_name, class_names = _get_filenames_and_classes(dataset_dir)
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))
    print("All filenames and class_id's found")  
        
    # Divide into train and test: TODO division necessary if packages (urls) contain splits?
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training1_filenames = photo_filenames[:_NUM_TRAIN_SET1]
    training2_filenames = photo_filenames[_NUM_TRAIN_SET1:_NUM_TRAIN_SET2]
    training3_filenames = photo_filenames[_NUM_TRAIN_SET2:]

    # First, convert the training and validation sets. 
    _convert_dataset('train_set1', training1_filenames, class_names_to_ids, image_to_id, 
                        dataset_dir)
    _convert_dataset('train_set2', training2_filenames, class_names_to_ids, image_to_id, 
                        dataset_dir)
    _convert_dataset('train_set3', training3_filenames, class_names_to_ids, image_to_id,
                        dataset_dir) 

        # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    # writes "labels.txt" with id (1:1000) -> class_id (e.g. 17266)
    dataset_utils.write_label_file(labels_to_class_names, dataset_dir)  
    # writes "labels2.txt" with class_id (e.g. 17266) -> class_name (e.g. rose ...)
    dataset_utils.write_label_file(id_to_class_name, dataset_dir, filename="labels2.txt")

    _clean_up_temporary_files(_DATA_URL, dataset_dir)
    print('\nFinished converting the Flowers dataset!')
  else:
      photo_filenames, image_to_media_id = _get_filenames_and_classes(dataset_dir, is_train_set=False)
      
      _convert_dataset('test_set', photo_filenames, None, image_to_media_id, dataset_dir, is_train_set=False) 
     
      print('\nFinished converting the Flowers test dataset!')