def read_data_sets(train_dir, one_hot=False):

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  train_images = extract_images(local_file)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  train_labels = extract_labels(local_file, one_hot=one_hot)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  test_images = extract_images(local_file)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  test_labels = extract_labels(local_file, one_hot=one_hot)

  # Flatten each image to a (rows * cols * depth,) vector and scale pixels to [0, 1].
  train_images = np.reshape(train_images, (train_images.shape[0], -1)) / 255.0
  test_images = np.reshape(test_images, (test_images.shape[0], -1)) / 255.0
  # Dividing by 1.0 simply casts the integer labels to float.
  train_labels = train_labels / 1.0
  test_labels = test_labels / 1.0

  return train_images, train_labels, test_images, test_labels
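A minimal usage sketch for the reader above, assuming SOURCE_URL, extract_images and extract_labels are defined at module level as in the standard MNIST helpers; the data directory is a placeholder.

train_x, train_y, test_x, test_y = read_data_sets('/tmp/mnist_data', one_hot=True)
print(train_x.shape)  # (60000, 784), pixel values scaled to [0, 1]
print(train_y.shape)  # (60000, 10) because one_hot=True
print(test_x.shape)   # (10000, 784)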
Example #2
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000):
  if fake_data:

    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  with open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  with open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  with open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  with open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images,
                       validation_labels,
                       dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

  return base.Datasets(train=train, validation=validation, test=test)
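A hedged usage sketch for this variant; the directory is a placeholder, and the num_examples counts assume the standard MNIST split with the default validation_size.

mnist = read_data_sets('/tmp/mnist_data', one_hot=True, validation_size=5000)
print(mnist.train.num_examples)       # 55000
print(mnist.validation.num_examples)  # 5000
print(mnist.test.num_examples)        # 10000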
Example #3
def train_and_eval(job_dir=None, model_type='WIDE_AND_DEEP'):
  print("Begin training and evaluation")

  # if local eval and no args passed, default
  if job_dir is None: job_dir = 'models/' 

  # Ensure path has a '/' at the end
  if job_dir[-1] != '/': job_dir += '/'

  gcs_base = 'https://storage.googleapis.com/'
  gcs_path = 'cloudml-public/census/data/'
  trainfile = 'adult.data.csv'
  testfile  = 'adult.test.csv'
  local_path = 'dataset_files'
  train_file = base.maybe_download(
    trainfile, local_path, gcs_base + gcs_path + trainfile)
  test_file = base.maybe_download(
    testfile, local_path, gcs_base + gcs_path + testfile)

  training_mode = 'learn_runner'
  train_steps = 1000
  test_steps = 100

  model_dir = job_dir + 'model_' + model_type + '_' + str(int(time.time()))
  print("Saving model checkpoints to " + model_dir)
  export_dir = model_dir + '/exports'

  # Manually train and export model
  if training_mode == 'manual':
    # In this function, editing below here is unlikely to be needed
    m = build_estimator(model_type, model_dir)

    m.fit(input_fn=generate_input_fn(train_file), steps=train_steps)
    print('fit done')

    results = m.evaluate(input_fn=generate_input_fn(test_file), steps=test_steps)
    print('evaluate done')

    print('Accuracy: %s' % results['accuracy'])

    export_folder = m.export_savedmodel(
      export_dir_base = export_dir,
      input_fn=serving_input_fn
    )

    print('Model exported to {}'.format(export_folder))


  elif training_mode == 'learn_runner':
    # use learn_runner
    experiment_fn = generate_experiment(
      model_dir, train_file, test_file, model_type)

    metrics, output_folder = learn_runner.run(experiment_fn, model_dir)

    print('Accuracy: {}'.format(metrics['accuracy']))
    print('Model exported to {}'.format(output_folder))
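A minimal local invocation sketch for the function above; the job directory and model type shown are simply the defaults the function itself falls back to.

if __name__ == '__main__':
    train_and_eval(job_dir='models/', model_type='WIDE_AND_DEEP')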
def download(d):
    """Binds voxforge_url, archive_dir, total, and counter into this scope.

    Downloads the given file.

    :param d: a tuple consisting of (index, file) where index is the index
              of the file to download and file is the name of the file to download
    """
    (i, file) = d
    download_url = voxforge_url + '/' + file
    c = counter.increment()
    print('Downloading file {} ({}/{})...'.format(i + 1, c, total))
    base.maybe_download(filename_of(download_url), archive_dir, download_url)
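The closure above depends on names (voxforge_url, archive_dir, total, counter) bound in an enclosing scope. Below is a self-contained sketch of how such a scope might be set up and driven with a thread pool; the URL, directory, and file list are placeholders, and the Counter class is a hypothetical stand-in for whatever thread-safe counter the original module provides.

from multiprocessing.dummy import Pool
from threading import Lock

from tensorflow.contrib.learn.python.learn.datasets import base


class Counter(object):
    """Thread-safe counter used only for progress reporting."""

    def __init__(self):
        self._lock = Lock()
        self._value = 0

    def increment(self):
        with self._lock:
            self._value += 1
            return self._value


def filename_of(url):
    return url.rsplit('/', 1)[-1]


voxforge_url = 'http://example.com/voxforge'  # placeholder base URL
archive_dir = '/tmp/voxforge'                 # placeholder download directory
files = ['speaker1.tgz', 'speaker2.tgz']      # placeholder archive names
total = len(files)
counter = Counter()


def download(d):
    (i, file) = d
    download_url = voxforge_url + '/' + file
    c = counter.increment()
    print('Downloading file {} ({}/{})...'.format(i + 1, c, total))
    base.maybe_download(filename_of(download_url), archive_dir, download_url)


# Each worker receives an (index, filename) tuple and calls the closure.
# Pool(4).map(download, enumerate(files))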
def _download_and_preprocess_data(data_dir):
    # Conditionally download data
    LDC93S1_BASE = "LDC93S1"
    LDC93S1_BASE_URL = "https://catalog.ldc.upenn.edu/desc/addenda/"
    local_file = base.maybe_download(LDC93S1_BASE + ".wav", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".wav")
    trans_file = base.maybe_download(LDC93S1_BASE + ".txt", data_dir, LDC93S1_BASE_URL + LDC93S1_BASE + ".txt")
    with open(trans_file, "r") as fin:
        transcript = ' '.join(fin.read().strip().lower().split(' ')[2:]).replace('.', '')

    df = pandas.DataFrame(data=[(os.path.abspath(local_file), os.path.getsize(local_file), transcript)],
                          columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(os.path.join(data_dir, "ldc93s1.csv"), index=False)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True):
  if fake_data:

    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'trainImage.txt.gz'
  TRAIN_LABELS = 'trainImageLabel.txt.gz'
  TEST_IMAGES = 'testImage.txt.gz'
  TEST_LABELS = 'testImageLabel.txt.gz'
  VALIDATION_SIZE = 36 

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  train_images = extract_images(local_file)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  train_labels = extract_labels(local_file, one_hot=one_hot)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  test_images = extract_images(local_file)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  test_labels = extract_labels(local_file, one_hot=one_hot)

  validation_images = train_images[:VALIDATION_SIZE]
  validation_labels = train_labels[:VALIDATION_SIZE]
#  train_images = train_images[VALIDATION_SIZE:]
#  train_labels = train_labels[VALIDATION_SIZE:]
  train_images = validation_images
  train_labels = validation_labels

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images,
                       validation_labels,
                       dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  
  return base.Datasets(train=train, validation=validation, test=test)
Example #7
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32):
  if fake_data:

    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
  VALIDATION_SIZE = 5000

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  train_images = extract_images(local_file)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  train_labels = extract_labels(local_file, one_hot=one_hot)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  test_images = extract_images(local_file)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  test_labels = extract_labels(local_file, one_hot=one_hot)

  validation_images = train_images[:VALIDATION_SIZE]
  validation_labels = train_labels[:VALIDATION_SIZE]
  train_images = train_images[VALIDATION_SIZE:]
  train_labels = train_labels[VALIDATION_SIZE:]

  train = DataSet(train_images, train_labels, start_id=0, dtype=dtype)
  validation = DataSet(validation_images,
                       validation_labels,
                       start_id=len(train_images),
                       dtype=dtype)
  test = DataSet(test_images,
                 test_labels,
                 start_id=(len(train_images) + len(validation_images)),
                 dtype=dtype)

  return base.Datasets(train=train, validation=validation, test=test)
Example #8
def read_data_sets(data_dir):
    filename = "cifar-100-python.tar.gz"
    print("getting data")
    SOURCE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'

    local_file = base.maybe_download(filename, data_dir, SOURCE_URL)
    
   
    print('Extracting', filename)
    train_images, train_labels = [], []
    test_images, test_labels = [], []
    with gfile.Open(data_dir + "/" + filename, 'rb') as f, tarfile.open(fileobj=f) as tar:
        for x in tar.getnames():
            if "data_batch" in x:
                i, l = _get_data(tar.extractfile(x))
                # CIFAR batches store pixels channel-first; reorder to NHWC
                # before collecting (num_examples, 32, 32, 3) arrays.
                train_images.extend(i.reshape((i.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1))
                train_labels.extend(l)
            if "test_batch" in x:
                i, l = _get_data(tar.extractfile(x))
                test_images.extend(i.reshape((i.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1))
                test_labels.extend(l)

    train_images = np.array(train_images)
    test_images = np.array(test_images)
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)

    train = DataSet(train_images, train_labels,dtype=dtypes.uint8,depth=100)
    test = DataSet(test_images, test_labels,dtype=dtypes.uint8,depth=100)
    
    return base.Datasets(train=train, validation=None, test=test)
Example #9
def maybe_download_dbpedia(data_dir):
  """Download if DBpedia data is not present."""
  train_path = os.path.join(data_dir, 'dbpedia_csv/train.csv')
  test_path = os.path.join(data_dir, 'dbpedia_csv/test.csv')
  if not (gfile.Exists(train_path) and gfile.Exists(test_path)):
    archive_path = base.maybe_download(
        'dbpedia_csv.tar.gz', data_dir, DBPEDIA_URL)
    tfile = tarfile.open(archive_path, 'r:*')
    tfile.extractall(data_dir)
Example #10
def read_data_sets(train_dir, fake_data=False, one_hot=False, dtype=tf.float32):
  class DataSets(object):
    pass
  data_sets = DataSets()

  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    data_sets.train = fake()
    data_sets.validation = fake()
    data_sets.test = fake()
    return data_sets

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
  VALIDATION_SIZE = 5000

  local_file = maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES)
  train_images = extract_images(local_file)

  local_file = maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS)
  train_labels = extract_labels(local_file, one_hot=one_hot)

  local_file = maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES)
  test_images = extract_images(local_file)

  local_file = maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS)
  test_labels = extract_labels(local_file, one_hot=one_hot)

  validation_images = train_images[:VALIDATION_SIZE]
  validation_labels = train_labels[:VALIDATION_SIZE]
  train_images = train_images[VALIDATION_SIZE:]
  train_labels = train_labels[VALIDATION_SIZE:]

  data_sets.train = DataSet(train_images, train_labels, dtype=dtype)
  data_sets.validation = DataSet(validation_images, validation_labels,
                                 dtype=dtype)
  data_sets.test = DataSet(test_images, test_labels, dtype=dtype)

  return data_sets
Example #11
def get_dbpedia(data_dir):
    train_path = os.path.join(data_dir, 'dbpedia_csv/train.csv')
    test_path = os.path.join(data_dir, 'dbpedia_csv/test.csv')
    if not (gfile.Exists(train_path) and gfile.Exists(test_path)):
        archive_path = base.maybe_download('dbpedia_csv.tar.gz', data_dir, DBPEDIA_URL)
        tfile = tarfile.open(archive_path, 'r:*')
        tfile.extractall(data_dir)
    train = base.load_csv(train_path, np.int32, 0, has_header=False)
    test = base.load_csv(test_path, np.int32, 0, has_header=False)
    datasets = base.Datasets(train=train, validation=None, test=test)
    return datasets
Example #12
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000):

    TRAIN_TEST_IMAGES = 'cifar-10-python.tar.gz'
    SOURCE_TRAIN_TEST = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    local_file = base.maybe_download(TRAIN_TEST_IMAGES, train_dir,
                                     SOURCE_TRAIN_TEST)

    with open(local_file, 'rb') as f:
        train_images, train_labels, test_images, test_labels = extract_images(
            f)

    # local_file = base.maybe_download(TRAIN_LABELS, train_dir,
    #                                  SOURCE_URL + TRAIN_LABELS)
    # with open(local_file, 'rb') as f:
    #   train_labels = extract_labels(f, one_hot=one_hot)

    # local_file = base.maybe_download(TEST_IMAGES, train_dir,
    #                                  SOURCE_URL + TEST_IMAGES)
    # with open(local_file, 'rb') as f:
    #   test_images = extract_images(f)

    # local_file = base.maybe_download(TEST_LABELS, train_dir,
    #                                  SOURCE_URL + TEST_LABELS)
    # with open(local_file, 'rb') as f:
    #   test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
Example #13
def _download_and_preprocess_data(data_dir):
    # Conditionally download data
    TED_DATA = "TEDLIUM_release2.tar.gz"
    TED_DATA_URL = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
    local_file = base.maybe_download(TED_DATA, data_dir, TED_DATA_URL)

    # Conditionally extract TED data
    TED_DIR = "TEDLIUM_release2"
    _maybe_extract(data_dir, TED_DIR, local_file)

    # Conditionally convert TED sph data to wav
    _maybe_convert_wav(data_dir, TED_DIR)

    # Conditionally split TED wav and text data into sentences
    train_files, dev_files, test_files = _maybe_split_sentences(data_dir, TED_DIR)

    # Write sets to disk as CSV files
    train_files.to_csv(path.join(data_dir, "ted-train.csv"), index=False)
    dev_files.to_csv(path.join(data_dir, "ted-dev.csv"), index=False)
    test_files.to_csv(path.join(data_dir, "ted-test.csv"), index=False)
Example #15
def get_mnist_images():
  import gzip
  from tensorflow.contrib.learn.python.learn.datasets import base
  import numpy
  
  def extract_images(f):
    """Extract the images into a 4D uint8 numpy array [index, y, x, depth].
    Args:
      f: A file object that can be passed into a gzip reader.
    Returns:
      data: A 4D uint8 numpy array [index, y, x, depth].
    Raises:
      ValueError: If the bytestream does not start with 2051.
    """
    print('Extracting', f.name)
    with gzip.GzipFile(fileobj=f) as bytestream:
      magic = _read32(bytestream)
      if magic != 2051:
        raise ValueError('Invalid magic number %d in MNIST image file: %s' %
                         (magic, f.name))
      num_images = _read32(bytestream)
      rows = _read32(bytestream)
      cols = _read32(bytestream)
      buf = bytestream.read(rows * cols * num_images)
      data = numpy.frombuffer(buf, dtype=numpy.uint8)
      data = data.reshape(num_images, rows, cols, 1)
      return data

  def _read32(bytestream):
    dt = numpy.dtype(numpy.uint32).newbyteorder('>')
    return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  source_url = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
  local_file = base.maybe_download(TRAIN_IMAGES, '/tmp',
                                     source_url + TRAIN_IMAGES)
  train_images = extract_images(open(local_file, 'rb'))
  train_images = train_images.reshape(60000, 28**2).T.astype(np.float64)/255
  return train_images
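The extract_images helper above documents the IDX image layout: a magic number of 2051 followed by big-endian uint32 counts of images, rows, and columns. Here is a small standalone sketch that parses just that header; the path is a placeholder for an already-downloaded .gz file.

import gzip
import struct


def read_idx_image_header(path):
    with gzip.open(path, 'rb') as f:
        magic, num_images, rows, cols = struct.unpack('>IIII', f.read(16))
    if magic != 2051:
        raise ValueError('Invalid magic number %d in %s' % (magic, path))
    return num_images, rows, cols


# print(read_idx_image_header('/tmp/train-images-idx3-ubyte.gz'))  # e.g. (60000, 28, 28)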
Example #16
def read_data_sets(graph,
                   data_dir,
                   batch_size,
                   numcep,
                   numcontext,
                   thread_count=8):
    # Conditionally download data
    TED_DATA = "TEDLIUM_release2.tar.gz"
    TED_DATA_URL = "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz"
    local_file = base.maybe_download(TED_DATA, data_dir, TED_DATA_URL)

    # Conditionally extract TED data
    TED_DIR = "TEDLIUM_release2"
    _maybe_extract(data_dir, TED_DIR, local_file)

    # Conditionally convert TED sph data to wav
    _maybe_convert_wav(data_dir, TED_DIR)

    # Conditionally split TED wav data
    _maybe_split_wav(data_dir, TED_DIR)

    # Conditionally split TED stm data
    _maybe_split_stm(data_dir, TED_DIR)

    # Create dev DataSet
    dev = _read_data_set(graph, data_dir, TED_DIR, "dev", thread_count,
                         batch_size, numcep, numcontext)

    # Create test DataSet
    test = _read_data_set(graph, data_dir, TED_DIR, "test", thread_count,
                          batch_size, numcep, numcontext)

    # Create train DataSet
    train = _read_data_set(graph, data_dir, TED_DIR, "train", thread_count,
                           batch_size, numcep, numcontext)

    # Return DataSets
    return DataSets(train, dev, test)
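A hedged usage sketch; the data directory is a placeholder, and the MFCC settings (numcep, numcontext) follow the DeepSpeech-style conventions this helper appears to expect.

import tensorflow as tf

with tf.Graph().as_default() as graph:
    ted = read_data_sets(graph, '/tmp/ted', batch_size=32,
                         numcep=26, numcontext=9)
    # ted bundles the train, dev and test sets built above.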
def read_data_set(name):
    if name == 'mnist':
        return input_data.read_data_sets(FLAGS.data_dir_mnist), 28, 28, 1
    elif name == 'frey_faces':
        maybe_download('frey_rawface.mat', FLAGS.data_dir_frey, 'http://www.cs.nyu.edu/~roweis/data/frey_rawface.mat')
        images = sio.loadmat(FLAGS.data_dir_frey + '/frey_rawface.mat', squeeze_me=True)
        img_rows, img_cols = 28, 20
        n_pixels = img_rows * img_cols

        images = images["ff"].T.reshape((-1, img_rows, img_cols))
        train_images, test_images = train_test_split(images, test_size=0.185)

        train_images = train_images.reshape((-1, n_pixels))
        test_images = test_images.reshape((-1, n_pixels))

        train = DataSet(train_images, dtype=dtypes.float32, seed=None)
        test = DataSet(test_images, dtype=dtypes.float32, seed=None)
        return Datasets(train=train, test=test), 20, 28, 1
    elif name == 'svhn':
        maybe_download('train_32x32.mat', FLAGS.data_dir_svhn, 'http://ufldl.stanford.edu/housenumbers/train_32x32.mat')
        train_images = sio.loadmat(FLAGS.data_dir_svhn + '/train_32x32.mat')['X']
        train_images = np.transpose(train_images, [3, 0, 1, 2])
        train_images = np.reshape(train_images, [-1, 32*32*3])

        maybe_download('test_32x32.mat', FLAGS.data_dir_svhn, 'http://ufldl.stanford.edu/housenumbers/test_32x32.mat')
        test_images = sio.loadmat(FLAGS.data_dir_svhn + '/test_32x32.mat')['X']
        test_images = np.transpose(test_images, [3, 0, 1, 2])
        test_images = np.reshape(test_images, [-1, 32 * 32 * 3])

        train = DataSet(train_images, dtype=dtypes.float32, seed=None)
        test = DataSet(test_images, dtype=dtypes.float32, seed=None)
        return Datasets(train=train, test=test), 32, 32, 3
    elif name == 'cifar10':
        ds = CIFAR10.loadCIFAR10(8)
        train = DataSet(ds['train_set'], dtype=dtypes.float32, seed=None)
        test = DataSet(ds['test_set'], dtype=dtypes.float32, seed=None)
        return Datasets(train=train, test=test), 8, 8, 3
    elif name == 'cifar10_full':
        ds = CIFAR10.loadCIFAR10(32)
        train = DataSet(ds['train_set'], dtype=dtypes.float32, seed=None)
        test = DataSet(ds['test_set'], dtype=dtypes.float32, seed=None)
        return Datasets(train=train, test=test), 32, 32, 3
    else:
        print('No such data set')
Example #18
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000):
  if fake_data:

    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)

    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  with open(local_file, 'rb') as f:
    train_images = extract_images(f)
Example #19
def read_data_sets(data_dir):
    filename = "cifar-10-python.tar.gz"
    print("getting data")
    SOURCE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'
    if rank == 0:
        local_file = base.maybe_download(filename, data_dir, SOURCE_URL)
    else:
        while not os.path.isfile(data_dir + "/" + filename):
            pass

    print('Extracting', filename)
    train_images, train_labels = [], []
    test_images, test_labels = [], []
    with gfile.Open(data_dir + "/" + filename,
                    'rb') as f, tarfile.open(fileobj=f) as tar:
        for x in tar.getnames():
            if "data_batch" in x:
                i, l = _get_data(tar.extractfile(x))
                train_images.extend(
                    i.reshape((i.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1))
                train_labels.extend(l)
            if "test_batch" in x:
                i, l = _get_data(tar.extractfile(x))
                test_images.extend(
                    i.reshape((i.shape[0], 3, 32, 32)).transpose(0, 2, 3, 1))
                test_labels.extend(l)

    train_images = np.array(train_images)
    test_images = np.array(test_images)
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)

    train = DataSet(train_images, train_labels, dtype=dtypes.uint8, depth=10)
    test = DataSet(test_images, test_labels, dtype=dtypes.uint8, depth=10)

    return base.Datasets(train=train, validation=None, test=test)
Example #20
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=1000,
                   seed=None):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)

    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)

    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)

    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)

    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    # now we filter the data
    index = numpy.where(test_labels < 10)
    test_images = test_images[index]
    test_labels = test_labels[index]
    test_labels = dense_to_one_hot(test_labels, 10)
    index = numpy.where(train_labels < 10)
    train_images = train_images[index]
    train_labels = train_labels[index]
    train_labels = dense_to_one_hot(train_labels, 10)

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
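A tiny standalone illustration of the numpy.where filtering used above: the index array keeps only the samples whose label passes the test, and the same index can be applied to images and labels alike.

import numpy

labels = numpy.array([3, 12, 7, 10, 1])
index = numpy.where(labels < 10)
print(labels[index])  # [3 7 1]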
Example #21
def get_mnist_images(max_images=0, fold='train'):
  """Returns mnist images, batch dimension last."""
  
  import gzip
  from tensorflow.contrib.learn.python.learn.datasets import base
  import numpy
  
  def extract_images(f):
    """Extract the images into a 4D uint8 numpy array [index, y, x, depth].
    Args:
      f: A file object that can be passed into a gzip reader.
    Returns:
      data: A 4D uint8 numpy array [index, y, x, depth].
    Raises:
      ValueError: If the bytestream does not start with 2051.
    """
    #    print('Extracting', f.name) # todo: remove
    with gzip.GzipFile(fileobj=f) as bytestream:
      magic = _read32(bytestream)
      if magic != 2051:
        raise ValueError('Invalid magic number %d in MNIST image file: %s' %
                         (magic, f.name))
      num_images = _read32(bytestream)
      if max_images:
        num_images = max_images
      rows = _read32(bytestream)
      cols = _read32(bytestream)
      buf = bytestream.read(rows * cols * num_images)
      data = numpy.frombuffer(buf, dtype=numpy.uint8)
      data = data.reshape(num_images, rows, cols, 1)
      return data

  def _read32(bytestream):
    dt = numpy.dtype(numpy.uint32).newbyteorder('>')
    return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]

  if fold == 'train': # todo: rename
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  elif fold == 'test':
    TRAIN_IMAGES = 't10k-images-idx3-ubyte.gz'
  else:
    assert False, 'unknown fold %s'%(fold)
    
  source_url = 'https://storage.googleapis.com/cvdf-datasets/mnist/'
  local_file = base.maybe_download(TRAIN_IMAGES, '/tmp',
                                     source_url + TRAIN_IMAGES)
  train_images = extract_images(open(local_file, 'rb'))
  dsize = train_images.shape[0]
  if fold == 'train':
    if not max_images:
      dsize = 60000
    else:
      dsize = max_images
      assert dsize <= 60000
  else:
    if not max_images:
      dsize = 10000
    else:
      dsize = max_images
      assert dsize <= 10000

  train_images = train_images.reshape(dsize, 28**2).T.astype(np.float64)/255
  train_images = np.ascontiguousarray(train_images)
  return train_images.astype(default_np_dtype)
Example #22
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   train_size=50000,
                   valid_size=10000,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    # train_num_examples = train_images.shape[0]
    # ikeys = set()
    # for i in range(train_num_examples):
    #   inonzero_m, inonzero_n, inonzero_l = train_images[i].nonzero()
    #   ikey = []
    #   for m, n, l in zip(inonzero_m, inonzero_n, inonzero_l):
    #     ikey.append(str(train_images[i, m, n, l]))
    #   ikey = '_'.join(ikey)
    #   ikey = hashlib.sha224(ikey)
    #   ikey = ikey.hexdigest()
    #   # print('%d %s' % (i, ikey))
    #   ikeys.add(ikey)
    # print('#ikey=%d' % (len(ikeys)))

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     source_url + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     source_url + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= train_size <= len(train_images):
        raise ValueError(
            'train size should be between 0 and {}. Received: {}.'.format(
                len(train_images), train_size))

    if not 0 <= valid_size <= len(train_images):
        raise ValueError(
            'valid size should be between 0 and {}. Received: {}.'.format(
                len(train_images), valid_size))

    valid_images = train_images[:valid_size]
    valid_labels = train_labels[:valid_size]
    # train_images = train_images[valid_size:]
    # train_labels = train_labels[valid_size:]
    train_images = train_images[len(train_images) - train_size:]
    train_labels = train_labels[len(train_labels) - train_size:]
    # print('train image={} label={}'.format(train_images.shape, train_labels.shape))
    # train_label_cn = {}
    # for train_label in train_labels:
    #   train_label = train_label.nonzero()[0][0]
    #   train_label_cn[train_label] = train_label_cn.get(train_label, 0) + 1
    # for train_label, count in train_label_cn.items():
    #   print('train label=%d count=%d' % (train_label, count))

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(valid_images, valid_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
Example #23
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   num_classes=75,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL,
                   train_imgaes= 'train-swallowsound-images-idx3-float.gz',
                   train_labels='train-swallowsound-labels-idx1-ubyte.gz',
                   test_imgaes='t10k-swallowsound-images-idx3-float.gz',
                   test_labels='t10k-swallowsound-labels-idx1-ubyte.gz',
                   gzip_compress=True,
                   MSB=True):
  if fake_data:

    def fake():
      return DataSet(
          [], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)

    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  if not source_url:  # empty string check
    source_url = DEFAULT_SOURCE_URL

  TRAIN_IMAGES = train_imgaes
  TRAIN_LABELS = train_labels
  TEST_IMAGES = test_imgaes
  TEST_LABELS = test_labels

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   source_url + TRAIN_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    train_images = extract_images(f,gzip_compress=gzip_compress,MSB=MSB)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   source_url + TRAIN_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot,num_classes = num_classes,gzip_compress=gzip_compress,MSB=MSB)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   source_url + TEST_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    test_images = extract_images(f,gzip_compress=gzip_compress,MSB=MSB)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   source_url + TEST_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot,num_classes = num_classes,gzip_compress=gzip_compress,MSB=MSB)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]


  options = dict(dtype=dtype, reshape=reshape, seed=seed)

  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)

  return base.Datasets(train=train, validation=validation, test=test)
Example #24
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   splits=[-1, 5000, -1]):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_images(local_file)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_images(local_file)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, one_hot=one_hot)

    validation_images = train_images[:5000]
    validation_labels = train_labels[:5000]
    train_images = train_images[5000:]
    train_labels = train_labels[5000:]

    [TRAIN_SIZE, VALIDATION_SIZE, TEST_SIZE] = splits
    # subsample the dataset if neccessary
    if (TRAIN_SIZE != -1):
        train_images, train_labels = stratified_subsampling(
            train_images, train_labels, TRAIN_SIZE)
        validation_images, validation_labels = stratified_subsampling(
            validation_images, validation_labels, VALIDATION_SIZE)

    if (TEST_SIZE != -1):
        test_images, test_labels = stratified_subsampling(
            test_images, test_labels, TEST_SIZE)

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
Example #25
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  with open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  with open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  with open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  with open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))
Example #26
sess = tf.InteractiveSession()

tf.global_variables_initializer().run()
# Train (10, 100, 1000)
for index in range(5000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={X: batch_xs, Y_: batch_ys})

print(sess.run(accuracy, feed_dict={X: mnist.validation.images,
                                    Y_: mnist.validation.labels}))
# Test trained model before submission
print(sess.run(accuracy, feed_dict={X: mnist.test.images,
                                    Y_: mnist.test.labels}))

# kaggle test data
if km.DOWNLOAD_DATASETS:
    base.maybe_download(km.KAGGLE_TEST_CSV, km.DATA_DIR, km.SOURCE_URL + km.KAGGLE_TEST_CSV)
kaggle_test_images = pd.read_csv(km.DATA_DIR + km.KAGGLE_TEST_CSV).values.astype('float32')
kaggle_test_images = np.reshape(kaggle_test_images, (kaggle_test_images.shape[0], 28, 28, 1))

# convert from [0:255] => [0.0:1.0]
kaggle_test_images = np.multiply(kaggle_test_images, 1.0 / 255.0)

predictions_kaggle = sess.run(tf.argmax(tf.nn.softmax(Y), 1), feed_dict={X: kaggle_test_images})

with open(km.SUBMISSION_FILE, 'w') as submission:
    submission.write('ImageId,Label\n')
    for index, prediction in enumerate(predictions_kaggle):
        submission.write('{0},{1}\n'.format(index + 1, prediction))
    print("prediction submission written to {0}".format(km.SUBMISSION_FILE))
Example #27
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=700,
                   seed=None):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train_prime_images.gz'
    TRAIN_LABELS = 'train_prime_labels.gz'
    TEST_IMAGES = 'test_prime_images.gz'
    TEST_LABELS = 'test_prime_labels.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    # TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    # TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    # TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    # TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    TRAIN_IMAGES = 'emnist-balanced-train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'emnist-balanced-train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 'emnist-balanced-test-images-idx3-ubyte.gz'
    TEST_LABELS = 'emnist-balanced-test-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    # local_file = base.maybe_download(TRAIN_LABELS, train_dir,
    #                                  SOURCE_URL + TRAIN_LABELS)
    # with gfile.Open(local_file, 'rb') as f:
    #   train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    # local_file = base.maybe_download(TEST_LABELS, train_dir,
    #                                  SOURCE_URL + TEST_LABELS)
    # with gfile.Open(local_file, 'rb') as f:
    #   test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    # validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    # train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, **options)
    validation = DataSet(validation_images, **options)
    test = DataSet(test_images, **options)

    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(data_path, fake_data=False, one_hot=True,
                   percentage_train=1.,
                   validation_size=5000, source_url=DEFAULT_SOURCE_URL):

    train_dir = data_path

    class DataSets(object):
        pass

    data_sets = DataSets()

    if fake_data:
        data_sets.train = DataSet([], [], fake_data=True, one_hot=True)
        data_sets.val = DataSet([], [], fake_data=True, one_hot=True)
        data_sets.test = DataSet([], [], fake_data=True, one_hot=True)
        return data_sets

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     source_url + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     source_url + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(len(train_images), validation_size))

    val_images = train_images[:validation_size]
    val_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    np.random.seed(42)

    # add random permutation
    n_train = train_images.shape[0]
    perm = np.random.permutation(n_train)
    train_images = train_images[perm]
    train_labels = train_labels[perm]

    n_val = val_images.shape[0]
    perm = np.random.permutation(n_val)
    val_images = val_images[perm]
    val_labels = val_labels[perm]

    n_test = test_images.shape[0]

    if percentage_train != 1.:
        train_size = int(percentage_train*train_images.shape[0])
        Xtrain_images, Xval_images, ytrain, yval = train_test_split(train_images, train_labels, train_size=train_size)
        train_images = Xtrain_images
        train_labels = ytrain

    data_sets.train = DataSet(train_images, train_labels, fake_data=True, one_hot=True)
    data_sets.val = DataSet(val_images, val_labels, fake_data=True, one_hot=True)
    data_sets.test = DataSet(test_images, test_labels, fake_data=True, one_hot=True)

    return data_sets
def custom_kaggle_mnist():
    """
    downloads and parses mnist train dataset for kaggle digit recognizer
    parsing and one_hot copied https://www.kaggle.com/kakauandme/tensorflow-deep-nn
    """
    if DOWNLOAD_DATASETS:
        base.maybe_download(KAGGLE_TRAIN_CSV, DATA_DIR,
                            SOURCE_URL + KAGGLE_TRAIN_CSV)

    # Import data from datasource, see https://www.kaggle.com/kakauandme/tensorflow-deep-nn
    # read training data from CSV file
    data = pd.read_csv(DATA_DIR + KAGGLE_TRAIN_CSV)

    from sklearn.utils import shuffle
    ## data = shuffle(data, random_state=42)

    images = data.iloc[:, 1:].values
    images = images.astype(np.float64)
    images = np.reshape(images, (images.shape[0], 28, 28, 1))

    # convert from [0:255] => [0.0:1.0]
    ## images = np.multiply(images, 1.0 / 255.0)

    print('number of images in downloaded train dataset: {0[0]}'.format(
        images.shape))

    labels_flat = data.iloc[:, 0].values
    labels_count = np.unique(labels_flat).shape[0]

    def dense_to_one_hot(labels_dense, num_classes):
        num_labels = labels_dense.shape[0]
        index_offset = np.arange(num_labels) * num_classes
        labels_one_hot = np.zeros((num_labels, num_classes))
        labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
        return labels_one_hot

    labels = dense_to_one_hot(labels_flat, labels_count)
    ## labels = labels.astype(np.uint8)

    # split data into training & validation
    mnist_train_images = images[:TRAIN_SIZE]
    mnist_train_labels = labels[:TRAIN_SIZE]
    print('number of train images: {0[0]}'.format(mnist_train_images.shape))

    mnist_valid_images = images[TRAIN_SIZE:TRAIN_SIZE + VALID_SIZE]
    mnist_valid_labels = labels[TRAIN_SIZE:TRAIN_SIZE + VALID_SIZE]
    print('number of valid images: {0[0]}'.format(mnist_valid_images.shape))

    mnist_test_images = images[TRAIN_SIZE + VALID_SIZE:images.shape[0]]
    mnist_test_labels = labels[TRAIN_SIZE + VALID_SIZE:images.shape[0]]
    print('number of test images: {0[0]}'.format(mnist_test_images.shape))

    train = DataSet(mnist_train_images,
                    mnist_train_labels,
                    dtype=np.float32,
                    reshape=False)
    valid = DataSet(mnist_valid_images,
                    mnist_valid_labels,
                    dtype=np.float32,
                    reshape=False)
    test = DataSet(mnist_test_images,
                   mnist_test_labels,
                   dtype=np.float32,
                   reshape=False)

    return base.Datasets(train=train, validation=valid, test=test)
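A tiny standalone check of the dense_to_one_hot trick used above: flat indexing turns dense class ids into one-hot rows.

import numpy as np


def dense_to_one_hot(labels_dense, num_classes):
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot


print(dense_to_one_hot(np.array([2, 0, 1]), 3))
# [[0. 0. 1.]
#  [1. 0. 0.]
#  [0. 1. 0.]]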
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=0):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    gz_file_name = 'cifar-10-python.tar.gz'

    local_file = base.maybe_download(gz_file_name, train_dir,
                                     SOURCE_URL + gz_file_name)

    train_images = []
    train_labels = []
    for i in range(1, 6):
        # Assumes the archive has already been extracted into
        # 'cifar-10-batches-py' under train_dir and that pickle is imported;
        # each batch file is a Python pickle, not an .npy file.
        with open(
                os.path.join(train_dir, 'cifar-10-batches-py',
                             'data_batch_%d' % i), 'rb') as f:
            batch = pickle.load(f, encoding='latin1')

            tmp_images = batch['data'].reshape([-1, 3, 32, 32])
            train_images.append(tmp_images.transpose([0, 2, 3, 1]))
            train_labels += batch['labels']
    train_images = numpy.concatenate(train_images)
    train_labels = numpy.array(train_labels)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    # test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
    test = None

    return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
  if fake_data:

    def fake():
      return DataSet(
          [], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)

    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  if not source_url:  # empty string check
    source_url = DEFAULT_SOURCE_URL

  # print("using %s" % source_url)
  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   source_url + TRAIN_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   source_url + TRAIN_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   source_url + TEST_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   source_url + TEST_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]


  options = dict(dtype=dtype, reshape=reshape, seed=seed)

  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)

  return base.Datasets(train=train, validation=validation, test=test)
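
For reference, the Datasets tuple returned above is normally consumed through DataSet.next_batch. A minimal usage sketch; the directory path and batch/step counts are illustrative assumptions, not part of the original:

# Hypothetical usage of the read_data_sets defined above.
mnist = read_data_sets('/tmp/mnist_data', one_hot=True, seed=42)

for step in range(100):
    # next_batch returns a (images, labels) pair, shuffling within an epoch.
    batch_images, batch_labels = mnist.train.next_batch(128)
    # ... feed batch_images / batch_labels to the model here ...

print(mnist.train.images.shape, mnist.validation.images.shape,
      mnist.test.images.shape)
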
Example #33
tf.logging.set_verbosity(tf.logging.INFO)

parser = argparse.ArgumentParser()

parser.add_argument('--job-dir',
                    help='GCS location to write checkpoints and export models',
                    required=False)
args = parser.parse_args()
job_dir = args.job_dir

# Data sets
IRIS_TRAINING_FILE = "iris_training.csv"
IRIS_TEST_FILE = "iris_test.csv"
gcs_folder = 'https://storage.googleapis.com/dataset-uploader/iris/'
IRIS_TRAINING = base.maybe_download(IRIS_TRAINING_FILE, '.',
                                    gcs_folder + IRIS_TRAINING_FILE)
IRIS_TEST = base.maybe_download(IRIS_TEST_FILE, '.',
                                gcs_folder + IRIS_TEST_FILE)

# Load datasets.
training_set = base.load_csv_with_header(filename=IRIS_TRAINING,
                                         features_dtype=np.float64,
                                         target_dtype=np.int)
test_set = base.load_csv_with_header(filename=IRIS_TEST,
                                     features_dtype=np.float64,
                                     target_dtype=np.int)

# Specify that all features have real-value data
feature_columns = [
    tf.contrib.layers.real_valued_column("flower_features", dimension=4)
]
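
The snippet stops after defining feature_columns. A plausible continuation, following the old tf.contrib.learn Iris tutorial pattern, trains a DNNClassifier on the two loaded CSVs; the input_fn, hidden_units, and step counts below are illustrative assumptions rather than part of the original:

def input_fn(dataset):
    # base.load_csv_with_header returns a namedtuple with .data and .target.
    features = {"flower_features": tf.constant(dataset.data)}
    labels = tf.constant(dataset.target)
    return features, labels

classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[10, 20, 10],
                                            n_classes=3,
                                            model_dir=job_dir)

classifier.fit(input_fn=lambda: input_fn(training_set), steps=1000)
accuracy = classifier.evaluate(input_fn=lambda: input_fn(test_set),
                               steps=1)["accuracy"]
print("Test accuracy: {:.4f}".format(accuracy))
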
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=False,
                   validation_size=5000,
                   worker_id=-1,
                   n_workers=-1):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_data(local_file, 60000)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, 60000)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_data(local_file, 10000)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, 10000)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    # The MNIST test split is reused as the validation set in this example.
    validation_images = test_images
    validation_labels = test_labels
    train_images_binary, train_labels_binary, test_images_binary, test_labels_binary = extract_for_binary(
        train_set=train_images,
        train_labels=train_labels,
        test_set=test_images,
        test_labels=test_labels)
    sampled_train_images, sampled_train_labels = down_sample(
        train_images_binary, train_labels_binary, down_sample_num=1024)
    new_data, new_labels = aug_data_set(sampled_train_images,
                                        sampled_train_labels,
                                        times_expand=1,
                                        aug_type='noise')
    #  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    #  train = DataSet(sampled_train_images, sampled_train_labels, dtype=dtype, reshape=reshape)
    train = DataSet(new_data, new_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    print(new_data.shape, new_labels.shape)
    print("=================================================================")
    return base.Datasets(train=train, validation=validation, test=None)
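
extract_for_binary, down_sample, and aug_data_set are project-specific helpers that are not shown here. Purely as an illustration of what a 'noise' augmentation along these lines could look like (a hypothetical sketch, not the original implementation, and assuming the images are already scaled floats):

import numpy


def aug_data_set_sketch(images, labels, times_expand=1, noise_std=0.05):
    # Hypothetical: append times_expand noisy copies of the inputs.
    new_images = [images]
    new_labels = [labels]
    for _ in range(times_expand):
        noise = noise_std * numpy.random.randn(*images.shape)
        new_images.append((images + noise).astype(images.dtype))
        new_labels.append(labels)
    return numpy.concatenate(new_images), numpy.concatenate(new_labels)
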
Example #35
def _download_and_preprocess_data(data_dir):
    # Conditionally download data to data_dir
    print(
        "Downloading Librivox data set (55GB) into {} if not already present..."
        .format(data_dir))
    with progressbar.ProgressBar(max_value=7,
                                 widget=progressbar.AdaptiveETA) as bar:
        TRAIN_CLEAN_100_URL = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
        TRAIN_CLEAN_360_URL = "http://www.openslr.org/resources/12/train-clean-360.tar.gz"
        TRAIN_OTHER_500_URL = "http://www.openslr.org/resources/12/train-other-500.tar.gz"

        DEV_CLEAN_URL = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
        DEV_OTHER_URL = "http://www.openslr.org/resources/12/dev-other.tar.gz"

        TEST_CLEAN_URL = "http://www.openslr.org/resources/12/test-clean.tar.gz"
        TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz"

        def filename_of(x):
            return os.path.split(x)[1]

        train_clean_100 = base.maybe_download(filename_of(TRAIN_CLEAN_100_URL),
                                              data_dir, TRAIN_CLEAN_100_URL)
        bar.update(0)
        train_clean_360 = base.maybe_download(filename_of(TRAIN_CLEAN_360_URL),
                                              data_dir, TRAIN_CLEAN_360_URL)
        bar.update(1)
        train_other_500 = base.maybe_download(filename_of(TRAIN_OTHER_500_URL),
                                              data_dir, TRAIN_OTHER_500_URL)
        bar.update(2)

        dev_clean = base.maybe_download(filename_of(DEV_CLEAN_URL), data_dir,
                                        DEV_CLEAN_URL)
        bar.update(3)
        dev_other = base.maybe_download(filename_of(DEV_OTHER_URL), data_dir,
                                        DEV_OTHER_URL)
        bar.update(4)

        test_clean = base.maybe_download(filename_of(TEST_CLEAN_URL), data_dir,
                                         TEST_CLEAN_URL)
        bar.update(5)
        test_other = base.maybe_download(filename_of(TEST_OTHER_URL), data_dir,
                                         TEST_OTHER_URL)
        bar.update(6)

    # Conditionally extract LibriSpeech data
    # We extract each archive into data_dir, but test for existence in
    # data_dir/LibriSpeech because the archives share that root.
    print("Extracting librivox data if not already extracted...")
    with progressbar.ProgressBar(max_value=7,
                                 widget=progressbar.AdaptiveETA) as bar:
        LIBRIVOX_DIR = "LibriSpeech"
        work_dir = os.path.join(data_dir, LIBRIVOX_DIR)

        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"),
                       train_clean_100)
        bar.update(0)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"),
                       train_clean_360)
        bar.update(1)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"),
                       train_other_500)
        bar.update(2)

        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-clean"),
                       dev_clean)
        bar.update(3)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-other"),
                       dev_other)
        bar.update(4)

        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-clean"),
                       test_clean)
        bar.update(5)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-other"),
                       test_other)
        bar.update(6)

    # Convert FLAC data to wav, from:
    # data_dir/LibriSpeech/split/1/2/1-2-3.flac
    # to:
    # data_dir/LibriSpeech/split-wav/1-2-3.wav
    #
    # And split LibriSpeech transcriptions, from:
    # data_dir/LibriSpeech/split/1/2/1-2.trans.txt
    # to:
    # data_dir/LibriSpeech/split-wav/1-2-0.txt
    # data_dir/LibriSpeech/split-wav/1-2-1.txt
    # data_dir/LibriSpeech/split-wav/1-2-2.txt
    # ...
    print("Converting FLAC to WAV and splitting transcriptions...")
    with progressbar.ProgressBar(max_value=7,
                                 widget=progressbar.AdaptiveETA) as bar:
        train_100 = _convert_audio_and_split_sentences(work_dir,
                                                       "train-clean-100",
                                                       "train-clean-100-wav")
        bar.update(0)
        train_360 = _convert_audio_and_split_sentences(work_dir,
                                                       "train-clean-360",
                                                       "train-clean-360-wav")
        bar.update(1)
        train_500 = _convert_audio_and_split_sentences(work_dir,
                                                       "train-other-500",
                                                       "train-other-500-wav")
        bar.update(2)

        dev_clean = _convert_audio_and_split_sentences(work_dir, "dev-clean",
                                                       "dev-clean-wav")
        bar.update(3)
        dev_other = _convert_audio_and_split_sentences(work_dir, "dev-other",
                                                       "dev-other-wav")
        bar.update(4)

        test_clean = _convert_audio_and_split_sentences(
            work_dir, "test-clean", "test-clean-wav")
        bar.update(5)
        test_other = _convert_audio_and_split_sentences(
            work_dir, "test-other", "test-other-wav")
        bar.update(6)

    # Write sets to disk as CSV files
    train_100.to_csv(os.path.join(data_dir, "librivox-train-clean-100.csv"),
                     index=False)
    train_360.to_csv(os.path.join(data_dir, "librivox-train-clean-360.csv"),
                     index=False)
    train_500.to_csv(os.path.join(data_dir, "librivox-train-other-500.csv"),
                     index=False)

    dev_clean.to_csv(os.path.join(data_dir, "librivox-dev-clean.csv"),
                     index=False)
    dev_other.to_csv(os.path.join(data_dir, "librivox-dev-other.csv"),
                     index=False)

    test_clean.to_csv(os.path.join(data_dir, "librivox-test-clean.csv"),
                      index=False)
    test_other.to_csv(os.path.join(data_dir, "librivox-test-other.csv"),
                      index=False)
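
_maybe_extract is referenced above but not shown. Assuming it behaves like the other maybe_* helpers (do the work only when the target is missing), a minimal tarfile-based sketch could look like this; the signature mirrors the calls above, but the body is an assumption:

import os
import tarfile


def _maybe_extract(data_dir, extracted_data, archive_path):
    # Extract archive_path into data_dir only if data_dir/extracted_data
    # does not already exist (assumed behaviour, mirroring maybe_download).
    target = os.path.join(data_dir, extracted_data)
    if not os.path.exists(target):
        with tarfile.open(archive_path) as tar:
            tar.extractall(data_dir)
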
Example #36
def read_data_sets(data_dir,
                   train_batch_size,
                   dev_batch_size,
                   test_batch_size,
                   numcep,
                   numcontext,
                   thread_count=8,
                   limit_dev=0,
                   limit_test=0,
                   limit_train=0,
                   sets=[]):
    # Conditionally download data
    TED_DATA = "TEDLIUM_release1.tar.gz"
    TED_DATA_URL = "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz"
    local_file = base.maybe_download(TED_DATA, data_dir, TED_DATA_URL)

    # Conditionally extract TED data
    TED_DIR = "TEDLIUM_release1"
    _maybe_extract(data_dir, TED_DIR, local_file)

    # Conditionally convert TED sph data to wav
    _maybe_convert_wav(data_dir, TED_DIR)

    # Conditionally split TED wav data
    _maybe_split_wav(data_dir, TED_DIR)

    # Conditionally split TED stm data
    _maybe_split_stm(data_dir, TED_DIR)

    # Create dev DataSet
    dev = None
    if "dev" in sets:
        dev = _read_data_set(data_dir,
                             TED_DIR,
                             "dev",
                             thread_count,
                             dev_batch_size,
                             numcep,
                             numcontext,
                             limit=limit_dev)

    # Create test DataSet
    test = None
    if "test" in sets:
        test = _read_data_set(data_dir,
                              TED_DIR,
                              "test",
                              thread_count,
                              test_batch_size,
                              numcep,
                              numcontext,
                              limit=limit_test)

    # Create train DataSet
    train = None
    if "train" in sets:
        train = _read_data_set(data_dir,
                               TED_DIR,
                               "train",
                               thread_count,
                               train_batch_size,
                               numcep,
                               numcontext,
                               limit=limit_train)

    # Return DataSets
    return DataSets(train, dev, test)
Example #37
def _download_and_preprocess_data(data_dir):
  # Conditionally download data to data_dir
  print("Downloading Librivox data set (55GB) into {} if not already present...".format(data_dir))
  with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
    TRAIN_CLEAN_100_URL = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
    TRAIN_CLEAN_360_URL = "http://www.openslr.org/resources/12/train-clean-360.tar.gz"
    TRAIN_OTHER_500_URL = "http://www.openslr.org/resources/12/train-other-500.tar.gz"

    DEV_CLEAN_URL = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
    DEV_OTHER_URL = "http://www.openslr.org/resources/12/dev-other.tar.gz"

    TEST_CLEAN_URL = "http://www.openslr.org/resources/12/test-clean.tar.gz"
    TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz"

    def filename_of(x): return os.path.split(x)[1]
    train_clean_100 = base.maybe_download(filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL)
    bar.update(0)
    train_clean_360 = base.maybe_download(filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL)
    bar.update(1)
    train_other_500 = base.maybe_download(filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL)
    bar.update(2)

    dev_clean = base.maybe_download(filename_of(DEV_CLEAN_URL), data_dir, DEV_CLEAN_URL)
    bar.update(3)
    dev_other = base.maybe_download(filename_of(DEV_OTHER_URL), data_dir, DEV_OTHER_URL)
    bar.update(4)

    test_clean = base.maybe_download(filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL)
    bar.update(5)
    test_other = base.maybe_download(filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL)
    bar.update(6)

  # Conditionally extract LibriSpeech data
  # We extract each archive into data_dir, but test for existence in
  # data_dir/LibriSpeech because the archives share that root.
  print("Extracting librivox data if not already extracted...")
  with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
    LIBRIVOX_DIR = "LibriSpeech"
    work_dir = os.path.join(data_dir, LIBRIVOX_DIR)

    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"), train_clean_100)
    bar.update(0)
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"), train_clean_360)
    bar.update(1)
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"), train_other_500)
    bar.update(2)

    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-clean"), dev_clean)
    bar.update(3)
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-other"), dev_other)
    bar.update(4)

    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-clean"), test_clean)
    bar.update(5)
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-other"), test_other)
    bar.update(6)

  # Convert FLAC data to wav, from:
  # data_dir/LibriSpeech/split/1/2/1-2-3.flac
  # to:
  # data_dir/LibriSpeech/split-wav/1-2-3.wav
  #
  # And split LibriSpeech transcriptions, from:
  # data_dir/LibriSpeech/split/1/2/1-2.trans.txt
  # to:
  # data_dir/LibriSpeech/split-wav/1-2-0.txt
  # data_dir/LibriSpeech/split-wav/1-2-1.txt
  # data_dir/LibriSpeech/split-wav/1-2-2.txt
  # ...
  print("Converting FLAC to WAV and splitting transcriptions...")
  with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
    train_100 = _convert_audio_and_split_sentences(work_dir, "train-clean-100", "train-clean-100-wav")
    bar.update(0)
    train_360 = _convert_audio_and_split_sentences(work_dir, "train-clean-360", "train-clean-360-wav")
    bar.update(1)
    train_500 = _convert_audio_and_split_sentences(work_dir, "train-other-500", "train-other-500-wav")
    bar.update(2)

    dev_clean = _convert_audio_and_split_sentences(work_dir, "dev-clean", "dev-clean-wav")
    bar.update(3)
    dev_other = _convert_audio_and_split_sentences(work_dir, "dev-other", "dev-other-wav")
    bar.update(4)

    test_clean = _convert_audio_and_split_sentences(work_dir, "test-clean", "test-clean-wav")
    bar.update(5)
    test_other = _convert_audio_and_split_sentences(work_dir, "test-other", "test-other-wav")
    bar.update(6)

  # Write sets to disk as CSV files
  train_100.to_csv(os.path.join(data_dir, "librivox-train-clean-100.csv"), index=False)
  train_360.to_csv(os.path.join(data_dir, "librivox-train-clean-360.csv"), index=False)
  train_500.to_csv(os.path.join(data_dir, "librivox-train-other-500.csv"), index=False)

  dev_clean.to_csv(os.path.join(data_dir, "librivox-dev-clean.csv"), index=False)
  dev_other.to_csv(os.path.join(data_dir, "librivox-dev-other.csv"), index=False)

  test_clean.to_csv(os.path.join(data_dir, "librivox-test-clean.csv"), index=False)
  test_other.to_csv(os.path.join(data_dir, "librivox-test-other.csv"), index=False)
Example #38
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=False,
                   validation_size=5000,
                   worker_id=-1,
                   n_workers=-1):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    train_images = extract_data(local_file, 60000)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    train_labels = extract_labels(local_file, 60000)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    test_images = extract_data(local_file, 10000)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    test_labels = extract_labels(local_file, 10000)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    # The MNIST test split is reused as the validation set in this example.
    validation_images = test_images
    validation_labels = test_labels

    # convert labels to one-hot labels here:
    train_labels_tmp = numpy.zeros((train_labels.shape[0], NUM_LABELS))
    train_labels_tmp[numpy.arange(len(train_labels_tmp)), train_labels] += 1

    valid_labels_tmp = numpy.zeros((validation_labels.shape[0], NUM_LABELS))
    valid_labels_tmp[numpy.arange(len(valid_labels_tmp)),
                     validation_labels] += 1

    #train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    train = DataSet(train_images,
                    train_labels_tmp,
                    dtype=dtype,
                    reshape=reshape)

    validation = DataSet(validation_images,
                         valid_labels_tmp,
                         dtype=dtype,
                         reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=None)
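
The arange-based indexing above is a common way to turn dense integer labels into one-hot vectors; an equivalent, slightly shorter formulation uses numpy.eye. A small sketch, assuming integer labels in [0, NUM_LABELS):

import numpy

NUM_LABELS = 10  # assumed, as in the example above

dense = numpy.array([3, 0, 7, 3])
one_hot_a = numpy.zeros((dense.shape[0], NUM_LABELS))
one_hot_a[numpy.arange(len(dense)), dense] = 1   # indexing form, as above

one_hot_b = numpy.eye(NUM_LABELS)[dense]         # eye form

assert (one_hot_a == one_hot_b).all()
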
Example #39
def read_data_sets(data_dir, train_batch_size, dev_batch_size, test_batch_size, numcep, numcontext, thread_count=8,
                   limit_dev=0, limit_test=0, limit_train=0, sets=[]):
    # Check if we can convert FLAC with SoX before we start
    # Decode so the FLAC check works under Python 3 as well
    # (check_output returns bytes there).
    sox_help_out = subprocess.check_output(["sox", "-h"]).decode()
    if sox_help_out.find("flac") == -1:
        print("Error: SoX doesn't support FLAC. Please install SoX with FLAC support and try again.")
        exit(1)
    # Conditionally download data to data_dir
    print("Downloading Librivox data sets if not already present...")
    with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
        TRAIN_CLEAN_100_URL = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
        TRAIN_CLEAN_360_URL = "http://www.openslr.org/resources/12/train-clean-360.tar.gz"
        TRAIN_OTHER_500_URL = "http://www.openslr.org/resources/12/train-other-500.tar.gz"

        DEV_CLEAN_URL = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
        DEV_OTHER_URL = "http://www.openslr.org/resources/12/dev-other.tar.gz"

        TEST_CLEAN_URL = "http://www.openslr.org/resources/12/test-clean.tar.gz"
        TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz"

        def filename_of(x): return path.split(x)[1]

        train_clean_100 = base.maybe_download(filename_of(TRAIN_CLEAN_100_URL), data_dir, TRAIN_CLEAN_100_URL)
        bar.update(0)
        train_clean_360 = base.maybe_download(filename_of(TRAIN_CLEAN_360_URL), data_dir, TRAIN_CLEAN_360_URL)
        bar.update(1)
        train_other_500 = base.maybe_download(filename_of(TRAIN_OTHER_500_URL), data_dir, TRAIN_OTHER_500_URL)
        bar.update(2)

        dev_clean = base.maybe_download(filename_of(DEV_CLEAN_URL), data_dir, DEV_CLEAN_URL)
        bar.update(3)
        dev_other = base.maybe_download(filename_of(DEV_OTHER_URL), data_dir, DEV_OTHER_URL)
        bar.update(4)

        test_clean = base.maybe_download(filename_of(TEST_CLEAN_URL), data_dir, TEST_CLEAN_URL)
        bar.update(5)
        test_other = base.maybe_download(filename_of(TEST_OTHER_URL), data_dir, TEST_OTHER_URL)
        bar.update(6)

    # Conditionally extract LibriSpeech data
    # We extract each archive into data_dir, but test for existence in
    # data_dir/LibriSpeech because the archives share that root.
    print("Extracting librivox data if not already extracted...")
    with progressbar.ProgressBar(max_value=7, widget=progressbar.AdaptiveETA) as bar:
        LIBRIVOX_DIR = "LibriSpeech"
        work_dir = os.path.join(data_dir, LIBRIVOX_DIR)

        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"), train_clean_100)
        bar.update(0)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"), train_clean_360)
        bar.update(1)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"), train_other_500)
        bar.update(2)

        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-clean"), dev_clean)
        bar.update(3)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-other"), dev_other)
        bar.update(4)

        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-clean"), test_clean)
        bar.update(5)
        _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-other"), test_other)
        bar.update(6)

    # Conditionally convert FLAC data to wav, from:
    #  data_dir/LibriSpeech/split/1/2/1-2-3.flac
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-3.wav
    print("Converting Librivox data from flac to wav if not already converted...")
    with progressbar.ProgressBar(max_value=7,  widget=progressbar.AdaptiveETA) as bar:
        _maybe_convert_wav(work_dir, "train-clean-100", "train-clean-100-wav")
        bar.update(0)
        _maybe_convert_wav(work_dir, "train-clean-360", "train-clean-360-wav")
        bar.update(1)
        _maybe_convert_wav(work_dir, "train-other-500", "train-other-500-wav")
        bar.update(2)

        _maybe_convert_wav(work_dir, "dev-clean", "dev-clean-wav")
        bar.update(3)
        _maybe_convert_wav(work_dir, "dev-other", "dev-other-wav")
        bar.update(4)

        _maybe_convert_wav(work_dir, "test-clean", "test-clean-wav")
        bar.update(5)
        _maybe_convert_wav(work_dir, "test-other", "test-other-wav")
        bar.update(6)

    # Conditionally split LibriSpeech transcriptions, from:
    #  data_dir/LibriSpeech/split/1/2/1-2.trans.txt
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-0.txt
    #  data_dir/LibriSpeech/split-wav/1-2-1.txt
    #  data_dir/LibriSpeech/split-wav/1-2-2.txt
    #  ...
    print("Splitting transcriptions if not already split ...")
    with progressbar.ProgressBar(max_value=7,  widget=progressbar.AdaptiveETA) as bar:
        _maybe_split_transcriptions(work_dir, "train-clean-100", "train-clean-100-wav")
        bar.update(0)
        _maybe_split_transcriptions(work_dir, "train-clean-360", "train-clean-360-wav")
        bar.update(1)
        _maybe_split_transcriptions(work_dir, "train-other-500", "train-other-500-wav")
        bar.update(2)

        _maybe_split_transcriptions(work_dir, "dev-clean", "dev-clean-wav")
        bar.update(3)
        _maybe_split_transcriptions(work_dir, "dev-other", "dev-other-wav")
        bar.update(4)

        _maybe_split_transcriptions(work_dir, "test-clean", "test-clean-wav")
        bar.update(5)
        _maybe_split_transcriptions(work_dir, "test-other", "test-other-wav")
        bar.update(6)
    print("Finished pre-processing librivox.  Initializing dataset...")
    # Create train DataSet from all the train archives
    train = None
    if "train" in sets:
        train = _read_data_set(work_dir, "train-*-wav", thread_count, train_batch_size, numcep, numcontext,
                               limit=limit_train)

    # Create dev DataSet from all the dev archives
    dev = None
    if "dev" in sets:
        dev = _read_data_set(work_dir, "dev-*-wav", thread_count, dev_batch_size, numcep, numcontext, limit=limit_dev)

    # Create test DataSet from all the test archives
    test = None
    if "test" in sets:
        test = _read_data_set(work_dir, "test-*-wav", thread_count, test_batch_size, numcep, numcontext,
                              limit=limit_test)

    # Return DataSets
    return DataSets(train, dev, test)
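
_maybe_convert_wav is not shown. Given that the function first checks SoX for FLAC support, a plausible (assumed) implementation walks the source tree and shells out to sox for each .flac file. A sketch only, with an illustrative name and assumed skip-if-converted behaviour:

import os
import subprocess


def _maybe_convert_wav_sketch(work_dir, flac_subdir, wav_subdir):
    # Assumed behaviour: convert every .flac under work_dir/flac_subdir to a
    # .wav in work_dir/wav_subdir, skipping files that already exist.
    source_dir = os.path.join(work_dir, flac_subdir)
    target_dir = os.path.join(work_dir, wav_subdir)
    os.makedirs(target_dir, exist_ok=True)
    for root, _, filenames in os.walk(source_dir):
        for filename in filenames:
            if not filename.endswith(".flac"):
                continue
            wav_name = os.path.splitext(filename)[0] + ".wav"
            wav_path = os.path.join(target_dir, wav_name)
            if not os.path.exists(wav_path):
                subprocess.check_call(
                    ["sox", os.path.join(root, filename), wav_path])
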
Example #40
def read_data_sets(
    train_dir,
    shard_index,
    fake_data=False,
    one_hot=False,
    dtype=dtypes.float32,
    reshape=True,
    validation_size=5000,
):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        sharded_train = fake()
        # return base.Datasets(train=train, validation=validation, test=test, sharded_train=sharded_train)
        return base.Datasets(train=train, validation=validation, test=test)
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    # print(shard_index)
    # print(type(shard_index))
    sharded_train_images = train_images[shard_index]
    sharded_train_labels = train_labels[shard_index]
    print(sharded_train_labels)
    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
    sharded_train = DataSet(sharded_train_images,
                            sharded_train_labels,
                            dtype=dtype,
                            reshape=reshape)
    # Use the shard as the effective training set (the full-split DataSet
    # built above is discarded).
    train = sharded_train
    return base.Datasets(train=train, validation=validation, test=test)
    # return base.Datasets(train=train, validation=validation, test=test, sharded_train=sharded_train)


# def load_mnist(train_dir='MNIST-data'):
#    return read_data_sets(train_dir)
Example #41
    def _load_data(self):
        work_directory = '.faces_data'
        images_path = maybe_download('img_align_celeba.zip', work_directory,
                                     FACES_IMAGES_URL)
        labels_path = maybe_download('list_attr_celeba.txt', work_directory,
                                     FACES_LABELS_URL)

        # Load labels.
        image_count = 0
        attributes = []
        attributes_classes = ['Male', 'Young', 'Smiling', 'Attractive']
        label_map = {}
        with open(labels_path, 'r') as labels_file:
            for line_no, line in enumerate(labels_file):
                if line_no == 0:
                    # Parse example count.
                    image_count = int(line)
                    continue
                elif line_no == 1:
                    # Parse header.
                    attributes = line.split()
                    continue

                # Parse line and determine class label.
                line = line.split()
                if self.options.dataset_random_labels:
                    label = (line_no - 2) % self.class_count
                else:
                    label = 0
                    for index, attribute in enumerate(attributes_classes):
                        value = int(line[attributes.index(attribute) + 1])
                        if value == 1:
                            label += 2**index

                    if label > 9:
                        continue

                label_map[line[0]] = label

        # Load images.
        images = np.zeros(
            [image_count, self.width * self.height * self.channels],
            dtype=np.float32)
        labels = np.zeros([image_count], dtype=np.int8)
        with zipfile.ZipFile(images_path, 'r') as images_zip:
            image_infos = images_zip.infolist()
            index = 0
            progress = tqdm.tqdm(total=image_count, leave=False)
            for image_info in image_infos:
                if not image_info.filename.endswith('.jpg'):
                    continue

                label = label_map.get(os.path.basename(image_info.filename),
                                      None)
                if label is None:
                    continue

                with images_zip.open(image_info) as image_file:
                    image = imread(image_file).astype(np.float32)

                    # Resize image to target dimensions.
                    h, w = image.shape[:2]
                    image = imresize(
                        image, [int((float(h) / w) * self.width), self.width])
                    j = int(round((image.shape[0] - self.height) / 2.))
                    image = image[j:j + self.height, :, :]
                    image = image / 255.

                    images[index, :] = image.flatten()
                    labels[index] = label
                    index += 1
                    progress.update()

            # index already equals the number of stored images after the loop.
            image_count = index
            images = images[:image_count]
            labels = labels[:image_count]
            progress.close()

        print('Image count:', index)
        print('Values: min={} max={} mean={}'.format(np.min(images),
                                                     np.max(images),
                                                     np.mean(images)))

        print('Class distribution:')
        for label, count in zip(*np.unique(labels, return_counts=True)):
            print('  {}: {}'.format(label, count))

        train = DataWrapper(images, labels)
        test = DataWrapper(images[:1000], labels[:1000])
        validation = DataWrapper(np.asarray([]), np.asarray([]))

        return Datasets(train=train, test=test, validation=validation)
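
The attribute-to-label encoding above packs up to four binary CelebA attributes into one integer (bit i is set when the i-th attribute is +1) and then drops labels above 9 to keep exactly ten classes. A small worked example of that mapping, using the same attribute order:

attributes_classes = ['Male', 'Young', 'Smiling', 'Attractive']

def encode_label(attribute_values):
    # attribute_values: dict mapping attribute name -> +1 / -1, as in the
    # CelebA list_attr_celeba.txt annotations.
    label = 0
    for index, attribute in enumerate(attributes_classes):
        if attribute_values[attribute] == 1:
            label += 2 ** index
    return label

# A young, smiling, non-male, non-attractive face sets bits 1 and 2 -> label 6.
print(encode_label({'Male': -1, 'Young': 1, 'Smiling': 1, 'Attractive': -1}))
# Labels 10-15 are skipped by the loader above so that ten classes remain.
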
Example #42
def read_data_sets(data_dir, batch_size, numcep, numcontext, thread_count=8, limit_dev=0, limit_test=0, limit_train=0):
    # Check if we can convert FLAC with SoX before we start
    # Decode so the FLAC check works under Python 3 as well
    # (check_output returns bytes there).
    sox_help_out = subprocess.check_output(["sox", "-h"]).decode()
    if sox_help_out.find("flac") == -1:
        print("Error: SoX doesn't support FLAC. Please install SoX with FLAC support and try again.")
        exit(1)
    
    # Conditionally download data to data_dir
    TRAIN_CLEAN_100_URL = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
    TRAIN_CLEAN_360_URL = "http://www.openslr.org/resources/12/train-clean-360.tar.gz"
    TRAIN_OTHER_500_URL = "http://www.openslr.org/resources/12/train-other-500.tar.gz"
    
    DEV_CLEAN_URL = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
    DEV_OTHER_URL = "http://www.openslr.org/resources/12/dev-other.tar.gz"
    
    TEST_CLEAN_URL = "http://www.openslr.org/resources/12/test-clean.tar.gz"
    TEST_OTHER_URL = "http://www.openslr.org/resources/12/test-other.tar.gz"
    
    train_clean_100 = base.maybe_download("train-clean-100.tar.gz", data_dir, TRAIN_CLEAN_100_URL)
    train_clean_360 = base.maybe_download("train-clean-360.tar.gz", data_dir, TRAIN_CLEAN_360_URL)
    train_other_500 = base.maybe_download("train-other-500.tar.gz", data_dir, TRAIN_OTHER_500_URL)
    
    dev_clean = base.maybe_download("dev-clean.tar.gz", data_dir, DEV_CLEAN_URL)
    dev_other = base.maybe_download("dev-other.tar.gz", data_dir, DEV_OTHER_URL)
    
    test_clean = base.maybe_download("test-clean.tar.gz", data_dir, TEST_CLEAN_URL)
    test_other = base.maybe_download("test-other.tar.gz", data_dir, TEST_OTHER_URL)
    
    # Conditionally extract LibriSpeech data
    # We extract each archive into data_dir, but test for existence in
    # data_dir/LibriSpeech because the archives share that root.
    LIBRIVOX_DIR = "LibriSpeech"
    work_dir = os.path.join(data_dir, LIBRIVOX_DIR)
    
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-100"), train_clean_100)
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-clean-360"), train_clean_360)
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "train-other-500"), train_other_500)
    
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-clean"), dev_clean)
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "dev-other"), dev_other)
    
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-clean"), test_clean)
    _maybe_extract(data_dir, os.path.join(LIBRIVOX_DIR, "test-other"), test_other)
    
    # Conditionally convert FLAC data to wav, from:
    #  data_dir/LibriSpeech/split/1/2/1-2-3.flac
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-3.wav
    _maybe_convert_wav(work_dir, "train-clean-100", "train-clean-100-wav")
    _maybe_convert_wav(work_dir, "train-clean-360", "train-clean-360-wav")
    _maybe_convert_wav(work_dir, "train-other-500", "train-other-500-wav")
    
    _maybe_convert_wav(work_dir, "dev-clean", "dev-clean-wav")
    _maybe_convert_wav(work_dir, "dev-other", "dev-other-wav")
    
    _maybe_convert_wav(work_dir, "test-clean", "test-clean-wav")
    _maybe_convert_wav(work_dir, "test-other", "test-other-wav")
    
    # Conditionally split LibriSpeech transcriptions, from:
    #  data_dir/LibriSpeech/split/1/2/1-2.trans.txt
    # to:
    #  data_dir/LibriSpeech/split-wav/1-2-0.txt
    #  data_dir/LibriSpeech/split-wav/1-2-1.txt
    #  data_dir/LibriSpeech/split-wav/1-2-2.txt
    #  ...
    _maybe_split_transcriptions(work_dir, "train-clean-100", "train-clean-100-wav")
    _maybe_split_transcriptions(work_dir, "train-clean-360", "train-clean-360-wav")
    _maybe_split_transcriptions(work_dir, "train-other-500", "train-other-500-wav")
    
    _maybe_split_transcriptions(work_dir, "dev-clean", "dev-clean-wav")
    _maybe_split_transcriptions(work_dir, "dev-other", "dev-other-wav")
    
    _maybe_split_transcriptions(work_dir, "test-clean", "test-clean-wav")
    _maybe_split_transcriptions(work_dir, "test-other", "test-other-wav")
    
    # Create train DataSet from all the train archives
    train = _read_data_set(work_dir, "train-*-wav", thread_count, batch_size, numcep, numcontext, limit=limit_train)
    
    # Create dev DataSet from all the dev archives
    dev = _read_data_set(work_dir, "dev-*-wav", thread_count, batch_size, numcep, numcontext, limit=limit_dev)
    
    # Create test DataSet from all the test archives
    test = _read_data_set(work_dir, "test-*-wav", thread_count, batch_size, numcep, numcontext, limit=limit_test)
    
    # Return DataSets
    return DataSets(train, dev, test)
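
_maybe_split_transcriptions is also not shown. LibriSpeech ships one <speaker>-<chapter>.trans.txt file per chapter, in which each line is "<utterance-id> <transcript>". An assumed sketch that splits such files into one .txt per utterance, matching the naming used in the comments above (illustrative name and behaviour):

import glob
import os


def _maybe_split_transcriptions_sketch(work_dir, source_subdir, target_subdir):
    # Assumed behaviour: for every *.trans.txt under source_subdir, write one
    # <utterance-id>.txt containing just the transcript into target_subdir.
    source_dir = os.path.join(work_dir, source_subdir)
    target_dir = os.path.join(work_dir, target_subdir)
    os.makedirs(target_dir, exist_ok=True)
    pattern = os.path.join(source_dir, "*", "*", "*.trans.txt")
    for trans_file in glob.glob(pattern):
        with open(trans_file) as f:
            for line in f:
                utt_id, _, transcript = line.strip().partition(" ")
                out_path = os.path.join(target_dir, utt_id + ".txt")
                if not os.path.exists(out_path):
                    with open(out_path, "w") as out:
                        out.write(transcript)
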
Example #43
def load_mnist(train_dir, validation_size=5000):

    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]
    #   print(np.shape(train_labels))
    #   plt.imshow(np.reshape(train_images[100], (28, 28)), cmap='gray', interpolation='none')

    #   train_images = train_images[np.where((train_labels == 3) | (train_labels == 5))[0]]
    #   train_labels = train_labels[np.where((train_labels == 3) | (train_labels == 5))[0]]
    #   test_images = test_images[np.where((test_labels == 3) | (test_labels == 5))[0]]
    #   test_labels = test_labels[np.where((test_labels == 3) | (test_labels == 5))[0]]
    #   validation_images = validation_images[np.where((validation_labels == 3) | (validation_labels == 5))[0]]
    #   validation_labels = validation_labels[np.where((validation_labels == 3) | (validation_labels == 5))[0]]

    train_images = train_images.astype(np.float32) / 255
    validation_images = validation_images.astype(np.float32) / 255
    test_images = test_images.astype(np.float32) / 255

    #   train_labels = label_binarize(train_labels, classes=[3,5])[:,0]
    #   test_labels = label_binarize(test_labels, classes=[3,5])[:,0]
    #   validation_labels = label_binarize(validation_labels, classes=[3,5])[:,0]

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
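
The commented-out lines above sketch how this loader was narrowed to a binary 3-vs-5 task. A cleaner standalone version of that filtering, as an illustration of the same idea rather than part of the active code:

import numpy as np


def filter_binary(images, labels, pos=5, neg=3):
    # Keep only the two chosen digits and relabel them 1 (pos) / 0 (neg).
    keep = np.where((labels == pos) | (labels == neg))[0]
    images, labels = images[keep], labels[keep]
    return images, (labels == pos).astype(np.int64)

# e.g. train_images, train_labels = filter_binary(train_images, train_labels)
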
Example #44
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=1):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    #TRAIN_IMAGES = 'Texture_Sample.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 'test-images-idx3-ubyte.gz'
    #TEST_IMAGES = 'Texture_Sample.gz'
    TEST_LABELS = 'test-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[0:train_images.shape[0]]

    print('=======================================================')
    print('shape of images :' + str(train_images.shape))
    print('shape of labels :' + str(train_labels.shape))
    print('=======================================================')

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    test_labels = test_labels[0:test_images.shape[0]]
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
Example #45
def read_data_sets(work_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=None,
                   seed=None):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           image_dims=32 * 32 * 3,
                           num_class=10,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    root_data_dir = os.path.join(work_dir, "cifar-10-batches-py")
    if not os.path.exists(root_data_dir):
        # no data directory found
        # download gz file
        print(
            "Trying to download cifar data (if the tar.gz file is not available)"
        )
        gz_fpath = base.maybe_download("cifar-10-python.tar.gz", work_dir,
                                       _SOURCE_URL)
        print("Extracting data in {}".format(root_data_dir))
        with tarfile.open(gz_fpath) as tar:
            tar.extractall(work_dir)
    else:
        print("cifar data directory found {}".format(root_data_dir))
    print("loading data...")
    X_train, Y_train, X_test, Y_test = load_CIFAR10(root_data_dir)
    if one_hot:
        num_class_train = len(np.unique(Y_train))
        num_class_test = len(np.unique(Y_test))
        assert num_class_test == num_class_train, \
            "number of classes mismatch: {} and {}".format(num_class_train, num_class_test)
        Y_train = dense_to_one_hot(Y_train, num_class_train)
        Y_test = dense_to_one_hot(Y_test, num_class_test)
    if validation_size is None:
        validation_size = int(X_train.shape[0] / 10)
    valid_idx = np.random.choice(range(X_train.shape[0]), validation_size)
    mask = np.array([
        True if row_idx in valid_idx else False
        for row_idx in range(X_train.shape[0])
    ])
    X_train, X_valid = X_train[~mask], X_train[mask]
    Y_train, Y_valid = Y_train[~mask], Y_train[mask]

    train_dataset = DataSet(X_train,
                            Y_train,
                            one_hot=one_hot,
                            dtype=dtype,
                            reshape=reshape,
                            seed=seed)
    valid_dataset = DataSet(X_valid,
                            Y_valid,
                            one_hot=one_hot,
                            dtype=dtype,
                            reshape=reshape,
                            seed=seed)
    test_dataset = DataSet(X_test,
                           Y_test,
                           one_hot=one_hot,
                           dtype=dtype,
                           reshape=reshape,
                           seed=seed)
    return base.Datasets(train=train_dataset,
                         validation=valid_dataset,
                         test=test_dataset)
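
Note that np.random.choice above samples with replacement by default, so the validation set can end up slightly smaller than validation_size when duplicate indices are drawn. A replacement-free variant of the same split (a sketch; the rest of the function would stay unchanged):

import numpy as np


def split_validation(X_train, Y_train, validation_size, seed=None):
    rng = np.random.RandomState(seed)
    # Sample distinct indices so the validation set has exactly
    # validation_size examples.
    valid_idx = rng.choice(X_train.shape[0], size=validation_size,
                           replace=False)
    mask = np.zeros(X_train.shape[0], dtype=bool)
    mask[valid_idx] = True
    return X_train[~mask], X_train[mask], Y_train[~mask], Y_train[mask]
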
Example #46
def read_data_sets(data_path,
                   fake_data=False,
                   one_hot=True,
                   subsets=False,
                   init_probs=[],
                   percentage_train=1.,
                   corrupt_labels=False,
                   unbalance=False,
                   unbalance_dict=None,
                   validation_size=5000,
                   source_url=DEFAULT_SOURCE_URL):
    """
    Returns a data provider for a dataset

    :param data_path: local directory to store data
    :param fake_data (optional): flag to indicate whether data should be reshaped
    :param one_hot (optional): flag to indicate whether data is one-hot encoded
    :param init_probs (optional): initial per-class probabilities
    :param percentage_train (optional): percentage of training data
    :param validation_size (optional): validation size
    :param source_url (optional): url where data can be found
    """

    if unbalance_dict is None:
        unbalance_dict = {"percentage": 20, "label1": 0, "label2": 8}
    train_dir = data_path

    class DataSets(object):
        pass

    data_sets = DataSets()

    if fake_data:
        data_sets.train = DataSet([], [], fake_data=True, one_hot=True)
        data_sets.val = DataSet([], [], fake_data=True, one_hot=True)
        data_sets.test = DataSet([], [], fake_data=True, one_hot=True)
        return data_sets

    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     source_url + TEST_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     source_url + TEST_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    val_images = train_images[:validation_size]
    val_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    n_test = test_images.shape[0]
    n_val = val_images.shape[0]
    n_train = train_images.shape[0]

    if not init_probs:
        print('RANDOM INIT PROBABILITIES')
        probs = np.random.rand(n_train)
    else:
        init_probs = np.asarray(init_probs)
        probs_class = np.asarray(1.0 * init_probs / np.sum(init_probs),
                                 np.float32)
        dense_train_labels = np.argmax(train_labels, axis=1)
        probs = np.zeros_like(dense_train_labels, np.float32)
        for k in range(0, np.unique(dense_train_labels).max() + 1):
            i = np.where(dense_train_labels == k)[0]
            probs[i] = probs_class[k]

    train_probs = np.squeeze(
        normalize(np.expand_dims(probs, 1), axis=0, norm='l1'))
    val_probs = np.squeeze(
        normalize(np.expand_dims(np.ones(n_val, np.float32), 1),
                  axis=0,
                  norm='l1'))
    test_probs = np.squeeze(
        normalize(np.expand_dims(np.ones(n_test, np.float32), 1),
                  axis=0,
                  norm='l1'))

    # For experiments with limited amount of data
    if percentage_train != 1.:
        train_size = int(percentage_train * train_images.shape[0])
        Xtrain_images, Xval_images, ytrain, yval, ptrain, probs_val = train_test_split(
            train_images,
            train_labels,
            train_probs,
            train_size=train_size,
            random_state=0)
        train_images = Xtrain_images
        train_labels = ytrain
        train_probs = ptrain

    # For experiments with class-imbalance distribution
    if unbalance:
        print('CLASS-IMBALANCE')
        n_classes = len(np.unique(np.argmax(train_labels, 1)))
        # unbalance_dict is a plain dict (see the default above), so key into it directly.
        reduceto = 0.01 * unbalance_dict['percentage']
        label1 = unbalance_dict['label1']
        label2 = unbalance_dict['label2']

        pick_ids = []
        newsize = 0
        all_classes = np.arange(0, n_classes)
        all_classes = np.delete(all_classes,
                                np.where(all_classes == label1)[0])
        all_classes = np.delete(all_classes,
                                np.where(all_classes == label2)[0])

        for lab in [label1, label2]:
            allids = np.where(np.argmax(train_labels, 1) == lab)[0]
            selectedids = np.random.choice(allids,
                                           int(reduceto * allids.shape[0]),
                                           replace=False)
            pick_ids.append(selectedids)
            newsize += len(selectedids)

        new_ids = convert_list_to_array(pick_ids, newsize)

        other_ids = []
        othersize = 0
        for lab in all_classes.tolist():
            selectedids = np.where(np.argmax(train_labels, 1) == lab)[0]
            other_ids.append(selectedids)
            othersize += len(selectedids)

        keep_ids = convert_list_to_array(other_ids, othersize)

        # new_ids: contains the indices of the reduced (imbalance) classes
        # keep_ids: contains the indices of the rest (keep the same class distribution)
        resulting_ids = np.concatenate((new_ids, keep_ids))
        np.random.shuffle(resulting_ids)

        train_images = train_images[resulting_ids, ...]
        train_labels = train_labels[resulting_ids, ...]
        train_probs = train_probs[resulting_ids]

    train_indices = np.zeros(train_labels.shape[0])
    val_indices = np.zeros(val_labels.shape[0])
    test_indices = np.zeros(test_labels.shape[0])

    if corrupt_labels:
        print('NOISE / CORRUPT LABELS')
        percentage_corrupted_labels = 30
        number_corrupted_labels = int(1.0 * percentage_corrupted_labels / 100 *
                                      train_labels.shape[0])
        dense_train_labels = np.argmax(train_labels, 1)
        old_train_labels = np.copy(dense_train_labels)
        idx_train_labels = np.arange(train_labels.shape[0])
        idx_to_be_corrupted = np.random.choice(idx_train_labels,
                                               number_corrupted_labels,
                                               replace=False)
        train_indices[idx_to_be_corrupted] = 1
        dense_train_labels[idx_to_be_corrupted] += 1
        dense_train_labels[np.where(dense_train_labels == 10)[0]] = 0
        train_labels = dense_to_one_hot(dense_train_labels, n_class=10)

    data_sets.train = DataSet(train_images,
                              train_labels,
                              train_probs,
                              train_indices,
                              fake_data=True,
                              one_hot=True,
                              subsets=subsets)
    data_sets.val = DataSet(val_images,
                            val_labels,
                            val_probs,
                            val_indices,
                            fake_data=True,
                            one_hot=True,
                            subsets=False)
    data_sets.test = DataSet(test_images,
                             test_labels,
                             test_probs,
                             test_indices,
                             fake_data=True,
                             one_hot=True,
                             subsets=False)

    return data_sets
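
The l1 normalization used for train_probs, val_probs and test_probs is simply a division by the sum of the vector. A tiny sketch showing the equivalence for a vector of per-example probabilities, with sklearn's normalize assumed importable as in the example:

import numpy as np
from sklearn.preprocessing import normalize

probs = np.random.rand(8).astype(np.float32)

via_sklearn = np.squeeze(normalize(np.expand_dims(probs, 1), axis=0, norm='l1'))
via_numpy = probs / probs.sum()

assert np.allclose(via_sklearn, via_numpy)
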