def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  # .npy files must be opened in binary mode for numpy.load.
  with open(os.path.join(train_dir, 'small_chairs.npy'), 'rb') as f:
    train_images = numpy.load(f)
  # The chair images are unlabeled; use all-zero placeholder labels.
  train_labels = numpy.zeros(len(train_images))

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = None
  test = None
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  with open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  with open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  # Read the raw IDX test file directly. TEST_FILE, NUM_TEST and NUM_CLASSES
  # are assumed to be module-level constants.
  local_file = os.path.join(train_dir, TEST_FILE)
  print('Extracting', TEST_FILE)
  with open(local_file, 'rb') as f:
    magic, num, rows, cols = struct.unpack(">IIII", f.read(16))
    test_images = numpy.fromfile(f, dtype=numpy.uint8)
  test_images = test_images.reshape(num, rows, cols, 1)

  # The test set carries no real labels; use all-zero placeholders.
  if one_hot:
    test_labels = numpy.zeros((NUM_TEST, NUM_CLASSES), dtype=numpy.uint8)
  else:
    test_labels = numpy.zeros((NUM_TEST,), dtype=numpy.uint8)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
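# Hedged sketch of the IDX header that struct.unpack(">IIII", ...) parses
# above: four big-endian uint32 fields (magic, count, rows, cols) followed by
# raw uint8 pixels. The magic number 0x00000803 marks a 3-D uint8 image file;
# this snippet exists only to document that layout.
import struct
header = struct.pack(">IIII", 0x00000803, 2, 28, 28)
magic, num, rows, cols = struct.unpack(">IIII", header)
assert (magic, num, rows, cols) == (0x00000803, 2, 28, 28)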
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  # This is the directory containing the flag images.
  TRAIN_IMAGES = '/home/michael/data/crop/'
  IMAGE_SHAPE = 28  # 28x28 pixel images
  RESIZED_IMAGES = resize_images(TRAIN_IMAGES, IMAGE_SHAPE)
  train_images = extract_images(RESIZED_IMAGES)
  train_labels = extract_labels(RESIZED_IMAGES)

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  # No held-out data: the training set doubles as validation and test.
  test = train
  validation = train
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets2(train_dir, fake_data=False, one_hot=False,
                    dtype=dtypes.float32, reshape=True, validation_size=5000,
                    seed=None):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                     seed=seed)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train.data.60k.csv.ubyte2'
  TRAIN_LABELS = 'train.label.60k.csv.ubyte'
  TEST_IMAGES = 'test.data.10k.csv.ubyte2'
  TEST_LABELS = 'test.label.10k.csv.ubyte'

  with open(TRAIN_IMAGES, 'rb') as f:
    train_images = extract_images(f)
  with open(TRAIN_LABELS, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)
  with open(TEST_IMAGES, 'rb') as f:
    test_images = extract_images(f)
  with open(TEST_LABELS, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)
  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'training-images-and-ubyte_4.gz'
  TRAIN_LABELS = 'training-labels-and-ubyte_4.gz'
  VALIDATION_IMAGES = 'validation-images-and-ubyte_4.gz'
  VALIDATION_LABELS = 'validation-labels-and_ubyte_4.gz'
  TEST_IMAGES = 'testing-images-and-ubyte_4.gz'
  TEST_LABELS = 'testing-labels-and-ubyte_4.gz'

  with open(train_dir + TRAIN_IMAGES, 'rb') as f:
    train_images = extract_images(f)
  with open(train_dir + TRAIN_LABELS, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  # The validation set ships as its own files, so no slice of the training
  # data is carved off below.
  with open(train_dir + VALIDATION_IMAGES, 'rb') as f:
    validation_images = extract_images(f)
  with open(train_dir + VALIDATION_LABELS, 'rb') as f:
    validation_labels = extract_labels(f, one_hot=one_hot)

  with open(train_dir + TEST_IMAGES, 'rb') as f:
    test_images = extract_images(f)
  with open(train_dir + TEST_LABELS, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  with open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  with open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  with open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  with open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
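# A minimal usage sketch for the canonical loader above, assuming the usual
# tensorflow.contrib.learn MNIST helpers (a DataSet with num_examples and
# next_batch()); 'MNIST_data' is a hypothetical download directory.
mnist = read_data_sets('MNIST_data', one_hot=True)
print(mnist.train.num_examples, mnist.validation.num_examples,
      mnist.test.num_examples)            # e.g. 55000 5000 10000
batch_xs, batch_ys = mnist.train.next_batch(100)
print(batch_xs.shape, batch_ys.shape)     # (100, 784) (100, 10)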
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'training-images-and-ubyte_19.gz'
  TRAIN_LABELS = 'training-labels-and-ubyte_19.gz'
  # TRAIN_IMAGES = 'testing-images-and-ubyte_19.gz'
  # TRAIN_LABELS = 'testing-labels-and-ubyte_19.gz'
  TEST_IMAGES = 'testing-images-and-ubyte_19.gz'
  TEST_LABELS = 'testing-labels-and-ubyte_19.gz'

  with open(train_dir + TRAIN_IMAGES, 'rb') as f:
    train_images = extract_images(f)
  with open(train_dir + TRAIN_LABELS, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)
  with open(train_dir + TEST_IMAGES, 'rb') as f:
    test_images = extract_images(f)
  with open(train_dir + TEST_LABELS, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000,
                   seed=None):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                     seed=seed)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  # Read the files straight from HDFS over WebHDFS (50070 is the default
  # namenode HTTP port).
  HOST = "192.168.205.185"
  NAME_NODE_PORT = 50070
  client = hdfs.Client('http://{}:{}'.format(HOST, NAME_NODE_PORT))

  with client.read(train_dir + "/" + TRAIN_IMAGES) as f:
    train_images = extract_images(f)
  with client.read(train_dir + "/" + TRAIN_LABELS) as f:
    train_labels = extract_labels(f, one_hot=one_hot)
  with client.read(train_dir + "/" + TEST_IMAGES) as f:
    test_images = extract_images(f)
  with client.read(train_dir + "/" + TEST_LABELS) as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)
  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  print('Starting download...')
  bucket = 'my-test-bucket'
  TRAIN_IMAGES = obj_tf.s3.download(bucket, 'train-images-idx3-ubyte.gz')
  TRAIN_LABELS = obj_tf.s3.download(bucket, 'train-labels-idx1-ubyte.gz')
  TEST_IMAGES = obj_tf.s3.download(bucket, 't10k-images-idx3-ubyte.gz')
  TEST_LABELS = obj_tf.s3.download(bucket, 't10k-labels-idx1-ubyte.gz')
  print('Done downloading...')

  with open(TRAIN_IMAGES, 'rb') as f:
    train_images = extract_images(f)
  with open(TRAIN_LABELS, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)
  with open(TEST_IMAGES, 'rb') as f:
    test_images = extract_images(f)
  with open(TEST_LABELS, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
def split_dataset(self, dtype=dtypes.float32, reshape=True, seed=None,
                  validation_size=7000):
  # SPLIT FIRST GROUP (digits 0-4)
  # np.nonzero(one_hot_labels)[1] recovers the class index of each row.
  train_labels_idx = np.nonzero(self.dataset.train.labels)[1]
  train_labels_idx = np.nonzero(train_labels_idx < 5)[0]
  train_labels = self.dataset.train.labels[train_labels_idx]
  train_images = self.dataset.train.images[train_labels_idx]

  # Find all testing images/labels for digits 0-4
  test_labels_idx = np.nonzero(self.dataset.test.labels)[1]
  test_labels_idx = np.nonzero(test_labels_idx < 5)[0]
  test_labels = self.dataset.test.labels[test_labels_idx]
  test_images = self.dataset.test.images[test_labels_idx]

  # Create validation/training groups
  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  # Note: reshape is pinned to False here, overriding the reshape argument.
  options = dict(dtype=dtype, reshape=False, seed=seed)

  # Define training, validation, and testing datasets
  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)
  first_dataset = base.Datasets(train=train, validation=validation, test=test)

  # SPLIT SECOND GROUP (digits 5-9)
  train_labels_idx = np.nonzero(self.dataset.train.labels)[1]
  train_labels_idx = np.nonzero(train_labels_idx >= 5)[0]
  train_labels_2 = self.dataset.train.labels[train_labels_idx]
  train_images_2 = self.dataset.train.images[train_labels_idx]

  test_labels_idx = np.nonzero(self.dataset.test.labels)[1]
  test_labels_idx = np.nonzero(test_labels_idx >= 5)[0]
  test_labels_2 = self.dataset.test.labels[test_labels_idx]
  test_images_2 = self.dataset.test.images[test_labels_idx]

  validation_images_2 = train_images_2[:validation_size]
  validation_labels_2 = train_labels_2[:validation_size]
  train_images_2 = train_images_2[validation_size:]
  train_labels_2 = train_labels_2[validation_size:]

  train_2 = DataSet(train_images_2, train_labels_2, **options)
  validation_2 = DataSet(validation_images_2, validation_labels_2, **options)
  test_2 = DataSet(test_images_2, test_labels_2, **options)
  second_dataset = base.Datasets(train=train_2, validation=validation_2,
                                 test=test_2)

  return first_dataset, second_dataset
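# A hedged sketch of how split_dataset might be called for a two-task
# (continual-learning style) setup; 'loader' is a hypothetical object whose
# self.dataset holds one-hot MNIST, as the np.nonzero trick above assumes.
first, second = loader.split_dataset(validation_size=7000)
for name, ds in [('digits 0-4', first), ('digits 5-9', second)]:
  print(name, ds.train.num_examples, ds.validation.num_examples,
        ds.test.num_examples)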
def read_csv_data_sets(train_dir, num_classes=2, day_len=2, dup=1,
                       fake_data=False, one_hot=True, dtype=dtypes.float64,
                       reshape=False, validation_size=50, seed=None):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                     seed=seed)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_CSV = 'train.csv.gz'
  TEST_CSV = 'test.csv.gz'
  train_file = os.path.join(train_dir, TRAIN_CSV)
  test_file = os.path.join(train_dir, TEST_CSV)
  print('train_file:', train_file)
  print('test_file:', test_file)

  train_images, train_labels = read_csv_images_lables(train_file, day_len, dup)
  test_images, test_labels = read_csv_images_lables(test_file, day_len, dup)

  if one_hot:
    train_labels = dense_to_one_hot(train_labels, num_classes)
    test_labels = dense_to_one_hot(test_labels, num_classes)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)
  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=50,
                   seed=None):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                     seed=seed)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  # Modified section: image files
  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 'test-images-idx3-ubyte.gz'
  TEST_LABELS = 'test-labels-idx1-ubyte.gz'

  local_file = os.path.join(train_dir, TRAIN_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = os.path.join(train_dir, TRAIN_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = os.path.join(train_dir, TEST_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = os.path.join(train_dir, TEST_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)
  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_images_filename, train_labels_filename,
                   test_images_filename, test_labels_filename, train_dir,
                   fake_data=False, one_hot=False, dtype=dtypes.float32,
                   reshape=True, seed=None):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                     seed=seed)
    train = fake()
    test = fake()
    # base.Datasets requires all three fields; reuse test as validation.
    return base.Datasets(train=train, validation=test, test=test)

  local_file = base.maybe_download(train_images_filename, train_dir,
                                   SOURCE_URL + train_images_filename)
  with open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = base.maybe_download(train_labels_filename, train_dir,
                                   SOURCE_URL + train_labels_filename)
  with open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = base.maybe_download(test_images_filename, train_dir,
                                   SOURCE_URL + test_images_filename)
  with open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = base.maybe_download(test_labels_filename, train_dir,
                                   SOURCE_URL + test_labels_filename)
  with open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape,
                  seed=seed)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape,
                 seed=seed)
  # No separate validation split: the test set is reused as validation.
  return base.Datasets(train=train, validation=test, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=False, validation_size=5000,
                   worker_id=-1, n_workers=-1):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  train_images = extract_data(local_file, 60000)
  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  train_labels = extract_labels(local_file, 60000)
  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  test_images = extract_data(local_file, 10000)
  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  test_labels = extract_labels(local_file, 10000)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  # Keep the full training set and reuse the test set as validation;
  # no separate test split is returned.
  validation_images = test_images
  validation_labels = test_labels

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=None)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
  VALIDATION_SIZE = 5000

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  train_images = extract_images(local_file)
  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  train_labels = extract_labels(local_file, one_hot=one_hot)
  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  test_images = extract_images(local_file)
  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  test_labels = extract_labels(local_file, one_hot=one_hot)

  validation_images = train_images[:VALIDATION_SIZE]
  validation_labels = train_labels[:VALIDATION_SIZE]
  train_images = train_images[VALIDATION_SIZE:]
  train_labels = train_labels[VALIDATION_SIZE:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
def read_ext_data_sets(train_dir, fake_data=False, one_hot=False,
                       dtype=dtypes.float32, reshape=True):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  VALIDATION_SIZE = 5000

  # Local copies are read directly instead of fetching with
  # base.maybe_download(..., SOURCE_URL + ...).
  train_images = extract_images('/home/skynet0/data/train_images_ubyte.gz')
  train_labels = extract_labels('/home/skynet0/data/train_labels_ubyte.gz',
                                one_hot=one_hot)
  test_images = extract_images('/home/skynet0/data/test_images_ubyte.gz')
  test_labels = extract_labels('/home/skynet0/data/test_labels_ubyte.gz',
                               one_hot=one_hot)

  validation_images = train_images[:VALIDATION_SIZE]
  validation_labels = train_labels[:VALIDATION_SIZE]
  train_images = train_images[VALIDATION_SIZE:]
  train_labels = train_labels[VALIDATION_SIZE:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=0):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  gz_file_name = 'cifar-10-python.tar.gz'
  local_file = base.maybe_download(gz_file_name, train_dir,
                                   SOURCE_URL + gz_file_name)

  train_images = []
  train_labels = []
  for i in range(1, 6):
    # The batches are Python pickles; open in binary mode and allow
    # unpickling (latin1 handles batches written by Python 2).
    with open(os.path.join(train_dir, 'cifar-10-batches-py',
                           'data_batch_%d' % i), 'rb') as f:
      batch = numpy.load(f, allow_pickle=True, encoding='latin1')
    # Each row holds 3072 bytes in channel-first order; convert to NHWC.
    tmp_images = batch['data'].reshape([-1, 3, 32, 32])
    train_images.append(tmp_images.transpose([0, 2, 3, 1]))
    train_labels += batch['labels']
  train_images = numpy.concatenate(train_images)
  train_labels = numpy.array(train_labels)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  # test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  test = None
  return base.Datasets(train=train, validation=validation, test=test)
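# Hedged sketch of the CIFAR-10 batch layout the loop above relies on: each
# data_batch_i is a pickled dict whose 'data' entry is a (10000, 3072) uint8
# array in channel-first order and whose 'labels' entry is a list of 10000
# ints. The path is illustrative.
import pickle
with open('cifar-10-batches-py/data_batch_1', 'rb') as f:
  batch = pickle.load(f, encoding='latin1')  # latin1 for Python-2 pickles
print(batch['data'].shape)   # (10000, 3072)
print(len(batch['labels']))  # 10000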
def read_data_sets(fake_data=False, one_hot=False, dtype=dtypes.float32,
                   reshape=True, validation_size=5000, seed=None):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                     seed=seed)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  # train_data_dir, train_labels_dir, eval_data_dir and eval_labels_dir are
  # expected to be module-level path variables, not arguments.
  with gfile.Open(train_data_dir, 'rb') as f:
    train_images = extract_images(f)
  with gfile.Open(train_labels_dir, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)
  with gfile.Open(eval_data_dir, 'rb') as f:
    test_images = extract_images(f)
  with gfile.Open(eval_labels_dir, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)
  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_labels_csv, test_labels_csv, fake_data=False,
                   one_hot=False, dtype=dtypes.float32, reshape=True,
                   validation_size=5000, dataset_path='../'):
  """Read HASY data."""
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  symbol_id2index = generate_index(os.path.join(dataset_path, 'symbols.csv'))
  test_images, test_labels, _ = load_images(test_labels_csv, symbol_id2index)
  train_images, train_labels, _ = load_images(train_labels_csv,
                                              symbol_id2index)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  # Shuffle data
  perm = np.arange(len(train_labels))
  np.random.shuffle(perm)
  train_images = train_images[perm]
  train_labels = train_labels[perm]

  # Split the training set into training and validation sets
  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_rate=0.1,
                   seed=None, mode="fire"):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype,
                     seed=seed)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  base_path = train_dir
  TRAIN_IMAGES = base_path + mode + '-images-idx3-ubyte.gz'
  TRAIN_LABELS = base_path + mode + '-labels-idx1-ubyte.gz'

  with gfile.Open(TRAIN_IMAGES, 'rb') as f:
    train_images = extract_images(f)
  with gfile.Open(TRAIN_LABELS, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  # Use the validation_rate argument rather than a hard-coded 0.1.
  validation_size = int(validation_rate * len(train_images))
  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)
  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  # No separate test files in this mode; the validation set doubles as test.
  return base.Datasets(train=train, validation=validation, test=validation)
def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=False, validation_size=10):
  if fake_data:
    def fake():
      return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype)
    train = fake()
    validation = fake()
    test = fake()
    return base.Datasets(train=train, validation=validation, test=test)

  train_file = glob(train_dir)
  # print(train_file)
  train_images, train_labels = extract_images(train_file)
  train_labels = extract_labels(train_labels, one_hot=one_hot)
  print(train_images.shape)

  # TEST_IMAGES = (['test_data/rb.HOT.15m(1).csv'])
  test_file = glob(cfg.test_dataset)
  test_images, test_labels = extract_images(test_file)
  test_labels = extract_labels(test_labels, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
def augment_mnist_data(dataset, augmenter, augmented_ratio=1):
  """Augment a data set and return it.

  :param dataset: The data set that needs to be augmented.
  :param augmenter: The augmenter; anything with an augment_image() method.
  :param augmented_ratio: How many times the training data is replicated.
  :return: A Datasets tuple with the augmented training set.
  """
  train_images = []
  train_labels = []
  training_length = len(dataset.train.images)

  # Loop over the training images (augmented_ratio passes over the data).
  for i in tqdm(range(augmented_ratio * training_length),
                desc="Augmenting images", unit="image"):
    # Augment the flat 784-vector as a 28x28x1 image, then flatten it again.
    train_images.append(
        np.reshape(
            augmenter.augment_image(
                dataset.train.images[i % training_length].reshape(28, 28, 1)),
            784))
    # Append the corresponding label.
    train_labels.append(dataset.train.labels[i % training_length])

  train = Dataset(train_images, train_labels)
  test = dataset.test
  return base.Datasets(train=train, test=test, validation=None)
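# Hedged usage sketch: augment_mnist_data only needs augment_image(img) on the
# augmenter, which matches the imgaug API; imgaug itself and 'mnist' (a loaded
# Datasets tuple) are assumptions here, not part of the original code.
import imgaug.augmenters as iaa
aug = iaa.Sequential([iaa.Affine(rotate=(-10, 10)),
                      iaa.GaussianBlur(sigma=(0.0, 0.5))])
augmented = augment_mnist_data(mnist, aug, augmented_ratio=2)  # two passes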
def load_fashion_mnist_A(validation_size=5000):
  # keras was only folded into tensorflow in 1.4, and fashion_mnist reached
  # keras.datasets later still, so this needs a reasonably recent TF
  # (tensorflow==1.13 is known to work here).
  (train_images, train_labels), (test_images, test_labels) = (
      tf.keras.datasets.fashion_mnist.load_data())

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  # Scale uint8 pixels to [0, 1] floats.
  train_images = train_images.astype(np.float32) / 255
  validation_images = validation_images.astype(np.float32) / 255
  test_images = test_images.astype(np.float32) / 255

  train = DataSet(train_images, train_labels)
  validation = DataSet(validation_images, validation_labels)
  test = DataSet(test_images, test_labels)
  return base.Datasets(train=train, validation=validation, test=test)
def get_feature_vectors(model):
  # Fold the validation examples into the training pool; the returned
  # validation set is left empty.
  train_feature_vectors = np.concatenate(
      (model.sess.run(model.feature_vector,
                      feed_dict=model.all_train_feed_dict),
       model.sess.run(model.feature_vector,
                      feed_dict=model.all_validation_feed_dict)))
  validation_feature_vectors = np.empty([0, 32])
  test_feature_vectors = model.sess.run(model.feature_vector,
                                        feed_dict=model.all_test_feed_dict)
  # validation_feature_vectors = model.sess.run(
  #     model.feature_vector, feed_dict=model.all_validation_feed_dict)

  train_labels = np.concatenate(
      (model.data_sets.train.labels, model.data_sets.validation.labels))
  validation_labels = np.empty([0])
  test_labels = model.data_sets.test.labels

  # print('train_feature_vectors.shape', train_feature_vectors.shape)
  print('test_feature_vectors.shape', test_feature_vectors.shape)
  print('validation_feature_vectors.shape', validation_feature_vectors.shape)
  print('train_labels.shape', train_labels.shape)
  print('test_labels.shape', test_labels.shape)
  print('validation_labels.shape', validation_labels.shape)

  train = DataSet(train_feature_vectors, train_labels)
  validation = DataSet(validation_feature_vectors, validation_labels)
  test = DataSet(test_feature_vectors, test_labels)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(dataset_tf_dir, dtype=dtypes.float32, reshape=True,
                   seed=None):
  total_images = extract_images(dataset_tf_dir, if_training=True)
  total_labels = extract_labels(dataset_tf_dir, if_training=True)
  total_idx = list(range(len(total_labels)))

  # Hold out every 10th example as the test set; the rest is training data.
  test_images = total_images[::10]
  test_labels = total_labels[::10]
  test_idx = total_idx[::10]
  train_idx = list(set(total_idx) - set(test_idx))
  train_images = total_images[numpy.array(train_idx)]
  train_labels = total_labels[numpy.array(train_idx)]

  # The validation set comes from the non-training extraction.
  validation_images = extract_images(dataset_tf_dir, if_training=False)
  validation_labels = extract_labels(dataset_tf_dir, if_training=False)

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape,
                  seed=seed)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape, seed=seed)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape,
                 seed=seed)
  return base.Datasets(train=train, validation=validation, test=test)
def read_data_sets(data_path, val_size=0.1, test_size=0.1, n_steps=10,
                   n_test_steps=10, seed=None):
  print("loading time series ...")
  data = np.load(data_path)
  # Expand the dimension if this is a univariate time series.
  if np.ndim(data) == 1:
    data = np.expand_dims(data, axis=1)
  print("input type ", type(data), np.shape(data))

  # """normalize the data"""
  # print("normalize to (0-1)")
  # data = normalize_columns(data)

  # Chronological split: train, then validation, then test.
  ntest = int(round(len(data) * (1 - test_size)))
  nval = int(round(len(data[:ntest]) * (1 - val_size)))
  train_data = data[:nval]
  valid_data = data[nval:ntest]
  test_data = data[ntest:]

  train_options = dict(num_steps=n_steps, seed=seed)
  test_options = dict(num_steps=n_test_steps, seed=seed)
  train = DataSet(train_data, **train_options)
  valid = DataSet(valid_data, **train_options)
  test = DataSet(test_data, **test_options)
  return base.Datasets(train=train, validation=valid, test=test)
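# Hedged usage sketch for the chronological time-series reader above;
# 'series.npy' is a hypothetical file holding a float array, and DataSet here
# is the windowed variant constructed with num_steps as in the function.
datasets = read_data_sets('series.npy', val_size=0.1, test_size=0.1,
                          n_steps=10, n_test_steps=10, seed=42)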
def dataset_reshaped(data_sets):
  # Restore the flat 784-vectors to 28x28x1 images and one-hot the labels.
  train_images = data_sets.train.x
  train_images = train_images.reshape(train_images.shape[0], 28, 28, 1)
  train_labels = data_sets.train.labels
  n_values = np.max(train_labels) + 1
  train_labels = np.eye(n_values)[train_labels]

  validation_images = data_sets.validation.x
  validation_images = validation_images.reshape(
      validation_images.shape[0], 28, 28, 1)
  validation_labels = data_sets.validation.labels
  n_values = np.max(validation_labels) + 1
  validation_labels = np.eye(n_values)[validation_labels]

  test_images = data_sets.test.x
  test_images = test_images.reshape(test_images.shape[0], 28, 28, 1)
  test_labels = data_sets.test.labels
  n_values = np.max(test_labels) + 1
  test_labels = np.eye(n_values)[test_labels]

  train = DataSet(train_images, train_labels, size_change=True)
  validation = DataSet(validation_images, validation_labels, size_change=True)
  test = DataSet(test_images, test_labels, size_change=True)
  return base.Datasets(train=train, validation=validation, test=test)
def generate_inception_features(model, poisoned_X_train_subset, labels_subset,
                                batch_size=None):
  poisoned_train = DataSet(poisoned_X_train_subset, labels_subset)
  poisoned_data_sets = base.Datasets(train=poisoned_train, validation=None,
                                     test=None)

  if batch_size is None:
    batch_size = len(labels_subset)

  num_examples = poisoned_data_sets.train.num_examples
  assert num_examples % batch_size == 0
  num_iter = int(num_examples / batch_size)

  poisoned_data_sets.train.reset_batch()

  inception_features_val = []
  print(np.shape(poisoned_data_sets.train.x))
  for i in range(num_iter):
    feed_dict = model.fill_feed_dict_with_batch(poisoned_data_sets.train,
                                                batch_size=batch_size)
    inception_features_val_temp = model.sess.run(model.inception_features,
                                                 feed_dict=feed_dict)
    inception_features_val.append(inception_features_val_temp)

  return np.concatenate(inception_features_val)
def load_dbpedia(size='small', test_with_fake_data=False):
  """Get DBpedia datasets from CSV files."""
  if not test_with_fake_data:
    data_dir = os.path.join(os.getenv('TF_EXP_BASE_DIR', ''), 'dbpedia_data')
    maybe_download_dbpedia(data_dir)

    train_path = os.path.join(data_dir, 'dbpedia_csv', 'train.csv')
    test_path = os.path.join(data_dir, 'dbpedia_csv', 'code.csv')

    if size == 'small':
      # Reduce the size of the original data by a factor of 1000.
      base.shrink_csv(train_path, 1000)
      base.shrink_csv(test_path, 1000)
      train_path = train_path.replace('train.csv', 'train_small.csv')
      test_path = test_path.replace('code.csv', 'test_small.csv')
  else:
    module_path = os.path.dirname(__file__)
    train_path = os.path.join(module_path, 'data', 'text_train.csv')
    test_path = os.path.join(module_path, 'data', 'text_test.csv')

  train = base.load_csv_without_header(train_path, target_dtype=np.int32,
                                       features_dtype=np.str, target_column=0)
  test = base.load_csv_without_header(test_path, target_dtype=np.int32,
                                      features_dtype=np.str, target_column=0)
  return base.Datasets(train=train, validation=None, test=test)
def read_data_sets(pictures, labels, num_classes, one_hot=False,
                   dtype=dtypes.float32, reshape=True, validation_size=5000):
  if not len(pictures) == len(labels):
    raise ValueError('Number of pictures and labels do not match')

  labels = dense_to_one_hot(labels, num_classes)

  # Hold out 1/8 of the data for test, then 1/7 of the remainder for
  # validation, i.e. roughly a 75/12.5/12.5 split.
  test_size = len(pictures) // 8
  test_images = pictures[:test_size]
  test_labels = labels[:test_size]
  train_images = pictures[test_size:]
  train_labels = labels[test_size:]

  validation_size = len(train_images) // 7
  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
  validation = DataSet(validation_images, validation_labels, dtype=dtype,
                       reshape=reshape)
  test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
  return base.Datasets(train=train, validation=validation, test=test)
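# Hedged sketch of calling the in-memory reader above with synthetic arrays;
# it assumes dense_to_one_hot and DataSet behave as in the rest of this file.
# 800 examples yield a 600/100/100 train/validation/test split.
pictures = numpy.random.rand(800, 28, 28, 1).astype(numpy.float32)
labels = numpy.random.randint(0, 10, size=800)
data = read_data_sets(pictures, labels, num_classes=10)
print(data.train.num_examples, data.validation.num_examples,
      data.test.num_examples)  # 600 100 100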