def get_feature_vectors(model):
    train_feature_vectors = np.concatenate(
        (model.sess.run(model.feature_vector,
                        feed_dict=model.all_train_feed_dict),
         model.sess.run(model.feature_vector,
                        feed_dict=model.all_validation_feed_dict)))
    # Validation examples are folded into the training set above, so keep an
    # empty placeholder with the matching feature dimension (32 here).
    validation_feature_vectors = np.empty([0, 32])
    test_feature_vectors = model.sess.run(model.feature_vector,
                                          feed_dict=model.all_test_feed_dict)
    # validation_feature_vectors = model.sess.run(model.feature_vector, feed_dict=model.all_validation_feed_dict)

    train_labels = np.concatenate(
        (model.data_sets.train.labels, model.data_sets.validation.labels))
    validation_labels = np.empty([0])
    test_labels = model.data_sets.test.labels

    print('train_feature_vectors.shape', train_feature_vectors.shape)
    print('test_feature_vectors.shape', test_feature_vectors.shape)
    print('validation_feature_vectors.shape', validation_feature_vectors.shape)

    print('train_labels.shape', train_labels.shape)
    print('test_labels.shape', test_labels.shape)
    print('validation_labels.shape', validation_labels.shape)

    train = DataSet(train_feature_vectors, train_labels)
    validation = DataSet(validation_feature_vectors, validation_labels)
    test = DataSet(test_feature_vectors, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
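All of these loaders return a base.Datasets namedtuple of DataSet objects; neither class is defined in this section. As a rough stand-in, here is a minimal sketch of the interface the snippets rely on (x, labels, num_examples, reset_batch). It is an assumption for readability only; the real code imports DataSet from this project and Datasets from tensorflow.contrib.learn.

import collections
import numpy as np

# Stand-in for the Datasets container; the real base.Datasets has the same fields.
Datasets = collections.namedtuple('Datasets', ['train', 'validation', 'test'])

class DataSet(object):
    """Minimal sketch of the wrapper the loaders assume: holds features and
    labels and exposes the attributes accessed in the snippets."""

    def __init__(self, x, labels):
        assert len(x) == len(labels)
        self._x = np.asarray(x)
        self._labels = np.asarray(labels)
        self._index = 0

    @property
    def x(self):
        return self._x

    @property
    def labels(self):
        return self._labels

    @property
    def num_examples(self):
        return len(self._labels)

    def reset_batch(self):
        # Restart sequential batching from the first example.
        self._index = 0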
def load_fashion_mnist_A(validation_size=5000):

    (train_images,
     train_labels), (test_images,
                     test_labels) = tf.keras.datasets.fashion_mnist.load_data(
                     )
    # Note: tf.keras.datasets.fashion_mnist requires TensorFlow >= 1.4
    # (fashion_mnist was added to keras.datasets later still); this code was
    # run with tensorflow==1.13, which also works for run_spam_experiment
    # after downgrading spacy.

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train_images = train_images.astype(np.float32) / 255
    validation_images = validation_images.astype(np.float32) / 255
    test_images = test_images.astype(np.float32) / 255

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
Example 3
def load_spam(n=None):

    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = process_spam(n)

    # Convert them to dense matrices
    X_train = X_train.toarray()
    X_valid = X_valid.toarray()
    X_test = X_test.toarray()

    print('type(X_train)', type(X_train))
    print(type(Y_train))
    print(X_train.shape)
    print(Y_train.shape)
    print(X_valid.shape)
    print(Y_valid.shape)
    # print(X_valid[0])
    # print(X_valid[1])
    print(X_test.shape)
    print(Y_test)
    print(Y_test.shape)

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    print('load_spam.py X_train.shape', X_train.shape)

    return base.Datasets(train=train, validation=validation, test=test)
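process_spam is not included in this section; judging from the calls above and in the variants below, it returns sparse feature matrices (hence .toarray()) plus label arrays for the three splits. A purely illustrative stub with that shape of return value, not the real implementation:

from scipy import sparse
import numpy as np

def process_spam(n=None):
    """Illustrative stub only: the real function builds sparse bag-of-words
    features for the spam dataset and returns train/validation/test splits."""
    n = n or 100
    X = sparse.random(3 * n, 500, density=0.05, format='csr', random_state=0)
    Y = np.random.randint(0, 2, size=3 * n)
    return (X[:n], Y[:n],
            X[n:2 * n], Y[n:2 * n],
            X[2 * n:], Y[2 * n:])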
Example 4
def load_cifar():
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    # cifar10.load_data() is assumed to return channels_first arrays
    # (N, 3, 32, 32) here; transpose to NHWC before wrapping in DataSet.
    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)
    train = DataSet(x_train, y_train.flatten())
    test = DataSet(x_test, y_test.flatten())
    validation = None

    return base.Datasets(train=train, validation=validation, test=test)
Example 5
def load_mnist(train_dir, validation_size=5000):

  SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
 
  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  with open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  with open(local_file, 'rb') as f:
    train_labels = extract_labels(f)

  local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                   SOURCE_URL + TEST_IMAGES)
  with open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = base.maybe_download(TEST_LABELS, train_dir,
                                   SOURCE_URL + TEST_LABELS)
  with open(local_file, 'rb') as f:
    test_labels = extract_labels(f)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'
        .format(len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  train_images = train_images.astype(np.float32) / 255
  validation_images = validation_images.astype(np.float32) / 255
  test_images = test_images.astype(np.float32) / 255
  print('train_images.shape', train_images.shape)
  print('validation_images.shape', validation_images.shape)
  print('test_images.shape', test_images.shape)

  train = DataSet(train_images, train_labels)
  validation = DataSet(validation_images, validation_labels)
  test = DataSet(test_images, test_labels)

  return base.Datasets(train=train, validation=validation, test=test)
Example 6
def load_spam(n = None):

    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = process_spam(n)

    # Convert them to dense matrices
    X_train = X_train.toarray()
    X_valid = X_valid.toarray()
    X_test = X_test.toarray()

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
Example 7
def load_toy(from_file=False):
    """
    Create a dataset object that could be loaded to the training scripts.
    If from_file == True, load from already saved data.
    """
    if from_file:
        data = np.load('data/toy_2d.npz')
        x_train, y_train, x_test, y_test = data['x_train'], data[
            'y_train'], data['x_test'], data['y_test']
    else:
        x_train, x_test, y_train, y_test = generate_toy_2d()
    train = DataSet(x_train, y_train)
    test = DataSet(x_test, y_test)
    validation = None
    return base.Datasets(train=train, validation=validation, test=test)
Example 8
    def retrain(self, num_steps, feed_dict):        

        retrain_dataset = DataSet(feed_dict[self.input_placeholder], feed_dict[self.labels_placeholder])

        for step in range(num_steps):
            iter_feed_dict = self.fill_feed_dict_with_batch(retrain_dataset)
            self.sess.run(self.train_op, feed_dict=iter_feed_dict)
Example 9
def generate_inception_features(model,
                                poisoned_X_train_subset,
                                labels_subset,
                                batch_size=None):
    poisoned_train = DataSet(poisoned_X_train_subset, labels_subset)
    poisoned_data_sets = base.Datasets(train=poisoned_train,
                                       validation=None,
                                       test=None)

    if batch_size is None:
        batch_size = len(labels_subset)

    num_examples = poisoned_data_sets.train.num_examples
    assert num_examples % batch_size == 0
    num_iter = int(num_examples / batch_size)

    poisoned_data_sets.train.reset_batch()

    inception_features_val = []
    print(np.shape(poisoned_data_sets.train.x))
    for i in range(num_iter):
        feed_dict = model.fill_feed_dict_with_batch(poisoned_data_sets.train,
                                                    batch_size=batch_size)
        inception_features_val_temp = model.sess.run(model.inception_features,
                                                     feed_dict=feed_dict)
        inception_features_val.append(inception_features_val_temp)

    return np.concatenate(inception_features_val)
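A hedged usage sketch: the assert above requires batch_size to divide the number of poisoned examples, so a divisor is passed explicitly. full_model and its inception_features / fill_feed_dict_with_batch attributes are assumed to come from the surrounding Inception codebase; the arrays here are dummies that only illustrate shapes.

poisoned_X = np.random.rand(300, 299, 299, 3).astype(np.float32)
poisoned_Y = np.zeros(300)

# 300 % 50 == 0, satisfying the divisibility assert inside the function.
features = generate_inception_features(full_model, poisoned_X, poisoned_Y,
                                        batch_size=50)
print(features.shape)  # expected (300, 2048) for Inception-v3 pooled features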
Example 10
def load_yelp(train_dir):

    train = np.loadtxt("%s/yelp-ex.train.rating" % train_dir, delimiter='\t')
    valid = np.loadtxt("%s/yelp-ex.valid.rating" % train_dir, delimiter='\t')
    test = np.loadtxt("%s/yelp-ex.test.rating" % train_dir, delimiter='\t')

    train_input = train[:628881, :2].astype(np.int32)
    train_output = train[:628881, 2]
    valid_input = valid[:, :2].astype(np.int32)
    valid_output = valid[:, 2]
    test_input = test[:51153, :2].astype(np.int32)
    test_output = test[:51153, 2]

    train = DataSet(train_input, train_output)
    validation = DataSet(valid_input, valid_output)
    test = DataSet(test_input, test_output)

    return base.Datasets(train=train, validation=validation, test=test)
 def update_train_x(self, new_train_x):
     assert np.all(new_train_x.shape == self.data_sets.train.x.shape)
     new_train = DataSet(new_train_x, np.copy(self.data_sets.train.labels))
     self.data_sets = base.Datasets(train=new_train,
                                    validation=self.data_sets.validation,
                                    test=self.data_sets.test)
     self.all_train_feed_dict = self.fill_feed_dict_with_all_ex(
         self.data_sets.train)
     self.reset_datasets()
 def update_test_x_y(self, new_test_x, new_test_y):
     new_test = DataSet(new_test_x, new_test_y)
     self.data_sets = base.Datasets(train=self.data_sets.train,
                                    validation=self.data_sets.validation,
                                    test=new_test)
     self.all_test_feed_dict = self.fill_feed_dict_with_all_ex(
         self.data_sets.test)
     self.num_test_examples = len(new_test_y)
     self.reset_datasets()
Example 13
def load_spam(ex_to_leave_out=None, num_examples=None):

    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = process_spam(
        ex_to_leave_out, num_examples)

    # Convert them to dense matrices
    X_train = X_train.toarray()
    if X_valid is not None:
        X_valid = X_valid.toarray()
    X_test = X_test.toarray()

    train = DataSet(X_train, Y_train)
    if X_valid is not None:
        validation = DataSet(X_valid, Y_valid)
    else:
        validation = None
    test = DataSet(X_test, Y_test)
    #print(X_train[1])
    return base.Datasets(train=train, validation=validation, test=test)
Example 14
def load_movielens(train_dir, validation_size=5000):

    train = np.loadtxt("%s/ml-1m-ex.train.rating" % train_dir, delimiter='\t')
    valid = np.loadtxt("%s/ml-1m-ex.valid.rating" % train_dir, delimiter='\t')
    test = np.loadtxt("%s/ml-1m-ex.test.rating" % train_dir, delimiter='\t')

    train_input = train[:975460, :2].astype(np.int32)
    train_output = train[:975460, 2]
    valid_input = valid[:, :2].astype(np.int32)
    valid_output = valid[:, 2]
    # test_input = test[:-1, :2].astype(np.int32)
    # test_output = test[:-1, 2]
    test_input = test[:, :2].astype(np.int32)
    test_output = test[:, 2]

    train = DataSet(train_input, train_output)
    validation = DataSet(valid_input, valid_output)
    test = DataSet(test_input, test_output)

    return base.Datasets(train=train, validation=validation, test=test)
Example 15
def generate_inception_features(model,
                                poisoned_X_train_subset,
                                labels_subset,
                                batch_size=None):
    poisoned_train = DataSet(poisoned_X_train_subset, labels_subset)

    if batch_size is None:
        batch_size = len(labels_subset)

    assert len(poisoned_X_train_subset) % batch_size == 0
    num_iter = int(len(poisoned_X_train_subset) / batch_size)

    poisoned_train.reset_batch()

    inception_features_val = []
    for i in range(num_iter):
        inception_features_val_temp = model.generate_inception_features(
            poisoned_train, batch_size)
        inception_features_val.append(inception_features_val_temp)

    return np.concatenate(inception_features_val)
def load_heart_disease(ex_to_leave_out=None,num_examples=None):
    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = process_heart_disease(ex_to_leave_out, num_examples)
    # Convert them to dense matrices
    Y_train = np.array(Y_train)
    Y_valid = np.array(Y_valid)
    Y_test = np.array(Y_test)
    X_train = np.array(X_train)
    if X_valid is not None:
        X_valid = np.array(X_valid)
    X_test = np.array(X_test)

    train = DataSet(X_train, Y_train)
    if X_valid is not None:
        validation = DataSet(X_valid, Y_valid)
    else:
        validation = None
    test = DataSet(X_test, Y_test)
    #print(X_train[1])
    return base.Datasets(train=train, validation=validation, test=test)


    
Example 17
def load_dogfish_with_koda():
    classes = ['dog', 'fish']
    X_test, Y_test = load_koda()

    data_sets = load_animals(num_train_ex_per_class=900,
                             num_test_ex_per_class=300,
                             num_valid_ex_per_class=0,
                             classes=classes)
    train = data_sets.train
    validation = data_sets.validation
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
Example 18
def load_small_mnist(train_dir, validation_size=5000, random_seed=0):
    np.random.seed(random_seed)
    data_sets = load_mnist(train_dir, validation_size)

    train_images = data_sets.train.x
    train_labels = data_sets.train.labels
    perm = np.arange(len(train_labels))
    np.random.shuffle(perm)
    num_to_keep = int(len(train_labels) / 10)
    perm = perm[:num_to_keep]
    train_images = train_images[perm, :]
    train_labels = train_labels[perm]

    validation_images = data_sets.validation.x
    validation_labels = data_sets.validation.labels
    # perm = np.arange(len(validation_labels))
    # np.random.shuffle(perm)
    # num_to_keep = int(len(validation_labels) / 10)
    # perm = perm[:num_to_keep]
    # validation_images = validation_images[perm, :]
    # validation_labels = validation_labels[perm]

    test_images = data_sets.test.x
    test_labels = data_sets.test.labels
    # perm = np.arange(len(test_labels))
    # np.random.shuffle(perm)
    # num_to_keep = int(len(test_labels) / 10)
    # perm = perm[:num_to_keep]
    # test_images = test_images[perm, :]
    # test_labels = test_labels[perm]

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
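For reference, a small usage sketch ('data' is a placeholder download directory): load_small_mnist calls load_mnist and then keeps a random 10% of the training split, leaving validation and test untouched.

data_sets = load_small_mnist('data', validation_size=5000, random_seed=0)

# 60,000 MNIST training examples minus 5,000 validation leaves 55,000;
# load_small_mnist keeps 10% of those.
print(data_sets.train.num_examples)       # 5500
print(data_sets.validation.num_examples)  # 5000
print(data_sets.test.num_examples)        # 10000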
Example 19
def load_dogfish_with_orig_and_koda():
    classes = ['dog', 'fish']
    X_test, Y_test = load_koda()
    X_test = np.reshape(X_test, (X_test.shape[0], -1))

    data_sets = load_animals(num_train_ex_per_class=900,
                             num_test_ex_per_class=300,
                             num_valid_ex_per_class=0,
                             classes=classes)
    train = data_sets.train
    validation = data_sets.validation

    test = DataSet(np.concatenate((data_sets.test.x, X_test), axis=0),
                   np.concatenate((data_sets.test.labels, Y_test), axis=0))

    return base.Datasets(train=train, validation=validation, test=test)
Example 20
## RBF

input_channels = 1
weight_decay = 0.001
batch_size = num_train
initial_learning_rate = 0.001
keep_probs = None
max_lbfgs_iter = 1000
use_bias = False
decay_epochs = [1000, 10000]

tf.reset_default_graph()

X_train = image_data_sets.train.x
Y_train = image_data_sets.train.labels * 2 - 1
# L_train / L_test are assumed to be the RBF feature matrices computed
# earlier in this script (not shown in this snippet).
train = DataSet(L_train, Y_train)
test = DataSet(L_test, Y_test)

data_sets = base.Datasets(train=train, validation=None, test=test)
input_dim = data_sets.train.x.shape[1]

# Train with hinge
rbf_model = SmoothHinge(
    temp=0,
    use_bias=use_bias,
    input_dim=input_dim,
    weight_decay=weight_decay,
    num_classes=num_classes,
    batch_size=batch_size,
    data_sets=data_sets,
    initial_learning_rate=initial_learning_rate,
Example 21
            print('Inception features do not exist. Generating %s...' % label)
            data_set.reset_batch()

            num_examples = data_set.num_examples
            assert num_examples % batch_size == 0

            inception_features_val = generate_inception_features(
                full_model, data_set.x, data_set.labels, batch_size=batch_size)

            np.savez(inception_features_path,
                     inception_features_val=inception_features_val,
                     labels=data_set.labels)

train_f = np.load('output/%s_inception_features_new_train.npz' % dataset_name)
train = DataSet(train_f['inception_features_val'], train_f['labels'])
test_f = np.load('output/%s_inception_features_new_test.npz' % dataset_name)
test = DataSet(test_f['inception_features_val'], test_f['labels'])

validation = None

inception_data_sets = base.Datasets(train=train,
                                    validation=validation,
                                    test=test)

print('*** Top:')
with top_graph.as_default():
    top_model_name = '%s_inception_onlytop_wd-%s' % (dataset_name,
                                                     weight_decay)
    input_dim = 2048
    top_model = BinaryLogisticRegressionWithLBFGS(
Example 22
X_test = data_sets.test.x
Y_test = data_sets.test.labels

X_train, Y_train = dataset.filter_dataset(X_train, Y_train, pos_class,
                                          neg_class)
X_test, Y_test = dataset.filter_dataset(X_test, Y_test, pos_class, neg_class)

# Round dataset size off to the nearest 100, just for batching convenience
num_train = int(np.floor(len(Y_train) / 100) * 100)
num_test = int(np.floor(len(Y_test) / 100) * 100)
X_train = X_train[:num_train, :]
Y_train = Y_train[:num_train]
X_test = X_test[:num_test, :]
Y_test = Y_test[:num_test]

train = DataSet(X_train, Y_train)
validation = None
test = DataSet(X_test, Y_test)
data_sets = base.Datasets(train=train, validation=validation, test=test)

num_classes = 2
input_side = 28
input_channels = 1
input_dim = input_side * input_side * input_channels
weight_decay = 0.01
use_bias = False
batch_size = 100
initial_learning_rate = 0.001
keep_probs = None
decay_epochs = [1000, 10000]
max_lbfgs_iter = 1000
Example 23
def load_mnist(train_dir, validation_size=5000):

    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]
    #   print(np.shape(train_labels))
    #   plt.imshow(np.reshape(train_images[100], (28, 28)), cmap='gray', interpolation='none')

    #   train_images = train_images[np.where((train_labels == 3) | (train_labels == 5))[0]]
    #   train_labels = train_labels[np.where((train_labels == 3) | (train_labels == 5))[0]]
    #   test_images = test_images[np.where((test_labels == 3) | (test_labels == 5))[0]]
    #   test_labels = test_labels[np.where((test_labels == 3) | (test_labels == 5))[0]]
    #   validation_images = validation_images[np.where((validation_labels == 3) | (validation_labels == 5))[0]]
    #   validation_labels = validation_labels[np.where((validation_labels == 3) | (validation_labels == 5))[0]]

    train_images = train_images.astype(np.float32) / 255
    validation_images = validation_images.astype(np.float32) / 255
    test_images = test_images.astype(np.float32) / 255

    #   train_labels = label_binarize(train_labels, classes=[3,5])[:,0]
    #   test_labels = label_binarize(test_labels, classes=[3,5])[:,0]
    #   validation_labels = label_binarize(validation_labels, classes=[3,5])[:,0]

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
Example 24
def load_diabetes():
    columns = [
        'encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
        'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
        'time_in_hospital', 'payer_code', 'medical_specialty',
        'num_lab_procedures', 'num_procedures', 'num_medications',
        'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
        'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
        'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
        'troglitazone', 'tolazamide', 'examide', 'citoglipton', 'insulin',
        'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'
    ]

    data = pd.read_csv('data/diabetes/diabetic_data.csv',
                       names=columns,
                       sep=' *, *',
                       skiprows=1,
                       na_values='?')
    # data = pd.read_csv('data/diabetes/diabetic_data.csv')
    # test_data = pd.read_csv('data/adult/adult.test', names=columns, sep=' *, *', skiprows=1, na_values='?')
    # print(data.info()) #race, weight, payer_code, medical speciality, diag_1, diag_2, diag_3 have missing values
    # print(data.shape)
    # print(data.iloc[0, :])
    # data = pd.concat([train_data, test_data])

    # # Before data.dropna() #############################################################################################
    # print('data[data.race == \'White\'].shape[0]', data[data.race == 'White'].shape[0])
    # print('data[data.race == \'Asian-Pac-Islander\'].shape[0]', data[data.race == 'Asian-Pac-Islander'].shape[0])
    # print('data[data.race == \'Amer-Indian-Eskimo\'].shape[0]', data[data.race == 'Amer-Indian-Eskimo'].shape[0])
    # print('data[data.race == \'Other\'].shape[0]', data[data.race == 'Other'].shape[0])
    # print('data[data.race == \'Black\'].shape[0]', data[data.race == 'Black'].shape[0])
    #
    # print('data[data.sex == \'Female\'].shape[0]', data[data.sex == 'Female'].shape[0])
    # print('data[data.sex == \'Male\'].shape[0]', data[data.sex == 'Male'].shape[0])
    # # Before data.dropna() #############################################################################################

    # drop rows with missing values (where there is '?') in these columns
    data = data.dropna(axis=0, subset=['race', 'diag_1', 'diag_2', 'diag_3'])
    data = data[data.gender != 'Unknown/Invalid']
    # print(data.info())
    # print(data)
    # print(data.shape)

    # drop columns weight, payer_code, medical_specialty because too many missing values and probably not useful info?
    data = data.drop([
        'encounter_id', 'patient_nbr', 'weight', 'payer_code',
        'medical_specialty'
    ],
                     axis=1)
    # print(data.info())
    # print(data)
    # print(data.shape)
    # drop the weight and payer-code columns because there are so many missing values?
    # drop rows where there are missing values for race or gender
    # drop rows where there are missing values for diag_1, diag_2, or diag_3

    # count number of occurrences of a specific value ##################################################################
    # Minority Group - race (index 8): Caucasian(75079), Asian(625), AfricanAmerican(18881), Hispanic(1984), Other(1483)
    # Minority Group - gender (index 9): Female(52833), Male(45219)
    print('data[data.race == \'Caucasian\'].shape[0]',
          data[data.race == 'Caucasian'].shape[0])
    print('data[data.race == \'Asian\'].shape[0]',
          data[data.race == 'Asian'].shape[0])
    print('data[data.race == \'AfricanAmerican\'].shape[0]',
          data[data.race == 'AfricanAmerican'].shape[0])
    print('data[data.race == \'Hispanic\'].shape[0]',
          data[data.race == 'Hispanic'].shape[0])
    print('data[data.race == \'Other\'].shape[0]',
          data[data.race == 'Other'].shape[0])

    print('data[data.gender == \'Female\'].shape[0]',
          data[data.gender == 'Female'].shape[0])
    print('data[data.gender == \'Male\'].shape[0]',
          data[data.gender == 'Male'].shape[0])
    # count number of occurrences of a specific value ##################################################################

    # convert to binary classes
    data[['readmitted']] = data[['readmitted']].replace({
        '<30': 1,
        '>30': 1,
        'NO': 0
    })
    print(data.info())
    print(data)
    print(data.shape)
    # extract labels from data
    labels = data['readmitted'].values
    data = data.drop(['readmitted'], axis=1)

    print(data.info())
    print(data)
    print(data.shape)

    data = data.to_numpy()

    print(data.shape)  # 98052 -> [0.7, 0.15, 0.15] = [68636, 14708, 14708]

    data, labels = shuffle(data, labels, random_state=0)

    # print('data[0]', data[0:10])  # [23, 'Private', 84726, 'Assoc-acdm', 12, 'Married-civ-spouse', 'Farming-fishing', 'Wife', 'White', 'Female', 0, 0, 45, 'Germany']
    # print('data[0].shape', data[0:10].shape)
    # # sex_column = data[:, 9]
    # print('np.argwhere(data[:37998, 9] == \'Female\')', np.argwhere(data[:37998, 9] == 'Female'))

    # Columns that are categorical: 1, 3, 5, 6, 7, 8, 9, 13
    data_categorical = data[:, [
        0, 1, 2, 3, 4, 5, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
        27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43
    ]]
    data_numerical = data[:, [6, 7, 8, 9, 10, 11, 12, 16]]

    print('np.asarray(data_categorical).shape',
          np.asarray(data_categorical).shape)
    print('np.asarray(data_numerical).shape', np.asarray(data_numerical).shape)

    # one-hot encoding
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(data_categorical)
    data_categorical_onehot = enc.transform(data_categorical).toarray()

    print('enc.get_feature_names()', enc.get_feature_names()[:15])
    print('enc.get_feature_names().shape', enc.get_feature_names().shape)
    print(np.where(enc.get_feature_names() ==
                   'x0_AfricanAmerican'))  # AfricanAmerican: 0 + 8
    print(np.where(enc.get_feature_names() == 'x0_Asian'))  # Asian: 1 + 8
    print(np.where(
        enc.get_feature_names() == 'x0_Caucasian'))  # Caucasian: 2 + 8
    print(
        np.where(enc.get_feature_names() == 'x0_Hispanic'))  # Hispanic: 3 + 8
    print(np.where(enc.get_feature_names() == 'x0_Other'))  # Other: 4 + 8
    print(np.where(enc.get_feature_names() == 'x1_Female'))  # Female: 5 + 8
    print(np.where(enc.get_feature_names() == 'x1_Male'))  # Male: 6 + 8

    data_num_and_onehot = np.concatenate(
        (data_numerical, data_categorical_onehot), axis=1)

    print('data_num_and_onehot[0]', data_num_and_onehot[0])
    print('data_num_and_onehot[0].shape', data_num_and_onehot[0].shape)
    print('data_num_and_onehot.shape', data_num_and_onehot.shape)
    # print('np.argwhere(data_num_and_onehot[:37998, 61] == 1)', np.argwhere(data_num_and_onehot[:37998, 61] == 1))

    train_size = 84000
    validation_size = 7000  #data.shape[0] * 0.1  # fraction_size = 1, validation_size = 2000
    test_size = 7052
    train_and_validation_data = data_num_and_onehot[:train_size +
                                                    validation_size]
    test_data = data_num_and_onehot[train_size + validation_size:train_size +
                                    validation_size + test_size]

    # normalize
    scaler = MinMaxScaler()
    scaler.fit(train_and_validation_data)
    train_and_validation_data = scaler.transform(train_and_validation_data)
    test_data = scaler.transform(test_data)

    # print('np.argwhere(train_and_validation_data[:37998, 61] == 1)', np.argwhere(train_and_validation_data[:37998, 61] == 1))
    #
    # print('train_and_validation_data[0]', train_and_validation_data[0])
    # print('train_and_validation_data[0].shape', train_and_validation_data[0].shape)

    # X_train = train_and_validation_data[:train_size]
    # Y_train = labels[:train_size]
    # X_valid = train_and_validation_data[train_size:train_size + validation_size]
    # Y_valid = labels[train_size:train_size + validation_size]

    X_valid = train_and_validation_data[:validation_size]
    Y_valid = labels[:validation_size]
    X_train = train_and_validation_data[validation_size:validation_size +
                                        train_size]
    Y_train = labels[validation_size:validation_size + train_size]

    X_test = test_data
    Y_test = labels[train_size + validation_size:train_size + validation_size +
                    test_size]

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
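The same preprocessing pattern (one-hot encode the categorical columns, concatenate with the numeric columns, fit a MinMaxScaler on train+validation only, then slice into splits) recurs in load_adult and load_heart below. A compact sketch of that pattern as a standalone helper; the helper itself and its arguments are illustrative, not part of the original loaders.

import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

def encode_scale_split(data, labels, cat_cols, num_cols,
                       train_size, validation_size):
    """Sketch of the shared preprocessing: one-hot + min-max scaling + split."""
    enc = OneHotEncoder(handle_unknown='ignore')
    onehot = enc.fit_transform(data[:, cat_cols]).toarray()
    features = np.concatenate(
        (data[:, num_cols].astype(np.float64), onehot), axis=1)

    # Fit the scaler on train + validation only, as the loaders here do,
    # so the test split is transformed but never used for fitting.
    cut = train_size + validation_size
    scaler = MinMaxScaler().fit(features[:cut])
    train_valid = scaler.transform(features[:cut])
    test = scaler.transform(features[cut:])

    X_valid, Y_valid = train_valid[:validation_size], labels[:validation_size]
    X_train = train_valid[validation_size:cut]
    Y_train = labels[validation_size:cut]
    return X_train, Y_train, X_valid, Y_valid, test, labels[cut:]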
def load_fashion_mnist(validation_size=5000):

    (train_images,
     train_labels), (test_images,
                     test_labels) = tf.keras.datasets.fashion_mnist.load_data(
                     )  # See the TensorFlow/keras version note in load_fashion_mnist_A above.

    plt.imshow(train_images[0])
    plt.show()
    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    # print('uniques', np.unique(train_labels))
    # print('uniques', np.unique(test_labels))
    #
    # print('train_images.shape', train_images.shape)
    # print('train_labels.shape', train_labels.shape)
    #
    # print('train_images.shape[0], train_images.shape[1], train_images.shape[2], 1', train_images.shape[0], train_images.shape[1], train_images.shape[2], 1)
    train_images = np.reshape(train_images,
                              (train_images.shape[0], train_images.shape[1],
                               train_images.shape[2], 1))
    test_images = np.reshape(
        test_images,
        (test_images.shape[0], test_images.shape[1], test_images.shape[2], 1))

    # # testing if error when using 6 classes is due to dataset too small (answer is no) ###################
    # validation_size = int(validation_size * 0.6)
    # train_images = train_images[:int(train_labels.shape[0] * 0.6)]
    # train_labels = train_labels[:int(train_labels.shape[0] * 0.6)]
    # test_images = test_images[:int(test_labels.shape[0] * 0.6)]
    # test_labels = test_labels[:int(test_labels.shape[0] * 0.6)]
    # # testing if error when using 6 classes is due to dataset too small (answer is no) ###################

    # print('train_images.shape', train_images.shape)
    # print('train_labels.shape', train_labels.shape)

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    # print(len(validation_images))
    # print(len(validation_labels))
    # print(len(train_labels))
    # print(len(train_images))
    # print(len(test_images))
    # print(len(test_labels))

    train_images = train_images.astype(np.float64) / 255
    validation_images = validation_images.astype(np.float64) / 255
    test_images = test_images.astype(np.float64) / 255

    # print('train_images.shape', train_images.shape)
    # print('validation_images.shape', validation_images.shape)
    # print('test_images.shape', test_images.shape)

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
def load_2_class_fashion_mnist(validation_size=1000):
    (train_images,
     train_labels), (test_images,
                     test_labels) = tf.keras.datasets.fashion_mnist.load_data(
                     )  # See the TensorFlow/keras version note in load_fashion_mnist_A above.

    images = np.concatenate((train_images, test_images), axis=0)
    labels = np.concatenate((train_labels, test_labels), axis=0)

    print('1 images.shape', images.shape)
    print('1 labels.shape', labels.shape)

    class_0_indices = np.argwhere(labels == 0)
    class_1_indices = np.argwhere(labels == 1)

    print('class_0_indices.shape', class_0_indices.shape)
    class_0_indices = np.reshape(class_0_indices, (class_0_indices.shape[0], ))
    class_1_indices = np.reshape(class_1_indices, (class_1_indices.shape[0], ))

    print('class_0_indices.reshape', class_0_indices.shape)

    reduced_class_indices = np.concatenate((class_0_indices, class_1_indices))

    print('reduced_class_indices.shape', reduced_class_indices.shape)

    images = images[reduced_class_indices]
    labels = labels[reduced_class_indices]

    images, labels = shuffle(images, labels, random_state=0)

    print('images.shape', images.shape)
    print('labels.shape', labels.shape)
    print('np.unique(labels)', np.unique(labels))
    # print('images[0]')
    # print(images[0])
    # print('6 labels')
    # print(labels)

    train_size = 12000

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    train_images = images[validation_size:train_size]
    train_labels = labels[validation_size:train_size]
    test_images = images[train_size:]
    test_labels = labels[train_size:]

    print(len(validation_images))
    print(len(validation_labels))
    print(len(train_labels))
    print(len(train_images))
    print(len(test_images))
    print(len(test_labels))

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    print('train_images.shape', train_images.shape)
    print('train_labels.shape', train_labels.shape)
    print('test_images.shape', test_images.shape)
    print('test_labels.shape', test_labels.shape)
    print('validation_images.shape', validation_images.shape)
    print('validation_labels.shape', validation_labels.shape)

    train_images = train_images.astype(np.float32) / 255
    validation_images = validation_images.astype(np.float32) / 255
    test_images = test_images.astype(np.float32) / 255

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
Example 27
def load_adult():
    columns = [
        "age", "workClass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income"
    ]

    train_data = pd.read_csv('data/adult/adult.data',
                             names=columns,
                             sep=' *, *',
                             na_values='?')
    test_data = pd.read_csv('data/adult/adult.test',
                            names=columns,
                            sep=' *, *',
                            skiprows=1,
                            na_values='?')

    data = pd.concat([train_data, test_data])

    # # Before data.dropna() #############################################################################################
    # print('data[data.race == \'White\'].shape[0]', data[data.race == 'White'].shape[0])
    # print('data[data.race == \'Asian-Pac-Islander\'].shape[0]', data[data.race == 'Asian-Pac-Islander'].shape[0])
    # print('data[data.race == \'Amer-Indian-Eskimo\'].shape[0]', data[data.race == 'Amer-Indian-Eskimo'].shape[0])
    # print('data[data.race == \'Other\'].shape[0]', data[data.race == 'Other'].shape[0])
    # print('data[data.race == \'Black\'].shape[0]', data[data.race == 'Black'].shape[0])
    #
    # print('data[data.sex == \'Female\'].shape[0]', data[data.sex == 'Female'].shape[0])
    # print('data[data.sex == \'Male\'].shape[0]', data[data.sex == 'Male'].shape[0])
    # # Before data.dropna() #############################################################################################

    # drop rows with missing values (where there is '?')
    data = data.dropna()

    # # count number of occurrences of a specific value ##################################################################
    # # Minority Group - race (index 8): White(38903), Asian-Pac-Islander(1303), Amer-Indian-Eskimo(435), Other(353), Black(4228)
    # # Minority Group - sex (index 9): Female(14695), Male(30527)
    # print('data[data.race == \'White\'].shape[0]', data[data.race == 'White'].shape[0])
    # print('data[data.race == \'Asian-Pac-Islander\'].shape[0]', data[data.race == 'Asian-Pac-Islander'].shape[0])
    # print('data[data.race == \'Amer-Indian-Eskimo\'].shape[0]', data[data.race == 'Amer-Indian-Eskimo'].shape[0])
    # print('data[data.race == \'Other\'].shape[0]', data[data.race == 'Other'].shape[0])
    # print('data[data.race == \'Black\'].shape[0]', data[data.race == 'Black'].shape[0])
    #
    # print('data[data.sex == \'Female\'].shape[0]', data[data.sex == 'Female'].shape[0])
    # print('data[data.sex == \'Male\'].shape[0]', data[data.sex == 'Male'].shape[0])
    # count number of occurrences of a specific value ##################################################################

    data = data.replace({'<=50K.': '<=50K', '>50K.': '>50K'})
    data = data.replace({'<=50K': 0, '>50K': 1})

    labels = data['income'].values
    data = data.drop(['income'], axis=1)

    data = data.to_numpy()

    print(data.shape)  # 45222 -> [0.7, 0.15, 0.15] = [31656, 6783, 6783]

    data, labels = shuffle(data, labels, random_state=0)

    # print('data[0]', data[0:10])  # [23, 'Private', 84726, 'Assoc-acdm', 12, 'Married-civ-spouse', 'Farming-fishing', 'Wife', 'White', 'Female', 0, 0, 45, 'Germany']
    # print('data[0].shape', data[0:10].shape)
    # # sex_column = data[:, 9]
    # print('np.argwhere(data[:37998, 9] == \'Female\')', np.argwhere(data[:37998, 9] == 'Female'))

    # Columns that are categorical: 1, 3, 5, 6, 7, 8, 9, 13
    data_categorical = data[:, [1, 3, 5, 6, 7, 8, 9, 13]]
    data_numerical = data[:, [0, 2, 4, 10, 11, 12]]

    # print('np.asarray(data_categorical).shape', np.asarray(data_categorical).shape)

    # one-hot encoding
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(data_categorical)
    data_categorical_onehot = enc.transform(data_categorical).toarray()

    print('enc.get_feature_names()', enc.get_feature_names())
    # print('enc.get_feature_names().shape', enc.get_feature_names().shape)
    print(np.where(enc.get_feature_names() ==
                   'x6_Female'))  # Female: 55 + 6, Male: 56 + 6
    print(np.where(enc.get_feature_names() ==
                   'x5_Amer-Indian-Eskimo'))  # Amer-Indian-Eskimo: 50 + 6
    print(np.where(enc.get_feature_names() ==
                   'x5_Asian-Pac-Islander'))  # Asian-Pac-Islander: 51 + 6
    print(np.where(enc.get_feature_names() == 'x5_Black'))  # Black: 52 + 6
    print(np.where(enc.get_feature_names() == 'x5_Other'))  # Other: 53 + 6
    print(np.where(enc.get_feature_names() == 'x5_White'))  # White: 54 + 6

    data_num_and_onehot = np.concatenate(
        (data_numerical, data_categorical_onehot), axis=1)

    # print('data_num_and_onehot[0]', data_num_and_onehot[0])
    # print('data_num_and_onehot[0].shape', data_num_and_onehot[0].shape)
    # print('np.argwhere(data_num_and_onehot[:37998, 61] == 1)', np.argwhere(data_num_and_onehot[:37998, 61] == 1))

    train_size = 38000
    validation_size = 3000  #data.shape[0] * 0.1  # fraction_size = 1, validation_size = 2000
    train_and_validation_data = data_num_and_onehot[:train_size +
                                                    validation_size]
    test_data = data_num_and_onehot[train_size + validation_size:]

    # normalize
    scaler = MinMaxScaler()
    scaler.fit(train_and_validation_data)
    train_and_validation_data = scaler.transform(train_and_validation_data)
    test_data = scaler.transform(test_data)

    # print('np.argwhere(train_and_validation_data[:37998, 61] == 1)', np.argwhere(train_and_validation_data[:37998, 61] == 1))
    #
    # print('train_and_validation_data[0]', train_and_validation_data[0])
    # print('train_and_validation_data[0].shape', train_and_validation_data[0].shape)

    # X_train = train_and_validation_data[:train_size]
    # Y_train = labels[:train_size]
    # X_valid = train_and_validation_data[train_size:train_size + validation_size]
    # Y_valid = labels[train_size:train_size + validation_size]

    X_valid = train_and_validation_data[:validation_size]
    Y_valid = labels[:validation_size]
    X_train = train_and_validation_data[validation_size:validation_size +
                                        train_size]
    Y_train = labels[validation_size:validation_size + train_size]

    X_test = test_data
    Y_test = labels[train_size + validation_size:]

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
def load_6_class_fashion_mnist_small(validation_size=3000, fraction_size=0.5):
    (train_images,
     train_labels), (test_images,
                     test_labels) = tf.keras.datasets.fashion_mnist.load_data(
                     )  # See the TensorFlow/keras version note in load_fashion_mnist_A above.

    train_images = np.reshape(train_images,
                              (train_images.shape[0], train_images.shape[1],
                               train_images.shape[2], 1))
    test_images = np.reshape(
        test_images,
        (test_images.shape[0], test_images.shape[1], test_images.shape[2], 1))

    images = np.concatenate((train_images, test_images), axis=0)
    labels = np.concatenate((train_labels, test_labels), axis=0)

    # print('images.shape', images.shape)
    # print('labels.shape', labels.shape)

    class_0_indices = np.argwhere(labels == 0)
    class_2_indices = np.argwhere(labels == 2)
    class_3_indices = np.argwhere(labels == 3)
    class_6_indices = np.argwhere(labels == 6)
    class_7_indices = np.argwhere(labels == 7)
    class_9_indices = np.argwhere(labels == 9)

    # print('class_0_indices.shape', class_0_indices.shape)
    class_0_indices = np.reshape(
        class_0_indices,
        (class_0_indices.shape[0], ))[:int(class_0_indices.shape[0] *
                                           fraction_size)]
    class_2_indices = np.reshape(
        class_2_indices,
        (class_2_indices.shape[0], ))[:int(class_2_indices.shape[0] *
                                           fraction_size)]
    class_3_indices = np.reshape(
        class_3_indices,
        (class_3_indices.shape[0], ))[:int(class_3_indices.shape[0] *
                                           fraction_size)]
    class_6_indices = np.reshape(
        class_6_indices,
        (class_6_indices.shape[0], ))[:int(class_6_indices.shape[0] *
                                           fraction_size)]
    class_7_indices = np.reshape(
        class_7_indices,
        (class_7_indices.shape[0], ))[:int(class_7_indices.shape[0] *
                                           fraction_size)]
    class_9_indices = np.reshape(
        class_9_indices,
        (class_9_indices.shape[0], ))[:int(class_9_indices.shape[0] *
                                           fraction_size)]

    # print('class_0_indices.shape', class_0_indices.shape)
    # print('class_2_indices.shape', class_2_indices.shape)
    # print('class_3_indices.shape', class_3_indices.shape)
    # print('class_6_indices.shape', class_6_indices.shape)
    # print('class_7_indices.shape', class_7_indices.shape)
    # print('class_9_indices.shape', class_9_indices.shape)

    reduced_class_indices = np.concatenate(
        (class_0_indices, class_2_indices, class_3_indices, class_6_indices,
         class_7_indices, class_9_indices))

    # print('reduced_class_indices.shape', reduced_class_indices.shape)

    images = images[reduced_class_indices]
    labels = labels[reduced_class_indices]

    total_num_samples = images.shape[0]

    # Have to replace labels with 0, 1, 2, 3, 4, 5 or training won't work
    labels = np.where(labels == 2, 1, labels)
    labels = np.where(labels == 3, 2, labels)
    labels = np.where(labels == 6, 3, labels)
    labels = np.where(labels == 7, 4, labels)
    labels = np.where(labels == 9, 5, labels)

    # images, labels = shuffle(images, labels, random_state=0)

    # print('images.shape', images.shape)
    # print('labels.shape', labels.shape)
    # print('6', np.unique(labels))
    # print('images[0]')
    # print(images[0])
    # print('6 labels')
    # print(labels)

    train_size = int(36000 * fraction_size)
    validation_size = int(validation_size * fraction_size)
    total_num_samples = int(total_num_samples * fraction_size)

    # print('validation_size', validation_size)

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    train_images = images[validation_size:train_size]
    train_labels = labels[validation_size:train_size]
    test_images = images[train_size:]
    test_labels = labels[train_size:]

    # print(len(validation_images))
    # print(len(validation_labels))
    # print(len(train_labels))
    # print(len(train_images))
    # print(len(test_images))
    # print(len(test_labels))

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    # print('train_images.shape', train_images.shape)
    # print('train_labels.shape', train_labels.shape)
    # print('test_images.shape', test_images.shape)
    # print('test_images.shape', test_labels.shape)
    # print('validation_images.shape', validation_images.shape)
    # print('validation_images.shape', validation_labels.shape)

    train_images = train_images.astype(np.float64) / 255
    validation_images = validation_images.astype(np.float64) / 255
    test_images = test_images.astype(np.float64) / 255

    # print('train_images.shape', train_images.shape)
    # print('validation_images.shape', validation_images.shape)
    # print('test_images.shape', test_images.shape)

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
Example 29
def load_animals(num_train_ex_per_class=300,
                 num_test_ex_per_class=100,
                 num_valid_ex_per_class=0,
                 classes=None,
                 ):
    num_channels = 3
    img_side = 299

    if num_valid_ex_per_class == 0:
        valid_str = ''
    else:
        valid_str = '_valid-%s' % num_valid_ex_per_class

    if classes is None:
        classes = ['dog', 'cat', 'bird', 'fish', 'horse', 'monkey', 'zebra', 'panda', 'lemur', 'wombat']
        data_filename = os.path.join(BASE_DIR, 'dataset_train-%s_test-%s%s.npz' % (
        num_train_ex_per_class, num_test_ex_per_class, valid_str))
    else:
        data_filename = os.path.join(BASE_DIR, 'dataset_%s_train-%s_test-%s%s.npz' % (
        '-'.join(classes), num_train_ex_per_class, num_test_ex_per_class, valid_str))

    num_classes = len(classes)
    num_train_examples = num_train_ex_per_class * num_classes
    num_test_examples = num_test_ex_per_class * num_classes
    num_valid_examples = num_valid_ex_per_class * num_classes

    if os.path.exists(data_filename):
        print('Loading animals from disk...')
        f = np.load(data_filename)
        X_train = f['X_train']
        X_test = f['X_test']
        Y_train = f['Y_train']
        Y_test = f['Y_test']

        if 'X_valid' in f:
            X_valid = f['X_valid']
        else:
            X_valid = None

        if 'Y_valid' in f:
            Y_valid = f['Y_valid']
        else:
            Y_valid = None

    else:
        print('Reading animals from raw images...')
        X_train = np.zeros([num_train_examples, img_side, img_side, num_channels])
        X_test = np.zeros([num_test_examples, img_side, img_side, num_channels])
        X_valid = np.zeros([num_valid_examples, img_side, img_side, num_channels])

        Y_train = np.zeros([num_train_examples])
        Y_test = np.zeros([num_test_examples])
        Y_valid = np.zeros([num_valid_examples])

        for class_idx, class_string in enumerate(classes):
            print('class: %s' % class_string)
            # For some reason, a lot of numbers are skipped.
            i = 0
            num_filled = 0
            while num_filled < num_train_ex_per_class:
                img_path = os.path.join(BASE_DIR, '%s/%s_%s.JPEG' % (class_string, class_string, i))
                print(img_path)
                if os.path.exists(img_path):
                    fill(X_train, Y_train, num_filled + (num_train_ex_per_class * class_idx), class_idx, img_path,
                         img_side)
                    num_filled += 1
                    print(num_filled)
                i += 1

            num_filled = 0
            while num_filled < num_test_ex_per_class:
                img_path = os.path.join(BASE_DIR, '%s/%s_%s.JPEG' % (class_string, class_string, i))
                if os.path.exists(img_path):
                    fill(X_test, Y_test, num_filled + (num_test_ex_per_class * class_idx), class_idx, img_path,
                         img_side)
                    num_filled += 1
                    print(num_filled)
                i += 1

            num_filled = 0
            while num_filled < num_valid_ex_per_class:
                img_path = os.path.join(BASE_DIR, '%s/%s_%s.JPEG' % (class_string, class_string, i))
                if os.path.exists(img_path):
                    fill(X_valid, Y_valid, num_filled + (num_valid_ex_per_class * class_idx), class_idx, img_path,
                         img_side)
                    num_filled += 1
                    print(num_filled)
                i += 1

        X_train = preprocess_input(X_train)
        X_test = preprocess_input(X_test)
        X_valid = preprocess_input(X_valid)

        np.random.seed(0)
        permutation_idx = np.arange(num_train_examples)
        np.random.shuffle(permutation_idx)
        X_train = X_train[permutation_idx, :]
        Y_train = Y_train[permutation_idx]
        permutation_idx = np.arange(num_test_examples)
        np.random.shuffle(permutation_idx)
        X_test = X_test[permutation_idx, :]
        Y_test = Y_test[permutation_idx]
        permutation_idx = np.arange(num_valid_examples)
        np.random.shuffle(permutation_idx)
        X_valid = X_valid[permutation_idx, :]
        Y_valid = Y_valid[permutation_idx]

        np.savez_compressed(data_filename, X_train=X_train, Y_train=Y_train, X_test=X_test, Y_test=Y_test,
                            X_valid=X_valid, Y_valid=Y_valid)

    train = DataSet(X_train, Y_train)
    if (X_valid is not None) and (Y_valid is not None):
        validation = DataSet(X_valid, Y_valid)
    else:
        validation = None

    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
Example 30
def load_heart():
    columns = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
        'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'
    ]

    data = pd.read_csv('data/heart/heart.csv',
                       names=columns,
                       sep=' *, *',
                       skiprows=1,
                       na_values='?')
    # test_data = pd.read_csv('data/adult/adult.test', names=columns, sep=' *, *', skiprows=1, na_values='?')

    # data = pd.concat([train_data, test_data])

    print(data.info())
    print(data)
    print(data.shape)

    # # Before data.dropna() #############################################################################################
    # print('data[data.sex == \'Female\'].shape[0]', data[data.sex == 'Female'].shape[0])
    # print('data[data.sex == \'Male\'].shape[0]', data[data.sex == 'Male'].shape[0])
    # # Before data.dropna() #############################################################################################

    # drop rows with missing values (where there is '?')
    # data = data.dropna()
    #
    # print(data.info())
    # print(data)
    # print(data.shape)

    # # count number of occurrences of a specific value ##################################################################
    # # Minority Group - sex (index 9): Female=0(96), Male=1(207)
    print('data[data.sex == 0].shape[0] (Female)',
          data[data.sex == 0].shape[0])
    print('data[data.sex == 1].shape[0] (Male)', data[data.sex == 1].shape[0])
    # count number of occurrences of a specific value ##################################################################

    # data = data.replace({'<=50K.': '<=50K', '>50K.': '>50K'})
    # data = data.replace({'<=50K': 0, '>50K': 1})

    labels = data['target'].values
    data = data.drop(['target'], axis=1)

    data = data.to_numpy()

    print(
        data.shape
    )  # 303 -> [0.7, 0.15, 0.15] = [213, 45, 45], [0.8, 0.1, 0.1] = [243, 30, 30]

    data, labels = shuffle(data, labels, random_state=0)

    # print('data[0]', data[0:10])  # [23, 'Private', 84726, 'Assoc-acdm', 12, 'Married-civ-spouse', 'Farming-fishing', 'Wife', 'White', 'Female', 0, 0, 45, 'Germany']
    # print('data[0].shape', data[0:10].shape)
    # # sex_column = data[:, 9]
    # print('np.argwhere(data[:37998, 9] == \'Female\')', np.argwhere(data[:37998, 9] == 'Female'))

    # Columns that are categorical: (sex, cp, fbs, restecg, exang, slope, ca, thal)
    data_categorical = data[:, [1, 2, 5, 6, 8, 10, 11, 12]]
    data_numerical = data[:, [0, 3, 4, 7, 9]]

    # print('np.asarray(data_categorical).shape', np.asarray(data_categorical).shape)

    # one-hot encoding
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(data_categorical)
    data_categorical_onehot = enc.transform(data_categorical).toarray()

    print('enc.get_feature_names()', enc.get_feature_names())
    # print('enc.get_feature_names().shape', enc.get_feature_names().shape)
    print(np.where(enc.get_feature_names() == 'x0_0.0'))  # Female: 0 + 5
    # print(np.where(enc.get_feature_names() == 'x5_Amer-Indian-Eskimo'))  # Amer-Indian-Eskimo: 50 + 6
    # print(np.where(enc.get_feature_names() == 'x5_Asian-Pac-Islander'))  # Asian-Pac-Islander: 51 + 6
    # print(np.where(enc.get_feature_names() == 'x5_Black'))  # Black: 52 + 6
    # print(np.where(enc.get_feature_names() == 'x5_Other'))  # Other: 53 + 6

    data_num_and_onehot = np.concatenate(
        (data_numerical, data_categorical_onehot), axis=1)

    # print('data_num_and_onehot[0]', data_num_and_onehot[0])
    # print('data_num_and_onehot[0].shape', data_num_and_onehot[0].shape)
    # print('np.argwhere(data_num_and_onehot[:37998, 61] == 1)', np.argwhere(data_num_and_onehot[:37998, 61] == 1))

    train_size = 240  # 240
    validation_size = 30  # 30 #data.shape[0] * 0.1  # fraction_size = 1, validation_size = 2000
    test_size = 33
    train_and_validation_data = data_num_and_onehot[:train_size +
                                                    validation_size]
    test_data = data_num_and_onehot[train_size + validation_size:train_size +
                                    validation_size + test_size]

    # normalize
    scaler = MinMaxScaler()
    scaler.fit(train_and_validation_data)
    train_and_validation_data = scaler.transform(train_and_validation_data)
    test_data = scaler.transform(test_data)

    # print('np.argwhere(train_and_validation_data[:37998, 61] == 1)', np.argwhere(train_and_validation_data[:37998, 61] == 1))
    #
    # print('train_and_validation_data[0]', train_and_validation_data[0])
    # print('train_and_validation_data[0].shape', train_and_validation_data[0].shape)

    # X_train = train_and_validation_data[:train_size]
    # Y_train = labels[:train_size]
    # X_valid = train_and_validation_data[train_size:train_size + validation_size]
    # Y_valid = labels[train_size:train_size + validation_size]

    X_valid = train_and_validation_data[:validation_size]
    Y_valid = labels[:validation_size]
    X_train = train_and_validation_data[validation_size:validation_size +
                                        train_size]
    Y_train = labels[validation_size:validation_size + train_size]

    X_test = test_data
    Y_test = labels[train_size + validation_size:train_size + validation_size +
                    test_size]

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)