def get_feature_vectors(model):
    # Fold the validation examples into the training features; the validation split
    # is intentionally left empty downstream.
    train_feature_vectors = np.concatenate(
        (model.sess.run(model.feature_vector, feed_dict=model.all_train_feed_dict),
         model.sess.run(model.feature_vector, feed_dict=model.all_validation_feed_dict)))
    validation_feature_vectors = np.empty([0, 32])
    test_feature_vectors = model.sess.run(model.feature_vector, feed_dict=model.all_test_feed_dict)
    # validation_feature_vectors = model.sess.run(model.feature_vector, feed_dict=model.all_validation_feed_dict)

    train_labels = np.concatenate(
        (model.data_sets.train.labels, model.data_sets.validation.labels))
    validation_labels = np.empty([0])
    test_labels = model.data_sets.test.labels

    # print('train_feature_vectors.shape', train_feature_vectors.shape)
    print('test_feature_vectors.shape', test_feature_vectors.shape)
    print('validation_feature_vectors.shape', validation_feature_vectors.shape)
    print('train_labels.shape', train_labels.shape)
    print('test_labels.shape', test_labels.shape)
    print('validation_labels.shape', validation_labels.shape)

    train = DataSet(train_feature_vectors, train_labels)
    validation = DataSet(validation_feature_vectors, validation_labels)
    test = DataSet(test_feature_vectors, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
def load_fashion_mnist_A(validation_size=5000):
    # Note: tf.keras.datasets.fashion_mnist requires TensorFlow >= 1.4 (keras was only
    # bundled into TF from 1.4, and fashion_mnist was added to keras.datasets later still).
    # TensorFlow 1.13 also works for run_spam_experiment, so keep TF at that version;
    # only spacy needed a downgrade.
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train_images = train_images.astype(np.float32) / 255
    validation_images = validation_images.astype(np.float32) / 255
    test_images = test_images.astype(np.float32) / 255

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
def load_spam(n=None):
    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = process_spam(n)

    # Convert the sparse matrices to dense arrays
    X_train = X_train.toarray()
    X_valid = X_valid.toarray()
    X_test = X_test.toarray()

    print('type(X_train)', type(X_train))
    print('type(Y_train)', type(Y_train))
    print('X_train.shape', X_train.shape)
    print('Y_train.shape', Y_train.shape)
    print('X_valid.shape', X_valid.shape)
    print('Y_valid.shape', Y_valid.shape)
    print('X_test.shape', X_test.shape)
    print('Y_test.shape', Y_test.shape)

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
def load_cifar():
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()

    # Convert from channels-first to channels-last (assumes the loader returns NCHW arrays)
    x_train = x_train.transpose(0, 2, 3, 1)
    x_test = x_test.transpose(0, 2, 3, 1)

    train = DataSet(x_train, y_train.flatten())
    test = DataSet(x_test, y_test.flatten())
    validation = None

    return base.Datasets(train=train, validation=validation, test=test)
def load_mnist(train_dir, validation_size=5000):
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f)

    local_file = base.maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train_images = train_images.astype(np.float32) / 255
    validation_images = validation_images.astype(np.float32) / 255
    test_images = test_images.astype(np.float32) / 255

    print('train_images.shape', train_images.shape)
    print('validation_images.shape', validation_images.shape)
    print('test_images.shape', test_images.shape)

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
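# Illustrative usage sketch (not part of the original module): downloads MNIST into
# train_dir, builds the train/validation/test Datasets tuple, and reports the split
# sizes.  Assumes this module's imports (numpy as np, DataSet, base, extract_images,
# extract_labels) are in scope and that train_dir is writable.
def _demo_load_mnist(train_dir='data'):
    data_sets = load_mnist(train_dir, validation_size=5000)
    for split_name, split in [('train', data_sets.train),
                              ('validation', data_sets.validation),
                              ('test', data_sets.test)]:
        print('%s: x %s, labels %s' % (split_name, split.x.shape, split.labels.shape))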
def load_spam(n=None):
    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = process_spam(n)

    # Convert them to dense matrices
    X_train = X_train.toarray()
    X_valid = X_valid.toarray()
    X_test = X_test.toarray()

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
def load_toy(from_file=False):
    """
    Create a dataset object that can be passed to the training scripts.
    If from_file == True, load from already saved data.
    """
    if from_file:
        data = np.load('data/toy_2d.npz')
        x_train, y_train, x_test, y_test = (data['x_train'], data['y_train'],
                                            data['x_test'], data['y_test'])
    else:
        x_train, x_test, y_train, y_test = generate_toy_2d()

    train = DataSet(x_train, y_train)
    test = DataSet(x_test, y_test)
    validation = None

    return base.Datasets(train=train, validation=validation, test=test)
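# Illustrative sketch (not part of the original module): writes the 'data/toy_2d.npz'
# file that load_toy(from_file=True) expects, using the same keys it reads back.
# Assumes generate_toy_2d() is available and returns (x_train, x_test, y_train, y_test),
# matching the else-branch above.
def _save_toy_2d(path='data/toy_2d.npz'):
    x_train, x_test, y_train, y_test = generate_toy_2d()
    np.savez(path, x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)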
def retrain(self, num_steps, feed_dict):
    retrain_dataset = DataSet(feed_dict[self.input_placeholder],
                              feed_dict[self.labels_placeholder])
    for step in xrange(num_steps):
        iter_feed_dict = self.fill_feed_dict_with_batch(retrain_dataset)
        self.sess.run(self.train_op, feed_dict=iter_feed_dict)
def generate_inception_features(model, poisoned_X_train_subset, labels_subset, batch_size=None):
    poisoned_train = DataSet(poisoned_X_train_subset, labels_subset)
    poisoned_data_sets = base.Datasets(train=poisoned_train, validation=None, test=None)

    if batch_size is None:
        batch_size = len(labels_subset)

    num_examples = poisoned_data_sets.train.num_examples
    assert num_examples % batch_size == 0
    num_iter = int(num_examples / batch_size)

    poisoned_data_sets.train.reset_batch()

    inception_features_val = []
    print(np.shape(poisoned_data_sets.train.x))
    for i in range(num_iter):
        feed_dict = model.fill_feed_dict_with_batch(poisoned_data_sets.train, batch_size=batch_size)
        inception_features_val_temp = model.sess.run(model.inception_features, feed_dict=feed_dict)
        inception_features_val.append(inception_features_val_temp)

    return np.concatenate(inception_features_val)
def load_yelp(train_dir):
    train = np.loadtxt("%s/yelp-ex.train.rating" % train_dir, delimiter='\t')
    valid = np.loadtxt("%s/yelp-ex.valid.rating" % train_dir, delimiter='\t')
    test = np.loadtxt("%s/yelp-ex.test.rating" % train_dir, delimiter='\t')

    train_input = train[:628881, :2].astype(np.int32)
    train_output = train[:628881, 2]
    valid_input = valid[:, :2].astype(np.int32)
    valid_output = valid[:, 2]
    test_input = test[:51153, :2].astype(np.int32)
    test_output = test[:51153, 2]

    train = DataSet(train_input, train_output)
    validation = DataSet(valid_input, valid_output)
    test = DataSet(test_input, test_output)

    return base.Datasets(train=train, validation=validation, test=test)
def update_train_x(self, new_train_x):
    assert np.all(new_train_x.shape == self.data_sets.train.x.shape)
    new_train = DataSet(new_train_x, np.copy(self.data_sets.train.labels))
    self.data_sets = base.Datasets(train=new_train,
                                   validation=self.data_sets.validation,
                                   test=self.data_sets.test)
    self.all_train_feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.train)
    self.reset_datasets()
def update_test_x_y(self, new_test_x, new_test_y):
    new_test = DataSet(new_test_x, new_test_y)
    self.data_sets = base.Datasets(train=self.data_sets.train,
                                   validation=self.data_sets.validation,
                                   test=new_test)
    self.all_test_feed_dict = self.fill_feed_dict_with_all_ex(self.data_sets.test)
    self.num_test_examples = len(new_test_y)
    self.reset_datasets()
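# Illustrative sketch (not part of the original code): the usual pattern for these
# update_* helpers is to swap in modified data and then retrain on it.  The method and
# attribute names below (data_sets, update_train_x, retrain, all_train_feed_dict) are
# only the ones shown in this file; this assumes the model object exposes them and that
# all_train_feed_dict is keyed on the same input/label placeholders that retrain() reads.
def _perturb_and_retrain(model, perturbation, num_steps=1000):
    new_train_x = np.copy(model.data_sets.train.x) + perturbation  # same shape as train.x
    model.update_train_x(new_train_x)
    model.retrain(num_steps, feed_dict=model.all_train_feed_dict)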
def load_spam(ex_to_leave_out=None, num_examples=None):
    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = process_spam(
        ex_to_leave_out, num_examples)

    # Convert them to dense matrices
    X_train = X_train.toarray()
    if X_valid is not None:
        X_valid = X_valid.toarray()
    X_test = X_test.toarray()

    train = DataSet(X_train, Y_train)
    if X_valid is not None:
        validation = DataSet(X_valid, Y_valid)
    else:
        validation = None
    test = DataSet(X_test, Y_test)

    # print(X_train[1])

    return base.Datasets(train=train, validation=validation, test=test)
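# Illustrative sketch (not part of the original code): how the ex_to_leave_out parameter
# is typically exercised, reloading the spam data with one training example removed at a
# time (e.g. for leave-one-out retraining experiments).  num_examples_to_check is a
# hypothetical name used only for this example.
def _leave_one_out_spam_datasets(num_examples_to_check=10):
    for i in range(num_examples_to_check):
        data_sets = load_spam(ex_to_leave_out=i)
        print('left out example %d: train size %d' % (i, data_sets.train.x.shape[0]))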
def load_movielens(train_dir, validation_size=5000):
    # Note: validation_size is currently unused; the validation split comes directly
    # from the pre-split .valid.rating file.
    train = np.loadtxt("%s/ml-1m-ex.train.rating" % train_dir, delimiter='\t')
    valid = np.loadtxt("%s/ml-1m-ex.valid.rating" % train_dir, delimiter='\t')
    test = np.loadtxt("%s/ml-1m-ex.test.rating" % train_dir, delimiter='\t')

    train_input = train[:975460, :2].astype(np.int32)
    train_output = train[:975460, 2]
    valid_input = valid[:, :2].astype(np.int32)
    valid_output = valid[:, 2]
    # test_input = test[:-1, :2].astype(np.int32)
    # test_output = test[:-1, 2]
    test_input = test[:, :2].astype(np.int32)
    test_output = test[:, 2]

    train = DataSet(train_input, train_output)
    validation = DataSet(valid_input, valid_output)
    test = DataSet(test_input, test_output)

    return base.Datasets(train=train, validation=validation, test=test)
def generate_inception_features(model, poisoned_X_train_subset, labels_subset, batch_size=None):
    poisoned_train = DataSet(poisoned_X_train_subset, labels_subset)

    if batch_size is None:
        batch_size = len(labels_subset)

    assert len(poisoned_X_train_subset) % batch_size == 0
    num_iter = int(len(poisoned_X_train_subset) / batch_size)

    poisoned_train.reset_batch()

    inception_features_val = []
    for i in xrange(num_iter):
        inception_features_val_temp = model.generate_inception_features(poisoned_train, batch_size)
        inception_features_val.append(inception_features_val_temp)

    return np.concatenate(inception_features_val)
def load_heart_disease(ex_to_leave_out=None, num_examples=None):
    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = process_heart_disease(
        ex_to_leave_out, num_examples)

    # Convert everything to NumPy arrays
    Y_train = np.array(Y_train)
    Y_valid = np.array(Y_valid)
    Y_test = np.array(Y_test)
    X_train = np.array(X_train)
    if X_valid is not None:
        X_valid = np.array(X_valid)
    X_test = np.array(X_test)

    train = DataSet(X_train, Y_train)
    if X_valid is not None:
        validation = DataSet(X_valid, Y_valid)
    else:
        validation = None
    test = DataSet(X_test, Y_test)

    # print(X_train[1])

    return base.Datasets(train=train, validation=validation, test=test)
def load_dogfish_with_koda():
    classes = ['dog', 'fish']
    X_test, Y_test = load_koda()

    data_sets = load_animals(num_train_ex_per_class=900,
                             num_test_ex_per_class=300,
                             num_valid_ex_per_class=0,
                             classes=classes)
    train = data_sets.train
    validation = data_sets.validation
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
def load_small_mnist(train_dir, validation_size=5000, random_seed=0):
    np.random.seed(random_seed)
    data_sets = load_mnist(train_dir, validation_size)

    # Keep a random 10% of the training set
    train_images = data_sets.train.x
    train_labels = data_sets.train.labels
    perm = np.arange(len(train_labels))
    np.random.shuffle(perm)
    num_to_keep = int(len(train_labels) / 10)
    perm = perm[:num_to_keep]
    train_images = train_images[perm, :]
    train_labels = train_labels[perm]

    # The validation and test sets are kept whole (the same 10% subsampling could be
    # applied to them with the same perm/num_to_keep pattern as above).
    validation_images = data_sets.validation.x
    validation_labels = data_sets.validation.labels
    test_images = data_sets.test.x
    test_labels = data_sets.test.labels

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
def load_dogfish_with_orig_and_koda():
    classes = ['dog', 'fish']
    X_test, Y_test = load_koda()
    X_test = np.reshape(X_test, (X_test.shape[0], -1))

    data_sets = load_animals(num_train_ex_per_class=900,
                             num_test_ex_per_class=300,
                             num_valid_ex_per_class=0,
                             classes=classes)
    train = data_sets.train
    validation = data_sets.validation

    test = DataSet(
        np.concatenate((data_sets.test.x, X_test), axis=0),
        np.concatenate((data_sets.test.labels, Y_test), axis=0))

    return base.Datasets(train=train, validation=validation, test=test)
## RBF

input_channels = 1
weight_decay = 0.001
batch_size = num_train
initial_learning_rate = 0.001
keep_probs = None
max_lbfgs_iter = 1000
use_bias = False
decay_epochs = [1000, 10000]

tf.reset_default_graph()

X_train = image_data_sets.train.x
Y_train = image_data_sets.train.labels * 2 - 1
train = DataSet(L_train, Y_train)
test = DataSet(L_test, Y_test)

data_sets = base.Datasets(train=train, validation=None, test=test)
input_dim = data_sets.train.x.shape[1]

# Train with hinge
rbf_model = SmoothHinge(
    temp=0,
    use_bias=use_bias,
    input_dim=input_dim,
    weight_decay=weight_decay,
    num_classes=num_classes,
    batch_size=batch_size,
    data_sets=data_sets,
    initial_learning_rate=initial_learning_rate,
print('Inception features do not exist. Generating %s...' % label)
data_set.reset_batch()
num_examples = data_set.num_examples
assert num_examples % batch_size == 0

inception_features_val = generate_inception_features(
    full_model,
    data_set.x,
    data_set.labels,
    batch_size=batch_size)

np.savez(inception_features_path,
         inception_features_val=inception_features_val,
         labels=data_set.labels)

train_f = np.load('output/%s_inception_features_new_train.npz' % dataset_name)
train = DataSet(train_f['inception_features_val'], train_f['labels'])
test_f = np.load('output/%s_inception_features_new_test.npz' % dataset_name)
test = DataSet(test_f['inception_features_val'], test_f['labels'])
validation = None

inception_data_sets = base.Datasets(train=train, validation=validation, test=test)

print('*** Top:')
with top_graph.as_default():
    top_model_name = '%s_inception_onlytop_wd-%s' % (dataset_name, weight_decay)
    input_dim = 2048
    top_model = BinaryLogisticRegressionWithLBFGS(
X_test = data_sets.test.x
Y_test = data_sets.test.labels

X_train, Y_train = dataset.filter_dataset(X_train, Y_train, pos_class, neg_class)
X_test, Y_test = dataset.filter_dataset(X_test, Y_test, pos_class, neg_class)

# Round dataset size down to the nearest 100, just for batching convenience
num_train = int(np.floor(len(Y_train) / 100) * 100)
num_test = int(np.floor(len(Y_test) / 100) * 100)
X_train = X_train[:num_train, :]
Y_train = Y_train[:num_train]
X_test = X_test[:num_test, :]
Y_test = Y_test[:num_test]

train = DataSet(X_train, Y_train)
validation = None
test = DataSet(X_test, Y_test)
data_sets = base.Datasets(train=train, validation=validation, test=test)

num_classes = 2
input_side = 28
input_channels = 1
input_dim = input_side * input_side * input_channels
weight_decay = 0.01
use_bias = False
batch_size = 100
initial_learning_rate = 0.001
keep_probs = None
decay_epochs = [1000, 10000]
max_lbfgs_iter = 1000
def load_mnist(train_dir, validation_size=5000):
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir, SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir, SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f)

    local_file = base.maybe_download(TEST_IMAGES, train_dir, SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir, SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    # print(np.shape(train_labels))
    # plt.imshow(np.reshape(train_images[100], (28, 28)), cmap='gray', interpolation='none')

    # Optional: restrict to a binary 3-vs-5 problem.
    # train_images = train_images[np.where((train_labels == 3) | (train_labels == 5))[0]]
    # train_labels = train_labels[np.where((train_labels == 3) | (train_labels == 5))[0]]
    # test_images = test_images[np.where((test_labels == 3) | (test_labels == 5))[0]]
    # test_labels = test_labels[np.where((test_labels == 3) | (test_labels == 5))[0]]
    # validation_images = validation_images[np.where((validation_labels == 3) | (validation_labels == 5))[0]]
    # validation_labels = validation_labels[np.where((validation_labels == 3) | (validation_labels == 5))[0]]

    train_images = train_images.astype(np.float32) / 255
    validation_images = validation_images.astype(np.float32) / 255
    test_images = test_images.astype(np.float32) / 255

    # train_labels = label_binarize(train_labels, classes=[3, 5])[:, 0]
    # test_labels = label_binarize(test_labels, classes=[3, 5])[:, 0]
    # validation_labels = label_binarize(validation_labels, classes=[3, 5])[:, 0]

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
def load_diabetes():
    columns = [
        'encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
        'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
        'time_in_hospital', 'payer_code', 'medical_specialty',
        'num_lab_procedures', 'num_procedures', 'num_medications',
        'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
        'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
        'tolazamide', 'examide', 'citoglipton', 'insulin',
        'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'
    ]
    data = pd.read_csv('data/diabetes/diabetic_data.csv',
                       names=columns,
                       sep=' *, *',
                       skiprows=1,
                       na_values='?')

    # race, weight, payer_code, medical_specialty, diag_1, diag_2, diag_3 have missing values.
    # Drop rows with missing values ('?') for race or the diagnoses, and rows with unknown gender.
    data = data.dropna(axis=0, subset=['race', 'diag_1', 'diag_2', 'diag_3'])
    data = data[data.gender != 'Unknown/Invalid']

    # Drop the id columns, plus weight, payer_code, and medical_specialty, which have too many
    # missing values and are probably not useful.
    data = data.drop([
        'encounter_id', 'patient_nbr', 'weight', 'payer_code',
        'medical_specialty'
    ], axis=1)

    # Group sizes after cleaning:
    # race (index 8): Caucasian(75079), Asian(625), AfricanAmerican(18881), Hispanic(1984), Other(1483)
    # gender (index 9): Female(52833), Male(45219)
    print('data[data.race == \'Caucasian\'].shape[0]', data[data.race == 'Caucasian'].shape[0])
    print('data[data.race == \'Asian\'].shape[0]', data[data.race == 'Asian'].shape[0])
    print('data[data.race == \'AfricanAmerican\'].shape[0]', data[data.race == 'AfricanAmerican'].shape[0])
    print('data[data.race == \'Hispanic\'].shape[0]', data[data.race == 'Hispanic'].shape[0])
    print('data[data.race == \'Other\'].shape[0]', data[data.race == 'Other'].shape[0])
    print('data[data.gender == \'Female\'].shape[0]', data[data.gender == 'Female'].shape[0])
    print('data[data.gender == \'Male\'].shape[0]', data[data.gender == 'Male'].shape[0])

    # Convert the readmission label to binary classes
    data[['readmitted']] = data[['readmitted']].replace({'<30': 1, '>30': 1, 'NO': 0})

    # Extract labels from data
    labels = data['readmitted'].values
    data = data.drop(['readmitted'], axis=1)

    data = data.to_numpy()
    print(data.shape)  # 98052 -> [0.7, 0.15, 0.15] = [68636, 14708, 14708]

    data, labels = shuffle(data, labels, random_state=0)

    # Split categorical and numerical columns
    data_categorical = data[:, [
        0, 1, 2, 3, 4, 5, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
        27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43
    ]]
    data_numerical = data[:, [6, 7, 8, 9, 10, 11, 12, 16]]
    print('np.asarray(data_categorical).shape', np.asarray(data_categorical).shape)
    print('np.asarray(data_numerical).shape', np.asarray(data_numerical).shape)

    # One-hot encode the categorical columns
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(data_categorical)
    data_categorical_onehot = enc.transform(data_categorical).toarray()

    print('enc.get_feature_names()', enc.get_feature_names()[:15])
    print('enc.get_feature_names().shape', enc.get_feature_names().shape)
    print(np.where(enc.get_feature_names() == 'x0_AfricanAmerican'))  # AfricanAmerican: 0 + 8
    print(np.where(enc.get_feature_names() == 'x0_Asian'))            # Asian: 1 + 8
    print(np.where(enc.get_feature_names() == 'x0_Caucasian'))        # Caucasian: 2 + 8
    print(np.where(enc.get_feature_names() == 'x0_Hispanic'))         # Hispanic: 3 + 8
    print(np.where(enc.get_feature_names() == 'x0_Other'))            # Other: 4 + 8
    print(np.where(enc.get_feature_names() == 'x1_Female'))           # Female: 5 + 8
    print(np.where(enc.get_feature_names() == 'x1_Male'))             # Male: 6 + 8

    data_num_and_onehot = np.concatenate((data_numerical, data_categorical_onehot), axis=1)
    print('data_num_and_onehot.shape', data_num_and_onehot.shape)

    train_size = 84000
    validation_size = 7000
    test_size = 7052

    train_and_validation_data = data_num_and_onehot[:train_size + validation_size]
    test_data = data_num_and_onehot[train_size + validation_size:train_size + validation_size + test_size]

    # Normalize: fit the scaler on train+validation only, then apply it to the test split
    scaler = MinMaxScaler()
    scaler.fit(train_and_validation_data)
    train_and_validation_data = scaler.transform(train_and_validation_data)
    test_data = scaler.transform(test_data)

    X_valid = train_and_validation_data[:validation_size]
    Y_valid = labels[:validation_size]
    X_train = train_and_validation_data[validation_size:validation_size + train_size]
    Y_train = labels[validation_size:validation_size + train_size]
    X_test = test_data
    Y_test = labels[train_size + validation_size:train_size + validation_size + test_size]

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
def load_fashion_mnist(validation_size=5000):
    # Note: tf.keras.datasets.fashion_mnist requires TensorFlow >= 1.4; TensorFlow 1.13
    # also works for run_spam_experiment, so keep TF at that version (only spacy needed
    # a downgrade).
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

    # Debug: visualize one example
    # plt.imshow(train_images[0])
    # plt.show()

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    # Add a trailing channel dimension: (N, 28, 28) -> (N, 28, 28, 1)
    train_images = np.reshape(
        train_images,
        (train_images.shape[0], train_images.shape[1], train_images.shape[2], 1))
    test_images = np.reshape(
        test_images,
        (test_images.shape[0], test_images.shape[1], test_images.shape[2], 1))

    # (Shrinking the dataset to 60% was tried to check whether the error seen with
    # 6 classes came from the dataset being too small; it did not.)

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train_images = train_images.astype(np.float64) / 255
    validation_images = validation_images.astype(np.float64) / 255
    test_images = test_images.astype(np.float64) / 255

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
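# Illustrative usage sketch (not part of the original module): loads Fashion-MNIST and
# reports the per-class counts of the training split, a quick sanity check that the
# validation split above did not skew the class balance.
def _demo_fashion_mnist_class_balance():
    data_sets = load_fashion_mnist(validation_size=5000)
    classes, counts = np.unique(data_sets.train.labels, return_counts=True)
    print('train classes:', classes)
    print('train counts: ', counts)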
def load_2_class_fashion_mnist(validation_size=1000):
    # Note: tf.keras.datasets.fashion_mnist requires TensorFlow >= 1.4; see load_fashion_mnist.
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

    # Pool the original train/test splits, keep only classes 0 and 1, then re-split below.
    images = np.concatenate((train_images, test_images), axis=0)
    labels = np.concatenate((train_labels, test_labels), axis=0)
    print('images.shape', images.shape)
    print('labels.shape', labels.shape)

    class_0_indices = np.argwhere(labels == 0)
    class_1_indices = np.argwhere(labels == 1)
    print('class_0_indices.shape', class_0_indices.shape)
    class_0_indices = np.reshape(class_0_indices, (class_0_indices.shape[0], ))
    class_1_indices = np.reshape(class_1_indices, (class_1_indices.shape[0], ))
    print('class_0_indices.reshape', class_0_indices.shape)

    reduced_class_indices = np.concatenate((class_0_indices, class_1_indices))
    print('reduced_class_indices.shape', reduced_class_indices.shape)

    images = images[reduced_class_indices]
    labels = labels[reduced_class_indices]
    images, labels = shuffle(images, labels, random_state=0)
    print('images.shape', images.shape)
    print('labels.shape', labels.shape)
    print('np.unique(labels)', np.unique(labels))

    train_size = 12000

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    train_images = images[validation_size:train_size]
    train_labels = labels[validation_size:train_size]
    test_images = images[train_size:]
    test_labels = labels[train_size:]

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    print('train_images.shape', train_images.shape)
    print('train_labels.shape', train_labels.shape)
    print('test_images.shape', test_images.shape)
    print('test_labels.shape', test_labels.shape)
    print('validation_images.shape', validation_images.shape)
    print('validation_labels.shape', validation_labels.shape)

    train_images = train_images.astype(np.float32) / 255
    validation_images = validation_images.astype(np.float32) / 255
    test_images = test_images.astype(np.float32) / 255

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
def load_adult():
    columns = [
        "age", "workClass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country",
        "income"
    ]
    train_data = pd.read_csv('data/adult/adult.data', names=columns, sep=' *, *', na_values='?')
    test_data = pd.read_csv('data/adult/adult.test', names=columns, sep=' *, *', skiprows=1, na_values='?')
    data = pd.concat([train_data, test_data])

    # Drop rows with missing values (where there is '?')
    data = data.dropna()

    # Group sizes after dropna():
    # race (index 8): White(38903), Asian-Pac-Islander(1303), Amer-Indian-Eskimo(435), Other(353), Black(4228)
    # sex (index 9): Female(14695), Male(30527)

    data = data.replace({'<=50K.': '<=50K', '>50K.': '>50K'})
    data = data.replace({'<=50K': 0, '>50K': 1})

    labels = data['income'].values
    data = data.drop(['income'], axis=1)
    data = data.to_numpy()
    print(data.shape)  # 45222 -> [0.7, 0.15, 0.15] = [31656, 6783, 6783]

    data, labels = shuffle(data, labels, random_state=0)

    # Categorical columns: 1, 3, 5, 6, 7, 8, 9, 13; numerical columns: 0, 2, 4, 10, 11, 12
    data_categorical = data[:, [1, 3, 5, 6, 7, 8, 9, 13]]
    data_numerical = data[:, [0, 2, 4, 10, 11, 12]]

    # One-hot encode the categorical columns
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(data_categorical)
    data_categorical_onehot = enc.transform(data_categorical).toarray()

    print('enc.get_feature_names()', enc.get_feature_names())
    print(np.where(enc.get_feature_names() == 'x6_Female'))              # Female: 55 + 6, Male: 56 + 6
    print(np.where(enc.get_feature_names() == 'x5_Amer-Indian-Eskimo'))  # Amer-Indian-Eskimo: 50 + 6
    print(np.where(enc.get_feature_names() == 'x5_Asian-Pac-Islander'))  # Asian-Pac-Islander: 51 + 6
    print(np.where(enc.get_feature_names() == 'x5_Black'))               # Black: 52 + 6
    print(np.where(enc.get_feature_names() == 'x5_Other'))               # Other: 53 + 6
    print(np.where(enc.get_feature_names() == 'x5_White'))               # White: 54 + 6

    data_num_and_onehot = np.concatenate((data_numerical, data_categorical_onehot), axis=1)

    train_size = 38000
    validation_size = 3000

    train_and_validation_data = data_num_and_onehot[:train_size + validation_size]
    test_data = data_num_and_onehot[train_size + validation_size:]

    # Normalize: fit the scaler on train+validation only, then apply it to the test split
    scaler = MinMaxScaler()
    scaler.fit(train_and_validation_data)
    train_and_validation_data = scaler.transform(train_and_validation_data)
    test_data = scaler.transform(test_data)

    X_valid = train_and_validation_data[:validation_size]
    Y_valid = labels[:validation_size]
    X_train = train_and_validation_data[validation_size:validation_size + train_size]
    Y_train = labels[validation_size:validation_size + train_size]
    X_test = test_data
    Y_test = labels[train_size + validation_size:]

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
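# Illustrative sketch (not part of the original code): checks the minority-group column
# bookkeeping above.  Per the index comments, the 6 numerical columns come first and the
# one-hot feature 'x6_Female' sits at offset 55, i.e. column 61 of the final matrix; this
# assumes that layout holds for the fitted encoder (the one-hot column stays 0/1 after
# MinMax scaling).
def _adult_female_count_in_train():
    data_sets = load_adult()
    female_column = data_sets.train.x[:, 61]
    print('female training examples:', int(np.sum(female_column == 1)))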
def load_6_class_fashion_mnist_small(validation_size=3000, fraction_size=0.5):
    # Note: tf.keras.datasets.fashion_mnist requires TensorFlow >= 1.4; see load_fashion_mnist.
    (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

    # Add a trailing channel dimension: (N, 28, 28) -> (N, 28, 28, 1)
    train_images = np.reshape(
        train_images,
        (train_images.shape[0], train_images.shape[1], train_images.shape[2], 1))
    test_images = np.reshape(
        test_images,
        (test_images.shape[0], test_images.shape[1], test_images.shape[2], 1))

    # Pool the original train/test splits, keep only classes 0, 2, 3, 6, 7, 9,
    # subsampled to fraction_size of each class.
    images = np.concatenate((train_images, test_images), axis=0)
    labels = np.concatenate((train_labels, test_labels), axis=0)

    class_0_indices = np.argwhere(labels == 0)
    class_2_indices = np.argwhere(labels == 2)
    class_3_indices = np.argwhere(labels == 3)
    class_6_indices = np.argwhere(labels == 6)
    class_7_indices = np.argwhere(labels == 7)
    class_9_indices = np.argwhere(labels == 9)

    class_0_indices = np.reshape(class_0_indices, (class_0_indices.shape[0], ))[:int(class_0_indices.shape[0] * fraction_size)]
    class_2_indices = np.reshape(class_2_indices, (class_2_indices.shape[0], ))[:int(class_2_indices.shape[0] * fraction_size)]
    class_3_indices = np.reshape(class_3_indices, (class_3_indices.shape[0], ))[:int(class_3_indices.shape[0] * fraction_size)]
    class_6_indices = np.reshape(class_6_indices, (class_6_indices.shape[0], ))[:int(class_6_indices.shape[0] * fraction_size)]
    class_7_indices = np.reshape(class_7_indices, (class_7_indices.shape[0], ))[:int(class_7_indices.shape[0] * fraction_size)]
    class_9_indices = np.reshape(class_9_indices, (class_9_indices.shape[0], ))[:int(class_9_indices.shape[0] * fraction_size)]

    reduced_class_indices = np.concatenate(
        (class_0_indices, class_2_indices, class_3_indices, class_6_indices,
         class_7_indices, class_9_indices))

    images = images[reduced_class_indices]
    labels = labels[reduced_class_indices]
    total_num_samples = images.shape[0]

    # Have to remap the labels to 0, 1, 2, 3, 4, 5 or training won't work
    labels = np.where(labels == 2, 1, labels)
    labels = np.where(labels == 3, 2, labels)
    labels = np.where(labels == 6, 3, labels)
    labels = np.where(labels == 7, 4, labels)
    labels = np.where(labels == 9, 5, labels)

    # images, labels = shuffle(images, labels, random_state=0)

    train_size = int(36000 * fraction_size)
    validation_size = int(validation_size * fraction_size)
    total_num_samples = int(total_num_samples * fraction_size)

    validation_images = images[:validation_size]
    validation_labels = labels[:validation_size]
    train_images = images[validation_size:train_size]
    train_labels = labels[validation_size:train_size]
    test_images = images[train_size:]
    test_labels = labels[train_size:]

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    train_images = train_images.astype(np.float64) / 255
    validation_images = validation_images.astype(np.float64) / 255
    test_images = test_images.astype(np.float64) / 255

    train = DataSet(train_images, train_labels)
    validation = DataSet(validation_images, validation_labels)
    test = DataSet(test_images, test_labels)

    return base.Datasets(train=train, validation=validation, test=test)
def load_animals(num_train_ex_per_class=300,
                 num_test_ex_per_class=100,
                 num_valid_ex_per_class=0,
                 classes=None):
    num_channels = 3
    img_side = 299

    if num_valid_ex_per_class == 0:
        valid_str = ''
    else:
        # Fixed: this previously referenced num_valid_examples, which is not defined until below.
        valid_str = '_valid-%s' % num_valid_ex_per_class

    if classes is None:
        classes = ['dog', 'cat', 'bird', 'fish', 'horse', 'monkey', 'zebra', 'panda', 'lemur', 'wombat']
        data_filename = os.path.join(
            BASE_DIR,
            'dataset_train-%s_test-%s%s.npz' % (num_train_ex_per_class, num_test_ex_per_class, valid_str))
    else:
        data_filename = os.path.join(
            BASE_DIR,
            'dataset_%s_train-%s_test-%s%s.npz' % ('-'.join(classes), num_train_ex_per_class, num_test_ex_per_class, valid_str))

    num_classes = len(classes)
    num_train_examples = num_train_ex_per_class * num_classes
    num_test_examples = num_test_ex_per_class * num_classes
    num_valid_examples = num_valid_ex_per_class * num_classes

    if os.path.exists(data_filename):
        print('Loading animals from disk...')
        f = np.load(data_filename)
        X_train = f['X_train']
        X_test = f['X_test']
        Y_train = f['Y_train']
        Y_test = f['Y_test']

        if 'X_valid' in f:
            X_valid = f['X_valid']
        else:
            X_valid = None

        if 'Y_valid' in f:
            Y_valid = f['Y_valid']
        else:
            Y_valid = None

    else:
        print('Reading animals from raw images...')
        X_train = np.zeros([num_train_examples, img_side, img_side, num_channels])
        X_test = np.zeros([num_test_examples, img_side, img_side, num_channels])
        X_valid = np.zeros([num_valid_examples, img_side, img_side, num_channels])

        Y_train = np.zeros([num_train_examples])
        Y_test = np.zeros([num_test_examples])
        Y_valid = np.zeros([num_valid_examples])

        for class_idx, class_string in enumerate(classes):
            print('class: %s' % class_string)

            # For some reason, a lot of image numbers are skipped, so scan indices
            # until each split has been filled.
            i = 0
            num_filled = 0
            while num_filled < num_train_ex_per_class:
                img_path = os.path.join(BASE_DIR, '%s/%s_%s.JPEG' % (class_string, class_string, i))
                if os.path.exists(img_path):
                    fill(X_train, Y_train, num_filled + (num_train_ex_per_class * class_idx), class_idx, img_path, img_side)
                    num_filled += 1
                i += 1

            num_filled = 0
            while num_filled < num_test_ex_per_class:
                img_path = os.path.join(BASE_DIR, '%s/%s_%s.JPEG' % (class_string, class_string, i))
                if os.path.exists(img_path):
                    fill(X_test, Y_test, num_filled + (num_test_ex_per_class * class_idx), class_idx, img_path, img_side)
                    num_filled += 1
                i += 1

            num_filled = 0
            while num_filled < num_valid_ex_per_class:
                img_path = os.path.join(BASE_DIR, '%s/%s_%s.JPEG' % (class_string, class_string, i))
                if os.path.exists(img_path):
                    fill(X_valid, Y_valid, num_filled + (num_valid_ex_per_class * class_idx), class_idx, img_path, img_side)
                    num_filled += 1
                i += 1

        X_train = preprocess_input(X_train)
        X_test = preprocess_input(X_test)
        X_valid = preprocess_input(X_valid)

        # Shuffle each split with a fixed seed before caching
        np.random.seed(0)
        permutation_idx = np.arange(num_train_examples)
        np.random.shuffle(permutation_idx)
        X_train = X_train[permutation_idx, :]
        Y_train = Y_train[permutation_idx]
        permutation_idx = np.arange(num_test_examples)
        np.random.shuffle(permutation_idx)
        X_test = X_test[permutation_idx, :]
        Y_test = Y_test[permutation_idx]
        permutation_idx = np.arange(num_valid_examples)
        np.random.shuffle(permutation_idx)
        X_valid = X_valid[permutation_idx, :]
        Y_valid = Y_valid[permutation_idx]

        np.savez_compressed(data_filename,
                            X_train=X_train,
                            Y_train=Y_train,
                            X_test=X_test,
                            Y_test=Y_test,
                            X_valid=X_valid,
                            Y_valid=Y_valid)

    train = DataSet(X_train, Y_train)
    if (X_valid is not None) and (Y_valid is not None):
        validation = DataSet(X_valid, Y_valid)
    else:
        validation = None
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)
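# Illustrative usage sketch (not part of the original code): the dog/fish configuration
# used by load_dogfish_with_koda above, loading (or building and then caching) the
# 299x299 animal images.  Assumes BASE_DIR points at the directory of raw
# '<class>/<class>_<i>.JPEG' files or an existing cached .npz.
def _demo_load_dogfish():
    data_sets = load_animals(num_train_ex_per_class=900,
                             num_test_ex_per_class=300,
                             num_valid_ex_per_class=0,
                             classes=['dog', 'fish'])
    print('dogfish train:', data_sets.train.x.shape, 'test:', data_sets.test.x.shape)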
def load_heart():
    columns = [
        'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
        'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'
    ]
    data = pd.read_csv('data/heart/heart.csv',
                       names=columns,
                       sep=' *, *',
                       skiprows=1,
                       na_values='?')
    print(data.info())
    print(data.shape)

    # Group sizes - sex: Female=0 (96), Male=1 (207)
    print('data[data.sex == 0].shape[0] (Female)', data[data.sex == 0].shape[0])
    print('data[data.sex == 1].shape[0] (Male)', data[data.sex == 1].shape[0])

    labels = data['target'].values
    data = data.drop(['target'], axis=1)
    data = data.to_numpy()
    print(data.shape)  # 303 -> [0.7, 0.15, 0.15] = [213, 45, 45]; [0.8, 0.1, 0.1] = [243, 30, 30]

    data, labels = shuffle(data, labels, random_state=0)

    # Categorical columns: sex, cp, fbs, restecg, exang, slope, ca, thal
    data_categorical = data[:, [1, 2, 5, 6, 8, 10, 11, 12]]
    data_numerical = data[:, [0, 3, 4, 7, 9]]

    # One-hot encode the categorical columns
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(data_categorical)
    data_categorical_onehot = enc.transform(data_categorical).toarray()

    print('enc.get_feature_names()', enc.get_feature_names())
    print(np.where(enc.get_feature_names() == 'x0_0.0'))  # Female: 0 + 5

    data_num_and_onehot = np.concatenate((data_numerical, data_categorical_onehot), axis=1)

    train_size = 240
    validation_size = 30
    test_size = 33

    train_and_validation_data = data_num_and_onehot[:train_size + validation_size]
    test_data = data_num_and_onehot[train_size + validation_size:train_size + validation_size + test_size]

    # Normalize: fit the scaler on train+validation only, then apply it to the test split
    scaler = MinMaxScaler()
    scaler.fit(train_and_validation_data)
    train_and_validation_data = scaler.transform(train_and_validation_data)
    test_data = scaler.transform(test_data)

    X_valid = train_and_validation_data[:validation_size]
    Y_valid = labels[:validation_size]
    X_train = train_and_validation_data[validation_size:validation_size + train_size]
    Y_train = labels[validation_size:validation_size + train_size]
    X_test = test_data
    Y_test = labels[train_size + validation_size:train_size + validation_size + test_size]

    train = DataSet(X_train, Y_train)
    validation = DataSet(X_valid, Y_valid)
    test = DataSet(X_test, Y_test)

    return base.Datasets(train=train, validation=validation, test=test)