def load_file(file, name):
    return HDF5Matrix(file, name)
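# Hypothetical usage of the load_file() helper above; 'data.h5' and 'features'
# are placeholder names, not files from the original project. HDF5Matrix returns
# a lazy, array-like view of the on-disk dataset.
x = load_file('data.h5', 'features')
print(x.shape, x.dtype)   # metadata is available without reading the full array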
num_labels = np.load(datadir + 'mean_and_std.npy').shape[1]

# Define the number of training spectra
num_train = 41000

# Load labels
with h5py.File(training_set, 'r') as F:
    y_train = np.hstack((F['TEFF'][0:num_train], F['LOGG'][0:num_train], F['FE_H'][0:num_train]))
    y_cv = np.hstack((F['TEFF'][num_train:], F['LOGG'][num_train:], F['FE_H'][num_train:]))

# Normalize labels
y_train = normalize(y_train)
y_cv = normalize(y_cv)

# Create the spectra training and cross-validation datasets
x_train = HDF5Matrix(training_set, 'spectrum', start=0, end=num_train)
x_cv = HDF5Matrix(training_set, 'spectrum', start=num_train, end=None)

# Define the number of output labels
num_labels = y_train.shape[1]
num_fluxes = x_train.shape[1]

print('Each spectrum contains ' + str(num_fluxes) + ' wavelength bins')
print('Training set includes ' + str(x_train.shape[0]) + ' spectra and the '
      'cross-validation set includes ' + str(x_cv.shape[0]) + ' spectra')

# Build the StarNet model architecture
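# The normalize() helper used in the label-loading step above is not shown in
# this snippet. A minimal sketch, assuming 'mean_and_std.npy' stores the
# per-label means in row 0 and standard deviations in row 1; this layout is an
# assumption, not the confirmed StarNet implementation.
mean_and_std = np.load(datadir + 'mean_and_std.npy')
mean_labels, std_labels = mean_and_std[0], mean_and_std[1]

def normalize(labels):
    # Scale each label column to zero mean and unit variance
    return (labels - mean_labels) / std_labels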
    # Tail of the Encoder() model builder; x_inp is the Input tensor defined
    # earlier in the (truncated) function definition.
    x = Conv3D(16, kernel_size=7, strides=2, padding="same")(x_inp)
    x = LeakyReLU(0.3)(x)
    x = Conv3D(32, kernel_size=3, strides=2, padding="same")(x)
    x = LeakyReLU(0.3)(x)
    x = Conv3D(64, kernel_size=3, strides=2, padding="same")(x)
    x = LeakyReLU(0.3)(x)
    x = Flatten()(x)
    x = Dense(4096)(x)
    x = LeakyReLU(0.3)(x)
    x = Dense(2048, activation='relu')(x)
    x = Dense(2048)(x)
    model = Model(x_inp, x)
    return model


# Encode the full dataset with the pre-trained encoder
encoder = Encoder(batch_size=100)
encoder.load_weights("Encoder.h5")
encoder.summary()

fhkl = HDF5Matrix("dataset.h5", "X", end=82700)
ads = HDF5Matrix("dataset.h5", "Y", end=82700)
L = len(ads)

encoded_fhkl = encoder.predict(fhkl, batch_size=100, verbose=1)

# Write the encoded features and the targets to a new HDF5 file
with h5py.File("dataset-encoded.h5", "w") as outh5:
    X = outh5.create_dataset("X", (L, 2048))
    Y = outh5.create_dataset("Y", (L, 2))
    X[:, :] = np.asarray(encoded_fhkl)
    Y[:, :] = np.asarray(ads)
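# Quick sanity check on the encoded file written above; the dataset names and
# shapes mirror what was just created.
with h5py.File("dataset-encoded.h5", "r") as f:
    print(f["X"].shape, f["Y"].shape)   # expected: (82700, 2048) and (82700, 2)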
def train_with_bottlenecks(args, label_map, trainable_model, non_trainable_model,
                           iterations_per_epoch_t, iterations_per_epoch_v):
    # Create (or load) the bottleneck features of the non-trainable base model
    # for the training dataset.
    if args.create_bottleneck:
        training_addr_label_map, train_npy_dir, h5py_file_train = create_bottlenecks_h5py(
            "train", label_map, args.train, non_trainable_model)
        # multiprocess_bottleneck_creation("train", label_map, args.train, non_trainable_model)
        # training_addr_label_map, train_npy_dir = create_npy_class_map("train", args)

        # Write the dictionaries to disk so that we needn't loop over the data again
        with open("essential_files/train_addr_label_map.txt", "wb") as file:
            pickle.dump(training_addr_label_map, file)
        with open("essential_files/train_npy_dir.txt", "wb") as file:
            pickle.dump(train_npy_dir, file)

    if not args.create_bottleneck:
        with open("essential_files/train_addr_label_map.txt", "rb") as file:
            print("[INFO] (Training) Loading Address to Label Map from Disk")
            training_addr_label_map = pickle.load(file)
        with open("essential_files/train_npy_dir.txt", "rb") as file:
            print("[INFO] (Training) Loading Addresses from Disk")
            train_npy_dir = pickle.load(file)

    # Save the bottleneck features of the non-trainable base model for the
    # validation dataset, creating them if they do not exist yet.
    if args.create_bottleneck:
        validation_addr_label_map, val_npy_dir, h5py_file_val = create_bottlenecks_h5py(
            "val", label_map, args.val, non_trainable_model)
        # multiprocess_bottleneck_creation("val", label_map, args.val, non_trainable_model)
        # validation_addr_label_map, val_npy_dir = create_npy_class_map("val", args)

        with open("essential_files/validation_addr_label_map.txt", "wb") as file:
            pickle.dump(validation_addr_label_map, file)
        with open("essential_files/val_npy_dir.txt", "wb") as file:
            pickle.dump(val_npy_dir, file)

    if not args.create_bottleneck:
        with open("essential_files/validation_addr_label_map.txt", "rb") as file:
            print("[INFO] (Validation) Loading Address to Label Map from Disk")
            validation_addr_label_map = pickle.load(file)
        with open("essential_files/val_npy_dir.txt", "rb") as file:
            print("[INFO] (Validation) Loading Addresses from Disk")
            val_npy_dir = pickle.load(file)

    print("[INFO] Loading the bottlenecks")
    print("[INFO] Starting to Train")

    history_information = []
    h5py_file_train = args.bottleneck_dir + '/train/train' + '.h5'
    h5py_file_val = args.bottleneck_dir + '/val/val' + '.h5'
    # h5py_file_train = h5py.File(h5py_file_train, 'r')
    # h5py_file_val = h5py.File(h5py_file_val, 'r')

    print("Printing Trainable model summary")
    print(trainable_model.summary())

    checkpoint = ModelCheckpoint(args.weight_file)
    tb_callback = keras.callbacks.TensorBoard(log_dir=args.logs, histogram_freq=2,
                                              write_graph=True)
    # early_stopping = EarlyStopping(monitor='val_loss')
    callback_list = [checkpoint, tb_callback]  # , early_stopping]

    # HDF5Matrix gives lazy, array-like views over the cached bottlenecks
    x_train = HDF5Matrix(h5py_file_train, 'train')
    y_train = HDF5Matrix(h5py_file_train, 'train_labels')
    x_val = HDF5Matrix(h5py_file_val, 'val')
    y_val = HDF5Matrix(h5py_file_val, 'val_labels')

    print(BATCH_SIZE)
    trainable_model.fit(x_train, y_train,
                        batch_size=BATCH_SIZE,
                        epochs=args.epochs,
                        verbose=1,
                        validation_data=(x_val, y_val),
                        shuffle='batch',
                        callbacks=callback_list)
    '''
    trainable_model.fit_generator(
        load_random_cached_bottlenecks(BATCH_SIZE, label_map, training_addr_label_map,
                                       train_npy_dir, 'h5py', h5py_file_train),
        epochs=EPOCHS,
        steps_per_epoch=iterations_per_epoch_t,
        validation_data=load_random_cached_bottlenecks(BATCH_SIZE, label_map,
                                                       validation_addr_label_map,
                                                       val_npy_dir, 'h5py', h5py_file_val),
        validation_steps=iterations_per_epoch_v,
        workers=1,
        callbacks=callback_list,
        use_multiprocessing=True,
        max_queue_size=32)
    '''
    # loss = trainable_model.train_on_batch(X, Y)
    # history_information.append(loss)
    # if i % 10 == 0:
    #     print(str(datetime.datetime.now()) + "\tPercent to complete: "
    #           + str((iterations_per_epoch_t*EPOCHS - i)*100//(iterations_per_epoch_t*EPOCHS))
    #           + "\t\tEpoch: " + str(epoch) + "\tIteration: " + str(i)
    #           + '\tLoss: ' + str(loss[0]) + "\tTraining_Accuracy: " + str(loss[1]))
    # if epoch % args.saving_ckpts == 0:
    #     trainable_model.save(args.weight_file)
    # for i in range(iterations_per_epoch_v):
    #     X, Y = load_random_cached_bottlenecks(BATCH_SIZE, label_map, validation_addr_label_map,
    #                                           val_npy_dir, 'h5py', h5py_file_val)
    #     loss = trainable_model.test_on_batch(X, Y)
    #     print("\tIteration: " + str(i) + '\tLoss: ' + str(loss[0])
    #           + "\tValidation_Accuracy: " + str(loss[1]))
    # np.save("essential_files/history_training.npy", np.array(history_information))
    print("[INFO] Completed Training!")
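# load_random_cached_bottlenecks() is referenced in the commented-out
# fit_generator() path above but is not defined in this snippet. A minimal
# sketch, assuming the bottleneck HDF5 layout created above (a feature dataset
# plus a matching '<name>_labels' dataset); the random batch sampling here is an
# assumption, not the original implementation.
def load_random_cached_bottlenecks(batch_size, label_map, addr_label_map,
                                   npy_dir, mode, h5py_file):
    f = h5py.File(h5py_file, 'r')
    # The cache files hold e.g. 'train'/'train_labels' or 'val'/'val_labels'
    split = [k for k in f.keys() if not k.endswith('_labels')][0]
    features, labels = f[split], f[split + '_labels']
    n = features.shape[0]
    while True:
        # h5py fancy indexing needs sorted, unique indices
        idx = np.sort(np.random.choice(n, size=batch_size, replace=False))
        yield features[idx], labels[idx]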
"The validation dataset lacks 'num_classes' and 'repetitions' attributes" ) if train_num_classes != valid_num_classes: raise ValueError( "The number of classes in training and validation databases differ" ) num_classes = train_num_classes else: raise ValueError( "The input database lacks training and validation datasets") print("Training and validation data loaded") print("Training data:", num_classes, "classes repeated", train_N, "times") print("Validation data:", num_classes, "classes repeated", valid_N, "times") train_data = HDF5Matrix(db_path, 'training') valid_data = HDF5Matrix(db_path, 'validation') train_labels = np.tile(np.arange(num_classes), (train_N, )) valid_labels = np.tile(np.arange(num_classes), (valid_N, )) print(train_data.shape, train_labels.shape) print(valid_data.shape, valid_labels.shape) if train_data.shape[1] != valid_data.shape[1]: ValueError( "Different model used for training and validation, not allowed") logits_length = train_data.shape[1] # Get info about loaded data additional_info = { 'Logits length': logits_length, 'Number of classes': num_classes, 'DB training repetitions': train_N,
    # Body of the conv_block() helper used below; the function signature is
    # truncated in this snippet.
    filters1, filters2, filters3 = filters
    x = conv(x=input_tensor, filters=filters1, kernel_size=1, strides=strides)
    x = conv(x=x, filters=filters2, kernel_size=kernel_size)
    x = conv(x=x, filters=filters3, kernel_size=1)
    # Projection shortcut so the residual addition has matching shapes
    shortcut = conv(x=input_tensor, filters=filters3, kernel_size=1, strides=strides)
    x = Add()([x, shortcut])
    x = Activation("relu")(x)
    return x


batch_size = 10
L = 60000
X_train = HDF5Matrix("dataset.h5", "X", start=0, end=L)
Y_train = HDF5Matrix("dataset.h5", "Y", start=0, end=L)
X_test = HDF5Matrix("dataset.h5", "X", start=L, end=L + 22730)
Y_test = HDF5Matrix("dataset.h5", "Y", start=L, end=L + 22730)

gen_train = K_gen(X_train, Y_train, batch_size=batch_size)
gen_test = K_gen(X_test, Y_test, batch_size=batch_size)

x_inp = Input(batch_shape=(batch_size, 31, 31, 31, 2))
x = BatchNormalization()(x_inp)
x = GaussianNoise(0.01)(x)
x = conv_block(x, 3, [32, 32, 128])
x = identity_block(x, 3, [32, 32, 128])
x = conv_block(x, 3, [64, 64, 256])
x = identity_block(x, 3, [64, 64, 256])
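# K_gen is used above but its definition is not shown in this snippet. A minimal
# sketch of a batch generator over HDF5Matrix-backed arrays, written as a
# keras.utils.Sequence; the class name and the sequential (non-shuffled)
# batching are assumptions.
from keras.utils import Sequence

class K_gen(Sequence):
    def __init__(self, X, Y, batch_size):
        self.X, self.Y, self.batch_size = X, Y, batch_size

    def __len__(self):
        # Number of full batches per epoch
        return len(self.X) // self.batch_size

    def __getitem__(self, i):
        # Slice one contiguous batch; HDF5Matrix supports lazy slicing
        s = slice(i * self.batch_size, (i + 1) * self.batch_size)
        return np.asarray(self.X[s]), np.asarray(self.Y[s])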
def run(self):
    logger.info('Epochs: {}'.format(self.epochs))
    logger.info('Batch size: {}'.format(self.batch_size))
    logger.info('Model attributes: {}'.format(self.model_attributes))
    logger.info('Output: {}'.format(self.output))
    logger.info('Units: {}'.format(self.units))
    logger.info('')

    if self.load_epoch == 0:
        os.mkdir(self.model_directory)

        data_files_temp = os.listdir(self.data_path)
        data_files = []
        for f in data_files_temp:
            if not f.endswith('.h5'):
                continue
            if f.startswith('test_'):
                continue
            data_files.append(f)
        train_files = sorted(data_files)
        logger.info('{} files will be used for training'.format(len(train_files)))

        model = self.create_and_compile_model()
        for n_epoch in range(1, self.epochs + 1):
            logger.info('')
            logger.info('Processing epoch #{}...'.format(n_epoch))
            for train_file in train_files:
                logger.info('Processing {}...'.format(train_file))
                h5_file_path = os.path.join(self.data_path, train_file)
                train_input = HDF5Matrix(h5_file_path, 'input_data')
                train_output = HDF5Matrix(h5_file_path, 'output_data')
                logger.info('Train data size = {}'.format(train_input.size / self.input_size))
                model.fit(
                    train_input,
                    train_output,
                    # shuffle='batch' is needed for HDF5-backed data
                    shuffle='batch',
                    epochs=1,
                    batch_size=self.batch_size,
                    callbacks=[LoggingCallback(self.graphs_data)]
                )
            logger.info('Predictions after epoch #{}'.format(n_epoch))
            self.calculate_predictions(model, n_epoch)

            # Save the model after each epoch
            logger.info('Saving model, please don\'t interrupt...')
            model_path = os.path.join(self.model_directory, '{}_model.h5'.format(n_epoch))
            model.save(model_path)
            logger.info('Model saved')
    else:
        model_files = os.listdir(self.model_directory)
        model_file = None
        for f in model_files:
            if f.startswith('{}_'.format(self.load_epoch)):
                model_file = f
        model = load_model(os.path.join(self.model_directory, model_file))
        self.calculate_predictions(model, None)

    if self.graphs_data.get('first'):
        self.print_best_result()

    logger.info(json.dumps(self.graphs_data))
    if self.need_visualize:
        show_graphs(self.graphs_data)
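# LoggingCallback is referenced above but not defined in this snippet. A minimal
# sketch, assuming it simply appends the per-epoch metrics into the shared
# graphs_data dict that is later plotted; the key names are assumptions.
from keras.callbacks import Callback

class LoggingCallback(Callback):
    def __init__(self, graphs_data):
        super(LoggingCallback, self).__init__()
        self.graphs_data = graphs_data

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        # Record every metric Keras reports for this (sub-)epoch
        for name, value in logs.items():
            self.graphs_data.setdefault(name, []).append(float(value))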
def exceptUnknown(X, Y):
    # Drop samples whose gender label is unknown (label 0)
    i = 0
    while i < len(Y):
        if Y[i] == 0:
            Y = np.delete(Y, i, axis=0)
            X = np.delete(X, i, axis=0)
        else:
            i += 1
    return (np.transpose(X.astype('float32') / 255.0, [0, 2, 3, 1]),
            np_utils.to_categorical(Y - 1, num_classes=2))  # one-hot encode


# HDF5 data import
X_train = np.array(HDF5Matrix('train.hdf5', 'crops'))
Y_train = extGender(HDF5Matrix('train.hdf5', 'labels'))
X_val = np.array(HDF5Matrix('val.hdf5', 'crops'))
Y_val = extGender(HDF5Matrix('val.hdf5', 'labels'))
X_test = np.array(HDF5Matrix('test.hdf5', 'crops'))
Y_test = extGender(HDF5Matrix('test.hdf5', 'labels'))

X_train, Y_train = exceptUnknown(X_train, Y_train)
X_val, Y_val = exceptUnknown(X_val, Y_val)
X_test, Y_test = exceptUnknown(X_test, Y_test)


def createCNNmodel(num_classes):
    model = Sequential()
    model.add(
    # Tail of the MOFNN() model builder; x and x_inp are defined earlier in the
    # (truncated) function definition.
    x = Dense(512, **kwargs)(x)
    x = Dense(1)(x)
    x = GaussianNoise(0.05)(x)
    model = Model(x_inp, x)
    return model


batch_size = 100
optimizer = Adadelta(lr=1.0e-2, clipvalue=.1)
mofnn = MOFNN(batch_size=batch_size)
mofnn.compile(optimizer=optimizer, loss="mse",
              metrics=["mean_absolute_percentage_error"])
mofnn.summary()

# Encoded features and targets written by the encoder step
X_train = HDF5Matrix("dataset-encoded.h5", "X", end=60000)
Y_train = HDF5Matrix("dataset-encoded.h5", "Y", end=60000)
X_valid = HDF5Matrix("dataset-encoded.h5", "X", start=60000)
Y_valid = HDF5Matrix("dataset-encoded.h5", "Y", start=60000)

gen_train = Gen(X_train, Y_train, batch_size=batch_size)
gen_test = Gen(X_valid, Y_valid, batch_size=batch_size)

chkpntr = ModelCheckpoint(filepath="1_bar_encoded.h5", save_best_only=True,
                          verbose=1, save_weights_only=True)
lrrdcr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3,
                           min_lr=1e-7, verbose=1)
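# A sketch of how the pieces above would typically be wired together with the
# old Keras generator API; the epoch count and the step counts are assumptions,
# not values from the original script.
mofnn.fit_generator(gen_train,
                    steps_per_epoch=len(X_train) // batch_size,
                    epochs=100,
                    validation_data=gen_test,
                    validation_steps=len(X_valid) // batch_size,
                    callbacks=[chkpntr, lrrdcr],
                    verbose=1)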
def load_data():
    """Loads the PCam dataset.

    # Returns
        Three tuples of HDF5Matrix data and metadata DataFrames:
        `(x_train, y_train, meta_train), (x_valid, y_valid, meta_valid),
        (x_test, y_test, meta_test)`.
    """
    dirname = os.path.join('datasets', 'pcam')
    base = 'https://drive.google.com/uc?export=download&id='
    try:
        y_train = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_train_y.h5',
                     origin=base + '1269yhu3pZDP8UYFQs-NYs3FPwuK-nGSG',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'y')
        x_valid = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_valid_x.h5',
                     origin=base + '1hgshYGWK8V-eGRy8LToWJJgDU_rXWVJ3',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'x')
        y_valid = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_valid_y.h5',
                     origin=base + '1bH8ZRbhSVAhScTS0p9-ZzGnX91cHT3uO',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'y')
        x_test = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_test_x.h5',
                     origin=base + '1qV65ZqZvWzuIVthK8eVDhIwrbnsJdbg_',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'x')
        y_test = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_test_y.h5',
                     origin=base + '17BHrSrwWKjYsOgTMmoqrIjDy6Fa2o_gP',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'y')
        meta_train = pd.read_csv(
            get_file('camelyonpatch_level_2_split_train_meta.csv',
                     origin=base + '1XoaGG3ek26YLFvGzmkKeOz54INW0fruR',
                     cache_subdir=dirname))
        meta_valid = pd.read_csv(
            get_file('camelyonpatch_level_2_split_valid_meta.csv',
                     origin=base + '16hJfGFCZEcvR3lr38v3XCaD5iH1Bnclg',
                     cache_subdir=dirname))
        meta_test = pd.read_csv(
            get_file('camelyonpatch_level_2_split_test_meta.csv',
                     origin=base + '19tj7fBlQQrd4DapCjhZrom_fA4QlHqN4',
                     cache_subdir=dirname))
        x_train = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_train_x.h5',
                     origin=base + '1Ka0XfEMiwgCYPdTI-vv6eUElOBnKFKQ2',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'x')
    except OSError:
        raise NotImplementedError(
            'Direct download currently not working. Please go to '
            'https://drive.google.com/drive/folders/1gHou49cA1s5vua2V5L98Lt8TiWA3FrKB '
            'and press "Download all", then place the (ungzipped) files in '
            '~/.keras/datasets/pcam.')

    if K.image_data_format() == 'channels_first':
        raise NotImplementedError()

    return (x_train, y_train, meta_train), (x_valid, y_valid, meta_valid), (x_test, y_test, meta_test)
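# Hypothetical usage of load_data(); the returned HDF5Matrix objects are lazy
# views, so shapes are available without reading the patches into memory, and
# they are typically passed to model.fit() with shuffle='batch'.
(x_train, y_train, meta_train), (x_valid, y_valid, meta_valid), (x_test, y_test, meta_test) = load_data()
print('train:', x_train.shape, 'valid:', x_valid.shape, 'test:', x_test.shape)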