Example #1
from keras.utils import HDF5Matrix  # keras.utils.io_utils.HDF5Matrix in older releases

def load_file(file, name):
    return HDF5Matrix(file, name)  # lazily opens the HDF5 dataset `name` inside `file`
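A quick usage sketch (the file and dataset names here are placeholders): an HDF5Matrix reads from disk lazily, so the returned object can be indexed like a NumPy array or passed straight to model.fit.

# Hypothetical usage of load_file(); 'spectra.h5' and 'spectrum' are placeholder names.
x = load_file('spectra.h5', 'spectrum')
print(x.shape, x.dtype)    # shape and dtype are taken from the underlying HDF5 dataset
first_batch = x[0:32]      # slicing triggers an actual read from the file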
Example #2
import numpy as np
import h5py
from keras.utils import HDF5Matrix

num_labels = np.load(datadir + 'mean_and_std.npy').shape[1]

# Define the number of training spectra
num_train = 41000

# Load labels
with h5py.File(training_set, 'r') as F:
    y_train = np.hstack((F['TEFF'][0:num_train], F['LOGG'][0:num_train], F['FE_H'][0:num_train]))
    y_cv = np.hstack((F['TEFF'][num_train:], F['LOGG'][num_train:], F['FE_H'][num_train:]))

# Normalize labels
y_train = normalize(y_train)
y_cv = normalize(y_cv)

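# The normalize() helper is not shown in this excerpt. A minimal sketch of what it
# might look like, assuming mean_and_std.npy stores per-label means in row 0 and
# standard deviations in row 1 (a hypothetical layout):
#
#     mean_and_std = np.load(datadir + 'mean_and_std.npy')
#     def normalize(labels):
#         return (labels - mean_and_std[0]) / mean_and_std[1]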
# Create the spectra training and cv datasets
x_train = HDF5Matrix(training_set, 'spectrum', 
                           start=0, end=num_train)
x_cv = HDF5Matrix(training_set, 'spectrum', 
                           start=num_train, end=None)

# Define the number of output labels
num_labels = y_train.shape[1]

num_fluxes = x_train.shape[1]

print('Each spectrum contains ' + str(num_fluxes) + ' wavelength bins')
print('Training set includes ' + str(x_train.shape[0]) + 
      ' spectra and the cross-validation set includes ' + str(x_cv.shape[0])+' spectra')


# **Build the StarNet model architecture**
# 
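The model itself is not included in this excerpt. Below is a minimal sketch in the spirit of a StarNet-style 1-D convolutional network, using the num_fluxes and num_labels variables defined above; the layer sizes are illustrative rather than the published configuration, and the spectra are given a trailing channel axis for Conv1D.

from keras.models import Model
from keras.layers import Input, Reshape, Conv1D, MaxPooling1D, Flatten, Dense

spec_in = Input(shape=(num_fluxes,))
h = Reshape((num_fluxes, 1))(spec_in)                  # add a channel axis for Conv1D
h = Conv1D(4, kernel_size=8, activation='relu')(h)
h = Conv1D(16, kernel_size=8, activation='relu')(h)
h = MaxPooling1D(pool_size=4)(h)
h = Flatten()(h)
h = Dense(256, activation='relu')(h)
h = Dense(128, activation='relu')(h)
out = Dense(num_labels, activation='linear')(h)        # one output per stellar label
model = Model(spec_in, out)
model.compile(optimizer='adam', loss='mse')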
Example #3
def Encoder(batch_size):
    # Assumed reconstruction: the original snippet is truncated above this line.
    # The (31, 31, 31, 2) batch_shape is an assumption taken from the companion
    # example further down, whose Input(batch_shape=(batch_size, 31, 31, 31, 2))
    # reads the same dataset.h5.
    x_inp = Input(batch_shape=(batch_size, 31, 31, 31, 2))
    x = Conv3D(16, kernel_size=7, strides=2, padding="same")(x_inp)
    x = LeakyReLU(0.3)(x)
    x = Conv3D(32, kernel_size=3, strides=2, padding="same")(x)
    x = LeakyReLU(0.3)(x)
    x = Conv3D(64, kernel_size=3, strides=2, padding="same")(x)
    x = LeakyReLU(0.3)(x)
    x = Flatten()(x)
    x = Dense(4096)(x)
    x = LeakyReLU(0.3)(x)
    x = Dense(2048, activation='relu')(x)
    x = Dense(2048)(x)
    model = Model(x_inp, x)
    return model


encoder = Encoder(batch_size=100)
encoder.load_weights("Encoder.h5")
encoder.summary()

fhkl = HDF5Matrix("dataset.h5", "X", end=82700)
ads = HDF5Matrix("dataset.h5", "Y", end=82700)
L = len(ads)

encoded_fhkl = encoder.predict(fhkl, batch_size=100, verbose=1)

with h5py.File("dataset-encoded.h5", "w") as outh5:
    X = outh5.create_dataset("X", (L, 2048))
    Y = outh5.create_dataset("Y", (L, 2))
    X[:, :] = np.asarray(encoded_fhkl)
    Y[:, :] = np.asarray(ads)
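As a quick sanity check (assuming the code above ran to completion), the encoded file can be reopened with h5py to confirm the stored shapes:

import h5py

with h5py.File("dataset-encoded.h5", "r") as f:
    print(f["X"].shape)    # (L, 2048) encoded feature vectors
    print(f["Y"].shape)    # (L, 2) target values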
Example #4

def train_with_bottlenecks(args, label_map, trainable_model,
                           non_trainable_model, iterations_per_epoch_t,
                           iterations_per_epoch_v):
    if args.create_bottleneck:
        training_addr_label_map, train_npy_dir, h5py_file_train = create_bottlenecks_h5py(
            "train", label_map, args.train, non_trainable_model)
        # multiprocess_bottleneck_creation("train", label_map, args.train, non_trainable_model)
        # training_addr_label_map, train_npy_dir = create_npy_class_map("train", args)
        # Write the dictionaries to a text file so that we needn't loop over the data again in the future
        with open("essential_files/train_addr_label_map.txt", "wb") as file:
            pickle.dump(training_addr_label_map, file)
        with open("essential_files/train_npy_dir.txt", "wb") as file:
            pickle.dump(train_npy_dir, file)
    if not args.create_bottleneck:
        with open("essential_files/train_addr_label_map.txt", "rb") as file:
            print("[INFO] (Training)Loading Address to Label Map from Disk")
            training_addr_label_map = pickle.load(file)
        with open("essential_files/train_npy_dir.txt", "rb") as file:
            print("[INFO] (Training)Loading Address from Disk")
            train_npy_dir = pickle.load(file)

    # Save the bottleneck features from the non-trainable base model (validation dataset)

    # Create the bottlenecks if they have not been created yet
    if args.create_bottleneck:
        validation_addr_label_map, val_npy_dir, h5py_file_val = create_bottlenecks_h5py(
            "val", label_map, args.val, non_trainable_model)
        # multiprocess_bottleneck_creation("val", label_map, args.val, non_trainable_model)
        # validation_addr_label_map, val_npy_dir = create_npy_class_map("val", args)
        with open("essential_files/validation_addr_label_map.txt",
                  "wb") as file:
            pickle.dump(validation_addr_label_map, file)
        with open("essential_files/val_npy_dir.txt", "wb") as file:
            pickle.dump(val_npy_dir, file)
    if not args.create_bottleneck:
        with open("essential_files/validation_addr_label_map.txt",
                  "rb") as file:
            print("[INFO] (Validation)Loading Address to Label Map from Disk")
            validation_addr_label_map = pickle.load(file)
        with open("essential_files/val_npy_dir.txt", "rb") as file:
            print("[INFO] (Validation)Loading Address from Disk")
            val_npy_dir = pickle.load(file)

    print("[INFO] Loading the bottlenecks")

    print("[INFO] Starting to Train")
    history_information = []
    h5py_file_train = args.bottleneck_dir + '/train/train' + '.h5'
    h5py_file_val = args.bottleneck_dir + '/val/val' + '.h5'
    # h5py_file_train = h5py.File(h5py_file_train, 'r')
    # h5py_file_val = h5py.File(h5py_file_val, 'r')
    print("Printing Trainable model summary")
    print(trainable_model.summary())
    checkpoint = ModelCheckpoint(args.weight_file)
    tb_callback = keras.callbacks.TensorBoard(log_dir=args.logs,
                                              histogram_freq=2,
                                              write_graph=True)
    # early_stopping = EarlyStopping(monitor = 'val_loss')
    callback_list = [checkpoint, tb_callback]  #, early_stopping]
    x_train = HDF5Matrix(h5py_file_train, 'train')
    y_train = HDF5Matrix(h5py_file_train, 'train_labels')
    x_val = HDF5Matrix(h5py_file_val, 'val')
    y_val = HDF5Matrix(h5py_file_val, 'val_labels')
    print(BATCH_SIZE)
    trainable_model.fit(x_train,
                        y_train,
                        batch_size=BATCH_SIZE,
                        epochs=args.epochs,
                        verbose=1,
                        validation_data=(x_val, y_val),
                        shuffle='batch',
                        callbacks=callback_list)
    '''
    trainable_model.fit_generator(
        load_random_cached_bottlenecks(BATCH_SIZE, label_map, training_addr_label_map,
                                       train_npy_dir, 'h5py', h5py_file_train),
        epochs=EPOCHS, steps_per_epoch=iterations_per_epoch_t,
        validation_data=load_random_cached_bottlenecks(BATCH_SIZE, label_map, validation_addr_label_map,
                                                       val_npy_dir, 'h5py', h5py_file_val),
        validation_steps=iterations_per_epoch_v, workers=1, callbacks=callback_list,
        use_multiprocessing=True, max_queue_size=32)
    '''
    # loss = trainable_model.train_on_batch(X, Y)
    # history_information.append(loss)
    # if i%10 == 0:
    # print (str(datetime.datetime.now())+"\tPercent to complete: " + str((iterations_per_epoch_t*EPOCHS - i)*100//(iterations_per_epoch_t*EPOCHS))+"\t\tEpoch: " + str(epoch) + "\tIteration: " + str(i) + '\tLoss: ' + str(loss[0]) + "\tTraining_Accuracy: " + str(loss[1]))
    # if epoch% args.saving_ckpts == 0:
    # 	trainable_model.save(args.weight_file)
    # for i in range(iterations_per_epoch_v):
    # 	X,Y = load_random_cached_bottlenecks(BATCH_SIZE, label_map, validation_addr_label_map, val_npy_dir, 'h5py', h5py_file_val)
    # 	loss = trainable_model.test_on_batch(X, Y)
    # 	print ("\tIteration: " + str(i) + '\tLoss: ' + str(loss[0]) + "\tValidation_Accuracy: " + str(loss[1]))
    # np.save("essential_files/history_training.npy", np.array(history_information))
    print("[INFO] Completed Training!")
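The fit() call above relies on a specific layout inside the bottleneck HDF5 files, presumably produced by create_bottlenecks_h5py (not shown here). A minimal sketch of that assumed layout, with placeholder shapes and a placeholder path:

import h5py
import numpy as np

# Hypothetical writer; the dataset names must match the HDF5Matrix keys used above.
with h5py.File("bottlenecks/train/train.h5", "w") as f:                          # placeholder path
    f.create_dataset("train", data=np.zeros((1000, 2048), "float32"))            # bottleneck features
    f.create_dataset("train_labels", data=np.zeros((1000, 10), "float32"))       # one-hot labels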
                    "The validation dataset lacks 'num_classes' and 'repetitions' attributes"
                )
            if train_num_classes != valid_num_classes:
                raise ValueError(
                    "The number of classes in training and validation databases differ"
                )
            num_classes = train_num_classes
        else:
            raise ValueError(
                "The input database lacks training and validation datasets")
    print("Training and validation data loaded")
    print("Training data:", num_classes, "classes repeated", train_N, "times")
    print("Validation data:", num_classes, "classes repeated", valid_N,
          "times")

    train_data = HDF5Matrix(db_path, 'training')
    valid_data = HDF5Matrix(db_path, 'validation')
    train_labels = np.tile(np.arange(num_classes), (train_N, ))
    valid_labels = np.tile(np.arange(num_classes), (valid_N, ))
    print(train_data.shape, train_labels.shape)
    print(valid_data.shape, valid_labels.shape)

    if train_data.shape[1] != valid_data.shape[1]:
        raise ValueError(
            "Different model used for training and validation, not allowed")
    logits_length = train_data.shape[1]
    # Get info about loaded data
    additional_info = {
        'Logits length': logits_length,
        'Number of classes': num_classes,
        'DB training repetitions': train_N,
    }
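The loader above expects the HDF5 file at db_path to contain 'training' and 'validation' datasets carrying 'num_classes' and 'repetitions' attributes (that is what the error messages check for). A minimal sketch of writing such a file, with placeholder sizes and a hypothetical path:

import h5py
import numpy as np

num_classes, reps, logits_len = 10, 5, 128                 # placeholder sizes
with h5py.File("logits_db.h5", "w") as f:                  # hypothetical db_path
    for split in ("training", "validation"):
        ds = f.create_dataset(split,
                              data=np.zeros((num_classes * reps, logits_len), "float32"))
        ds.attrs["num_classes"] = num_classes
        ds.attrs["repetitions"] = reps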
Example #6

def conv_block(input_tensor, kernel_size, filters, strides=2):
    # Assumed signature: the original snippet is truncated above this line; the body uses
    # input_tensor, kernel_size, filters and strides, and the calls below pass
    # conv_block(x, 3, [32, 32, 128]) with strides left at its default.
    filters1, filters2, filters3 = filters
    x = conv(x=input_tensor, filters=filters1, kernel_size=1, strides=strides)
    x = conv(x=x, filters=filters2, kernel_size=kernel_size)
    x = conv(x=x, filters=filters3, kernel_size=1)
    shortcut = conv(x=input_tensor,
                    filters=filters3,
                    kernel_size=1,
                    strides=strides)
    x = Add()([x, shortcut])
    x = Activation("relu")(x)
    return x


batch_size = 10
L = 60000
X_train = HDF5Matrix("dataset.h5", "X", start=0, end=L)
Y_train = HDF5Matrix("dataset.h5", "Y", start=0, end=L)

X_test = HDF5Matrix("dataset.h5", "X", start=L, end=L + 22730)
Y_test = HDF5Matrix("dataset.h5", "Y", start=L, end=L + 22730)

gen_train = K_gen(X_train, Y_train, batch_size=batch_size)
gen_test = K_gen(X_test, Y_test, batch_size=batch_size)

x_inp = Input(batch_shape=(batch_size, 31, 31, 31, 2))
x = BatchNormalization()(x_inp)
x = GaussianNoise(0.01)(x)
x = conv_block(x, 3, [32, 32, 128])
x = identity_block(x, 3, [32, 32, 128])
x = conv_block(x, 3, [64, 64, 256])
x = identity_block(x, 3, [64, 64, 256])
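Both conv and identity_block are defined elsewhere in this project. A hedged sketch of what they might look like, mirroring the conv_block shown above: a Conv3D + batch-norm + ReLU wrapper, and a ResNet-style block whose shortcut is the unchanged input.

from keras.layers import Conv3D, BatchNormalization, Activation, Add

def conv(x, filters, kernel_size, strides=1):
    # hypothetical helper: 3-D convolution followed by batch normalization and ReLU
    x = Conv3D(filters, kernel_size=kernel_size, strides=strides, padding="same")(x)
    x = BatchNormalization()(x)
    return Activation("relu")(x)

def identity_block(input_tensor, kernel_size, filters):
    # hypothetical ResNet-style block: three convolutions added back onto the input
    filters1, filters2, filters3 = filters
    x = conv(x=input_tensor, filters=filters1, kernel_size=1)
    x = conv(x=x, filters=filters2, kernel_size=kernel_size)
    x = conv(x=x, filters=filters3, kernel_size=1)
    x = Add()([x, input_tensor])
    return Activation("relu")(x)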
Example #7
    def run(self):
        logger.info('Epochs: {}'.format(self.epochs))
        logger.info('Batch size: {}'.format(self.batch_size))
        logger.info('Model attributes: {}'.format(self.model_attributes))
        logger.info('Output: {}'.format(self.output))
        logger.info('Units: {}'.format(self.units))
        logger.info('')

        if self.load_epoch == 0:
            os.mkdir(self.model_directory)

            data_files_temp = os.listdir(self.data_path)
            data_files = []
            for f in data_files_temp:
                if not f.endswith('.h5'):
                    continue

                if f.startswith('test_'):
                    continue

                data_files.append(f)

            train_files = sorted(data_files)
            logger.info('{} files will be used for training'.format(len(train_files)))

            model = self.create_and_compile_model()

            for n_epoch in range(1, self.epochs + 1):
                logger.info('')
                logger.info('Processing epoch #{}...'.format(n_epoch))

                for train_file in train_files:
                    logger.info('Processing {}...'.format(train_file))
                    h5_file_path = os.path.join(self.data_path, train_file)

                    train_input = HDF5Matrix(h5_file_path, 'input_data')
                    train_output = HDF5Matrix(h5_file_path, 'output_data')

                    logger.info('Train data size = {}'.format(train_input.size / self.input_size))

                    model.fit(
                        train_input,
                        train_output,
                        # shuffle='batch' is needed when fitting directly on HDF5-backed data
                        shuffle='batch',
                        epochs=1,
                        batch_size=self.batch_size,
                        callbacks=[LoggingCallback(self.graphs_data)]
                    )

                logger.info('Predictions after epoch #{}'.format(n_epoch))
                self.calculate_predictions(model, n_epoch)

                # We save model after each epoch
                logger.info('Saving model, please don\'t interrupt...')
                model_path = os.path.join(self.model_directory, '{}_model.h5'.format(n_epoch))
                model.save(model_path)
                logger.info('Model saved')
        else:
            model_files = os.listdir(self.model_directory)
            model_file = None
            for f in model_files:
                if f.startswith('{}_'.format(self.load_epoch)):
                    model_file = f

            model = load_model(os.path.join(self.model_directory, model_file))
            self.calculate_predictions(model, None)

        if self.graphs_data.get('first'):
            self.print_best_result()

            logger.info(json.dumps(self.graphs_data))

            if self.need_visualize:
                show_graphs(self.graphs_data)
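LoggingCallback is a project-specific callback that is not shown here. A minimal sketch of what it might do, assuming graphs_data is a plain dict used to accumulate per-epoch metrics:

import keras

class LoggingCallback(keras.callbacks.Callback):
    # Hypothetical implementation: append every logged metric to a shared dict.
    def __init__(self, graphs_data):
        super().__init__()
        self.graphs_data = graphs_data

    def on_epoch_end(self, epoch, logs=None):
        for key, value in (logs or {}).items():
            self.graphs_data.setdefault(key, []).append(value)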
Example #8

def exceptUnknown(X, Y):  # drop samples whose gender label is unknown (0)
    X, Y = np.asarray(X), np.asarray(Y)
    keep = Y != 0
    X, Y = X[keep], Y[keep]
    # scale to [0, 1], move channels last (NCHW -> NHWC) and one-hot encode the two classes
    return (np.transpose(X.astype('float32') / 255.0, [0, 2, 3, 1]),
            np_utils.to_categorical(Y - 1, num_classes=2))


X_train = np.array(HDF5Matrix('train.hdf5', 'crops'))
Y_train = extGender(HDF5Matrix('train.hdf5', 'labels'))
X_val = np.array(HDF5Matrix('val.hdf5', 'crops'))
Y_val = extGender(HDF5Matrix('val.hdf5', 'labels'))
X_test = np.array(HDF5Matrix('test.hdf5', 'crops'))
Y_test = extGender(HDF5Matrix('test.hdf5', 'labels'))
X_train, Y_train = exceptUnknown(X_train, Y_train)
X_val, Y_val = exceptUnknown(X_val, Y_val)
X_test, Y_test = exceptUnknown(X_test, Y_test)

#hdf5 data import

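The extGender helper is not included in this snippet. As a purely hypothetical placeholder, assuming the 'labels' dataset stores the gender code (0 = unknown, 1/2 = the two classes, as implied by exceptUnknown above) in its first column, it could be as simple as:

def extGender(labels):
    # hypothetical: pull the gender column out of the raw label array
    return np.asarray(labels)[:, 0]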

def createCNNmodel(num_classes):
    model = Sequential()
    model.add(
Example #9
    x = Dense(512, **kwargs)(x)
    x = Dense(1)(x)
    x = GaussianNoise(0.05)(x)
    model = Model(x_inp, x)
    return model


batch_size = 100
optimizer = Adadelta(lr=1.0e-2, clipvalue=.1)
mofnn = MOFNN(batch_size=batch_size)
mofnn.compile(optimizer=optimizer,
              loss="mse",
              metrics=["mean_absolute_percentage_error"])
mofnn.summary()

X_train = HDF5Matrix("dataset-encoded.h5", "X", end=60000)
Y_train = HDF5Matrix("dataset-encoded.h5", "Y", end=60000)
X_valid = HDF5Matrix("dataset-encoded.h5", "X", start=60000)
Y_valid = HDF5Matrix("dataset-encoded.h5", "Y", start=60000)
gen_train = Gen(X_train, Y_train, batch_size=batch_size)
gen_test = Gen(X_valid, Y_valid, batch_size=batch_size)

chkpntr = ModelCheckpoint(filepath="1_bar_encoded.h5",
                          save_best_only=True,
                          verbose=1,
                          save_weights_only=True)
lrrdcr = ReduceLROnPlateau(monitor='val_loss',
                           factor=0.5,
                           patience=3,
                           min_lr=1e-7,
                           verbose=1)
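Gen (like K_gen in the earlier example) is a project-specific batch generator that is not shown. A minimal sketch built on keras.utils.Sequence, slicing the HDF5-backed arrays in contiguous batches so the data never has to be loaded into memory at once:

import numpy as np
from keras.utils import Sequence

class Gen(Sequence):
    # Hypothetical generator: yields contiguous (X, Y) batches from array-like inputs.
    def __init__(self, X, Y, batch_size):
        self.X, self.Y, self.batch_size = X, Y, batch_size

    def __len__(self):
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        batch = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        return self.X[batch], self.Y[batch]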
Example #10
def load_data():
    """Loads PCam dataset.

    # Returns
        Tuple of tuples:
        `(x_train, y_train, meta_train), (x_valid, y_valid, meta_valid), (x_test, y_test, meta_test)`.
    """
    dirname = os.path.join('datasets', 'pcam')
    base = 'https://drive.google.com/uc?export=download&id='
    try:
        y_train = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_train_y.h5',
                     origin=base + '1269yhu3pZDP8UYFQs-NYs3FPwuK-nGSG',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'y')
        x_valid = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_valid_x.h5',
                     origin=base + '1hgshYGWK8V-eGRy8LToWJJgDU_rXWVJ3',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'x')
        y_valid = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_valid_y.h5',
                     origin=base + '1bH8ZRbhSVAhScTS0p9-ZzGnX91cHT3uO',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'y')
        x_test = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_test_x.h5',
                     origin=base + '1qV65ZqZvWzuIVthK8eVDhIwrbnsJdbg_',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'x')
        y_test = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_test_y.h5',
                     origin=base + '17BHrSrwWKjYsOgTMmoqrIjDy6Fa2o_gP',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'y')

        meta_train = pd.read_csv(
            get_file('camelyonpatch_level_2_split_train_meta.csv',
                     origin=base + '1XoaGG3ek26YLFvGzmkKeOz54INW0fruR',
                     cache_subdir=dirname))
        meta_valid = pd.read_csv(
            get_file('camelyonpatch_level_2_split_valid_meta.csv',
                     origin=base + '16hJfGFCZEcvR3lr38v3XCaD5iH1Bnclg',
                     cache_subdir=dirname))
        meta_test = pd.read_csv(
            get_file('camelyonpatch_level_2_split_test_meta.csv',
                     origin=base + '19tj7fBlQQrd4DapCjhZrom_fA4QlHqN4',
                     cache_subdir=dirname))
        x_train = HDF5Matrix(
            get_file('camelyonpatch_level_2_split_train_x.h5',
                     origin=base + '1Ka0XfEMiwgCYPdTI-vv6eUElOBnKFKQ2',
                     cache_subdir=dirname,
                     archive_format='gzip'), 'x')
    except OSError:
        raise NotImplementedError(
            'Direct download currently not working. Please go to https://drive.google.com/drive/folders/1gHou49cA1s5vua2V5L98Lt8TiWA3FrKB and press download all. Then place files (ungzipped) in ~/.keras/datasets/pcam.'
        )

    if K.image_data_format() == 'channels_first':
        raise NotImplementedError()

    return ((x_train, y_train, meta_train),
            (x_valid, y_valid, meta_valid),
            (x_test, y_test, meta_test))
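A brief usage sketch (assuming the files are already cached locally, since the download links above may no longer resolve): the returned x/y objects are lazy HDF5Matrix instances and the meta objects are pandas DataFrames.

import numpy as np

(x_train, y_train, meta_train), (x_valid, y_valid, meta_valid), (x_test, y_test, meta_test) = load_data()
print(x_train.shape, y_train.shape)    # HDF5-backed arrays, read lazily
print(meta_train.head())               # patch-level metadata
first = np.asarray(x_train[:8])        # materialize only a small slice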