def fit_generator(self, generator, n_steps_per_epoch, n_epochs=1,
                  validation_data=None, n_validation_steps=None,
                  callbacks=None):
    """Train the network on batches of data generated from `generator`

    :param generator: a generator yielding batches indefinitely, where each
     batch is a tuple of (inputs, targets)
    :type generator: generator
    :param n_steps_per_epoch: number of batches to train on in one epoch
    :type n_steps_per_epoch: int
    :param n_epochs: number of epochs to train the model
    :type n_epochs: int
    :param validation_data: generator yielding batches to evaluate the loss
     on at the end of each epoch, where each batch is a tuple of
     (inputs, targets)
    :type validation_data: generator
    :param n_validation_steps: number of batches to evaluate on from
     `validation_data`
    :type n_validation_steps: int
    :param callbacks: callbacks to be used during training
    :type callbacks: list[object]

    :raises RuntimeError: if only one of `validation_data` and
     `n_validation_steps` is passed in
    """

    default_callbacks = self._load_default_callbacks()
    default_callbacks.append(ProgbarLogger(count_mode='steps'))
    if callbacks:
        default_callbacks.extend(callbacks)
    callbacks = CallbackList(default_callbacks)

    self._assert_compiled()

    invalid_inputs = (
        (validation_data is not None and not n_validation_steps) or
        (n_validation_steps and validation_data is None)
    )
    if invalid_inputs:
        msg = ('`validation_data` and `n_validation_steps` must both be '
               'passed, or neither.')
        raise RuntimeError(msg)

    if self.device:
        self.network.to(self.device)

    metrics = ['loss']
    if self.n_outputs > 1:
        for idx_output in range(1, self.n_outputs + 1):
            metrics.append('loss{}'.format(idx_output))
    if validation_data is not None:
        metrics.append('val_loss')
        if self.n_outputs > 1:
            for idx_output in range(1, self.n_outputs + 1):
                metrics.append('val_loss{}'.format(idx_output))
    for metric_name in self.metric_names:
        metrics.append(metric_name)
        if validation_data is not None:
            metrics.append('val_{}'.format(metric_name))

    callbacks.set_params({
        'epochs': n_epochs,
        'metrics': metrics,
        'steps': n_steps_per_epoch,
        'verbose': True
    })
    callbacks.set_model(self)

    callbacks.on_train_begin()
    for idx_epoch in range(n_epochs):
        if self.stop_training:
            break

        epoch_logs = {}
        callbacks.on_epoch_begin(idx_epoch)

        for idx_batch in range(n_steps_per_epoch):
            batch_logs = {'batch': idx_batch, 'size': 1}
            callbacks.on_batch_begin(idx_batch, batch_logs)

            generator_output = next(generator)
            if len(generator_output) != 2:
                msg = ('Output of generator should be a tuple of '
                       '(inputs, targets), but instead got a {}: '
                       '{}.').format(type(generator_output),
                                     str(generator_output))
                raise ValueError(msg)
            inputs, targets = generator_output
            train_outputs = self.train_on_batch(inputs, targets)

            batch_logs['loss'] = train_outputs[0]
            if self.n_outputs > 1:
                for idx_output in range(1, self.n_outputs + 1):
                    batch_logs['loss{}'.format(idx_output)] = (
                        train_outputs[idx_output])
            # the per-output losses (if any) precede the metric values in
            # `train_outputs`
            idx_metric_values = (
                1 if self.n_outputs == 1 else self.n_outputs + 1)
            it = zip(self.metric_names, train_outputs[idx_metric_values:])
            for metric_name, train_output in it:
                batch_logs[metric_name] = train_output

            callbacks.on_batch_end(idx_batch, batch_logs)
            if self.stop_training:
                break

        if validation_data:
            val_outputs = self.evaluate_generator(
                validation_data, n_validation_steps)
            epoch_logs['val_loss'] = val_outputs[0]
            if self.n_outputs > 1:
                for idx_output in range(1, self.n_outputs + 1):
                    epoch_logs['val_loss{}'.format(idx_output)] = (
                        val_outputs[idx_output])
            idx_metric_values = (
                1 if self.n_outputs == 1 else self.n_outputs + 1)
            it = zip(self.metric_names, val_outputs[idx_metric_values:])
            for metric_name, val_output in it:
                metric_name = 'val_{}'.format(metric_name)
                epoch_logs[metric_name] = val_output

        callbacks.on_epoch_end(idx_epoch, epoch_logs)
    callbacks.on_train_end()
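
# A minimal usage sketch for `fit_generator`. The toy batch generator and the
# `trainer` instance below are hypothetical stand-ins, not part of this
# module; only the `fit_generator` signature above comes from the source.
import torch

def random_batches(batch_size=8, n_features=4):
    """Yield (inputs, targets) tuples indefinitely, as `fit_generator` expects."""
    while True:
        inputs = torch.randn(batch_size, n_features)
        targets = torch.randn(batch_size, 1)
        yield (inputs, targets)

# Assuming `trainer` is an already-compiled instance of the class defining
# `fit_generator` (i.e. `trainer._assert_compiled()` passes):
# trainer.fit_generator(
#     generator=random_batches(),
#     n_steps_per_epoch=100,
#     n_epochs=5,
#     validation_data=random_batches(),
#     n_validation_steps=10,  # must be passed together with validation_data
# )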
def GanTrain(discriminator, generator, opt, global_batch_size, warmup_epochs,
             datapath, EventsperFile, nEvents, WeightsDir, mod=0,
             nb_epochs=30, batch_size=128, latent_size=128, gen_weight=6,
             aux_weight=0.2, ecal_weight=0.1, lr=0.001, rho=0.9, decay=0.0,
             g_weights='params_generator_epoch_',
             d_weights='params_discriminator_epoch_',
             xscale=1, verbose=True):
    start_init = time.time()

    # build the discriminator
    if hvd.rank() == 0:
        print('[INFO] Building discriminator')
    discriminator.compile(
        optimizer=opt,
        loss=[
            'binary_crossentropy',
            'mean_absolute_percentage_error',
            'mean_absolute_percentage_error'
        ],
        loss_weights=[gen_weight, aux_weight, ecal_weight])

    # build the generator
    if hvd.rank() == 0:
        print('[INFO] Building generator')
    generator.compile(optimizer=opt, loss='binary_crossentropy')

    # build the combined model: generator followed by a frozen discriminator
    latent = Input(shape=(latent_size,), name='combined_z')
    fake_image = generator(latent)
    discriminator.trainable = False
    fake, aux, ecal = discriminator(fake_image)
    combined = Model(inputs=[latent], outputs=[fake, aux, ecal],
                     name='combined_model')

    # get the data
    Trainfiles, Testfiles = DivideFiles(
        datapath, nEvents=nEvents, EventsperFile=EventsperFile,
        datasetnames=["ECAL"], Particles=["Ele"])
    if hvd.rank() == 0:
        print("Train files: {0} \nTest files: {1}".format(
            Trainfiles, Testfiles))

    # read the test data into a single array
    for index, dtest in enumerate(Testfiles):
        if index == 0:
            X_test, Y_test, ecal_test = GetData(dtest)
        else:
            X_temp, Y_temp, ecal_temp = GetData(dtest)
            X_test = np.concatenate((X_test, X_temp))
            Y_test = np.concatenate((Y_test, Y_temp))
            ecal_test = np.concatenate((ecal_test, ecal_temp))

    # read the training data into a single array
    for index, dtrain in enumerate(Trainfiles):
        if index == 0:
            X_train, Y_train, ecal_train = GetData(dtrain)
        else:
            X_temp, Y_temp, ecal_temp = GetData(dtrain)
            X_train = np.concatenate((X_train, X_temp))
            Y_train = np.concatenate((Y_train, Y_temp))
            ecal_train = np.concatenate((ecal_train, ecal_temp))

    nb_test = X_test.shape[0]
    assert X_train.shape[0] == EventsperFile * len(Trainfiles), (
        "Unexpected total number of events in the training files")
    nb_train = X_train.shape[0]  # total events in training files
    # integer division so `total_batches` can be passed to `range` below
    total_batches = nb_train // global_batch_size
    if hvd.rank() == 0:
        print('Total Training batches = {} with {} events'.format(
            total_batches, nb_train))

    combined.compile(
        optimizer=opt,
        loss=[
            'binary_crossentropy',
            'mean_absolute_percentage_error',
            'mean_absolute_percentage_error'
        ],
        loss_weights=[gen_weight, aux_weight, ecal_weight])

    # the same Horovod callback set is attached to each of the three models
    def make_callbacks():
        return CallbackList(callbacks=[
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback(),
            hvd.callbacks.LearningRateWarmupCallback(
                warmup_epochs=warmup_epochs, verbose=1,
                steps_per_epoch=total_batches),
            hvd.callbacks.LearningRateScheduleCallback(
                start_epoch=warmup_epochs, end_epoch=nb_epochs,
                multiplier=1.),
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1)
        ])

    gcb = make_callbacks()
    dcb = make_callbacks()
    ccb = make_callbacks()
    gcb.set_model(generator)
    dcb.set_model(discriminator)
    ccb.set_model(combined)
    gcb.on_train_begin()
    dcb.on_train_begin()
    ccb.on_train_begin()

    print("On hostname {0} - After init using {1} memory".format(
        socket.gethostname(), psutil.Process(os.getpid()).memory_info()[0]))

    train_history = defaultdict(list)
    test_history = defaultdict(list)
    if hvd.rank() == 0:
        print('Initialization time was {} seconds'.format(
            time.time() - start_init))

    for epoch in range(nb_epochs):
        epoch_start = time.time()
        if hvd.rank() == 0:
            print('Epoch {} of {}'.format(epoch + 1, nb_epochs))

        randomize(X_train, Y_train, ecal_train)
        epoch_gen_loss = []
        epoch_disc_loss = []
        image_batches = genbatches(X_train, batch_size)
        energy_batches = genbatches(Y_train, batch_size)
        ecal_batches = genbatches(ecal_train, batch_size)

        for index in range(total_batches):
            start = time.time()
            image_batch = next(image_batches)
            energy_batch = next(energy_batches)
            ecal_batch = next(ecal_batches)

            noise = np.random.normal(0, 1, (batch_size, latent_size))
            sampled_energies = np.random.uniform(0.1, 5, (batch_size, 1))
            generator_ip = np.multiply(sampled_energies, noise)
            # ecal sum from fit
            ecal_ip = GetEcalFit(sampled_energies, mod, xscale)
            generated_images = generator.predict(generator_ip, verbose=0)

            # train the discriminator on one real and one fake batch,
            # averaging the two losses
            real_batch_loss = discriminator.train_on_batch(
                image_batch,
                [BitFlip(np.ones(batch_size)), energy_batch, ecal_batch])
            fake_batch_loss = discriminator.train_on_batch(
                generated_images,
                [BitFlip(np.zeros(batch_size)), sampled_energies, ecal_ip])
            epoch_disc_loss.append([
                (a + b) / 2 for a, b in zip(real_batch_loss, fake_batch_loss)
            ])

            # train the generator (via the combined model) to fool the
            # discriminator, twice per discriminator update
            trick = np.ones(batch_size)
            gen_losses = []
            for _ in range(2):
                noise = np.random.normal(0, 1, (batch_size, latent_size))
                sampled_energies = np.random.uniform(0.1, 5, (batch_size, 1))
                generator_ip = np.multiply(sampled_energies, noise)
                ecal_ip = GetEcalFit(sampled_energies, mod, xscale)
                gen_losses.append(combined.train_on_batch(
                    [generator_ip],
                    [trick, sampled_energies.reshape((-1, 1)), ecal_ip]))
            epoch_gen_loss.append(
                [(a + b) / 2 for a, b in zip(*gen_losses)])

            if hvd.rank() == 0:
                print('processed {}/{} batches in {}'.format(
                    index + 1, total_batches, time.time() - start))

        # save weights every epoch
        if hvd.rank() == 0:
            safe_mkdir(WeightsDir)
            print("saving weights of gen")
            generator.save_weights(
                WeightsDir + '/generator_{0}{1:03d}.hdf5'.format(
                    g_weights, epoch),
                overwrite=True)
            print("saving weights of disc")
            discriminator.save_weights(
                WeightsDir + '/discriminator_{0}{1:03d}.hdf5'.format(
                    d_weights, epoch),
                overwrite=True)

            epoch_time = time.time() - epoch_start
            print("Epoch {} took {} seconds".format(epoch, epoch_time))
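
# A minimal driver sketch for `GanTrain` under Horovod. The model-building
# helpers (`build_discriminator`, `build_generator`) and the paths below are
# hypothetical placeholders; the Horovod calls (`hvd.init`, `hvd.size`,
# `hvd.DistributedOptimizer`) are standard horovod.keras API.
import keras
import horovod.keras as hvd

hvd.init()

# scale the learning rate by the number of workers, a common Horovod recipe,
# and wrap the optimizer so gradients are averaged across workers
opt = hvd.DistributedOptimizer(
    keras.optimizers.RMSprop(lr=0.001 * hvd.size(), rho=0.9, decay=0.0))

discriminator = build_discriminator()  # hypothetical helper
generator = build_generator()          # hypothetical helper

GanTrain(discriminator, generator, opt,
         global_batch_size=128 * hvd.size(),
         warmup_epochs=5,
         datapath='/path/to/ecal/data',  # placeholder path
         EventsperFile=5000, nEvents=100000,
         WeightsDir='weights')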
def fit(self, x, y, batch_size, n_epochs=1, callbacks=None,
        validation_data=None):
    """Train the network on the given data for a fixed number of epochs

    :param x: input data to train on
    :type x: torch.Tensor
    :param y: target data to train on
    :type y: torch.Tensor
    :param batch_size: number of samples to use per forward and backward
     pass
    :type batch_size: int
    :param n_epochs: number of epochs (iterations of the dataset) to train
     the model
    :type n_epochs: int
    :param callbacks: callbacks to be used during training
    :type callbacks: list[object]
    :param validation_data: (x, y) data on which to evaluate the loss and
     metrics at the end of each epoch
    :type validation_data: tuple(torch.Tensor)
    """

    default_callbacks = self._load_default_callbacks()
    default_callbacks.append(ProgbarLogger(count_mode='samples'))
    if callbacks:
        default_callbacks.extend(callbacks)
    callbacks = CallbackList(default_callbacks)

    self._assert_compiled()

    if self.device:
        self.network.to(self.device)

    metrics = ['loss']
    if self.n_outputs > 1:
        for idx_output in range(1, self.n_outputs + 1):
            metrics.append('loss{}'.format(idx_output))
    if validation_data is not None:
        metrics.append('val_loss')
        if self.n_outputs > 1:
            for idx_output in range(1, self.n_outputs + 1):
                metrics.append('val_loss{}'.format(idx_output))
    for metric_name in self.metric_names:
        metrics.append(metric_name)
        if validation_data is not None:
            metrics.append('val_{}'.format(metric_name))

    index_array = np.arange(x.shape[0])

    callbacks.set_params({
        'batch_size': batch_size,
        'epochs': n_epochs,
        'metrics': metrics,
        'steps': None,
        'samples': x.shape[0],
        'verbose': True
    })
    callbacks.set_model(self)

    callbacks.on_train_begin()
    for idx_epoch in range(n_epochs):
        if self.stop_training:
            break

        epoch_logs = {}
        callbacks.on_epoch_begin(idx_epoch)

        np.random.shuffle(index_array)
        batches = make_batches(len(index_array), batch_size)
        for idx_batch, (idx_start, idx_end) in enumerate(batches):
            batch_logs = {'batch': idx_batch, 'size': idx_end - idx_start}
            callbacks.on_batch_begin(idx_batch, batch_logs)

            inputs = x[index_array[idx_start:idx_end]]
            if self.n_outputs > 1:
                targets = []
                for idx_output in range(self.n_outputs):
                    targets.append(
                        y[idx_output][index_array[idx_start:idx_end]])
            else:
                targets = y[index_array[idx_start:idx_end]]
            train_outputs = self.train_on_batch(inputs, targets)

            batch_logs['loss'] = train_outputs[0]
            if self.n_outputs > 1:
                for idx_output in range(1, self.n_outputs + 1):
                    batch_logs['loss{}'.format(idx_output)] = (
                        train_outputs[idx_output])
            # the per-output losses (if any) precede the metric values in
            # `train_outputs`
            idx_metric_values = (
                1 if self.n_outputs == 1 else self.n_outputs + 1)
            it = zip(self.metric_names, train_outputs[idx_metric_values:])
            for metric_name, train_output in it:
                batch_logs[metric_name] = train_output

            callbacks.on_batch_end(idx_batch, batch_logs)
            if self.stop_training:
                break

        if validation_data:
            val_outputs = self.evaluate(
                validation_data[0], validation_data[1], batch_size)
            epoch_logs['val_loss'] = val_outputs[0]
            if self.n_outputs > 1:
                for idx_output in range(1, self.n_outputs + 1):
                    epoch_logs['val_loss{}'.format(idx_output)] = (
                        val_outputs[idx_output])
            idx_metric_values = (
                1 if self.n_outputs == 1 else self.n_outputs + 1)
            it = zip(self.metric_names, val_outputs[idx_metric_values:])
            for metric_name, val_output in it:
                metric_name = 'val_{}'.format(metric_name)
                epoch_logs[metric_name] = val_output

        callbacks.on_epoch_end(idx_epoch, epoch_logs)
    callbacks.on_train_end()
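
# A minimal usage sketch for `fit`. The `trainer` instance is a hypothetical
# already-compiled instance of the class defining `fit`; only the `fit`
# signature above comes from the source.
import torch

x = torch.randn(256, 4)  # 256 samples, 4 features
y = torch.randn(256, 1)
x_val = torch.randn(64, 4)
y_val = torch.randn(64, 1)

# trainer.fit(x, y, batch_size=32, n_epochs=10,
#             validation_data=(x_val, y_val))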