def test_reduce_lr_upon_nan(self):
    with ScratchDir('.'):
        callbacks = [ReduceLRUponNan(patience=100)]
        self.assertAlmostEqual(float(kb.get_value(self.model.optimizer.lr)), 1e-3)
        gen = Generator(self.x, np.array([1, np.nan]).reshape((1, 2, 1)))
        self.model.fit_generator(gen, steps_per_epoch=1, epochs=1,
                                 callbacks=callbacks, verbose=0)
        self.assertAlmostEqual(float(kb.get_value(self.model.optimizer.lr)), 0.5e-3)

        inp = [Input(shape=(None, self.n_feature)),
               Input(shape=(None, self.n_bond_features)),
               Input(shape=(None, self.n_global_features)),
               Input(shape=(None,), dtype='int32'),
               Input(shape=(None,), dtype='int32'),
               Input(shape=(None,), dtype='int32'),
               Input(shape=(None,), dtype='int32'),
               ]
        units_v = [2, 2]
        units_e = [2, 2]
        units_u = [2, ]
        layer = MEGNetLayer(units_v, units_e, units_u)
        out = layer(inp)
        out = Dense(1)(out[2])
        model = Model(inputs=inp, outputs=out)
        model.compile(loss='mse', optimizer='adam')
        x = [np.random.normal(size=(1, 4, self.n_feature)),
             np.random.normal(size=(1, 6, self.n_bond_features)),
             np.random.normal(size=(1, 2, self.n_global_features)),
             np.array([[0, 0, 1, 1, 2, 3]]),
             np.array([[1, 1, 0, 0, 3, 2]]),
             np.array([[0, 0, 1, 1]]),
             np.array([[0, 0, 0, 0, 1, 1]]),
             ]
        y = np.random.normal(size=(1, 2, 1))
        train_gen = Generator(x, y)
        callbacks = [ReduceLRUponNan(filepath='./val_mae_{epoch:05d}_{val_mae:.6f}.hdf5',
                                     patience=100),
                     ModelCheckpointMAE(filepath='./val_mae_{epoch:05d}_{val_mae:.6f}.hdf5',
                                        val_gen=train_gen, steps_per_val=1)
                     ]
        # 1. Normal training and checkpoint saving
        model.fit_generator(train_gen, steps_per_epoch=1, epochs=2,
                            callbacks=callbacks, verbose=1)
        # 2. Throw a nan loss to trigger ReduceLRUponNan
        model.fit_generator(gen, steps_per_epoch=1, epochs=2,
                            callbacks=callbacks, verbose=1)
        # 3. Normal training again, recovering the model saved in step 1
        model.fit_generator(train_gen, steps_per_epoch=1, epochs=2,
                            callbacks=callbacks, verbose=1)
        self.assertAlmostEqual(float(kb.get_value(model.optimizer.lr)), 0.25e-3)
def train_from_graphs(self,
                      train_graphs,
                      train_targets,
                      validation_graphs=None,
                      validation_targets=None,
                      epochs=1000,
                      batch_size=128,
                      verbose=1,
                      callbacks=None,
                      prev_model=None,
                      **kwargs):
    # load from a previously saved model
    if prev_model:
        self.load_weights(prev_model)
    is_classification = 'entropy' in self.model.loss
    monitor = 'val_acc' if is_classification else 'val_mae'
    mode = 'max' if is_classification else 'min'
    dirname = kwargs.pop('dirname', 'callback')
    if not os.path.isdir(dirname):
        os.makedirs(dirname)
    if callbacks is None:
        # with this callback you can stop the model training by `touch STOP`
        callbacks = [ManualStop()]
    callbacks.append(ReduceLRUponNan())
    train_nb_atoms = [len(i['atom']) for i in train_graphs]
    train_targets = [self.target_scaler.transform(i, j)
                     for i, j in zip(train_targets, train_nb_atoms)]
    train_targets = np.array(train_targets).ravel()
    if validation_graphs is not None:
        filepath = pjoin(dirname, 'val_mae_{epoch:05d}_{%s:.6f}.hdf5' % monitor)
        val_nb_atoms = [len(i['atom']) for i in validation_graphs]
        validation_targets = [self.target_scaler.transform(i, j)
                              for i, j in zip(validation_targets, val_nb_atoms)]
        validation_targets = np.array(validation_targets).ravel()
        val_inputs = self.graph_convertor.get_flat_data(validation_graphs, validation_targets)
        val_generator = self._create_generator(*val_inputs, batch_size=batch_size)
        steps_per_val = int(np.ceil(len(validation_graphs) / batch_size))
        callbacks.extend([ModelCheckpointMAE(filepath=filepath,
                                             monitor=monitor,
                                             mode=mode,
                                             save_best_only=True,
                                             save_weights_only=False,
                                             val_gen=val_generator,
                                             steps_per_val=steps_per_val,
                                             y_scaler=None)])
    else:
        val_generator = None
        steps_per_val = None
    train_inputs = self.graph_convertor.get_flat_data(train_graphs, train_targets)
    train_generator = self._create_generator(*train_inputs, batch_size=batch_size)
    steps_per_train = int(np.ceil(len(train_graphs) / batch_size))
    self.fit_generator(train_generator,
                       steps_per_epoch=steps_per_train,
                       validation_data=val_generator,
                       validation_steps=steps_per_val,
                       epochs=epochs,
                       verbose=verbose,
                       callbacks=callbacks,
                       **kwargs)
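# --- Usage sketch (illustrative, not taken from the megnet source): how the
# train_from_graphs signature above is typically called. `model` is assumed to
# be an already-constructed graph model, and `graphs`/`targets` are assumed to
# be precomputed graph dictionaries and target values.
train_graphs, train_targets = graphs[:800], targets[:800]
val_graphs, val_targets = graphs[800:], targets[800:]

model.train_from_graphs(train_graphs, train_targets,
                        validation_graphs=val_graphs,
                        validation_targets=val_targets,
                        epochs=100,
                        batch_size=128,
                        dirname='callback',  # directory for checkpoint files (popped from kwargs)
                        prev_model=None)     # or a previously saved .hdf5 file to resume from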
def test_reduce_lr_upon_nan(self):
    callbacks = [ReduceLRUponNan(patience=100)]
    self.assertAlmostEqual(float(kb.get_value(self.model.optimizer.lr)), 1e-3)
    gen = Generator(self.x, np.array([1, np.nan]).reshape((1, 2, 1)))
    self.model.fit_generator(gen, steps_per_epoch=1, epochs=1,
                             callbacks=callbacks, verbose=0)
    self.assertAlmostEqual(float(kb.get_value(self.model.optimizer.lr)), 0.5e-3)
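# --- Illustrative sketch only (NOT the megnet implementation): a minimal Keras
# callback reproducing the behaviour the test above asserts, i.e. the learning
# rate is multiplied by `factor` whenever the epoch loss comes back as nan
# (1e-3 -> 0.5e-3 after one nan epoch).
import numpy as np
from tensorflow.keras import backend as kb
from tensorflow.keras.callbacks import Callback


class NaiveReduceLROnNan(Callback):
    def __init__(self, factor=0.5):
        super().__init__()
        self.factor = factor

    def on_epoch_end(self, epoch, logs=None):
        loss = (logs or {}).get('loss')
        if loss is not None and np.isnan(loss):
            # scale the optimizer learning rate down by `factor`
            old_lr = float(kb.get_value(self.model.optimizer.lr))
            kb.set_value(self.model.optimizer.lr, old_lr * self.factor)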
def train_from_graphs(self,
                      train_graphs: List[Dict],
                      train_targets: List[float],
                      validation_graphs: List[Dict] = None,
                      validation_targets: List[float] = None,
                      sample_weights: List[float] = None,
                      epochs: int = 1000,
                      batch_size: int = 128,
                      verbose: int = 1,
                      pad_string: str = None,
                      callbacks: List[Callback] = None,
                      prev_model: str = None,
                      lr_scaling_factor: float = 0.5,
                      patience: int = 500,
                      save_checkpoint: bool = True,
                      automatic_correction: bool = True,
                      **kwargs) -> "GraphModel":
    """
    Args:
        train_graphs: (list) list of graph dictionaries
        train_targets: (list) list of target values
        validation_graphs: (list) list of graphs as validation
        validation_targets: (list) list of validation targets
        sample_weights: (list) list of sample weights
        epochs: (int) number of epochs
        batch_size: (int) training batch size
        verbose: (int) keras fit verbose: 0 = silent, 1 = progress bar every batch,
            2 = one line per epoch
        pad_string: (str) string to add in front of the callback filepath
        callbacks: (list) megnet or keras callback functions for training
        prev_model: (str) file name of a previously saved model
        lr_scaling_factor: (float, less than 1) scale the learning rate down by this
            factor when a nan loss is encountered
        patience: (int) patience for early stopping
        save_checkpoint: (bool) whether to save checkpoints
        automatic_correction: (bool) whether to correct nan errors automatically
        **kwargs: additional keyword arguments passed to `fit` (a `dirname` keyword
            sets the callback directory)

    Returns:
        self
    """
    # load from a previously saved model
    if prev_model:
        self.load_weights(prev_model)
    is_classification = 'entropy' in str(self.model.loss)
    monitor = 'val_acc' if is_classification else 'val_mae'
    mode = 'max' if is_classification else 'min'
    dirname = kwargs.pop('dirname', 'callback')
    has_sample_weights = sample_weights is not None
    if not os.path.isdir(dirname):
        os.makedirs(dirname)
    if callbacks is None:
        # with this callback you can stop the model training by `touch STOP`
        callbacks = [ManualStop()]
    train_nb_atoms = [len(i['atom']) for i in train_graphs]
    train_targets = [self.target_scaler.transform(i, j)
                     for i, j in zip(train_targets, train_nb_atoms)]

    if (validation_graphs is not None) and (validation_targets is not None):
        filepath = os.path.join(dirname, '%s_{epoch:05d}_{%s:.6f}.hdf5' % (monitor, monitor))
        val_nb_atoms = [len(i['atom']) for i in validation_graphs]
        validation_targets = [self.target_scaler.transform(i, j)
                              for i, j in zip(validation_targets, val_nb_atoms)]
        val_inputs = self.graph_converter.get_flat_data(validation_graphs, validation_targets)
        val_generator = self._create_generator(*val_inputs, batch_size=batch_size)
        steps_per_val = int(np.ceil(len(validation_graphs) / batch_size))

        if save_checkpoint:
            callbacks.extend([ModelCheckpointMAE(pad_string=pad_string,
                                                 filepath=filepath,
                                                 monitor=monitor,
                                                 mode=mode,
                                                 save_best_only=True,
                                                 save_weights_only=False,
                                                 val_gen=val_generator,
                                                 steps_per_val=steps_per_val,
                                                 target_scaler=self.target_scaler)])
            # avoid running validation twice in an epoch
            # val_generator = None  # type: ignore
            # steps_per_val = None  # type: ignore

        if automatic_correction:
            callbacks.extend([ReduceLRUponNan(filepath=filepath,
                                              monitor=monitor,
                                              mode=mode,
                                              factor=lr_scaling_factor,
                                              patience=patience,
                                              has_sample_weights=has_sample_weights)])
    else:
        val_generator = None  # type: ignore
        steps_per_val = None  # type: ignore

    train_inputs = self.graph_converter.get_flat_data(train_graphs, train_targets)
    # check dimension match
    self.check_dimension(train_graphs[0])
    train_generator = self._create_generator(*train_inputs,
                                             sample_weights=sample_weights,
                                             batch_size=batch_size)
    steps_per_train = int(np.ceil(len(train_graphs) / batch_size))
    self.fit(train_generator,
             steps_per_epoch=steps_per_train,
             validation_data=val_generator,
             validation_steps=steps_per_val,
             epochs=epochs,
             verbose=verbose,
             callbacks=callbacks,
             **kwargs)
    return self
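# --- Hypothetical call against the newer signature above; `model`,
# `train_graphs`, `train_targets`, `val_graphs` and `val_targets` are assumed
# to exist already. Because the method returns self, the call can be chained.
model = model.train_from_graphs(
    train_graphs, train_targets,
    validation_graphs=val_graphs,
    validation_targets=val_targets,
    epochs=500,
    batch_size=64,
    lr_scaling_factor=0.5,      # scale the LR down by this factor on a nan loss
    patience=500,               # patience passed to ReduceLRUponNan
    save_checkpoint=True,       # attach ModelCheckpointMAE
    automatic_correction=True,  # attach ReduceLRUponNan
)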
    Args:
        ids (List): list of ids

    Returns:
        list of graphs and list of target values
    """
    ids = [i for i in ids if i in final_graphs]
    return [final_graphs[i] for i in ids], [final_targets[i] for i in ids]


train_graphs, train_targets = get_graphs_targets(train_ids)
val_graphs, val_targets = get_graphs_targets(val_ids)

# 5. Model training
callbacks = [ReduceLRUponNan(patience=500), ManualStop()]
model.train_from_graphs(train_graphs, train_targets, val_graphs, val_targets,
                        epochs=EPOCHS, verbose=2, initial_epoch=0, callbacks=callbacks)

# 6. Model testing
# load the best model with the lowest validation error
files = glob("./callback/*.hdf5")
best_model = sorted(files, key=os.path.getctime)[-1]
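# --- Continuation sketch (an assumption, not part of the original example):
# the best checkpoint found above can be reloaded with the same `load_weights`
# mechanism that `prev_model` uses inside train_from_graphs, before testing or
# further fine-tuning.
model.load_weights(best_model)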
def train_from_graphs(self,
                      train_graphs: List[Dict],
                      train_targets: List[float],
                      validation_graphs: List[Dict] = None,
                      validation_targets: List[float] = None,
                      epochs: int = 1000,
                      batch_size: int = 128,
                      verbose: int = 1,
                      callbacks: List[Callback] = None,
                      prev_model: str = None,
                      lr_scaling_factor: float = 0.5,
                      patience: int = 500,
                      save_checkpoint: bool = True,
                      automatic_correction: bool = True,
                      **kwargs) -> None:
    """
    :param train_graphs: (list) list of graph dictionaries
    :param train_targets: (list) list of target values
    :param validation_graphs: (list) list of graphs as validation
    :param validation_targets: (list) list of validation targets
    :param epochs: (int) number of epochs
    :param batch_size: (int) training batch size
    :param verbose: (int) keras fit_generator verbose: 0 = silent, 1 = progress bar
        every batch, 2 = one line per epoch
    :param callbacks: (list) megnet or keras callback functions for training
    :param prev_model: (str) file name of a previously saved model
    :param lr_scaling_factor: (float, less than 1) scale the learning rate down by
        this factor when a nan loss is encountered
    :param patience: (int) patience for early stopping
    :param save_checkpoint: (bool) whether to save checkpoints
    :param automatic_correction: (bool) whether to correct nan errors automatically
    :param kwargs: additional keyword arguments passed to `fit_generator` (a `dirname`
        keyword sets the callback directory)
    :return: None
    """
    # load from a previously saved model
    if prev_model:
        self.load_weights(prev_model)
    is_classification = 'entropy' in self.model.loss
    monitor = 'val_acc' if is_classification else 'val_mae'
    mode = 'max' if is_classification else 'min'
    dirname = kwargs.pop('dirname', 'callback')
    if not os.path.isdir(dirname):
        os.makedirs(dirname)
    if callbacks is None:
        # with this callback you can stop the model training by `touch STOP`
        callbacks = [ManualStop()]
    train_nb_atoms = [len(i['atom']) for i in train_graphs]
    train_targets = [self.target_scaler.transform(i, j)
                     for i, j in zip(train_targets, train_nb_atoms)]

    if validation_graphs is not None:
        filepath = os.path.join(dirname, '%s_{epoch:05d}_{%s:.6f}.hdf5' % (monitor, monitor))
        val_nb_atoms = [len(i['atom']) for i in validation_graphs]
        validation_targets = [self.target_scaler.transform(i, j)
                              for i, j in zip(validation_targets, val_nb_atoms)]
        val_inputs = self.graph_converter.get_flat_data(validation_graphs, validation_targets)
        val_generator = self._create_generator(*val_inputs, batch_size=batch_size)
        steps_per_val = int(np.ceil(len(validation_graphs) / batch_size))
        if automatic_correction:
            callbacks.extend([ReduceLRUponNan(filepath=filepath,
                                              monitor=monitor,
                                              mode=mode,
                                              factor=lr_scaling_factor,
                                              patience=patience)])
        if save_checkpoint:
            callbacks.extend([ModelCheckpointMAE(filepath=filepath,
                                                 monitor=monitor,
                                                 mode=mode,
                                                 save_best_only=True,
                                                 save_weights_only=False,
                                                 val_gen=val_generator,
                                                 steps_per_val=steps_per_val,
                                                 target_scaler=self.target_scaler)])
    else:
        val_generator = None
        steps_per_val = None

    train_inputs = self.graph_converter.get_flat_data(train_graphs, train_targets)
    # check dimension match
    self.check_dimension(train_graphs[0])
    train_generator = self._create_generator(*train_inputs, batch_size=batch_size)
    steps_per_train = int(np.ceil(len(train_graphs) / batch_size))
    self.fit_generator(train_generator,
                       steps_per_epoch=steps_per_train,
                       validation_data=val_generator,
                       validation_steps=steps_per_val,
                       epochs=epochs,
                       verbose=verbose,
                       callbacks=callbacks,
                       **kwargs)