def fit(self, X_train: ndarray, y_train: ndarray,
        X_test: ndarray, y_test: ndarray,
        epochs: int = 100,
        eval_every: int = 10,
        batch_size: int = 32,
        seed: int = 1,
        restart: bool = True) -> None:
    '''
    Fits the neural network on the training data for a certain number of epochs.
    Every `eval_every` epochs, evaluates the network on the test data.
    '''
    setattr(self.optim, 'max_epochs', epochs)
    self.optim._setup_decay()

    np.random.seed(seed)
    if restart:
        for layer in self.net.layers:
            layer.first = True
        self.best_loss = 1e9

    for epoch in range(epochs):

        if (epoch + 1) % eval_every == 0:
            # keep a copy of the model for early stopping
            last_model = deepcopy(self.net)

        X_train, y_train = permute_data(X_train, y_train)

        # Materialize the batches once: calling len(list(generator)) on the same
        # generator that is being iterated would exhaust it before the loop runs.
        batches = list(self.generate_batches(X_train, y_train, batch_size))
        pbar = tqdm(enumerate(batches), total=len(batches))

        for i, (X_batch, y_batch) in pbar:
            pbar.set_postfix({"Epoch": epoch + 1, "Batch": i + 1})
            self.net.train_batch(X_batch, y_batch)
            self.optim.step()

        if (epoch + 1) % eval_every == 0:
            test_preds = self.net.forward(X_test, inference=True)
            loss = self.net.loss.forward(test_preds, y_test)

            if loss < self.best_loss:
                print(f'Validation loss after {epoch + 1} epochs is {loss:.3f}')
                self.best_loss = loss
            else:
                print(f'Loss increased after {epoch + 1} epochs, final loss was '
                      f'{self.best_loss:.3f}, using the model from epoch '
                      f'{epoch + 1 - eval_every}')
                self.net = last_model
                setattr(self.optim, 'net', self.net)
                break

        if self.optim.final_lr:
            self.optim._decay_lr()
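# Hedged usage sketch (not part of the original module): the body of fit() above
# implies an enclosing trainer object that holds a `net` (network) and an `optim`
# (optimizer) attribute. The names `Trainer`, `NeuralNetwork`, and `SGD` below are
# illustrative assumptions, not confirmed APIs of this codebase.
#
#     trainer = Trainer(net=NeuralNetwork(...), optim=SGD(lr=0.01))
#     trainer.fit(X_train, y_train, X_test, y_test,
#                 epochs=50, eval_every=10, batch_size=32, seed=1)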
def split_huuskonsen():
    train_file_path = 'ugrnn/data/huuskonsen/train.smi'
    test1_file_path = 'ugrnn/data/huuskonsen/test1.smi'
    test2_file_path = 'ugrnn/data/huuskonsen/test2.smi'

    smile_col_name = "smiles"
    target_col_name = "solubility"
    logp_col_name = "logp"

    dtype = [(smile_col_name, 'S200'), (target_col_name, 'f8'), (logp_col_name, 'f8')]
    data = np.genfromtxt(train_file_path, usecols=(6, 3, 5), dtype=dtype, comments=None)

    data_perm = permute_data(data)
    num_samples = len(data)
    train_end = int(num_samples * .9)
    train_data = data_perm[:train_end]
    val_data = data_perm[train_end:]

    test1_data = np.genfromtxt(test1_file_path, usecols=(6, 3, 5), dtype=dtype)
    test2_data = np.genfromtxt(test2_file_path, usecols=(6, 3, 5), dtype=dtype)
    test_data = np.concatenate((test1_data, test2_data))

    train_file_path = 'ugrnn/data/huuskonsen/train_huuskonsen.csv'
    validate_file_path = 'ugrnn/data/huuskonsen/validate_huuskonsen.csv'
    test_file_path = 'ugrnn/data/huuskonsen/test_huuskonsen.csv'

    header = "{:},{:},{:}".format(smile_col_name, target_col_name, logp_col_name)
    fmt = ('%s', '%4f', '%4f')
    np.savetxt(train_file_path, train_data, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(validate_file_path, val_data, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(test_file_path, test_data, header=header, fmt=fmt, comments='', delimiter=',')
def split_delaney():
    # NOTE: despite the function name, this split reads and writes the DILI files below.
    csv_file_path = 'ugrnn/data/DILI/DILI.csv'
    smile_col_name = "smiles"
    target_col_name = "solubility"
    logp_col_name = "logp"

    data = read_csv(csv_file_path, smile_col_name, target_col_name, logp_col_name)

    data_perm = permute_data(data)
    traindata, valdata, testdata = cross_validation_split(
        data_perm, crossval_split_index=1, crossval_total_num_splits=10)

    train_file_path = './data/DILI/train_DILI.csv'
    validate_file_path = './data/DILI/validate_DILI.csv'
    test_file_path = './data/DILI/test_DILI.csv'

    header = "{:},{:},{:}".format(smile_col_name, target_col_name, logp_col_name)
    fmt = ('%s', '%4f', '%4f')
    np.savetxt(train_file_path, traindata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(validate_file_path, valdata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(test_file_path, testdata, header=header, fmt=fmt, comments='', delimiter=',')
def split_karthikeyan():
    csv_file_path = 'ugrnn/data/karthikeyan/melting_points.csv'
    smile_col_name = "SMILES"
    target_col_name = "MTP"

    data = read_csv(csv_file_path, smile_col_name, target_col_name)

    # drop rows whose SMILES string cannot be parsed
    bool_arr = np.array([valid_smile(row[0]) for row in data])
    print(bool_arr)
    filter_data = data[bool_arr]

    data_perm = permute_data(filter_data)
    traindata, valdata, testdata = cross_validation_split(
        data_perm, crossval_split_index=1, crossval_total_num_splits=10)

    train_file_path = 'ugrnn/data/karthikeyan/train_karthikeyan.csv'
    validate_file_path = 'ugrnn/data/karthikeyan/validate_karthikeyan.csv'
    test_file_path = 'ugrnn/data/karthikeyan/test_karthikeyan.csv'

    header = "{:},{:}".format(smile_col_name, target_col_name)
    fmt = ('%s', '%4f')
    np.savetxt(train_file_path, traindata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(validate_file_path, valdata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(test_file_path, testdata, header=header, fmt=fmt, comments='', delimiter=',')
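# A minimal driver sketch (an assumption, not part of the original file): each
# split_*() helper above takes no arguments and writes its train/validate/test
# CSVs next to the source data, so running this module directly could simply
# invoke the split that is needed.
if __name__ == '__main__':
    split_huuskonsen()
    # split_delaney()
    # split_karthikeyan()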
def main(output_dir='output/',
         model_name='my_model',
         training_file='delaney_train.csv',
         validation_file='delaney_validate.csv',
         smile_col='smiles',
         target_col='solubility',
         crossval_total_num_splits=10,
         initial_crossvalidation_index=0,
         weight_decay_factor=0,
         *args, **kwargs):
    '''
    Valid kwargs: experiment_name, regression, binary_classification, batch_size,
    clip_gradient, model_params, contract_rings, learning_rate, max_epochs,
    enable_plotting
    '''
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    logger = logging.getLogger(__name__)

    print('output_dir', output_dir)
    output_dir = os.path.join(output_dir, model_name)
    # if tf.gfile.Exists(output_dir):
    #     tf.gfile.DeleteRecursively(output_dir)
    tf.gfile.MakeDirs(output_dir)

    with tf.Graph().as_default():
        # Create a session for running Ops on the Graph.
        # Select CPU only (it is faster than the GPU for this model).
        config = tf.ConfigProto(device_count={'GPU': 0})
        session = tf.Session(config=config)

        logger.info('Loading data set from {:}'.format(training_file))
        csv_file_path = training_file
        smile_col_name = smile_col
        target_col_name = target_col
        data = utils.read_csv(csv_file_path, None, smile_col_name, target_col_name)
        assert len(data[0]) > 0, 'no data loaded!'
        smiles, labels = utils.permute_data(data[0], data[1])

        if kwargs['regression']:
            # normalize regression targets to a reasonable value range
            labels_mean = labels.mean()
            labels_range = np.max(labels) - np.min(labels)
            labels = (labels - labels_mean) / labels_range

            # these functions are applied to the model's predictions and to the
            # targets when computing metrics
            def Targets_UnNormalization_fn(targets):
                return targets * labels_range + labels_mean

            def Targets_Normalization_fn(targets):
                return (targets - labels_mean) / labels_range
        else:
            if labels.ndim == 1:
                labels = labels.reshape((len(labels), 1))
            Targets_UnNormalization_fn = lambda x: x
            Targets_Normalization_fn = lambda x: x

        if validation_file != '' and validation_file is not None:
            # train a single model
            logger.info('Loading validation dataset from {:}'.format(validation_file))
            valid_data = utils.read_csv(validation_file, None, smile_col_name, target_col_name)
            if kwargs['regression'] == 0 and labels.ndim == 1:
                labels = labels.reshape((len(labels), 1))  # binary classification
            train_data = (smiles, labels)
            valid_data = (valid_data[0], Targets_Normalization_fn(valid_data[1]))
            training_scores_dict, validation_scores_dict = build_and_train(
                logger, session, output_dir, train_data, valid_data,
                model_name=model_name,
                Targets_UnNormalization_fn=Targets_UnNormalization_fn,
                weight_decay_factor=weight_decay_factor, **kwargs)
        else:
            # cross-validation
            assert initial_crossvalidation_index < crossval_total_num_splits, \
                'INVALID VALUE GIVEN for initial_crossvalidation_index or crossval_total_num_splits!'
            training_scores_dict, validation_scores_dict = [], []
            for crossval_split_index in range(initial_crossvalidation_index,
                                              crossval_total_num_splits):
                print('crossval_split: {} of {}'.format(
                    crossval_split_index + 1, crossval_total_num_splits))
                assert len(smiles) == len(labels)
                train_data, valid_data, testdata = utils.cross_validation_split(
                    smiles, labels, crossval_split_index,
                    crossval_total_num_splits=crossval_total_num_splits,
                    validation_data_ratio=1. / crossval_total_num_splits)
                # merge the "test" and train parts -- the validation part is used for testing
                train_data = (np.concatenate((train_data[0], testdata[0])),
                              np.concatenate((train_data[1], testdata[1])))
                print('CV: # train samples:', len(train_data[0]),
                      '# validation samples:', len(valid_data[0]))
                td, vd = build_and_train(
                    logger, session,
                    output_dir + '_CV_{}'.format(crossval_split_index),
                    train_data, valid_data,
                    model_name=model_name,
                    Targets_UnNormalization_fn=Targets_UnNormalization_fn,
                    weight_decay_factor=weight_decay_factor, **kwargs)
                training_scores_dict.append(td)
                validation_scores_dict.append(vd)

    if (isinstance(training_scores_dict, list)
            and len(training_scores_dict) == 1
            and len(validation_scores_dict) == 1):
        return training_scores_dict[0], validation_scores_dict[0]
    return training_scores_dict, validation_scores_dict
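# Hedged invocation sketch (an assumption, not original code): main() reads the
# training/validation CSVs produced by the split_*() helpers and forwards the
# remaining options as kwargs (see the docstring above for the accepted keys).
# The paths and hyperparameter values below are illustrative placeholders only.
#
#     train_scores, valid_scores = main(
#         output_dir='output/',
#         model_name='delaney_ugrnn',
#         training_file='delaney_train.csv',
#         validation_file='delaney_validate.csv',
#         regression=True,
#         batch_size=32,
#         learning_rate=1e-3,
#         max_epochs=100,
#         enable_plotting=False)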