def fit_model(self, data, cv_split='stratified'):
    eval_metrics = []
    x = data.x
    if self.model_type == 'classifier' and data.binary_y is not None:
        y = data.binary_y
    else:
        y = data.y
    cross_val_data, cross_val_labels = cross_validation_split(
        x=x, y=y, split=cv_split, n_folds=self.ensemble_size)
    for i in range(self.ensemble_size):
        # hold out fold i for evaluation and train on the remaining folds
        train_x = np.concatenate(cross_val_data[:i] + cross_val_data[(i + 1):])
        test_x = cross_val_data[i]
        train_y = np.concatenate(cross_val_labels[:i] + cross_val_labels[(i + 1):])
        test_y = cross_val_labels[i]
        if self.normalization:
            train_x, desc_mean = normalize_desc(train_x)
            self.desc_mean[i] = desc_mean
            # center the test fold with the training-fold statistics
            test_x, _ = normalize_desc(test_x, desc_mean)
        self.model[i].fit(train_x, train_y.ravel())
        predicted = self.model[i].predict(test_x)
        if self.model_type == 'classifier':
            eval_metrics.append(metrics.f1_score(test_y, predicted))
            self.metrics_type = 'F1 score'
        elif self.model_type == 'regressor':
            r2 = metrics.r2_score(test_y, predicted)
            eval_metrics.append(r2)
            self.metrics_type = 'R^2 score'
        else:
            raise RuntimeError(
                "model_type must be 'classifier' or 'regressor', "
                'got {!r}'.format(self.model_type))
    return eval_metrics, self.metrics_type
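# A minimal sketch of the normalize_desc helper assumed by fit_model above:
# it mean-centers the descriptor matrix, either computing the column means or
# reusing the means passed in, so a test fold can be centered with the
# training-fold statistics. Illustrative only; the original may differ.
import numpy as np

def normalize_desc(desc, mean=None):
    desc = np.asarray(desc, dtype=float)
    if mean is None:
        mean = desc.mean(axis=0)  # column-wise means of this (training) fold
    return desc - mean, mean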
def main(*args):
    output_dir = os.path.join(FLAGS.output_dir, FLAGS.model_name)
    # if tf.gfile.Exists(output_dir):
    #     tf.gfile.DeleteRecursively(output_dir)
    tf.gfile.MakeDirs(output_dir)
    with tf.Graph().as_default():
        # Create a session for running Ops on the Graph.
        session = tf.Session()
        logp_col_name = FLAGS.logp_col if FLAGS.add_logp else None
        logger.info('Loading data set from {:}'.format(FLAGS.training_file))
        csv_file_path = FLAGS.training_file
        smile_col_name = FLAGS.smile_col
        target_col_name = FLAGS.target_col
        data = utils.read_csv(csv_file_path, smile_col_name, target_col_name,
                              logp_col_name)
        data = list(zip(*data))
        if FLAGS.validation_file != '':
            # a validation file was given: train a single model
            logger.info('Loading validation dataset from {:}'.format(
                FLAGS.validation_file))
            valid_data = utils.read_csv(FLAGS.validation_file, smile_col_name,
                                        target_col_name, logp_col_name)
            train_data = data
            run_once(session, output_dir, list(zip(*train_data)),
                     list(zip(*valid_data)), logp_col_name)
        else:
            # no validation file: run cross validation
            assert FLAGS.initial_crossvalidation_index < FLAGS.crossval_total_num_splits, \
                'initial_crossvalidation_index must be smaller than crossval_total_num_splits!'
            for crossval_split_index in range(FLAGS.initial_crossvalidation_index,
                                              FLAGS.crossval_total_num_splits):
                print('crossval_split: {} of {}'.format(
                    crossval_split_index + 1, FLAGS.crossval_total_num_splits))
                assert len(data[0]) == len(data[1])
                train_data, valid_data, testdata = utils.cross_validation_split(
                    data[0], data[1], crossval_split_index,
                    crossval_total_num_splits=FLAGS.crossval_total_num_splits,
                    validation_data_ratio=1. / FLAGS.crossval_total_num_splits)
                # merge the "test" fold back into train -- the validation part
                # is used for testing
                train_data = (np.concatenate((train_data[0], testdata[0])),
                              np.concatenate((train_data[1], testdata[1])))
                print('CV: # train samples:', len(train_data[0]),
                      '# validation samples:', len(valid_data[0]))
                run_once(session,
                         output_dir + '_CV_{}'.format(crossval_split_index),
                         train_data, valid_data, logp_col_name)
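# Hedged sketch of the utils.cross_validation_split signature used above; the
# real implementation may differ. Here fold `crossval_split_index` is held out
# for validation, the following fold is held out as a test set, and the rest
# is returned as training data. validation_data_ratio is accepted only for
# signature compatibility, since each fold already holds
# 1. / crossval_total_num_splits of the data.
import numpy as np

def cross_validation_split_sketch(x, y, crossval_split_index,
                                  crossval_total_num_splits,
                                  validation_data_ratio=None):
    x, y = np.asarray(x), np.asarray(y)
    folds = np.array_split(np.arange(len(x)), crossval_total_num_splits)
    valid_idx = folds[crossval_split_index]
    test_idx = folds[(crossval_split_index + 1) % crossval_total_num_splits]
    train_idx = np.setdiff1d(np.arange(len(x)),
                             np.concatenate((valid_idx, test_idx)))
    return ((x[train_idx], y[train_idx]),
            (x[valid_idx], y[valid_idx]),
            (x[test_idx], y[test_idx]))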
def evaluate_algorithm(dataset, algorithm, n_folds):
    '''
    Main evaluation function: runs every step needed to train and evaluate
    the naive Bayes classifier with cross validation.

    Parameters
    ----------
    dataset: the dataset on which the naive Bayes classifier is trained
    algorithm: the function implementing the naive Bayes algorithm
    n_folds: number of folds for the cross validation

    Returns
    -------
    scores: the accuracy achieved in each of the n_folds cross validation steps
    optimal_summary: the model parameters of the best model found during
        cross validation
    '''
    # split the dataset into the cross-validation folds
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    global_scores = -1
    optimal_summary = None
    # iterate over the folds, using each one in turn as the validation set
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        valid_set = list()
        for row in fold:
            row_copy = list(row)
            valid_set.append(row_copy)
            row_copy[-1] = None  # hide the label from the classifier
        # run the naive Bayes algorithm to get the predictions
        predicted, summary = algorithm(train_set, valid_set)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        # if this fold improves on the best accuracy so far, keep this model
        if accuracy > global_scores:
            global_scores = accuracy
            optimal_summary = summary
        # record the accuracy obtained on this fold
        scores.append(accuracy)
    return scores, optimal_summary
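# evaluate_algorithm assumes the classic from-scratch splitter: draw rows
# without replacement into n_folds equally sized folds. A minimal sketch under
# that assumption (rows left over after integer division are dropped):
from random import randrange

def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split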
def split_delaney():
    csv_file_path = 'ugrnn/data/DILI/DILI.csv'
    smile_col_name = "smiles"
    target_col_name = "solubility"
    logp_col_name = "logp"
    data = read_csv(csv_file_path, smile_col_name, target_col_name,
                    logp_col_name)
    data_perm = permute_data(data)
    traindata, valdata, testdata = cross_validation_split(
        data_perm, crossval_split_index=1, crossval_total_num_splits=10)

    train_file_path = './data/DILI/train_DILI.csv'
    validate_file_path = './data/DILI/validate_DILI.csv'
    test_file_path = './data/DILI/test_DILI.csv'

    header = "{:},{:},{:}".format(smile_col_name, target_col_name,
                                  logp_col_name)
    fmt = ('%s', '%4f', '%4f')
    np.savetxt(train_file_path, traindata, header=header, fmt=fmt,
               comments='', delimiter=',')
    np.savetxt(validate_file_path, valdata, header=header, fmt=fmt,
               comments='', delimiter=',')
    np.savetxt(test_file_path, testdata, header=header, fmt=fmt,
               comments='', delimiter=',')
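# Hedged sketch of the single-argument permute_data helper used above: shuffle
# the rows so the train/validation/test split does not depend on the order of
# the input file. Illustrative only; it assumes `data` is a NumPy array, as
# the boolean indexing in split_karthikeyan below also suggests.
import numpy as np

def permute_data(data, seed=None):
    rng = np.random.RandomState(seed)
    return data[rng.permutation(len(data))]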
def split_karthikeyan():
    csv_file_path = 'ugrnn/data/karthikeyan/melting_points.csv'
    smile_col_name = "SMILES"
    target_col_name = "MTP"
    data = read_csv(csv_file_path, smile_col_name, target_col_name)
    bool_arr = np.array([valid_smile(row[0]) for row in data])
    print(bool_arr)
    filter_data = data[bool_arr]
    data_perm = permute_data(filter_data)
    traindata, valdata, testdata = cross_validation_split(
        data_perm, crossval_split_index=1, crossval_total_num_splits=10)

    train_file_path = 'ugrnn/data/karthikeyan/train_karthikeyan.csv'
    validate_file_path = 'ugrnn/data/karthikeyan/validate_karthikeyan.csv'
    test_file_path = 'ugrnn/data/karthikeyan/test_karthikeyan.csv'

    header = "{:},{:}".format(smile_col_name, target_col_name)
    fmt = ('%s', '%4f')
    np.savetxt(train_file_path, traindata, header=header, fmt=fmt,
               comments='', delimiter=',')
    np.savetxt(validate_file_path, valdata, header=header, fmt=fmt,
               comments='', delimiter=',')
    np.savetxt(test_file_path, testdata, header=header, fmt=fmt,
               comments='', delimiter=',')
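# valid_smile is assumed to be a plain RDKit parse check, which would explain
# why the karthikeyan set is filtered before splitting (rows whose SMILES
# strings do not parse are dropped). A minimal sketch under that assumption:
from rdkit import Chem

def valid_smile(smiles):
    return Chem.MolFromSmiles(smiles) is not None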
def main(output_dir='output/',
         model_name='my_model',
         training_file='delaney_train.csv',
         validation_file='delaney_validate.csv',
         smile_col='smiles',
         target_col='solubility',
         crossval_total_num_splits=10,
         initial_crossvalidation_index=0,
         weight_decay_factor=0,
         *args, **kwargs):
    '''
    Valid kwargs: experiment_name, regression, binary_classification,
    batch_size, clip_gradient, model_params, contract_rings, learning_rate,
    max_epochs, enable_plotting
    '''
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    logger = logging.getLogger(__name__)
    print('output_dir', output_dir)
    output_dir = os.path.join(output_dir, model_name)
    # if tf.gfile.Exists(output_dir):
    #     tf.gfile.DeleteRecursively(output_dir)
    tf.gfile.MakeDirs(output_dir)
    with tf.Graph().as_default():
        # Create a session for running Ops on the Graph.
        # Run on CPU only (it is faster than the GPU for this model).
        config = tf.ConfigProto(device_count={'GPU': 0})
        session = tf.Session(config=config)
        logger.info('Loading data set from {:}'.format(training_file))
        csv_file_path = training_file
        smile_col_name = smile_col
        target_col_name = target_col
        data = utils.read_csv(csv_file_path, None, smile_col_name,
                              target_col_name)
        assert len(data[0]) > 0, 'no data loaded!'
        smiles, labels = utils.permute_data(data[0], data[1])
        if kwargs['regression']:
            # normalize regression targets into a reasonable value range
            labels_mean = labels.mean()
            labels_range = np.max(labels) - np.min(labels)
            labels = (labels - labels_mean) / labels_range

            # These functions are applied to the model's predictions and to
            # the targets when computing metrics.
            def Targets_UnNormalization_fn(targets):
                return targets * labels_range + labels_mean

            def Targets_Normalization_fn(targets):
                return (targets - labels_mean) / labels_range
        else:
            if labels.ndim == 1:
                labels = labels.reshape((len(labels), 1))
            Targets_UnNormalization_fn = lambda x: x
            Targets_Normalization_fn = lambda x: x
        if validation_file != '' and validation_file is not None:
            # a validation file was given: train a single model
            logger.info('Loading validation dataset from {:}'.format(
                validation_file))
            valid_data = utils.read_csv(validation_file, None, smile_col_name,
                                        target_col_name)
            if kwargs['regression'] == 0 and labels.ndim == 1:
                labels = labels.reshape((len(labels), 1))  # binary classification
            train_data = (smiles, labels)
            valid_data = (valid_data[0],
                          Targets_Normalization_fn(valid_data[1]))
            training_scores_dict, validation_scores_dict = build_and_train(
                logger, session, output_dir, train_data, valid_data,
                model_name=model_name,
                Targets_UnNormalization_fn=Targets_UnNormalization_fn,
                weight_decay_factor=weight_decay_factor, **kwargs)
        else:
            # no validation file: run cross validation
            assert initial_crossvalidation_index < crossval_total_num_splits, \
                'initial_crossvalidation_index must be smaller than crossval_total_num_splits!'
            training_scores_dict, validation_scores_dict = [], []
            for crossval_split_index in range(initial_crossvalidation_index,
                                              crossval_total_num_splits):
                print('crossval_split: {} of {}'.format(
                    crossval_split_index + 1, crossval_total_num_splits))
                assert len(smiles) == len(labels)
                train_data, valid_data, testdata = utils.cross_validation_split(
                    smiles, labels, crossval_split_index,
                    crossval_total_num_splits=crossval_total_num_splits,
                    validation_data_ratio=1. / crossval_total_num_splits)
                # merge the "test" fold back into train -- the validation part
                # is used for testing
                train_data = (np.concatenate((train_data[0], testdata[0])),
                              np.concatenate((train_data[1], testdata[1])))
                print('CV: # train samples:', len(train_data[0]),
                      '# validation samples:', len(valid_data[0]))
                td, vd = build_and_train(
                    logger, session,
                    output_dir + '_CV_{}'.format(crossval_split_index),
                    train_data, valid_data, model_name=model_name,
                    Targets_UnNormalization_fn=Targets_UnNormalization_fn,
                    weight_decay_factor=weight_decay_factor, **kwargs)
                training_scores_dict.append(td)
                validation_scores_dict.append(vd)
    if isinstance(training_scores_dict, list) \
            and len(training_scores_dict) == 1 \
            and len(validation_scores_dict) == 1:
        return training_scores_dict[0], validation_scores_dict[0]
    return training_scores_dict, validation_scores_dict
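# Hypothetical example call; all values below are placeholders, not tuned
# settings. `regression` must be passed because main() reads
# kwargs['regression'] without a default; the remaining kwargs listed in the
# docstring are forwarded to build_and_train.
train_scores, valid_scores = main(
    output_dir='output/',
    model_name='delaney_ugrnn',
    training_file='delaney_train.csv',
    validation_file='delaney_validate.csv',
    smile_col='smiles',
    target_col='solubility',
    regression=True,
    batch_size=100,
    learning_rate=0.001,
    max_epochs=100,
    enable_plotting=False)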
if __name__ == '__main__':
    data, labels = utils.load_delaney()
    traindata, valdata, testdata = utils.cross_validation_split(
        data, labels, crossval_split_index=0,
        crossval_total_num_splits=10, validation_data_ratio=0.1)
    preprocess_data_set(traindata, valdata, testdata,
                        training_batchsize=50, test_batchsize=1000)


def test__main(array_rep):
    r = extract_bondfeatures_of_neighbors_by_degree(array_rep)
    print(r)
    atom_features = array_rep['atom_features']
    bond_features = array_rep['bond_features']
    print('atom_features', atom_features.shape)
    print(' ' * 38, 'bond_features', bond_features.shape)
    for i in range(0, 5):
        ...
def play_pyano():
    n_folds = 5
    learning_rate = 0.1  # 1e-05
    n_epoch = 1500
    mu = 0.001
    filename = 'data-copy.csv'  # 'data.csv'
    dataset = utils.load_csv(filename)
    utils.ds_to_float(dataset)
    # print_dataset(dataset)
    # convert the class column to integers
    last_column_index = len(dataset[0]) - 1
    utils.column_to_int(dataset, last_column_index)
    # print_dataset(dataset)
    # normalize the input variables
    minmax = utils.min_max(dataset)
    # print(minmax)
    utils.normalize(dataset, minmax)
    folds = utils.cross_validation_split(dataset, n_folds)
    # for fold in folds:
    #     print("Fold {} \n \n".format(fold))
    scores = list()
    predicted = []
    actual = []
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None  # hide the label from the model
        predicted = train_and_predict(dataset, train_set, test_set, row,
                                      learning_rate, n_epoch, mu)
        actual = [row[-1] for row in fold]
        accuracy = utils.accuracy_met(actual, predicted)
        # derive the per-class counts from the confusion matrix
        cm = confusion_matrix(actual, predicted)
        utils.print_matrix(cm)
        FP = cm.sum(axis=0) - np.diag(cm)
        FN = cm.sum(axis=1) - np.diag(cm)
        TP = np.diag(cm)
        TN = cm.sum() - (FP + FN + TP)
        print('False Positives\n{}'.format(FP))
        print('False Negatives\n{}'.format(FN))
        print('True Positives\n{}'.format(TP))
        print('True Negatives\n{}'.format(TN))
        TPR = TP / (TP + FN)
        print('Sensitivity \n{}'.format(TPR))
        TNR = TN / (TN + FP)
        print('Specificity \n{}'.format(TNR))
        Precision = TP / (TP + FP)
        print('Precision \n{}'.format(Precision))
        Recall = TP / (TP + FN)
        print('Recall \n{}'.format(Recall))
        Acc = (TP + TN) / (TP + TN + FP + FN)
        print('Accuracy \n{}'.format(Acc))
        Fscore = 2 * (Precision * Recall) / (Precision + Recall)
        print('FScore \n{}'.format(Fscore))
        k = cohen_kappa_score(actual, predicted)
        print('Cohen Kappa \n{}'.format(k))
        scores.append(accuracy)
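# Quick check of the per-class count formulas used in play_pyano, on a small
# 3-class confusion matrix (rows = actual class, columns = predicted class):
import numpy as np

cm = np.array([[5, 1, 0],
               [2, 3, 1],
               [0, 1, 7]])
TP = np.diag(cm)                # correct predictions per class: [5, 3, 7]
FP = cm.sum(axis=0) - TP        # column sums minus diagonal:    [2, 2, 1]
FN = cm.sum(axis=1) - TP        # row sums minus diagonal:       [1, 3, 1]
TN = cm.sum() - (FP + FN + TP)  # everything else:               [12, 12, 11]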