Code example #1
def fit_model(self, data, cv_split='stratified'):
    eval_metrics = []
    x = data.x
    # Prefer binary labels for classifiers when they are available.
    if self.model_type == 'classifier' and data.binary_y is not None:
        y = data.binary_y
    else:
        y = data.y
    cross_val_data, cross_val_labels = cross_validation_split(
        x=x, y=y, split=cv_split, n_folds=self.ensemble_size)
    for i in range(self.ensemble_size):
        # Leave-one-fold-out: train on all folds except fold i, test on fold i.
        train_x = np.concatenate(cross_val_data[:i] + cross_val_data[(i + 1):])
        test_x = cross_val_data[i]
        train_y = np.concatenate(cross_val_labels[:i] + cross_val_labels[(i + 1):])
        test_y = cross_val_labels[i]
        if self.normalization:
            # Normalize with training-fold statistics and reuse them for the test fold.
            train_x, desc_mean = normalize_desc(train_x)
            self.desc_mean[i] = desc_mean
            test_x, _ = normalize_desc(test_x, desc_mean)
        self.model[i].fit(train_x, train_y.ravel())
        predicted = self.model[i].predict(test_x)
        if self.model_type == 'classifier':
            eval_metrics.append(metrics.f1_score(test_y, predicted))
            self.metrics_type = 'F1 score'
        elif self.model_type == 'regressor':
            r2 = metrics.r2_score(test_y, predicted)
            eval_metrics.append(r2)
            self.metrics_type = 'R^2 score'
        else:
            raise RuntimeError('Unsupported model_type: {}'.format(self.model_type))
    return eval_metrics, self.metrics_type
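
The loop above builds each leave-one-fold-out split by concatenating every fold except the held-out one. A minimal standalone sketch of the same pattern, substituting scikit-learn's KFold for the project-specific cross_validation_split helper (toy data, not the author's code):

# Sketch only: uses sklearn's KFold in place of cross_validation_split.
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn import metrics

rng = np.random.default_rng(0)
x = rng.random((100, 8))   # toy descriptor matrix
y = rng.random(100)        # toy regression targets

scores = []
for train_idx, test_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(x):
    model = Ridge()
    model.fit(x[train_idx], y[train_idx])
    scores.append(metrics.r2_score(y[test_idx], model.predict(x[test_idx])))
print('mean R^2 across folds:', np.mean(scores))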
Code example #2
File: train.py  Project: GUR9000/InnerOuterRNN
def main(*args):
    output_dir = os.path.join(FLAGS.output_dir, FLAGS.model_name)

    #    if tf.gfile.Exists(output_dir):
    #        tf.gfile.DeleteRecursively(output_dir)

    tf.gfile.MakeDirs(output_dir)

    with tf.Graph().as_default():
        # Create a session for running Ops on the Graph.
        session = tf.Session()

        logp_col_name = FLAGS.logp_col if FLAGS.add_logp else None

        logger.info('Loading data set from {:}'.format(FLAGS.training_file))
        csv_file_path = FLAGS.training_file
        smile_col_name = FLAGS.smile_col
        target_col_name = FLAGS.target_col
        data = utils.read_csv(csv_file_path, smile_col_name, target_col_name,
                              logp_col_name)
        data = list(zip(*data))

        if FLAGS.validation_file != '':
            logger.info('Loading validation dataset from {:}'.format(
                FLAGS.validation_file))
            valid_data = utils.read_csv(FLAGS.validation_file, smile_col_name,
                                        target_col_name, logp_col_name)
            train_data = data

            run_once(session, output_dir, list(zip(*train_data)),
                     list(zip(*valid_data)), logp_col_name)

        else:
            assert FLAGS.initial_crossvalidation_index < FLAGS.crossval_total_num_splits, \
                'initial_crossvalidation_index must be smaller than crossval_total_num_splits'
            for crossval_split_index in range(
                    FLAGS.initial_crossvalidation_index,
                    FLAGS.crossval_total_num_splits):
                print('crossval_split: {} of {}'.format(
                    crossval_split_index + 1, FLAGS.crossval_total_num_splits))

                assert len(data[0]) == len(data[1])
                train_data, valid_data, testdata = utils.cross_validation_split(
                    data[0],
                    data[1],
                    crossval_split_index,
                    crossval_total_num_splits=FLAGS.crossval_total_num_splits,
                    validation_data_ratio=1. / FLAGS.crossval_total_num_splits)
                # merge the "test" fold into the training data -- the validation fold is used for testing
                train_data = (np.concatenate((train_data[0], testdata[0])),
                              np.concatenate((train_data[1], testdata[1])))
                print('CV: # train samples:', len(train_data[0]),
                      '# validation samples:', len(valid_data[0]))

                run_once(session,
                         output_dir + '_CV_{}'.format(crossval_split_index),
                         train_data, valid_data, logp_col_name)
Code example #3
def evaluate_algorithm(dataset, algorithm, n_folds):
    '''
    Main evaluation routine for the naive Bayes classifier: it collects the
    cross-validation split, training, prediction and scoring steps.

    Parameters
    ----------
    dataset: the dataset on which the naive Bayes classifier
             should be trained
    algorithm: the function implementing the naive Bayes
               algorithm
    n_folds: number of folds for the cross validation

    Returns
    -------
    scores: the accuracies achieved in each of the n_folds
            cross-validation steps
    optimal_summary: the model parameters of the best model
                     found during cross validation
    '''
    # get the split dataset for the cross validation
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    global_scores = -1
    optimal_summary = None

    # iterate over the folds, using each one in turn as the validation set
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        valid_set = list()
        for row in fold:
            row_copy = list(row)
            valid_set.append(row_copy)
            row_copy[-1] = None

        # Run the Naive bayes Algorithm to get the predictions
        predicted, summary = algorithm(train_set, valid_set)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)

        # if there is an improvement in accuracy then select this model
        if accuracy > global_scores:
            global_scores = accuracy
            optimal_summary = summary

        # append the accuracy obtained in this iteration in the list of scores
        scores.append(accuracy)
    return scores, optimal_summary
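
A hypothetical call to evaluate_algorithm, assuming a naive_bayes function with the signature algorithm(train_set, valid_set) -> (predictions, summary) expected above (toy rows with the class label in the last column):

# naive_bayes is a hypothetical algorithm(train_set, valid_set) -> (predictions, summary)
dataset = [[5.1, 3.5, 0], [4.9, 3.0, 0], [6.2, 2.9, 1],
           [5.9, 3.0, 1], [5.5, 2.4, 1], [5.0, 3.4, 0]]
scores, optimal_summary = evaluate_algorithm(dataset, naive_bayes, n_folds=3)
print('fold accuracies:', scores)
print('best model summary:', optimal_summary)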
Code example #4
def split_delaney():
    csv_file_path = 'ugrnn/data/DILI/DILI.csv'
    smile_col_name = "smiles"
    target_col_name = "solubility"
    logp_col_name = "logp"

    data = read_csv(csv_file_path, smile_col_name, target_col_name, logp_col_name)
    data_perm = permute_data(data)

    traindata, valdata, testdata = cross_validation_split(data_perm, crossval_split_index=1, crossval_total_num_splits=10)

    train_file_path = './data/DILI/train_DILI.csv'
    validate_file_path = './data/DILI/validate_DILI.csv'
    test_file_path = './data/DILI/test_DILI.csv'

    header = "{:},{:},{:}".format(smile_col_name, target_col_name, logp_col_name )
    fmt = ('%s', '%4f', '%4f')
    np.savetxt(train_file_path, traindata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(validate_file_path, valdata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(test_file_path, testdata, header=header, fmt=fmt, comments='', delimiter=',')
Code example #5
def split_karthikeyan():
    csv_file_path = 'ugrnn/data/karthikeyan/melting_points.csv'
    smile_col_name = "SMILES"
    target_col_name = "MTP"

    data = read_csv(csv_file_path, smile_col_name, target_col_name)
    bool_arr = np.array([valid_smile(row[0]) for row in data])
    print(bool_arr)
    filter_data = data[bool_arr]
    data_perm = permute_data(filter_data)

    traindata, valdata, testdata = cross_validation_split(data_perm, crossval_split_index=1, crossval_total_num_splits=10)

    train_file_path = 'ugrnn/data/karthikeyan/train_karthikeyan.csv'
    validate_file_path = 'ugrnn/data/karthikeyan/validate_karthikeyan.csv'
    test_file_path = 'ugrnn/data/karthikeyan/test_karthikeyan.csv'

    header = "{:},{:}".format(smile_col_name, target_col_name)
    fmt = ('%s', '%4f')
    np.savetxt(train_file_path, traindata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(validate_file_path, valdata, header=header, fmt=fmt, comments='', delimiter=',')
    np.savetxt(test_file_path, testdata, header=header, fmt=fmt, comments='', delimiter=',')
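
Both split functions write mixed string/float columns through np.savetxt with a per-column format tuple. A standalone sketch of that pattern on toy rows (the output path is hypothetical):

import numpy as np

# Toy SMILES strings plus a float target; the output file name is made up.
rows = np.array([('CCO', -0.77), ('c1ccccc1', -2.13)], dtype=object)
np.savetxt('toy_split.csv', rows, header='smiles,target',
           fmt=('%s', '%4f'), comments='', delimiter=',')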
Code example #6
File: train.py  Project: yishutu/InnerOuterRNN
def main(output_dir='output/',
         model_name='my_model',
         training_file='delaney_train.csv',
         validation_file='delaney_validate.csv',
         smile_col='smiles',
         target_col='solubility',
         crossval_total_num_splits=10,
         initial_crossvalidation_index=0,
         weight_decay_factor=0,
         *args,
         **kwargs):
    '''
    valid kwargs:

        experiment_name, regression,
        binary_classification, batch_size,
        clip_gradient, model_params,
        contract_rings, learning_rate,
        max_epochs, enable_plotting

    '''
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    logger = logging.getLogger(__name__)
    print('output_dir', output_dir)
    output_dir = os.path.join(output_dir, model_name)

    #    if tf.gfile.Exists(output_dir):
    #        tf.gfile.DeleteRecursively(output_dir)

    tf.gfile.MakeDirs(output_dir)

    with tf.Graph().as_default():
        # Create a session for running Ops on the Graph.
        # select CPU (as it is faster than GPUs)
        config = tf.ConfigProto(device_count={'GPU': 0})
        session = tf.Session(config=config)

        logger.info('Loading data set from {:}'.format(training_file))
        csv_file_path = training_file
        smile_col_name = smile_col
        target_col_name = target_col
        data = utils.read_csv(csv_file_path, None, smile_col_name,
                              target_col_name)
        assert len(data[0]) > 0, 'no data loaded!'
        smiles, labels = utils.permute_data(data[0], data[1])

        if kwargs['regression']:
            # normalize regression targets to be in a reasonable value-range
            labels_mean = labels.mean()
            labels_range = np.max(labels) - np.min(labels)
            labels = (labels - labels_mean) / labels_range

            # this function is applied to the model's predictions and to the targets when computing metrics
            def Targets_UnNormalization_fn(targets):
                return targets * labels_range + labels_mean

            def Targets_Normalization_fn(targets):
                return (targets - labels_mean) / labels_range
        else:
            if labels.ndim == 1:
                labels = labels.reshape((len(labels), 1))
            Targets_UnNormalization_fn = lambda x: x
            Targets_Normalization_fn = lambda x: x

        if validation_file != '' and validation_file is not None:
            # train single model
            logger.info(
                'Loading validation dataset from {:}'.format(validation_file))
            valid_data = utils.read_csv(validation_file, None, smile_col_name,
                                        target_col_name)
            if kwargs['regression'] == 0 and labels.ndim == 1:
                labels = labels.reshape(
                    (len(labels), 1))  #binary classification
            train_data = (smiles, labels)
            valid_data = (valid_data[0],
                          Targets_Normalization_fn(valid_data[1]))

            training_scores_dict, validation_scores_dict = build_and_train(
                logger,
                session,
                output_dir,
                train_data,
                valid_data,
                model_name=model_name,
                Targets_UnNormalization_fn=Targets_UnNormalization_fn,
                weight_decay_factor=weight_decay_factor,
                **kwargs)

        else:
            # cross validation
            assert initial_crossvalidation_index < crossval_total_num_splits, \
                'initial_crossvalidation_index must be smaller than crossval_total_num_splits'
            training_scores_dict, validation_scores_dict = [], []
            for crossval_split_index in range(initial_crossvalidation_index,
                                              crossval_total_num_splits):
                print('crossval_split: {} of {}'.format(
                    crossval_split_index + 1, crossval_total_num_splits))

                assert len(smiles) == len(labels)
                train_data, valid_data, testdata = utils.cross_validation_split(
                    smiles,
                    labels,
                    crossval_split_index,
                    crossval_total_num_splits=crossval_total_num_splits,
                    validation_data_ratio=1. / crossval_total_num_splits)
                # merge the "test" fold into the training data -- the validation fold is used for testing
                train_data = (np.concatenate((train_data[0], testdata[0])),
                              np.concatenate((train_data[1], testdata[1])))
                print('CV: # train samples:', len(train_data[0]),
                      '# validation samples:', len(valid_data[0]))

                td, vd = build_and_train(
                    logger,
                    session,
                    output_dir + '_CV_{}'.format(crossval_split_index),
                    train_data,
                    valid_data,
                    model_name=model_name,
                    Targets_UnNormalization_fn=Targets_UnNormalization_fn,
                    weight_decay_factor=weight_decay_factor,
                    **kwargs)
                training_scores_dict.append(td)
                validation_scores_dict.append(vd)
        if isinstance(training_scores_dict,
                      list) and len(training_scores_dict) == 1 and len(
                          validation_scores_dict) == 1:
            return training_scores_dict[0], validation_scores_dict[0]
        return training_scores_dict, validation_scores_dict
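
The regression branch rescales targets by their mean and range, and maps predictions back through Targets_UnNormalization_fn before metrics are computed. A minimal standalone sketch of that round trip (toy targets, no project code assumed):

import numpy as np

labels = np.array([1.2, -0.4, 3.7, 0.9])        # toy regression targets
labels_mean = labels.mean()
labels_range = labels.max() - labels.min()

normalize = lambda t: (t - labels_mean) / labels_range
unnormalize = lambda t: t * labels_range + labels_mean

# Un-normalizing a normalized value recovers the original (up to float precision).
assert np.allclose(unnormalize(normalize(labels)), labels)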
Code example #7
if __name__ == '__main__':

    data, labels = utils.load_delaney()
    traindata, valdata, testdata = utils.cross_validation_split(data, labels, crossval_split_index=0,
                                                                crossval_total_num_splits=10,
                                                                validation_data_ratio=0.1)

    preprocess_data_set(traindata, valdata, testdata, training_batchsize=50, test_batchsize=1000)

    def test__main(array_rep):
        r = extract_bondfeatures_of_neighbors_by_degree(array_rep)
        print(r)
        atom_features = array_rep['atom_features']
        bond_features = array_rep['bond_features']
        print('atom_features', atom_features.shape)
        print(' ' * 38, 'bond_features', bond_features.shape)
        for i in range(0, 5):
Code example #8
def play_pyano():
    n_folds = 5
    learning_rate = 0.1  # 1e-05
    n_epoch = 1500
    mu = 0.001
    filename = 'data-copy.csv'  # 'data.csv'

    dataset = utils.load_csv(filename)

    utils.ds_to_float(dataset)
    # print_dataset(dataset)

    # convert class column to integers
    last_column_index = len(dataset[0]) - 1
    utils.column_to_int(dataset, last_column_index)
    # print_dataset(dataset)

    # normalize input variables
    minmax = utils.min_max(dataset)
    # print(minmax)
    utils.normalize(dataset, minmax)

    folds = utils.cross_validation_split(dataset, n_folds)
    #for fold in folds:
    #print("Fold {} \n \n".format(fold))
    scores = list()

    predicted = []
    actual = []

    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        predicted = train_and_predict(dataset, train_set, test_set, row,
                                      learning_rate, n_epoch, mu)
        actual = [row[-1] for row in fold]
        accuracy = utils.accuracy_met(actual, predicted)
        cm = confusion_matrix(actual, predicted)
        utils.print_matrix(cm)
        FP = cm.sum(axis=0) - np.diag(cm)
        FN = cm.sum(axis=1) - np.diag(cm)
        TP = np.diag(cm)
        TN = cm.sum() - (FP + FN + TP)
        print('False Positives\n{}'.format(FP))
        print('False Negatives\n{}'.format(FN))
        print('True Positives\n{}'.format(TP))
        print('True Negatives\n{}'.format(TN))
        TPR = TP / (TP + FN)
        print('Sensitivity \n{}'.format(TPR))
        TNR = TN / (TN + FP)
        print('Specificity \n{}'.format(TNR))
        Precision = TP / (TP + FP)
        print('Precision \n{}'.format(Precision))
        Recall = TP / (TP + FN)
        print('Recall \n{}'.format(Recall))
        Acc = (TP + TN) / (TP + TN + FP + FN)
        print('Accuracy \n{}'.format(Acc))
        Fscore = 2 * (Precision * Recall) / (Precision + Recall)
        print('FScore \n{}'.format(Fscore))
        k = cohen_kappa_score(actual, predicted)
        print('Cohen Kappa \n{}'.format(k))
        scores.append(accuracy)
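
The per-class TP/FP/FN/TN arithmetic in play_pyano generalizes binary confusion-matrix counts to the multi-class case. A minimal standalone sketch of that derivation on toy labels (assumes scikit-learn's confusion_matrix, as the snippet above appears to):

import numpy as np
from sklearn.metrics import confusion_matrix

actual    = [0, 0, 1, 1, 2, 2, 2, 1]   # toy ground-truth labels
predicted = [0, 1, 1, 1, 2, 0, 2, 2]   # toy predictions

cm = confusion_matrix(actual, predicted)

# Per-class counts, one entry per class (same formulas as above).
TP = np.diag(cm)
FP = cm.sum(axis=0) - TP
FN = cm.sum(axis=1) - TP
TN = cm.sum() - (TP + FP + FN)

precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision per class:', precision)
print('recall per class:', recall)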