Example #1
def main():
    ######################
    # Prepare filesystem #
    ######################

    mkdir("models")

    #################
    # Load networks #
    #################

    networks, dims = load_ppmi_matrices(data_path)
    A = minmax_scale(networks[0])

    #########################
    # Train the autoencoder #
    #########################

    model_name = [f'{org}', f'{model_type}-{args.layers}']

    if ofile_tags != '':
        model_name.append(ofile_tags)

    model_name = '_'.join(model_name)

    stdout("Running for architecture", model_name)

    best_model_filename = 'best_model.h5'

    autoencoder = DeepAutoencoder(
        x_train=A,
        x_val=0.1,
        layers=layers,
        epochs=epochs,
        sparse=sparse,
        dropout=dropout,
        batch_size=batch_size,
        activation=activation,
        optimizer=optimizer,
        # early_stopping=(5, 0.),
        save_best_model=best_model_filename,
        verbose=2)

    autoencoder.train()

    history = autoencoder.history.history

    with open(os.path.join(models_path, f'{model_name}_training_history.pkl'),
              'wb') as f:
        pickle.dump(history, f)

    plot_loss({model_name: history}, f'{models_path}/{model_name}')

    embeddings = autoencoder.encode(A)

    embeddings_path = os.path.join(models_path, f'{model_name}_embeddings.mat')

    sio.savemat(embeddings_path, {'embeddings': embeddings})

    os.remove(best_model_filename)
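
The embeddings written above with sio.savemat can be read back with scipy.io.loadmat under the same 'embeddings' key. A minimal sketch, assuming scipy is available; the file name is only a placeholder for whatever f'{model_name}_embeddings.mat' resolves to.

# Sketch: read back the embeddings saved above. The file name is a
# placeholder; the 'embeddings' key matches the dict passed to sio.savemat.
import scipy.io as sio

mat = sio.loadmat('model_embeddings.mat')  # placeholder file name
embeddings = mat['embeddings']
print(embeddings.shape)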
Example #2
def main():
    print(__doc__)
    stdout("Command line arguments", args)
    s = STRING()
    s.convert_ids_to_numbers()
    stdout('Writing networks to', output_path)
    s.write()
Example #3
def load_ppmi_matrices(data_path):
    '''Load PPMI matrices.

    # Arguments
        data_path: str, path to .mat files

    # Returns
        Ms: List[numpy.ndarray], PPMI matrices
        dims: List[int], dimensions of matrices
    '''
    paths = sorted(glob.glob(os.path.join(data_path, "*.mat")))
    stdout('Networks', paths)

    Ms = []
    for p in paths:
        M = _load_ppmi_matrix(p)
        Ms.append(minmax_scale(M))

    dims = [i.shape[1] for i in Ms]
    stdout('Input dims', dims)
    return Ms, dims
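
A minimal usage sketch for load_ppmi_matrices, assuming a directory containing the .mat PPMI files; the path below is a placeholder.

# Usage sketch (placeholder path): load every PPMI matrix in a directory and
# inspect the per-matrix feature dimensions.
networks, dims = load_ppmi_matrices('data/yeast/ppmi')  # placeholder path
print(len(networks), 'matrices with input dims', dims)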
Example #4
def main():
    ######################
    # Prepare filesystem #
    ######################

    mkdir(results_path)

    #################
    # Load networks #
    #################

    networks, dims = load_ppmi_matrices(data_path)

    ###############
    # Load models #
    ###############

    model_names = sorted(
        glob.glob(
            os.path.join(os.path.expandvars(models_path),
                         f'{org}_MDA_arch_*.h5')))

    stdout("Model names", model_names)

    # For now I am only loading a single model at a time, specified by
    # `--architecture`. TODO improve this to handle multiple models at one
    # time.
    if architecture:
        for m in model_names:
            # The architecture number is the single digit preceding the ".h5"
            # extension in the model filename.
            if int(m[-4]) == architecture:
                mid_model = load_model(m)
                model_name = os.path.basename(m).split(".")[0]
                break
    else:
        raise ValueError("`--architecture` must be supplied")

    ################################
    # Calculate network embeddings #
    ################################
    stdout("Calculating embeddings for", model_name)

    embeddings = minmax_scale(mid_model.predict(networks))

    embeddings_path = os.path.join(os.path.expandvars(results_path),
                                   f'{model_name}_features.mat')

    stdout("Writing embeddings to", embeddings_path)

    sio.savemat(embeddings_path, {'embeddings': embeddings})
Example #5
def cross_validation(X,
                     y,
                     n_trials=10,
                     n_jobs=1,
                     n_threads=1,
                     random_state=None,
                     clf_type='LRC',
                     max_depth=5):
    '''Perform model selection via cross validation.
    '''
    stdout('Number of samples pre-filtering', X.shape)

    # Filter samples with no annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    y = np.delete(y, del_rid, axis=0)
    X = np.delete(X, del_rid, axis=0)
    stdout('Number of samples post-filtering', X.shape)

    # Scoring
    scoring = {
        'accuracy': 'accuracy',
        'f1': 'f1_micro',
        'M_AUPR': make_scorer(M_AUPR),
        'm_AUPR': make_scorer(m_AUPR)
    }

    # Split training data
    trials = ShuffleSplit(n_splits=n_trials,
                          test_size=0.2,
                          random_state=random_state)

    # Performance
    performance_metrics = ("accuracy", "m_AUPR", "M_AUPR", "f1")
    perf = defaultdict(dict)
    perf['grid_search'] = {}

    # Model selection for optimum hyperparameters
    iteration = 0
    for train_idx, test_idx in trials.split(X):
        # Split data
        X_train = X[train_idx]
        X_test = X[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]

        iteration += 1
        stdout('Cross validation trial', iteration)
        stdout('Train samples', y_train.shape[0])
        stdout('Test samples', y_test.shape[0])

        # Classifier
        if clf_type == 'SVC':
            clf = SVClassifier(n_jobs=n_jobs, random_state=random_state)
            grid_search_params = {
                'C': [1, 5, 10, 50, 100],
                'gamma': [0.001, 0.005, 0.01, 0.05, 0.1],
                'kernel': ['rbf']
            }
        elif clf_type == 'LinearSVC':
            clf = SVClassifier(n_jobs=n_jobs, random_state=random_state)
            grid_search_params = {
                'C': np.logspace(-1, 2, 4),
                'kernel': ['linear']
            }
        elif clf_type == 'LRC':
            clf = LRClassifier(n_jobs=n_jobs, random_state=random_state)
            grid_search_params = {'C': np.logspace(-2, 2, 5)}
        elif clf_type == 'RFC':
            clf = RFClassifier(n_jobs=n_jobs, random_state=random_state)
            grid_search_params = {'max_features': ['auto']}
        elif clf_type == 'XGB':
            clf = XGBClassifier(learning_rate=0.1,
                                n_estimators=1000,
                                gamma=0,
                                subsample=0.8,
                                colsample_bytree=0.8,
                                objective='binary:logistic',
                                scale_pos_weight=1,
                                n_jobs=n_jobs,
                                n_threads=n_threads,
                                random_state=random_state)
            grid_search_params = {
                'max_depth': max_depth,
                'min_child_weight': range(1, 3)
            }
        else:
            raise ValueError('`clf_type` must name a classifier in '
                             'agape.ml.classifier')

        # Perform a grid search over the hyperparameter ranges

        stdout('Grid search')

        clf.grid_search(X_train,
                        y_train,
                        grid_search_params,
                        scoring=scoring,
                        refit='m_AUPR',
                        cv=5,
                        verbose=10)

        # Get the best hyperparameters
        clf_params = clf.get_clf().get_params()['estimator'] \
                                  .get_params()['estimator'] \
                                  .get_params()
        best_params = {
            k: clf_params[k.replace('estimator__', '')]
            for k in grid_search_params
        }

        perf['grid_search'][iteration] = {}
        perf['grid_search'][iteration]['best_estimator_'] = {}

        for k, v in best_params.items():
            k = k.split('__')[-1]
            perf['grid_search'][iteration]['best_estimator_'][k] = v

        stdout('Optimal parameters', best_params)

        perf['grid_search'][iteration]['best_score_'] = \
            clf.clf_grid_search.best_score_

        stdout('Train dataset AUPR', clf.clf_grid_search.best_score_)

        # Train a classifier with the optimal hyperparameters using the full
        # training data
        clf.fit(X_train, y_train)

        # Compute performance on test set
        y_pred = clf.predict(X_test)
        y_score = clf.predict_proba(X_test)

        stdout('Number of positive predictions', len(y_pred.nonzero()[0]))

        perf_trial = _Performance(y_test, y_score, y_pred)

        stdout('Test dataset')

        for pm in performance_metrics:
            pm_v = getattr(perf_trial, pm)
            stdout(pm, pm_v)
            perf[pm][iteration] = pm_v

        dummy = DummyClassifier().fit(X_train, y_train).score(X_test, y_test)
        perf['dummy'][iteration] = dummy

    # Performance across the cross-validation trials

    def calculate_mean_std(metric):
        values = list(perf[metric].values())
        perf['metrics'][metric]['mean'] = np.mean(values)
        perf['metrics'][metric]['std'] = np.std(values)

    for pm in performance_metrics:
        perf['metrics'][pm] = {}
        calculate_mean_std(pm)

    return perf
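
A sketch of calling cross_validation on toy multi-label data; the shapes are illustrative only, and the classifier wrappers (LRClassifier and friends) are assumed to be importable from the agape package as in the surrounding code.

# Illustrative call only: random features and random multi-label targets.
import numpy as np

rng = np.random.RandomState(0)
X_toy = rng.rand(100, 16)                 # 100 samples, 16-d embeddings
y_toy = rng.randint(0, 2, size=(100, 5))  # 5 binary GO-term labels
perf = cross_validation(X_toy, y_toy, n_trials=2, clf_type='LRC',
                        random_state=0)
print(perf['metrics']['m_AUPR'])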
Example #6
def main():
    ######################
    # Prepare filesystem #
    ######################

    directory_exists(models_path)
    mkdir(results_path)

    ###################
    # Load embeddings #
    ###################

    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(os.path.basename(embeddings_file))[0]
    print(model_name)
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file)
    embeddings = minmax_scale(embeddings)

    #######################
    # Load GO annotations #
    #######################

    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(annotation_dir,
                                       'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir, 'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)

    GO = sio.loadmat(annotation_file)

    ####################
    # Train classifier #
    ####################

    stdout('Running cross-validation for', level)

    annotations = GO[level]

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Only use a subset of the data for testing purposes
    embeddings = embeddings[:test]
    annotations = annotations[:test]

    # performance = cross_validation(
    #     embeddings,
    #     annotations,
    #     n_trials=n_trials,
    #     n_jobs=n_jobs,
    #     n_threads=n_threads,
    #     random_state=random_state,
    #     clf_type=clf_type,
    #     max_depth=max_depth[level])

    performance = cross_validation(embeddings, annotations, n_trials=n_trials)

    performance['my_level'] = level

    pprint(performance)

    fout = f'{model_name}_{level}_{clf_type}_performance.json'

    with open(os.path.join(results_path, fout), 'w') as f:
        json.dump(performance, f)
Example #7
parser.add_argument('--random_state',
                    default=-1,
                    type=int,
                    help='Set sklearn random_state. If -1, then sklearn uses \
                          the system randomness as a seed. If int, then this \
                          number will be used as a seed.')
parser.add_argument('--tags', default="", type=str)
parser.add_argument('-j', '--n_jobs', default=1, type=int)
parser.add_argument('-t', '--n_threads', default=1, type=int)
parser.add_argument('-c', '--clf_type', default='LRC', type=str)
parser.add_argument('--test',
                    default=None,
                    type=int,
                    help='If given, only the first N samples of the data '
                         'are used')
args = parser.parse_args()

stdout("Command line arguments", args)

org = args.organism
models_path = os.path.expandvars(args.models_path)
results_path = os.path.expandvars(args.results_path)
data_path = os.path.expandvars(args.data_path)
n_trials = args.n_trials
tags = args.tags
validation = args.validation
n_jobs = args.n_jobs
n_threads = args.n_threads
random_state = args.random_state
level = args.level
clf_type = args.clf_type
test = args.test
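
The os.path.expandvars calls above allow the path arguments to contain environment variables. A small self-contained illustration; AGAPE_DATA is just an example variable name, not one the script requires.

# Illustration of os.path.expandvars as used above; AGAPE_DATA is an example
# variable name only.
import os

os.environ['AGAPE_DATA'] = '/tmp/agape'
print(os.path.expandvars('$AGAPE_DATA/models'))  # -> /tmp/agape/models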
Example #8
def main():
    # Prepare filesystem
    directory_exists(models_path)
    mkdir(results_path)

    # Load embeddings
    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(
        os.path.basename(embeddings_file))[0].replace('_embeddings', '')
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file).astype('int32')

    # Load annotations
    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(
            annotation_dir, 'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir, 'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)

    annotation_file = sio.loadmat(annotation_file)

    # Train classifier
    stdout('Running cross-validation for', level)

    if validation == 'cv':
        if level in ('P', 'F', 'C'):
            annotations = np.hstack(
                [annotation_file[f'{level}_{i}'] for i in range(1, 4)])
        else:
            annotations = annotation_file[level]

    elif validation == 'cerevisiae':
        if level == 'all':
            annotations = np.hstack(
                [annotation_file[f'level{i}'] for i in range(1, 4)])
        else:
            annotations = annotation_file[level]

    annotations = annotations.astype('int32')

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Remove genes with no annotations
    x = embeddings
    y = annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    x = np.delete(x, del_rid, axis=0)
    y = np.delete(y, del_rid, axis=0)

    # Set up CV
    performance_metrics = ('accuracy', 'm_AUPR', 'M_AUPR', 'f1')
    performance_repeats = defaultdict(dict)

    for repeat in range(1, repeats + 1):

        performance_repeats[f'repeat_{repeat}'] = defaultdict(dict)
        performance = performance_repeats[f'repeat_{repeat}']

        trials = ShuffleSplit(n_splits=n_trials, test_size=0.2,
                              random_state=random_state)
        iteration = 0

        # CV-folds
        for train_idx, test_idx in trials.split(x):
            iteration += 1

            x_train = x[train_idx]
            x_test = x[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]

            # Define the MLP architecture
            model = MLP(x_train, y_train)
            model.compile('adam', 'binary_crossentropy', ['acc'])

            # Train the model
            callbacks = [EarlyStopping(min_delta=0., patience=20),
                         ModelCheckpoint('best_model.h5', save_best_only=True)]

            history = model.fit(x_train, y_train, batch_size=batch_size, epochs=200,
                                validation_split=0.2, shuffle=True,
                                callbacks=callbacks, verbose=2)

            performance['history'][iteration] = {}
            for tm in history.history:
                performance['history'][iteration][tm] = history.history[tm]

            # Read the best model from file (defined as the model which
            # minimizes the validation loss).
            model = load_model('best_model.h5')

            # Predict annotations
            y_score = model.predict(x_test)
            y_pred = y_score.copy()
            positive_threshold = .5
            y_pred[y_pred < positive_threshold] = 0
            y_pred[y_pred > 0] = 1
            performance_trial = _Performance(y_test, y_score, y_pred)

            for pm in performance_metrics:
                performance[pm][iteration] = getattr(performance_trial, pm)
                calculate_mean_std(performance, pm)

            dummy = DummyClassifier().fit(x_train, y_train).score(x_test, y_test)
            performance['dummy'][iteration] = dummy

        performance['level'] = level
        pprint(performance)

    # Save results and training history
    fout = f'{model_name}_{level}_{clf_type}'
    with open(os.path.join(results_path, f'{fout}.json'), 'w') as f:
        json.dump(performance_repeats, f)

    # Delete the best model file
    os.remove('best_model.h5')

    return None
Example #9
 def test_string_object(self):
     stdout(self.s, self.o, file=self.f)
     assert self.f.getvalue() == 'string:\n    object\n\n'
Example #10
 def test_string(self):
     stdout(self.s, file=self.f)
     assert self.f.getvalue() == 'string\n\n\n'
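
The two assertions above pin down the formatting of the stdout helper used throughout these examples: a lone title is printed and followed by a blank line, while a title plus object prints the title with a colon and the object indented beneath it. A minimal sketch of a compatible helper, assuming this signature; the packaged implementation may differ.

# Minimal sketch of a helper compatible with the two tests above; the real
# implementation may differ.
import sys


def stdout(title, obj=None, file=sys.stdout):
    if obj is None:
        print(f'{title}\n\n', file=file)            # 'string\n\n\n'
    else:
        print(f'{title}:\n    {obj}\n', file=file)  # 'string:\n    object\n\n'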