def main():
    '''Run for each ontology and for three levels of term counts.
    '''
    # Load GO DAG
    go = GO("experimental", "computational", "curated",
            go_dag_path='$CEREVISIAEDATA/go.obo',
            associations_path='$CEREVISIAEDATA/gene_association.sgd')

    go.load_go_dag()
    go_dag = go.go_dag

    # Set `vmin` and `vmax` for each level of term counts
    ontology_sizes = [(101, 300), (31, 100), (11, 30)]

    # Names of the three ontologies
    ontologies = ['P', 'F', 'C']

    # Save arrays for each ontology and level
    associations_ontologies_levels = {}

    # Loop over ontologies
    for ontology in ontologies:
        print('Calculating for ontology', ontology)
        # Get dict mapping genes to GO terms
        associations = go.get_associations(ontology)
        # Add parent terms
        associations = propagate_parent_terms(associations, go_dag)
        # Get dicts for mapping to array indexes
        gene_indexes = get_gene_index()
        go_id_indexes = get_go_id_index(associations)

        # Loop over levels of term counts
        for idx, (vmin, vmax) in enumerate(ontology_sizes):
            print('Min/max term counts', vmin, vmax)
            # Get associations between genes and GO terms
            M = np.zeros((max(go_id_indexes.values()) + 1,
                          max(gene_indexes.values()) + 1))
            print(M.shape)
            M = fill_array_of_associations(M, associations, gene_indexes,
                                           go_id_indexes)
            M = get_subarray_by_term_counts(M, vmin, vmax)
            print('Shape before filtering terms by Jaccard similarity', M.shape)
            # Filter terms by Jaccard similarity (currently disabled)
            # M = filter_similar_terms(M)
            print('Shape after filtering terms by Jaccard similarity', M.shape)
            # Save array
            associations_ontologies_levels[f'{ontology}_{idx + 1}'] = M.T

    # Save `associations_ontologies_levels` to a .mat file
    output_dir = os.path.join(os.path.expandvars('$CEREVISIAEDATA'),
                              'deepNF',
                              'annotations')
    directory_exists(output_dir)
    output_file = 'yeast_annotations.mat'
    io.savemat(
        os.path.join(output_dir, output_file),
        associations_ontologies_levels,
        do_compression=True)

    print(f'{output_file} saved to {output_dir}')
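# Helpers such as `propagate_parent_terms`, `fill_array_of_associations` and
# `get_subarray_by_term_counts` are defined elsewhere in the module. As a
# rough illustration of the level-selection step, here is a minimal sketch of
# what `get_subarray_by_term_counts` might do, assuming `M` is a
# (terms x genes) binary matrix as `main` builds it; the body below is an
# assumption, not the project's actual code:

def get_subarray_by_term_counts(M, vmin, vmax):
    """Keep only GO-term rows annotating between `vmin` and `vmax` genes."""
    term_counts = M.sum(axis=1)                  # genes annotated per GO term
    keep = (term_counts >= vmin) & (term_counts <= vmax)
    return M[keep]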
Example #2
def main():
    ######################
    # Prepare filesystem #
    ######################

    directory_exists(models_path)
    mkdir(results_path)

    ###################
    # Load embeddings #
    ###################

    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(os.path.basename(embeddings_file))[0]
    print(model_name)
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file)
    embeddings = minmax_scale(embeddings)

    #######################
    # Load GO annotations #
    #######################

    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(annotation_dir,
                                       'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir, 'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)

    annotations_mat = sio.loadmat(annotation_file)

    ####################
    # Train classifier #
    ####################

    stdout('Running cross-validation for', level)

    annotations = annotations_mat[level]

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Only use a subset of the data for testing purposes
    embeddings = embeddings[:test]
    annotations = annotations[:test]

    # performance = cross_validation(
    #     embeddings,
    #     annotations,
    #     n_trials=n_trials,
    #     n_jobs=n_jobs,
    #     n_threads=n_threads,
    #     random_state=random_state,
    #     clf_type=clf_type,
    #     max_depth=max_depth[level])

    performance = cross_validation(embeddings, annotations, n_trials=n_trials)

    performance['my_level'] = level

    pprint(performance)

    fout = f'{model_name}_{level}_{clf_type}_performance.json'

    with open(os.path.join(results_path, fout), 'w') as f:
        json.dump(performance, f)
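# `load_embeddings` is imported from elsewhere in the project. A minimal
# sketch of what it plausibly does, assuming the .mat file stores the
# embedding matrix under a known key (the key name 'embeddings' below is an
# assumption, not necessarily the project's actual layout):

import scipy.io as sio

def load_embeddings(path, key='embeddings'):
    """Load an embedding matrix from a MATLAB .mat file."""
    return sio.loadmat(path)[key]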
Example #3
def main():
    # Prepare filesystem
    directory_exists(models_path)
    mkdir(results_path)

    # Load embeddings
    embeddings_file = glob.glob(os.path.join(models_path, '*.mat'))[0]
    model_name = os.path.splitext(
        os.path.basename(embeddings_file))[0].replace('_embeddings', '')
    stdout('Loading embeddings', embeddings_file)
    embeddings = load_embeddings(embeddings_file).astype('int32')

    # Load annotations
    annotation_dir = os.path.join(data_path, 'annotations')
    if validation == 'cerevisiae':
        annotation_file = os.path.join(
            annotation_dir, 'cerevisiae_annotations.mat')
    else:
        annotation_file = os.path.join(annotation_dir, 'yeast_annotations.mat')
    stdout('Loading GO annotations', annotation_file)

    annotations_mat = sio.loadmat(annotation_file)

    # Train classifier
    stdout('Running cross-validation for', level)

    if validation == 'cv':
        if level in ('P', 'F', 'C'):
            annotations = np.hstack(
                [annotations_mat[f'{level}_{i}'] for i in range(1, 4)])
        else:
            annotations = annotations_mat[level]

    elif validation == 'cerevisiae':
        if level == 'all':
            annotations = np.hstack(
                [annotations_mat[f'level{i}'] for i in range(1, 4)])
        else:
            annotations = annotations_mat[level]

    else:
        raise ValueError(f'Unknown validation scheme: {validation}')

    annotations = annotations.astype('int32')

    # Silence certain warning messages during cross-validation
    for w in (sklearn.exceptions.UndefinedMetricWarning, UserWarning,
              RuntimeWarning):
        warnings.filterwarnings("ignore", category=w)

    # Remove genes with no annotations
    x = embeddings
    y = annotations
    del_rid = np.where(y.sum(axis=1) == 0)[0]
    x = np.delete(x, del_rid, axis=0)
    y = np.delete(y, del_rid, axis=0)

    # Set up CV
    performance_metrics = ('accuracy', 'm_AUPR', 'M_AUPR', 'f1')
    performance_repeats = defaultdict(dict)

    for repeat in range(1, repeats + 1):

        performance_repeats[f'repeat_{repeat}'] = defaultdict(dict)
        performance = performance_repeats[f'repeat_{repeat}']

        trials = ShuffleSplit(n_splits=n_trials, test_size=0.2,
                              random_state=random_state)
        iteration = 0

        # CV-folds
        for train_idx, test_idx in trials.split(x):
            iteration += 1

            x_train = x[train_idx]
            x_test = x[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]

            # Define the MLP architecture
            model = MLP(x_train, y_train)
            model.compile('adam', 'binary_crossentropy', ['acc'])

            # Train the model
            callbacks = [EarlyStopping(min_delta=0., patience=20),
                         ModelCheckpoint('best_model.h5', save_best_only=True)]

            history = model.fit(x_train, y_train, batch_size=batch_size,
                                epochs=200, validation_split=0.2, shuffle=True,
                                callbacks=callbacks, verbose=2)

            performance['history'][iteration] = {}
            for tm in history.history:
                performance['history'][iteration][tm] = history.history[tm]

            # Read the best model from file (defined as the model that
            # minimizes the validation loss).
            model = load_model('best_model.h5')

            # Predict annotations
            y_score = model.predict(x_test)
            y_pred = y_score.copy()
            positive_threshold = .5
            y_pred[y_pred < positive_threshold] = 0
            y_pred[y_pred > 0] = 1
            performance_trial = _Performance(y_test, y_score, y_pred)

            for pm in performance_metrics:
                performance[pm][iteration] = getattr(performance_trial, pm)
                calculate_mean_std(performance, pm)

            dummy = DummyClassifier().fit(x_train, y_train).score(x_test, y_test)
            performance['dummy'][iteration] = dummy

        performance['level'] = level
        pprint(performance)

    # Save results and training history
    fout = f'{model_name}_{level}_{clf_type}'
    with open(os.path.join(results_path, f'{fout}.json'), 'w') as f:
        json.dump(performance_repeats, f)

    # Delete the best model file
    os.remove('best_model.h5')

    return None
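# `calculate_mean_std` is defined elsewhere; judging by its call above, it
# summarizes a metric across the CV iterations seen so far. A minimal sketch
# under that assumption (the '<metric>_mean'/'<metric>_std' key names are
# guesses, not the project's actual keys):

import numpy as np

def calculate_mean_std(performance, metric):
    """Store mean/std of `performance[metric]` values across iterations."""
    values = list(performance[metric].values())
    performance[f'{metric}_mean'] = float(np.mean(values))
    performance[f'{metric}_std'] = float(np.std(values))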
Example #4
parser = argparse.ArgumentParser()
parser.add_argument('-i',
                    '--input-path',
                    default='$AGAPEDATA/deepNF/networks',
                    type=str)
parser.add_argument('-o',
                    '--output-path',
                    default='$AGAPEDATA/deepNF/networks',
                    type=str)
parser.add_argument('--K', default=3, type=int)
parser.add_argument('--alpha', default=.98, type=float)
parser.add_argument('--genes', default=5100, type=int)
args = parser.parse_args()

######
# io #
######

input_path = directory_exists(args.input_path)
output_path = directory_exists(args.output_path)

########
# defs #
########


def _load_network(filename, mtrx='adj'):
    print(f"Loading {filename}")
    i, j, val = np.loadtxt(filename).T
    # `coo_matrix` requires integer indices; `np.loadtxt` returns floats
    i, j = i.astype(int), j.astype(int)

    if 'fypo' in filename:
        A = coo_matrix((val, (i, j)), shape=(args.genes, j.max() + 1))
    else:
        A = coo_matrix((val, (i, j)), shape=(args.genes, args.genes))

    return A
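# The --K and --alpha flags are not used in this excerpt. In deepNF-style
# preprocessing they typically parameterize a K-step random walk with restart
# (RWR) applied to each network before computing PPMI features. A minimal
# dense-matrix sketch, under the assumption that alpha is the
# walk-continuation probability (this is not the project's verbatim code):

import numpy as np

def rwr(A, K=3, alpha=0.98):
    """Accumulate K steps of a random walk with restart over adjacency A."""
    d = A.sum(axis=1, keepdims=True)   # row degrees
    d[d == 0] = 1                      # guard isolated nodes
    P = A / d                          # row-stochastic transition matrix
    n = A.shape[0]
    M = np.eye(n)                      # each node restarts at itself
    acc = np.zeros_like(P)
    for _ in range(K):
        M = alpha * (M @ P) + (1 - alpha) * np.eye(n)
        acc += M
    return acc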
Example #5

parser = argparse.ArgumentParser()
parser.add_argument('-o',
                    '--output-path',
                    default='$CEREVISIAEDATA/deepNF',
                    type=str)
args = parser.parse_args()

######
# io #
######

data = os.environ["CEREVISIAEDATA"]

output_path = directory_exists(os.path.expandvars(args.output_path))

########
# defs #
########


class STRING:
    """Load S. cerevisiae STRING database.
    """
    def __init__(self):
        # f = "4932.protein.links.detailed.v10.5.txt"
        f = "4932.protein.links.detailed.v9.1.txt"
        self.df = pd.read_csv(os.path.join(data, f), sep=" ")
        self.interaction_types = ('neighborhood', 'fusion', 'cooccurence',
                                  'coexpression', 'experimental', 'database',
                                  'textmining')
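# A sketch of how the detailed-links dataframe might be split into one
# network per evidence channel (column names follow the STRING detailed
# format: 'protein1', 'protein2', plus one column per channel; this helper
# is illustrative, not part of the class in the excerpt):

def get_channel_edges(df, channel):
    """Return (protein1, protein2, score) rows where `channel` is nonzero."""
    edges = df[df[channel] > 0]
    return edges[['protein1', 'protein2', channel]]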
Example #6
    def test_directory_exists(self):
        with TemporaryDirectory() as d:
            assert directory_exists(d) == d
Example #7
    def test_raises_FileNotFoundError(self):
        with TemporaryDirectory() as d:
            with raises(FileNotFoundError):
                directory_exists(d + "NOTAPATH")