Example #1
def calculateGOseq(ontology):
    """Build and pickle a symmetric parent-child adjacency matrix over one ontology's GO terms."""
    organism = 'thalia'

    [dag, mapping] = fr.read_ontology_from_file('files/terms.obo')

    [Y, termNames, _] = fr.getDataLabels(organism, True, ontology, 8000)

    # Start from the identity so every term is linked to itself.
    go = np.eye(len(termNames))

    for i, tname in enumerate(termNames):
        dagInd = mapping[tname]

        for pInd in dag[dagInd].parents:

            # Skip parents that belong to a different GO ontology.
            if dag[pInd].ontology != ontology:
                continue

            parLoc = termNames.index(dag[pInd].ID)

            # Mark the parent-child relation symmetrically.
            go[i, parLoc] = 1.0
            go[parLoc, i] = 1.0

    #[Y, go] = removeRareTermsOnly(Y, go, label_fraction)

    go = csr_matrix(go)

    with open('go' + ontology + '.pkl', 'wb') as f:
        pickle.dump(go, f)

    return go
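
A minimal usage sketch, assuming the snippet's dependencies are importable (fr as the project's file reader, numpy as np, scipy.sparse.csr_matrix, pickle) and that files/terms.obo exists; the ontology code 'P' follows the other examples:

go = calculateGOseq('P')   # also writes goP.pkl as a side effect
print(go.shape)            # square CSR matrix over the retained GO terms
print(go.nnz)              # stored self-links plus parent-child links
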
Example #2
def getParentsCoord(termNames, ontology, dag=None, mapping=None):
    """Map each term's column index to the column indices of its non-root parents."""

    if dag is None:
        [dag, mapping] = fr.read_ontology_from_file('files/terms.obo')
        #[dag, mapping] = fr.read_ontology_from_file('files/terms_09_17.obo')

    # Only the biological-process ontology is supported here.
    assert ontology == 'P'
    root = 'GO:0008150'

    parentsCoord = dict()

    for i, tname in enumerate(termNames):
        dagInd = mapping[tname]

        parentsCoord[i] = []

        for pInd in dag[dagInd].parents:

            # Skip parents from other ontologies and the root itself.
            if dag[pInd].ontology != ontology or dag[pInd].ID == root:
                continue

            parLoc = termNames.index(dag[pInd].ID)

            parentsCoord[i].append(parLoc)

    return parentsCoord
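
A usage sketch, under the assumption that termNames comes from fr.getDataLabels as in the other examples; every column index gets an entry, with an empty list when all of a term's parents fall outside the term set:

parentsCoord = getParentsCoord(termNames, 'P')

orphans = [i for i, parents in parentsCoord.items() if not parents]
print(len(orphans), 'terms with no non-root parent inside the set')
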
Example #3
def loadData(similarity, ontology, organism):
    """Load the identity feature matrix X and the GO label matrix Y, with the root column removed."""

    # Only the biological-process ontology is supported here.
    assert ontology == 'P'
    root = 'GO:0008150'

    [dag, mapping] = fr.read_ontology_from_file('files/terms.obo')

    [Y, termNames, geneNames] = fr.getDataLabels(organism, True, ontology,
                                                 8000)

    data = loadmat('files/identities.mat')

    X = data['identityMatrix']

    X = np.array(X)

    rootInd = termNames.index(root)

    Y = np.array(Y)
    assert X.shape[0] == Y.shape[0]

    # Drop the uninformative root term from the labels and the term list.
    Y = np.delete(Y, rootInd, axis=1)
    del termNames[rootInd]

    [X, Y, deletedProteins] = removeEmptyProteins(X, Y)

    parentsCoord = getParentsCoord(termNames, ontology, dag, mapping)

    # Drop the gene names of the proteins removed by removeEmptyProteins.
    for i in sorted(deletedProteins, reverse=True):
        del geneNames[i]

    return [X, Y, termNames, deletedProteins, parentsCoord]
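
A sketch of a call, assuming the files it reads exist; note that the similarity argument is accepted but never used inside this snippet, so 'identity' below is only a placeholder value:

[X, Y, termNames, deletedProteins, parentsCoord] = loadData('identity', 'P', 'thalia')

print(X.shape, Y.shape)  # one row per retained protein in both
print(len(deletedProteins), 'proteins removed for having no annotations')
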
Example #4
def getLCAsCAFA(termNames, ontology, dag=None, mapping=None):
    """Compute and pickle the lowest common ancestor of every pair of terms."""

    if dag is None:
        [dag, mapping] = fr.read_ontology_from_file('files/terms_09_17.obo')

    if ontology == 'P':
        root = 'GO:0008150'
    elif ontology == 'F':
        root = 'GO:0003674'
    else:
        print('wrong ontology')
        sys.exit(1)

    #find all ancestors of each term

    ancestors = dict()

    for i, t in enumerate(termNames):
        ancestors[t] = set()

        for pInd in dag[mapping[t]].parents:
            getAncestorsRecursively(dag, pInd, ancestors[t], ontology)

        # discard() instead of remove(): no KeyError if root was never reached.
        ancestors[t].discard(root)

    lcas = dict()
    #get common ancestors
    for i, t1 in enumerate(termNames):
        print(i)   # progress over the outer term loop
        for j, t2 in enumerate(termNames):

            #print t1, t2

            if i == j:
                lcas[t1, t2] = t1
            else:
                cas = ancestors[t1].intersection(ancestors[t2])
                #print cas

                for t in cas:

                    others = cas.difference(set([t]))

                    # t is the lowest common ancestor iff every other common
                    # ancestor is itself an ancestor of t; stop at the first hit.
                    if others == ancestors[t]:
                        lcas[t1, t2] = t
                        break

    with open('lowestCommonAncestors.pkl', 'wb') as f:
        pickle.dump(lcas, f)

    with open('ancestors.pkl', 'wb') as f:
        pickle.dump(ancestors, f)
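
A usage sketch; the pairwise loop is quadratic in the number of terms, and the results are persisted rather than returned, so a hypothetical caller (with a termNames list as in the other examples) reads them back from the pickle:

getLCAsCAFA(termNames, 'P')

with open('lowestCommonAncestors.pkl', 'rb') as f:
    lcas = pickle.load(f)

t = termNames[0]
assert lcas[t, t] == t   # the LCA of a term with itself is the term
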
Example #5
def calculateGOallseq(ontology):
    """Build and pickle the full ancestor matrix (transitive closure) over the GO terms."""
    organism = 'thalia'

    [dag, mapping] = fr.read_ontology_from_file('files/terms.obo')

    [Y, termNames, _] = fr.getDataLabels(organism, True, ontology, 8000)

    # DFS walks the DAG and links each term to all of its ancestors,
    # not just its direct parents.
    go = DFS(dag, termNames, mapping, ontology)

    go = csr_matrix(go)

    with open('go_allancestors' + ontology + '.pkl', 'wb') as f:
        pickle.dump(go, f)

    return go
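
A usage sketch, assuming the DFS helper (used above but not shown) is available along with the same dependencies as calculateGOseq:

go_all = calculateGOallseq('P')   # writes go_allancestorsP.pkl

# The closure has at least as many stored links as the direct
# parent-child matrix from calculateGOseq.
print(go_all.nnz)
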
Example #6
def calculateGOseqCAFA(ontology, termNames):
    """Build and pickle the parent-child adjacency matrix for a given CAFA term list."""

    [dag, mapping] = fr.read_ontology_from_file('files/terms_09_17.obo')

    if ontology == 'P':
        root = 'GO:0008150'
    elif ontology == 'F':
        root = 'GO:0003674'
    else:
        print('wrong ontology')
        sys.exit(1)

    go = np.eye(len(termNames))

    for i, tname in enumerate(termNames):
        dagInd = mapping[tname]

        for pInd in dag[dagInd].parents:

            # Compare the parent's ID (not the node object) against the root.
            if dag[pInd].ontology != ontology or dag[pInd].ID == root:
                continue

            parLoc = termNames.index(dag[pInd].ID)

            go[i, parLoc] = 1.0
            go[parLoc, i] = 1.0

    #[Y, go] = removeRareTermsOnly(Y, go, label_fraction)

    go = csr_matrix(go)

    with open('go' + ontology + '.pkl', 'wb') as f:
        pickle.dump(go, f)

    return go
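
A sketch of a call; unlike calculateGOseq, the term list is passed in explicitly, so a hypothetical caller would first obtain termNames, for instance from fr.getDataLabels as in the other examples:

[Y, termNames, _] = fr.getDataLabels('thalia', True, 'P', 8000)
go = calculateGOseqCAFA('P', termNames)

print(go.shape)   # same square CSR layout, but links to the root are skipped
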
Example #7
import numpy as np
import pickle
import sys
from sklearn.model_selection import KFold
from utilities import *
from filereader import read_ontology_from_file
from gyros import getParentsCoord, calculateIC

species = sys.argv[1]

dag, mapping = read_ontology_from_file('../data/go/go-final.obo')

print('Loading go...')
[Y, geneNames, termNames, gene2row, term2col] = goLoader(species)

cv = KFold(n_splits=5, shuffle=True, random_state=656391)

np.random.seed(1901273)

thresholds = np.linspace(0, 1.0, 21)

# Convert once, outside the fold loop, so the term list supports
# integer-array indexing.
termNames = np.array(termNames)

for fold, (train, test) in enumerate(cv.split(Y)):

    print(fold)

    Ytrain = Y[train]
    Ytest = Y[test]

    # Keep only the test-set terms that have at least one positive annotation.
    nonempty = np.where(np.sum(Ytest, 0) > 0)[0]
    Ytest = Ytest[:, nonempty]
Example #8
        Y = np.delete(Y, root, 1)

    return [Y, proteins, termNames]


trainProteins = set()

leaves = set()

with open('../training_data/thalia_proteins.list') as f:
    for line in f:
        trainProteins.add(line.rstrip('\n'))

print('Proteins in training set:', len(trainProteins))

[dag, mapping] = fr.read_ontology_from_file(
    '/tudelft.net/staff-bulk/ewi/insy/DBL/smakrod/go/terms_09_17.obo')

anno = dict()

absentTerms = set()

with open('../training_data/uniprot_sprot_exp.txt') as f:
    for line in f:

        fields = line.split('\t')

        protein = fields[0]

        # Keep only annotations for proteins in the training set.
        if protein not in trainProteins:
            continue