def calculateGOseq(ontology):
    """Build, pickle and return the sparse parent-child adjacency matrix
    over the GO terms of `ontology` for the 'thalia' organism.

    The matrix is symmetric with ones on the diagonal; go[i, j] == 1 iff
    term j is a direct parent (or child) of term i within `ontology`.
    Side effect: writes the matrix to 'go<ontology>.pkl'.
    """
    organism = 'thalia'
    [dag, mapping] = fr.read_ontology_from_file('files/terms.obo')
    [_, termNames, _] = fr.getDataLabels(organism, True, ontology, 8000)
    # Hoist the name -> column lookup out of the loop: calling
    # termNames.index() per parent made the double loop O(n^2) in terms.
    termIndex = {t: i for i, t in enumerate(termNames)}
    go = np.eye(len(termNames))
    for i, tname in enumerate(termNames):
        dagInd = mapping[tname]
        for pInd in dag[dagInd].parents:
            # Ignore parents that belong to a different GO ontology.
            if dag[pInd].ontology != ontology:
                continue
            parLoc = termIndex[dag[pInd].ID]
            go[i, parLoc] = 1.0
            go[parLoc, i] = 1.0
    go = csr_matrix(go)
    with open('go' + ontology + '.pkl', 'wb') as f:
        pickle.dump(go, f)
    return go
def getParentsCoord(termNames, ontology, dag=None, mapping=None):
    """Map each term's column index to the indices of its direct parents.

    Parameters:
        termNames: list of GO term IDs, one per label column.
        ontology:  must be 'P' (biological process) — asserted below.
        dag:       ontology DAG (list of nodes with .ID/.parents/.ontology);
                   loaded from 'files/terms.obo' when omitted.
        mapping:   term ID -> index into `dag`; loaded together with `dag`.

    Returns:
        dict: column index i -> list of column indices of the direct
        parents of termNames[i], excluding the BP root and parents from
        other ontologies.
    """
    # Guard both values: the original only checked `dag`, so passing a dag
    # without its companion mapping crashed on `mapping[tname]` below.
    if dag is None or mapping is None:
        [dag, mapping] = fr.read_ontology_from_file('files/terms.obo')
    assert ontology == 'P'
    root = 'GO:0008150'
    # Hoist the name -> column lookup; termNames.index() per parent was
    # O(n^2) over all terms.
    termIndex = {t: i for i, t in enumerate(termNames)}
    parentsCoord = dict()
    for i, tname in enumerate(termNames):
        dagInd = mapping[tname]
        parentsCoord[i] = []
        for pInd in dag[dagInd].parents:
            # Skip cross-ontology parents and the (uninformative) root.
            if dag[pInd].ontology != ontology or dag[pInd].ID == root:
                continue
            parentsCoord[i].append(termIndex[dag[pInd].ID])
    return parentsCoord
def loadData(similarity, ontology, organism):
    """Load the identity-feature matrix and GO-term labels for `organism`.

    Only the biological-process ontology ('P') is supported. The root
    term column is removed from the labels, proteins without annotations
    are dropped, and the parent-coordinate map is built for the surviving
    terms. NOTE(review): `similarity` is accepted but never read here —
    presumably kept for interface compatibility with callers.

    Returns [X, Y, termNames, deletedProteins, parentsCoord].
    """
    assert ontology == 'P'
    root = 'GO:0008150'

    dag, mapping = fr.read_ontology_from_file('files/terms.obo')
    Y, termNames, geneNames = fr.getDataLabels(organism, True, ontology, 8000)

    X = np.array(loadmat('files/identities.mat')['identityMatrix'])
    Y = np.array(Y)
    assert X.shape[0] == Y.shape[0]

    # Drop the root column (presumably present for every protein and
    # therefore uninformative — TODO confirm).
    rootInd = termNames.index(root)
    Y = np.delete(Y, rootInd, axis=1)
    del termNames[rootInd]

    X, Y, deletedProteins = removeEmptyProteins(X, Y)
    parentsCoord = getParentsCoord(termNames, ontology, dag, mapping)

    # Delete from the highest index down so earlier indices stay valid.
    for idx in sorted(deletedProteins, reverse=True):
        del geneNames[idx]

    return [X, Y, termNames, deletedProteins, parentsCoord]
def getLCAsCAFA(termNames, ontology, dag=None, mapping=None):
    """Compute the lowest common ancestor for every ordered pair of terms
    and pickle both the LCA table and the per-term ancestor sets.

    Side effects: writes 'lowestCommonAncestors.pkl' (dict keyed by
    (t1, t2)) and 'ancestors.pkl' (dict term -> set of ancestor IDs,
    root excluded). Returns None.
    """
    if dag is None:
        [dag, mapping] = fr.read_ontology_from_file('files/terms_09_17.obo')
    if ontology == 'P':
        root = 'GO:0008150'
    elif ontology == 'F':
        root = 'GO:0003674'
    else:
        # print() call form: identical output under Python 2, valid on 3.
        print('wrong ontology')
        sys.exit(1)
    # Collect all (transitive) same-ontology ancestors of each term.
    ancestors = dict()
    for i, t in enumerate(termNames):
        ancestors[t] = set()
        for pInd in dag[mapping[t]].parents:
            getAncestorsRecursively(dag, pInd, ancestors[t], ontology)
        # discard, not remove: no KeyError when the root was never reached
        # (e.g. an isolated or cross-ontology term).
        ancestors[t].discard(root)
    lcas = dict()
    # A common ancestor t is the LCA when the remaining common ancestors
    # are exactly t's own ancestors, i.e. nothing deeper is shared.
    for i, t1 in enumerate(termNames):
        print(i)  # progress indicator: this pass is quadratic in terms
        for j, t2 in enumerate(termNames):
            if i == j:
                lcas[t1, t2] = t1
            else:
                cas = ancestors[t1].intersection(ancestors[t2])
                for t in cas:
                    others = cas.difference(set([t]))
                    if others == ancestors[t]:
                        lcas[t1, t2] = t
                        break
    with open('lowestCommonAncestors.pkl', 'wb') as f:
        pickle.dump(lcas, f)
    with open('ancestors.pkl', 'wb') as f:
        pickle.dump(ancestors, f)
def calculateGOallseq(ontology):
    """Build, pickle and return the sparse all-ancestors matrix for the
    GO terms of `ontology` ('thalia' organism).

    The dense matrix produced by DFS over the ontology DAG is converted
    to CSR and written to 'go_allancestors<ontology>.pkl'.
    """
    organism = 'thalia'
    dag, mapping = fr.read_ontology_from_file('files/terms.obo')
    _, termNames, _ = fr.getDataLabels(organism, True, ontology, 8000)
    closure = csr_matrix(DFS(dag, termNames, mapping, ontology))
    with open('go_allancestors' + ontology + '.pkl', 'wb') as f:
        pickle.dump(closure, f)
    return closure
def calculateGOseqCAFA(ontology, termNames):
    """Build, pickle and return the sparse parent-child adjacency matrix
    over `termNames` using the CAFA (2017-09) ontology release.

    go[i, j] == 1 iff term j is a direct parent/child of term i within
    `ontology`; the diagonal is 1 and root edges are excluded.
    Side effect: writes the matrix to 'go<ontology>.pkl'.
    """
    [dag, mapping] = fr.read_ontology_from_file('files/terms_09_17.obo')
    if ontology == 'P':
        root = 'GO:0008150'
    elif ontology == 'F':
        root = 'GO:0003674'
    else:
        # print() call form: identical output under Python 2, valid on 3.
        print('wrong ontology')
        sys.exit(1)
    # Hoist the name -> column lookup: termNames.index() per parent made
    # the double loop O(n^2) in the number of terms.
    termIndex = {t: i for i, t in enumerate(termNames)}
    go = np.eye(len(termNames))
    for i, tname in enumerate(termNames):
        dagInd = mapping[tname]
        for pInd in dag[dagInd].parents:
            # BUG FIX: compare the node's ID to root. The original
            # `dag[pInd] == root` compared a node object to a string and
            # was always False, so root edges were never excluded despite
            # the root selection above.
            if dag[pInd].ontology != ontology or dag[pInd].ID == root:
                continue
            parLoc = termIndex[dag[pInd].ID]
            go[i, parLoc] = 1.0
            go[parLoc, i] = 1.0
    go = csr_matrix(go)
    with open('go' + ontology + '.pkl', 'wb') as f:
        pickle.dump(go, f)
    return go
import numpy as np
import pickle
import sys
from utilities import *
from filereader import read_ontology_from_file
from gyros import getParentsCoord, calculateIC

# Script entry point: evaluate GO-term predictions for one species with
# 5-fold cross-validation. Usage: <script> <species>
species = sys.argv[1]

dag, mapping = read_ontology_from_file('../data/go/go-final.obo')

print('Loading go...')
# goLoader presumably comes from the star-import of `utilities` — verify.
[Y, geneNames, termNames, gene2row, term2col] = goLoader(species)

# Fixed seeds so the CV split and any later sampling are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=656391)
np.random.seed(1901273)
thresholds = np.linspace(0, 1.0, 21)

for fold, (train, test) in enumerate(cv.split(Y)):
    print fold
    termNames = np.array(termNames)
    Ytrain = Y[train]
    Ytest = Y[test]
    # Keep only test columns (terms) that have at least one annotation.
    nonempty = np.where(np.sum(Ytest, 0) > 0)[0]
    Ytest = Ytest[:, nonempty]
    # NOTE(review): the loop body appears truncated at this chunk
    # boundary — the per-fold evaluation presumably continues below.
    # NOTE(review): orphaned tail of a function whose `def` lies outside
    # this chunk — it drops the root column from Y before returning.
    Y = np.delete(Y, root, 1)
    return [Y, proteins, termNames]


# Script section: load the CAFA training-protein list and filter the
# experimental annotations down to those proteins.
trainProteins = set()
leaves = set()
with open('../training_data/thalia_proteins.list') as f:
    for line in f:
        # rsplit('\n')[0] strips the trailing newline from each ID.
        trainProteins.add(line.rsplit('\n')[0])
print 'Proteins in training set:', len(trainProteins)

[dag, mapping] = fr.read_ontology_from_file(
    '/tudelft.net/staff-bulk/ewi/insy/DBL/smakrod/go/terms_09_17.obo')

anno = dict()
absentTerms = set()
with open('../training_data/uniprot_sprot_exp.txt') as f:
    for line in f:
        # Tab-separated rows; first field is the protein accession.
        fields = line.split('\t')
        protein = fields[0]
        if protein not in trainProteins:
            continue
        # NOTE(review): loop body truncated at this chunk boundary.