def fromFile(cls, inputFileName, taxons=None, dataset=None):
    """Decides file type and reads relevant data."""
    debug("Reading gene associations file %s...%s" % (
        inputFileName,
        "" if dataset is None else " Dataset size is %d." % len(dataset)))
    #open = gzip.open if inputFileName.endswith(".gz") else __builtins__.open
    if inputFileName.endswith('.pickle') or inputFileName.endswith('.pickle_reserved'):
        # Serialized data = much faster
        with open(inputFileName, 'rb') as f:
            associations, alltaxons = pickle.load(f)
    else:
        associations = defaultdict(set)
        alltaxons = set()
        with open(inputFileName, 'rb') as associationFile:
            for line in associationFile.read().decode('utf8').splitlines():
                if line.startswith('!'):
                    continue
                line = line.split('\t')
                taxon = {int(x.split(':')[1]) for x in line[12].split('|')}
                alltaxons.update(taxon)
                gene = Gene.canonicalName(line[2])
                term = line[4]
                if (taxons is None or taxons.intersection(taxon)) and \
                   (dataset is None or gene in dataset):
                    associations[term].add(gene)
    debug("Finished reading gene associations file %s..." % inputFileName)
    #if dataset is not None:
    #    d = dataset.difference(allgenes)
    #    if d:
    #        debug("Missing genes: %s!!!" % ", ".join(d))
    return cls(associations, alltaxons, dataset)
def serialize(self, fName):
    """Serializes data to a file for faster future use."""
    debug("Serializing gene associations to file %s..." % fName)
    data = (self.associations, self.alltaxons)
    with open(fName, 'wb') as f:
        pickle.dump(data, f)
    debug("Finished serializing gene associations to file %s..." % fName)
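# A minimal round-trip sketch of serialize()/fromFile(). The container class
# name "GeneAssociations" and calling fromFile as a classmethod are assumptions
# for illustration; the pickle payload matches what serialize() writes and what
# the '.pickle' branch of fromFile() loads.
#
#     assoc = GeneAssociations.fromFile('gene_association.goa', taxons={9606})
#     assoc.serialize('gene_association.pickle')
#     # Later runs can skip the slow text parse and load the pickle directly:
#     assoc = GeneAssociations.fromFile('gene_association.pickle')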
def dotExport(self):
    """Export as a graph to the DOT format for graphical presentation."""
    debug("Exporting ontology to dot.")

    def nodename(t):
        return t.replace(":", "")

    for direction in ("parents", "children"):
        dotFile = self.inputFileName.parent / (self.inputFileName.stem + "_" + direction + '.dot')
        with dotFile.open('w') as output:
            print("digraph ontology {", file=output)
            print('overlap="false";', file=output)
            print('root="%s";' % nodename(self.root), file=output)
            for term, props in self.ontology.items():
                name = props['name']
                if not name:
                    continue
                if ',' in name:
                    name = name.replace(', ', r',\n')
                else:
                    name = name.replace('binding', r'\nbinding').replace('activity', r'\nactivity').replace(r' \n', r'\n')
                print('%s [fontsize=8,label="%s"]' % (nodename(term), name), file=output)
            for term, props in self.ontology.items():
                for related in props[direction]:
                    print('%s -> %s' % (nodename(term), nodename(related)), file=output)
            print("}", file=output)
        for fmt in ('ps', 'png'):
            outFile = dotFile.parent / (dotFile.stem + '.' + fmt)
            #print(" ".join(['dot', '-T'+fmt, str(dotFile), '-o', str(outFile)]))
            try:
                subprocess.Popen(['dot', '-T' + fmt, str(dotFile), '-o', str(outFile).replace(".", "_dot.")])
                subprocess.Popen(['twopi', '-T' + fmt, str(dotFile), '-o', str(outFile).replace(".", "_twopi.")])
            except IOError:
                pass
    debug("Finished dot export.")
def completeTest(self, treelikerArgs, processes=1):
    self.generateExamplesUnified()
    bestClassifiers = []
    terms = self.termsByDepth()  # This sorting is needed later in bnet learning
    treeliker = TreeLikerWrapper(self, *treelikerArgs)

    def processTerm(term):
        return term, treeliker.runTermTest(term)

    nets = defaultdict(dict)
    allresults = tuple(parallel_map_dill(processes, processTerm, terms))
    combis = set()
    for term, learned in allresults:
        for clfName, i in learned:
            combis.add((term, clfName))
            #for clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation in folds:
            if clfName in nets[i]:
                net = nets[i][clfName]
            else:
                net = BayesNet(i, clfName, self)
                nets[i][clfName] = net
            net.generateCPD(term)  #, clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation)
    for i, byClf in sorted(nets.items()):
        for clfName, net in byClf.items():
            net.bake()
            net.predict()
    debug("Generating plots.")
    #for term, learned in allresults:
    #    for clfName, folds in learned.items():
    plt.figure(figsize=(6, 12))
    for term, clfName in combis:
        plt.clf()
        termN = self[term]['name']
        cvdir = getTermPath(termN)
        #folds2 = [(nets[i][clfName].nodeAsClf(term),)+f[1:] for i,f in enumerate(folds)]
        s1 = plt.subplot(211, adjustable='box', aspect=1)
        s1.axis('equal')
        #s1.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plotRoc(termN, clfName, termN)
        s2 = plt.subplot(212, adjustable='box', aspect=1)
        s2.axis('equal')
        #s2.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plotRoc(termN, clfName, "Bayes correction",
                clfs=(nets[i][clfName].nodeAsClf(term) for i in range(NUM_FOLDS)))
        #plotRoc("Bayes correction", folds2)
        print(str(cvdir / (clfName.replace(" ", "_") + '_roc.png')))
        plt.savefig(str(cvdir / (clfName.replace(" ", "_") + '_roc.png')))
        plt.savefig(str(cvdir / (clfName.replace(" ", "_") + '_roc.ps')))
    debug("Finished complete test.")
def __init__(self, inputFileName, namespace='molecular_function'):
    """Constructor, reads and parses the ontology OBO file."""
    debug("Reading ontology file %s... " % inputFileName)
    self.root = None
    self.namespace = namespace
    ontology = defaultdict(lambda: defaultdict(list))
    self.inputFileName = Path(inputFileName)
    with self.inputFileName.open() as go:
        terms = groupby(go.read().splitlines(), lambda x: x != '')
        for b, term in terms:
            term = list(term)
            if not b or term[0] != '[Term]':
                continue
            nonlists = ('id', 'def', 'name', 'namespace', 'is_obsolete')
            # Do some deciphering here...
            term = defaultdict(list, [
                (a, [y[1] for y in b][0 if a in nonlists else slice(None)])
                for a, b in groupby([x.split(': ', 1) for x in term[1:]],
                                    lambda x: x[0])])
            # Filter terms by namespace, discard obsolete terms
            if term['namespace'] != namespace or term['is_obsolete'] == 'true':
                continue
            # Decide root node
            if term['name'] == namespace:
                assert self.root is None
                self.root = term['id']
            # Save the term to ontology
            ontology[term['id']]['name'] = term['name'].replace('_', ' ')  # FIXME: if this misbehaves, remove the replace
            for ref in term['is_a']:
                refid, refname = ref.split(' ! ')
                ontology[refid]['children'].append(term['id'])
                ontology[term['id']]['parents'].append(refid)
            # This is used by Bayes nets
            ontology[term['id']]['node'] = defaultdict(dict)  # fold : clfName : node
            ontology[term['id']]['clf'] = defaultdict(dict)  # fold : clfName : Classifier
    self.ontology = {**ontology}
    self.associations = None
    self.geneFactory = GeneFactory()
    debug("Initialized ontology for file %s... " % inputFileName)
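# Sketch of the shape of one parsed entry in self.ontology, using only the
# fields the constructor above actually fills in (the GO ids shown are
# illustrative; GO:0003674 is the molecular_function root):
#
#     self.ontology['GO:0003674'] == {
#         'name': 'molecular function',
#         'children': ['GO:0005488', ...],
#         'parents': [],
#         'node': defaultdict(dict),   # fold -> clfName -> Bayes net node
#         'clf': defaultdict(dict),    # fold -> clfName -> classifier
#     }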
def _runTreeLiker(self, resultPath, batchPath):
    if not rerun and (resultPath / '0' / 'test.arff.bz2').is_file():
        return
    cmd = ["java", "-cp", self.treeliker, "ida.ilp.treeLiker.TreeLikerMain",
           "-batch", batchPath.name]
    if self.maxMemory is not None:
        cmd.insert(1, '-Xmx' + self.maxMemory)
    debug("Starting treeliker for " + resultPath.name)
    if not resultPath.is_dir():
        resultPath.mkdir()
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, bufsize=1,
                          universal_newlines=True,
                          cwd=str(resultPath)) as treelikerProc:
        prev = 0
        i = 1
        for _line in treelikerProc.stdout:
            line = '\r%d : %s' % (i, _line.rstrip())
            if _line.startswith('Fold') and dp.utils.verbosity == 1:
                debug("%s: %s" % (batchPath.name, _line))
            elif dp.utils.verbosity >= 2:
                #debug(line.ljust(prev), end=_line.startswith('Fold'))
                debug(_line.strip())
            prev = len(line)
            if _line.startswith('Processing'):
                i += 1
    if dp.utils.verbosity >= 2:
        sys.stderr.write("\n")
    debug("Finished treeliker for " + resultPath.name)
def runTermTest(self, term):
    term = self.ontology[term]['name']
    debug("Preparing for TreeLiker on term %s." % term)
    resultPath = getTermPath(term)
    batchPath = resultPath / 'batch.treeliker'
    datasetPath = resultPath / 'dataset.txt'
    batchFile = "set(algorithm, relf_grounding_counting)\n" \
                "set(verbosity, %d)\n" \
                "set(output_type, train_test)\n" \
                "set(examples, '%s')\n" \
                "set(template, [%s])\n" \
                "set(use_sampling, true)\n" \
                "set(num_samples, %d)\n" \
                "set(sample_size, %d)\n" \
                "set(covered_class, '%s')\n\n" % (
                    dp.utils.verbosity, datasetPath.name, self.template,
                    self.samples, self.sample_size, term)
    with datasetPath.open() as ds:
        dataSetLen = len([*ds])  # Counts lines
    for i, (train, test) in enumerate(cross_validation.KFold(dataSetLen, NUM_FOLDS)):
        path = resultPath / str(i)
        if not path.is_dir():
            path.mkdir()
        batchFile += "set(output, '%s')\n" \
                     "set(train_set, [%s])\n" \
                     "set(test_set, [%s])\n" \
                     "work(yes)\n" % (
                         path.name,
                         ",".join(map(str, train)),
                         ",".join(map(str, test)))
    with batchPath.open('w') as bf:
        bf.write(batchFile)
    self._runTreeLiker(resultPath, batchPath)
    return learningTest(resultPath)
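# Sketch of the batch.treeliker file this method generates, filled in from the
# format strings above; the concrete numbers, template, and term name are
# illustrative only. One output/train_set/test_set/work block is appended per
# cross-validation fold.
#
#     set(algorithm, relf_grounding_counting)
#     set(verbosity, 1)
#     set(output_type, train_test)
#     set(examples, 'dataset.txt')
#     set(template, [...])
#     set(use_sampling, true)
#     set(num_samples, 100)
#     set(sample_size, 50)
#     set(covered_class, 'protein binding')
#
#     set(output, '0')
#     set(train_set, [1,2,3,...])
#     set(test_set, [0,4,8,...])
#     work(yes)
#     ...repeated for each fold...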
def shrink(self, toSize, minTermAssociations):
    random.seed(0)
    debug("Shrinking associations")
    allgenes = sorted(self.associations[self.ontology.root])
    size = len(allgenes)
    while size > toSize:
        todel = random.choice(allgenes)
        allgenes.remove(todel)
        self.delgene(todel)
        self.ontology.deleteSmallTerms(minTermAssociations)
        allgenes = sorted(self.associations[self.ontology.root])
        size = len(allgenes)
    self.ontology.genes = allgenes
    debug("Finished shrinking associations. Left with %d genes." % (size))
def generateExamplesUnified(self):
    #return
    debug("Generating unified datasets.")
    terms = self.termsByDepth(False)
    #rootname = self.ontology[self.root]['name']
    with ExitStack() as stack:  # Closes all files when exited
        files = [(term, stack.enter_context((getTermPath(term) / 'dataset.txt').open('w')))
                 for term in (self[t]['name'] for t in self.ontology.keys())]  #if term != rootname]
        #for i, geneName in enumerate(self.genes):
        for geneName in self.genes:
            #debug("%d. Writing gene %s." % (i, geneName))
            gene = self.geneFactory.getGene(geneName)
            repg = ", ".join(gene.logicalRepresentation())
            for term, output in files:
                if geneName not in self.associations[term]:
                    term = '~' + term
                e = '"%s" %s' % (term, repg)
                print(e, file=output)
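# Each dataset.txt line is one example: the quoted class label (the term name,
# prefixed with '~' when the gene is not associated with the term) followed by
# the gene's logical representation. A sketch with made-up literals; the actual
# predicates come from Gene.logicalRepresentation():
#
#     "protein binding" res(r1, ala), res(r2, gly), dist(r1, r2, 5)
#     "~protein binding" res(r1, val), res(r2, leu), dist(r1, r2, 7)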
def getSecStr(self):
    name, strand = self.name.split("_")
    name = name.upper()
    try:
        s = tuple([MAPPING[a] for a in self.ss["%s:%s:sequence" % (name, strand)]])
        if s != self.sequence:
            debug("WARNING: Different sequences for %s" % (self.name,))
            debug(str(s))
            debug(str(self.sequence))
        self.secstr = self.ss["%s:%s:secstr" % (name, strand)]
    except KeyError:
        debug("WARNING: missing secondary structure info %s" % (self.name,))
        self.secstr = None
    self.dump()
def fromXML(cls, fullName):
    """Generates protein data from stored XML file. Downloads it if not present."""
    name = Gene.canonicalName(fullName, False)
    fname = cls.xmlname(name)
    sequence = []
    structure = defaultdict(dict)  # structure[seq_id][seq_position] == coordinates
    sequences = defaultdict(dict)
    strand2seq = {}
    # Download data if not stored on disc
    if not fname.is_file():
        debug("Downloading file for gene %s... " % name, False)
        req = Request(cls.XML_URL % name, headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/35.0.1916.47 Safari/537.36'})
        with fname.open('wb') as f:
            f.write(urlopen(req).read())
        #debug("Done.")
    # Parse the file
    debug("Parsing XML for gene %s... " % name)
    tag_stack = []
    elem_stack = []
    pp_seq_id = None
    with gzip.open(str(fname), 'r') as f:
        #fcntl.lockf(f.fileno(), fcntl.LOCK_EX)
        doc = ElementTree.iterparse(f, ('start', 'end'))
        #next(doc)  # Skip root element
        _, root = next(doc)
        XMLNS = root.tag[:root.tag.find("}") + 1]
        path_atom = [XMLNS + 'atom_siteCategory', XMLNS + 'atom_site']
        path_seq = [XMLNS + 'entity_poly_seqCategory', XMLNS + 'entity_poly_seq']
        path_poly = [XMLNS + 'entity_polyCategory', XMLNS + 'entity_poly']
        whitespace = re.compile(r'\s+')
        for event, elem in doc:
            if event == 'start':
                tag_stack.append(elem.tag)
                elem_stack.append(elem)
            elif event == 'end':
                if tag_stack == path_atom:
                    # elem[11] = <PDBx:label_atom_id>, elem[5] = <PDBx:auth_atom_id>
                    if elem[11].text == 'CA' and elem[5].text == 'CA':
                        # elem[13] = <PDBx:label_entity_id>
                        seq_id = elem[13].text
                        # elem[14] = <PDBx:label_seq_id>, elem[{1,2,3}] = <PDBx:Cartn_{xyz}>
                        label_seq_id = elem[14].text
                        if label_seq_id is not None:
                            coordinates = [float(elem[i].text) for i in (1, 2, 3)]
                            seq_pos = int(elem[14].text) - 1
                            structure[seq_id][seq_pos] = tuple(coordinates)
                    elem_stack[-2].remove(elem)
                elif tag_stack == path_poly:
                    seq_type = elem.find(XMLNS + 'type').text  # elem[5] = <PDBx:type>
                    if seq_type.startswith('polypeptide'):
                        seq = elem.find(XMLNS + 'pdbx_seq_one_letter_code_can').text
                        sequence = [MAPPING[c] for c in re.sub(whitespace, '', seq)]
                        seq_id = elem.attrib['entity_id']
                        strand_ids = elem.find(XMLNS + 'pdbx_strand_id').text
                        strand2seq.update({strand_id: seq_id
                                           for strand_id in strand_ids.split(',')})
                        sequences[seq_id] = sequence
                    elem_stack[-2].remove(elem)
                #elif tag_stack == path_seq:
                #    # entity_id="1" should be the polypeptide sequence, I hope, but I'm not sure. FIXME
                #    # There can also be e.g. polynucleotide sequences
                #    if elem.attrib['entity_id'] == '1':
                #        sequence.append(elem.attrib['mon_id'].lower())
                #        elem_stack[-2].remove(elem)
                if tag_stack:
                    tag_stack.pop()
                if elem_stack:
                    elem_stack.pop()
    #atomQuery = "./%satom_siteCategory/%satom_site[%slabel_atom_id='N'][%sauth_atom_id='N'][%slabel_entity_id='1']" % ((XMLNS,) * 5)
    #seqQuery = "./%sentity_poly_seqCategory/%sentity_poly_seq[@entity_id='1']" % ((XMLNS,) * 2)
    #debug("Done.")
    for strand, seq in strand2seq.items():
        yield cls(name + '_' + strand, tuple(sequences[seq]), tuple(structure[seq].items()))
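# Minimal usage sketch. The owning class name and the argument value are
# assumptions; fromXML is a generator that yields one object per polypeptide
# strand found in the PDBx/mmCIF XML, constructed from the strand name, the
# one-letter sequence, and the CA-atom coordinates collected above.
#
#     for chain in SomeProteinClass.fromXML('1abc'):
#         ...  # one object per strand: (name_strand, sequence, structure items)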
def learningTest(cvdir):
    debug("Starting learning in node %s." % cvdir.name)
    classifiers = (
        #TODO: use third dict for params
        #("Bagged SVM", lambda: BaggingClassifier(SVC(C=0.1,probability=True))),
        #("LabelPropagation RBF", LabelPropagation),
        #("LabelSpreading RBF", LabelSpreading),
        #("LabelSpreading-7nn", lambda: LabelSpreading(kernel='knn')),
        #("LabelPropagation-7nn", lambda: LabelPropagation(kernel='knn')),
        #!("AdaBoost-DecisionTree", AdaBoostClassifier),
        #("5-NN", lambda: KNeighborsClassifier(p=1, algorithm='kd_tree')),
        #!("Random Forest", RandomForestClassifier),
        #("SGD", lambda: SGDClassifier(n_iter=100,alpha=0.01,loss="modified_huber")),
        ("RBF SVM C=1", lambda: SVC(shrinking=False, tol=1e-5, probability=True)),
        #("RBF SVM C=0.5", lambda : SVC(C=0.1,probability=True)),
        #("RBF SVM C=2", lambda : SVC(C=10,probability=True)),
        #("RBF SVM C=inf", lambda : SVC(C=numpy.inf,probability=True)),
        #("Linear SVM C=1", lambda: SVC(kernel='linear',probability=True)),
        #("Quadratic SVM C=1", lambda: SVC(kernel='poly', degree=2,probability=True)),
        #("Cubic SVM C=1", lambda: SVC(kernel='poly', degree=3,probability=True)),
    )
    scaler = StandardScaler(copy=False)
    with (cvdir / 'scores.txt').open('w') as output:
        print('Cross Validation results for file term %s:' % cvdir.name, file=output)
        print('Confusion Matrix: [[True positive, False positive], [False negative, True negative]]',
              file=output)
        counts = Counter(re.search(r'^"(.*?)"', l).groups()[0]
                         for l in (cvdir / 'dataset.txt').open() if l.strip() != '')
        print('Dataset entry counts: ' + ", ".join("%s = %d" % item for item in sorted(counts.items())),
              file=output)
        #alldata = defaultdict(list)
        alldata = []
        for i, (g_train, g_test) in zip(range(NUM_FOLDS), getGenes(cvdir)):
            foldDir = cvdir / str(i)
            train = foldDir / 'train.arff'
            test = foldDir / 'test.arff'
            X_train, y_train, _ = readArff(train)
            X_test, y_test, _ = readArff(test)
            assert len(g_train) == len(y_train) and len(g_test) == len(y_test)
            # Preprocess
            #envelope = EllipticEnvelope(contamination=0.05)
            #envelope.fit(X_train)
            #inliers = envelope.predict(X_train) == 1
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            norms = [numpy.linalg.norm(x) for x in X_train]
            #print(numpy.mean(norms))
            #print(numpy.median(norms))
            #print(numpy.max(norms))
            #print(numpy.min(norms))
            #print(numpy.sqrt(numpy.cov(norms)))
            plotPCA(X_train, y_train, X_test, y_test, foldDir)
            splitIndex = round(len(y_test) * VALIDATION_RATIO)
            X_validation, y_validation, g_validation = \
                X_test[:splitIndex], y_test[:splitIndex], g_test[:splitIndex]
            X_test, y_test, g_test = \
                X_test[splitIndex:], y_test[splitIndex:], g_test[splitIndex:]
            assert len(g_train) == len(y_train) and len(g_test) == len(y_test)
            #plotLDA(X_train, X_test, y_train, y_test, foldDir)
            for name, Clf in classifiers:
                alldata.append((name, i))
                serFile = foldDir / (name + ".pickle.bz2")
                if serFile.is_file() and not rerun:
                    continue
                debug("Fitting classifier %s for fold %d of %d in node %s."
                      % (name, i + 1, NUM_FOLDS, cvdir.name))
                if cvdir.name != 'molecular_function':
                    clf = Clf()
                else:
                    clf = DummyClassifier(strategy='constant', constant=POSTIVE_LABEL)
                    clf.decision_function = lambda a: [1.0] * len(a)
                print('Testing the classifier %s:' % name, file=output)
                if clf.__module__.startswith('sklearn.semi_supervised'):
                    y_train = -y_train
                    y_test = -y_test
                pos = (y_train == POSTIVE_LABEL)
                neg = (y_train == NEGATIVE_LABEL)
                posWeight = numpy.sum(neg) / len(y_train)
                negWeight = numpy.sum(pos) / len(y_train)
                sample_weight = posWeight * pos + negWeight * neg
                try:
                    #clf.fit(X_train, y_train)
                    clf.fit(X_train, y_train, sample_weight=sample_weight)
                except TypeError:
                    clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                #compare = numpy.empty((len(y_pred),2))
                #compare[:,0] = y_test
                #compare[:,1] = y_pred
                #print(compare)
                pos = (y_test == POSTIVE_LABEL)
                neg = (y_test == NEGATIVE_LABEL)
                #prec = precision_score(y_test, y_pred, average='weighted')
                #reca = recall_score(y_test, y_pred, average='weighted')
                score = accuracy_score(y_test[pos], y_pred[pos]) * posWeight \
                    + accuracy_score(y_test[neg], y_pred[neg]) * negWeight
                conf = confusion_matrix(y_test, y_pred)
                if len(conf) == 1:
                    conf = numpy.array([[conf[0][0], 0], [0, 0]])
                print("Fold %d score: %.2f, confusion matrix: %s"
                      % (i, score * 100.0, conf.tolist()), file=output)
                clf.conf = conf
                clf.fold = i
                clf.name = name
                clf.cvdir = cvdir
                clf.X_train = X_train
                clf.X_test = X_test
                clf.X_validation = X_validation
                clf.y_train = y_train
                clf.y_test = y_test
                clf.y_validation = y_validation
                clf.g_train = g_train
                clf.g_test = g_test
                clf.g_validation = g_validation
                #alldata[name].append((clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation))
                with bz2.open(str(serFile), 'wb') as ser:
                    ser.write(dill.dumps(clf))
                debug("Finished fitting classifier %s for fold %d of %d in node %s."
                      % (name, i + 1, NUM_FOLDS, cvdir.name))
    #for clfName, folds in alldata.items():
    #    plotRoc(clfName, folds, cvdir)
    #    plotPrc(clfName, folds, cvdir)
    #    # This must stay here, because the functions above may still modify the object
    #    for i, (clf,_,_,_,_,_,_,_,_,_) in enumerate(folds):
    #        foldDir = cvdir / str(i)
    #        with (foldDir / (name.replace(' ','')+'.pickle')).open('wb') as ser:
    #            pickle.dump(clf, ser)
    debug("Finished learning in node %s." % cvdir.name)
    return alldata
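# Worked example of the class-weighted score computed above (a sketch; the
# labels and counts are illustrative, and POSTIVE_LABEL/NEGATIVE_LABEL are the
# two class labels assumed throughout). Each class accuracy is weighted by the
# share of the *other* class in the training fold, so the majority class cannot
# dominate the score.
#
#     y_test  = [+1, +1, +1, -1]   # 3 positives, 1 negative in the test fold
#     y_pred  = [+1, -1, +1, -1]
#     posWeight = 0.25             # share of negatives in the training fold
#     negWeight = 0.75             # share of positives in the training fold
#     acc_pos = 2 / 3              # accuracy restricted to true positives
#     acc_neg = 1 / 1              # accuracy restricted to true negatives
#     score = acc_pos * posWeight + acc_neg * negWeight  # ~= 0.92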