Example #1
    def fromFile(cls, inputFileName, taxons = None, dataset = None):
        """Decides file type and reads relevant data."""
        debug("Reading gene associations file %s...%s" % (inputFileName, ("" if dataset is None else " Dataset size is %d." % len(dataset))))
        #open = gzip.open if inputFileName.endswith(".gz") else __builtins__.open

        if inputFileName.endswith('.pickle') or inputFileName.endswith('.pickle_reserved'):
            # Serialized data loads much faster
            with open(inputFileName, 'rb') as f:
                associations, alltaxons = pickle.load(f)
        else:
            associations = defaultdict(set)
            alltaxons = set()

            with open(inputFileName, 'rb') as associationFile:
                for line in associationFile.read().decode('utf8').splitlines():
                    if line.startswith('!'): continue
                    line = line.split('\t')
                    taxon = {int(x.split(':')[1]) for x in line[12].split('|')}
                    alltaxons.update(taxon)
                    gene = Gene.canonicalName(line[2])
                    term = line[4]
                    if (taxons is None or taxons.intersection(taxon)) and \
                       (dataset is None or gene in dataset):
                        associations[term].add(gene)
        debug("Finished reading gene associations file %s... " % inputFileName)
        #if dataset is not None:
        #    d = dataset.difference(allgenes)
        #    if d:
        #        debug("Missing genes: %s!!!" % ", ".join(d))
        return cls(associations, alltaxons, dataset)
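
The pickle branch above pairs with the serialize method in Example #2: parse the tab-separated GAF file once, serialize the result, and load the pickle on later runs. A minimal usage sketch, assuming fromFile is a classmethod of the project's associations class (called GeneAssociations here purely for illustration):

    # Hypothetical usage; GeneAssociations stands in for whatever class defines fromFile/serialize.
    assoc = GeneAssociations.fromFile("gene_association.goa_human", taxons={9606})
    assoc.serialize("gene_association.goa_human.pickle")                     # one-time cost
    assoc = GeneAssociations.fromFile("gene_association.goa_human.pickle")   # fast path on later runs
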
Example #2
 def serialize(self, fName):
     """Serializes data to a file = faster future use."""
     debug("Serializing gene associations to file %s..." % fName)
     data = (self.associations, self.alltaxons)
     with open(fName, 'wb') as f:
         pickle.dump(data, f)
     debug("Finished serializing gene associations to file %s..." % fName)
Example #3
    def dotExport(self):
        """ Export as a graph to the DOT format for graphical presentation."""
        debug("Exporting ontology to dot.")
        def nodename(t): return t.replace(":", "")
        for direction in ("parents", "children"):
            dotFile = self.inputFileName.parent / (self.inputFileName.stem +"_"+direction+ '.dot')
            with dotFile.open('w') as output:
                print("digraph ontology {", file=output)
                print('overlap="false";', file=output)
                print('root="%s";' % nodename(self.root), file=output)
                for term, props in self.ontology.items():
                    name = props['name']
                    if not name:
                        continue
                    if ',' in name:
                        name = name.replace(', ', r',\n')
                    else:
                        name = name.replace('binding', r'\nbinding').replace('activity',r'\nactivity').replace(r' \n',r'\n')
                    print('%s [fontsize=8,label="%s"]' % (nodename(term), name), file=output)

                for term, props in self.ontology.items():
                    for related in props[direction]:
                        print('%s -> %s' % (nodename(term), nodename(related)), file=output)
                print("}", file=output)
            for fmt in ('ps', 'png'):
                outFile = dotFile.parent / (dotFile.stem + '.' + fmt)
                #print(" ".join(['dot', '-T'+fmt, str(dotFile), '-o', str(outFile)]))
                try:
                    subprocess.Popen(['dot', '-T'+fmt, str(dotFile), '-o', str(outFile).replace(".","_dot.")]) 
                    subprocess.Popen(['twopi', '-T'+fmt, str(dotFile), '-o', str(outFile).replace(".","_twopi.")]) 
                except IOError:
                    pass
        debug("Finished dot export.")
Example #4
    def fromFile(cls, inputFileName, taxons=None, dataset=None):
        """Decides file type and reads relevant data."""
        debug("Reading gene associations file %s...%s" %
              (inputFileName,
               ("" if dataset is None else " Dataset size is %d." %
                len(dataset))))
        #open = gzip.open if inputFileName.endswith(".gz") else __builtins__.open

        if inputFileName.endswith('.pickle') or inputFileName.endswith(
                '.pickle_reserved'):
            # Serialized data loads much faster
            with open(inputFileName, 'rb') as f:
                associations, alltaxons = pickle.load(f)
        else:
            associations = defaultdict(set)
            alltaxons = set()

            with open(inputFileName, 'rb') as associationFile:
                for line in associationFile.read().decode('utf8').splitlines():
                    if line.startswith('!'): continue
                    line = line.split('\t')
                    taxon = {int(x.split(':')[1]) for x in line[12].split('|')}
                    alltaxons.update(taxon)
                    gene = Gene.canonicalName(line[2])
                    term = line[4]
                    if (taxons is None or taxons.intersection(taxon)) and \
                       (dataset is None or gene in dataset):
                        associations[term].add(gene)
        debug("Finished reading gene associations file %s... " % inputFileName)
        #if dataset is not None:
        #    d = dataset.difference(allgenes)
        #    if d:
        #        debug("Missing genes: %s!!!" % ", ".join(d))
        return cls(associations, alltaxons, dataset)
Example #5
    def completeTest(self, treelikerArgs, processes=1):
        self.generateExamplesUnified()
        bestClassifiers = []
        terms = self.termsByDepth()  # This sorting is needed later in bnet learning
        treeliker = TreeLikerWrapper(self, *treelikerArgs)

        def processTerm(term):
            return term, treeliker.runTermTest(term)

        nets = defaultdict(dict)
        allresults = tuple(parallel_map_dill(processes, processTerm, terms))
        combis = set()
        for term, learned in allresults:
            for clfName, i in learned:
                combis.add((term, clfName))
                #for clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation in folds:
                if clfName in nets[i]:
                    net = nets[i][clfName]
                else:
                    net = BayesNet(i, clfName, self)
                    nets[i][clfName] = net

                net.generateCPD(term)  #, clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation)

        for i, byClf in sorted(nets.items()):
            for clfName, net in byClf.items():
                net.bake()
                net.predict()

        debug("Generating plots.")
        #for term, learned in allresults:
        #    for clfName, folds in learned.items():
        plt.figure(figsize=(6, 12))
        for term, clfName in combis:
            plt.clf()
            termN = self[term]['name']
            cvdir = getTermPath(termN)
            #folds2 = [(nets[i][clfName].nodeAsClf(term),)+f[1:] for i,f in enumerate(folds)]

            s1 = plt.subplot(211, adjustable='box', aspect=1)
            s1.axis('equal')
            #s1.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            plotRoc(termN, clfName, termN)
            s2 = plt.subplot(212, adjustable='box', aspect=1)
            s2.axis('equal')
            #s2.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            plotRoc(termN,
                    clfName,
                    "Bayes correction",
                    clfs=(nets[i][clfName].nodeAsClf(term)
                          for i in range(NUM_FOLDS)))
            #plotRoc("Bayes correction", folds2)
            print(str(cvdir / (clfName.replace(" ", "_") + '_roc.png')))
            plt.savefig(str(cvdir / (clfName.replace(" ", "_") + '_roc.png')))
            plt.savefig(str(cvdir / (clfName.replace(" ", "_") + '_roc.ps')))
        debug("Finished complete test.")
Example #6
    def completeTest(self, treelikerArgs, processes = 1):
        self.generateExamplesUnified()
        bestClassifiers = []
        terms = self.termsByDepth() # This sorting is needed later in bnet learning
        treeliker = TreeLikerWrapper(self, *treelikerArgs)
        def processTerm(term):
            return term, treeliker.runTermTest(term)
        
        nets = defaultdict(dict)
        allresults = tuple(parallel_map_dill(processes, processTerm, terms))
        combis = set()
        for term, learned in allresults:
            for clfName, i in learned:
                combis.add((term,clfName))
                #for clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation in folds:
                if clfName in nets[i]:
                    net = nets[i][clfName]
                else:
                    net = BayesNet(i, clfName, self)
                    nets[i][clfName] = net

                net.generateCPD(term)#, clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation) 

        for i, byClf in sorted(nets.items()):
            for clfName, net in byClf.items():
                net.bake()
                net.predict()

        debug("Generating plots.")
        #for term, learned in allresults:
        #    for clfName, folds in learned.items():
        plt.figure(figsize = (6,12))
        for term,clfName in combis:
            plt.clf()
            termN = self[term]['name']
            cvdir = getTermPath(termN)
            #folds2 = [(nets[i][clfName].nodeAsClf(term),)+f[1:] for i,f in enumerate(folds)]
            
            s1 = plt.subplot(211, adjustable='box', aspect=1)
            s1.axis('equal')
            #s1.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            plotRoc(termN, clfName, termN)
            s2 = plt.subplot(212, adjustable='box', aspect=1)
            s2.axis('equal')
            #s2.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            plotRoc(termN, clfName, "Bayes correction",
                    clfs = (nets[i][clfName].nodeAsClf(term) for i in range(NUM_FOLDS) ))
            #plotRoc("Bayes correction", folds2)
            print(str(cvdir/(clfName.replace(" ","_")+'_roc.png')))
            plt.savefig(str(cvdir/(clfName.replace(" ","_")+'_roc.png')))
            plt.savefig(str(cvdir/(clfName.replace(" ","_")+'_roc.ps')))
        debug("Finished complete test.")
Example #7
    def __init__(self, inputFileName, namespace='molecular_function'):
        """Constructor, reads and parses the ontology OBO file."""
        debug("Reading ontology file %s... " % inputFileName)
        self.root = None
        self.namespace = namespace
        ontology = defaultdict(lambda: defaultdict(list))
        self.inputFileName = Path(inputFileName)
        with self.inputFileName.open() as go:
            terms = groupby(go.read().splitlines(), lambda x: x != '')

            for b, term in terms:
                term = list(term)
                if not b or term[0] != '[Term]': continue
                nonlists = ('id', 'def', 'name', 'namespace', 'is_obsolete')
                # Do some deciphering here...
                term = defaultdict(
                    list,
                    [(a, [y[1]
                          for y in b][0 if a in nonlists else slice(None)])
                     for a, b in groupby([x.split(': ', 1)
                                          for x in term[1:]], lambda x: x[0])])

                # Filter terms by namespace, discard obsolete terms
                if term['namespace'] != namespace or term['is_obsolete'] == 'true':
                    continue

                # Decide root node
                if term['name'] == namespace:
                    assert self.root is None
                    self.root = term['id']

                # Save the term to ontology
                ontology[term['id']]['name'] = term['name'].replace('_', ' ')  # FIXME: if this misbehaves, remove the replace
                for ref in term['is_a']:
                    refid, refname = ref.split(' ! ')
                    ontology[refid]['children'].append(term['id'])
                    ontology[term['id']]['parents'].append(refid)
                # This is used by Bayes nets
                ontology[term['id']]['node'] = defaultdict(
                    dict)  # fold : clfName : node
                ontology[term['id']]['clf'] = defaultdict(
                    dict)  # fold : clfName : Classifier

        self.ontology = {**ontology}

        self.associations = None
        self.geneFactory = GeneFactory()
        debug("Initialized ontology for file %s... " % inputFileName)
Example #8
    def _runTreeLiker(self, resultPath, batchPath):
        if not rerun and (resultPath / '0' / 'test.arff.bz2').is_file():
            return
        cmd = ["java", "-cp", self.treeliker, "ida.ilp.treeLiker.TreeLikerMain", "-batch", batchPath.name]
        if self.maxMemory is not None:
            cmd.insert(1, '-Xmx'+self.maxMemory)

        debug("Starting treeliker for "+resultPath.name)
        if not resultPath.is_dir():
            resultPath.mkdir()
        with subprocess.Popen(cmd, stdout = subprocess.PIPE, bufsize = 1, universal_newlines=True, cwd=str(resultPath)) as treelikerProc:
            prev = 0
            i = 1
            for _line in treelikerProc.stdout:
                line = '\r%d : %s' % (i, _line.rstrip())
                if _line.startswith('Fold') and dp.utils.verbosity == 1:
                    debug("%s: %s" % (batchPath.name, _line))
                elif dp.utils.verbosity >= 2:
                    #debug(line.ljust(prev), end=_line.startswith('Fold'))
                    debug(_line.strip())
                prev = len(line)
                if _line.startswith('Processing'):
                    i+=1
        if dp.utils.verbosity >= 2:
            sys.stderr.write("\n")

        debug("Finished treeliker for "+resultPath.name)
Example #9
    def _runTreeLiker(self, resultPath, batchPath):
        if not rerun and (resultPath / '0' / 'test.arff.bz2').is_file():
            return
        cmd = [
            "java", "-cp", self.treeliker, "ida.ilp.treeLiker.TreeLikerMain",
            "-batch", batchPath.name
        ]
        if self.maxMemory is not None:
            cmd.insert(1, '-Xmx' + self.maxMemory)

        debug("Starting treeliker for " + resultPath.name)
        if not resultPath.is_dir():
            resultPath.mkdir()
        with subprocess.Popen(cmd,
                              stdout=subprocess.PIPE,
                              bufsize=1,
                              universal_newlines=True,
                              cwd=str(resultPath)) as treelikerProc:
            prev = 0
            i = 1
            for _line in treelikerProc.stdout:
                line = '\r%d : %s' % (i, _line.rstrip())
                if _line.startswith('Fold') and dp.utils.verbosity == 1:
                    debug("%s: %s" % (batchPath.name, _line))
                elif dp.utils.verbosity >= 2:
                    #debug(line.ljust(prev), end=_line.startswith('Fold'))
                    debug(_line.strip())
                prev = len(line)
                if _line.startswith('Processing'):
                    i += 1
        if dp.utils.verbosity >= 2:
            sys.stderr.write("\n")

        debug("Finished treeliker for " + resultPath.name)
Example #10
    def dotExport(self):
        """ Export as a graph to the DOT format for graphical presentation."""
        debug("Exporting ontology to dot.")

        def nodename(t):
            return t.replace(":", "")

        for direction in ("parents", "children"):
            dotFile = self.inputFileName.parent / (self.inputFileName.stem +
                                                   "_" + direction + '.dot')
            with dotFile.open('w') as output:
                print("digraph ontology {", file=output)
                print('overlap="false";', file=output)
                print('root="%s";' % nodename(self.root), file=output)
                for term, props in self.ontology.items():
                    name = props['name']
                    if not name:
                        continue
                    if ',' in name:
                        name = name.replace(', ', r',\n')
                    else:
                        name = name.replace('binding', r'\nbinding').replace(
                            'activity', r'\nactivity').replace(r' \n', r'\n')
                    print('%s [fontsize=8,label="%s"]' %
                          (nodename(term), name),
                          file=output)

                for term, props in self.ontology.items():
                    for related in props[direction]:
                        print('%s -> %s' % (nodename(term), nodename(related)),
                              file=output)
                print("}", file=output)
            for fmt in ('ps', 'png'):
                outFile = dotFile.parent / (dotFile.stem + '.' + fmt)
                #print(" ".join(['dot', '-T'+fmt, str(dotFile), '-o', str(outFile)]))
                try:
                    subprocess.Popen([
                        'dot', '-T' + fmt,
                        str(dotFile), '-o',
                        str(outFile).replace(".", "_dot.")
                    ])
                    subprocess.Popen([
                        'twopi', '-T' + fmt,
                        str(dotFile), '-o',
                        str(outFile).replace(".", "_twopi.")
                    ])
                except IOError:
                    pass
        debug("Finished dot export.")
Example #11
    def runTermTest(self, term):
        term = self.ontology[term]['name']
        debug("Preparing for TreeLiker on term %s." % term)

        resultPath = getTermPath(term)
        batchPath = resultPath / 'batch.treeliker'

        datasetPath = resultPath / 'dataset.txt'

        batchFile = "set(algorithm, relf_grounding_counting)\n" \
                    "set(verbosity, %d)\n" \
                    "set(output_type, train_test)\n" \
                    "set(examples, '%s')\n" \
                    "set(template, [%s])\n" \
                    "set(use_sampling, true)\n" \
                    "set(num_samples, %d)\n" \
                    "set(sample_size, %d)\n" \
                    "set(covered_class, '%s')\n\n" % (
                        dp.utils.verbosity,
                        datasetPath.name,
                        self.template,
                        self.samples,
                        self.sample_size,
                        term)

        with datasetPath.open() as ds:
            dataSetLen = len([*ds])  # Counts lines

        for i, (train, test) in enumerate(
                cross_validation.KFold(dataSetLen, NUM_FOLDS)):
            path = resultPath / str(i)
            if not path.is_dir():
                path.mkdir()

            batchFile += "set(output, '%s')\n" \
                         "set(train_set, [%s])\n" \
                         "set(test_set, [%s])\n" \
                         "work(yes)\n" % (
                             path.name,
                             ",".join(map(str,train)),
                             ",".join(map(str,test)))

        with batchPath.open('w') as bf:
            bf.write(batchFile)

        self._runTreeLiker(resultPath, batchPath)

        return learningTest(resultPath)
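
For orientation, the generated batch file is a global settings header followed by one output/train_set/test_set/work(yes) block per cross-validation fold. With placeholder values (the verbosity, template, sampling parameters and covered term all come from the wrapper's configuration and are only illustrative here), the text looks roughly like:

    set(algorithm, relf_grounding_counting)
    set(verbosity, 1)
    set(output_type, train_test)
    set(examples, 'dataset.txt')
    set(template, [residue(-a), neighbour(+a, -b)])
    set(use_sampling, true)
    set(num_samples, 100)
    set(sample_size, 50)
    set(covered_class, 'DNA binding')

    set(output, '0')
    set(train_set, [3,4,5,6,7,8,9])
    set(test_set, [0,1,2])
    work(yes)
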
Example #12
    def runTermTest(self, term):
        term = self.ontology[term]['name']
        debug("Preparing for TreeLiker on term %s." % term)

        resultPath = getTermPath(term)
        batchPath = resultPath / 'batch.treeliker'

        datasetPath = resultPath / 'dataset.txt'

        batchFile = "set(algorithm, relf_grounding_counting)\n" \
                    "set(verbosity, %d)\n" \
                    "set(output_type, train_test)\n" \
                    "set(examples, '%s')\n" \
                    "set(template, [%s])\n" \
                    "set(use_sampling, true)\n" \
                    "set(num_samples, %d)\n" \
                    "set(sample_size, %d)\n" \
                    "set(covered_class, '%s')\n\n" % (
                        dp.utils.verbosity,
                        datasetPath.name,
                        self.template,
                        self.samples,
                        self.sample_size,
                        term)

        with datasetPath.open() as ds:
            dataSetLen = len([*ds]) # Counts lines

        for i, (train, test) in enumerate(cross_validation.KFold(dataSetLen, NUM_FOLDS)):
            path = resultPath / str(i)
            if not path.is_dir():
                path.mkdir()
                
            batchFile += "set(output, '%s')\n" \
                         "set(train_set, [%s])\n" \
                         "set(test_set, [%s])\n" \
                         "work(yes)\n" % (
                             path.name,
                             ",".join(map(str,train)),
                             ",".join(map(str,test)))

        with batchPath.open('w') as bf:
            bf.write(batchFile)

        self._runTreeLiker(resultPath, batchPath)

        return learningTest(resultPath)
Example #13
    def shrink(self, toSize, minTermAssociations):
        random.seed(0)
        debug("Shrinking associations")
        allgenes = sorted(self.associations[self.ontology.root])
        size = len(allgenes)
        while size > toSize:
            todel = random.choice(allgenes)
            allgenes.remove(todel)
            self.delgene(todel)
            self.ontology.deleteSmallTerms(minTermAssociations)

            allgenes = sorted(self.associations[self.ontology.root])
            size = len(allgenes)

        self.ontology.genes = allgenes

        debug("Finished shrinking associations. Left with %d genes." % (size))
Example #14
    def __init__(self, inputFileName, namespace = 'molecular_function'):
        """Constructor, reads and parses the ontology OBO file."""
        debug("Reading ontology file %s... " % inputFileName)
        self.root = None
        self.namespace = namespace
        ontology = defaultdict(lambda: defaultdict(list))
        self.inputFileName = Path(inputFileName)
        with self.inputFileName.open() as go:
            terms = groupby(go.read().splitlines(), lambda x: x != '')

            for b, term in terms:
                term = list(term)
                if not b or term[0] != '[Term]': continue
                nonlists = ('id', 'def', 'name', 'namespace', 'is_obsolete')
                # Do some deciphering here...
                term = defaultdict(list, [
                        (a, [y[1] for y in b][0 if a in nonlists else slice(None)])
                        for a,b in groupby(
                            [x.split(': ', 1) for x in term[1:]],
                            lambda x: x[0])])

                # Filter terms by namespace, discard obsolete terms
                if term['namespace'] != namespace or term['is_obsolete'] == 'true':
                    continue

                # Decide root node
                if term['name'] == namespace:
                    assert self.root is None
                    self.root = term['id']

                # Save the term to ontology
                ontology[term['id']]['name'] = term['name'].replace('_', ' ') # FIXME: if this misbehaves, remove the replace
                for ref in term['is_a']:
                    refid, refname = ref.split(' ! ')
                    ontology[refid]['children'].append(term['id'])
                    ontology[term['id']]['parents'].append(refid)
                # This is used by Bayes nets
                ontology[term['id']]['node'] = defaultdict(dict) # fold : clfName : node
                ontology[term['id']]['clf'] = defaultdict(dict) # fold : clfName : Classifier

        self.ontology = {**ontology}

        self.associations = None
        self.geneFactory = GeneFactory()
        debug("Initialized ontology for file %s... " % inputFileName)
Example #15
 def generateExamplesUnified(self):
     #return
     debug("Generating unified datasets.")
     terms = self.termsByDepth(False)
     #rootname = self.ontology[self.root]['name']
     with ExitStack() as stack: # Closes all files when exited
         files = [(term, stack.enter_context((getTermPath(term) / 'dataset.txt').open('w')))
                 for term
                 in (self[t]['name'] for t in self.ontology.keys())
                 ]#if term != rootname]
         #for i, geneName in enumerate(self.genes):
         for geneName in self.genes:
             #debug("%d. Writing gene %s." % (i, geneName))
             gene = self.geneFactory.getGene(geneName)
             repg = ", ".join(gene.logicalRepresentation())
             for term, output in files:
                 if geneName not in self.associations[term]:
                     term = '~'+term
                 e = '"%s" %s' % (term, repg)
                 print(e, file=output)
Example #16
 def getSecStr(self):
     name, strand = self.name.split("_")
     name = name.upper()
     try:
         s = tuple([MAPPING[a] for a in self.ss["%s:%s:sequence" % (name, strand)]])
         if s != self.sequence:
             debug("WARNING: Different sequences for %s" % (self.name,))
             debug(str(s))
             debug(str(self.sequence))
         self.secstr = self.ss["%s:%s:secstr" % (name, strand)]
     except KeyError:
         debug("WARNING: missing secondary structure info %s" % (self.name,))
         self.secstr = None
     self.dump()
Example #17
 def generateExamplesUnified(self):
     #return
     debug("Generating unified datasets.")
     terms = self.termsByDepth(False)
     #rootname = self.ontology[self.root]['name']
     with ExitStack() as stack:  # Closes all files when exited
         files = [
             (term,
              stack.enter_context(
                  (getTermPath(term) / 'dataset.txt').open('w')))
             for term in (self[t]['name'] for t in self.ontology.keys())
         ]  #if term != rootname]
         #for i, geneName in enumerate(self.genes):
         for geneName in self.genes:
             #debug("%d. Writing gene %s." % (i, geneName))
             gene = self.geneFactory.getGene(geneName)
             repg = ", ".join(gene.logicalRepresentation())
             for term, output in files:
                 if geneName not in self.associations[term]:
                     term = '~' + term
                 e = '"%s" %s' % (term, repg)
                 print(e, file=output)
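
Each line of dataset.txt therefore starts with the term name in double quotes, prefixed with ~ when the gene is not associated with the term (a negative example), followed by the gene's comma-separated logical representation. Two illustrative lines (the predicate names are hypothetical; the real facts come from Gene.logicalRepresentation):

    "DNA binding" res(r1, ala), res(r2, gly), dist(r1, r2, 5.0)
    "~DNA binding" res(r1, trp), res(r2, lys), dist(r1, r2, 8.2)
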
Example #18
 def getSecStr(self):
     name, strand = self.name.split("_")
     name = name.upper()
     try:
         s = tuple([
             MAPPING[a] for a in self.ss["%s:%s:sequence" % (name, strand)]
         ])
         if s != self.sequence:
             debug("WARNING: Different sequences for %s" % (self.name, ))
             debug(str(s))
             debug(str(self.sequence))
         self.secstr = self.ss["%s:%s:secstr" % (name, strand)]
     except KeyError:
         debug("WARNING: missing secondary structure info %s" %
               (self.name, ))
         self.secstr = None
     self.dump()
Example #19
    def fromXML(cls, fullName):
        """Generates protein data from stored XML file. Downloads it if not present."""
        name = Gene.canonicalName(fullName, False)
        fname = cls.xmlname(name)
        sequence = []
        structure = defaultdict(
            dict)  # structure[seq_id][seq_position] == coordinates
        sequences = defaultdict(dict)
        strand2seq = {}

        # Download data if not stored on disc
        if not fname.is_file():
            debug("Downloading file for gene %s... " % name, False)
            req = Request(
                cls.XML_URL % name,
                headers={
                    'User-Agent':
                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
                })
            with fname.open('wb') as f:
                f.write(urlopen(req).read())
            #debug("Done.")

        # Parse the file
        debug("Parsing XML for gene %s... " % name)

        tag_stack = []
        elem_stack = []

        pp_seq_id = None
        with gzip.open(str(fname), 'r') as f:
            #fcntl.lockf(f.fileno(), fcntl.LOCK_EX)
            doc = ElementTree.iterparse(f, ('start', 'end'))

            #next(doc) # Skip root element
            _, root = next(doc)
            XMLNS = root.tag[:root.tag.find("}") + 1]

            path_atom = [XMLNS + 'atom_siteCategory', XMLNS + 'atom_site']
            path_seq = [
                XMLNS + 'entity_poly_seqCategory', XMLNS + 'entity_poly_seq'
            ]
            path_poly = [XMLNS + 'entity_polyCategory', XMLNS + 'entity_poly']

            whitespace = re.compile(r'\s+')
            for event, elem in doc:
                if event == 'start':
                    tag_stack.append(elem.tag)
                    elem_stack.append(elem)
                elif event == 'end':
                    if tag_stack == path_atom:
                        # elem[11] = <PDBx:label_atom_id>, elem[5] = <PDBx:auth_atom_id>
                        if elem[11].text == 'CA' and elem[5].text == 'CA':
                            # elem[13] = <PDBx:label_entity_id>
                            seq_id = elem[13].text

                            # elem[14] = <PDBx:label_seq_id>, elem[{1,2,3}] = <PDBx:Cartn_{xyz}>

                            label_seq_id = elem[14].text
                            if label_seq_id is not None:
                                coordinates = [
                                    float(elem[i].text) for i in (1, 2, 3)
                                ]
                                seq_pos = int(elem[14].text) - 1
                                structure[seq_id][seq_pos] = tuple(coordinates)
                        elem_stack[-2].remove(elem)

                    elif tag_stack == path_poly:
                        seq_type = elem.find(XMLNS + 'type').text
                        if seq_type.startswith(
                                'polypeptide'):  # elem[5] = <PDBx:type>
                            seq = elem.find(
                                XMLNS + 'pdbx_seq_one_letter_code_can').text
                            sequence = [
                                MAPPING[c]
                                for c in re.sub(whitespace, '', seq)
                            ]
                            seq_id = elem.attrib['entity_id']
                            strand_ids = elem.find(XMLNS +
                                                   'pdbx_strand_id').text
                            strand2seq.update({
                                strand_id: seq_id
                                for strand_id in strand_ids.split(',')
                            })
                            sequences[seq_id] = sequence
                        elem_stack[-2].remove(elem)

                    #elif tag_stack == path_seq:
                    # entity_id="1" shoud be the polypeptide sequence, I hope, but I'm not sure. FIXME
                    # There can also be e.g. polynucleotide sequences
                    #    if elem.attrib['entity_id'] == '1':
                    #        sequence.append(elem.attrib['mon_id'].lower())
                    #    elem_stack[-2].remove(elem)

                    if tag_stack: tag_stack.pop()
                    if elem_stack: elem_stack.pop()

            #atomQuery = "./%satom_siteCategory/%satom_site[%slabel_atom_id='N'][%sauth_atom_id='N'][%slabel_entity_id='1']" % ((XMLNS,) * 5)
            #seqQuery = "./%sentity_poly_seqCategory/%sentity_poly_seq[@entity_id='1']" % ((XMLNS,) * 2)

        #debug("Done.")
        for strand, seq in strand2seq.items():
            yield cls(name + '_' + strand, tuple(sequences[seq]),
                      tuple(structure[seq].items()))
Example #20
def learningTest(cvdir):
    debug("Starting learning in node %s." % cvdir.name)
    classifiers = (
        #TODO: use third dict for params
        #("Bagged SVM", lambda: BaggingClassifier(SVC(C=0.1,probability=True))),
        #("LabelPropagation RBF", LabelPropagation),
        #("LabelSpreading RBF", LabelSpreading),
        #("LabelSpreading-7nn", lambda: LabelSpreading(kernel='knn')),
        #("LabelPropagation-7nn", lambda: LabelPropagation(kernel='knn')),
        #!("AdaBoost-DecisionTree", AdaBoostClassifier),
        #("5-NN", lambda: KNeighborsClassifier(p=1, algorithm='kd_tree')),
        #!("Random Forest", RandomForestClassifier),
        #("SGD", lambda: SGDClassifier(n_iter=100,alpha=0.01,loss="modified_huber")),
        ("RBF SVM C=1",
         lambda: SVC(shrinking=False, tol=1e-5, probability=True)),
        #("RBF SVM C=0.5", lambda : SVC(C=0.1,probability=True)),
        #("RBF SVM C=2", lambda : SVC(C=10,probability=True)),
        #("RBF SVM C=inf", lambda : SVC(C=numpy.inf,probability=True)),
        #("Linear SVM C=1", lambda: SVC(kernel='linear',probability=True)),
        #("Quadratic SVM C=1", lambda: SVC(kernel='poly', degree=2,probability=True)),
        #("Cubic SVM C=1", lambda: SVC(kernel='poly', degree=3,probability=True)),
    )

    scaler = StandardScaler(copy=False)
    with (cvdir / 'scores.txt').open('w') as output:
        print('Cross Validation results for file term %s:' % cvdir.name,
              file=output)
        print(
            'Confusion Matrix: [[True positive, False positive], [False negative, True negative]]',
            file=output)
        counts = Counter((re.search(r'^"(.*?)"', l).groups()[0]
                          for l in (cvdir / 'dataset.txt').open()
                          if l.strip() != ''))
        print('Dataset entry counts: ' + ", ".join(
            ("%s = %d" % item for item in sorted(counts.items()))),
              file=output)

        #alldata = defaultdict(list)
        alldata = []
        for i, (g_train, g_test) in zip(range(NUM_FOLDS), getGenes(cvdir)):
            foldDir = cvdir / str(i)
            train = foldDir / 'train.arff'
            test = foldDir / 'test.arff'

            X_train, y_train, _ = readArff(train)
            X_test, y_test, _ = readArff(test)
            assert len(g_train) == len(y_train) and len(g_test) == len(y_test)

            # Preprocess
            #envelope = EllipticEnvelope(contamination=0.05)
            #envelope.fit(X_train)
            #inliers = envelope.predict(X_train) == 1

            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            norms = [numpy.linalg.norm(x) for x in X_train]
            #print(numpy.mean(norms))
            #print(numpy.median(norms))
            #print(numpy.max(norms))
            #print(numpy.min(norms))
            #print(numpy.sqrt(numpy.cov(norms)))

            plotPCA(X_train, y_train, X_test, y_test, foldDir)
            splitIndex = round(len(y_test) * VALIDATION_RATIO)
            X_validation, y_validation, g_validation = \
                X_test[:splitIndex], y_test[:splitIndex], g_test[:splitIndex]
            X_test, y_test, g_test = \
                X_test[splitIndex:], y_test[splitIndex:], g_test[splitIndex:]
            assert len(g_train) == len(y_train) and len(g_test) == len(y_test)

            #plotLDA(X_train, X_test, y_train, y_test, foldDir)

            for name, Clf in classifiers:
                alldata.append((name, i))
                serFile = foldDir / (name + ".pickle.bz2")
                if serFile.is_file() and not rerun:
                    continue

                debug("Fitting clasifier %s for fold %d of %d in node %s." %
                      (name, i + 1, NUM_FOLDS, cvdir.name))
                if cvdir.name != 'molecular_function':
                    clf = Clf()
                else:
                    clf = DummyClassifier(strategy='constant',
                                          constant=POSTIVE_LABEL)
                    clf.decision_function = lambda a: [1.0] * len(a)

                print('Testing the classifier %s:' % name, file=output)

                if clf.__module__.startswith('sklearn.semi_supervised'):
                    y_train = -y_train
                    y_test = -y_test

                pos = (y_train == POSTIVE_LABEL)
                neg = (y_train == NEGATIVE_LABEL)
                posWeight = numpy.sum(neg) / len(y_train)
                negWeight = numpy.sum(pos) / len(y_train)
                sample_weight = posWeight * pos + negWeight * neg

                try:
                    #clf.fit(X_train, y_train)
                    clf.fit(X_train, y_train, sample_weight=sample_weight)
                except TypeError:
                    clf.fit(X_train, y_train)

                y_pred = clf.predict(X_test)
                #compare = numpy.empty((len(y_pred),2))
                #compare[:,0] = y_test
                #compare[:,1] = y_pred
                #print(compare)
                pos = (y_test == POSTIVE_LABEL)
                neg = (y_test == NEGATIVE_LABEL)
                #prec = precision_score(y_test, y_pred, average='weighted')
                #reca = recall_score(y_test, y_pred, average='weighted')
                score = accuracy_score(
                    y_test[pos], y_pred[pos]) * posWeight + accuracy_score(
                        y_test[neg], y_pred[neg]) * negWeight
                conf = confusion_matrix(y_test, y_pred)
                if len(conf) == 1:
                    conf = numpy.array([[conf[0][0], 0], [0, 0]])
                print("Fold %d score: %.2f, confusion matrix: %s" %
                      (i, score * 100.0, conf.tolist()),
                      file=output)
                clf.conf = conf
                clf.fold = i
                clf.name = name
                clf.cvdir = cvdir
                clf.X_train = X_train
                clf.X_test = X_test
                clf.X_validation = X_validation
                clf.y_train = y_train
                clf.y_test = y_test
                clf.y_validation = y_validation
                clf.g_train = g_train
                clf.g_test = g_test
                clf.g_validation = g_validation

                #alldata[name].append((clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation))

                with bz2.open(str(serFile), 'wb') as ser:
                    ser.write(dill.dumps(clf))

                debug(
                    "Finished fitting clasifier %s for fold %d of %d in node %s."
                    % (name, i + 1, NUM_FOLDS, cvdir.name))

        #for clfName, folds in alldata.items():
        #    plotRoc(clfName, folds, cvdir)
        #    plotPrc(clfName, folds, cvdir)

        # This must stay here, because the functions above may still modify the object
        #    for i, (clf,_,_,_,_,_,_,_,_,_) in enumerate(folds):
        #        foldDir = cvdir / str(i)
        #        with (foldDir / (name.replace(' ','')+'.pickle')).open('wb') as ser:
        #            pickle.dump(clf, ser)
    debug("Finished learning in node %s." % cvdir.name)
    return alldata
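
The per-fold score above is a class-balanced accuracy: the accuracy on each class is weighted by the prevalence of the opposite class in the training labels, so correct predictions on the rarer class count for more, and the same weights are reused as per-sample weights when fitting. A minimal standalone sketch of that computation, with made-up labels and assumed values for the POSTIVE_LABEL/NEGATIVE_LABEL constants defined elsewhere in the project:

    import numpy
    from sklearn.metrics import accuracy_score

    POSTIVE_LABEL, NEGATIVE_LABEL = 1, -1   # assumption: the real constants live elsewhere in the project
    y_train = numpy.array([1, -1, -1, -1, 1, -1])
    y_test = numpy.array([1, 1, -1, -1, -1, -1])
    y_pred = numpy.array([1, -1, -1, -1, 1, -1])

    posWeight = numpy.sum(y_train == NEGATIVE_LABEL) / len(y_train)  # share of negatives in the training set
    negWeight = numpy.sum(y_train == POSTIVE_LABEL) / len(y_train)   # share of positives in the training set

    pos = (y_test == POSTIVE_LABEL)
    neg = (y_test == NEGATIVE_LABEL)
    score = (accuracy_score(y_test[pos], y_pred[pos]) * posWeight
             + accuracy_score(y_test[neg], y_pred[neg]) * negWeight)
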
Example #21
    def fromXML(cls, fullName):
        """Generates protein data from stored XML file. Downloads it if not present."""
        name = Gene.canonicalName(fullName, False)
        fname = cls.xmlname(name)
        sequence = []
        structure = defaultdict(dict)  # structure[seq_id][seq_position] == coordinates
        sequences = defaultdict(dict)
        strand2seq = {}

        # Download data if not stored on disc
        if not fname.is_file():
            debug("Downloading file for gene %s... " % name, False)
            req = Request(
                cls.XML_URL % name,
                headers={
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"
                },
            )
            with fname.open("wb") as f:
                f.write(urlopen(req).read())
            # debug("Done.")

        # Parse the file
        debug("Parsing XML for gene %s... " % name)

        tag_stack = []
        elem_stack = []

        pp_seq_id = None
        with gzip.open(str(fname), "r") as f:
            # fcntl.lockf(f.fileno(), fcntl.LOCK_EX)
            doc = ElementTree.iterparse(f, ("start", "end"))

            # next(doc) # Skip root element
            _, root = next(doc)
            XMLNS = root.tag[: root.tag.find("}") + 1]

            path_atom = [XMLNS + "atom_siteCategory", XMLNS + "atom_site"]
            path_seq = [XMLNS + "entity_poly_seqCategory", XMLNS + "entity_poly_seq"]
            path_poly = [XMLNS + "entity_polyCategory", XMLNS + "entity_poly"]

            whitespace = re.compile(r"\s+")
            for event, elem in doc:
                if event == "start":
                    tag_stack.append(elem.tag)
                    elem_stack.append(elem)
                elif event == "end":
                    if tag_stack == path_atom:
                        # elem[11] = <PDBx:label_atom_id>, elem[5] = <PDBx:auth_atom_id>
                        if elem[11].text == "CA" and elem[5].text == "CA":
                            # elem[13] = <PDBx:label_entity_id>
                            seq_id = elem[13].text

                            # elem[14] = <PDBx:label_seq_id>, elem[{1,2,3}] = <PDBx:Cartn_{xyz}>

                            label_seq_id = elem[14].text
                            if label_seq_id is not None:
                                coordinates = [float(elem[i].text) for i in (1, 2, 3)]
                                seq_pos = int(elem[14].text) - 1
                                structure[seq_id][seq_pos] = tuple(coordinates)
                        elem_stack[-2].remove(elem)

                    elif tag_stack == path_poly:
                        seq_type = elem.find(XMLNS + "type").text
                        if seq_type.startswith("polypeptide"):  # elem[5] = <PDBx:type>
                            seq = elem.find(XMLNS + "pdbx_seq_one_letter_code_can").text
                            sequence = [MAPPING[c] for c in re.sub(whitespace, "", seq)]
                            seq_id = elem.attrib["entity_id"]
                            strand_ids = elem.find(XMLNS + "pdbx_strand_id").text
                            strand2seq.update({strand_id: seq_id for strand_id in strand_ids.split(",")})
                            sequences[seq_id] = sequence
                        elem_stack[-2].remove(elem)

                    # elif tag_stack == path_seq:
                    # entity_id="1" shoud be the polypeptide sequence, I hope, but I'm not sure. FIXME
                    # There can also be e.g. polynucleotide sequences
                    #    if elem.attrib['entity_id'] == '1':
                    #        sequence.append(elem.attrib['mon_id'].lower())
                    #    elem_stack[-2].remove(elem)

                    if tag_stack:
                        tag_stack.pop()
                    if elem_stack:
                        elem_stack.pop()

            # atomQuery = "./%satom_siteCategory/%satom_site[%slabel_atom_id='N'][%sauth_atom_id='N'][%slabel_entity_id='1']" % ((XMLNS,) * 5)
            # seqQuery = "./%sentity_poly_seqCategory/%sentity_poly_seq[@entity_id='1']" % ((XMLNS,) * 2)

        # debug("Done.")
        for strand, seq in strand2seq.items():
            yield cls(name + "_" + strand, tuple(sequences[seq]), tuple(structure[seq].items()))
Example #22
def learningTest(cvdir):
    debug("Starting learning in node %s." % cvdir.name) 
    classifiers = (
            #TODO: use third dict for params
            #("Bagged SVM", lambda: BaggingClassifier(SVC(C=0.1,probability=True))),
            #("LabelPropagation RBF", LabelPropagation),
            #("LabelSpreading RBF", LabelSpreading),
            #("LabelSpreading-7nn", lambda: LabelSpreading(kernel='knn')),
            #("LabelPropagation-7nn", lambda: LabelPropagation(kernel='knn')),
            #!("AdaBoost-DecisionTree", AdaBoostClassifier),
            #("5-NN", lambda: KNeighborsClassifier(p=1, algorithm='kd_tree')),
            #!("Random Forest", RandomForestClassifier),
            #("SGD", lambda: SGDClassifier(n_iter=100,alpha=0.01,loss="modified_huber")),
            ("RBF SVM C=1", lambda: SVC(shrinking=False, tol=1e-5,probability=True)),
            #("RBF SVM C=0.5", lambda : SVC(C=0.1,probability=True)),
            #("RBF SVM C=2", lambda : SVC(C=10,probability=True)),
            #("RBF SVM C=inf", lambda : SVC(C=numpy.inf,probability=True)),
            #("Linear SVM C=1", lambda: SVC(kernel='linear',probability=True)),
            #("Quadratic SVM C=1", lambda: SVC(kernel='poly', degree=2,probability=True)),
            #("Cubic SVM C=1", lambda: SVC(kernel='poly', degree=3,probability=True)),
            )

    scaler = StandardScaler(copy=False)
    with (cvdir / 'scores.txt').open('w') as output:
        print('Cross Validation results for file term %s:' % cvdir.name, file=output)
        print('Confusion Matrix: [[True positive, False positive], [False negative, True negative]]', file=output)
        counts = Counter(
                   (re.search(r'^"(.*?)"', l).groups()[0]
                    for l in (cvdir / 'dataset.txt').open()
                    if l.strip() != ''))
        print('Dataset entry counts: ' +
                ", ".join(
                    ("%s = %d" % item
                     for item
                     in sorted(counts.items()))),
            file = output)

        #alldata = defaultdict(list)
        alldata = []
        for i, (g_train, g_test) in zip(range(NUM_FOLDS), getGenes(cvdir)) :
            foldDir = cvdir / str(i)
            train = foldDir / 'train.arff'
            test = foldDir / 'test.arff'


            X_train, y_train, _ = readArff(train)
            X_test , y_test,  _ = readArff(test)
            assert len(g_train) == len(y_train) and len(g_test) == len(y_test)

            # Preprocess
            #envelope = EllipticEnvelope(contamination=0.05)
            #envelope.fit(X_train)
            #inliers = envelope.predict(X_train) == 1

            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            norms = [numpy.linalg.norm(x) for x in X_train]
            #print(numpy.mean(norms))
            #print(numpy.median(norms))
            #print(numpy.max(norms))
            #print(numpy.min(norms))
            #print(numpy.sqrt(numpy.cov(norms)))

            plotPCA(X_train, y_train, X_test, y_test, foldDir)
            splitIndex = round(len(y_test)*VALIDATION_RATIO)
            X_validation, y_validation, g_validation = X_test[:splitIndex], y_test[:splitIndex], g_test[:splitIndex]
            X_test, y_test, g_test = X_test[splitIndex:], y_test[splitIndex:], g_test[splitIndex:]
            assert len(g_train) == len(y_train) and len(g_test) == len(y_test)

            #plotLDA(X_train, X_test, y_train, y_test, foldDir)

            for name, Clf in classifiers:
                alldata.append(( name, i))
                serFile = foldDir / (name + ".pickle.bz2")
                if serFile.is_file() and not rerun:
                    continue

                debug("Fitting clasifier %s for fold %d of %d in node %s." % (name, i+1, NUM_FOLDS, cvdir.name))
                if cvdir.name != 'molecular_function':
                    clf = Clf()
                else:
                    clf = DummyClassifier(strategy='constant', constant=POSTIVE_LABEL)
                    clf.decision_function = lambda a: [1.0]*len(a)

                print('Testing the classifier %s:' % name, file=output)
       
                if clf.__module__.startswith('sklearn.semi_supervised'):
                    y_train = - y_train
                    y_test = - y_test

                pos = (y_train == POSTIVE_LABEL)
                neg = (y_train == NEGATIVE_LABEL)
                posWeight = numpy.sum(neg) / len(y_train)
                negWeight = numpy.sum(pos) / len(y_train)
                sample_weight = posWeight*pos + negWeight*neg

                try:
                    #clf.fit(X_train, y_train)
                    clf.fit(X_train, y_train, sample_weight = sample_weight)
                except TypeError:
                    clf.fit(X_train, y_train)

                y_pred = clf.predict(X_test)
                #compare = numpy.empty((len(y_pred),2))
                #compare[:,0] = y_test
                #compare[:,1] = y_pred
                #print(compare)
                pos = (y_test == POSTIVE_LABEL)
                neg = (y_test == NEGATIVE_LABEL)
                #prec = precision_score(y_test, y_pred, average='weighted')
                #reca = recall_score(y_test, y_pred, average='weighted')
                score = accuracy_score(y_test[pos], y_pred[pos])*posWeight + accuracy_score(y_test[neg], y_pred[neg])*negWeight
                conf = confusion_matrix(y_test, y_pred)
                if len(conf) == 1:
                    conf = numpy.array([[conf[0][0], 0],[0,0]])
                print("Fold %d score: %.2f, confusion matrix: %s" % (i, score*100.0, conf.tolist()), file=output)
                clf.conf = conf
                clf.fold = i
                clf.name = name
                clf.cvdir = cvdir
                clf.X_train = X_train
                clf.X_test = X_test
                clf.X_validation = X_validation
                clf.y_train = y_train
                clf.y_test = y_test
                clf.y_validation = y_validation
                clf.g_train = g_train
                clf.g_test = g_test
                clf.g_validation = g_validation

                #alldata[name].append((clf, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation))

                with bz2.open(str(serFile), 'wb') as ser:
                    ser.write(dill.dumps(clf))
                
                debug("Finished fitting clasifier %s for fold %d of %d in node %s." % (name, i+1, NUM_FOLDS, cvdir.name))

       
        #for clfName, folds in alldata.items():
        #    plotRoc(clfName, folds, cvdir)
        #    plotPrc(clfName, folds, cvdir)

            # This must stay here, because the functions above may still modify the object
        #    for i, (clf,_,_,_,_,_,_,_,_,_) in enumerate(folds):
        #        foldDir = cvdir / str(i)
        #        with (foldDir / (name.replace(' ','')+'.pickle')).open('wb') as ser:
        #            pickle.dump(clf, ser)
    debug("Finished learning in node %s." % cvdir.name) 
    return alldata