Example #1
0
    def run(self):
        """Write an indexed HMM file containing all marker gene models.

        Collects the marker genes of every taxonomic-specific and
        lineage-specific marker set, adds the genes PFAM reports as being in
        the same clan, and fetches the corresponding models from self.hmms
        into self.outputHMMs. The output file is then (re)indexed.

        Reads self.hmms (source HMM file) and self.outputHMMs (destination).
        """
        # read all taxonomic-specific marker genes
        print('Reading taxonomic-specific marker genes.')
        taxonomicMarkers = set()
        taxonParser = TaxonParser()
        taxonMarkerSets = taxonParser.readMarkerSets()
        for taxa in taxonMarkerSets.values():
            for markerSet in taxa.values():
                # update in place rather than rebuilding the set each pass
                taxonomicMarkers.update(markerSet.getMarkerGenes())

        print('  Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

        # read all lineage-specific marker genes
        print('Reading lineage-specific marker genes.')
        lineageMarkers = set()
        treeParser = TreeParser()
        uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
        for uniqueId, d in uniqueIdToLineageStatistics.items():
            # NOTE(review): eval() executes whatever text is stored in the
            # 'marker set' field; this is only safe while the node metadata
            # comes from a trusted source. Left unchanged because the stored
            # format is not visible here -- confirm before replacing it.
            markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']),
                                  eval(d['marker set']))
            lineageMarkers.update(markerSet.getMarkerGenes())

        print('  Lineage-specific marker genes: %d' % len(lineageMarkers))

        # gather all marker genes
        markerGenes = taxonomicMarkers.union(lineageMarkers)
        print('  Total marker genes: %d' % len(markerGenes))

        # get genes from same clan as marker genes
        print('Gathering HMMs from the same clan as marker genes.')
        pfam = PFAM()
        genesInSameClan = pfam.genesInSameClan(markerGenes)
        allMarkers = markerGenes.union(genesInSameClan)

        keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
        try:
            # create file with all model accession numbers
            with open(keyFile, 'w') as fout:
                for modelAcc in allMarkers:
                    fout.write(modelAcc + '\n')

            # fetch specified models
            HF = HMMERRunner(mode='fetch')
            HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

            # index the HMM file; a stale index is removed first
            if os.path.exists(self.outputHMMs + '.ssi'):
                os.remove(self.outputHMMs + '.ssi')
            HF.index(self.outputHMMs)
        finally:
            # remove key file even when fetching or indexing fails
            if os.path.exists(keyFile):
                os.remove(keyFile)
Example #2
0
 def run(self):
     """Write an indexed HMM file containing all marker gene models.

     Collects the marker genes of every taxonomic-specific and
     lineage-specific marker set, adds the genes PFAM reports as being in
     the same clan, and fetches the corresponding models from self.hmms
     into self.outputHMMs. The output file is then (re)indexed.

     Reads self.hmms (source HMM file) and self.outputHMMs (destination).
     The statements below are written so they parse under both Python 2
     and Python 3.
     """
     # read all taxonomic-specific marker genes
     print('Reading taxonomic-specific marker genes.')
     taxonomicMarkers = set()
     taxonParser = TaxonParser()
     taxonMarkerSets = taxonParser.readMarkerSets()
     for taxa in taxonMarkerSets.values():
         for markerSet in taxa.values():
             # update in place rather than rebuilding the set each pass
             taxonomicMarkers.update(markerSet.getMarkerGenes())

     print('  Taxonomic-specific marker genes: %d' % len(taxonomicMarkers))

     # read all lineage-specific marker genes
     print('Reading lineage-specific marker genes.')
     lineageMarkers = set()
     treeParser = TreeParser()
     uniqueIdToLineageStatistics = treeParser.readNodeMetadata()
     for uniqueId, d in uniqueIdToLineageStatistics.items():
         # NOTE(review): eval() executes whatever text is stored in the
         # 'marker set' field; this is only safe while the node metadata
         # comes from a trusted source. Left unchanged because the stored
         # format is not visible here -- confirm before replacing it.
         markerSet = MarkerSet(uniqueId, 'NA', int(d['# genomes']), eval(d['marker set']))
         lineageMarkers.update(markerSet.getMarkerGenes())

     print('  Lineage-specific marker genes: %d' % len(lineageMarkers))

     # gather all marker genes
     markerGenes = taxonomicMarkers.union(lineageMarkers)
     print('  Total marker genes: %d' % len(markerGenes))

     # get genes from same clan as marker genes
     print('Gathering HMMs from the same clan as marker genes.')
     pfam = PFAM()
     genesInSameClan = pfam.genesInSameClan(markerGenes)
     allMarkers = markerGenes.union(genesInSameClan)

     keyFile = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()))
     try:
         # create file with all model accession numbers
         with open(keyFile, 'w') as fout:
             for modelAcc in allMarkers:
                 fout.write(modelAcc + '\n')

         # fetch specified models
         HF = HMMERRunner(mode='fetch')
         HF.fetch(self.hmms, keyFile, self.outputHMMs, bKeyFile=True)

         # index the HMM file; a stale index is removed first
         if os.path.exists(self.outputHMMs + '.ssi'):
             os.remove(self.outputHMMs + '.ssi')
         HF.index(self.outputHMMs)
     finally:
         # remove key file even when fetching or indexing fails
         if os.path.exists(keyFile):
             os.remove(keyFile)
Example #3
0
 def __init__(self):
     """Instantiate PFAM and keep it on the object as self.pfam."""
     self.pfam = PFAM()
Example #4
0
class EvaluateHMMs(object):
    """Compare marker gene hits listed by IMG with hits found by hmmsearch.

    For each genome the PFAM and TIGRFAM marker HMMs are run with hmmsearch,
    the resulting tables are parsed, and per-marker hit counts are compared
    against the IMG annotation tables. Intermediate files are read from and
    written to ./hmm_test/.
    """

    def __init__(self):
        """Instantiate the PFAM helper used to filter hits from the same clan."""
        self.pfam = PFAM()

    def translateSixFramesGenerator(self, bioseq, table=11):
        """Yield the six frame translations of a nucleotide sequence.

        Frames alternate between the forward strand and the reverse
        complement for offsets 0, 1 and 2.

        bioseq -- sequence object supporting reverse_complement(), slicing
                  and translate(table)
        table  -- translation table handed to translate() (default 11)
        """
        revseq = bioseq.reverse_complement()
        for offset in range(3):
            yield bioseq[offset:].translate(table)
            yield revseq[offset:].translate(table)

    def translateSixFrames(self, genomeId):
        """Write the six frame translation of every contig of a genome.

        Reads <IMG.genomeDir><genomeId>/<genomeId>.fna and writes
        ./hmm_test/<genomeId>.six_frames.fna. Each record id is the contig
        id followed by '_' and the frame number (1-6).
        """
        print('  Creating six frame translation.')
        outPath = './hmm_test/' + genomeId + '.six_frames.fna'
        contigPath = IMG.genomeDir + genomeId + '/' + genomeId + '.fna'

        # both handles are closed even if parsing or writing fails
        with open(outPath, 'w') as outFile, open(contigPath) as contigFile:
            for rec in SeqIO.parse(contigFile, 'fasta'):
                frames = self.translateSixFramesGenerator(rec.seq)
                for frameNumber, frame in enumerate(frames, 1):
                    SeqIO.write(
                        SeqRecord(frame,
                                  description='',
                                  id=rec.id + "_" + str(frameNumber)),
                        outFile, 'fasta')

    def _runHmmsearch(self, cutoff, tableFile, hmmFile, seqFile, outFile):
        """Run hmmsearch through the shell with the options shared by all runs.

        cutoff    -- threshold option, '--cut_ga' or '--cut_nc'
        tableFile -- destination of the --domtblout table
        hmmFile   -- HMM file to search with
        seqFile   -- sequence file to search
        outFile   -- file receiving hmmsearch's standard output
        """
        # NOTE(review): the command is assembled by concatenation and run by
        # the shell, so the paths must not contain shell metacharacters.
        os.system('hmmsearch --notextw --noali --cpu 0 ' + cutoff +
                  ' --domtblout ' + tableFile + ' ' + hmmFile + ' ' +
                  seqFile + ' > ' + outFile)

    def runPFAM(self, genomeId):
        """Search the genome's dereplicated genes with the PFAM marker HMMs."""
        print('  Running PFAM HMMs.')
        # NOTE(review): PFAM searches '.genes.derep.faa' while TIGRFAM
        # searches '.genes.faa' -- confirm the difference is intended.
        self._runHmmsearch(
            '--cut_ga',
            './hmm_test/' + genomeId + '.pfam.table.txt',
            './hmm/pfam_markers.hmm',
            IMG.genomeDir + genomeId + '/' + genomeId + '.genes.derep.faa',
            './hmm_test/' + genomeId + '.pfam.tsv')

    def runTIGRFAM(self, genomeId):
        """Search the genome's genes with the TIGRFAM marker HMMs."""
        print('  Running TIGRFAM HMMs.')
        self._runHmmsearch(
            '--cut_nc',
            './hmm_test/' + genomeId + '.tigr.table.txt',
            './hmm/tigr_markers.hmm',
            IMG.genomeDir + genomeId + '/' + genomeId + '.genes.faa',
            './hmm_test/' + genomeId + '.tigr.tsv')

    def runPFAM_SixFrames(self, genomeId):
        """Search the six frame translation with the PFAM marker HMMs."""
        print('  Running PFAM HMMs on six frame translation.')
        self._runHmmsearch(
            '--cut_ga',
            './hmm_test/' + genomeId + '.pfam.table.six_frames.txt',
            './hmm/pfam_markers.hmm',
            './hmm_test/' + genomeId + '.six_frames.fna',
            './hmm_test/' + genomeId + '.pfam.six_frames.tsv')

    def runTIGRFAM_SixFrames(self, genomeId):
        """Search the six frame translation with the TIGRFAM marker HMMs."""
        print('  Running TIGRFAM HMMs on six frame translation.')
        self._runHmmsearch(
            '--cut_nc',
            './hmm_test/' + genomeId + '.tigr.table.six_frames.txt',
            './hmm/tigr_markers.hmm',
            './hmm_test/' + genomeId + '.six_frames.fna',
            './hmm_test/' + genomeId + '.tigr.six_frames.tsv')

    def readImgTable(self, genomeId, markers, extension, clusterIdIndex):
        """Read marker hits from a tab-separated IMG annotation table.

        genomeId       -- genome whose table is read
        markers        -- collection of cluster ids of interest
        extension      -- file name suffix appended to the genome id
        clusterIdIndex -- column holding the cluster id

        Returns a dict mapping each cluster id found in markers to the set
        of first-column values of the rows it appears in.
        """
        hits = {}
        tablePath = IMG.genomeDir + genomeId + '/' + genomeId + extension
        with open(tablePath) as fin:
            next(fin, None)  # skip header line
            for line in fin:
                if not line.strip():
                    continue

                # drop the line terminator so a cluster id stored in the
                # last column still compares equal to the marker ids
                lineSplit = line.rstrip('\r\n').split('\t')
                clusterId = lineSplit[clusterIdIndex]

                if clusterId in markers:
                    hits.setdefault(clusterId, set()).add(lineSplit[0])

        return hits

    def readHmmTable(self, genomeId, extension):
        """Parse a hmmsearch --domtblout table from ./hmm_test/.

        PFAM accessions are rewritten from 'PFxxxxx.version' to 'pfamxxxxx';
        other accessions are kept as they are.

        Returns a dict mapping each target name (column 0) to a list of
        [clusterId, i-evalue, start, end] entries, one per table row.
        """
        hits = {}
        with open('./hmm_test/' + genomeId + extension) as fin:
            for line in fin:
                if line.startswith('#') or not line.strip():
                    continue

                lineSplit = line.split()

                clusterId = lineSplit[4]
                if 'PF' in clusterId:
                    clusterId = clusterId.replace('PF', 'pfam')
                    # only strip a version suffix when one is present;
                    # rfind() returns -1 otherwise and would eat a character
                    if '.' in clusterId:
                        clusterId = clusterId[0:clusterId.rfind('.')]

                geneId = lineSplit[0]
                evalue = float(lineSplit[12])  # i-evalue
                start = int(lineSplit[17])  # seq. start
                end = int(lineSplit[18])  # seq. end
                hits.setdefault(geneId, []).append(
                    [clusterId, evalue, start, end])

        return hits

    def _hitsByCluster(self, hmmHits):
        """Re-index readHmmTable() output as cluster id -> set of gene ids."""
        byCluster = {}
        for geneId, hits in hmmHits.items():
            for h in hits:
                byCluster.setdefault(h[0], set()).add(geneId)
        return byCluster

    def _tallyHits(self, markers, imgHits, hmmHits, bSymmetricDiff):
        """Summarise how IMG and HMM hits differ over a set of markers.

        imgHits and hmmHits map marker ids to collections of gene ids. With
        bSymmetricDiff the difference counts genes found by only one source;
        otherwise it is the absolute difference in hit counts.

        Returns (diff, totalImgHits, totalHmmHits, imgAdditions,
        hmmAdditions).
        """
        diff = 0
        totalImgHits = 0
        totalHmmHits = 0
        imgAdditions = 0
        hmmAdditions = 0
        for markerId in markers:
            imgGenes = imgHits.get(markerId, set())
            hmmGenes = hmmHits.get(markerId, set())

            surplus = len(imgGenes) - len(hmmGenes)
            if surplus > 0:
                imgAdditions += surplus
            elif surplus < 0:
                hmmAdditions += -surplus

            if bSymmetricDiff:
                diff += len(imgGenes.symmetric_difference(hmmGenes))
            else:
                diff += abs(surplus)

            totalImgHits += len(imgGenes)
            totalHmmHits += len(hmmGenes)

        return diff, totalImgHits, totalHmmHits, imgAdditions, hmmAdditions

    def _reportTally(self, family, consoleDiffLabel, tally, fout, bFinal):
        """Print a tally and write it to fout.

        The console line names the difference with consoleDiffLabel, while
        the line written to fout always uses 'diff'. When bFinal is set a
        blank line follows in both outputs.
        """
        fields = ', IMG hits, HMM hits, IMG additional, HMM additional): '
        counts = ', '.join([str(value) for value in tally])

        print('  ' + family + ' (' + consoleDiffLabel + fields + counts)
        fout.write('  ' + family + ' (diff' + fields + counts + '\n')
        if bFinal:
            print('')
            fout.write('\n')

    def compareResults(self, genomeId, pfamMarkers, tigrMarkers, fout):
        """Compare IMG marker hits with hmmsearch hits on the called genes.

        genomeId    -- genome to evaluate; runPFAM() and runTIGRFAM() must
                       already have produced its tables
        pfamMarkers -- PFAM marker ids
        tigrMarkers -- TIGRFAM marker ids
        fout        -- open, writable file receiving the summary lines
        """
        # get marker hits to genes as determined by IMG
        imgPfamHits = self.readImgTable(genomeId, pfamMarkers,
                                        '.pfam.tab.txt', 8)
        imgTigrHits = self.readImgTable(genomeId, tigrMarkers,
                                        '.tigrfam.tab.txt', 6)

        print('  PFAM IMG hits: ' + str(len(imgPfamHits)))
        # label corrected: this count is the TIGR hits reported by IMG
        print('  TIGR IMG hits: ' + str(len(imgTigrHits)))

        # get marker hits to genes as determined by HMMs
        hmmPfamHits = self.readHmmTable(genomeId, '.pfam.table.txt')
        hmmTigrHits = self.readHmmTable(genomeId, '.tigr.table.txt')

        print('  PFAM HMM hits: ' + str(len(hmmPfamHits)))
        print('  TIGR HMM hits: ' + str(len(hmmTigrHits)))

        # remove overlapping PFAM hits from the same clan
        print('  Filtering PFAM hits from the same clan.')
        filteredHmmPfamHits = self.pfam.filterHitsFromSameClan(
            hmmPfamHits, pfamMarkers)
        print('  Filtered PFAM hits: ' + str(len(filteredHmmPfamHits)))

        # reform TIGR hits so dictionary is indexed by TIGR ids
        reformedHmmTigrHits = self._hitsByCluster(hmmTigrHits)

        # compare results
        pfamTally = self._tallyHits(pfamMarkers, imgPfamHits,
                                    filteredHmmPfamHits, True)
        self._reportTally('PFAM', 'symmetric diff', pfamTally, fout, False)

        tigrTally = self._tallyHits(tigrMarkers, imgTigrHits,
                                    reformedHmmTigrHits, True)
        self._reportTally('TIGR', 'symmetric diff', tigrTally, fout, True)

    def compareSixFrameResults(self, genomeId, pfamMarkers, tigrMarkers,
                               fout):
        """Compare IMG marker hits with hmmsearch hits on six frame translations.

        Only hit counts are compared, not the identity of the hit sequences.
        Parameters are the same as for compareResults().
        """
        # get marker hits to genes as determined by IMG
        imgPfamHits = self.readImgTable(genomeId, pfamMarkers,
                                        '.pfam.tab.txt', 8)
        imgTigrHits = self.readImgTable(genomeId, tigrMarkers,
                                        '.tigrfam.tab.txt', 6)

        # get marker hits as determined by HMMs. readHmmTable() takes no
        # marker argument and indexes hits by sequence id, so the tables are
        # re-indexed by marker id before being looked up per marker.
        # NOTE(review): unlike compareResults(), PFAM hits are not filtered
        # by clan here -- confirm whether that is wanted.
        hmmPfamHits = self._hitsByCluster(
            self.readHmmTable(genomeId, '.pfam.table.six_frames.txt'))
        hmmTigrHits = self._hitsByCluster(
            self.readHmmTable(genomeId, '.tigr.table.six_frames.txt'))

        # compare results
        pfamTally = self._tallyHits(pfamMarkers, imgPfamHits, hmmPfamHits,
                                    False)
        self._reportTally('PFAM', 'diff', pfamTally, fout, False)

        tigrTally = self._tallyHits(tigrMarkers, imgTigrHits, hmmTigrHits,
                                    False)
        self._reportTally('TIGR', 'diff', tigrTally, fout, True)

    def run(self):
        """Evaluate the marker HMMs on every genome IMG lists as 'Finished'.

        Results are printed and written, line buffered, to
        ./data/evaluate_hmms_with_prodigal.txt.
        """
        img = IMG()

        with open('./data/evaluate_hmms_with_prodigal.txt', 'w', 1) as fout:
            # get list of all marker genes
            markerset = MarkerSet()
            pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

            # labels now match the sets they describe
            print('PFAM marker genes: ' + str(len(pfamMarkers)))
            print('TIGR marker genes: ' + str(len(tigrMarkers)))
            print('')

            # run HMMs on each of the finished genomes
            genomeIds = img.genomeIds('Finished')
            for genomeId in genomeIds:
                print(genomeId + ':')
                fout.write(genomeId + ':\n')

                self.runPFAM(genomeId)
                self.runTIGRFAM(genomeId)

                fout.write('  ORF results:\n')
                self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)

                # six frame evaluation is currently disabled:
                #self.translateSixFrames(genomeId)
                #self.runPFAM_SixFrames(genomeId)
                #self.runTIGRFAM_SixFrames(genomeId)

                #fout.write('  Six-frame translation results:\n')
                #self.compareSixFrameResults(genomeId, pfamMarkers, tigrMarkers, fout)
 def __init__(self):
     """Instantiate PFAM and keep it on the object as self.pfam."""
     self.pfam = PFAM()
class EvaluateHMMs(object):
    """Compare marker gene hits listed by IMG with hits found by hmmsearch.

    For each genome the PFAM and TIGRFAM marker HMMs are run with hmmsearch,
    the resulting tables are parsed, and per-marker hit counts are compared
    against the IMG annotation tables. Intermediate files are read from and
    written to ./hmm_test/.

    The statements in this class are written so they parse under both
    Python 2 and Python 3.
    """

    def __init__(self):
        """Instantiate the PFAM helper used to filter hits from the same clan."""
        self.pfam = PFAM()

    def translateSixFramesGenerator(self, bioseq, table=11):
        """Yield the six frame translations of a nucleotide sequence.

        Frames alternate between the forward strand and the reverse
        complement for offsets 0, 1 and 2.

        bioseq -- sequence object supporting reverse_complement(), slicing
                  and translate(table)
        table  -- translation table handed to translate() (default 11)
        """
        revseq = bioseq.reverse_complement()
        for offset in range(3):
            yield bioseq[offset:].translate(table)
            yield revseq[offset:].translate(table)

    def translateSixFrames(self, genomeId):
        """Write the six frame translation of every contig of a genome.

        Reads <IMG.genomeDir><genomeId>/<genomeId>.fna and writes
        ./hmm_test/<genomeId>.six_frames.fna. Each record id is the contig
        id followed by '_' and the frame number (1-6).
        """
        print('  Creating six frame translation.')
        outPath = './hmm_test/' + genomeId + '.six_frames.fna'
        contigPath = IMG.genomeDir + genomeId + '/' + genomeId + '.fna'

        # both handles are closed even if parsing or writing fails
        with open(outPath, 'w') as outFile, open(contigPath) as contigFile:
            for rec in SeqIO.parse(contigFile, 'fasta'):
                frames = self.translateSixFramesGenerator(rec.seq)
                for frameNumber, frame in enumerate(frames, 1):
                    SeqIO.write(SeqRecord(frame, description='', id=rec.id + "_" + str(frameNumber)), outFile, 'fasta')

    def _runHmmsearch(self, cutoff, tableFile, hmmFile, seqFile, outFile):
        """Run hmmsearch through the shell with the options shared by all runs.

        cutoff    -- threshold option, '--cut_ga' or '--cut_nc'
        tableFile -- destination of the --domtblout table
        hmmFile   -- HMM file to search with
        seqFile   -- sequence file to search
        outFile   -- file receiving hmmsearch's standard output
        """
        # NOTE(review): the command is assembled by concatenation and run by
        # the shell, so the paths must not contain shell metacharacters.
        os.system('hmmsearch --notextw --noali --cpu 0 ' + cutoff + ' --domtblout ' + tableFile + ' ' + hmmFile + ' ' + seqFile + ' > ' + outFile)

    def runPFAM(self, genomeId):
        """Search the genome's dereplicated genes with the PFAM marker HMMs."""
        print('  Running PFAM HMMs.')
        # NOTE(review): PFAM searches '.genes.derep.faa' while TIGRFAM
        # searches '.genes.faa' -- confirm the difference is intended.
        self._runHmmsearch('--cut_ga',
                           './hmm_test/' + genomeId + '.pfam.table.txt',
                           './hmm/pfam_markers.hmm',
                           IMG.genomeDir + genomeId + '/' + genomeId + '.genes.derep.faa',
                           './hmm_test/' + genomeId + '.pfam.tsv')

    def runTIGRFAM(self, genomeId):
        """Search the genome's genes with the TIGRFAM marker HMMs."""
        print('  Running TIGRFAM HMMs.')
        self._runHmmsearch('--cut_nc',
                           './hmm_test/' + genomeId + '.tigr.table.txt',
                           './hmm/tigr_markers.hmm',
                           IMG.genomeDir + genomeId + '/' + genomeId + '.genes.faa',
                           './hmm_test/' + genomeId + '.tigr.tsv')

    def runPFAM_SixFrames(self, genomeId):
        """Search the six frame translation with the PFAM marker HMMs."""
        print('  Running PFAM HMMs on six frame translation.')
        self._runHmmsearch('--cut_ga',
                           './hmm_test/' + genomeId + '.pfam.table.six_frames.txt',
                           './hmm/pfam_markers.hmm',
                           './hmm_test/' + genomeId + '.six_frames.fna',
                           './hmm_test/' + genomeId + '.pfam.six_frames.tsv')

    def runTIGRFAM_SixFrames(self, genomeId):
        """Search the six frame translation with the TIGRFAM marker HMMs."""
        print('  Running TIGRFAM HMMs on six frame translation.')
        self._runHmmsearch('--cut_nc',
                           './hmm_test/' + genomeId + '.tigr.table.six_frames.txt',
                           './hmm/tigr_markers.hmm',
                           './hmm_test/' + genomeId + '.six_frames.fna',
                           './hmm_test/' + genomeId + '.tigr.six_frames.tsv')

    def readImgTable(self, genomeId, markers, extension, clusterIdIndex):
        """Read marker hits from a tab-separated IMG annotation table.

        genomeId       -- genome whose table is read
        markers        -- collection of cluster ids of interest
        extension      -- file name suffix appended to the genome id
        clusterIdIndex -- column holding the cluster id

        Returns a dict mapping each cluster id found in markers to the set
        of first-column values of the rows it appears in.
        """
        hits = {}
        tablePath = IMG.genomeDir + genomeId + '/' + genomeId + extension
        with open(tablePath) as fin:
            next(fin, None)  # skip header line
            for line in fin:
                if not line.strip():
                    continue

                # drop the line terminator so a cluster id stored in the
                # last column still compares equal to the marker ids
                lineSplit = line.rstrip('\r\n').split('\t')
                clusterId = lineSplit[clusterIdIndex]

                if clusterId in markers:
                    hits.setdefault(clusterId, set()).add(lineSplit[0])

        return hits

    def readHmmTable(self, genomeId, extension):
        """Parse a hmmsearch --domtblout table from ./hmm_test/.

        PFAM accessions are rewritten from 'PFxxxxx.version' to 'pfamxxxxx';
        other accessions are kept as they are.

        Returns a dict mapping each target name (column 0) to a list of
        [clusterId, i-evalue, start, end] entries, one per table row.
        """
        hits = {}
        with open('./hmm_test/' + genomeId + extension) as fin:
            for line in fin:
                if line.startswith('#') or not line.strip():
                    continue

                lineSplit = line.split()

                clusterId = lineSplit[4]
                if 'PF' in clusterId:
                    clusterId = clusterId.replace('PF', 'pfam')
                    # only strip a version suffix when one is present;
                    # rfind() returns -1 otherwise and would eat a character
                    if '.' in clusterId:
                        clusterId = clusterId[0:clusterId.rfind('.')]

                geneId = lineSplit[0]
                evalue = float(lineSplit[12]) # i-evalue
                start = int(lineSplit[17])    # seq. start
                end = int(lineSplit[18])      # seq. end
                hits.setdefault(geneId, []).append([clusterId, evalue, start, end])

        return hits

    def _hitsByCluster(self, hmmHits):
        """Re-index readHmmTable() output as cluster id -> set of gene ids."""
        byCluster = {}
        for geneId, hits in hmmHits.items():
            for h in hits:
                byCluster.setdefault(h[0], set()).add(geneId)
        return byCluster

    def _tallyHits(self, markers, imgHits, hmmHits, bSymmetricDiff):
        """Summarise how IMG and HMM hits differ over a set of markers.

        imgHits and hmmHits map marker ids to collections of gene ids. With
        bSymmetricDiff the difference counts genes found by only one source;
        otherwise it is the absolute difference in hit counts.

        Returns (diff, totalImgHits, totalHmmHits, imgAdditions,
        hmmAdditions).
        """
        diff = 0
        totalImgHits = 0
        totalHmmHits = 0
        imgAdditions = 0
        hmmAdditions = 0
        for markerId in markers:
            imgGenes = imgHits.get(markerId, set())
            hmmGenes = hmmHits.get(markerId, set())

            surplus = len(imgGenes) - len(hmmGenes)
            if surplus > 0:
                imgAdditions += surplus
            elif surplus < 0:
                hmmAdditions += -surplus

            if bSymmetricDiff:
                diff += len(imgGenes.symmetric_difference(hmmGenes))
            else:
                diff += abs(surplus)

            totalImgHits += len(imgGenes)
            totalHmmHits += len(hmmGenes)

        return diff, totalImgHits, totalHmmHits, imgAdditions, hmmAdditions

    def _reportTally(self, family, consoleDiffLabel, tally, fout, bFinal):
        """Print a tally and write it to fout.

        The console line names the difference with consoleDiffLabel, while
        the line written to fout always uses 'diff'. When bFinal is set a
        blank line follows in both outputs.
        """
        fields = ', IMG hits, HMM hits, IMG additional, HMM additional): '
        counts = ', '.join([str(value) for value in tally])

        print('  ' + family + ' (' + consoleDiffLabel + fields + counts)
        fout.write('  ' + family + ' (diff' + fields + counts + '\n')
        if bFinal:
            print('')
            fout.write('\n')

    def compareResults(self, genomeId, pfamMarkers, tigrMarkers, fout):
        """Compare IMG marker hits with hmmsearch hits on the called genes.

        genomeId    -- genome to evaluate; runPFAM() and runTIGRFAM() must
                       already have produced its tables
        pfamMarkers -- PFAM marker ids
        tigrMarkers -- TIGRFAM marker ids
        fout        -- open, writable file receiving the summary lines
        """
        # get marker hits to genes as determined by IMG
        imgPfamHits = self.readImgTable(genomeId, pfamMarkers, '.pfam.tab.txt', 8)
        imgTigrHits = self.readImgTable(genomeId, tigrMarkers, '.tigrfam.tab.txt', 6)

        print('  PFAM IMG hits: ' + str(len(imgPfamHits)))
        # label corrected: this count is the TIGR hits reported by IMG
        print('  TIGR IMG hits: ' + str(len(imgTigrHits)))

        # get marker hits to genes as determined by HMMs
        hmmPfamHits = self.readHmmTable(genomeId, '.pfam.table.txt')
        hmmTigrHits = self.readHmmTable(genomeId, '.tigr.table.txt')

        print('  PFAM HMM hits: ' + str(len(hmmPfamHits)))
        print('  TIGR HMM hits: ' + str(len(hmmTigrHits)))

        # remove overlapping PFAM hits from the same clan
        print('  Filtering PFAM hits from the same clan.')
        filteredHmmPfamHits = self.pfam.filterHitsFromSameClan(hmmPfamHits, pfamMarkers)
        print('  Filtered PFAM hits: ' + str(len(filteredHmmPfamHits)))

        # reform TIGR hits so dictionary is indexed by TIGR ids
        reformedHmmTigrHits = self._hitsByCluster(hmmTigrHits)

        # compare results
        pfamTally = self._tallyHits(pfamMarkers, imgPfamHits, filteredHmmPfamHits, True)
        self._reportTally('PFAM', 'symmetric diff', pfamTally, fout, False)

        tigrTally = self._tallyHits(tigrMarkers, imgTigrHits, reformedHmmTigrHits, True)
        self._reportTally('TIGR', 'symmetric diff', tigrTally, fout, True)

    def compareSixFrameResults(self, genomeId, pfamMarkers, tigrMarkers, fout):
        """Compare IMG marker hits with hmmsearch hits on six frame translations.

        Only hit counts are compared, not the identity of the hit sequences.
        Parameters are the same as for compareResults().
        """
        # get marker hits to genes as determined by IMG
        imgPfamHits = self.readImgTable(genomeId, pfamMarkers, '.pfam.tab.txt', 8)
        imgTigrHits = self.readImgTable(genomeId, tigrMarkers, '.tigrfam.tab.txt', 6)

        # get marker hits as determined by HMMs. readHmmTable() takes no
        # marker argument and indexes hits by sequence id, so the tables are
        # re-indexed by marker id before being looked up per marker.
        # NOTE(review): unlike compareResults(), PFAM hits are not filtered
        # by clan here -- confirm whether that is wanted.
        hmmPfamHits = self._hitsByCluster(self.readHmmTable(genomeId, '.pfam.table.six_frames.txt'))
        hmmTigrHits = self._hitsByCluster(self.readHmmTable(genomeId, '.tigr.table.six_frames.txt'))

        # compare results
        pfamTally = self._tallyHits(pfamMarkers, imgPfamHits, hmmPfamHits, False)
        self._reportTally('PFAM', 'diff', pfamTally, fout, False)

        tigrTally = self._tallyHits(tigrMarkers, imgTigrHits, hmmTigrHits, False)
        self._reportTally('TIGR', 'diff', tigrTally, fout, True)

    def run(self):
        """Evaluate the marker HMMs on every genome IMG lists as 'Finished'.

        Results are printed and written, line buffered, to
        ./data/evaluate_hmms_with_prodigal.txt.
        """
        img = IMG()

        with open('./data/evaluate_hmms_with_prodigal.txt', 'w', 1) as fout:
            # get list of all marker genes
            markerset = MarkerSet()
            pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

            # labels now match the sets they describe
            print('PFAM marker genes: ' + str(len(pfamMarkers)))
            print('TIGR marker genes: ' + str(len(tigrMarkers)))
            print('')

            # run HMMs on each of the finished genomes
            genomeIds = img.genomeIds('Finished')
            for genomeId in genomeIds:
                print(genomeId + ':')
                fout.write(genomeId + ':\n')

                self.runPFAM(genomeId)
                self.runTIGRFAM(genomeId)

                fout.write('  ORF results:\n')
                self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)

                # six frame evaluation is currently disabled:
                #self.translateSixFrames(genomeId)
                #self.runPFAM_SixFrames(genomeId)
                #self.runTIGRFAM_SixFrames(genomeId)

                #fout.write('  Six-frame translation results:\n')
                #self.compareSixFrameResults(genomeId, pfamMarkers, tigrMarkers, fout)