def __init__(self):
    """Create the PFAM helper object and keep it on the instance as `pfam`."""
    self.pfam = PFAM()
class EvaluateHMMs(object):
    """Compare marker-gene hits on Prodigal and GeneMarkS gene calls with IMG.

    For every genome the gene callers and `hmmsearch` are run through the
    shell, the resulting HMM tables are parsed, and per-marker hit counts are
    compared against the counts read from the IMG annotation tables.
    """

    def __init__(self):
        # Used by compareResults() to filter PFAM hits from the same clan.
        self.pfam = PFAM()

    def runProdigal(self, genomeId):
        """Call genes on one genome with Prodigal.

        Input is IMG.genomeDir + genomeId + '/' + genomeId + '.fna'; the
        protein, nucleotide and GFF outputs go to ./prodigal_test/. The
        command runs through the shell and its exit status is not checked.
        """
        print(' Running Prodigal.')
        outPrefix = './prodigal_test/' + genomeId
        genomeFile = IMG.genomeDir + genomeId + '/' + genomeId + '.fna'
        os.system('time prodigal -m -c -f gff -g 11 -a ' + outPrefix
                  + '.genes.faa -d ' + outPrefix + '.genes.fna -i '
                  + genomeFile + ' > ' + outPrefix + '.prodigal.gff\n')

    def runGeneMark(self, genomeId):
        """Call genes on one genome with GeneMarkS.

        GeneMarkS writes into the current directory, so its outputs are moved
        into ./genemark_test/ afterwards (the gene files are renamed to the
        same '<genomeId>.genes.*' scheme used for Prodigal). Exit statuses of
        the shell commands are not checked.
        """
        print(' Running GeneMarkS.')
        genomeFile = IMG.genomeDir + genomeId + '/' + genomeId + '.fna'
        os.system('time ~/apps/genemark/gmsuite/gmsn.pl --prok --format GFF'
                  ' --gcode 11 --name ' + genomeId + ' --species ' + genomeId
                  + ' --faa --fnn ' + genomeFile)
        os.system('mv ' + genomeId + '.fna.faa ./genemark_test/'
                  + genomeId + '.genes.faa')
        os.system('mv ' + genomeId + '.fna.fnn ./genemark_test/'
                  + genomeId + '.genes.fna')
        # Sweep up every remaining GeneMarkS output for this genome.
        os.system('mv ' + genomeId + '* ./genemark_test/')

    def _runHmmSearch(self, genomeId, cutoffFlag, family):
        """Run hmmsearch with ./hmm/<family>_markers.hmm on both gene sets.

        `cutoffFlag` is the hmmsearch threshold option (e.g. '--cut_ga') and
        `family` the file-name tag ('pfam' or 'tigr'). GeneMark genes are
        searched first, then Prodigal genes.
        """
        for workDir in ('./genemark_test/', './prodigal_test/'):
            prefix = workDir + genomeId
            os.system('hmmsearch --notextw --noali --cpu 0 ' + cutoffFlag
                      + ' --domtblout ' + prefix + '.' + family
                      + '.table.txt ./hmm/' + family + '_markers.hmm '
                      + prefix + '.genes.faa > ' + prefix + '.' + family
                      + '.tsv\n')

    def runPFAM(self, genomeId):
        """Search the PFAM marker HMMs (--cut_ga) against both gene sets."""
        print(' Running PFAM HMMs.')
        self._runHmmSearch(genomeId, '--cut_ga', 'pfam')

    def runTIGRFAM(self, genomeId):
        """Search the TIGR marker HMMs (--cut_nc) against both gene sets."""
        print(' Running TIGR HMMs.')
        self._runHmmSearch(genomeId, '--cut_nc', 'tigr')

    def readImgTable(self, genomeId, markers, extension, clusterIdIndex):
        """Count distinct first-column ids per marker in an IMG table.

        Args:
            genomeId: genome whose table is read from
                IMG.genomeDir + genomeId + '/' + genomeId + extension.
            markers: container of cluster ids to keep (membership test only).
            extension: file-name suffix of the tab-separated table.
            clusterIdIndex: zero-based column holding the cluster id.

        Returns:
            dict mapping each marker id found to the number of distinct
            values seen in column 0 of its rows (presumably gene ids).
        """
        tablePath = IMG.genomeDir + genomeId + '/' + genomeId + extension

        genesPerCluster = {}
        with open(tablePath) as table:
            next(table, None)  # first line is a header
            for line in table:
                # Drop the line terminator so a cluster id in the last
                # column still compares equal to the marker ids.
                fields = line.rstrip('\r\n').split('\t')
                clusterId = fields[clusterIdIndex]
                if clusterId in markers:
                    genesPerCluster.setdefault(clusterId, set()).add(fields[0])

        return dict((clusterId, len(genes))
                    for clusterId, genes in genesPerCluster.items())

    def readHmmTable(self, resultsFile):
        """Parse a whitespace-delimited hmmsearch --domtblout table.

        Lines starting with '#' and blank lines are skipped. Accessions
        containing 'PF' are rewritten to the 'pfam' prefix and lose their
        '.version' suffix; other accessions are kept unchanged.

        Args:
            resultsFile: path of the table to read.

        Returns:
            dict mapping gene id (column 0) to a list of
            [clusterId, evalue, start, end] lists in file order.
        """
        hits = {}
        with open(resultsFile) as table:
            for line in table:
                if line.startswith('#') or not line.strip():
                    continue

                fields = line.split()

                clusterId = fields[4]
                if 'PF' in clusterId:
                    clusterId = clusterId.replace('PF', 'pfam')
                    # Only cut at a '.' that exists; slicing with the -1
                    # from a failed rfind() would eat the last character.
                    dot = clusterId.rfind('.')
                    if dot != -1:
                        clusterId = clusterId[:dot]

                geneId = fields[0]
                evalue = float(fields[12])  # i-evalue
                start = int(fields[17])     # seq. start
                end = int(fields[18])       # seq. end

                hits.setdefault(geneId, []).append(
                    [clusterId, evalue, start, end])

        return hits

    def _hmmHitCounts(self, genomeId, resultsDir, callerName, pfamMarkers):
        """Read one gene caller's HMM tables and count hits per marker.

        Returns a (pfamCounts, tigrCounts) pair of dicts keyed by marker id.
        PFAM hits are first passed through self.pfam.filterHitsFromSameClan;
        its result is assumed to map PFAM id -> collection of gene ids.
        """
        prefix = resultsDir + genomeId
        hmmPfamHits = self.readHmmTable(prefix + '.pfam.table.txt')
        hmmTigrHits = self.readHmmTable(prefix + '.tigr.table.txt')
        print(' ' + callerName + ' PFAM HMM hits: ' + str(len(hmmPfamHits)))
        print(' ' + callerName + ' TIGR HMM hits: ' + str(len(hmmTigrHits)))

        # remove overlapping PFAM hits from the same clan
        print(' Filtering ' + callerName + ' PFAM hits from the same clan.')
        filteredPfamHits = self.pfam.filterHitsFromSameClan(
            hmmPfamHits, pfamMarkers)
        print(' Filtered ' + callerName + ' PFAM hits: '
              + str(len(filteredPfamHits)))

        pfamCounts = dict((pfamId, len(geneIds))
                          for pfamId, geneIds in filteredPfamHits.items())

        # TIGR tables are keyed by gene; re-index the counts by TIGR id.
        tigrCounts = {}
        for geneHits in hmmTigrHits.values():
            for hit in geneHits:
                tigrCounts[hit[0]] = tigrCounts.get(hit[0], 0) + 1

        return pfamCounts, tigrCounts

    def _summaryLine(self, label, markers, imgHits, prodigalHits,
                     genemarkHits):
        """Build the report line comparing both gene callers with IMG.

        The diffs are sums over `markers` of the absolute difference between
        a caller's hit count and the IMG hit count; missing markers count 0.
        """
        prodigalDiff = 0
        genemarkDiff = 0
        totalImgHits = 0
        totalProdigalHits = 0
        totalGeneMarkHits = 0
        for markerId in markers:
            imgCount = imgHits.get(markerId, 0)
            prodigalCount = prodigalHits.get(markerId, 0)
            genemarkCount = genemarkHits.get(markerId, 0)

            prodigalDiff += abs(prodigalCount - imgCount)
            genemarkDiff += abs(genemarkCount - imgCount)
            totalImgHits += imgCount
            totalProdigalHits += prodigalCount
            totalGeneMarkHits += genemarkCount

        values = (prodigalDiff, genemarkDiff, totalImgHits,
                  totalProdigalHits, totalGeneMarkHits)
        return (' ' + label + ' (Prodigal diff, GeneMark diff, IMG hits,'
                ' Prodigal hits, GeneMark hits): '
                + ', '.join(str(v) for v in values))

    def compareResults(self, genomeId, pfamMarkers, tigrMarkers, fout):
        """Report how each gene caller's marker hits differ from IMG's.

        Args:
            genomeId: genome whose IMG tables and HMM results are compared.
            pfamMarkers: PFAM marker ids to evaluate.
            tigrMarkers: TIGR marker ids to evaluate.
            fout: object with a write() method receiving the summary lines
                (the same lines are also printed).
        """
        # get marker hits to genes as determined by IMG
        imgPfamHits = self.readImgTable(
            genomeId, pfamMarkers, '.pfam.tab.txt', 8)
        imgTigrHits = self.readImgTable(
            genomeId, tigrMarkers, '.tigrfam.tab.txt', 6)
        print(' PFAM IMG hits: ' + str(len(imgPfamHits)))
        print(' TIGR IMG hits: ' + str(len(imgTigrHits)))

        # get marker hits to genes as determined by HMMs
        prodigalPfamHits, prodigalTigrHits = self._hmmHitCounts(
            genomeId, './prodigal_test/', 'Prodigal', pfamMarkers)
        genemarkPfamHits, genemarkTigrHits = self._hmmHitCounts(
            genomeId, './genemark_test/', 'GeneMark', pfamMarkers)

        # compare results
        pfamSummary = self._summaryLine(
            'PFAM', pfamMarkers, imgPfamHits,
            prodigalPfamHits, genemarkPfamHits)
        print(pfamSummary)
        fout.write(pfamSummary + '\n')

        tigrSummary = self._summaryLine(
            'TIGR', tigrMarkers, imgTigrHits,
            prodigalTigrHits, genemarkTigrHits)
        print(tigrSummary)
        print('')
        fout.write(tigrSummary + '\n\n')

    def run(self):
        """Evaluate every 'Finished' IMG genome.

        Results are printed and also written, line-buffered, to
        ./data/evaluate_prodigal.txt, which is closed even if a genome fails.
        """
        img = IMG()

        with open('./data/evaluate_prodigal.txt', 'w', 1) as fout:
            # get list of all marker genes
            markerset = MarkerSet()
            pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

            print('PFAM marker genes: ' + str(len(pfamMarkers)))
            print('TIGR marker genes: ' + str(len(tigrMarkers)))
            print('')

            # run HMMs on each of the finished genomes
            for genomeId in img.genomeIds('Finished'):
                print(genomeId + ':')
                fout.write(genomeId + ':\n')

                self.runProdigal(genomeId)
                self.runGeneMark(genomeId)
                self.runPFAM(genomeId)
                self.runTIGRFAM(genomeId)

                self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)
class EvaluateHMMs(object):
    """Compare marker-gene hits on Prodigal and GeneMarkS gene calls with IMG.

    For every genome the gene callers and `hmmsearch` are run through the
    shell, the resulting HMM tables are parsed, and per-marker hit counts are
    compared against the counts read from the IMG annotation tables.

    Only constructs valid on both Python 2 and Python 3 are used
    (single-argument print calls, dict.items()).
    """

    def __init__(self):
        # Used by compareResults() to filter PFAM hits from the same clan.
        self.pfam = PFAM()

    def runProdigal(self, genomeId):
        """Call genes on one genome with Prodigal.

        Input is IMG.genomeDir + genomeId + '/' + genomeId + '.fna'; the
        protein, nucleotide and GFF outputs go to ./prodigal_test/. The
        command runs through the shell and its exit status is not checked.
        """
        print(' Running Prodigal.')
        outPrefix = './prodigal_test/' + genomeId
        genomeFile = IMG.genomeDir + genomeId + '/' + genomeId + '.fna'
        os.system('time prodigal -m -c -f gff -g 11 -a ' + outPrefix
                  + '.genes.faa -d ' + outPrefix + '.genes.fna -i '
                  + genomeFile + ' > ' + outPrefix + '.prodigal.gff\n')

    def runGeneMark(self, genomeId):
        """Call genes on one genome with GeneMarkS.

        GeneMarkS writes into the current directory, so its outputs are moved
        into ./genemark_test/ afterwards (the gene files are renamed to the
        same '<genomeId>.genes.*' scheme used for Prodigal). Exit statuses of
        the shell commands are not checked.
        """
        print(' Running GeneMarkS.')
        genomeFile = IMG.genomeDir + genomeId + '/' + genomeId + '.fna'
        os.system('time ~/apps/genemark/gmsuite/gmsn.pl --prok --format GFF'
                  ' --gcode 11 --name ' + genomeId + ' --species ' + genomeId
                  + ' --faa --fnn ' + genomeFile)
        os.system('mv ' + genomeId + '.fna.faa ./genemark_test/'
                  + genomeId + '.genes.faa')
        os.system('mv ' + genomeId + '.fna.fnn ./genemark_test/'
                  + genomeId + '.genes.fna')
        # Sweep up every remaining GeneMarkS output for this genome.
        os.system('mv ' + genomeId + '* ./genemark_test/')

    def _runHmmSearch(self, genomeId, cutoffFlag, family):
        """Run hmmsearch with ./hmm/<family>_markers.hmm on both gene sets.

        `cutoffFlag` is the hmmsearch threshold option (e.g. '--cut_ga') and
        `family` the file-name tag ('pfam' or 'tigr'). GeneMark genes are
        searched first, then Prodigal genes.
        """
        for workDir in ('./genemark_test/', './prodigal_test/'):
            prefix = workDir + genomeId
            os.system('hmmsearch --notextw --noali --cpu 0 ' + cutoffFlag
                      + ' --domtblout ' + prefix + '.' + family
                      + '.table.txt ./hmm/' + family + '_markers.hmm '
                      + prefix + '.genes.faa > ' + prefix + '.' + family
                      + '.tsv\n')

    def runPFAM(self, genomeId):
        """Search the PFAM marker HMMs (--cut_ga) against both gene sets."""
        print(' Running PFAM HMMs.')
        self._runHmmSearch(genomeId, '--cut_ga', 'pfam')

    def runTIGRFAM(self, genomeId):
        """Search the TIGR marker HMMs (--cut_nc) against both gene sets."""
        print(' Running TIGR HMMs.')
        self._runHmmSearch(genomeId, '--cut_nc', 'tigr')

    def readImgTable(self, genomeId, markers, extension, clusterIdIndex):
        """Count distinct first-column ids per marker in an IMG table.

        Args:
            genomeId: genome whose table is read from
                IMG.genomeDir + genomeId + '/' + genomeId + extension.
            markers: container of cluster ids to keep (membership test only).
            extension: file-name suffix of the tab-separated table.
            clusterIdIndex: zero-based column holding the cluster id.

        Returns:
            dict mapping each marker id found to the number of distinct
            values seen in column 0 of its rows (presumably gene ids).
        """
        tablePath = IMG.genomeDir + genomeId + '/' + genomeId + extension

        genesPerCluster = {}
        with open(tablePath) as table:
            next(table, None)  # first line is a header
            for line in table:
                # Drop the line terminator so a cluster id in the last
                # column still compares equal to the marker ids.
                fields = line.rstrip('\r\n').split('\t')
                clusterId = fields[clusterIdIndex]
                if clusterId in markers:
                    genesPerCluster.setdefault(clusterId, set()).add(fields[0])

        return dict((clusterId, len(genes))
                    for clusterId, genes in genesPerCluster.items())

    def readHmmTable(self, resultsFile):
        """Parse a whitespace-delimited hmmsearch --domtblout table.

        Lines starting with '#' and blank lines are skipped. Accessions
        containing 'PF' are rewritten to the 'pfam' prefix and lose their
        '.version' suffix; other accessions are kept unchanged.

        Args:
            resultsFile: path of the table to read.

        Returns:
            dict mapping gene id (column 0) to a list of
            [clusterId, evalue, start, end] lists in file order.
        """
        hits = {}
        with open(resultsFile) as table:
            for line in table:
                if line.startswith('#') or not line.strip():
                    continue

                fields = line.split()

                clusterId = fields[4]
                if 'PF' in clusterId:
                    clusterId = clusterId.replace('PF', 'pfam')
                    # Only cut at a '.' that exists; slicing with the -1
                    # from a failed rfind() would eat the last character.
                    dot = clusterId.rfind('.')
                    if dot != -1:
                        clusterId = clusterId[:dot]

                geneId = fields[0]
                evalue = float(fields[12])  # i-evalue
                start = int(fields[17])     # seq. start
                end = int(fields[18])       # seq. end

                hits.setdefault(geneId, []).append(
                    [clusterId, evalue, start, end])

        return hits

    def _hmmHitCounts(self, genomeId, resultsDir, callerName, pfamMarkers):
        """Read one gene caller's HMM tables and count hits per marker.

        Returns a (pfamCounts, tigrCounts) pair of dicts keyed by marker id.
        PFAM hits are first passed through self.pfam.filterHitsFromSameClan;
        its result is assumed to map PFAM id -> collection of gene ids.
        """
        prefix = resultsDir + genomeId
        hmmPfamHits = self.readHmmTable(prefix + '.pfam.table.txt')
        hmmTigrHits = self.readHmmTable(prefix + '.tigr.table.txt')
        print(' ' + callerName + ' PFAM HMM hits: ' + str(len(hmmPfamHits)))
        print(' ' + callerName + ' TIGR HMM hits: ' + str(len(hmmTigrHits)))

        # remove overlapping PFAM hits from the same clan
        print(' Filtering ' + callerName + ' PFAM hits from the same clan.')
        filteredPfamHits = self.pfam.filterHitsFromSameClan(
            hmmPfamHits, pfamMarkers)
        print(' Filtered ' + callerName + ' PFAM hits: '
              + str(len(filteredPfamHits)))

        pfamCounts = dict((pfamId, len(geneIds))
                          for pfamId, geneIds in filteredPfamHits.items())

        # TIGR tables are keyed by gene; re-index the counts by TIGR id.
        tigrCounts = {}
        for geneHits in hmmTigrHits.values():
            for hit in geneHits:
                tigrCounts[hit[0]] = tigrCounts.get(hit[0], 0) + 1

        return pfamCounts, tigrCounts

    def _summaryLine(self, label, markers, imgHits, prodigalHits,
                     genemarkHits):
        """Build the report line comparing both gene callers with IMG.

        The diffs are sums over `markers` of the absolute difference between
        a caller's hit count and the IMG hit count; missing markers count 0.
        """
        prodigalDiff = 0
        genemarkDiff = 0
        totalImgHits = 0
        totalProdigalHits = 0
        totalGeneMarkHits = 0
        for markerId in markers:
            imgCount = imgHits.get(markerId, 0)
            prodigalCount = prodigalHits.get(markerId, 0)
            genemarkCount = genemarkHits.get(markerId, 0)

            prodigalDiff += abs(prodigalCount - imgCount)
            genemarkDiff += abs(genemarkCount - imgCount)
            totalImgHits += imgCount
            totalProdigalHits += prodigalCount
            totalGeneMarkHits += genemarkCount

        values = (prodigalDiff, genemarkDiff, totalImgHits,
                  totalProdigalHits, totalGeneMarkHits)
        return (' ' + label + ' (Prodigal diff, GeneMark diff, IMG hits,'
                ' Prodigal hits, GeneMark hits): '
                + ', '.join(str(v) for v in values))

    def compareResults(self, genomeId, pfamMarkers, tigrMarkers, fout):
        """Report how each gene caller's marker hits differ from IMG's.

        Args:
            genomeId: genome whose IMG tables and HMM results are compared.
            pfamMarkers: PFAM marker ids to evaluate.
            tigrMarkers: TIGR marker ids to evaluate.
            fout: object with a write() method receiving the summary lines
                (the same lines are also printed).
        """
        # get marker hits to genes as determined by IMG
        imgPfamHits = self.readImgTable(
            genomeId, pfamMarkers, '.pfam.tab.txt', 8)
        imgTigrHits = self.readImgTable(
            genomeId, tigrMarkers, '.tigrfam.tab.txt', 6)
        print(' PFAM IMG hits: ' + str(len(imgPfamHits)))
        print(' TIGR IMG hits: ' + str(len(imgTigrHits)))

        # get marker hits to genes as determined by HMMs
        prodigalPfamHits, prodigalTigrHits = self._hmmHitCounts(
            genomeId, './prodigal_test/', 'Prodigal', pfamMarkers)
        genemarkPfamHits, genemarkTigrHits = self._hmmHitCounts(
            genomeId, './genemark_test/', 'GeneMark', pfamMarkers)

        # compare results
        pfamSummary = self._summaryLine(
            'PFAM', pfamMarkers, imgPfamHits,
            prodigalPfamHits, genemarkPfamHits)
        print(pfamSummary)
        fout.write(pfamSummary + '\n')

        tigrSummary = self._summaryLine(
            'TIGR', tigrMarkers, imgTigrHits,
            prodigalTigrHits, genemarkTigrHits)
        print(tigrSummary)
        print('')
        fout.write(tigrSummary + '\n\n')

    def run(self):
        """Evaluate every 'Finished' IMG genome.

        Results are printed and also written, line-buffered, to
        ./data/evaluate_prodigal.txt, which is closed even if a genome fails.
        """
        img = IMG()

        with open('./data/evaluate_prodigal.txt', 'w', 1) as fout:
            # get list of all marker genes
            markerset = MarkerSet()
            pfamMarkers, tigrMarkers = markerset.getCalculatedMarkerGenes()

            print('PFAM marker genes: ' + str(len(pfamMarkers)))
            print('TIGR marker genes: ' + str(len(tigrMarkers)))
            print('')

            # run HMMs on each of the finished genomes
            for genomeId in img.genomeIds('Finished'):
                print(genomeId + ':')
                fout.write(genomeId + ':\n')

                self.runProdigal(genomeId)
                self.runGeneMark(genomeId)
                self.runPFAM(genomeId)
                self.runTIGRFAM(genomeId)

                self.compareResults(genomeId, pfamMarkers, tigrMarkers, fout)