def toWellMappedContigs(inFastaFile, inTaxonomyWFile, outFastaFile,
                        outFastaMisAssembledFile, outTaxonomyFile,
                        weightThreshold=0.99):
    """
    Creates the fasta and mapping files that contain well assembled contigs
    (filters out misassembled contigs).

    A contig is taken as well assembled if its weight is at least weightThreshold.
    Only the part of a fasta header before the first space is used as the sequence id
    for the lookups and in the output files; the whole header is passed to getCoverage.

    @param inFastaFile: input fasta file with contigs
    @param inTaxonomyWFile: input file that contains taxonomy with weights (seqId, weight, taxonId)
    @param outFastaFile: fasta file containing well assembled sequences
    @param outFastaMisAssembledFile: fasta file containing misassembled contigs
    @param outTaxonomyFile: resulting taxonomy of the well assembled sequences (seqId, taxonId)
    @param weightThreshold: only contigs the weight of which is at least this value will be taken

    @return: statistics (one line of text); a ratio whose denominator is zero
        (empty input, or no contig passed the threshold) is reported as 0.0
    """
    def _ratio(numerator, denominator):
        # avoid a ZeroDivisionError for empty input or when nothing passes the threshold
        if denominator:
            return numerator / denominator
        return 0.0

    seqIdToTaxonId = csv.predToDict(inTaxonomyWFile)
    seqIdToWeight = csv.getMapping(inTaxonomyWFile, 0, 1, '\t')
    outFastaOk = csv.OutFileBuffer(outFastaFile)
    outFastaMis = csv.OutFileBuffer(outFastaMisAssembledFile)
    outTaxonomyOk = csv.OutFileBuffer(outTaxonomyFile)

    # counters are floats so that the divisions below are never integer divisions
    totalBp = 0.0
    totalCount = 0.0
    okBp = 0.0
    okCount = 0.0
    avgSumBp = 0.0  # sum of (coverage * bp) over the accepted contigs

    for seqId, seq in fas.fastaFileToDictWholeNames(inFastaFile).iteritems():
        bp = len(seq)
        totalBp += bp
        totalCount += 1
        seqIdPrefix = str(seqId).split(' ')[0]
        # the mapping stores a list of values per key, the weight is its first entry
        weight = seqIdToWeight[seqIdPrefix][0]
        fastaEntry = '>' + str(seqIdPrefix) + '\n' + str(seq) + '\n'
        if float(weight) >= weightThreshold:
            outFastaOk.writeText(fastaEntry)
            outTaxonomyOk.writeText(
                str(seqIdPrefix) + '\t' + str(seqIdToTaxonId[seqIdPrefix]) + '\n')
            okBp += bp
            okCount += 1
            avgSumBp += getCoverage(seqId) * bp
        else:
            outFastaMis.writeText(fastaEntry)

    outFastaOk.close()
    outFastaMis.close()
    outTaxonomyOk.close()

    return 'Taken: %s/%sMB, %s/%sseq, %s%% bp %s%% seq, avg coverage %s' % (
        round(okBp / 1000000, 2),
        round(totalBp / 1000000, 2),
        okCount,
        totalCount,
        round(_ratio(okBp, totalBp) * 100, 2),
        round(_ratio(okCount, totalCount) * 100, 2),
        round(_ratio(avgSumBp, okBp), 3))
def removeEntries(mg):
    """
    Removes sequences from the marker gene files at the level of species, genus, family etc.

    Works on hard-coded paths. First, the '.tax' file of the marker gene is copied without the
    entries whose second column (';' separated taxon ids) shares at least one taxon id with the
    removal list. Second, the '.noalign.fna' file is copied without the sequences whose ids were
    removed in the first step. The number of removed entries is printed after each step.

    @param mg: marker gene name, used as the prefix of the processed file names
    """
    removeListPath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids_species.txt'

    # --- step 1: taxonomy file ---
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/'
                      + mg + '_bact+arch_dnaV.tax')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/'
                      + mg + '_bact+arch_dnaV.tax')
    out = csv.OutFileBuffer(dstFilePath)

    # taxon ids to be removed (empty entries are ignored)
    removeEntriesRaw = set(csv.getColumnAsList(removeListPath, colNum=0, comment='#'))
    removeSetInt = set(int(entry) for entry in removeEntriesRaw if entry != '')

    seqIdColumn = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    taxPathColumn = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')

    removeSetIds = set()  # ids of the entries that were dropped
    removed = 0
    for entryId, taxPath in zip(seqIdColumn, taxPathColumn):
        lineSetInt = set(int(item) for item in taxPath.split(';') if item != '')
        if removeSetInt.intersection(lineSetInt):  # the intersection is not empty
            removed += 1
            removeSetIds.add(entryId)
            continue
        out.writeText(str(entryId + '\t' + taxPath + '\n'))
    out.close()
    print('%s removedEntries %s' % (mg, removed))

    # --- step 2: sequence file ---
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/'
                      + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/'
                      + mg + '_bact+arch_dnaV.noalign.fna')
    out = csv.OutFileBuffer(dstFilePath)
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)

    removed = 0
    for seqId in seqIdToSeq:
        if seqId in removeSetIds:
            removed += 1
            continue
        out.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
    out.close()
    print('%s removedSeq %s' % (mg, removed))
def _getLabelsCreateFasta():
    """
    To process the original mercier dataset with 59 strains.

    Takes only the contigs that were mapped to the reference genomes (i.e. that have an entry in
    the blast labels file) and outputs a fasta file and a mapping file (seqId -> ncbi taxon id).
    All paths are hard-coded.

    :rtype : None
    """
    dataDirV35Fasta = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000.txt'
    nameLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000_blast_labels.txt'
    genomeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_list2.txt'
    outFastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna'
    outLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'

    # input contigs
    seqIdToSeq = fas.fastaFileToDict(dataDirV35Fasta)
    # contigs mapped to genome names
    seqIdToNameLabels = csv.getMapping(nameLabelsFilePath, 0, 1, sep='\t', comment='#')
    # mapping: genome name -> taxon id
    nameLabelToNcbid = csv.getMapping(genomeListFilePath, 0, 2, sep=';', comment='#')

    # to store the mapped sequences and their taxonomic mapping
    outFasta = csv.OutFileBuffer(outFastaFilePath)
    outLabels = csv.OutFileBuffer(outLabelsFilePath)

    # only the contigs that have a label are considered
    mappedSeqIds = [seqId for seqId in seqIdToSeq if seqId in seqIdToNameLabels]

    for seqId in mappedSeqIds:
        outFasta.writeText(str('>' + str(seqId) + '\n' + seqIdToSeq[seqId] + '\n'))
    outFasta.close()
    print('fasta created')

    for seqId in mappedSeqIds:
        # both mappings store lists of values, the first entry is taken
        nameLabel = seqIdToNameLabels[seqId][0]
        ncbid = nameLabelToNcbid[nameLabel][0]
        outLabels.writeText(str(str(seqId) + '\t' + str(ncbid) + '\n'))
    outLabels.close()
    print('labels created')
def toContigsLabels(inMapFile, outMapFile):
    """
    Creates the label of contigs from the labels of reads.

    Each input line has the form "contigId <tab> taxonId,taxonId,...". For each contig, the most
    frequent taxon id is taken and its weight is the fraction of the reads that carry this
    taxon id (rounded to three decimal places).

    @param inMapFile: maps contigId to a list of read taxonIds
    @param outMapFile: maps contigId to weight and the most prevalent taxonId
    """
    out = csv.OutFileBuffer(outMapFile)
    for line in csv.getColumnAsList(inMapFile, sep='\n'):
        contigId, taxonIds = str(line).split('\t')

        # count the occurrences of the individual taxon ids
        idToCount = {}
        totalCount = 0.0
        for taxonId in map(int, str(taxonIds).split(',')):
            totalCount += 1
            idToCount[taxonId] = idToCount.get(taxonId, 0) + 1

        # the most prevalent taxon id comes first
        ranked = sorted(idToCount.iteritems(), key=lambda pair: pair[1], reverse=True)
        topTaxonId, topCount = ranked[0]
        weight = round(float(topCount) / totalCount, 3)

        out.writeText('\t'.join([str(contigId), str(weight), str(topTaxonId)]) + '\n')
    out.close()
def scafToContigOutput(scaffContigMapFile, scaffPPSOutFile, contigPPSOutFile):
    """
    Takes a scaffold-contigs mapping and scaffold placements (.out file) and outputs the
    placements of the contigs (.out file). Each contig gets the taxon id of its scaffold;
    contigs of a scaffold that has no prediction get taxon id 1.

    @param scaffContigMapFile: tab separated scaffold-contigs mapping (scaffoldName \t contigName)
    @param scaffPPSOutFile: scaffold predictions (PPS output file)
    @param contigPPSOutFile: contigs predictions (as if it was a PPS output file)
    """
    out = csv.OutFileBuffer(contigPPSOutFile)

    # scaffold predictions and the mapping: scaffName -> list of contig names
    scaffNameToTaxonId = csv.predToDict(scaffPPSOutFile)
    scaffNameToContigNameList = csv.getMapping(scaffContigMapFile, 0, 1, sep='\t')

    for scaffName, contigNameList in scaffNameToContigNameList.iteritems():
        taxonId = scaffNameToTaxonId.get(scaffName, None)
        if taxonId is None:
            taxonId = 1  # no prediction for this scaffold
        lineSuffix = '\t' + str(taxonId) + '\n'
        for contigName in contigNameList:
            out.writeText(contigName + lineSuffix)
    out.close()
def removeLines(mg):
    """
    Copies the '.tax' file of a marker gene without the lines whose first column ends with
    'ncbid:<number>' where the number is contained in the removal list. Works on hard-coded
    paths and prints the number of removed lines.

    @param mg: marker gene name, used as the prefix of the processed file names
    """
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/'
                      + mg + '_bact+arch_dnaV.tax')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/'
                      + mg + '_bact+arch_dnaV.tax')
    # Alternative (Silva) setting that was used with the same code:
    #   removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    #   srcFilePath = '/net/metagenomics/projects/PPSmg/data/silva/' + mg
    #                 + '_silva106_ncbitax.bacteria+archaea.tax'
    #   dstFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg
    #                 + '_silva106_ncbitax.bacteria+archaea.tax'
    #   pattern = r'^([^\-]+)\-.*$'

    # extracts the ncbi taxon id from the end of an entry name
    # (an entry that does not match is compared unchanged)
    ncbidRegex = re.compile(r'.*ncbid:([0-9]+)$')

    removeSet = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    nameColumn = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    pathColumn = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    out = csv.OutFileBuffer(dstFilePath)

    removed = 0
    for entryName, taxPath in zip(nameColumn, pathColumn):
        if ncbidRegex.sub(r'\1', entryName) in removeSet:
            removed += 1
            continue
        out.writeText(str(entryName + '\t' + taxPath + '\n'))
    out.close()
    print('%s removeLines %s' % (mg, removed))
def findPlasmids(outPlasmidFilePath):
    """
    Reads sequence records in the GenBank format from the standard input and outputs the
    sequence ids (record.id) of the records whose description contains "plasmid".
    The ids are appended if the output file already exists. A summary line is printed.

    Plasmids can be also within the sequences!

    @param outPlasmidFilePath: file to which the ids of the plasmid records are written
    """
    # append to the file if it already exists
    outFileMode = 'a' if os.path.isfile(outPlasmidFilePath) else 'w'
    outBuffer = csv.OutFileBuffer(outPlasmidFilePath, bufferText=False,
                                  fileOpenMode=outFileMode)

    recordCount = 0
    plasmidCount = 0
    for record in SeqIO.parse(sys.stdin, "genbank"):
        recordCount += 1
        if record.description.find('plasmid') == -1:
            continue  # not a plasmid according to the description
        outBuffer.writeText(str(str(record.id) + '\n'))
        plasmidCount += 1
    outBuffer.close()

    print('file, records, plasmids: %s %s %s'
          % (outPlasmidFilePath, recordCount, plasmidCount))
def removeSequences(mg):
    """
    Copies the '.noalign.fna' file of a marker gene without the sequences whose id ends with
    'ncbid:<number>' where the number is contained in the removal list. Works on hard-coded
    paths and prints the number of removed sequences.

    @param mg: marker gene name, used as the prefix of the processed file names
    """
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/'
                      + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/'
                      + mg + '_bact+arch_dnaV.noalign.fna')
    # Alternative (Silva) setting that was used with the same code:
    #   removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    #   srcFilePath = '/net/metagenomics/projects/PPSmg/data/silva/' + mg
    #                 + '_silva106_ncbitax.bacteria+archaea.fna'
    #   dstFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg
    #                 + '_silva106_ncbitax.bacteria+archaea.fna'
    #   pattern = r'^([^\-]+)\-.*$'

    # extracts the ncbi taxon id from the end of a sequence id
    # (an id that does not match is compared unchanged)
    ncbidRegex = re.compile(r'.*ncbid:([0-9]+)$')

    removeSet = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    out = csv.OutFileBuffer(dstFilePath)

    removed = 0
    for seqId in seqIdToSeq:
        if ncbidRegex.sub(r'\1', str(seqId)) in removeSet:
            removed += 1
            continue
        out.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
    out.close()
    print('%s removeSequences %s' % (mg, removed))
def generateCladesForGeneralModel(refSeqDir, taxonomyDatabaseFile, rank, minTotalCount,
                                  minBpPerSpeciesCount, generalModelMaxClades,
                                  taxonIdListFile):
    """
    Generates the list of clades (file) to model for the general model.

    The clades returned by refToClades are examined in the returned order and those, for which
    there is sufficient reference data (isRefSufficient), are taken until the maximum number of
    clades is reached.

    @param refSeqDir: directory with reference data as needed for PPS
    @param taxonomyDatabaseFile: taxonomy file in the sqlite3 format
    @param rank: the clades will be considered at this rank
    @param minTotalCount: (see config)
    @param minBpPerSpeciesCount: (see config)
    @param generalModelMaxClades: maximum length of the list of the clades.
    @param taxonIdListFile: file to which the ncbi taxon ids will be stored

    @return: the number of the ncbi taxon ids stored in the file
    """
    cladeBpPairList = refToClades(refSeqDir, taxonomyDatabaseFile, rank)
    rs = ref_seq.RefSequences(refSeqDir, taxonomyDatabaseFile)

    cladeList = []
    for clade, _ in cladeBpPairList:
        cladeId = int(clade)
        if rs.isRefSufficient(cladeId, minTotalCount, minBpPerSpeciesCount):
            cladeList.append(cladeId)
        # the limit is tested after each examined clade
        if len(cladeList) >= generalModelMaxClades:
            break

    out = csv.OutFileBuffer(taxonIdListFile)
    for cladeId in cladeList:
        out.writeText(str(cladeId) + '\n')
    out.close()
    rs.close()
    return len(cladeList)
def ppsOut2ppOut(inFile, outFile, taxonomicRanks, databaseFile):
    """
    Transforms a PPS output file into a file in the PP format.

    The output contains two header lines and then, for each sequence, a tab separated line:
    the sequence name, 'root' (empty if the taxon id has no path to the root) and, for each
    rank, the name of the taxon at this rank (empty if it is not defined or is only a copy).

    @param inFile: input file in the PPS format (first column: seq name, last column: ncbi taxon id)
    @param outFile: output file in the PP format
    @param taxonomicRanks: taxonomic ranks (starting from superkingdom)
    @param databaseFile: database file in the sqlite3 format
    """
    taxonomy = Taxonomy(databaseFile, taxonomicRanks)
    outBuff = csv.OutFileBuffer(outFile)

    def readColumn(colNum):
        return csv.getColumnAsList(inFile, entryModifyFunction=None, colNum=colNum,
                                   sep='\t', comment='#')

    namesList = readColumn(0)
    ncbidsList = readColumn(1)

    # Find the last complete column, i.e. the last column that has as many entries as
    # the first one (this is not efficient, the file is read once for each column!)
    valCol = 2
    while True:
        tmpList = readColumn(valCol)
        if len(tmpList) != len(namesList):
            break
        ncbidsList = tmpList
        valCol += 1

    header = str('#PPS file transformed to PP format, input file: ' + str(inFile)
                 + '\n#ID' + '\t' + 'root')
    header += ''.join([str('\t' + rank) for rank in taxonomicRanks])
    outBuff.writeText(str(header + '\n'))

    for i, name in enumerate(namesList):
        taxPathDict = taxonomy.getPathToRoot(int(ncbidsList[i]))
        fields = [str(name)]
        if taxPathDict is None:
            fields.append('')
        else:
            fields.append('root')
        for rank in taxonomicRanks:
            isDefined = ((taxPathDict is not None) and (rank in taxPathDict)
                         and (not taxPathDict[rank].isCopy()))
            if isDefined:
                fields.append(taxPathDict[rank].name)
            else:
                fields.append('')
        outBuff.writeText(str('\t'.join(fields) + '\n'))

    outBuff.close()
    taxonomy.close()
def genomesToMask():
    """
    For each ncbi taxon id of the input list, finds its ancestor at the masked rank (genus),
    stores the set of these ancestors to one file and the taxon ids returned by
    collectChildren for each of them to another file. Works on hard-coded paths.
    """
    rank = 'genus'  # which rank will be masked

    fileName = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/contigs_genus_ncbids.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_genus_masked.txt'
    outFile2 = '/Users/ivan/Documents/work/binning/data/simMC/fromJohannes/genome_ncbids_genus.txt'
    # Alternative setting (species level, V35 dataset):
    #   outFile = '/Users/ivan/Documents/work/binning/data/V35/genome_species_masked.txt'
    #   outFile2 = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids_species.txt'
    #   fileName = '/Users/ivan/Documents/work/binning/data/V35/genome_ncbids.txt'
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  # DB

    out = csv.OutFileBuffer(outFile)
    out2 = csv.OutFileBuffer(outFile2)

    genomeNcbids = csv.getColumnAsList(fileName, entryModifyFunction=None, colNum=0,
                                       sep=None, comment='#')
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)

    # go up in the taxonomy until the masked rank (or the root) is reached
    maskNcbids = []
    for ncbid in genomeNcbids:
        while taxonomy.getRank(ncbid) != rank:
            ncbid = taxonomy.getParentNcbid(ncbid)
            if int(ncbid) == 1:
                print('root reached!')
                break
        maskNcbids.append(int(ncbid))

    maskSet = set(maskNcbids)
    for maskNcbid in maskSet:
        out2.writeText(str(str(maskNcbid) + '\n'))

    for maskNcbid in maskSet:
        childrenNcbids = collectChildren(taxonomy, maskNcbid)
        for childNcbid in childrenNcbids:
            out.writeText(str(str(childNcbid) + '\n'))
        print('%s %s' % (maskNcbid, childrenNcbids))

    out.close()
    out2.close()
    taxonomy.close()
def toContigsLabelList(inFastaFileName, readsF, readsR, readOnContig, community,
                       outMappingFileName):
    """
    Gets mapping from contigIds to lists of taxonIds of individual reads of the contigs.

    The taxon ids of the forward and reverse reads are compared and differences are reported
    (printed). The output contains one line per contig: "contigId <tab> taxonId,taxonId,...".
    Progress marks 's1' .. 's4' are printed.

    @param inFastaFileName: fasta file with the contigs
    @param readsF: file with the forward reads (passed to getReadsTaxonIdList)
    @param readsR: file with the reverse reads (passed to getReadsTaxonIdList)
    @param readOnContig: tab separated file, the mapping contigId (column 1) -> readIds
        (column 0) is read from it
        NOTE(review): comment='r' presumably skips a header line that starts with 'r' - confirm
    @param community: community file (passed to getReadsTaxonIdList)
    @param outMappingFileName: output file
    """
    # contigIds
    contigIdToBp = fas.getSequenceToBpDict(inFastaFileName)

    # map: contigId -> list of readIds
    contigIdToReadList = csv.getMapping(readOnContig, 1, 0, sep='\t', comment='r')

    # taxonIds as a list for reads
    readFTaxonIdList = getReadsTaxonIdList(readsF, community)
    print('s1')
    readRTaxonIdList = getReadsTaxonIdList(readsR, community)
    print('s2')

    if len(readFTaxonIdList) != len(readRTaxonIdList):
        print('toContigsLabels: different number of reads in the reads files, exit')
        return

    # consistency check of the forward and reverse reads (the entry at index 0 is skipped)
    for i in range(1, len(readFTaxonIdList)):
        taxonIdF = readFTaxonIdList[i]
        taxonIdR = readRTaxonIdList[i]
        if taxonIdF != taxonIdR:
            print('toContigsLabels: at index %s different taxon ids %s and %s'
                  % (i, taxonIdF, taxonIdR))
        if taxonIdF is None or taxonIdR is None:
            print('toContigsLabels: at index %s, one is None %s or %s'
                  % (i, taxonIdF, taxonIdR))
    print('s3')

    out = csv.OutFileBuffer(outMappingFileName)
    for contigId in contigIdToBp:
        try:
            taxonIdList = [readFTaxonIdList[int(readId)]
                           for readId in contigIdToReadList[contigId]]
            out.writeText(str(contigId) + '\t' + ','.join(map(str, taxonIdList)) + '\n')
        except KeyError:
            print("No label for contigId: %s" % contigId)
    out.close()
    print('s4')
def concatenate(directory, outputFile):
    """
    Concatenates the sequences of each fasta file of a directory into one fasta entry.

    For each file, a header line is written that consists of the file name up to its first dot,
    and then each sequence of the file followed by 200 'N' characters on a separate line.

    @param directory: directory containing the fasta files
    @param outputFile: output file
    """
    spacer = 200 * 'N'
    out = csv.OutFileBuffer(outputFile)
    for fileName in os.listdir(directory):
        entryName = fileName.split('.')[0]
        seqIdToSeq = fasta.fastaFileToDict(os.path.join(directory, fileName))
        out.writeText('>' + str(entryName) + '\n')
        for _, seq in seqIdToSeq.iteritems():
            out.writeText(str(seq) + spacer + '\n')
    out.close()
def filterOutContigs(inFastaFile, inTaxFile, outFastaFile, outTaxFile, notAllowedTaxonIdList):
    """
    Copies the contigs and their taxonomic labels, the contigs labeled with a taxon id that is
    not allowed are left out. The number of the filtered out contigs per taxon id is printed.

    @param inFastaFile: input fasta file with contigs
    @param inTaxFile: input file with the taxonomic labels of the contigs (read via predToDict)
    @param outFastaFile: output fasta file
    @param outTaxFile: output file with the labels (seqId <tab> taxonId)
    @param notAllowedTaxonIdList: taxon ids of the contigs that will be filtered out
    """
    outFasta = csv.OutFileBuffer(outFastaFile)
    outTax = csv.OutFileBuffer(outTaxFile)
    seqIdToTaxonId = csv.predToDict(inTaxFile)

    notAllowedTaxonIdSet = set(notAllowedTaxonIdList)
    # map: not allowed taxonId -> number of the filtered out sequences
    taxonIdToFilteredSeq = {taxonId: 0 for taxonId in notAllowedTaxonIdSet}

    for seqId, seq in fas.fastaFileToDict(inFastaFile).iteritems():
        taxonId = int(seqIdToTaxonId[seqId])
        if taxonId in notAllowedTaxonIdSet:
            taxonIdToFilteredSeq[taxonId] += 1
            continue
        outFasta.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
        outTax.writeText(str(seqId) + '\t' + str(taxonId) + '\n')

    outFasta.close()
    outTax.close()
    print("filtered taxonId -> seqCount: " + str(taxonIdToFilteredSeq))
def filterOutReads():
    """
    Copies a fasta file without the sequences whose id prefix (the part before the first
    underscore) is in the set of not allowed prefixes.

    NOTE(review): the input and output paths are empty strings here, they presumably have to
    be filled in before the function is used - confirm.
    """
    inFasta = ''
    outFasta = ''
    out = csv.OutFileBuffer(outFasta)

    notAllowedSet = set(['BA000019.2'])  # Nostoc sp. PCC 7120
    prefixRegex = re.compile(r'([^_]+)_.*')

    for seqId, seq in fas.fastaFileToDict(inFasta).iteritems():
        if prefixRegex.sub(r'\1', seqId) in notAllowedSet:
            continue
        out.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
    out.close()
def ppOut2PPSout():
    """
    Transforms a file in the PP format to a file in the PPS output format. Each line of the
    input file is processed by a PP2PPSoutParser. Works on hard-coded paths.

    NOTE(review): the taxonomy object is not closed in this function - confirm whether the
    parser releases it, or whether a taxonomy.close() is missing here.
    """
    dbFile = '/Users/ivan/Documents/work/binning/taxonomy/20120828/ncbitax_sqlite.db'  # DB
    inFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.txt'
    outFile = '/Users/ivan/Documents/work/binning/data/HumanGut/PP/TS29_scaff.file.0.5.PPS.txt'

    taxonomy = taxonomy_ncbi.TaxonomyNcbi(dbFile)
    out = csv.OutFileBuffer(outFile)

    lineParser = PP2PPSoutParser(taxonomy, out)
    csv.forEachLine(inFile, lineParser)

    out.close()
def toLongSeq(inFastaFileName, outFastaFileName, minLength=1000):
    """
    Creates a fasta file that contains only the sequences that are at least minLength long.
    The whole sequence names (headers) are preserved.

    @param inFastaFileName: input fasta file
    @param outFastaFileName: output fasta file
    @param minLength: minimum length of a sequence that is taken
    """
    out = csv.OutFileBuffer(outFastaFileName)
    seqNameToSeq = fas.fastaFileToDictWholeNames(inFastaFileName)
    for seqName, seq in seqNameToSeq.iteritems():
        if len(seq) < minLength:
            continue  # too short
        out.writeText('>' + str(seqName) + '\n' + str(seq) + '\n')
    out.close()
def samToMap(samFile, accToNcbiFile, outMapFile):
    """
    Maps contigs to ncbi taxon ids via the accessions that are assigned to the contigs in a
    sam file. A contig whose accession has no taxon id is reported (printed) and skipped.

    @param samFile: sam file from an assembler
    @param accToNcbiFile: mapping: accessions -> ncbi taxon ids
    @param outMapFile: output file or directory
    """
    accToNcbi = csv.getMapping(accToNcbiFile, 0, 1, sep='\t')
    contigToAcc = parseSam(samFile)
    out = csv.OutFileBuffer(outMapFile)

    for contigId, acc in contigToAcc.iteritems():
        taxonIdList = accToNcbi.get(acc, None)
        if taxonIdList is not None:
            # the mapping stores a list of values, the first one is taken
            out.writeText(contigId + '\t' + taxonIdList[0] + '\n')
        else:
            print("No mapping for %s %s" % (contigId, acc))
    out.close()
def refToClades(refDir, taxonomyFile, rank='species', outFile=None):
    """
    Returns (stores) a list of all clades (at the given rank) sorted according to the abundance
    of the individual clades (in the descending order). Abundance in respect to the size of the
    reference data available, i.e. the sum of the sizes of the files that belong to a clade.

    The ncbi taxon id of a file is the part of the file name before its second last dot.

    @param refDir: directory containing reference data (as needed for PPS)
    @param taxonomyFile: ncbi taxonomy in the sqlite3 format
    @param rank: consider clades at this rank
    @param outFile: tab sep file, first column taxon id, second column number of bp (can be None)

    @return: list of tuples (clade, bp)
    """
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyFile)
    cladeNcbiToBp = {}
    for fileName in os.listdir(refDir):
        size = os.path.getsize(os.path.join(refDir, fileName))
        ncbid = int(fileName.rsplit('.', 2)[0])

        # go up in the taxonomy until the given rank is reached
        clade = ncbid
        while (clade is not None) and (taxonomy.getRank(clade) != rank):
            clade = taxonomy.getParentNcbid(int(clade))

        if clade is None:
            print('There is no ncbi taxon id defined at rank %s for ncbi taxon id %s'
                  % (rank, ncbid))
            continue
        cladeNcbiToBp[clade] = cladeNcbiToBp.get(clade, 0) + size
    taxonomy.close()

    tuples = sorted(cladeNcbiToBp.iteritems(), key=lambda pair: pair[1], reverse=True)

    if outFile is not None:
        out = csv.OutFileBuffer(outFile)
        for clade, size in tuples:
            out.writeText(str(clade) + '\t' + str(size) + '\n')
        out.close()
    return tuples
def getFirstLabelAtAllowedRank():
    """
    For each labeled sequence, replaces its ncbi taxon id by the closest taxon id (the taxon id
    itself or one of its ancestors) that is at an allowed rank (isRankNcbidAllowed) and stores
    the resulting labels. Works on hard-coded paths.
    """
    predFile1 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    predFile2 = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000LabelsSpecies.txt'
    taxonomyDbFile = \
        '/net/metagenomics/projects/PPSmg/data/nobackup/NCBI20120828/ncbiTax/ncbitax_sqlite.db'

    seqIdToLabel = csv.getMapping(predFile1, 0, 1, sep='\t', comment='#')
    outPred = csv.OutFileBuffer(predFile2)
    taxonomy = tax.TaxonomyNcbi(taxonomyDbFile)

    for seqId, labelList in seqIdToLabel.iteritems():
        # the mapping stores a list of values, the first one is taken
        ncbid = int(labelList[0])
        while not taxonomy.isRankNcbidAllowed(ncbid):
            ncbid = taxonomy.getParentNcbid(ncbid)
        outPred.writeText(str(seqId + '\t' + str(ncbid) + '\n'))

    taxonomy.close()
    outPred.close()
def outToCami(ppspOutFile):
    """
    Creates a cami output file (ppspOutFile + '.cami'), in format:

    #CAMI Format for Binning
    @Task:Binning
    @Version:1.0
    @ContestantID:CONTESTANTID
    @SampleID:SAMPLEID
    @Referencebased:T
    @Assemblybased:T
    @ReplicateInfo:T
    @@SEQUENCEID TAXID BINID
    read1201 123 123
    read1202 123 123
    read1203 131564 131564
    read1204 562 562.1
    read1205 562 562.2

    The columns are tab separated, the taxon id is also used as the bin id.

    @param ppspOutFile: tab separated input file with two columns (sequence name, taxon id);
        empty lines are skipped, a line with a different number of columns raises a ValueError
    """
    header = ('#CAMI Format for Binning\n'
              '@Task:Binning\n'
              '@Version:1.0\n'
              '@ContestantID:CONTESTANTID\n'
              '@SampleID:SAMPLEID\n'
              '@Referencebased:T\n'
              '@Assemblybased:T\n'
              '@ReplicateInfo:T\n'
              '@@SEQUENCEID\tTAXID\tBINID\n')

    out = csv.OutFileBuffer(ppspOutFile + '.cami')
    try:
        out.writeText(header)
        # the input file is closed as soon as it is read (also if an error occurs)
        with open(ppspOutFile) as inFile:
            for line in inFile:
                line = line.strip('\n')
                if not line.strip():
                    continue  # e.g. an empty line at the end of the file
                name, taxonId = line.split('\t', 2)
                out.writeText("%s\t%s\t%s\n" % (name, taxonId, taxonId))
    finally:
        out.close()
def sortReads(inReadsFile, outReadsFile,
              headerToNum=lambda x: int(x.split('_', 2)[1].strip('nr'))):
    """
    Sorts the reads according to the numbers contained in their headers (ascending).

    The input lines are read in pairs (header line, sequence line). The sequences are kept
    compressed in the memory while sorting.

    @param inReadsFile: input file with reads
    @param outReadsFile: output file with the sorted reads
    @param headerToNum: function that maps a read header to the number used for sorting
    """
    records = []  # list of (header, compressed sequence, sort number)
    header = None
    for lineNum, line in enumerate(csv.getColumnAsList(inReadsFile, sep='\n')):
        if lineNum % 2 == 0:
            header = line  # even lines are headers, odd lines are sequences
            continue
        assert header is not None
        records.append((header, zlib.compress(line), headerToNum(header)))
        header = None

    records.sort(key=lambda record: record[2])

    out = csv.OutFileBuffer(outReadsFile)
    for header, compressedSeq, _ in records:
        out.writeText(str(header) + '\n' + str(zlib.decompress(compressedSeq)) + '\n')
    out.close()
def createEvalMetaFile(outputDir):
    """
    Creates a metafile ('biobox.yaml' in the output directory) describing the results.

    An entry (name, type, value) is written for each of the result files that exists:
    'precision_recall.csv', 'precision_recall_correction.csv', 'consistency.txt' and for each
    file of the directory 'confusion_matrix'. The rank of a confusion table is derived from
    its file path (the part between the last two dots, up to the first underscore).

    @param outputDir: directory that contains the results
    """
    precisionRecallFile = os.path.join(outputDir, 'precision_recall.csv')
    precisionRecallCorrectionFile = os.path.join(outputDir, 'precision_recall_correction.csv')
    confusionMatrixDir = os.path.join(outputDir, 'confusion_matrix')
    consistencyFile = os.path.join(outputDir, 'consistency.txt')

    metaOut = csv.OutFileBuffer(os.path.join(outputDir, 'biobox.yaml'))

    # (result file, entry template), in the order in which the entries are written
    singleFileEntries = [
        (precisionRecallFile,
         'name: Precision and recall\ntype: csv\nvalue: %s\n\n'),
        (precisionRecallCorrectionFile,
         'name: Precision and recall with correction\ntype: csv\nvalue: %s\n\n'),
        (consistencyFile,
         'name: Consistency\ntype: txt\nvalue: %s\n\n'),
    ]
    for resultFile, template in singleFileEntries:
        if os.path.isfile(resultFile):
            metaOut.writeText(template % resultFile)

    if os.path.isdir(confusionMatrixDir):
        confusionTemplate = (
            'name: Confusion table for %s\n'
            'description: Where rows correspond to the true assignments and columns '
            'correspond to the assignments by a binning method.\n'
            'type: csv\n'
            'value: %s\n\n')
        for fileName in os.listdir(confusionMatrixDir):
            filePath = os.path.join(confusionMatrixDir, fileName)
            rank = filePath.rsplit('.', 2)[1].split('_')[0]
            metaOut.writeText(confusionTemplate % (rank, filePath))

    metaOut.close()
def getSeeds(inSortedFasta, outSeedsFasta):
    """
    Finds the seeds among the sequences: a sequence is a seed if neither the (upper case)
    sequence nor its reverse complement is contained in a seed that was found before.
    The seeds are written to the output file (in upper case) and the total, seed and
    duplicate counts are printed.

    @param inSortedFasta: DNA sequences sorted according to the sequence length in the descending order
    @param outSeedsFasta: a fasta file that contains all seeds
    """
    out = csv.OutFileBuffer(outSeedsFasta)
    seedList = []
    seqList = fasta.getSequencesToList(inSortedFasta)  # list of (sequenceName, sequence)

    for seqId, seq in seqList:
        seq = seq.upper()
        # computed once per sequence (not once per compared seed)
        revComplSeq = str(Seq(seq).reverse_complement())

        newSeed = True
        for seedSeq in seedList:
            if len(seedSeq) < len(seq):
                continue  # a shorter seed cannot contain the sequence
            if seq in seedSeq or revComplSeq in seedSeq:
                newSeed = False
                break

        if newSeed:
            seedList.append(seq)
            out.writeText(str('>' + seqId + '\n' + seq + '\n'))
    out.close()

    print('total %s' % len(seqList))
    print('seed count %s' % len(seedList))
    print('duplicate %s' % (len(seqList) - len(seedList)))
def getProfile(readsFFastaFile, communityFile, contigMFastaFile, contigLFastaFile,
               taxonomyMFile, taxonomyDbFile, outProfileFile):
    """
    Gets the profile of the dataset.

    For each taxon id of the reads, one comma separated line is written: taxon id, % of reads,
    % of contig bp, average coverage, MB of contigs, number of contigs, scientific name and the
    names at the individual ranks. The lines are sorted according to the % of contig bp
    (descending). The last line contains the overall average coverage, MB and contig count.
    If no contig contributes to the total bp, the bp based ratios are reported as 0.0.

    @param readsFFastaFile: fasta file with reads (passed to getReadsTaxonIdList)
    @param communityFile: community file (passed to getReadsTaxonIdList)
    @param contigMFastaFile: fasta file with contigs, the lengths of the contigs are taken from it
    @param contigLFastaFile: fasta file with contigs, the coverage is taken from its whole
        sequence names (via getCoverage)
    @param taxonomyMFile: taxonomic labels of the contigs (read via predToDict)
    @param taxonomyDbFile: taxonomy in the sqlite3 format
    @param outProfileFile: output file
    """
    def _ratio(part, total):
        # a zero total yields 0.0 instead of a ZeroDivisionError
        if total:
            return part / float(total)
        return 0.0

    # get map: taxonId -> read count (the first entry of the list is skipped)
    taxonIdToReadCount = {}
    readTotalCount = 0
    for taxonId in getReadsTaxonIdList(readsFFastaFile, communityFile,
                                       readHeaderToCommunityId=getCommunityId)[1:]:
        taxonIdToReadCount[taxonId] = taxonIdToReadCount.get(taxonId, 0) + 1
        readTotalCount += 1

    # get map: taxonId -> contig count
    # get map: taxonId -> contig bp
    taxonIdToContigCount = {}
    taxonIdToContigBp = {}
    totalContigCount = 0
    seqIdToTaxonId = csv.predToDict(taxonomyMFile)
    seqIdToBp = fas.getSequenceToBpDict(contigMFastaFile)
    for seqId, bp in seqIdToBp.iteritems():
        totalContigCount += 1
        taxonId = seqIdToTaxonId[seqId]
        taxonIdToContigBp[taxonId] = taxonIdToContigBp.get(taxonId, 0) + bp
        taxonIdToContigCount[taxonId] = taxonIdToContigCount.get(taxonId, 0) + 1

    # bp and coverage per taxon id, only contigs found in contigLFastaFile are counted
    taxonIdToTotalBp = {}
    taxonIdToAvgSumCov = {}
    taxonIdToAvgCov = {}
    totalBp = 0.0
    for taxonId in taxonIdToContigBp:
        taxonIdToTotalBp[taxonId] = 0.0
        taxonIdToAvgSumCov[taxonId] = 0.0
        taxonIdToAvgCov[taxonId] = 0.0

    for seqId in fas.fastaFileToDictWholeNames(contigLFastaFile):
        shortSeqId = getShortContigId(seqId)
        if shortSeqId in seqIdToBp:
            coverage = getCoverage(seqId)
            bp = seqIdToBp[shortSeqId]
            taxonId = seqIdToTaxonId[shortSeqId]
            taxonIdToTotalBp[taxonId] += bp
            taxonIdToAvgSumCov[taxonId] += float(coverage) * float(bp)
            totalBp += bp

    for taxonId, bp in taxonIdToTotalBp.iteritems():
        if bp > 0:
            taxonIdToAvgCov[taxonId] = taxonIdToAvgSumCov[taxonId] / float(bp)

    tupleList = []
    taxonomy = taxonomy_ncbi.TaxonomyNcbi(taxonomyDbFile, considerNoRank=True)
    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[2:]
    avgCoverage = 0.0
    for taxonId, readCount in taxonIdToReadCount.iteritems():
        scName = ScientificNameAtRank(taxonId, taxonomy, ranks)
        taxonBp = taxonIdToTotalBp.get(taxonId, 0)
        taxonAvgCov = taxonIdToAvgCov.get(taxonId, 0)
        tupleList.append((
            taxonId,
            round(100 * _ratio(readCount, readTotalCount), 1),
            round(100 * _ratio(taxonBp, totalBp), 1),
            round(taxonAvgCov, 2),
            round(taxonBp / 1000000.0, 2),
            taxonIdToContigCount.get(taxonId, 0),
            taxonomy.getScientificName(taxonId),
            scName.getNameAtRank('phylum'),
            scName.getNameAtRank('class'),
            scName.getNameAtRank('order'),
            scName.getNameAtRank('family'),
            scName.getNameAtRank('genus'),
            scName.getNameAtRank('species')  # this could be done in a nicer way
        ))
        avgCoverage += taxonAvgCov * taxonBp
    # overall average coverage weighted by bp
    avgCoverage = _ratio(avgCoverage, totalBp)

    # sort according to the % of contig bp
    tupleList.sort(key=lambda x: x[2], reverse=True)

    out = csv.OutFileBuffer(outProfileFile)
    out.writeText(
        '#taxonId, % reads, % contigs, avg coverage, MB contigs, contigs count, strain name, '
        + ",".join(ranks) + '\n')
    for entry in tupleList:
        out.writeText(','.join(map(str, entry)) + '\n')
    out.writeText('#Sum/Avg., -, -, ' + str(round(avgCoverage, 2)) + ', '
                  + str(round(totalBp / 1000000.0, 2)) + ', '
                  + str(totalContigCount) + ', -\n')
    out.close()
    taxonomy.close()
def _main():
    """
    See the module description.

    Parses the command line arguments and, for each mapping file ("*.csv" or "*.tax") of the
    input directory and its "*.fna" file, stores the sequences and their taxonomy paths to the
    output directory. Entries whose taxon id or top level taxon id is to be filtered out, and
    entries whose taxon id is not found in the taxonomy, are skipped. Mapping files whose
    number of entries differs from the corresponding fasta file are skipped as a whole.
    """
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument(
        '-i', '--input-data-dir', action='store', nargs=1, required=True,
        help=('Directory that contains fasta files and corresponding mapping files, '
              'for each "*.tax" (or "*.csv") file there must be a "*.fna" file with the same '
              'name. All files with suffix "tax" (or "*.csv") will be considered. '
              '(Takes only Bacteria and Archaea)'),
        metavar='input_dir', dest='inDir')

    parser.add_argument(
        '-o', '--output-dir', action='store', nargs=1, required=True,
        help='Directory that contains the output files.',
        metavar='out_dir', dest='outDir')

    parser.add_argument(
        '-s', '--source-type', required=True, nargs=1, choices=["s", "a"],
        help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument(
        '-t', '--taxonomy-file', nargs=1, type=file, required=True,
        help='NCBI taxonomy database file in the sqlite3 format.',
        metavar='ncbitax_sqlite.db', dest='taxonomy')

    parser.add_argument(
        '-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) what fill be filtered out. (optional)',
        metavar=('"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'
                 ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'
                 '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'
                 '1077529,361146,511563,361147"'),
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]

    filterOutTaxonIdsSet = set()
    if args.filterOut:
        try:
            filterOutTaxonIdsSet.update(
                [int(taxonId) for taxonId in str(args.filterOut[0]).split(',')])
        except ValueError:
            print('Taxon ids that are to be filtered out are in a wrong format! '
                  'Comma separated integers are needed!')
            raise

    # argparse opened the taxonomy file only to verify that it can be read,
    # only its path is needed, therefore the handle is closed right away
    taxonomyFileHandle = args.taxonomy[0]
    taxonomyFilePath = taxonomyFileHandle.name
    taxonomyFileHandle.close()

    taxonomy = _TaxonomyWrap(taxonomyFilePath)
    try:
        for d in [inDir, outDir]:
            assert os.path.isdir(d), 'Path: "' + d + '" does not exists!'

        # create db for each gene
        for mapFilePath in glob.glob(
                os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')):  # *.csv or *.tax

            assert mapFilePath.endswith(('.csv', '.tax')), \
                'The mapping files can either end with .csv or .tax ' + mapFilePath

            base = os.path.basename(mapFilePath).rsplit('.', 1)[0]  # cut out dir path and suffix
            fastaDict = fas.fastaFileToDict(
                os.path.join(os.path.dirname(mapFilePath), (base + '.fna')))  # map: seqId -> seq
            print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

            # map: seqId -> ncbid
            if 'a' in srcType:  # Amphora
                mapDict = {}
                for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                    v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                    assert ((k not in mapDict) or (mapDict[k] == v)), str(
                        'There are at least two different values for key: ' + str(k)
                        + ' in ' + mapFilePath)
                    mapDict[k] = v
            elif 's' in srcType:  # Silva
                mapTmp = csv.getMapping(mapFilePath, 0, 2, '\t')
                mapDict = {}
                for k, v in mapTmp.iteritems():
                    mapDict[k] = int(v[0])
            else:
                assert False, 'Unsupported source type!'

            # same number of entries in both files (fasta and mapping) ?
            if len(mapDict) != len(fastaDict):
                print(str('%s: The mapping file and the corresponding fasta file have different number of entries: '
                          + '"%s" "%s" these files will be skipped!')
                      % (base, str(len(mapDict)), str(len(fastaDict))))
                continue

            # are duplicates in the mapping file ?
            count = len(csv.getColumnAsList(mapFilePath))
            if len(mapDict) != count:
                print('%s: The mapping file contained duplicates! unique: %s non-unique: %s'
                      % (base, str(len(mapDict)), str(count)))

            # store data to the output directory
            outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
            outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
            count = 0
            filteredLeaf = 0
            filteredSup = 0
            notMapped = 0
            noBacArch = 0
            for seqId, taxonId in mapDict.iteritems():
                if taxonId in filterOutTaxonIdsSet:
                    filteredLeaf += 1
                    continue
                path = taxonomy.getPathToRoot(taxonId)
                if path is None:
                    print('Could not find: %s for seqId: %s record skipped!'
                          % (str(taxonId), seqId))
                    notMapped += 1
                    continue
                topLevel = int(path.split(';', 1)[0])
                if topLevel in filterOutTaxonIdsSet:
                    filteredSup += 1
                    continue
                if topLevel not in [2, 2157]:  # Bacteria, Archaea
                    # such an entry is only reported and counted, it is still stored
                    noBacArch += 1
                    print('NoBactArch: %s' % topLevel)

                seq = fastaDict[seqId]
                if 'a' in srcType:  # Amphora
                    entryId = seqId
                elif 's' in srcType:  # Silva
                    entryId = str(seqId + '|ncbid:' + str(taxonId))

                outTax.writeText(str(entryId + '\t' + path + '\n'))
                outDna.writeText(str('>' + entryId + '\n' + seq + '\n'))
                count += 1

            outDna.close()
            outTax.close()
            print('Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s'
                  % (count, filteredLeaf, filteredSup, notMapped))
            if noBacArch > 0:
                print('WARN: stored %s of non Bacterial and non Archaeal sequences: '
                      % (noBacArch))

        # Example arguments
        # Silva:
        # -i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s
        # -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...
        # Amphora:
        # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a
        # -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
        # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db
    finally:
        # release the taxonomy also if the processing fails
        taxonomy.close()
    print('done')
def _writeTrainAccuracyOverview(acc, outFilePath):
    """
        Stores the accuracy overview of the given accuracy object to a file.

        The overview is generated for all entries of taxonomy_ncbi.TAXONOMIC_RANKS except for the first one.

        @param acc: an initialized accuracy object (accuracy.Accuracy)
        @param outFilePath: path of the text file to which the overview is written
    """
    out = csv.OutFileBuffer(outFilePath)
    try:
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:], minFracClade=None,
                                           minFracPred=None, overview=True))
    finally:
        # close the buffer even if the overview could not be generated
        out.close()


def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
        Computes the training accuracy for the PPS training data.

        This function doesn't consider training data used to train intermediate (misc?) nodes!
        The training data that correspond to the sample specific data is fragmented (via PPS) and
        contained in the training data of different lengths.

        All training fasta files are merged into one file ('all_train_data.fna' in taWorkingDir), the merged file
        is predicted via the PPS 'predict.rb' script and the predictions are compared to the taxon ids encoded
        in the names of the training files. The following files are written to the output directory:
        'train_accuracy_all.txt', 'train_accuracy_misc.txt', 'train_accuracy_<dirName>.txt' and the confusion
        matrices with prefixes 'train_accuracy_cmp_all' and 'train_accuracy_cmp_<dirName>'.

        On a non-posix system, only the merged fasta file is written, a message is printed and the function returns.

        @param workingDir: working directory of the PPS+ pipeline
        @param taWorkingDir: working directory for the accuracy computation
        @param sampleSpecificDir: directory containing the sample specific data
        @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
        @param outputDir: directory for output files
        @param ppsScripts: directory containing PPS scripts
        @param ppsConfigFilePath: the PPS configuration file
        @param ppsInstallDir: directory where PPS is installed
        @param predictLogFileName: logging file for PPS prediction
        @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
        @param databaseFile: ncbi taxonomy file in the sqlite3 format

        @raise AssertionError: if one of the input directories or files doesn't exist
        @raise Exception: if the PPS prediction returns with a non-zero status
    """
    for d in [workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir, ppsScripts,
              os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for filePath in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(filePath), "File '%s' doesn't exist!" % filePath

    ranks = taxonomy_ncbi.TAXONOMIC_RANKS[1:]

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    out = csv.OutFileBuffer(allTrainFastaFile)
    try:
        for d in trainDirList:
            dName = os.path.basename(d)
            isSampleSpecific = (d == sampleSpecificDir)
            for fileName in os.listdir(d):
                # the file name starts with the taxon id, the last (at most) two dot separated fields are dropped
                taxonId = int(os.path.basename(fileName).rsplit('.', 2)[0])
                for seqId, seq in fasta.fastaFileToDict(os.path.join(d, fileName)).iteritems():
                    entryId = str(taxonId) + '|' + dName + '|' + seqId
                    if isSampleSpecific:
                        # the label is appended explicitly only to the sample specific sequences
                        entryId += '|label:' + str(taxonId)
                    out.writeText('>' + entryId + '\n' + seq + '\n')
                    seqIdToTruePred[entryId] = taxonId
    finally:
        out.close()

    # predict the merged file using the generated model
    if os.name != 'posix':
        print("Can't run PPS on a non-posix system!")
        return

    predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
    # the log file is closed even if the process cannot be started
    with open(predictLogFileName, 'w') as logOut:
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir,
                                       stdout=logOut, stderr=subprocess.STDOUT)
        predictProc.wait()
    if predictProc.returncode != 0:
        raise Exception("PPS 'predict' training data returned with non-zero status: %s, cmd: %s" %
                        (predictProc.returncode, predictCmd))

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    # taxon ids of all parents of the modelled leaf taxon ids
    notLeafTaxonIds = set()
    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    try:
        for leafTaxonId in modelLeafTaxonIds:
            notLeafTaxonIds.update(map(int, taxonomyS.getParentsNcbidSet(leafTaxonId)))
    finally:
        taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        # the label is the number after ':' in the last '|' separated field of the sequence name
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    sampleSpecificName = os.path.basename(sampleSpecificDir).strip()
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        # the second '|' separated field of a sequence name is the name of its source directory
        if str(seqId).split('|', 2)[1].strip() != sampleSpecificName:
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    _writeTrainAccuracyOverview(acc, os.path.join(outputDir, 'train_accuracy_all.txt'))
    # the taxonomy is kept open and reused by the following accuracy computations
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    _writeTrainAccuracyOverview(acc, os.path.join(outputDir, 'train_accuracy_misc.txt'))
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile, ranks)
    for rank in ranks:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    # the taxonomy is kept open and reused by the following confusion matrices
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        dNameStripped = str(dName).strip()
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == dNameStripped:
                seqIdToBpSub[seqId] = bp
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        _writeTrainAccuracyOverview(acc, os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        acc.close(closeTaxonomy=False)

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM, ranks)
        for rank in ranks:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

    taxonomyA.close()
    taxonomyCM.close()
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
        Reads all sequences. For each taxonId creates a file that contains all sequences mapped to this taxonId.
        If a seqId appears more than once, only its first occurrence is stored (the other occurrences are ignored)
        since accession numbers are unique.

        The map files and the fasta files are processed in pairs (the i-th map file with the i-th fasta file).
        Progress and summary counts are printed to the standard output.

        @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
        @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
        @param outputDir: directory in which the files '<taxonId>.fna' are created
    """
    taxonIdToOutBuffer = {}
    seqIdSet = set()

    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print('processing %s %s' % (mapFilePath, fastaFilePath))
        seqCount = 0
        storedSeqCount = 0

        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seqIdSet:
                # this sequence id has already been stored (possibly from a previous file)
                totalIdenticalSeqCount += 1
                continue
            seqIdSet.add(seqId)

            # only the first taxon id to which the sequence is mapped is considered
            taxonId = seqIdToNcbidList[seqId][0]

            outBuffer = taxonIdToOutBuffer.get(taxonId)
            if outBuffer is None:
                outBuffer = csv.OutFileBuffer(os.path.join(outputDir, str(taxonId) + '.fna'))
                taxonIdToOutBuffer[taxonId] = outBuffer

            outBuffer.writeText('>' + seqId + '\n' + seq + '\n')
            # NOTE(review): the buffer is closed after each write (as in the original code), presumably to limit
            # the number of simultaneously opened files; this assumes that OutFileBuffer.writeText can be called
            # again after close() - verify against the OutFileBuffer implementation
            outBuffer.close()
            storedSeqCount += 1

            # report sequences that consist only of the 'N' characters
            seqNoNewLine = common.noNewLine(seq)
            if len(seqNoNewLine.replace('N', '')) == 0:
                print('zeros %s %s %s' % (seqId, fastaFilePath, len(seqNoNewLine)))

        print('totalSeq, storedSeq %s %s' % (seqCount, storedSeqCount))
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount

    print('totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount %s %s %s' %
          (totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount))
    print('sequences merged')
def generateConfusionMatrix(self, rank, prefixOutputPath):
    """
        Generates confusion matrix at given rank. The object must have been initialized considering this rank.

        The matrix is written to the file: prefixOutputPath + '.' + rank + '_cmp.csv'. Rows correspond to the
        reference taxa, columns to the predicted taxa, the last row and column stand for 'unassigned'.
        Each non-empty cell contains the number of bp (in thousands) and the sequence count in brackets.
        Summary lines (matches, mismatches, assigned in the prediction and in the reference, total) follow.

        Nothing is written (the method just returns) if the initialization of the object failed, the rank
        is not among the allowed ranks, or the directory of the output prefix doesn't exist.

        @param rank: the rank at which the confusion matrix is generated
        @param prefixOutputPath: prefix of the output file path (a prefix without a directory component
            refers to the current working directory)
    """
    if self._initFailed:
        return
    rankId = self._taxonomy.getRankId(rank)
    if rankId not in self._allowedRankIdsSet:
        print("Can't consider rank: " + rank)
        return
    # os.path.dirname returns '' for a prefix without a directory component, i.e. the current directory is meant
    outputDir = os.path.dirname(prefixOutputPath) or os.curdir
    if not os.path.isdir(outputDir):
        print("Output prefix is wrong, the corresponding directory doesn't exist: " +
              os.path.dirname(prefixOutputPath))
        return

    def toKilo(bp):
        # number of bp rounded to thousands (as a string)
        return str(int(round(float(bp) / 1000.0)))

    # entries of the confusion matrix
    tableCountMap = {}  # (taxonId_ref, taxonId_pred) -> count
    tableBpMap = {}  # (taxonId_ref, taxonId_pred) -> bp

    # predictions (and reference) at the given rank
    seqNameToPred = self._rankIdToPredMap[rankId]
    seqNameToRef = self._rankIdToRefMap[rankId]

    predTaxonIdSet = set()
    refTaxonIdSet = set()

    # fill in entries of the confusion matrix
    for seqId, bp in self._seqNameToBp.iteritems():
        predId = seqNameToPred.get(seqId, None)
        refId = seqNameToRef.get(seqId, None)
        if predId is not None:
            predTaxonIdSet.add(predId)  # stores it only if it's predicted at this rank
        if refId is not None:
            refTaxonIdSet.add(refId)
        key = (refId, predId)  # None stands for 'unassigned'
        tableCountMap[key] = tableCountMap.get(key, 0) + 1
        tableBpMap[key] = tableBpMap.get(key, 0) + bp

    # get taxonIds contained in prediction and reference prediction, common for both, unique for pred. and ref.
    commonTaxonIdSet = predTaxonIdSet.intersection(refTaxonIdSet)
    uniquePredIdSet = predTaxonIdSet.difference(commonTaxonIdSet)
    uniqueRefIdSet = refTaxonIdSet.difference(commonTaxonIdSet)

    # get taxonIds contained in predictions and reference predictions as lists of scientific names
    commonNames, commonMap = self._taxonomy.getSortedScientificNames(commonTaxonIdSet)
    uniquePredNames, uniquePredMap = self._taxonomy.getSortedScientificNames(uniquePredIdSet)
    uniqueRefNames, uniqueRefMap = self._taxonomy.getSortedScientificNames(uniqueRefIdSet)

    # headers (common taxa first, then the unique ones, 'unassigned' is always the last entry)
    predHeader = commonNames + uniquePredNames + ['unassigned']  # predictions
    refHeader = commonNames + uniqueRefNames + ['unassigned']  # reference
    commonHeaderTaxonIds = [commonMap[name] for name in commonNames]
    # None stands for: predicted as unassigned / unassigned in reference
    predHeaderTaxonIds = commonHeaderTaxonIds + [uniquePredMap[name] for name in uniquePredNames] + [None]
    refHeaderTaxonIds = commonHeaderTaxonIds + [uniqueRefMap[name] for name in uniqueRefNames] + [None]

    # count matches
    matchCount = 0
    matchBp = 0
    for taxonId in commonTaxonIdSet:
        key = (taxonId, taxonId)
        if key in tableCountMap:
            matchCount += tableCountMap[key]
            matchBp += tableBpMap[key]

    # count mismatches (the last 'unassigned' entries of the headers are not considered)
    mismatchCount = 0
    mismatchBp = 0
    for predTaxonId in predHeaderTaxonIds[:-1]:
        for refTaxonId in refHeaderTaxonIds[:-1]:
            if predTaxonId == refTaxonId:
                continue
            assert (predTaxonId is not None) and (refTaxonId is not None)
            key = (refTaxonId, predTaxonId)
            if key in tableCountMap:
                mismatchCount += tableCountMap[key]
                mismatchBp += tableBpMap[key]

    # count pred total, ref total (entries assigned in the prediction, or in the reference respectively)
    predTotalCount = 0
    predTotalBp = 0
    refTotalCount = 0
    refTotalBp = 0
    for predTaxonId in predHeaderTaxonIds:
        for refTaxonId in refHeaderTaxonIds:
            key = (refTaxonId, predTaxonId)
            if key not in tableCountMap:
                continue
            count = tableCountMap[key]
            bp = tableBpMap[key]
            if predTaxonId is not None:
                predTotalCount += count
                predTotalBp += bp
            if refTaxonId is not None:
                refTotalCount += count
                refTotalBp += bp

    # total
    totalCount = len(self._seqNameToBp)
    totalBp = sum(self._seqNameToBp.values())

    def summaryLine(label, bp, count, bpDenominator, countDenominator):
        # label, bp (in thousands), count, fraction of bp, fraction of counts
        return (label + toKilo(bp) + 'k, ' + str(count) + ', ' + self._div(bp, bpDenominator, 1) + ' %k, '
                + self._div(count, countDenominator, 1) + ' %\n')

    # write the confusion matrix to a file
    out = csv.OutFileBuffer(os.path.normpath(prefixOutputPath + '.' + str(rank) + '_cmp.csv'))
    try:
        out.writeText(', '.join(['ref/pred'] + predHeader) + '\n')

        for refName, refTaxonId in zip(refHeader, refHeaderTaxonIds):
            line = refName
            for predTaxonId in predHeaderTaxonIds:
                key = (refTaxonId, predTaxonId)
                line += ', '
                if key in tableCountMap:
                    line += toKilo(tableBpMap[key]) + 'k (' + str(tableCountMap[key]) + ')'
            out.writeText(line + '\n')

        out.writeText(',\n')
        out.writeText(summaryLine('Matches, ', matchBp, matchCount,
                                  matchBp + mismatchBp, matchCount + mismatchCount))
        out.writeText(summaryLine('Mismatches, ', mismatchBp, mismatchCount,
                                  matchBp + mismatchBp, matchCount + mismatchCount))
        out.writeText(summaryLine('Pred. assigned, ', predTotalBp, predTotalCount, totalBp, totalCount))
        out.writeText(summaryLine('Ref. assigned, ', refTotalBp, refTotalCount, totalBp, totalCount))
        out.writeText('Total fasta, ' + toKilo(totalBp) + 'k, ' + str(totalCount) + '\n')
    finally:
        # close the output file even if writing of the matrix failed
        out.close()
def maskDb(action, inDir, outDir, rank, clades, taxonomyFilePath, verbose=False):
    """
        Main function (function interface), see module description.

        Depending on the action: 'cl' writes the list of excluded taxon ids to the file 'exclude_list.txt'
        in the output directory; 'mr' creates symbolic links in the output directory to all reference fasta
        files from the input directory that were not excluded; 'mg' writes filtered copies of the '.tax' files
        and of the corresponding '.fna' files from the input directory to the output directory.

        @param action: one action that will be performed [cl, mr, mg] ~ (generate list, mask seq, mask mg)
        @type action: str
        @param inDir: directory containing input files
        @type inDir: str
        @param outDir: directory containing output files
        @type outDir: str
        @param rank: the data will be excluded at this rank
        @type rank: str
        @param clades: a file containing clades that will be masked (one ncbi taxon id at a line),
            or a set of ncbi taxon ids that will be masked
        @type clades: file or set of int
        @param taxonomyFilePath: taxonomy database file in the sqlite3 format
        @type taxonomyFilePath: str
        @param verbose: if True, progress messages are printed
        @type verbose: bool

        @raise AssertionError: if an input parameter is not valid or a taxon id is not contained in the taxonomy
    """
    # check input parameters
    assert action in ['cl', 'mr', 'mg'], str('Given action is not supported: ' + action)
    if action == 'mr':
        assert os.name == 'posix', 'Symbolic links can be created only on posix systems, action "mr" is not valid!'
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), str("Directory doesn't exists: " + dirPath)
    assert rank in _RANKS, str('Not supported rank: ' + rank)
    assert os.path.isfile(taxonomyFilePath), str("Taxonomy database file doesn't exist: " + taxonomyFilePath)
    assert isinstance(clades, set) or (isinstance(clades, str) and os.path.isfile(clades)), str(
        "Parameter 'clades' can be either a file or a set of ncbi taxonIds to be excluded.")

    # maps a rank to a lower rank (i.e. to the rank that follows it in _RANKS)
    toLowerRank = {}
    for i in range(1, len(_RANKS)):
        toLowerRank[_RANKS[i - 1]] = _RANKS[i]

    refFastaPattern = os.path.join(os.path.normpath(inDir), r'*.f[na][as]')  # *.fas or *.fna
    mapFilePattern = os.path.join(os.path.normpath(inDir), r'*.tax')  # *.tax

    taxonomy = _TaxonomyWrapMD(taxonomyFilePath)
    try:
        # leaf clades to mask
        if isinstance(clades, set):
            inCladesSet = set(map(int, clades))
        else:
            inCladesSet = set(map(int, csv.getColumnAsList(clades)))

        # clades in the reference
        refCladesSet = set()
        if action in ['cl', 'mr']:
            # get the list of all taxon ids that appear in the directory (as PPS reference)
            for fastaFilePath in glob.glob(refFastaPattern):
                refCladesSet.add(_refFilePathToTaxonId(fastaFilePath))  # taxonId.1.fna or taxonId.1.fas
        elif action in ['mg']:
            # get the list of all taxon ids that appear in any file in the input directory as taxonomy ".tax"
            for mapFilePath in glob.glob(mapFilePattern):
                refCladesSet.update(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t')))
        else:
            assert False, str('Not supported action: ' + action)

        # checks whether taxonIds are in the taxonomy
        for taxonId in inCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from clades list is not contained in the taxonomy!' % taxonId)
        for taxonId in refCladesSet:
            assert taxonomy.exists(taxonId), str(
                'taxonId: %s from the reference is not contained in the taxonomy!' % taxonId)

        # checks whether the taxonIds are leafs (doesn't have to be (unless you want to mask at the strain level))
        for taxonId in inCladesSet:
            if not taxonomy.isLeaf(taxonId):
                print('Taxon id %s does not represent a leaf clade in the taxonomy.' % taxonId)

        if verbose:
            print('Initial checks done.')

        # taxonIds that should be excluded
        toExcludeSet = set()
        for taxonId in inCladesSet:
            taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, rank)
            if taxonIdAtRank is None:  # the lineage is not defined at this rank ! try a lower rank !
                print('Taxon id: "%s" is not defined at rank: "%s"' % (taxonId, rank))
                currentRank = rank  # find a lower rank at which it's defined
                while currentRank in toLowerRank:
                    currentRank = toLowerRank[currentRank]
                    taxonIdAtRank = taxonomy.getTaxonIdAtRank(taxonId, currentRank)
                    if taxonIdAtRank is not None:
                        break
                if taxonIdAtRank is None:
                    # not defined at any lower rank, the taxon id itself is masked
                    taxonIdAtRank = taxonId
                    currentRank = _STRAIN
                print('Taxon id: %s will be masked at rank: %s' % (taxonId, currentRank))

            # all child clades (and itself)
            toExcludeSet.add(int(taxonIdAtRank))
            toExcludeSet.update(map(int, taxonomy.getAllChildren(taxonIdAtRank)))

        # all clades that should be excluded (there is at least one sequence for each taxonId in the reference)
        toExcludeSet.intersection_update(refCladesSet)
        if verbose:
            print('Data to mask collected done.')
            # formatted as one string, so that the output doesn't depend on whether print is a statement
            print('To exclude: %s' % len(toExcludeSet))

        # exclude data from the reference
        if action == 'cl':
            # generates a list of taxonIds
            out = csv.OutFileBuffer(os.path.join(outDir, 'exclude_list.txt'))
            try:
                for taxonId in toExcludeSet:
                    out.writeText(str(taxonId) + '\n')
            finally:
                out.close()
        elif action == 'mr':
            # masked reference sequences (create sim links to files that were not excluded)
            for fastaFilePath in glob.glob(refFastaPattern):
                taxonId = _refFilePathToTaxonId(fastaFilePath)  # taxonId.1.fna or taxonId.1.fas
                if taxonId not in toExcludeSet:
                    os.symlink(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)))
        elif action == 'mg':
            # exclude sequences from the marker gene databases
            for mapFilePath in glob.glob(mapFilePattern):

                # get entries that can stay in the mapping and fasta files
                allowedEntriesSet = set(map(_mgSeqIdToTaxonId, csv.getColumnAsList(mapFilePath, sep='\t')))
                allowedEntriesSet.difference_update(toExcludeSet)

                # filter out entries from the mapping file
                csv.filterOutLines(mapFilePath, os.path.join(outDir, os.path.basename(mapFilePath)),
                                   allowedEntriesSet, entryModifyFunction=_mgSeqIdToTaxonId, colNum=0, sep='\t')

                # filter out entries from the fasta file
                fastaFilePath = str(mapFilePath.rsplit('.', 1)[0] + '.fna')
                fas.filterOutSequences(fastaFilePath, os.path.join(outDir, os.path.basename(fastaFilePath)),
                                       allowedEntriesSet, seqNameModifyFunction=_mgSeqIdToTaxonId)
        else:
            assert False, 'Not supported action!'
    finally:
        # release the taxonomy also if a check or the masking failed
        taxonomy.close()

    if verbose:
        print('Data masked done.')