def removeSequences(mg):
    """
    Copy the marker gene fasta file of one marker gene and leave out the
    sequences whose id is contained in the removal list.

    The id that is looked up is the number captured from a trailing
    'ncbid:<digits>' of the sequence name; a name that does not match the
    pattern is looked up unchanged. The number of dropped sequences is printed.

    @param mg: marker gene name, used to build the source and destination file names
    """
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg
                      + '_bact+arch_dnaV.noalign.fna')
    pattern = r'.*ncbid:([0-9]+)$'

    # Alternative (Silva) setup that was used with this function:
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    #pattern = r'^([^\-]+)\-.*$'

    idsToRemove = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    records = fas.fastaFileToDict(srcFilePath)
    writer = csv.OutFileBuffer(dstFilePath)
    idRegex = re.compile(pattern)

    removed = 0
    for name in records:
        if idRegex.sub(r'\1', str(name)) in idsToRemove:
            removed += 1
            continue
        writer.writeText(str('>' + str(name) + '\n' + str(records[name]) + '\n'))
    writer.close()
    print('%s removeSequences %s' % (mg, removed))
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
    Reads all sequences and writes them to one fasta file per taxonId
    (outputDir/<taxonId>.fna). A seqId that was already seen (in this or in
    a previous fasta file) is skipped and only counted as identical.
    A line starting with 'zeros' is printed for each stored sequence that
    is empty after removing all 'N' characters.

    @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
    @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    @param outputDir: directory in which the per-taxonId fasta files are created
    """
    buffersByTaxonId = {}
    seenSeqIds = set()
    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print('processing %s %s' % (mapFilePath, fastaFilePath))
        seqCount = 0
        storedSeqCount = 0
        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seenSeqIds:
                totalIdenticalSeqCount += 1
                continue
            seenSeqIds.add(seqId)

            # only the first taxonId mapped to the seqId is used
            taxonId = seqIdToNcbidList[seqId][0]
            target = buffersByTaxonId.get(taxonId)
            if target is None:
                target = csv.OutFileBuffer(os.path.join(outputDir, str(str(taxonId) + '.fna')))
                buffersByTaxonId[taxonId] = target
            target.writeText(str('>' + seqId + '\n' + seq + '\n'))
            # NOTE(review): the buffer is closed after every write and is written
            # to again for later sequences of the same taxonId; this presumably
            # relies on OutFileBuffer reopening its file on write - confirm.
            target.close()
            storedSeqCount += 1

            bases = common.noNewLine(seq)
            if len(bases.replace('N', '')) == 0:
                print('zeros %s %s %s' % (seqId, fastaFilePath, len(bases)))

        print('totalSeq, storedSeq %s %s' % (seqCount, storedSeqCount))
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount

    print('totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount %s %s %s'
          % (totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount))
    print('sequences merged')
def concatenate(directory, outputFile):
    """
    Writes one record per file of a directory to the output file.

    For each file, a header line '>' + (file name up to its first '.') is
    written, followed by one line per sequence of that fasta file; each of
    these lines is the sequence with 200 'N' characters appended.

    @param directory: directory whose entries are all read as fasta files
    @param outputFile: path of the file that is written
    """
    spacer = 200*'N'
    writer = csv.OutFileBuffer(outputFile)
    for fileName in os.listdir(directory):
        recordName = fileName.partition('.')[0]
        sequences = fasta.fastaFileToDict(os.path.join(directory, fileName))
        writer.writeText('>' + str(recordName) + '\n')
        for seq in sequences.itervalues():
            writer.writeText(str(seq) + spacer + '\n')
    writer.close()
def filterOutReads(inFasta='', outFasta='', notAllowedSet=None):
    """
    Copies a fasta file and leaves out the sequences that belong to not
    allowed ids.

    The id of a sequence is the part of its name in front of the first '_'
    (a name that contains no '_' is used as a whole).

    @param inFasta: path of the input fasta file (default: '' as it was
        hard-coded before, i.e. it has to be provided to do anything useful)
    @param outFasta: path of the output fasta file (default: '', see above)
    @param notAllowedSet: collection of ids whose sequences are left out,
        default: set(['BA000019.2']) (Nostoc sp. PCC 7120)
    """
    if notAllowedSet is None:
        notAllowedSet = set(['BA000019.2'])  # Nostoc sp. PCC 7120

    out = csv.OutFileBuffer(outFasta)
    try:
        for seqId, seq in fas.fastaFileToDict(inFasta).iteritems():
            # id prefix: everything in front of the first underscore
            prefix = re.sub(r'([^_]+)_.*', r'\1', seqId)
            if prefix not in notAllowedSet:
                out.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
    finally:
        # close the output also if reading or writing fails
        out.close()
def removeEntries(mg):
    """
    Removes sequences from the marker gene files at the level from species, genus, family etc.

    Pass 1 (*.tax file): an entry is dropped if the ';' separated integer
    list in its second column shares at least one id with the removal list;
    all other entries are copied. Pass 2 (*.noalign.fna file): sequences
    whose id was dropped in pass 1 are left out. Both counts are printed.

    @param mg: marker gene name, used to build the source and destination file names
    """
    removeListPath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids_species.txt'

    # pass 1: the taxonomy file
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg + '_bact+arch_dnaV.tax')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/' + mg
                      + '_bact+arch_dnaV.tax')
    taxOut = csv.OutFileBuffer(dstFilePath)
    removeSet = set(csv.getColumnAsList(removeListPath, colNum=0, comment='#'))
    removeSetInt = set(int(entry) for entry in removeSet if entry != '')
    removeSetIds = set()
    removed = 0
    entryIds = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    idLists = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    for entryId, idList in zip(entryIds, idLists):
        lineSetInt = set(int(token) for token in idList.split(';') if token != '')
        if removeSetInt & lineSetInt:  # the intersection is not empty
            removed += 1
            removeSetIds.add(entryId)
        else:
            taxOut.writeText(str(entryId + '\t' + idList + '\n'))
    taxOut.close()
    print('%s removedEntries %s' % (mg, removed))

    # pass 2: the corresponding fasta file
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/' + mg
                      + '_bact+arch_dnaV.noalign.fna')
    fastaOut = csv.OutFileBuffer(dstFilePath)
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if seqId in removeSetIds:
            removed += 1
            continue
        fastaOut.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
    fastaOut.close()
    print('%s removedSeq %s' % (mg, removed))
def _getLabelsCreateFasta():
    """
    To process the original mercier dataset with 59 strains.

    Take only contigs that were mapped to the reference genomes.
    Output a fasta file and a mapping file (contig id -> taxon id).

    :rtype : None
    """
    dataDir = '/net/metagenomics/projects/PPSmg/data/V35/'

    # input fasta file
    fastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000.txt'
    seqIdToSeq = fas.fastaFileToDict(fastaFilePath)

    # contigs mapped to genome names
    nameLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000_blast_labels.txt'
    seqIdToNameLabels = csv.getMapping(nameLabelsFilePath, 0, 1, sep='\t', comment='#')

    # mapping: genome name -> taxon id
    genomeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_list2.txt'  # genome_list.txt
    nameLabelToNcbid = csv.getMapping(genomeListFilePath, 0, 2, sep=';', comment='#')

    # both outputs are opened before anything is written
    outFastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna'
    outFasta = csv.OutFileBuffer(outFastaFilePath)
    outLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    outLabels = csv.OutFileBuffer(outLabelsFilePath)
    del dataDir  # only documents the common prefix of the paths above

    # the contigs that have a genome name label
    mappedSeqIds = [seqId for seqId in seqIdToSeq if seqId in seqIdToNameLabels]

    # store mapped sequences
    for seqId in mappedSeqIds:
        outFasta.writeText(str('>' + str(seqId) + '\n' + seqIdToSeq[seqId] + '\n'))
    outFasta.close()
    print('fasta created')

    # store the taxonomic mapping of the mapped sequences
    # (first genome name of a contig, first taxon id of a genome name)
    for seqId in mappedSeqIds:
        nameLabel = seqIdToNameLabels[seqId][0]
        ncbid = nameLabelToNcbid[nameLabel][0]
        outLabels.writeText(str(str(seqId) + '\t' + str(ncbid) + '\n'))
    outLabels.close()
    print('labels created')
def removeEntries(mg):
    """
    Removes sequences from the marker gene files at the level from species, genus, family etc.

    Pass 1 (*.tax file): an entry is dropped if the ';' separated integer
    list in its second column shares at least one id with the removal list;
    all other entries are copied. Pass 2 (*.noalign.fna file): sequences
    whose id was dropped in pass 1 are left out. Both counts are printed.

    @param mg: marker gene name, used to build the source and destination file names
    """
    removeListPath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids_species.txt'

    # pass 1: the taxonomy file
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg + '_bact+arch_dnaV.tax')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/' + mg
                      + '_bact+arch_dnaV.tax')
    taxOut = csv.OutFileBuffer(dstFilePath)
    removeSet = set(csv.getColumnAsList(removeListPath, colNum=0, comment='#'))
    removeSetInt = set(int(entry) for entry in removeSet if entry != '')
    removeSetIds = set()
    removed = 0
    entryIds = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    idLists = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    for entryId, idList in zip(entryIds, idLists):
        lineSetInt = set(int(token) for token in idList.split(';') if token != '')
        if removeSetInt & lineSetInt:  # the intersection is not empty
            removed += 1
            removeSetIds.add(entryId)
        else:
            taxOut.writeText(str(entryId + '\t' + idList + '\n'))
    taxOut.close()
    print('%s removedEntries %s' % (mg, removed))

    # pass 2: the corresponding fasta file
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes2/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/mgScenarios/speciesRemoved/db/' + mg
                      + '_bact+arch_dnaV.noalign.fna')
    fastaOut = csv.OutFileBuffer(dstFilePath)
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if seqId in removeSetIds:
            removed += 1
            continue
        fastaOut.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
    fastaOut.close()
    print('%s removedSeq %s' % (mg, removed))
def filterOutContigs(inFastaFile, inTaxFile, outFastaFile, outTaxFile, notAllowedTaxonIdList):
    """
    Copies a fasta file and its taxonomic assignment and leaves out the
    sequences that are assigned to not allowed taxon ids.

    The taxon id of each sequence is converted to int before it is looked
    up, therefore only int entries of notAllowedTaxonIdList can match.
    A summary (taxon id -> number of filtered out sequences) is printed.

    @param inFastaFile: input fasta file
    @param inTaxFile: input file with the mapping: seqId -> taxonId
    @param outFastaFile: output fasta file containing the kept sequences
    @param outTaxFile: output file with lines: seqId, tab, taxonId (kept sequences)
    @param notAllowedTaxonIdList: taxon ids whose sequences are filtered out
    """
    fastaWriter = csv.OutFileBuffer(outFastaFile)
    taxWriter = csv.OutFileBuffer(outTaxFile)
    seqIdToTaxonId = csv.predToDict(inTaxFile)
    notAllowedTaxonIdSet = set(notAllowedTaxonIdList)

    # counter per not allowed taxon id
    filteredCounts = dict.fromkeys(notAllowedTaxonIdSet, 0)

    for seqId, seq in fas.fastaFileToDict(inFastaFile).iteritems():
        taxonId = int(seqIdToTaxonId[seqId])
        if taxonId in notAllowedTaxonIdSet:
            filteredCounts[taxonId] += 1
            continue
        fastaWriter.writeText('>' + str(seqId) + '\n' + str(seq) + '\n')
        taxWriter.writeText(str(seqId) + '\t' + str(taxonId) + '\n')

    fastaWriter.close()
    taxWriter.close()
    print("filtered taxonId -> seqCount: " + str(filteredCounts))
def _getLabelsCreateFasta():
    """
    To process the original mercier dataset with 59 strains.

    Take only contigs that were mapped to the reference genomes.
    Output a fasta file and a mapping file (contig id -> taxon id).

    :rtype : None
    """
    # input fasta file
    fastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000.txt'
    seqIdToSeq = fas.fastaFileToDict(fastaFilePath)

    # contigs mapped to genome names
    nameLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigs_1000_blast_labels.txt'
    seqIdToNameLabels = csv.getMapping(nameLabelsFilePath, 0, 1, sep='\t', comment='#')

    # mapping: genome name -> taxon id
    genomeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_list2.txt'  # genome_list.txt
    nameLabelToNcbid = csv.getMapping(genomeListFilePath, 0, 2, sep=';', comment='#')

    # both outputs are opened before anything is written
    outFastaFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000.fna'
    outFasta = csv.OutFileBuffer(outFastaFilePath)
    outLabelsFilePath = '/net/metagenomics/projects/PPSmg/data/V35/contigsMappedBlast1000Labels.txt'
    outLabels = csv.OutFileBuffer(outLabelsFilePath)

    # the contigs that have a genome name label
    mappedSeqIds = [seqId for seqId in seqIdToSeq if seqId in seqIdToNameLabels]

    # store mapped sequences
    for seqId in mappedSeqIds:
        outFasta.writeText(str('>' + str(seqId) + '\n' + seqIdToSeq[seqId] + '\n'))
    outFasta.close()
    print('fasta created')

    # store the taxonomic mapping of the mapped sequences
    # (first genome name of a contig, first taxon id of a genome name)
    for seqId in mappedSeqIds:
        nameLabel = seqIdToNameLabels[seqId][0]
        ncbid = nameLabelToNcbid[nameLabel][0]
        outLabels.writeText(str(str(seqId) + '\t' + str(ncbid) + '\n'))
    outLabels.close()
    print('labels created')
def removeSequences(mg):
    """
    Copy the marker gene fasta file of one marker gene and leave out the
    sequences whose id is contained in the removal list.

    The id that is looked up is the number captured from a trailing
    'ncbid:<digits>' of the sequence name; a name that does not match the
    pattern is looked up unchanged. The number of dropped sequences is printed.

    @param mg: marker gene name, used to build the source and destination file names
    """
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg
                      + '_bact+arch_dnaV.noalign.fna')
    pattern = r'.*ncbid:([0-9]+)$'

    # Alternative (Silva) setup that was used with this function:
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    #pattern = r'^([^\-]+)\-.*$'

    idsToRemove = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    records = fas.fastaFileToDict(srcFilePath)
    writer = csv.OutFileBuffer(dstFilePath)
    idRegex = re.compile(pattern)

    removed = 0
    for name in records:
        if idRegex.sub(r'\1', str(name)) in idsToRemove:
            removed += 1
            continue
        writer.writeText(str('>' + str(name) + '\n' + str(records[name]) + '\n'))
    writer.close()
    print('%s removeSequences %s' % (mg, removed))
def _main():
    """
    See the module description.

    For each mapping file (*.csv or *.tax) of the input directory and the
    fasta file (*.fna) with the same base name, stores a fasta file and a
    *.tax file (stored id, tab, taxonomic path) to the output directory.
    A record is skipped if its taxon id or the first id of its taxonomic
    path is in the set of filtered out taxon ids, or if the taxonomy
    returns no path for its taxon id. A pair of files with a different
    number of entries is skipped as a whole.

    Command line arguments: -i input directory, -o output directory,
    -s source type ("s" Silva, "a" Amphora), -t taxonomy sqlite file,
    -n comma separated taxon ids that are filtered out (optional).

    Raises ValueError if the -n argument is not a list of comma separated
    integers and AssertionError if a directory doesn't exist or a mapping
    file is inconsistent.
    """
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument(
        '-i', '--input-data-dir', action='store', nargs=1, required=True,
        help=('Directory that contains fasta files and corresponding mapping files, '
              'for each "*.tax" (or "*.csv") file there must be a "*.fna" file with the same name. '
              'All files with suffix "tax" (or "*.csv") will be considered. '
              '(Takes only Bacteria and Archaea)'),
        metavar='input_dir', dest='inDir')

    parser.add_argument(
        '-o', '--output-dir', action='store', nargs=1, required=True,
        help='Directory that contains the output files.', metavar='out_dir', dest='outDir')

    parser.add_argument(
        '-s', '--source-type', required=True, nargs=1, choices=["s", "a"],
        help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
        dest='srcType')

    parser.add_argument(
        '-t', '--taxonomy-file', nargs=1, type=file, required=True,
        help='NCBI taxonomy database file in the sqlite3 format.', metavar='ncbitax_sqlite.db',
        dest='taxonomy')

    parser.add_argument(
        '-n', '--not-considered-taxonIds', action='store', nargs=1,
        help='Comma separated leaf level or top level taxonIds (as a string) what fill be filtered out. (optional)',
        metavar=('"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'
                 ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'
                 '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'
                 '1077529,361146,511563,361147"'),
        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]

    filterOutTaxonIdsSet = set()
    if args.filterOut:
        try:
            filterOutTaxonIdsSet = set(map(int, str(args.filterOut[0]).split(',')))
        except ValueError:
            # int() failed on one of the items, give a hint and propagate the error
            print('Taxon ids that are to be filtered out are in a wrong format! Comma separated integers are needed!')
            raise

    taxonomy = _TaxonomyWrap(args.taxonomy[0].name)
    try:
        for dirPath in [inDir, outDir]:
            assert os.path.isdir(dirPath), 'Path: "' + dirPath + '" does not exists!'

        # create db for each gene
        for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')):  # *.csv or *.tax

            # the glob pattern matches also other suffixes (e.g. ".tsv")
            assert mapFilePath.endswith(('.csv', '.tax')), \
                'The mapping files can either end with .csv or .tax ' + mapFilePath

            base = os.path.basename(mapFilePath).rsplit('.', 1)[0]  # cut out dir path and suffix
            fastaDict = fas.fastaFileToDict(
                os.path.join(os.path.dirname(mapFilePath), (base + '.fna')))  # map: seqId -> seq
            print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

            mapDict = _mainReadMapping(mapFilePath, srcType)  # map: seqId -> ncbid

            # same number of entries in both files (fasta and mapping) ?
            if len(mapDict) != len(fastaDict):
                print(str('%s: The mapping file and the corresponding fasta file have different number of entries: ' +
                          '"%s" "%s" these files will be skipped!') % (base, str(len(mapDict)), str(len(fastaDict))))
                continue

            # are duplicates in the mapping file ?
            count = len(csv.getColumnAsList(mapFilePath))
            if len(mapDict) != count:
                print('%s: The mapping file contained duplicates! unique: %s non-unique: %s' % (
                    base, str(len(mapDict)), str(count)))

            # store data to the output directory
            outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
            outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
            try:
                count, filteredLeaf, filteredSup, notMapped, noBacArch = _mainStoreRecords(
                    mapDict, fastaDict, taxonomy, filterOutTaxonIdsSet, srcType, outDna, outTax)
            finally:
                # close the output files also if a record can't be processed
                outDna.close()
                outTax.close()

            print('Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s' %
                  (count, filteredLeaf, filteredSup, notMapped))
            if noBacArch > 0:
                print('WARN: stored %s of non Bacterial and non Archaeal sequences: ' % (noBacArch))
    finally:
        taxonomy.close()

    # Sample arguments
    # Silva:
    # -i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s
    # -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...
    # Amphora:
    # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a
    # -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db
    print('done')


def _mainReadMapping(mapFilePath, srcType):
    """
    Reads one mapping file for _main, returns the map: seqId -> ncbid (int).

    @param mapFilePath: tab separated mapping file
    @param srcType: "a" (Amphora) or "s" (Silva)
    """
    mapDict = {}
    if 'a' in srcType:  # Amphora: ncbid is parsed from the entry after its last '|'
        for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
            v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
            assert ((k not in mapDict) or (mapDict[k] == v)), str(
                'There are at least two different values for key: ' + str(k) + ' in ' + mapFilePath)
            mapDict[k] = v
    elif 's' in srcType:  # Silva: the first value mapped to a key is taken as ncbid
        for k, v in csv.getMapping(mapFilePath, 0, 2, '\t').iteritems():
            mapDict[k] = int(v[0])
    else:
        assert False, 'Unsupported source type!'
    return mapDict


def _mainStoreRecords(mapDict, fastaDict, taxonomy, filterOutTaxonIdsSet, srcType, outDna, outTax):
    """
    Writes the records of one gene that pass the filters for _main.

    @return: tuple (stored, filteredLeaf, filteredSup, notMapped, noBacArch)
    """
    count = 0
    filteredLeaf = 0
    filteredSup = 0
    notMapped = 0
    noBacArch = 0
    for seqId, taxonId in mapDict.iteritems():
        if taxonId in filterOutTaxonIdsSet:
            filteredLeaf += 1
            continue
        path = taxonomy.getPathToRoot(taxonId)
        if path is None:
            print('Could not find: %s for seqId: %s record skipped!' % (str(taxonId), seqId))
            notMapped += 1
            continue
        topLevel = int(path.split(';', 1)[0])
        if topLevel in filterOutTaxonIdsSet:
            filteredSup += 1
            continue
        if topLevel not in [2, 2157]:  # Bacteria, Archaea
            # such a record is only reported, it is still stored
            noBacArch += 1
            print('NoBactArch: %s' % topLevel)

        seq = fastaDict[seqId]
        if 'a' in srcType:  # Amphora
            recordId = seqId
        elif 's' in srcType:  # Silva
            recordId = str(seqId + '|ncbid:' + str(taxonId))

        outTax.writeText(str(recordId + '\t' + path + '\n'))
        outDna.writeText(str('>' + recordId + '\n' + seq + '\n'))
        count += 1
    return count, filteredLeaf, filteredSup, notMapped, noBacArch
def main():
    """
    Wraps pIRS read simulator to simulate Illumina paired end reads.

    For each sequence listed in the frequencies file, the sequence is
    written to a temporary fasta file, pIRS is run on it and the generated
    reads are appended to the merged files 'reads_1.fq' and 'reads_2.fq' in
    the working directory (the prefix '@read_' of the reads is extended by
    the sequence name).

    Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print('runs only on posix systems')
        return

    # parse arguments
    parser = argparse.ArgumentParser(
        description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
        epilog='''''')
    parser.add_argument('-c', '--config', nargs=1, type=file, required=True,
                        help='configuration file of the simulator',
                        metavar='configMetagenome.cfg', dest='config')
    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"', dest='p')
    args = parser.parse_args()

    config = Config(args.config[0], 'Sim')
    # only the first item given after -p is used
    pirsParam = args.p[0] if args.p else ''

    # reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    # the pIRS optional parameters mustn't contain those predefined elsewhere (e.g. in the config)
    reservedOptions = ('-m', '-v', '-l', '-x', '-i', '-o')
    if any(option in pirsParam for option in reservedOptions):
        print('pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ')
        return

    # check working directory, create temporary directory
    tmpDir = os.path.join(workingDir, 'tmp')
    if not os.path.isdir(workingDir):
        print(str('The working directory does not exists, create it! (' + str(workingDir) + ')'))
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#')
    outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq'))
    pirsBinary = os.path.join(pirsInstallDir, 'pirs')

    for seqName in seqNameToFreq:
        seq = seqNameToSeq[seqName]
        coverage = float(seqNameToFreq[seqName][0])*coverageFrequencyMultiplier

        # pIRS reads its reference from a fasta file
        fastaFile = os.path.join(tmpDir, str(seqName + '.fna'))
        refOut = OutFileBuffer(fastaFile)
        refOut.writeText(str('>' + seqName + '\n' + seq + '\n'))
        refOut.close()

        cmd = str(' '.join([pirsBinary, 'simulate',
                            '-i', fastaFile,
                            '-x', str(coverage),
                            '-m', str(insertSizeMean),
                            '-v', str(insertSizeSd),
                            '-l', str(readLength),
                            '-o', seqName,
                            pirsParam]))
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)
        proc.wait()
        if proc.returncode != 0:
            # a failure is only reported, the read files are still opened below
            sys.stderr.write(str('command failed: ' + cmd))

        # append generated reads to the merged files
        readsPrefix = seqName + '_' + str(readLength) + '_' + str(insertSizeMean)
        readNamePrefix = str('@read_' + seqName + '_')
        for suffix, merged in (('_1.fq.gz', outReads1Merged), ('_2.fq.gz', outReads2Merged)):
            reads = gzip.open(os.path.join(tmpDir, str(readsPrefix + suffix)), 'rb')
            content = reads.read()
            merged.writeText(str(content.replace('@read_', readNamePrefix) + '\n'))
            reads.close()

    outReads1Merged.close()
    outReads2Merged.close()
def mergeSequences(mapFilePathList, fastaFilePathList, outputDir):
    """
    Reads all sequences and writes them to one fasta file per taxonId
    (outputDir/<taxonId>.fna). A seqId that was already seen (in this or in
    a previous fasta file) is skipped and only counted as identical.
    A line starting with 'zeros' is printed for each stored sequence that
    is empty after removing all 'N' characters.

    @param mapFilePathList: list of files where each contain mapping: seqId -> taxonId
    @param fastaFilePathList: list of fasta files that contain mapping: seqId -> seq
    @param outputDir: directory in which the per-taxonId fasta files are created
    """
    buffersByTaxonId = {}
    seenSeqIds = set()
    totalSeqCount = 0
    totalStoredSeqCount = 0
    totalIdenticalSeqCount = 0

    for mapFilePath, fastaFilePath in zip(mapFilePathList, fastaFilePathList):
        print('processing %s %s' % (mapFilePath, fastaFilePath))
        seqCount = 0
        storedSeqCount = 0
        seqIdToSeq = fasta.fastaFileToDict(fastaFilePath)
        seqIdToNcbidList = csv.getMapping(mapFilePath, 0, 1, sep='\t', comment='#')

        for seqId, seq in seqIdToSeq.iteritems():
            seqCount += 1
            if seqId in seenSeqIds:
                totalIdenticalSeqCount += 1
                continue
            seenSeqIds.add(seqId)

            # only the first taxonId mapped to the seqId is used
            taxonId = seqIdToNcbidList[seqId][0]
            target = buffersByTaxonId.get(taxonId)
            if target is None:
                target = csv.OutFileBuffer(os.path.join(outputDir, str(str(taxonId) + '.fna')))
                buffersByTaxonId[taxonId] = target
            target.writeText(str('>' + seqId + '\n' + seq + '\n'))
            # NOTE(review): the buffer is closed after every write and is written
            # to again for later sequences of the same taxonId; this presumably
            # relies on OutFileBuffer reopening its file on write - confirm.
            target.close()
            storedSeqCount += 1

            bases = common.noNewLine(seq)
            if len(bases.replace('N', '')) == 0:
                print('zeros %s %s %s' % (seqId, fastaFilePath, len(bases)))

        print('totalSeq, storedSeq %s %s' % (seqCount, storedSeqCount))
        totalSeqCount += seqCount
        totalStoredSeqCount += storedSeqCount

    print('totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount %s %s %s'
          % (totalSeqCount, totalStoredSeqCount, totalIdenticalSeqCount))
    print('sequences merged')
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
    Computes the training accuracy for the PPS training data.
    This function doesn't consider training data used to train intermediate (misc?) nodes!
    The training data that correspond to the sample specific data is fragmented (via PPS) and
    contained in the training data of different lengths.

    Steps: all training fasta files are merged into one file, the file is
    predicted via the PPS 'predict.rb' script (posix only, otherwise the
    function prints a message and returns) and accuracy reports as well as
    confusion matrices are written to the output directory.

    @param workingDir: working directory of the PPS+ pipeline
    @param taWorkingDir: working directory for the accuracy computation
    @param sampleSpecificDir: directory containing the sample specific data
    @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
    @param outputDir: directory for output files
    @param ppsScripts: directory containing PPS scripts
    @param ppsConfigFilePath: the PPS configuration file
    @param ppsInstallDir: directory where PPS is installed
    @param predictLogFileName: logging file for PPS prediction
    @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
    @param databaseFile: ncbi taxonomy file in the sqlite3 format
    """
    requiredDirs = [workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                    ppsScripts, os.path.dirname(predictLogFileName)]
    for dirPath in requiredDirs:
        assert os.path.isdir(dirPath), "Directory '%s' doesn't exist!" % dirPath
    for filePath in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(filePath), "File '%s' doesn't exist!" % filePath

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir] + [os.path.join(ppsTrainDataDir, entry)
                                          for entry in os.listdir(ppsTrainDataDir)]

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    for trainDir in trainDirList:
        dirName = os.path.basename(trainDir)
        isSampleSpecific = (trainDir == sampleSpecificDir)
        for fileName in os.listdir(trainDir):
            # the file name up to the first of its (at most two) last dots is the taxon id
            taxonId = int(os.path.basename(fileName).rsplit('.', 2)[0])
            for seqId, seq in fasta.fastaFileToDict(os.path.join(trainDir, fileName)).iteritems():
                mergedId = str(taxonId) + '|' + dirName + '|' + seqId
                if isSampleSpecific:
                    # only the sample specific sequences get the label appended here
                    mergedId = mergedId + '|label:' + str(taxonId)
                out.writeText('>' + mergedId + '\n' + seq + '\n')
                seqIdToTruePred[mergedId] = taxonId
    out.close()

    # predict the merged file using the generated model
    if os.name != 'posix':
        print("Can't run PPS on a non-posix system!")
        return

    predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
    logOut = open(predictLogFileName, 'w')
    predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir,
                                   stdout=logOut, stderr=subprocess.STDOUT)
    predictProc.wait()
    logOut.close()
    if predictProc.returncode != 0:
        raise Exception("PPS 'predict' training data returned with non-zero status: %s, cmd: %s" %
                        (predictProc.returncode, predictCmd))

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    # the parents of the modelled leafs
    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for leafId in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, (taxonomyS.getParentsNcbidSet(leafId)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower,
    # the remaining sequences go to the "misc" dictionaries
    leafBp, leafPred, leafTruePred = {}, {}, {}
    seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc = {}, {}, {}
    for seqId, bp in seqIdToBp.iteritems():
        # the label is parsed from the part of the id after the last '|' (text after its first ':')
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label in notLeafTaxonIds:
            bpDst, predDst, truePredDst = seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc
        else:
            bpDst, predDst, truePredDst = leafBp, leafPred, leafTruePred
        bpDst[seqId] = bp
        predDst[seqId] = seqIdToPred[seqId]
        truePredDst[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = leafBp
    seqIdToPred = leafPred
    seqIdToTruePred = leafTruePred

    # accuracy for all, filter out sample specific data (whole length)
    sampleSpecificName = os.path.basename(sampleSpecificDir).strip()
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != sampleSpecificName:
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    # the taxonomy of the first accuracy object is reused below and closed at the end
    taxonomyA = acc.getTaxonomy()
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for trainDir in trainDirList:
        dirName = os.path.basename(trainDir)
        wantedName = str(dirName).strip()
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId in seqIdToBp:
            if str(seqId).split('|', 2)[1].strip() != wantedName:
                continue
            seqIdToBpSub[seqId] = seqIdToBp[seqId]
            seqIdToPredSub[seqId] = seqIdToPred[seqId]
            seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dirName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                           minFracClade=None, minFracPred=None, overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dirName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)

    taxonomyA.close()
    taxonomyCM.close()
def _init(self, align=True, dm=True, cluster=True):
    """
    Init data, compute: alignment, distance matrix, clusters.

    For each marker gene a filtered fasta file is written that contains at most one region per
    sequence. The filtered files are then (optionally) aligned with the configured aligner, turned
    into distance matrices (mothur "dist.seqs") and clustered (mothur "cluster"). Finally the
    distance matrices and clusters are read into self._mgToDM and self._mgToCluster.
    The method does its work only once per object (guarded by self._initDone).

    @param align: if True, run the aligner for each marker gene
    @param dm: if True, compute the distance matrix for each marker gene
    @param cluster: if True, compute the clusters for each marker gene
    """
    if self._initDone:
        return
    self._initDone = True

    self._mgList = []  # list of names of marker genes
    mgToFastaPath = {}  # marker gene name -> fasta file path

    # collect regions from Amphora mg
    # NOTE(review): the glob matches '*.gff' although the files are read as fasta files below - confirm
    for path in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
        name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))
        mg = re.sub(r'([^_]+)_dna', r'\1', name)
        self._mgList.append(mg)
        mgToFastaPath[mg] = path

    # add 16S
    for mg in ['5S_rRNA', '16S_rRNA', '23S_rRNA']:
        mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
        self._mgList.append(mg)

    # For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
    mgToFilteredFastaPath = {}
    mgToSeqNameToTaxPathDict = {}  # mg -> seqName (~region name) -> pred
    for mg in self._mgList:
        mgToSeqNameToTaxPathDict[mg] = {}

    for seq in self._sequences.sequences:
        regionPrefix = str(str(seq.scaffold.id) + '_' + str(seq.id))
        for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(),
                                 seq.getCandidateTaxPathTagList(),
                                 seq.getCandidateTaxPathDictList()):
            mgToSeqNameToTaxPathDict[mg][str(regionPrefix + '_' + tag)] = pred

    # for each marker gene: choose only one sequence region for each mg and sequence
    # all sequences are predicted at least at superkingdom
    for mg in self._mgList:
        seqNameToPred = mgToSeqNameToTaxPathDict[mg]  # sequence region predictions for this mg
        seqNameToSeq = fastaFileToDict(mgToFastaPath[mg])  # read the fasta file
        outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
        mgToFilteredFastaPath[mg] = outPath
        out = OutFileBuffer(outPath)

        seqBaseToSeqName = {}  # sequence base (scaffId_seqId) -> list of region names
        for seqName in seqNameToSeq:
            seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
            seqBaseToSeqName.setdefault(seqBase, []).append(seqName)

        for seqBase in seqBaseToSeqName:
            seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
            seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()

            # sequence regions that have a prediction (at least at rank superkingdom)
            candidateSeq = [s for s in seqBaseToSeqName[seqBase] if seqNameToPred.get(s) is not None]
            if not candidateSeq:
                continue

            # sequence regions predicted at least at the same rank as the whole sequence
            selected = [s for s in candidateSeq
                        if seqBaseTaxPathDict is None or len(seqNameToPred[s]) >= len(seqBaseTaxPathDict)]
            if not selected:
                # all sequence regions are predicted higher than the sequence:
                # fall back to all regions that share the most specific prediction
                maxDepth = max(len(seqNameToPred[s]) for s in candidateSeq)
                selected = [s for s in candidateSeq if len(seqNameToPred[s]) == maxDepth]

            # take the longest sequence (max() keeps the first one on ties)
            sMax = max(selected, key=lambda s: len(seqNameToSeq[s]))
            out.writeText(str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n'))
        out.close()

    mgToAlignPath = {}
    for mg in self._mgList:
        mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

    # build alignment
    if align:
        for mg in self._mgList:
            alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                           + ' -out ' + mgToAlignPath[mg] + ' -quiet')
            assert os.name == 'posix'
            predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1)
            predictProc.wait()
            print('Muscle return code for %s : %s' % (mg, predictProc.returncode))
            if predictProc.returncode != 0:
                sys.stderr.write(str(alignCmd + ' \n'))

    # compute DM
    if dm:
        mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
        for mg in self._mgList:
            mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                            + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print('Mothur return code dist: %s %s' % (mg, mothurProc.returncode))

    # cluster
    if cluster:
        mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
        for mg in self._mgList:
            distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
            mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                            + ', method=furthest, hard=t, precision=1000)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print('Mothur return code cluster: %s %s' % (mg, mothurProc.returncode))

    # read DM and clusters

    # sequence predictions
    self._seqIdToTaxPathDict = {}
    self._seqIdToWeight = {}
    for seq in self._sequences.sequences:
        seqId = int(seq.id)
        self._seqIdToTaxPathDict[seqId] = seq.getTaxonomyPath()
        self._seqIdToWeight[seqId] = seq.getTaxonomyPathWeight()

    # similarity thresholds
    self._mgToMaxThreshold = {}
    tmpDict = getMapping(self._configMG.get('mgSimilarityThresholds'), 0, 1, sep='\t', comment='#')
    for k in tmpDict:
        self._mgToMaxThreshold[k] = float(tmpDict[k][0])

    self._mgToDM = {}
    self._mgToCluster = {}
    for mg in self._mgList:
        alignDir = os.path.dirname(mgToAlignPath[mg])
        self._mgToDM[mg] = forEachLine(os.path.join(alignDir, str(mg + '.align.phylip.dist')), DM())
        self._mgToCluster[mg] = forEachLine(os.path.join(alignDir, str(mg + '.align.phylip.fn.list')),
                                            MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
def _init(self, align=True, dm=True, cluster=True):
    """
    Init data, compute: alignment, distance matrix, clusters.

    For each marker gene a filtered fasta file is written that contains at most one region per
    sequence. The filtered files are then (optionally) aligned with the configured aligner, turned
    into distance matrices (mothur "dist.seqs") and clustered (mothur "cluster"). Finally the
    distance matrices and clusters are read into self._mgToDM and self._mgToCluster.
    The method does its work only once per object (guarded by self._initDone).

    @param align: if True, run the aligner for each marker gene
    @param dm: if True, compute the distance matrix for each marker gene
    @param cluster: if True, compute the clusters for each marker gene
    """
    if self._initDone:
        return
    self._initDone = True

    self._mgList = []  # list of names of marker genes
    mgToFastaPath = {}  # marker gene name -> fasta file path

    # collect regions from Amphora mg
    # NOTE(review): the glob matches '*.gff' although the files are read as fasta files below - confirm
    for path in glob.glob(os.path.join(os.path.normpath(self._mgWorkingDir), '*.gff')):
        name = re.sub(r'([^\.]+)\..*$', r'\1', os.path.basename(path))
        mg = re.sub(r'([^_]+)_dna', r'\1', name)
        self._mgList.append(mg)
        mgToFastaPath[mg] = path

    # add 16S
    for mg in ['5S_rRNA', '16S_rRNA', '23S_rRNA']:
        mgToFastaPath[mg] = str(self._s16Prefix + '.' + mg + '.fna')
        self._mgList.append(mg)

    # For each marker gene create filtered fasta file that contains for each mg and sequence at most one region.
    mgToFilteredFastaPath = {}
    mgToSeqNameToTaxPathDict = {}  # mg -> seqName (~region name) -> pred
    for mg in self._mgList:
        mgToSeqNameToTaxPathDict[mg] = {}

    for seq in self._sequences.sequences:
        regionPrefix = str(str(seq.scaffold.id) + '_' + str(seq.id))
        for mg, tag, pred in zip(seq.getCandidateTaxPathSourceList(),
                                 seq.getCandidateTaxPathTagList(),
                                 seq.getCandidateTaxPathDictList()):
            mgToSeqNameToTaxPathDict[mg][str(regionPrefix + '_' + tag)] = pred

    # for each marker gene: choose only one sequence region for each mg and sequence
    # all sequences are predicted at least at superkingdom
    for mg in self._mgList:
        seqNameToPred = mgToSeqNameToTaxPathDict[mg]  # sequence region predictions for this mg
        seqNameToSeq = fastaFileToDict(mgToFastaPath[mg])  # read the fasta file
        outPath = os.path.normpath(os.path.join(self._clustDir, str(mg + '.filter.fna')))
        mgToFilteredFastaPath[mg] = outPath
        out = OutFileBuffer(outPath)

        seqBaseToSeqName = {}  # sequence base (scaffId_seqId) -> list of region names
        for seqName in seqNameToSeq:
            seqBase = re.sub(r'^([0-9]+_[0-9]+)[^0-9].*', r'\1', seqName)
            seqBaseToSeqName.setdefault(seqBase, []).append(seqName)

        for seqBase in seqBaseToSeqName:
            seqId = int(re.sub(r'^[0-9]+_([0-9]+)', r'\1', seqBase))
            seqBaseTaxPathDict = self._sequences.getSequence(seqId).getTaxonomyPath()

            # sequence regions that have a prediction (at least at rank superkingdom)
            candidateSeq = [s for s in seqBaseToSeqName[seqBase] if seqNameToPred.get(s) is not None]
            if not candidateSeq:
                continue

            # sequence regions predicted at least at the same rank as the whole sequence
            selected = [s for s in candidateSeq
                        if seqBaseTaxPathDict is None or len(seqNameToPred[s]) >= len(seqBaseTaxPathDict)]
            if not selected:
                # all sequence regions are predicted higher than the sequence:
                # fall back to all regions that share the most specific prediction
                maxDepth = max(len(seqNameToPred[s]) for s in candidateSeq)
                selected = [s for s in candidateSeq if len(seqNameToPred[s]) == maxDepth]

            # take the longest sequence (max() keeps the first one on ties)
            sMax = max(selected, key=lambda s: len(seqNameToSeq[s]))
            out.writeText(str('>' + str(sMax) + '\n' + str(seqNameToSeq[sMax]) + '\n'))
        out.close()

    mgToAlignPath = {}
    for mg in self._mgList:
        mgToAlignPath[mg] = os.path.normpath(os.path.join(self._clustDir, str(mg + '.align.fna')))

    # build alignment
    if align:
        for mg in self._mgList:
            alignCmd = str(self._config.get('aligner') + ' -in ' + mgToFilteredFastaPath[mg]
                           + ' -out ' + mgToAlignPath[mg] + ' -quiet')
            assert os.name == 'posix'
            predictProc = subprocess.Popen(alignCmd, cwd=self._mgWorkingDir, shell=True, bufsize=-1)
            predictProc.wait()
            print('Muscle return code for %s : %s' % (mg, predictProc.returncode))
            if predictProc.returncode != 0:
                sys.stderr.write(str(alignCmd + ' \n'))

    # compute DM
    if dm:
        mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
        for mg in self._mgList:
            mothurCmd = str('time ' + mothur + ' "#dist.seqs(fasta=' + mgToAlignPath[mg]
                            + ', processors=2, countends=F, calc=nogaps, cutoff=0.3, output=lt)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print('Mothur return code dist: %s %s' % (mg, mothurProc.returncode))

    # cluster
    if cluster:
        mothur = os.path.join(os.path.normpath(self._configRRNA16S.get('mothurInstallDir')), 'mothur')
        for mg in self._mgList:
            distFilePath = os.path.join(os.path.dirname(mgToAlignPath[mg]), str(mg + '.align.phylip.dist'))
            mothurCmd = str('time ' + mothur + ' "#cluster(phylip=' + distFilePath
                            + ', method=furthest, hard=t, precision=1000)"')
            assert os.name == 'posix'
            mothurProc = subprocess.Popen(mothurCmd, shell=True, bufsize=-1, cwd=self._mgWorkingDir)
            mothurProc.wait()
            print('Mothur return code cluster: %s %s' % (mg, mothurProc.returncode))

    # read DM and clusters

    # sequence predictions
    self._seqIdToTaxPathDict = {}
    self._seqIdToWeight = {}
    for seq in self._sequences.sequences:
        seqId = int(seq.id)
        self._seqIdToTaxPathDict[seqId] = seq.getTaxonomyPath()
        self._seqIdToWeight[seqId] = seq.getTaxonomyPathWeight()

    # similarity thresholds
    self._mgToMaxThreshold = {}
    tmpDict = getMapping(self._configMG.get('mgSimilarityThresholds'), 0, 1, sep='\t', comment='#')
    for k in tmpDict:
        self._mgToMaxThreshold[k] = float(tmpDict[k][0])

    self._mgToDM = {}
    self._mgToCluster = {}
    for mg in self._mgList:
        alignDir = os.path.dirname(mgToAlignPath[mg])
        self._mgToDM[mg] = forEachLine(os.path.join(alignDir, str(mg + '.align.phylip.dist')), DM())
        self._mgToCluster[mg] = forEachLine(os.path.join(alignDir, str(mg + '.align.phylip.fn.list')),
                                            MCluster(self._seqIdToTaxPathDict, self._mgToMaxThreshold[mg]))
def _main():
    """
    See the module description.

    Parses the command line, then for each mapping file ("*.csv" or "*.tax") in the input directory
    reads the fasta file with the same base name, filters the entries by the given taxon ids and
    writes the remaining sequences ("<base>.fna") and their taxonomy paths ("<base>.tax") to the
    output directory.
    """
    parser = argparse.ArgumentParser(description=__doc__, epilog="""""")

    parser.add_argument('-i', '--input-data-dir', action='store', nargs=1, required=True,
                        help='Directory that contains fasta files and corresponding mapping files, '
                             'for each "*.tax" (or "*.csv") file there must be a "*.fna" file with the same name. '
                             'All files with suffix "tax" (or "*.csv") will be considered. '
                             '(Takes only Bacteria and Archaea)',
                        metavar='input_dir',
                        dest='inDir')

    parser.add_argument('-o', '--output-dir', action='store', nargs=1, required=True,
                        help='Directory that contains the output files.',
                        metavar='out_dir',
                        dest='outDir')

    parser.add_argument('-s', '--source-type', required=True, nargs=1, choices=["s", "a"],
                        help='To determine the source, use "s" for the Silva database and "a" for the Amphora database.',
                        dest='srcType')

    parser.add_argument('-t', '--taxonomy-file', nargs=1, type=file, required=True,
                        help='NCBI taxonomy database file in the sqlite3 format.',
                        metavar='ncbitax_sqlite.db',
                        dest='taxonomy')

    parser.add_argument('-n', '--not-considered-taxonIds', action='store', nargs=1,
                        help='Comma separated leaf level or top level taxonIds (as a string) what fill be filtered out. (optional)',
                        metavar='"2759,10239,77133,155900,408172,32644, 408170,433727,749907,556182,702656,410661,652676,410659,797283'
                                ',408171,703336,256318,32630,433724,766747,488339,942017,1076179,717931,455559,527640,904678,552539,'
                                '54395,198431,358574,415540,511564,369433,380357,81726,198834,271928,311313,2759,749906,1077529,'
                                '1077529,361146,511563,361147"',
                        dest='filterOut')

    # parse arguments
    args = parser.parse_args()
    inDir = args.inDir[0]
    outDir = args.outDir[0]
    srcType = args.srcType[0]

    filterOutTaxonIdsSet = set()
    try:
        if args.filterOut:
            filterOutTaxonIdsSet.update(set(map(int, str(args.filterOut[0]).split(','))))
    except ValueError:
        # int() rejects anything that is not an integer literal
        print('Taxon ids that are to be filtered out are in a wrong format! Comma separated integers are needed!')
        raise

    # validate the directories before the taxonomy is opened, so that nothing is left open on failure
    for dirPath in [inDir, outDir]:
        assert os.path.isdir(dirPath), 'Path: "' + dirPath + '" does not exists!'

    taxonomy = TaxonomyWrap(args.taxonomy[0].name)
    try:
        # create db for each gene
        for mapFilePath in glob.glob(os.path.join(os.path.normpath(inDir), r'*.[ct][sa][vx]')):  # *.csv or *.tax

            assert mapFilePath.endswith(('.csv', '.tax')), \
                'The mapping files can either end with .csv or .tax ' + mapFilePath

            base = os.path.basename(mapFilePath).rsplit('.', 1)[0]  # cut out dir path and suffix
            fastaDict = fas.fastaFileToDict(os.path.join(os.path.dirname(mapFilePath), (base + '.fna')))  # map: seqId -> seq
            print("Processing: %s seq count: %s" % (base, str(len(fastaDict))))

            mapDict = {}  # map: seqId -> ncbid
            if 'a' in srcType:  # Amphora
                for k in csv.getColumnAsList(mapFilePath, colNum=0, sep='\t'):
                    v = int(k.rsplit('|', 1)[1].split(':')[1])  # get ncbid
                    assert ((k not in mapDict) or (mapDict[k] == v)), str(
                        'There are at least two different values for key: ' + str(k) + ' in ' + mapFilePath)
                    mapDict[k] = v
            elif 's' in srcType:  # Silva
                for k, v in csv.getMapping(mapFilePath, 0, 2, '\t').iteritems():
                    mapDict[k] = int(v[0])
            else:
                assert False, 'Unsupported source type!'

            # same number of entries in both files (fasta and mapping) ?
            if len(mapDict) != len(fastaDict):
                print(str('%s: The mapping file and the corresponding fasta file have different number of entries: ' +
                          '"%s" "%s" these files will be skipped!') % (base, str(len(mapDict)), str(len(fastaDict))))
                continue

            # are duplicates in the mapping file ?
            entryCount = len(csv.getColumnAsList(mapFilePath))
            if len(mapDict) != entryCount:
                print('%s: The mapping file contained duplicates! unique: %s non-unique: %s' % (
                    base, str(len(mapDict)), str(entryCount)))

            # store data to the output directory
            outDna = csv.OutFileBuffer(os.path.join(outDir, str(base + '.fna')))
            outTax = csv.OutFileBuffer(os.path.join(outDir, str(base + '.tax')))
            count = 0
            filteredLeaf = 0
            filteredSup = 0
            notMapped = 0
            noBacArch = 0
            try:
                for seqId, taxonId in mapDict.iteritems():
                    if taxonId in filterOutTaxonIdsSet:
                        filteredLeaf += 1
                        continue
                    path = taxonomy.getPathToRoot(taxonId)
                    if path is None:
                        print('Could not find: %s for seqId: %s record skipped!' % (str(taxonId), seqId))
                        notMapped += 1
                        continue
                    topLevel = int(path.split(';', 1)[0])
                    if topLevel in filterOutTaxonIdsSet:
                        filteredSup += 1
                        continue
                    if topLevel not in [2, 2157]:  # Bacteria, Archaea
                        # such entries are only counted and reported, they are still stored
                        noBacArch += 1
                        print('NoBactArch: %s' % topLevel)

                    seq = fastaDict[seqId]
                    if 'a' in srcType:  # Amphora
                        entryId = seqId
                    elif 's' in srcType:  # Silva
                        entryId = str(seqId + '|ncbid:' + str(taxonId))

                    outTax.writeText(str(entryId + '\t' + path + '\n'))
                    outDna.writeText(str('>' + entryId + '\n' + seq + '\n'))
                    count += 1
            finally:
                outDna.close()
                outTax.close()

            print('Stored entries: %s filtered out: %s leaf, %s top level, not mapped: %s' %
                  (count, filteredLeaf, filteredSup, notMapped))
            if noBacArch > 0:
                print('WARN: stored %s of non Bacterial and non Archaeal sequences: ' % (noBacArch))
    finally:
        taxonomy.close()

    # Silva:
    #-i /Users/ivan/Documents/work/binning/database/silva111/arbGenerated -s s -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/silva111/db -n ...

    # Amphora
    # -i /Users/ivan/Documents/work/binning/database/markerGenes3/mGenesExtracted -s a -t /Users/ivan/Documents/work/binning/taxonomy/20121122/ncbitax_sqlite.db
    # -o /Users/ivan/Documents/work/binning/database/markerGenes3/db

    print('done')
def computeTrainingAccuracy(workingDir, taWorkingDir, sampleSpecificDir, ppsTrainDataDir, outputDir, ppsInstallDir,
                            ppsScripts, ppsConfigFilePath, predictLogFileName, modelTaxonIdFilePath, databaseFile):
    """
    Computes the training accuracy for the PPS training data.
    This function doesn't consider training data used to train intermediate (misc?) nodes!
    The training data that correspond to the sample specific data is fragmented (via PPS) and
    contained in the training data of different lengths.

    All training fasta files are merged into one file ("all_train_data.fna" in taWorkingDir), the
    merged file is predicted with the PPS "predict.rb" script and the accuracy reports and confusion
    matrices ("train_accuracy_*") are written to outputDir.
    On a non-posix system only the merged fasta file is written and the function returns.

    @param workingDir: working directory of the PPS+ pipeline
    @param taWorkingDir: working directory for the accuracy computation
    @param sampleSpecificDir: directory containing the sample specific data
    @param ppsTrainDataDir: directory 'sampled_fasta' containing PPS training data
    @param outputDir: directory for output files
    @param ppsInstallDir: directory where PPS is installed
    @param ppsScripts: directory containing PPS scripts
    @param ppsConfigFilePath: the PPS configuration file
    @param predictLogFileName: logging file for PPS prediction
    @param modelTaxonIdFilePath: file containing all leaf ncbi taxon ids that are modelled
    @param databaseFile: ncbi taxonomy file in the sqlite3 format

    @raise AssertionError: if one of the directories or files doesn't exist
    @raise Exception: if the PPS prediction returns with a non-zero status
    """
    for d in [workingDir, taWorkingDir, sampleSpecificDir,
              ppsTrainDataDir, outputDir, ppsInstallDir, ppsScripts, os.path.dirname(predictLogFileName)]:
        assert os.path.isdir(d), "Directory '%s' doesn't exist!" % d
    for f in [ppsConfigFilePath, databaseFile, modelTaxonIdFilePath]:
        assert os.path.isfile(f), "File '%s' doesn't exist!" % f

    # all directories that contain PPS training data
    trainDirList = [sampleSpecificDir]
    for d in os.listdir(ppsTrainDataDir):
        trainDirList.append(os.path.join(ppsTrainDataDir, d))

    # fasta file with all training sequences
    allTrainFastaFile = os.path.join(taWorkingDir, 'all_train_data.fna')
    out = csv.OutFileBuffer(allTrainFastaFile)
    seqIdToTruePred = {}

    # merge all training fasta files to one fasta file
    try:
        for d in trainDirList:
            dName = os.path.basename(d)
            for f in os.listdir(d):
                # the file name starts with the taxon id of the sequences it contains
                taxonId = int(os.path.basename(f).rsplit('.', 2)[0])
                for seqId, seq in fasta.fastaFileToDict(os.path.join(d, f)).iteritems():
                    mergedId = str(taxonId) + '|' + dName + '|' + seqId
                    if d == sampleSpecificDir:
                        # the label is appended here, it is parsed from the end of each id below
                        mergedId += '|label:' + str(taxonId)
                    out.writeText('>' + mergedId + '\n' + seq + '\n')
                    seqIdToTruePred[mergedId] = taxonId
    finally:
        out.close()

    # predict the merged file using the generated model
    if os.name != 'posix':
        print("Can't run PPS on a non-posix system!")
        return

    predictCmd = str(os.path.join(ppsScripts, 'predict.rb') + ' ' + allTrainFastaFile + ' ' + ppsConfigFilePath)
    logOut = open(predictLogFileName, 'w')
    try:
        predictProc = subprocess.Popen(predictCmd, shell=True, bufsize=-1, cwd=ppsInstallDir, stdout=logOut,
                                       stderr=subprocess.STDOUT)
        predictProc.wait()
    finally:
        # close the log file also when the process could not be started
        logOut.close()
    if predictProc.returncode != 0:
        raise Exception("PPS 'predict' training data returned with non-zero status: %s, cmd: %s" %
                        (predictProc.returncode, predictCmd))

    # read in predicted train data
    seqIdToPred = csv.predToDict(allTrainFastaFile + '.nox.fna.out')

    # read fasta file
    seqIdToBp = fasta.getSequenceToBpDict(allTrainFastaFile)

    # leaf taxonIds that are modelled
    modelLeafTaxonIds = set(map(int, csv.getColumnAsList(modelTaxonIdFilePath)))

    # all parents of the modelled leaf taxonIds
    taxonomyS = taxonomy_ncbi.TaxonomyNcbi(databaseFile, considerNoRank=True)
    notLeafTaxonIds = set()
    for leafTaxonId in modelLeafTaxonIds:
        notLeafTaxonIds.update(set(map(int, (taxonomyS.getParentsNcbidSet(leafTaxonId)))))
    taxonomyS.close()

    # get only sequences with true taxonId defined at leaf level that is modelled or lower
    seqIdToBp2 = {}
    seqIdToPred2 = {}
    seqIdToTruePred2 = {}
    seqIdToBpMisc = {}
    seqIdToPredMisc = {}
    seqIdToTruePredMisc = {}
    for seqId, bp in seqIdToBp.iteritems():
        label = int(str(str(seqId).rsplit('|', 1)[1]).split(':', 1)[1])
        if label not in notLeafTaxonIds:
            seqIdToBp2[seqId] = bp
            seqIdToPred2[seqId] = seqIdToPred[seqId]
            seqIdToTruePred2[seqId] = seqIdToTruePred[seqId]
        else:
            seqIdToBpMisc[seqId] = bp
            seqIdToPredMisc[seqId] = seqIdToPred[seqId]
            seqIdToTruePredMisc[seqId] = seqIdToTruePred[seqId]
    seqIdToBp = seqIdToBp2
    seqIdToPred = seqIdToPred2
    seqIdToTruePred = seqIdToTruePred2

    # accuracy for all, filter out sample specific data (whole length)
    sampleSpecificName = os.path.basename(sampleSpecificDir).strip()
    seqIdToBpNoSampleSpec = {}
    for seqId, bp in seqIdToBp.iteritems():
        if str(seqId).split('|', 2)[1].strip() != sampleSpecificName:
            seqIdToBpNoSampleSpec[seqId] = bp

    acc = accuracy.Accuracy(seqIdToBpNoSampleSpec, seqIdToPred, seqIdToTruePred, databaseFile)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_all.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    taxonomyA = acc.getTaxonomy()  # reused by the Accuracy objects created below
    acc.close(closeTaxonomy=False)

    # accuracy for (misc) nodes
    acc = accuracy.Accuracy(seqIdToBpMisc, seqIdToPredMisc, seqIdToTruePredMisc, taxonomyA)
    out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_misc.txt'))
    out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                       minFracClade=None, minFracPred=None, overview=True))
    out.close()
    acc.close(closeTaxonomy=False)

    # generate the confusion matrices (for the "for all" scenario)
    cm = confusion_matrix.ConfusionMatrix(seqIdToBp, seqIdToPred, seqIdToTruePred, databaseFile,
                                          taxonomy_ncbi.TAXONOMIC_RANKS[1:])
    for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
        cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_all'))
    taxonomyCM = cm.getTaxonomy()  # reused by the ConfusionMatrix objects created below
    cm.close(closeTaxonomy=False)

    # accuracy for individual directories (seq lengths)
    # (the sample specific fragments are among PPS sampled fasta)
    for d in trainDirList:
        dName = os.path.basename(d)
        dNameStripped = str(dName).strip()
        seqIdToBpSub = {}
        seqIdToPredSub = {}
        seqIdToTruePredSub = {}
        for seqId, bp in seqIdToBp.iteritems():
            if str(seqId).split('|', 2)[1].strip() == dNameStripped:
                seqIdToBpSub[seqId] = bp
                seqIdToPredSub[seqId] = seqIdToPred[seqId]
                seqIdToTruePredSub[seqId] = seqIdToTruePred[seqId]

        # accuracy
        acc = accuracy.Accuracy(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyA)
        out = csv.OutFileBuffer(os.path.join(outputDir, 'train_accuracy_' + dName + '.txt'))
        out.writeText(acc.getAccuracyPrint(taxonomy_ncbi.TAXONOMIC_RANKS[1:],
                                           minFracClade=None, minFracPred=None, overview=True))

        # confusion matrices
        cm = confusion_matrix.ConfusionMatrix(seqIdToBpSub, seqIdToPredSub, seqIdToTruePredSub, taxonomyCM,
                                              taxonomy_ncbi.TAXONOMIC_RANKS[1:])
        for rank in taxonomy_ncbi.TAXONOMIC_RANKS[1:]:
            cm.generateConfusionMatrix(rank, os.path.join(outputDir, 'train_accuracy_cmp_' + dName))
        cm.close(closeTaxonomy=False)

        out.close()
        acc.close(closeTaxonomy=False)
    taxonomyA.close()
    taxonomyCM.close()
def _appendSimulatedReads(readsFilePath, seqName, outBuffer):
    """
    Appends the content of a gzipped fastq file to a merged output buffer.

    Each '@read_' occurrence is extended by the sequence name and a newline is added after
    the content.

    @param readsFilePath: gzipped fastq file generated by pIRS
    @param seqName: name of the reference sequence the reads were generated from
    @param outBuffer: buffer of the merged output file
    """
    reads = gzip.open(readsFilePath, 'rb')
    try:
        content = reads.read()
    finally:
        reads.close()
    outBuffer.writeText(str(content.replace('@read_', str('@read_' + seqName + '_')) + '\n'))


def main():
    """
    Wraps pIRS read simulator to simulate Illumina paired end reads.

    For each sequence listed in the frequencies file, pIRS is run in the temporary directory
    (workingDir/tmp) and the generated reads are appended to the merged files 'reads_1.fq' and
    'reads_2.fq' in the working directory.

    Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print('runs only on posix systems')
        return

    # parse arguments
    parser = argparse.ArgumentParser(description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
                                     epilog='''''')

    parser.add_argument('-c', '--config', nargs=1, type=file, required=True,
                        help='configuration file of the simulator', metavar='configMetagenome.cfg',
                        dest='config')

    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
                        dest='p')

    args = parser.parse_args()
    config = Config(args.config[0], 'Sim')

    # the option accepts several values, all of them are passed to pIRS
    pirsParam = ''
    if args.p:
        pirsParam = ' '.join(args.p)

    # reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    # check whether the pIRS optional parameters doesn`t contain those predefined elsewhere (e.g. in the config)
    for reservedOption in ('-m', '-v', '-l', '-x', '-i', '-o'):
        if reservedOption in pirsParam:
            print('pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ')
            return

    # check working directory, create temporary directory
    tmpDir = os.path.join(workingDir, 'tmp')
    if not os.path.isdir(workingDir):
        print(str('The working directory does not exists, create it! (' + str(workingDir) + ')'))
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq'))
    try:
        for seqName in seqNameToFreq:
            seq = seqNameToSeq[seqName]
            coverage = float(seqNameToFreq[seqName][0]) * coverageFrequencyMultiplier

            # pIRS gets each reference sequence as a separate fasta file
            fastaFile = os.path.join(tmpDir, str(seqName + '.fna'))
            outBuffer = OutFileBuffer(fastaFile)
            outBuffer.writeText(str('>' + seqName + '\n' + seq + '\n'))
            outBuffer.close()

            cmd = str(os.path.join(pirsInstallDir, 'pirs') + ' simulate -i ' + fastaFile + ' -x ' + str(coverage) +
                      ' -m ' + str(insertSizeMean) + ' -v ' + str(insertSizeSd) + ' -l ' + str(readLength) +
                      ' -o ' + seqName + ' ' + pirsParam)
            proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)
            proc.wait()
            if proc.returncode != 0:
                sys.stderr.write(str('command failed: ' + cmd + '\n'))

            # append generated reads to the merged files
            readsPrefix = str(seqName + '_' + str(readLength) + '_' + str(insertSizeMean))
            _appendSimulatedReads(os.path.join(tmpDir, str(readsPrefix + '_1.fq.gz')), seqName, outReads1Merged)
            _appendSimulatedReads(os.path.join(tmpDir, str(readsPrefix + '_2.fq.gz')), seqName, outReads2Merged)
    finally:
        outReads1Merged.close()
        outReads2Merged.close()