def test1():
    """Print the taxonomic path of the lowest common ancestor of the taxa named in a sample string.

    Developer smoke test with a hard-coded configuration path. The NCBI taxon
    ids are the numbers enclosed in parentheses in the sample sentence ``s``.
    Each id is printed, followed by a separator line, and then one line
    ``ncbid rank name`` for every rank of ``taxonomicRanks`` that is present in
    the result of ``Taxonomy.getPathFromLowestCommonAncestorToRoot``. Output
    stops at the first rank that is missing from that result.
    """
    config = Config(open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')), 'pPPS')
    databaseFile = os.path.normpath(config.get('databaseFile'))
    taxonomicRanks = config.get('taxonomicRanks').split(',')
    t = Taxonomy(databaseFile, taxonomicRanks)
    try:
        s = 'Assignment of query to the lowest common ancestor of Bacteroides thetaiotaomicron (226186), Porphyromonas gingivalis (242619) and Parabacteroides distasonis (435591).'
        # Alternative inputs (taxa without ids, two taxa only, many taxa):
        #s = 'Assignment of query to the lowest common ancestor of Bacteroides thetaiotaomicron , Porphyromonas gingivalis (242619) and Parabacteroides distasonis (435591).'
        #s = 'Assignment of query to the lowest common ancestor of Bacteroides thetaiotaomicron , Porphyromonas gingivalis and Parabacteroides distasonis (435591).'
        #s = 'Assignment of query to the lowest common ancestor of Bacteroides thetaiotaomicron (226186) and Parabacteroides distasonis (435591).'
        #s = 'Assignment of query to the lowest common ancestor of Halobacterium sp. (64091), Thermococcus kodakarensis (69014), Pyrococcus horikoshii (70601), Methanothermobacter thermautotrophicus (187420), Methanopyrus kandleri AV19 (190192), Methanosarcina mazei (192952), Archaeoglobus fulgidus (224325), Methanocaldococcus jannaschii (243232), Methanococcoides burtonii (259564), Methanococcus maripaludis S2 (267377), Haloarcula marismortui (272569), Methanospirillum hungatei (323259), Methanosphaera stadtmanae (339860), Natronomonas pharaonis (348780), Methanosaeta thermophila PT (349307), Candidatus methanoarchaeon RC1 (351160), Haloquadratum walsbyi (362976), Methanoculleus marisnigri JR1 (368407), Methanocorpusculum labreanum (410358) and Methanobrevibacter smithii (420247).'
        #s = 'Assignment of query to the lowest common ancestor of Thermococcus kodakarensis (69014) and Pyrococcus horikoshii (70601).'

        # A single capturing group makes findall return just the digits, so no
        # second substitution pass (and no names shadowing list/str) is needed.
        ncbidList = re.findall(r'\(([0-9]+)\)', s)
        for ncbid in ncbidList:
            print(ncbid)
        print('-------------------------')

        pathDict = t.getPathFromLowestCommonAncestorToRoot(ncbidList)
        #pathDict = t.getPathToRoot(170187)
        for rank in taxonomicRanks:
            if rank not in pathDict:
                break
            node = pathDict[rank]
            print('%s %s %s' % (node.ncbid, node.rank, node.name))
    finally:
        # Release the taxonomy handle as the other Taxonomy users in this file do.
        t.close()
def _test():
    """Run replaceIdsWithNames on a fixed set of developer sample files.

    The substitution pattern is taken from the 'outputFileContigSubPattern'
    entry of the hard-coded configuration file; the three file paths are
    hard-coded as well.
    """
    # Earlier ad-hoc checks of Sequence, kept for reference:
    #s = Sequence(1, 'seqName','AATTGGCCC\n\rAAA\n')
    #print 'sequence name: ', s.seqName
    #print 'sequence:', s.getSeq()
    #print 'seqBp:', s.seqBp, '({0})'.format(len(s.seqCompressed))
    cfgHandle = open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg'))
    settings = Config(cfgHandle, 'pPPS')
    subPattern = settings.get('outputFileContigSubPattern')

    # Name-to-ids mapping, file to rewrite, and destination, in call order.
    idMapPath, inputPath, resultPath = [
        os.path.normpath(rawPath) for rawPath in (
            'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.cToIds',
            'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.ids.out',
            'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.pOUT',
        )
    ]
    replaceIdsWithNames(subPattern, idMapPath, inputPath, resultPath)
def test3():
    """Create a new OTU database entry below Firmicutes using the cow-rumen test configuration.

    Opens the taxonomy named by the 'databaseFile' entry of the hard-coded
    configuration, calls ``createNewOtuDBEntry`` once and closes the taxonomy.
    """
    cfgPath = os.path.normpath('/Users/ivan/Documents/work/binning/tests/CowRumen/03/config.cfg')
    settings = Config(open(cfgPath), 'pPPS')
    dbPath = os.path.normpath(settings.get('databaseFile'))
    ranks = settings.get('taxonomicRanks').split(',')

    tax = Taxonomy(dbPath, ranks)
    # 1239 is the NCBI taxon id of Firmicutes.
    tax.createNewOtuDBEntry(1239, 'test_sample', 'species')
    tax.close()
def _test():
    """Run replaceIdsWithNames on a fixed set of developer sample files.

    The substitution pattern is taken from the 'outputFileContigSubPattern'
    entry of the hard-coded configuration file; the three file paths are
    hard-coded as well.
    """
    # Earlier ad-hoc checks of Sequence, kept for reference:
    #s = Sequence(1, 'seqName','AATTGGCCC\n\rAAA\n')
    #print 'sequence name: ', s.seqName
    #print 'sequence:', s.getSeq()
    #print 'seqBp:', s.seqBp, '({0})'.format(len(s.seqCompressed))
    cfgHandle = open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg'))
    settings = Config(cfgHandle, 'pPPS')
    subPattern = settings.get('outputFileContigSubPattern')

    # Name-to-ids mapping, file to rewrite, and destination, in call order.
    idMapPath, inputPath, resultPath = [
        os.path.normpath(rawPath) for rawPath in (
            'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.cToIds',
            'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.ids.out',
            'D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\wdir02\\inputTW.fas.pOUT',
        )
    ]
    replaceIdsWithNames(subPattern, idMapPath, inputPath, resultPath)
def newTaxonId():
    """Command-line entry point that adds a new taxon to the taxonomy database.

    Reads four required options from ``sys.argv``: ``-c/--config``
    (configuration file), ``-p/--parent`` (parent NCBI taxon id),
    ``-r/--rank`` and ``-n/--name_suffix``. The database is
    ``ncbitax_sqlite.db`` inside the directory named by the ``databaseFile``
    entry of the configuration; it is modified by this call.

    Returns:
        The value returned by ``Taxonomy.createNewOtuDBEntry`` on success, or
        ``None`` when an argument fails validation (a message is printed).
    """
    parser = argparse.ArgumentParser(
        description='Gets a new taxon ID',
        epilog='Note that the use of this functionality alter the taxonomy file')
    parser.add_argument('-c', '--config', nargs=1, type=file, required=True,
                        help='Configuration file, the taxonomy file in this configuration file will be changed',
                        metavar='config.cfg', dest='c')
    parser.add_argument('-p', '--parent', nargs=1, required=True,
                        help='The parent NCBI taxon id of the new taxon id.',
                        metavar='parent', dest='p')
    parser.add_argument('-r', '--rank', nargs=1, required=True,
                        help='Rank of the new taxon ID',
                        metavar='rank', dest='r')
    parser.add_argument('-n', '--name_suffix', nargs=1, required=True,
                        help='Scientific name suffix of the new taxon ID',
                        metavar='name', dest='n')
    args = parser.parse_args()

    # Defensive check: with nargs=1 and required=True argparse already yields
    # one-element lists. print_help() writes the help itself and returns None,
    # so it must not be printed again, and processing must stop here.
    if len(args.c) != 1 or len(args.p) != 1 or len(args.r) != 1 or len(args.n) != 1:
        parser.print_help()
        return

    # Only the path is needed; the file is re-opened below, so release the
    # handle argparse opened for us.
    configHandle = args.c[0]
    configFile = configHandle.name
    configHandle.close()

    try:
        parent = int(args.p[0])
    except ValueError:
        print("The parent taxonomic id must be a number")
        return

    rank = args.r[0]
    name = args.n[0]
    if len(name) == 0:
        print("The scientific name cannot be empty!")
        return

    config = Config(open(os.path.normpath(configFile)), 'PhyloPythiaS_Plus')
    databaseFile = os.path.join(os.path.normpath(config.get('databaseFile')), 'ncbitax_sqlite.db')
    print(databaseFile)

    # The first entry of TAXONOMIC_RANKS is not accepted as a rank for a new taxon.
    taxonomicRanks = taxonomy_ncbi.TAXONOMIC_RANKS[1:]
    if rank not in taxonomicRanks:
        # Wrapped in a 1-tuple so the message also formats when the ranks are a tuple.
        print("Allowed ranks are only: %s" % (taxonomicRanks,))
        return

    t = Taxonomy(databaseFile, taxonomicRanks)
    try:
        newId = t.createNewOtuDBEntry(parent, name, rank)
        print('New taxonomic id: "%s"; with name suffix "%s"; at rank "%s"; '
              'as a descendant of "%s" has been created in "%s"' % (newId, name, rank, parent, databaseFile))
    finally:
        # Close the database even when the insert fails.
        t.close()
    return newId
def test2():
    """Print the longest common taxonomic path of several hard-coded taxa.

    Looks up the path to the root for each taxon id below, passes the list of
    paths to ``Taxonomy.getLongestCommonPathFromMultipleAssignments`` and
    prints every key of the result together with its value.
    """
    cfg = Config(open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')), 'pPPS')
    dbFile = os.path.normpath(cfg.get('databaseFile'))
    ranks = cfg.get('taxonomicRanks').split(',')
    t = Taxonomy(dbFile, ranks)

    queryNcbids = (
        33958,  # Lactobacillaceae
        91061,  # Bacilli
        2,  # Bacteria
        # 1385 (Bacillales) disabled
        1578,  # Lactobacilus
        # 31979 (Clostridiaceae) disabled
        2,  # Bacteria
    )
    pathsToRoot = [t.getPathToRoot(ncbid) for ncbid in queryNcbids]

    commonPath = t.getLongestCommonPathFromMultipleAssignments(pathsToRoot)
    for key in commonPath:
        print('%s %s' % (key, commonPath[key]))
def test():
    """Reconstruct OTUs for the cow-rumen test data set.

    Builds an MGCluster from the hard-coded configuration, working directory
    and file prefix, runs ``preprocess`` with only ``readData`` enabled and
    then calls ``reconstructOTU``.
    """
    cfgHandle = open('/Users/ivan/Documents/work/binning/tests/CowRumen/03/config.cfg')
    settings = Config(cfgHandle, 'pPPS')
    workDir = '/Users/ivan/Documents/work/binning/tests/CowRumen/03/working/mgWorking'
    prefix = '/Users/ivan/Documents/work/binning/tests/CowRumen/03/working/cow_rumen_fragmented_velvet_assembly_scaffolds.fas.ids'

    cluster = MGCluster(settings, workDir, prefix)
    # Alignment, distance matrix and clustering are switched off; only readData is on.
    stages = dict(align=False, dm=False, cluster=False, readData=True)
    cluster.preprocess(**stages)
    #cluster.buildSpecificPred()
    cluster.reconstructOTU()
def main():
    """
    Wraps pIRS read simulator to simulate Illumina paired end reads.

    Command line: ``-c/--config`` (required) names the configuration file,
    which is read through ``Config(<file>, 'Sim')``; ``-p/--pIRS-param``
    optionally passes extra options through to pIRS.

    For every sequence name in the frequencies file the reference sequence is
    written to ``<workingDir>/tmp/<name>.fna``, pIRS is run on it inside the
    tmp directory, and the two gzipped read files it produces are appended to
    ``<workingDir>/reads_1.fq`` and ``<workingDir>/reads_2.fq`` with every
    ``@read_`` replaced by ``@read_<name>_``.

    Returns None; problems detected before the simulation starts are printed
    and the function returns early.

    Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print('runs only on posix systems')
        return

    # parse arguments
    parser = argparse.ArgumentParser(
        description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
        epilog='''''')
    parser.add_argument('-c', '--config', nargs=1, type=file, required=True,
                        help='configuration file of the simulator',
                        metavar='configMetagenome.cfg', dest='config')
    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
                        dest='p')
    args = parser.parse_args()
    config = Config(args.config[0], 'Sim')

    # nargs='+' may deliver several tokens; keep all of them instead of
    # silently dropping everything after the first one.
    pirsParam = ' '.join(args.p) if args.p else ''

    # reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    # The pass-through options must not contain options that are set from the
    # configuration (-m -v -l -x) or generated here (-i -o). This is a plain
    # substring test, exactly as before.
    reservedOptions = ('-m', '-v', '-l', '-x', '-i', '-o')
    if any(option in pirsParam for option in reservedOptions):
        print('pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ')
        return

    # check working directory, create temporary directory
    tmpDir = os.path.join(workingDir, 'tmp')
    if not os.path.isdir(workingDir):
        print('The working directory does not exists, create it! \n(' + str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq'))
    try:
        for seqName in seqNameToFreq:
            seq = seqNameToSeq[seqName]
            coverage = float(seqNameToFreq[seqName][0]) * coverageFrequencyMultiplier

            # pIRS takes its reference from a file, so give each sequence its own FASTA file.
            fastaFile = os.path.join(tmpDir, seqName + '.fna')
            outBuffer = OutFileBuffer(fastaFile)
            outBuffer.writeText('>' + seqName + '\n' + seq + '\n')
            outBuffer.close()

            # NOTE(review): the command is a shell string because pirsParam is
            # free-form text; paths or names containing shell metacharacters
            # are not quoted.
            cmd = (os.path.join(pirsInstallDir, 'pirs') + ' simulate -i ' + fastaFile
                   + ' -x ' + str(coverage)
                   + ' -m ' + str(insertSizeMean)
                   + ' -v ' + str(insertSizeSd)
                   + ' -l ' + str(readLength)
                   + ' -o ' + seqName + ' ' + pirsParam)
            #print cmd
            proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)
            proc.wait()
            if proc.returncode != 0:
                # Reported only; the read files are still looked up below, as before.
                sys.stderr.write('command failed: ' + cmd + '\n')

            # append generated reads to the merged files
            readsPrefix = seqName + '_' + str(readLength) + '_' + str(insertSizeMean)
            for mate, merged in (('1', outReads1Merged), ('2', outReads2Merged)):
                readsPath = os.path.join(tmpDir, readsPrefix + '_' + mate + '.fq.gz')
                reads = gzip.open(readsPath, 'rb')
                try:
                    content = reads.read()
                finally:
                    reads.close()
                # Insert the sequence name after each '@read_' so reads from different sequences differ.
                merged.writeText(content.replace('@read_', '@read_' + seqName + '_') + '\n')
    finally:
        # Close the merged outputs even when a sequence fails.
        outReads1Merged.close()
        outReads2Merged.close()
def main():
    """
    Wraps pIRS read simulator to simulate Illumina paired end reads.

    Command line: ``-c/--config`` (required) names the configuration file,
    which is read through ``Config(<file>, 'Sim')``; ``-p/--pIRS-param``
    optionally passes extra options through to pIRS.

    For every sequence name in the frequencies file the reference sequence is
    written to ``<workingDir>/tmp/<name>.fna``, pIRS is run on it inside the
    tmp directory, and the two gzipped read files it produces are appended to
    ``<workingDir>/reads_1.fq`` and ``<workingDir>/reads_2.fq`` with every
    ``@read_`` replaced by ``@read_<name>_``.

    Returns None; problems detected before the simulation starts are printed
    and the function returns early.

    Sample config: /Users/ivan/Documents/work/binning/data/V35/simMetagenome/configMetagenome01.cfg
    """
    if os.name != 'posix':
        print('runs only on posix systems')
        return

    # parse arguments
    parser = argparse.ArgumentParser(
        description='''A simple Metagenome Illumina read simulator that wraps pIRS''',
        epilog='''''')
    parser.add_argument('-c', '--config', nargs=1, type=file, required=True,
                        help='configuration file of the simulator',
                        metavar='configMetagenome.cfg', dest='config')
    parser.add_argument('-p', '--pIRS-param', action='store', nargs='+',
                        help='parameters of the pIRS simulator, e.g. "-Q 64 -E 1"',
                        dest='p')
    args = parser.parse_args()
    config = Config(args.config[0], 'Sim')

    # nargs='+' may deliver several tokens; keep all of them instead of
    # silently dropping everything after the first one.
    pirsParam = ' '.join(args.p) if args.p else ''

    # reads configuration
    workingDir = config.get('workingDir')
    referenceSeq = config.get('referenceSeq')
    frequenciesInfo = config.get('frequenciesInfo')
    coverageFrequencyMultiplier = float(config.get('coverageFrequencyMultiplier'))
    pirsInstallDir = config.get('pirsInstallDir')
    insertSizeMean = int(config.get('insertSizeMean'))
    insertSizeSd = int(config.get('insertSizeSd'))
    readLength = int(config.get('readLength'))

    # The pass-through options must not contain options that are set from the
    # configuration (-m -v -l -x) or generated here (-i -o). This is a plain
    # substring test, exactly as before.
    reservedOptions = ('-m', '-v', '-l', '-x', '-i', '-o')
    if any(option in pirsParam for option in reservedOptions):
        print('pIRS parameters -m -v -l (-x) must be set in the configuration file, parameters -i -o cannot be set ')
        return

    # check working directory, create temporary directory
    tmpDir = os.path.join(workingDir, 'tmp')
    if not os.path.isdir(workingDir):
        print('The working directory does not exists, create it! \n(' + str(workingDir) + ')')
        return
    if not os.path.isdir(tmpDir):
        os.mkdir(tmpDir)

    seqNameToSeq = fastaFileToDict(referenceSeq)
    seqNameToFreq = getMapping(frequenciesInfo, 0, 1, sep='\t', comment='#')

    outReads1Merged = OutFileBuffer(os.path.join(workingDir, 'reads_1.fq'))
    outReads2Merged = OutFileBuffer(os.path.join(workingDir, 'reads_2.fq'))
    try:
        for seqName in seqNameToFreq:
            seq = seqNameToSeq[seqName]
            coverage = float(seqNameToFreq[seqName][0]) * coverageFrequencyMultiplier

            # pIRS takes its reference from a file, so give each sequence its own FASTA file.
            fastaFile = os.path.join(tmpDir, seqName + '.fna')
            outBuffer = OutFileBuffer(fastaFile)
            outBuffer.writeText('>' + seqName + '\n' + seq + '\n')
            outBuffer.close()

            # NOTE(review): the command is a shell string because pirsParam is
            # free-form text; paths or names containing shell metacharacters
            # are not quoted.
            cmd = (os.path.join(pirsInstallDir, 'pirs') + ' simulate -i ' + fastaFile
                   + ' -x ' + str(coverage)
                   + ' -m ' + str(insertSizeMean)
                   + ' -v ' + str(insertSizeSd)
                   + ' -l ' + str(readLength)
                   + ' -o ' + seqName + ' ' + pirsParam)
            #print cmd
            proc = subprocess.Popen(cmd, shell=True, bufsize=-1, cwd=tmpDir)
            proc.wait()
            if proc.returncode != 0:
                # Reported only; the read files are still looked up below, as before.
                sys.stderr.write('command failed: ' + cmd + '\n')

            # append generated reads to the merged files
            readsPrefix = seqName + '_' + str(readLength) + '_' + str(insertSizeMean)
            for mate, merged in (('1', outReads1Merged), ('2', outReads2Merged)):
                readsPath = os.path.join(tmpDir, readsPrefix + '_' + mate + '.fq.gz')
                reads = gzip.open(readsPath, 'rb')
                try:
                    content = reads.read()
                finally:
                    reads.close()
                # Insert the sequence name after each '@read_' so reads from different sequences differ.
                merged.writeText(content.replace('@read_', '@read_' + seqName + '_') + '\n')
    finally:
        # Close the merged outputs even when a sequence fails.
        outReads1Merged.close()
        outReads2Merged.close()
entry += str('\t' + taxPathDict[rank].name) else: entry += '\t' f.write(entry) except Exception: print "Cannot create a file or write to it:", outFile raise finally: f.close() if __name__ == "__main__": #test 2 #ppsOutFile = 'D:\A_Phylo\A_Metagenomic\data\humanGut\PPS_contigs.txt' #outPPOutFile = 'D:\A_Phylo\A_Metagenomic\data\humanGut\PPS_PP_contigs.txt' #ppsOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids04.lP' ppsOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids05.lP' #outPPOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids04.lP.PP.out' outPPOutFile = 'C:/Documents and Settings/Administrator/Desktop/temp/johdroPred/inputTW.fas.ids05.lP.PP.out' config = Config(open(os.path.normpath('D:\\A_Phylo\\A_Metagenomic\\pPPS\\workspace\\pPPS\\config01.cfg')), 'pPPS') databaseFile = os.path.normpath(config.get('databaseFile')) taxonomicRanks = config.get('taxonomicRanks').split(',') taxonomy = Taxonomy(databaseFile, taxonomicRanks) ppsOutToPPOut(ppsOutFile, outPPOutFile, taxonomicRanks, taxonomy) #test 1 #scafContigFile = 'D:/A_Phylo/A_Metagenomic/reindeer/data/scaffolds-contigs.tab' #scafPPSOutFile = 'D:/A_Phylo/A_Metagenomic/reindeer/predictions/pps04/scaffoldsOut/SRM_Scaffolds_namesOnly.fna.PP.out' #contigPPSOutFile = 'D:/A_Phylo/A_Metagenomic/reindeer/predictions/pps04/scaffoldsOut/SRM_Scaffolds_namesOnly.fna.PP.out_contigs' #scafToContigOutput(scafContigFile, scafPPSOutFile, contigPPSOutFile)
def test(ncbid):
    """Print the result of ``DBData.getGenomeWgsCount`` for ``ncbid`` with a threshold of 3.

    Uses the hard-coded configuration file and NCBI processing directory.

    Args:
        ncbid: value passed through unchanged as the first argument of
            ``getGenomeWgsCount``.
    """
    cfgPath = os.path.normpath('D://A_Phylo//A_Metagenomic//pPPS//workspace//pPPS//config01.cfg')
    settings = Config(open(cfgPath), 'pPPS')
    dbFile = os.path.normpath(settings.get('databaseFile'))
    procDir = os.path.normpath('D://A_Phylo//A_Metagenomic//pPPS//workspace//pPPS//wdir02//ncbiProcDir')

    data = DBData(procDir, dbFile)
    cutoff = 3
    print(data.getGenomeWgsCount(ncbid, cutoff))

    # Previous variant that called a module-level getGenomeWgsCount, kept for reference:
    #config = Config(open(os.path.normpath('//AM//metagenomic//work//projects//pPPS//tests//TW//TW01//config.cfg')), 'pPPS')
    #threshold = 3
    #dir = 'D://A_Phylo//A_Metagenomic//pPPS//workspace//pPPS//genomes'
    #dir = '//AM//metagenomic//work//projects//pPPS//tests//TW//TW01//ncbiProcDir'
    #databaseFile = os.path.normpath(config.get('databaseFile'))
    #taxonomicRanks = config.get('taxonomicRanks').split(',')
    #count = getGenomeWgsCount(ncbid, threshold, dir, databaseFile, taxonomicRanks)
    #print count, 'genomes/wgs for ncbid:', ncbid


#if __name__ == "__main__":
    #test(122)
    #haveData(126)
    #haveData(84999) #Coriobacteriales
    #haveData(171549) #Bacteroidales
    #haveData(815) #Bacteriodaceae
    #haveData(171551) #Porphyromonadaceae
    #haveData(171552) #Prevotellaceae
    #haveData(171550) #Rikenellaceae
    ###test(976) #Bacteroidetes
    #haveData(200666) #Sphingobacteriales
    #haveData(768503) #Cytophagia
    #haveData(117743) #Flavobacteria
    #haveData(475963) #Caldilineales
    #haveData(292625) #Anaerolineae
    #haveData(200795) #Chloroflexi
    #haveData(204431) #Fibrobacteraceae (59374, 834)
    #haveData(186803) #Lachnospiraceae
    #haveData(541000) #Ruminococcaceae
    #haveData(186802) #Clostridiales
    #haveData(31979) #Clostridiaceae
    #haveData(186806) #Eubacteriaceae
    #haveData(186807) #Peptococcaceae
    #haveData(31977) #Veillonellaceae
    #haveData(186801) #Clostridia
    #haveData(128827) #Erysipelotrichaceae
    #haveData(1239) #Firmicutes
    #haveData(91061) #Bacilli
    #haveData(255528) #Victivallaceae (340101)
    #haveData(126) #Planctomycetaceae
    #haveData(481) #Neisseriaceae
    #haveData(213121) #Desulfobulbaceae (577650, 177439, 589865)
    #haveData(213421) #Desulfuromonaceae
    #haveData(69541) #Desulfuromonadales
    #haveData(72294) #Campylobacteraceae
    #haveData(1224) #Proteobacteria
    #haveData(28211) #Alphaproteobacteria
    #haveData(1236) #Gammaproteobacteria
    #haveData(137) #Spirochaetaceae
    #haveData(186333) #Anaeroplasmataceae
    #haveData(186332) #Anaeroplasmatales
    #haveData(31969) #Mollicutes
    #haveData(278082) #Victivallales
    #haveData(256845) #Lentisphaerae