Code example #1
  def run(self, configFile, otu, threads):
    rc = ReadConfig()
    projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = False)

    ggDB = self.ggDB.replace('##', str(otu), 1)
    print 'Mapping reads to the GreenGenes DB at: ' + ggDB + '\n'

    if not os.path.exists(ggDB + '.amb'):
      print 'Indexing GreenGenes DB:'
      os.system('bwa index -a is ' + ggDB)
      print ''
    else:
      print 'GreenGenes DB is already indexed.\n'

    for sample in sampleParams:
      print 'Mapping reads in sample: ' + sample

      pairs = sampleParams[sample]['pairs']
      singles = sampleParams[sample]['singles']

      # align and map each pair
      for i in xrange(0, len(pairs), 2):
        pair1 = pairs[i]
        pair2 = pairs[i+1]
        bamPrefix = projectParams['output_dir'] + ntpath.basename(pair1)
        mapPair(ggDB, pair1, pair2, bamPrefix, threads)

      # align and map each single-ended read file
      for i in xrange(0, len(singles)):
        bamPrefix = projectParams['output_dir'] + ntpath.basename(singles[i])
        mapSingle(ggDB, singles[i], bamPrefix, threads)
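mapPair and mapSingle are not shown in this excerpt. Below is a minimal sketch of what such helpers might look like, assuming they simply shell out to bwa mem and samtools in the same way the surrounding code shells out to bwa index; the helper bodies and exact flags are illustrative, not CommunityM's actual implementation.

import os

def mapPair(ggDB, pair1, pair2, bamPrefix, threads):
    # hypothetical sketch: align a read pair with BWA and produce a sorted, indexed BAM
    os.system('bwa mem -t ' + str(threads) + ' ' + ggDB + ' ' + pair1 + ' ' + pair2 +
              ' | samtools view -b - | samtools sort -o ' + bamPrefix + '.bam -')
    os.system('samtools index ' + bamPrefix + '.bam')

def mapSingle(ggDB, single, bamPrefix, threads):
    # hypothetical sketch: same idea for a single-ended read file
    os.system('bwa mem -t ' + str(threads) + ' ' + ggDB + ' ' + single +
              ' | samtools view -b - | samtools sort -o ' + bamPrefix + '.bam -')
    os.system('samtools index ' + bamPrefix + '.bam')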
Code example #2
    def run(self, configFile, contigFile, assemblies16S, binDir, threads):
        rc = ReadConfig()
        projectParams, sampleParams = rc.readConfig(configFile,
                                                    outputDirExists=True)

        # check if links directory already exists
        linkFile = os.path.join(projectParams['output_dir'], 'linksToBin')
        if not os.path.exists(linkFile):
            os.makedirs(linkFile)
        else:
            rtn = raw_input('Remove previously identified links (Y or N)? ')
            if rtn.lower() == 'y' or rtn.lower() == 'yes':
                files = os.listdir(linkFile)
                for f in files:
                    os.remove(os.path.join(linkFile, f))
            else:
                sys.exit()

        outputDir = os.path.join(projectParams['output_dir'], 'linksToBin')

        # create combined file with reference sequences and assembled 16S sequences
        print 'Combining unbinned reference sequences with de novo assembled 16S sequences.'
        combinedFile = os.path.join(outputDir, 'scaffolds.combined.fasta')
        os.system('cat ' + contigFile + ' ' + assemblies16S + ' > ' +
                  combinedFile)

        # create combined 16S read files
        print 'Combining 16S/18S reads from all samples.'
        reads1 = ''
        reads2 = ''
        for sample in sampleParams:
            extractedPrefix = os.path.join(projectParams['output_dir'],
                                           'extracted', sample)
            pairs = sampleParams[sample]['pairs']
            for i in xrange(0, len(pairs), 2):
                pair1Base = ntpath.basename(pairs[i])
                pair2Base = ntpath.basename(pairs[i + 1])

                classificationFile1 = extractedPrefix + '.' + pair1Base[
                    0:pair1Base.rfind('.')] + '.union.SSU.fasta'
                classificationFile2 = extractedPrefix + '.' + pair2Base[
                    0:pair2Base.rfind('.')] + '.union.SSU.fasta'

                reads1 += classificationFile1 + ' '
                reads2 += classificationFile2 + ' '

        os.system('cat ' + reads1 + ' > ' +
                  os.path.join(outputDir, 'ssu.1.fasta'))
        os.system('cat ' + reads2 + ' > ' +
                  os.path.join(outputDir, 'ssu.2.fasta'))

        # link 16S/18S read pairs to bins using the combined reference sequences
        self.link16S(combinedFile, os.path.join(outputDir, 'ssu.1.fasta'),
                     os.path.join(outputDir, 'ssu.2.fasta'), binDir, threads,
                     outputDir)
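The os.system('cat ...') calls above concatenate FASTA files through the shell. A small pure-Python alternative is sketched below; concat_files is a hypothetical helper, not part of CommunityM, shown only to illustrate a shell-free way of building the same combined files.

import shutil

def concat_files(inputFiles, outputFile):
    # hypothetical helper: concatenate files without invoking a shell
    with open(outputFile, 'wb') as fout:
        for inputFile in inputFiles:
            with open(inputFile, 'rb') as fin:
                shutil.copyfileobj(fin, fout)

# e.g. concat_files([contigFile, assemblies16S], combinedFile) instead of the
# corresponding os.system('cat ...') call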
Code example #3
File: extractHMM_LSU.py Project: wwood/CommunityM
    def run(self, configFile, threads, evalue, bQuiet):
        rc = ReadConfig()
        projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True)
        os.makedirs(projectParams['output_dir'] + 'extracted_lsu')

        self.bQuiet = bQuiet

        for sample in sampleParams:
            pairs = sampleParams[sample]['pairs']
            singles = sampleParams[sample]['singles']

            # identify 23S/28S (LSU) sequences in paired-end reads
            self.processPairs(pairs, threads, evalue, projectParams['output_dir'], sample)

            # identify 23S/28S (LSU) sequences in single-end reads
            self.processSingles(singles, threads, evalue, projectParams['output_dir'], sample)
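Every example here relies on ReadConfig.readConfig returning projectParams (with at least an 'output_dir' key) and sampleParams (one entry per sample with 'pairs' and 'singles' lists, where paired files are stored flat and consumed two at a time). A minimal stand-in showing the expected shapes; the values are made up and the real format is defined by CommunityM's config files.

# illustrative only: the shapes readConfig() is expected to return
projectParams = {'output_dir': '/path/to/project/output/'}

sampleParams = {
    'sampleA': {
        'pairs': ['sampleA.1.fastq', 'sampleA.2.fastq'],    # consumed as (pairs[i], pairs[i+1])
        'singles': ['sampleA.singletons.fastq'],
    },
}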
Code example #4
    def run(self, configFile, contigFile, assemblies16S, binDir, threads):
        rc = ReadConfig()
        projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True)

        # check if links directory already exists
        linkFile = os.path.join(projectParams['output_dir'], 'linksToBin')
        if not os.path.exists(linkFile):
            os.makedirs(linkFile)
        else:
            rtn = raw_input('Remove previously identified links (Y or N)? ')
            if rtn.lower() == 'y' or rtn.lower() == 'yes':
                files = os.listdir(linkFile)
                for f in files:
                    os.remove(os.path.join(linkFile, f))
            else:
                sys.exit()

        outputDir = os.path.join(projectParams['output_dir'], 'linksToBin')

        # create combined file with reference sequences and assembled 16S sequences
        print 'Combining unbinned reference sequences with de novo assembled 16S sequences.'
        combinedFile = os.path.join(outputDir, 'scaffolds.combined.fasta')
        os.system('cat ' + contigFile + ' ' + assemblies16S + ' > ' + combinedFile)

        # create combined 16S read files
        print 'Combining 16S/18S reads from all samples.'
        reads1 = ''
        reads2 = ''
        for sample in sampleParams:
            extractedPrefix = os.path.join(projectParams['output_dir'], 'extracted', sample)
            pairs = sampleParams[sample]['pairs']
            for i in xrange(0, len(pairs), 2):
                pair1Base = ntpath.basename(pairs[i])
                pair2Base = ntpath.basename(pairs[i+1])

                classificationFile1 = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.union.SSU.fasta'
                classificationFile2 = extractedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.union.SSU.fasta'

                reads1 += classificationFile1 + ' '
                reads2 += classificationFile2 + ' '

        os.system('cat ' + reads1 + ' > ' + os.path.join(outputDir, 'ssu.1.fasta'))
        os.system('cat ' + reads2 + ' > ' + os.path.join(outputDir, 'ssu.2.fasta'))

        # link 16S/18S read pairs to bins using the combined reference sequences
        self.link16S(combinedFile, os.path.join(outputDir, 'ssu.1.fasta'), os.path.join(outputDir, 'ssu.2.fasta'), binDir, threads, outputDir)
Code example #5
  def run(self, configFile, mappingQual, minLength):
    self.mappingQualityThreshold = mappingQual
    self.minLength = minLength

    rc = ReadConfig()
    projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True)

    for sample in sampleParams:
      outputDir = projectParams['output_dir']
      prefix = outputDir + sample
      pairs = sampleParams[sample]['pairs']
      singles = sampleParams[sample]['singles']

      # identify 16S sequences in paired-end reads
      self.processPairs(pairs, outputDir, prefix)

      # identify 16S sequences in single-end reads
      self.processSingles(singles, outputDir, prefix)
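processPairs and processSingles are not shown in this excerpt; given the mappingQual and minLength parameters they presumably filter previously mapped reads. The sketch below shows that kind of filter with pysam, under the assumption that the inputs are BAM files produced by the BWA mapping step; none of these names come from CommunityM.

import pysam

def filterBam(bamFile, outputFile, mappingQualityThreshold, minLength):
    # hypothetical sketch: keep mapped reads that meet quality and aligned-length thresholds
    bam = pysam.AlignmentFile(bamFile, 'rb')
    out = pysam.AlignmentFile(outputFile, 'wb', template=bam)
    for read in bam:
        if read.is_unmapped:
            continue
        if read.mapping_quality < mappingQualityThreshold:
            continue
        if read.query_alignment_length < minLength:
            continue
        out.write(read)
    out.close()
    bam.close()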
Code example #6
    def run(self, configFile, threads, evalue, bQuiet):
        rc = ReadConfig()
        projectParams, sampleParams = rc.readConfig(configFile,
                                                    outputDirExists=True)
        os.makedirs(projectParams['output_dir'] + 'extracted_lsu')

        self.bQuiet = bQuiet

        for sample in sampleParams:
            pairs = sampleParams[sample]['pairs']
            singles = sampleParams[sample]['singles']

            # identify 23S/28S (LSU) sequences in paired-end reads
            self.processPairs(pairs, threads, evalue,
                              projectParams['output_dir'], sample)

            # identify 23S/28S (LSU) sequences in single-end reads
            self.processSingles(singles, threads, evalue,
                                projectParams['output_dir'], sample)
Code example #7
    def run(self, configFile, db, threads, bQuiet):
        rc = ReadConfig()
        projectParams, sampleParams = rc.readConfig(configFile,
                                                    outputDirExists=True)

        # check if classification directory already exists
        if not os.path.exists(
                os.path.join(projectParams['output_dir'], 'classified')):
            os.makedirs(os.path.join(projectParams['output_dir'],
                                     'classified'))
        else:
            rtn = raw_input('Remove previously classified reads (Y or N)? ')
            if rtn.lower() == 'y' or rtn.lower() == 'yes':
                files = os.listdir(projectParams['output_dir'] + 'classified')
                for f in files:
                    os.remove(projectParams['output_dir'] + 'classified/' + f)
            else:
                sys.exit()

        dbFile = self.dbFiles[db]
        taxonomyFile = self.taxonomyFiles[db]

        if not bQuiet:
            print 'Classifying reads with: ' + dbFile
            print 'Assigning taxonomy with: ' + taxonomyFile
            print 'Threads: ' + str(threads)
            print ''

        # create list of all sequences to classify
        mothurSeqFileList = ''
        for sample in sampleParams:
            prefix = os.path.join(projectParams['output_dir'], 'extracted',
                                  sample)
            pairs = sampleParams[sample]['pairs']
            singles = sampleParams[sample]['singles']

            for i in xrange(0, len(pairs), 2):
                pair1Base = ntpath.basename(pairs[i])
                pair1File = prefix + '.' + pair1Base[
                    0:pair1Base.rfind('.')] + '.intersect.SSU.fasta'

                pair2Base = ntpath.basename(pairs[i + 1])
                pair2File = prefix + '.' + pair2Base[
                    0:pair2Base.rfind('.')] + '.intersect.SSU.fasta'

                diffFile = prefix + '.' + pair1Base[
                    0:pair1Base.rfind('.')] + '.difference.SSU.fasta'

                mothurSeqFileList += pair1File + '-' + pair2File + '-' + diffFile + '-'

            for single in singles:
                singleBase = ntpath.basename(single)
                singleFile = prefix + '.' + singleBase[
                    0:singleBase.rfind('.')] + '.SSU.fasta'

                mothurSeqFileList += singleFile + '-'

        # classify with mothur
        mothurSeqFileList = mothurSeqFileList[0:-1]  # remove trailing dash
        self.classify(mothurSeqFileList, dbFile, taxonomyFile, threads, bQuiet)

        # rename classification files for consistency with downstream processing
        print 'Final classifications written to: '
        for filename in mothurSeqFileList.split('-'):
            if 'GG' in db:
                inputName = filename[0:filename.rfind('.')] + '.full.wang.taxonomy'
            else:
                inputName = filename[0:filename.rfind('.')] + '.SSURef_111_NR_taxonomy.wang.taxonomy'
            outputName = inputName.replace('/extracted/', '/classified/')
            outputName = outputName.replace('SSU.full.wang.taxonomy',
                                            '16S.tsv')
            os.system('mv ' + inputName + ' ' + outputName)
            print '  ' + outputName
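The '-'-joined mothurSeqFileList follows mothur's convention for passing several FASTA files to one command, and the '.wang.taxonomy' suffixes above indicate the Wang (naive Bayes) classifier. self.classify is not shown; the sketch below is one plausible way to issue such a call and is not CommunityM's code (parameter names also vary by mothur version: older releases use template= where newer ones use reference=).

import os

def classify(seqFileList, dbFile, taxonomyFile, threads, bQuiet):
    # hypothetical sketch: classify all extracted SSU files in one mothur invocation
    cmd = ('mothur "#classify.seqs(fasta=' + seqFileList +
           ', reference=' + dbFile +
           ', taxonomy=' + taxonomyFile +
           ', processors=' + str(threads) + ')"')
    if bQuiet:
        cmd += ' > /dev/null'
    os.system(cmd)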
Code example #8
            print ''


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=
        "Classify 16S fragments by mapping them to the GreenGenes DB with BWA.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('config_file', help='project config file.')
    parser.add_argument(
        'ref_db',
        help=
        'Reference DB to use for classification (choices: GG94, GG97, GG99, SILVA98)',
        choices=['GG94', 'GG97', 'GG99', 'SILVA98'])
    parser.add_argument('-t',
                        '--threads',
                        help='number of threads',
                        type=int,
                        default=1)

    args = parser.parse_args()

    classifyBWA = ClassifyBWA()

    rc = ReadConfig()
    projectParams, sampleParams = rc.readConfig(args.config_file,
                                                outputDirExists=True)

    classifyBWA.run(projectParams, sampleParams, args.ref_db, args.threads)
Code example #9
    def run(self, configFile, otu, seqIdentityThreshold, minSeqCutoff, bPairsAsSingles, bSingleEnded, bQuiet):
        self.bQuiet = bQuiet

        rc = ReadConfig()
        projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True)

        ggRefDistFile = self.ggRefDist.replace('##', str(otu))
        neighbours = self.getNeighbours(ggRefDistFile, seqIdentityThreshold)

        # create directory to store putative 16S genes
        dirPutative16S = projectParams['output_dir'] + 'putativeSSU/'
        if not os.path.exists(dirPutative16S):
            os.makedirs(dirPutative16S)
        else:
            rtn = raw_input('Remove previously recovered 16S reads (Y or N)? ')
            if rtn.lower() == 'y' or rtn.lower() == 'yes':
                files = os.listdir(dirPutative16S)
                for f in files:
                    if f.endswith('fasta'):
                        os.remove(dirPutative16S + '/' + f)
            else:
                sys.exit()

        referenceSeqHits = {}
        for sample in sampleParams:
            if not self.bQuiet:
                print ''
                print sample + ':'

            extractedPrefix = projectParams['output_dir'] + 'extracted/' + sample
            classifiedPrefix = projectParams['output_dir'] + 'classified/' + sample
            pairs = sampleParams[sample]['pairs']
            singles = sampleParams[sample]['singles']

            for i in xrange(0, len(pairs), 2):
                pair1Base = ntpath.basename(pairs[i])
                pair2Base = ntpath.basename(pairs[i+1])

                classificationFile1 = classifiedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.16S.tsv'
                classificationFile2 = classifiedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.16S.tsv'

                if not self.bQuiet:
                    print '  Processing files: '
                    print '    ' + classificationFile1
                    print '    ' + classificationFile2

                pairFile1 = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.SSU.fasta'
                pairFile2 = extractedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.SSU.fasta'

                self.identifyConsistentPairs(referenceSeqHits, pairFile1, pairFile2, classificationFile1, classificationFile2, neighbours, bPairsAsSingles, bSingleEnded)

                if bSingleEnded:
                    classificationFile = classifiedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.difference.16S.tsv'

                    if not self.bQuiet:
                        print '  Processing file: ' + classificationFile

                    singleFile = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.difference.SSU.fasta'
                    self.addSingletons(referenceSeqHits, singleFile, classificationFile)

            if bSingleEnded:
                for single in singles:
                    singleBase = ntpath.basename(single)
                    classificationFile = classifiedPrefix + '.' + singleBase[0:singleBase.rfind('.')] + '.16S.tsv'

                    if not self.bQuiet:
                        print '  Processing file: ' + classificationFile

                    singleFile = extractedPrefix + '.' + singleBase[0:singleBase.rfind('.')] + '.SSU.fasta'
                    self.addSingletons(referenceSeqHits, singleFile, classificationFile)

        self.extractRecoverable16S(referenceSeqHits, neighbours, minSeqCutoff, dirPutative16S)
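getNeighbours is not shown; it reads a GreenGenes pairwise reference distance file and returns, for each reference sequence, the set of references that fall within seqIdentityThreshold. The sketch below assumes a tab-separated file of two sequence IDs plus a distance per line and treats the threshold as a maximum distance; the actual file format and threshold semantics are defined elsewhere in CommunityM.

def getNeighbours(ggRefDistFile, seqIdentityThreshold):
    # hypothetical sketch: build a symmetric neighbour map from a pairwise distance table
    neighbours = {}
    for line in open(ggRefDistFile):
        seqId1, seqId2, dist = line.rstrip().split('\t')
        if float(dist) <= seqIdentityThreshold:   # direction depends on distance vs. identity
            neighbours.setdefault(seqId1, set()).add(seqId2)
            neighbours.setdefault(seqId2, set()).add(seqId1)
    return neighbours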
Code example #10
    def run(self, configFile, threads, kmerLen, minContigLen):
        rc = ReadConfig()
        projectParams, _ = rc.readConfig(configFile, outputDirExists=True)

        # the directory with putative 16S gene reads must already exist
        dirPutative16S = projectParams['output_dir'] + 'putativeSSU/'
        if not os.path.exists(dirPutative16S):
            print '[Error] Putative 16S gene reads expected in: ' + dirPutative16S
            sys.exit()

        # extract GreenGenes IDs of putative 16S genes
        ggIds = set()
        files = os.listdir(dirPutative16S)
        for f in files:
            if f.endswith('fasta'):
                ggIds.add(int(f.split('.')[0]))

        print 'Putative 16S genes to assemble: ' + str(len(ggIds))

        contigInfo = {}
        for ggId in ggIds:
            print 'Assembling ' + str(ggId) + ': '
            print ''

            pair1 = dirPutative16S + str(ggId) + '.1.fasta'
            pair2 = dirPutative16S + str(ggId) + '.2.fasta'
            single = dirPutative16S + str(ggId) + '.singletons.fasta'

            outputDir = dirPutative16S + str(ggId) + '_assembly'
            if os.path.exists(outputDir):
                shutil.rmtree(outputDir)

            cmd = 'mpiexec -n ' + str(threads) + ' Ray -k ' + str(
                kmerLen) + ' -minimum-contig-length ' + str(
                    minContigLen) + ' -o ' + outputDir
            if os.stat(single).st_size > 0:  # check if file contains any sequences
                cmd += ' -s ' + single
            if os.stat(pair1).st_size > 0:
                cmd += ' -p ' + pair1 + ' ' + pair2

            os.system(cmd)

            contigInfo[ggId] = self.parseContigInfo(outputDir)

        print '\n*********************************'
        allContigsFile = projectParams[
            'output_dir'] + 'assembled_contigs.16S.fasta'
        fout = open(allContigsFile, 'w')
        print 'Assembly results: '
        for ggId in contigInfo:
            print '  Assembly of ' + str(ggId) + ' produced ' + str(
                len(contigInfo[ggId])) + ' contig(s): ' + ' '.join(
                    contigInfo[ggId])

            index = 0
            for line in open(dirPutative16S + str(ggId) +
                             '_assembly/Contigs.fasta'):
                if line[0] == '>':
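                    # the second whitespace-delimited field of the Ray contig
                    # header is taken to be the contig length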
                    lineSplit = line.split()
                    seqLen = lineSplit[1]

                    fout.write('>16S_' + str(ggId) + '-' + str(index) + ' ' +
                               seqLen + '\n')

                    index += 1
                else:
                    fout.write(line)
        fout.close()

        print ''
        print '  All assembled 16S contigs written to: ' + allContigsFile
Code example #11
        writeProc.start()

        for p in calcProc:
            p.start()

        for p in calcProc:
            p.join()

        writerQueue.put(None)
        writeProc.join()

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Extract 16S/18S sequences from metagenomic data using HMMs.",
                                          formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('config_file', help='project config file.')

    parser.add_argument('-t', '--threads', help='number of threads', type=int, default = 1)
    parser.add_argument('-e', '--evalue', help='e-value threshold for identifying hits', default = '1e-5')
    parser.add_argument('-a', '--align_len', type=float, help='fraction of read that must align for identifying hits', default = '0.5')
    parser.add_argument('-q', '--quiet', help='suppress all output', action='store_true')

    args = parser.parse_args()

    # Read config file
    rc = ReadConfig()
    projectParams, sampleParams = rc.readConfig(args.config_file, outputDirExists = False)

    extract16S = Extract16S()
    extract16S.run(projectParams, sampleParams, args.threads, args.evalue, args.align_len, args.quiet)
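The excerpt above shows only the tail of a producer/consumer setup: the calcProc workers are started and joined, then a None sentinel tells the writer process to stop. Below is a self-contained sketch of the same multiprocessing pattern; the queue names and worker bodies are illustrative, not CommunityM's.

import multiprocessing as mp

def calcWorker(workerQueue, writerQueue):
    # consume work items until a None sentinel arrives, pushing results to the writer
    while True:
        item = workerQueue.get()
        if item is None:
            break
        writerQueue.put(item * item)

def writerWorker(writerQueue):
    # write results until the None sentinel arrives
    while True:
        result = writerQueue.get()
        if result is None:
            break
        print(result)

if __name__ == '__main__':
    workerQueue = mp.Queue()
    writerQueue = mp.Queue()

    numThreads = 4
    for item in range(100):
        workerQueue.put(item)
    for _ in range(numThreads):
        workerQueue.put(None)            # one sentinel per worker

    calcProc = [mp.Process(target=calcWorker, args=(workerQueue, writerQueue))
                for _ in range(numThreads)]
    writeProc = mp.Process(target=writerWorker, args=(writerQueue,))

    writeProc.start()
    for p in calcProc:
        p.start()
    for p in calcProc:
        p.join()

    writerQueue.put(None)                # tell the writer to finish
    writeProc.join()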
Code example #12
    def run(self, configFile, db, threads, bQuiet):
        rc = ReadConfig()
        projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True)

        # check if classification directory already exists
        if not os.path.exists(os.path.join(projectParams['output_dir'], 'classified')):
            os.makedirs(os.path.join(projectParams['output_dir'], 'classified'))
        else:
            rtn = raw_input('Remove previously classified reads (Y or N)? ')
            if rtn.lower() == 'y' or rtn.lower() == 'yes':
                files = os.listdir(projectParams['output_dir'] + 'classified')
                for f in files:
                    os.remove(projectParams['output_dir'] + 'classified/' + f)
            else:
                sys.exit()

        dbFile = self.dbFiles[db]
        taxonomyFile = self.taxonomyFiles[db]

        if not bQuiet:
            print 'Classifying reads with: ' + dbFile
            print 'Assigning taxonomy with: ' + taxonomyFile
            print 'Threads: ' + str(threads)
            print ''

        # create list of all sequences to classify
        mothurSeqFileList = ''
        for sample in sampleParams:
            prefix = os.path.join(projectParams['output_dir'], 'extracted', sample)
            pairs = sampleParams[sample]['pairs']
            singles = sampleParams[sample]['singles']

            for i in xrange(0, len(pairs), 2):
                pair1Base = ntpath.basename(pairs[i])
                pair1File = prefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.SSU.fasta'

                pair2Base = ntpath.basename(pairs[i+1])
                pair2File = prefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.SSU.fasta'

                diffFile = prefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.difference.SSU.fasta'

                mothurSeqFileList += pair1File + '-' + pair2File + '-' + diffFile + '-'

            for single in singles:
                singleBase = ntpath.basename(single)
                singleFile = prefix + '.' + singleBase[0:singleBase.rfind('.')] + '.SSU.fasta'

                mothurSeqFileList += singleFile + '-'

        # classify with mothur
        mothurSeqFileList = mothurSeqFileList[0:-1] # remove trailing dash
        self.classify(mothurSeqFileList, dbFile, taxonomyFile, threads, bQuiet)

        # rename classification files for consistency with downstream processing
        print 'Final classifications written to: '
        for filename in mothurSeqFileList.split('-'):
            if 'GG' in db:
                inputName = filename[0:filename.rfind('.')] + '.full.wang.taxonomy'
            else:
                inputName = filename[0:filename.rfind('.')] + '.SSURef_111_NR_taxonomy.wang.taxonomy'
            outputName = inputName.replace('/extracted/','/classified/')
            outputName = outputName.replace('SSU.full.wang.taxonomy','16S.tsv')
            os.system('mv ' + inputName + ' ' + outputName)
            print '  ' + outputName
Code example #13
    def run(self, configFile, threads):
        rc = ReadConfig()
        projectParams, _ = rc.readConfig(configFile, outputDirExists = True)

        # the directory with putative 16S gene reads must already exist
        dirPutative16S = projectParams['output_dir'] + 'putativeSSU/'
        if not os.path.exists(dirPutative16S):
            print '[Error] Putative 16S gene reads expected in: ' + dirPutative16S
            sys.exit()

        # extract GreenGenes IDs of putative 16S genes
        ggIds = set()
        files = os.listdir(dirPutative16S)
        for f in files:
            if f.endswith('fasta'):
                ggIds.add(int(f.split('.')[0]))

        print 'Putative 16S genes to assemble: ' + str(len(ggIds))

        scaffoldInfo = {}
        for ggId in ggIds:
            print 'Assembling ' + str(ggId) + ': '
            print ''

            pair1 = dirPutative16S + str(ggId) + '.1.fasta'
            pair2 = dirPutative16S + str(ggId) + '.2.fasta'
            single = dirPutative16S + str(ggId) + '.singletons.fasta'

            outputDir = dirPutative16S + str(ggId) + '_assembly_spades'
            if os.path.exists(outputDir):
                shutil.rmtree(outputDir)

            cmd = 'spades.py --only-assembler -o ' + outputDir + ' -t ' + str(threads)
            if os.stat(single).st_size > 0: # check if file contains any sequences
                cmd += ' -s ' + single
            if os.stat(pair1).st_size > 0:
                cmd += ' -1 ' + pair1 + ' -2 ' + pair2

            os.system(cmd)

            scaffoldInfo[ggId] = self.parseScaffoldInfo(outputDir)

        print '\n*********************************'
        allScaffoldsFile = projectParams['output_dir'] + 'assembled_scaffolds.16S.fasta'
        fout = open(allScaffoldsFile, 'w')
        print 'Assembly results: '
        for ggId in scaffoldInfo:
            print '  Assembly of ' + str(ggId) + ' produced ' + str(len(scaffoldInfo[ggId])) + ' scaffold(s): ' + ' '.join(scaffoldInfo[ggId])

            if not os.path.isfile(dirPutative16S + str(ggId) + '_assembly_spades/scaffolds.fasta'):
                print '    Failed to build scaffolds for ' + str(ggId)
                continue

            index = 0
            for line in open(dirPutative16S + str(ggId) + '_assembly_spades/scaffolds.fasta'):
                if line[0] == '>':
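                    # SPAdes scaffold headers look like >NODE_1_length_1500_cov_12.5,
                    # so the fourth '_'-delimited field holds the sequence length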
                    lineSplit = line.split('_')
                    seqLen = lineSplit[3]

                    fout.write('>16S_' + str(ggId) + '-' + str(index) + ' ' + seqLen + '\n')

                    index += 1
                else:
                    fout.write(line)
        fout.close()

        print ''
        print '  All assembled 16S scaffolds written to: ' + allScaffoldsFile
Code example #14
    def run(self, configFile, threads):
        rc = ReadConfig()
        projectParams, _ = rc.readConfig(configFile, outputDirExists=True)

        # the directory with putative 16S gene reads must already exist
        dirPutative16S = projectParams['output_dir'] + 'putativeSSU/'
        if not os.path.exists(dirPutative16S):
            print '[Error] Putative 16S gene reads expected in: ' + dirPutative16S
            sys.exit()

        # extract GreenGenes IDs of putative 16S genes
        ggIds = set()
        files = os.listdir(dirPutative16S)
        for f in files:
            if f.endswith('fasta'):
                ggIds.add(int(f.split('.')[0]))

        print 'Putative 16S genes to assemble: ' + str(len(ggIds))

        scaffoldInfo = {}
        for ggId in ggIds:
            print 'Assembling ' + str(ggId) + ': '
            print ''

            pair1 = dirPutative16S + str(ggId) + '.1.fasta'
            pair2 = dirPutative16S + str(ggId) + '.2.fasta'
            single = dirPutative16S + str(ggId) + '.singletons.fasta'

            outputDir = dirPutative16S + str(ggId) + '_assembly_spades'
            if os.path.exists(outputDir):
                shutil.rmtree(outputDir)

            cmd = 'spades.py --only-assembler -o ' + outputDir + ' -t ' + str(
                threads)
            if os.stat(single).st_size > 0:  # check if file contains any sequences
                cmd += ' -s ' + single
            if os.stat(pair1).st_size > 0:
                cmd += ' -1 ' + pair1 + ' -2 ' + pair2

            os.system(cmd)

            scaffoldInfo[ggId] = self.parseScaffoldInfo(outputDir)

        print '\n*********************************'
        allScaffoldsFile = projectParams[
            'output_dir'] + 'assembled_scaffolds.16S.fasta'
        fout = open(allScaffoldsFile, 'w')
        print 'Assembly results: '
        for ggId in scaffoldInfo:
            print '  Assembly of ' + str(ggId) + ' produced ' + str(
                len(scaffoldInfo[ggId])) + ' scaffold(s): ' + ' '.join(
                    scaffoldInfo[ggId])

            if not os.path.isfile(dirPutative16S + str(ggId) +
                                  '_assembly_spades/scaffolds.fasta'):
                print '    Failed to build scaffolds for ' + str(ggId)
                continue

            index = 0
            for line in open(dirPutative16S + str(ggId) +
                             '_assembly_spades/scaffolds.fasta'):
                if line[0] == '>':
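                    # SPAdes scaffold headers look like >NODE_1_length_1500_cov_12.5,
                    # so the fourth '_'-delimited field holds the sequence length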
                    lineSplit = line.split('_')
                    seqLen = lineSplit[3]

                    fout.write('>16S_' + str(ggId) + '-' + str(index) + ' ' +
                               seqLen + '\n')

                    index += 1
                else:
                    fout.write(line)
        fout.close()

        print ''
        print '  All assembled 16S scaffolds written to: ' + allScaffoldsFile
Code example #15
    def run(self, configFile, otu, seqIdentityThreshold, minSeqCutoff, bPairsAsSingles, bSingleEnded, bQuiet):
        self.bQuiet = bQuiet

        rc = ReadConfig()
        projectParams, sampleParams = rc.readConfig(configFile, outputDirExists = True)

        ggRefDistFile = self.ggRefDist.replace('##', str(otu))
        neighbours = self.getNeighbours(ggRefDistFile, seqIdentityThreshold)

        # create directory to store putative 16S genes
        dirPutative16S = projectParams['output_dir'] + 'putativeSSU/'
        if not os.path.exists(dirPutative16S):
            os.makedirs(dirPutative16S)
        else:
            rtn = raw_input('Remove previously recovered 16S reads (Y or N)? ')
            if rtn.lower() == 'y' or rtn.lower() == 'yes':
                files = os.listdir(dirPutative16S)
                for f in files:
                    if f.endswith('fasta'):
                        os.remove(dirPutative16S + '/' + f)
            else:
                sys.exit()

        referenceSeqHits = {}
        for sample in sampleParams:
            if not self.bQuiet:
                print ''
                print sample + ':'

            extractedPrefix = os.path.join(projectParams['output_dir'], 'extracted', sample)
            classifiedPrefix = os.path.join(projectParams['output_dir'], 'classified', sample)
            pairs = sampleParams[sample]['pairs']
            singles = sampleParams[sample]['singles']

            for i in xrange(0, len(pairs), 2):
                pair1Base = ntpath.basename(pairs[i])
                pair2Base = ntpath.basename(pairs[i+1])

                classificationFile1 = classifiedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.16S.tsv'
                classificationFile2 = classifiedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.16S.tsv'

                if not self.bQuiet:
                    print '  Processing files: '
                    print '    ' + classificationFile1
                    print '    ' + classificationFile2

                pairFile1 = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.intersect.SSU.fasta'
                pairFile2 = extractedPrefix + '.' + pair2Base[0:pair2Base.rfind('.')] + '.intersect.SSU.fasta'

                self.identifyConsistentPairs(referenceSeqHits, pairFile1, pairFile2, classificationFile1, classificationFile2, neighbours, bPairsAsSingles, bSingleEnded)

                if bSingleEnded:
                    classificationFile = classifiedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.difference.16S.tsv'

                    if not self.bQuiet:
                        print '  Processing file: ' + classificationFile

                    singleFile = extractedPrefix + '.' + pair1Base[0:pair1Base.rfind('.')] + '.difference.SSU.fasta'
                    self.addSingletons(referenceSeqHits, singleFile, classificationFile)

            if bSingleEnded:
                for single in singles:
                    singleBase = ntpath.basename(single)
                    classificationFile = classifiedPrefix + '.' + singleBase[0:singleBase.rfind('.')] + '.16S.tsv'

                    if not self.bQuiet:
                        print '  Processing file: ' + classificationFile

                    singleFile = extractedPrefix + '.' + singleBase[0:singleBase.rfind('.')] + '.SSU.fasta'
                    self.addSingletons(referenceSeqHits, singleFile, classificationFile)

        self.extractRecoverable16S(referenceSeqHits, neighbours, minSeqCutoff, dirPutative16S)
Code example #16
    def run(self, configFile, threads, kmerLen, minContigLen):
        rc = ReadConfig()
        projectParams, _ = rc.readConfig(configFile, outputDirExists = True)

        # the directory with putative 16S gene reads must already exist
        dirPutative16S = projectParams['output_dir'] + 'putativeSSU/'
        if not os.path.exists(dirPutative16S):
            print '[Error] Putative 16S gene reads expected in: ' + dirPutative16S
            sys.exit()

        # extract GreenGenes IDs of putative 16S genes
        ggIds = set()
        files = os.listdir(dirPutative16S)
        for f in files:
            if f.endswith('fasta'):
                ggIds.add(int(f.split('.')[0]))

        print 'Putative 16S genes to assemble: ' + str(len(ggIds))

        contigInfo = {}
        for ggId in ggIds:
            print 'Assembling ' + str(ggId) + ': '
            print ''

            pair1 = dirPutative16S + str(ggId) + '.1.fasta'
            pair2 = dirPutative16S + str(ggId) + '.2.fasta'
            single = dirPutative16S + str(ggId) + '.singletons.fasta'

            outputDir = dirPutative16S + str(ggId) + '_assembly'
            if os.path.exists(outputDir):
                shutil.rmtree(outputDir)

            cmd = 'mpiexec -n ' + str(threads) + ' Ray -k ' + str(kmerLen) + ' -minimum-contig-length ' + str(minContigLen) + ' -o ' + outputDir
            if os.stat(single).st_size > 0: # check if file contains any sequences
                cmd += ' -s ' + single
            if os.stat(pair1).st_size > 0:
                cmd += ' -p ' + pair1 + ' ' + pair2

            os.system(cmd)

            contigInfo[ggId] = self.parseContigInfo(outputDir)

        print '\n*********************************'
        allContigsFile = projectParams['output_dir'] + 'assembled_contigs.16S.fasta'
        fout = open(allContigsFile, 'w')
        print 'Assembly results: '
        for ggId in contigInfo:
            print '  Assembly of ' + str(ggId) + ' produced ' + str(len(contigInfo[ggId])) + ' contig(s): ' + ' '.join(contigInfo[ggId])

            index = 0
            for line in open(dirPutative16S + str(ggId) + '_assembly/Contigs.fasta'):
                if line[0] == '>':
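                    # the second whitespace-delimited field of the Ray contig
                    # header is taken to be the contig length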
                    lineSplit = line.split()
                    seqLen = lineSplit[1]

                    fout.write('>16S_' + str(ggId) + '-' + str(index) + ' ' + seqLen + '\n')

                    index += 1
                else:
                    fout.write(line)
        fout.close()

        print ''
        print '  All assembled 16S contigs written to: ' + allContigsFile