Beispiel #1
0
def build_pangenome_db(args, genome_clusters):
    """Build FASTA and BT2 database from pangene cluster centroids.

    Every ``<args['db']>/genome_clusters/<species_id>/pangenome.fa.gz`` is
    parsed and its records are appended to
    ``<args['outdir']>/genes/temp/pangenomes.fa``; a tab-separated
    ``record id -> species id`` table goes to ``pangenome.map`` in the same
    directory. Summary counts are printed and ``bowtie2-build`` is then run
    on the combined FASTA.

    Args:
        args: dict read for the keys 'outdir', 'db', 'tax_mask',
            'bowtie2-build' and 'log' (an object with ``write``).
        genome_clusters: iterable of species ids, used as directory names.
    """
    import Bio.SeqIO
    # fasta database
    outdir = '/'.join([args['outdir'], 'genes/temp'])
    fasta_path = '/'.join([outdir, 'pangenomes.fa'])
    map_path = '/'.join([outdir, 'pangenome.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    # The `with` blocks guarantee both outputs are flushed and closed before
    # bowtie2-build reads the FASTA, even when parsing raises.
    with open(fasta_path, 'w') as pangenome_fasta:
        with open(map_path, 'w') as pangenome_map:
            for species_id in genome_clusters:
                db_stats['genome_clusters'] += 1
                inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'pangenome.fa.gz'])
                infile = utility.iopen(inpath)
                try:
                    for r in Bio.SeqIO.parse(infile, 'fasta'):
                        # genome id = first two dot-separated fields of the record id
                        genome_id = '.'.join(r.id.split('.')[0:2])
                        if args['tax_mask'] and genome_id in args['tax_mask']:
                            continue  # record belongs to a masked genome
                        pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq)))
                        pangenome_map.write('%s\t%s\n' % (r.id, species_id))
                        db_stats['total_length'] += len(r.seq)
                        db_stats['total_seqs'] += 1
                finally:
                    # previously one input handle per species was left open
                    infile.close()
    # print out database stats
    print("  total species: %s" % db_stats['genome_clusters'])
    print("  total genes: %s" % db_stats['total_seqs'])
    print("  total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    outpath = '/'.join([outdir, 'pangenomes'])
    command = ' '.join([args['bowtie2-build'], fasta_path, outpath])
    args['log'].write('command: '+command+'\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #2
0
def pangenome_align(args):
    """Map reads to the pangenome index with Bowtie2 and save an unsorted BAM.

    The shell pipeline has the shape ``bowtie2 ... | samtools view -b - > bam``.
    Both the index prefix and the BAM live under
    ``<args['outdir']>/genes/temp``. The command is written to ``args['log']``
    before launch; afterwards the process is passed to
    ``utility.check_exit_code`` and the BAM to ``utility.check_bamfile``.

    Args:
        args: dict read for the keys 'bowtie2', 'samtools', 'outdir',
            'max_reads', 'trim', 'speed', 'threads', 'file_type', 'm1',
            'm2' and 'log'.
    """
    index_prefix = '/'.join([args['outdir'], 'genes/temp/pangenomes'])
    # Each fragment carries its own trailing blank; they are glued verbatim.
    fragments = ['%s --no-unal ' % args['bowtie2'], '-x %s ' % index_prefix]
    if args['max_reads']:
        fragments.append('-u %s ' % args['max_reads'])  # cap on reads
    if args['trim']:
        fragments.append('--trim3 %s ' % args['trim'])  # trim 3' end
    fragments.append('--%s-local ' % args['speed'])  # speed/sensitivity preset
    fragments.append('--threads %s ' % args['threads'])
    fragments.append('-f ' if args['file_type'] == 'fasta' else '-q ')
    if args['m1'] and args['m2']:
        fragments.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        fragments.append('-U %s' % args['m1'])
    # convert the alignment stream to BAM
    bampath = '/'.join([args['outdir'], 'genes/temp/pangenome.bam'])
    fragments.append('| %s view -b - > %s' % (args['samtools'], bampath))
    command = ''.join(fragments)
    # Launch through the shell so the pipe and redirect are honoured.
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
    utility.check_bamfile(args, bampath)
Beispiel #3
0
def build_pangenome_db(args, genome_clusters):
    """Build FASTA and BT2 database from pangene cluster centroids.

    Every ``<args['db']>/genome_clusters/<species_id>/pangenome.fa.gz`` is
    opened with ``gzip.open`` and parsed; its records are appended to
    ``<args['outdir']>/genes/temp/pangenomes.fa`` and a tab-separated
    ``record id -> species id`` table is written to ``pangenome.map`` in the
    same directory. Summary counts are printed and ``bowtie2-build`` is then
    run on the combined FASTA.

    Args:
        args: dict read for the keys 'outdir', 'db', 'tax_mask',
            'bowtie2-build' and 'log' (an object with ``write``).
        genome_clusters: iterable of species ids, used as directory names.
    """
    import Bio.SeqIO
    # fasta database
    outdir = '/'.join([args['outdir'], 'genes/temp'])
    fasta_path = '/'.join([outdir, 'pangenomes.fa'])
    map_path = '/'.join([outdir, 'pangenome.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    # The `with` blocks guarantee both outputs are flushed and closed before
    # bowtie2-build reads the FASTA, even when parsing raises.
    with open(fasta_path, 'w') as pangenome_fasta:
        with open(map_path, 'w') as pangenome_map:
            for species_id in genome_clusters:
                db_stats['genome_clusters'] += 1
                inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'pangenome.fa.gz'])
                # NOTE(review): gzip.open defaults to binary mode; on Python 3
                # Bio.SeqIO may require a text handle ('rt') - confirm the
                # target interpreter before changing the mode.
                # The context manager closes the handle that used to leak
                # once per species.
                with gzip.open(inpath) as infile:
                    for r in Bio.SeqIO.parse(infile, 'fasta'):
                        # genome id = first two dot-separated fields of the record id
                        genome_id = '.'.join(r.id.split('.')[0:2])
                        if args['tax_mask'] and genome_id in args['tax_mask']:
                            continue  # record belongs to a masked genome
                        pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq)))
                        pangenome_map.write('%s\t%s\n' % (r.id, species_id))
                        db_stats['total_length'] += len(r.seq)
                        db_stats['total_seqs'] += 1
    # print out database stats
    print("  total species: %s" % db_stats['genome_clusters'])
    print("  total genes: %s" % db_stats['total_seqs'])
    print("  total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    outpath = '/'.join([outdir, 'pangenomes'])
    command = ' '.join([args['bowtie2-build'], fasta_path, outpath])
    args['log'].write('command: '+command+'\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #4
0
def pileup(args):
    """Filter alignments, run ``samtools mpileup`` and write a VCF file.

    The shell pipeline has two stages: ``python <stream_bam>`` reads
    ``<outdir>/snps/temp/genomes.bam`` and writes to ``/dev/stdout``, and
    ``samtools mpileup`` reads that stream from stdin, redirecting its output
    to ``<outdir>/snps/temp/genomes.vcf``. The command is logged to
    ``args['log']`` before it is launched.

    Args:
        args: dict read for the keys 'outdir', 'stream_bam', 'mapid',
            'readq', 'samtools', 'baq', 'redo_baq', 'adjust_mq', 'mapq',
            'baseq' and 'log'.
    """
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    # Stage 1: percent-identity / read-quality filter script.
    filter_stage = 'python %s %s %s %s %s | ' % (
        args['stream_bam'], bam_path, '/dev/stdout', args['mapid'], args['readq'])
    # Stage 2: mpileup with its optional switches.
    pileup_stage = ['%s mpileup -uv -A -d 10000 --skip-indels ' % args['samtools']]
    if not args['baq']:
        pileup_stage.append('-B ')
    if args['redo_baq']:
        pileup_stage.append('-E ')
    if args['adjust_mq']:
        pileup_stage.append('-C 50 ')
    pileup_stage.append('-q %s -Q %s ' % (args['mapq'], args['baseq']))
    pileup_stage.append('-f %s/snps/temp/genomes.fa ' % args['outdir'])  # reference
    pileup_stage.append('- ')  # alignments arrive on stdin
    pileup_stage.append('> %s/snps/temp/genomes.vcf ' % args['outdir'])  # output
    command = filter_stage + ''.join(pileup_stage)
    # Log, launch and verify.
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #5
0
def map_reads_hsblast(args):
    """Stream reads into ``hs-blastn align`` against the marker database.

    ``python <stream_seqs>`` writes sequences to stdout (its stderr is
    redirected to ``<outdir>/species/temp/read_count.txt``) and the stream is
    piped into hs-blastn, which searches ``<db>/marker_genes/phyeco.fa`` and
    writes ``<outdir>/species/temp/alignments.m8``. The command is logged to
    ``args['log']`` before it is launched.

    Args:
        args: dict read for the keys 'stream_seqs', 'm1', 'm2', 'max_reads',
            'read_length', 'outdir', 'hs-blastn', 'word_size', 'db',
            'threads' and 'log'.
    """
    # Tokens are joined with single blanks, so none carries its own padding.
    tokens = ['python %s' % args['stream_seqs'],
              '-1 %s' % args['m1']]  # fasta/fastq
    if args['m2']:
        tokens.append('-2 %s' % args['m2'])  # mate
    if args['max_reads']:
        tokens.append('-n %s' % args['max_reads'])  # number of reads
    if args['read_length']:
        tokens.append('-l %s' % args['read_length'])  # read length
    # tmpfile to store # of reads, bp sampled
    tokens.append('2> %s/species/temp/read_count.txt' % args['outdir'])
    # hs-blastn
    tokens.append('| %s align' % args['hs-blastn'])
    tokens.append('-word_size %s' % args['word_size'])
    tokens.append('-query /dev/stdin')
    tokens.append('-db %s/marker_genes/phyeco.fa' % args['db'])
    tokens.append('-outfmt 6')
    tokens.append('-num_threads %s' % args['threads'])
    tokens.append('-out %s/species/temp/alignments.m8' % args['outdir'])
    tokens.append('-evalue 1e-3')
    command = ' '.join(tokens)
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #6
0
def build_pangenome_db(args, species):
    """Build FASTA and BT2 database from the centroid genes of each species.

    For every value in *species*, the file at ``sp.paths['centroids.ffn']``
    is parsed as FASTA; each record is written upper-cased to
    ``<args['outdir']>/genes/temp/pangenomes.fa`` and a tab-separated
    ``record id -> sp.id`` line is added to ``pangenomes.map``. Summary
    counts are printed and ``bowtie2-build`` is run on the combined FASTA.

    Args:
        args: dict read for the keys 'outdir', 'bowtie2-build' and 'log'.
        species: mapping whose values expose ``paths`` and ``id``.
    """
    import Bio.SeqIO
    # fasta database
    workdir = '/'.join([args['outdir'], 'genes/temp'])
    fasta_path = '/'.join([workdir, 'pangenomes.fa'])
    fasta_out = open(fasta_path, 'w')
    map_out = open('/'.join([workdir, 'pangenomes.map']), 'w')
    n_species = n_genes = n_bases = 0
    for sp in species.values():
        n_species += 1
        handle = utility.iopen(sp.paths['centroids.ffn'])
        for record in Bio.SeqIO.parse(handle, 'fasta'):
            fasta_out.write('>%s\n%s\n' % (record.id, str(record.seq).upper()))
            map_out.write('%s\t%s\n' % (record.id, sp.id))
            n_bases += len(record.seq)
            n_genes += 1
        handle.close()
    fasta_out.close()
    map_out.close()
    # print out database stats
    for label, value in (('species', n_species),
                         ('genes', n_genes),
                         ('base-pairs', n_bases)):
        print("  total %s: %s" % (label, value))
    # bowtie2 database
    index_prefix = '/'.join([workdir, 'pangenomes'])
    command = '%s %s %s' % (args['bowtie2-build'], fasta_path, index_prefix)
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #7
0
def pileup(args):
    """Filter alignments, run ``samtools mpileup`` and gzip the pileup.

    The shell pipeline has three stages: ``python <stream_bam>`` reads
    ``<outdir>/snps/temp/genomes.bam`` and writes to ``/dev/stdout``,
    ``samtools mpileup`` reads that stream from stdin, and ``gzip`` writes
    ``<outdir>/snps/temp/genomes.mpileup.gz``. The command is logged to
    ``args['log']`` before it is launched.

    Args:
        args: dict read for the keys 'stream_bam', 'outdir', 'mapid',
            'readq', 'mapq', 'samtools', 'baq', 'adjust_mq', 'discard',
            'baseq' and 'log'.
    """
    # Stage 1: stream bam, filter alignments
    pieces = [
        'python %s ' % args['stream_bam'],
        '%s ' % os.path.join(args['outdir'], 'snps/temp/genomes.bam'),
        '/dev/stdout ',
        '%s ' % args['mapid'],
        '%s ' % args['readq'],
        '%s ' % args['mapq'],
    ]
    # Stage 2: mpileup
    pieces.append('| %s mpileup ' % args['samtools'])
    pieces.append('-d 10000 ')  # set max depth
    if not args['baq']:
        pieces.append('-B ')  # BAQ
    if args['adjust_mq']:
        pieces.append('-C 50 ')  # adjust MQ
    if not args['discard']:
        pieces.append('-A ')  # keep discordant read pairs
    pieces.append('-Q %s ' % args['baseq'])  # base quality filtering
    pieces.append('-f %s/snps/temp/genomes.fa ' % args['outdir'])  # reference
    pieces.append('- ')  # input bam comes from stdin
    # Stage 3: compress the result
    pieces.append('| gzip > %s/snps/temp/genomes.mpileup.gz ' % args['outdir'])
    command = ''.join(pieces)
    # Run command
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #8
0
def genome_align(args):
    """Map reads to the representative-genome index and write a sorted BAM.

    The shell pipeline has the shape
    ``bowtie2 ... | samtools view -b - | samtools sort -f - <bam>``, where the
    index prefix is ``<outdir>/snps/temp/genomes`` and the BAM is
    ``<outdir>/snps/temp/genomes.bam``. The command is logged to
    ``args['log']`` before launch; afterwards the process goes to
    ``utility.check_exit_code`` and the BAM to ``utility.check_bamfile``.

    Args:
        args: dict read for the keys 'outdir', 'bowtie2', 'max_reads',
            'trim', 'speed', 'threads', 'file_type', 'm1', 'm2', 'samtools'
            and 'log'.
    """
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    index_prefix = '/'.join([args['outdir'], 'snps/temp/genomes'])
    # Bowtie2 stage; every fragment ends with its own blank.
    fragments = ['%s --no-unal ' % args['bowtie2'], '-x %s ' % index_prefix]
    if args['max_reads']:
        fragments.append('-u %s ' % args['max_reads'])  # max num of reads
    if args['trim']:
        fragments.append('--trim3 %s ' % args['trim'])  # trim 3'
    fragments.append('--%s ' % args['speed'])  # speed/sensitivity
    fragments.append('--threads %s ' % args['threads'])
    if args['file_type'] == 'fasta':  # input type
        fragments.append('-f ')
    else:
        fragments.append('-q ')
    if args['m2']:  # paired input
        fragments.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:  # unpaired input
        fragments.append('-U %s ' % args['m1'])
    # samtools stages: convert to bam, then sort
    fragments.append('| %s view -b - ' % args['samtools'])
    fragments.append('| %s sort -f - %s ' % (args['samtools'], bam_path))
    command = ''.join(fragments)
    # Run command
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Check for errors
    utility.check_exit_code(process, command)
    print("  finished aligning")
    print("  checking bamfile integrity")
    utility.check_bamfile(args, bam_path)
Beispiel #9
0
def pileup(args):
    """Filter alignments, run ``samtools mpileup`` and write a VCF file.

    ``python <stream_bam>`` reads ``<outdir>/snps/temp/genomes.bam`` and
    writes to ``/dev/stdout``; ``samtools mpileup`` consumes that stream on
    stdin and its output is redirected to ``<outdir>/snps/temp/genomes.vcf``.
    The command is logged to ``args['log']`` before it is launched.

    Args:
        args: dict read for the keys 'outdir', 'stream_bam', 'mapid',
            'readq', 'samtools', 'baq', 'redo_baq', 'adjust_mq', 'mapq',
            'baseq' and 'log'.
    """
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    # percent id filtering
    command = 'python %s %s %s %s %s | ' % (
        args['stream_bam'], bam_path, '/dev/stdout', args['mapid'], args['readq'])
    # mpileup
    command += '%s mpileup -uv -A -d 10000 --skip-indels ' % args['samtools']
    # optional switches, appended in this order when their condition holds
    switches = (
        (not args['baq'], '-B '),
        (args['redo_baq'], '-E '),
        (args['adjust_mq'], '-C 50 '),
    )
    command += ''.join(flag for wanted, flag in switches if wanted)
    # quality thresholds, reference, stdin input and output redirect
    reference = '%s/snps/temp/genomes.fa' % args['outdir']
    vcf_path = '%s/snps/temp/genomes.vcf' % args['outdir']
    command += '-q %s -Q %s -f %s - > %s ' % (
        args['mapq'], args['baseq'], reference, vcf_path)
    # Run command
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #10
0
def build_genome_db(args, species):
    """Build FASTA and BT2 database of representative genomes.

    For every value in *species*, the file at ``sp.paths['fna']`` is parsed
    as FASTA and each record is written upper-cased to
    ``<args['outdir']>/snps/temp/genomes.fa``. Summary counts are printed and
    ``bowtie2-build`` is run on the combined FASTA with the index prefix
    ``<args['outdir']>/snps/temp/genomes``.

    Args:
        args: dict read for the keys 'outdir', 'bowtie2-build' and 'log'.
        species: mapping whose values expose ``paths``.
    """
    import Bio.SeqIO
    # fasta database
    fasta_path = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    fasta_out = open(fasta_path, 'w')
    n_genomes = n_contigs = n_bases = 0
    for sp in species.values():
        n_genomes += 1
        handle = utility.iopen(sp.paths['fna'])
        for record in Bio.SeqIO.parse(handle, 'fasta'):
            fasta_out.write('>%s\n%s\n' % (record.id, str(record.seq).upper()))
            n_bases += len(record.seq)
            n_contigs += 1
        handle.close()
    fasta_out.close()
    # print out database stats
    for label, value in (('genomes', n_genomes),
                         ('contigs', n_contigs),
                         ('base-pairs', n_bases)):
        print("  total %s: %s" % (label, value))
    # bowtie2 database
    index_prefix = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = '%s %s %s' % (args['bowtie2-build'], fasta_path, index_prefix)
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #11
0
def build_genome_db(args, genome_clusters):
    """Build FASTA and BT2 database from genome cluster centroids.

    Each ``<args['db']>/genome_clusters/<species_id>/genome.fna.gz`` is
    copied line by line into ``<args['outdir']>/snps/temp/genomes.fa``; for
    every header line a tab-separated ``sequence id -> species id`` entry is
    written to ``genomes.map``. A species is skipped when ``args['tax_mask']``
    is set and contains ``fetch_centroid(args, species_id)``. Summary counts
    are printed and ``bowtie2-build`` is run on the combined FASTA.

    Args:
        args: dict read for the keys 'outdir', 'db', 'tax_mask',
            'bowtie2-build' and 'log' (an object with ``write``).
        genome_clusters: iterable of species ids, used as directory names.
    """
    # fasta database
    fasta_path = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    map_path = '/'.join([args['outdir'], 'snps/temp/genomes.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    # Both outputs must be closed (and therefore flushed) before
    # bowtie2-build reads the FASTA below; they used to stay open.
    with open(fasta_path, 'w') as genomes_fasta:
        with open(map_path, 'w') as genomes_map:
            for species_id in genome_clusters:
                if args['tax_mask'] and fetch_centroid(args, species_id) in args['tax_mask']:
                    continue
                db_stats['genome_clusters'] += 1
                inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'genome.fna.gz'])
                # NOTE(review): gzip.open defaults to binary mode, so this
                # loop presumably targets Python 2 str lines - confirm before
                # running on Python 3.
                with gzip.open(inpath) as infile:
                    for line in infile:
                        genomes_fasta.write(line)
                        if line[0] == '>':
                            # first whitespace-delimited token after '>'
                            sid = line.rstrip().lstrip('>').split()[0]
                            genomes_map.write(sid+'\t'+species_id+'\n')
                            db_stats['total_seqs'] += 1
                        else:
                            # only sequence lines count towards base pairs;
                            # header text used to be included in the total
                            db_stats['total_length'] += len(line.rstrip())
    # print out database stats
    print("  total genomes: %s" % db_stats['genome_clusters'])
    print("  total contigs: %s" % db_stats['total_seqs'])
    print("  total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    outpath = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = ' '.join([args['bowtie2-build'], fasta_path, outpath])
    args['log'].write('command: '+command+'\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #12
0
def genome_align(args):
    """Map reads to the representative genomes and write a sorted BAM.

    The shell pipeline has the shape
    ``bowtie2 ... | samtools view -b - | samtools sort -f - <bam>``, with the
    index prefix ``<outdir>/snps/temp/genomes`` and the BAM at
    ``<outdir>/snps/temp/genomes.bam``. The command is logged to
    ``args['log']`` before launch; afterwards the process goes to
    ``utility.check_exit_code`` and the BAM to ``utility.check_bamfile``.

    Args:
        args: dict read for the keys 'bowtie2', 'outdir', 'max_reads',
            'trim', 'speed', 'threads', 'file_type', 'm1', 'm2', 'samtools'
            and 'log'.
    """
    # Optional pieces collapse to the empty string when not requested.
    read_cap = '-u %s ' % args['max_reads'] if args['max_reads'] else ''
    trimming = '--trim3 %s ' % args['trim'] if args['trim'] else ''
    input_format = '-f ' if args['file_type'] == 'fasta' else '-q '
    if args['m1'] and args['m2']:
        reads = '-1 %s -2 %s ' % (args['m1'], args['m2'])
    else:
        reads = '-U %s' % args['m1']
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    command = ''.join([
        '%s --no-unal ' % args['bowtie2'],
        '-x %s ' % '/'.join([args['outdir'], 'snps/temp/genomes']),  # index
        read_cap,
        trimming,
        '--%s ' % args['speed'],  # speed/sensitivity
        '--threads %s ' % args['threads'],
        input_format,
        reads,
        '| %s view -b - ' % args['samtools'],  # convert to bam
        '| %s sort -f - %s ' % (args['samtools'], bam_path),  # sort bam
    ])
    # Run command
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Check for errors
    utility.check_exit_code(process, command)
    utility.check_bamfile(args, bam_path)
Beispiel #13
0
def pangenome_align(args):
    """Map reads to the pangenome index with Bowtie2 and save an unsorted BAM.

    The shell pipeline has the shape ``bowtie2 ... | samtools view -b - > bam``.
    The index prefix is ``<outdir>/genes/temp/pangenomes`` and the BAM is
    ``<outdir>/genes/temp/pangenomes.bam``. The command is logged to
    ``args['log']`` before launch; afterwards the process goes to
    ``utility.check_exit_code`` and the BAM to ``utility.check_bamfile``.

    Args:
        args: dict read for the keys 'bowtie2', 'samtools', 'outdir',
            'max_reads', 'trim', 'speed', 'threads', 'file_type', 'm1',
            'm2' and 'log'.
    """
    index_prefix = '/'.join([args['outdir'], 'genes/temp/pangenomes'])
    # Each fragment carries its own trailing blank; they are glued verbatim.
    fragments = ['%s --no-unal ' % args['bowtie2'], '-x %s ' % index_prefix]
    if args['max_reads']:
        fragments.append('-u %s ' % args['max_reads'])  # cap on reads
    if args['trim']:
        fragments.append('--trim3 %s ' % args['trim'])  # trim 3' end
    fragments.extend([
        '--%s-local ' % args['speed'],  # speed/sensitivity preset
        '--threads %s ' % args['threads'],
        '-f ' if args['file_type'] == 'fasta' else '-q ',  # input format
    ])
    both_mates = args['m1'] and args['m2']
    if both_mates:
        fragments.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        fragments.append('-U %s ' % args['m1'])
    # output unsorted bam
    bampath = '/'.join([args['outdir'], 'genes/temp/pangenomes.bam'])
    fragments.append('| %s view -b - > %s' % (args['samtools'], bampath))
    command = ''.join(fragments)
    # Run command
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Check for errors
    utility.check_exit_code(process, command)
    print("  finished aligning")
    print("  checking bamfile integrity")
    utility.check_bamfile(args, bampath)
Beispiel #14
0
 def build_hsblastn_db(self, hsblastn):
     """Run ``<hsblastn> index`` on ``<self.dir>/phyeco.fa`` through the shell.

     Args:
         hsblastn: executable name or path placed first in the command.
     """
     fasta = "%s/phyeco.fa" % self.dir
     # Two blanks after "index" and one trailing blank match the logged form.
     command = "%s index  %s " % (hsblastn, fasta)
     process = subprocess.Popen(
         command, shell=True,
         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     utility.check_exit_code(process, command)
Beispiel #15
0
 def hsblastn_index(self, fasta):
     """Run ``hs-blastn index`` on *fasta* through the shell.

     The child process receives an environment holding only ``PATH``,
     copied from this process's environment.

     Args:
         fasta: path placed verbatim into the shell command.
     """
     import os
     command = "hs-blastn index %s " % fasta
     # Environment values must be strings. The previous value, sys.path, is
     # the interpreter's module search list, which Popen rejects; the
     # executable search path lives in os.environ['PATH'].
     process = subprocess.Popen(command,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                env={'PATH': os.environ.get('PATH', '')})
     utility.check_exit_code(process, command)
Beispiel #16
0
 def hmmsearch(self, inpath, outpath, threads):
     """Run ``hmmsearch`` with the ``phyeco.hmm`` file next to this module.

     The command's stdout is redirected to ``/dev/null``.

     Args:
         inpath: path passed as the last argument of the command.
         outpath: path passed to ``--domtblout``.
         threads: value passed to ``--cpu``.
     """
     hmm_path = "%s/%s" % (os.path.dirname(__file__), 'phyeco.hmm')
     pieces = [
         "hmmsearch --noali --cpu %s" % threads,
         "--domtblout %s" % outpath,
         hmm_path,
         "%s > /dev/null" % inpath,
     ]
     command = " ".join(pieces)
     process = subprocess.Popen(
         command, shell=True,
         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     utility.check_exit_code(process, command)
Beispiel #17
0
 def build_hsblastn_db(self, hsblastn, resume):
     """Run ``<hsblastn> index`` on ``<self.dir>/phyeco.fa`` unless resuming.

     Nothing is run when *resume* is truthy and
     ``<self.dir>/phyeco.fa.header`` already exists with a non-zero size.

     Args:
         hsblastn: executable name or path placed first in the command.
         resume: whether an existing non-empty header file may be reused.
     """
     header = '%s/phyeco.fa.header' % self.dir
     header_ready = os.path.exists(header) and os.stat(header).st_size > 0
     if header_ready and resume:
         return
     fasta = "%s/phyeco.fa" % self.dir
     # Two blanks after "index" and one trailing blank match the original form.
     command = "%s index  %s " % (hsblastn, fasta)
     process = subprocess.Popen(
         command, shell=True,
         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     utility.check_exit_code(process, command)
Beispiel #18
0
 def uclust(self, genes, pid, centroids, clusters, threads):
     """Run ``vsearch -cluster_fast`` from the shell with the given arguments.

     Args:
         genes: value passed to ``-cluster_fast``.
         pid: value passed to ``-id``.
         centroids: value passed to ``-centroids``.
         clusters: value passed to ``-uc``.
         threads: value passed to ``-threads``.
     """
     template = ("vsearch -cluster_fast %s -id %s -centroids %s "
                 "-uc %s -threads %s ")
     command = template % (genes, pid, centroids, clusters, threads)
     process = subprocess.Popen(
         command, shell=True,
         stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     utility.check_exit_code(process, command)
Beispiel #19
0
def index_bam(args):
    """Run ``samtools index`` on ``<outdir>/snps/temp/genomes.bam``.

    A banner and the command are written to ``args['log']``; the banner,
    the elapsed minutes and ``utility.max_mem_usage()`` are printed.

    Args:
        args: dict read for the keys 'log', 'samtools', 'threads' (converted
            with ``int``) and 'outdir'.
    """
    began = time()
    banner = "\nIndexing bamfile"
    print(banner)
    args['log'].write(banner + "\n")
    samtools = args['samtools']
    n_threads = int(args['threads'])
    bam_path = '%s/snps/temp/genomes.bam' % args['outdir']
    command = '%s index -@ %d %s' % (samtools, n_threads, bam_path)
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
    elapsed_minutes = round((time() - began) / 60, 2)
    print("  %s minutes" % elapsed_minutes)
    print("  %s Gb maximum memory" % utility.max_mem_usage())
Beispiel #20
0
def genome_align(args):
    """Map reads to the representative genomes and write a sorted BAM.

    The shell pipeline has the shape
    ``bowtie2 ... | samtools view -b - ... | samtools sort - ... -o <bam>``.
    The index prefix is ``<args['bowtie-db']>/genomes`` when that key is
    truthy, otherwise ``<outdir>/snps/temp/genomes``. Reads are passed as
    ``-1/-2`` when 'm2' is set, as ``--interleaved`` when 'interleaved' is
    set, and as ``-U`` otherwise. The command is logged to ``args['log']``
    before launch; afterwards the process goes to
    ``utility.check_exit_code`` and the BAM to ``utility.check_bamfile``.

    Args:
        args: dict read for the keys 'outdir', 'bowtie2', 'bowtie-db',
            'max_reads', 'trim', 'speed', 'mode', 'threads', 'file_type',
            'm1', 'm2', 'interleaved', 'samtools' and 'log'.
    """
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    # Bowtie2 stage; every fragment is glued verbatim.
    fragments = ['%s --no-unal ' % args['bowtie2']]
    if args['bowtie-db']:
        index_prefix = '/'.join([args['bowtie-db'], 'genomes'])
    else:
        index_prefix = '/'.join([args['outdir'], 'snps/temp/genomes'])
    fragments.append('-x %s ' % index_prefix)
    if args['max_reads']:
        fragments.append('-u %s ' % args['max_reads'])  # max num of reads
    if args['trim']:
        fragments.append('--trim3 %s ' % args['trim'])  # trim 3'
    # The preset name is completed with "-local" in local mode, else a blank.
    fragments.append('--%s' % args['speed'])
    if args['mode'] == 'local':
        fragments.append('-local ')
    else:
        fragments.append(' ')
    fragments.append('--threads %s ' % args['threads'])
    fragments.append('-f ' if args['file_type'] == 'fasta' else '-q ')
    if args['m2']:  # -1 and -2 contain paired reads
        fragments.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    elif args['interleaved']:  # -1 contains paired reads
        fragments.append('--interleaved %s ' % args['m1'])
    else:  # -1 contains unpaired reads
        fragments.append('-U %s ' % args['m1'])
    # samtools stages: convert to bam, then sort into bam_path
    fragments.append('| %s view -b - ' % args['samtools'])
    fragments.append('--threads %s ' % args['threads'])
    fragments.append('| %s sort - ' % args['samtools'])
    fragments.append('--threads %s ' % args['threads'])
    fragments.append('-o %s ' % bam_path)
    command = ''.join(fragments)
    # Run command
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Check for errors
    utility.check_exit_code(process, command)
    print("  finished aligning")
    print("  checking bamfile integrity")
    utility.check_bamfile(args, bam_path)
Beispiel #21
0
def map_reads_hsblast(args):
    """Stream reads into ``hs-blastn align`` against the marker database.

    ``python <stream_seqs>`` writes sequences to stdout (its stderr is
    redirected to ``<outdir>/species/temp/read_count.txt``) and the stream is
    piped into hs-blastn, which searches
    ``<db>/marker_genes/<db_type>`` and writes
    ``<outdir>/species/temp/alignments.m8``. The command is logged to
    ``args['log']`` before it is launched.

    Args:
        args: dict read for the keys 'stream_seqs', 'm1', 'm2', 'max_reads',
            'read_length', 'outdir', 'hs-blastn', 'word_size', 'db',
            'db_type', 'threads' and 'log'.
    """
    # stream sequences
    streamer = ['python %s' % args['stream_seqs'],
                '-1 %s' % args['m1']]  # fasta/fastq
    optional = (('m2', '-2 %s'),           # mate
                ('max_reads', '-n %s'),    # number of reads
                ('read_length', '-l %s'))  # read length
    for key, template in optional:
        if args[key]:
            streamer.append(template % args[key])
    # tmpfile to store # of reads, bp sampled
    streamer.append('2> %s/species/temp/read_count.txt' % args['outdir'])
    # hs-blastn
    aligner = [
        '%s align' % args['hs-blastn'],
        '-word_size %s' % args['word_size'],
        '-query /dev/stdin',
        '-db %s/%s/%s' % (args['db'], 'marker_genes', args['db_type']),
        '-outfmt 6',
        '-num_threads %s' % args['threads'],
        '-out %s/species/temp/alignments.m8' % args['outdir'],
        '-evalue 1e-3',
    ]
    command = ' '.join(streamer) + ' | ' + ' '.join(aligner)
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Beispiel #22
0
def build_genome_db(args, genome_clusters):
    """Build FASTA and BT2 database from genome cluster centroids.

    Each ``<args['db']>/genome_clusters/<species_id>/genome.fna.gz`` is
    opened with ``utility.iopen`` and copied line by line into
    ``<args['outdir']>/snps/temp/genomes.fa``; for every header line a
    tab-separated ``sequence id -> species id`` entry is written to
    ``genomes.map``. A species is skipped when ``args['tax_mask']`` is set
    and contains ``fetch_centroid(args, species_id)``. Summary counts are
    printed and ``bowtie2-build`` is run on the combined FASTA.

    Args:
        args: dict read for the keys 'outdir', 'db', 'tax_mask',
            'bowtie2-build' and 'log' (an object with ``write``).
        genome_clusters: iterable of species ids, used as directory names.
    """
    # fasta database
    fasta_path = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    map_path = '/'.join([args['outdir'], 'snps/temp/genomes.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    # Both outputs must be closed (and therefore flushed) before
    # bowtie2-build reads the FASTA below; they used to stay open.
    with open(fasta_path, 'w') as genomes_fasta:
        with open(map_path, 'w') as genomes_map:
            for species_id in genome_clusters:
                if args['tax_mask'] and fetch_centroid(
                        args, species_id) in args['tax_mask']:
                    continue
                db_stats['genome_clusters'] += 1
                inpath = '/'.join(
                    [args['db'], 'genome_clusters', species_id, 'genome.fna.gz'])
                infile = utility.iopen(inpath)
                try:
                    for line in infile:
                        genomes_fasta.write(line)
                        if line[0] == '>':
                            # first whitespace-delimited token after '>'
                            sid = line.rstrip().lstrip('>').split()[0]
                            genomes_map.write(sid + '\t' + species_id + '\n')
                            db_stats['total_seqs'] += 1
                        else:
                            # only sequence lines count towards base pairs;
                            # header text used to be included in the total
                            db_stats['total_length'] += len(line.rstrip())
                finally:
                    # previously one input handle per species was left open
                    infile.close()
    # print out database stats
    print("  total genomes: %s" % db_stats['genome_clusters'])
    print("  total contigs: %s" % db_stats['total_seqs'])
    print("  total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    outpath = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = ' '.join([args['bowtie2-build'], fasta_path, outpath])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)