Exemple #1
0
def pileup(args):
    """Filter alignments by % id, pile them up with samtools, and write a VCF.

    A single shell pipeline is assembled and executed:

    1. ``python <stream_bam> <outdir>/genomes.bam /dev/stdout <mapid> <readq>``
    2. ``<samtools> mpileup`` reading that stream on stdin (``-``) against
       ``<outdir>/db/genomes.fa``
    3. the mpileup output redirected into ``<outdir>/genomes.vcf``

    Keys read from ``args``: ``stream_bam``, ``outdir``, ``mapid``, ``readq``,
    ``samtools``, ``baq``, ``redo_baq``, ``adjust_mq``, ``mapq``, ``baseq``.

    The started process and the command string are handed to
    ``utility.check_exit_code``; nothing is returned.
    """
    # Stage 1: percent-identity / read-quality filter script, BAM in, stdout out
    stream_stage = 'python %s %s %s %s %s | ' % (
        args['stream_bam'],
        '%s/genomes.bam' % args['outdir'],
        '/dev/stdout',
        args['mapid'],
        args['readq'],
    )

    # Stage 2: samtools mpileup and its switches, collected in order
    fragments = [stream_stage]
    fragments.append(
        '%s mpileup -uv -A -d 10000 --skip-indels ' % args['samtools'])
    # -B is only added when BAQ was not requested
    if not args['baq']:
        fragments.append('-B ')
    # -E is added when BAQ should be recomputed
    if args['redo_baq']:
        fragments.append('-E ')
    # -C 50 is added when mapping quality should be adjusted
    if args['adjust_mq']:
        fragments.append('-C 50 ')
    # minimum mapping quality (-q) and base quality (-Q)
    fragments.append('-q %s -Q %s ' % (args['mapq'], args['baseq']))
    # reference FASTA
    fragments.append('-f %s ' % ('%s/db/genomes.fa' % args['outdir']))
    # BAM arrives on stdin from stage 1
    fragments.append('- ')
    # Stage 3: redirect the VCF to disk
    fragments.append('> %s ' % ('%s/genomes.vcf' % args['outdir']))
    command = ''.join(fragments)

    # Run the pipeline through the shell (it relies on '|' and '>')
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    utility.check_exit_code(process, command)
Exemple #2
0
def map_reads_hsblast(args):
    """Map reads to the marker-gene database with hs-blastn.

    Builds and runs one shell pipeline: the ``args['stream_seqs']`` script
    emits sequences on stdout (its stderr is redirected to
    ``<out>.read_count``), and ``<hs-blastn> align`` consumes them from
    ``/dev/stdin``, writing tabular (``-outfmt 6``) hits to ``<out>.m8``.

    Keys read from ``args``: ``stream_seqs``, ``m1``, ``m2``, ``max_reads``,
    ``read_length``, ``out``, ``hs-blastn``, ``word_size``, ``db``,
    ``db_type``, ``threads``.

    The process and command string are passed to
    ``utility.check_exit_code``; nothing is returned.
    """
    # --- sequence streamer ---
    pieces = [
        'python %s' % args['stream_seqs'],
        ' -1 %s' % args['m1'],  # fasta/fastq
    ]
    if args['m2']:
        pieces.append(' -2 %s' % args['m2'])  # mate
    if args['max_reads']:
        pieces.append(' -n %s' % args['max_reads'])  # number of reads
    if args['read_length']:
        pieces.append(' -l %s' % args['read_length'])  # read length
    # the streamer's stderr goes to a temp file holding # of reads, bp sampled
    pieces.append(' 2> %s.read_count' % args['out'])

    # --- hs-blastn aligner ---
    db_prefix = ' -db %s/%s/%s' % (args['db'], 'marker_genes', args['db_type'])
    pieces.extend([
        ' | %s align' % args['hs-blastn'],
        ' -word_size %s' % args['word_size'],
        ' -query /dev/stdin',
        db_prefix,
        ' -outfmt 6',
        ' -num_threads %s' % args['threads'],
        ' -out %s.m8' % args['out'],
        ' -evalue 1e-3',
    ])
    command = ''.join(pieces)

    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    utility.check_exit_code(process, command)
Exemple #3
0
def pangenome_align(args):
    """Align reads to the pangenome Bowtie2 index and store an unsorted BAM.

    Runs ``<bowtie2> ... | <samtools> view -b - > <outdir>/pangenome.bam``
    through the shell, using the index prefix ``<outdir>/db/pangenomes``.

    Keys read from ``args``: ``bowtie2``, ``outdir``, ``max_reads``, ``trim``,
    ``speed``, ``threads``, ``file_type``, ``m1``, ``m2``, ``samtools``.

    After the process has been checked by ``utility.check_exit_code``, the
    BAM path is passed to ``utility.check_bamfile``. Nothing is returned.
    """
    index_prefix = '/'.join([args['outdir'], 'db', 'pangenomes'])
    tokens = [
        '%s --no-unal ' % args['bowtie2'],
        '-x %s ' % index_prefix,
    ]
    # optional cap on the number of reads
    if args['max_reads']:
        tokens.append('-u %s ' % args['max_reads'])
    # optional 3' trimming
    if args['trim']:
        tokens.append('--trim3 %s ' % args['trim'])
    # speed/sensitivity preset (local alignment mode) and thread count
    tokens.append('--%s-local ' % args['speed'])
    tokens.append('--threads %s ' % args['threads'])
    # input format: FASTA when requested, FASTQ otherwise
    tokens.append('-f ' if args['file_type'] == 'fasta' else '-q ')
    # paired-end when both mates are given, otherwise unpaired
    if args['m1'] and args['m2']:
        tokens.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        tokens.append('-U %s' % args['m1'])
    # convert the alignment stream to an unsorted BAM on disk
    bampath = '/'.join([args['outdir'], 'pangenome.bam'])
    tokens.append('| %s view -b - > %s' % (args['samtools'], bampath))
    command = ''.join(tokens)

    # Run through the shell: the command uses '|' and '>'
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Check for errors
    utility.check_exit_code(process, command)
    utility.check_bamfile(args, bampath)
Exemple #4
0
def build_pangenome_db(args, genome_clusters):
    """Build FASTA and BT2 database from pangene cluster centroids.

    For every cluster id in ``genome_clusters`` the gzipped FASTA
    ``<db>/genome_clusters/<cluster_id>/pangenome.fa.gz`` is parsed and each
    record is copied to ``<outdir>/db/pangenomes.fa``, with a matching
    ``<record id>\\t<cluster_id>`` line in ``<outdir>/db/pangenome.map``.
    A record is skipped when ``args['tax_mask']`` is truthy and contains the
    genome id (the first two dot-separated fields of the record id).
    Summary counts are printed, then ``args['bowtie2-build']`` is run on the
    FASTA to create the index prefix ``<outdir>/db/pangenomes``.

    Args:
        args: dict providing ``outdir``, ``db``, ``tax_mask`` and
            ``bowtie2-build``.
        genome_clusters: iterable of cluster ids (used as directory names).

    Returns:
        None. The bowtie2-build process and command string are passed to
        ``utility.check_exit_code``.
    """
    import Bio.SeqIO
    # fasta database
    outdir = '/'.join([args['outdir'], 'db'])
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    fasta_path = '/'.join([outdir, 'pangenomes.fa'])
    map_path = '/'.join([outdir, 'pangenome.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    # 'with' guarantees both outputs are flushed and closed before
    # bowtie2-build reads the FASTA, even if parsing raises part-way through.
    with open(fasta_path, 'w') as pangenome_fasta, \
            open(map_path, 'w') as pangenome_map:
        for cluster_id in genome_clusters:
            db_stats['genome_clusters'] += 1
            inpath = '/'.join(
                [args['db'], 'genome_clusters', cluster_id, 'pangenome.fa.gz'])
            infile = gzip.open(inpath)
            try:
                for r in Bio.SeqIO.parse(infile, 'fasta'):
                    genome_id = '.'.join(r.id.split('.')[0:2])
                    if not args['tax_mask'] or genome_id not in args['tax_mask']:
                        pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq)))
                        pangenome_map.write('%s\t%s\n' % (r.id, cluster_id))
                        db_stats['total_length'] += len(r.seq)
                        db_stats['total_seqs'] += 1
            finally:
                # release the per-cluster gzip handle instead of leaking it
                infile.close()
    # print out database stats
    print("  total genome-clusters: %s" % db_stats['genome_clusters'])
    print("  total genes: %s" % db_stats['total_seqs'])
    print("  total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    outpath = '/'.join([outdir, 'pangenomes'])
    command = ' '.join([args['bowtie2-build'], fasta_path, outpath])
    process = subprocess.Popen(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Exemple #5
0
def map_reads_hsblast(args):
    """Use hs-blastn to map reads from a FASTA/FASTQ file to the marker database.

    The command is a two-stage shell pipeline. The first stage runs the
    ``args['stream_seqs']`` script and redirects its stderr to
    ``<out>.read_count``. The second stage runs ``<hs-blastn> align`` with
    ``-query /dev/stdin`` against ``<db>/marker_genes/<db_type>`` and writes
    ``<out>.m8``.

    Keys read from ``args``: ``stream_seqs``, ``m1``, ``m2``, ``max_reads``,
    ``read_length``, ``out``, ``hs-blastn``, ``word_size``, ``db``,
    ``db_type``, ``threads``.

    Nothing is returned; the process is passed to
    ``utility.check_exit_code`` together with the command string.
    """
    # stream sequences: mandatory first-read file, then optional switches
    streamer = 'python %s' % args['stream_seqs']
    streamer += ' -1 %s' % args['m1']  # fasta/fastq
    optional_switches = (
        ('m2', ' -2 %s'),           # mate
        ('max_reads', ' -n %s'),    # number of reads
        ('read_length', ' -l %s'),  # read length
    )
    for key, template in optional_switches:
        if args[key]:
            streamer += template % args[key]
    # tmpfile to store # of reads, bp sampled
    streamer += ' 2> %s.read_count' % args['out']

    # hs-blastn
    aligner = ''.join([
        ' | %s align' % args['hs-blastn'],
        ' -word_size %s' % args['word_size'],
        ' -query /dev/stdin',
        ' -db %s/%s/%s' % (args['db'], 'marker_genes', args['db_type']),
        ' -outfmt 6',
        ' -num_threads %s' % args['threads'],
        ' -out %s.m8' % args['out'],
        ' -evalue 1e-3',
    ])

    command = streamer + aligner
    process = subprocess.Popen(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Exemple #6
0
def build_genome_db(args, genome_clusters):
    """Build FASTA and BT2 database from genome cluster centroids.

    For every cluster id in ``genome_clusters`` the gzipped FASTA
    ``<db>/genome_clusters/<cluster_id>/representative.fna.gz`` is copied
    line by line into ``<outdir>/db/genomes.fa``. For each ``>`` header a
    ``<sequence id>\\t<cluster_id>`` line is written to
    ``<outdir>/db/genomes.map``. A cluster is skipped when
    ``args['tax_mask']`` is truthy and contains
    ``fetch_centroid(args, cluster_id)``. Summary counts are printed, then
    ``args['bowtie2-build']`` is run on the FASTA to create the index prefix
    ``<outdir>/db/genomes``.

    Args:
        args: dict providing ``outdir``, ``db``, ``tax_mask`` and
            ``bowtie2-build``.
        genome_clusters: iterable of cluster ids (used as directory names).

    Returns:
        None. The bowtie2-build process and command string are passed to
        ``utility.check_exit_code``.
    """
    # fasta database
    outdir = '/'.join([args['outdir'], 'db'])
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    fasta_path = '/'.join([outdir, 'genomes.fa'])
    map_path = '/'.join([outdir, 'genomes.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    # The outputs must be flushed and closed before bowtie2-build reads
    # genomes.fa; otherwise buffered data may still be missing from disk.
    with open(fasta_path, 'w') as genomes_fasta, \
            open(map_path, 'w') as genomes_map:
        for cluster_id in genome_clusters:
            if (args['tax_mask']
                    and fetch_centroid(args, cluster_id) in args['tax_mask']):
                continue
            db_stats['genome_clusters'] += 1
            inpath = '/'.join([
                args['db'], 'genome_clusters', cluster_id,
                'representative.fna.gz'
            ])
            # NOTE(review): gzip.open() without a mode yields bytes on
            # Python 3; the str comparison and text-mode write below assume
            # str lines (Python 2) -- confirm the target interpreter.
            infile = gzip.open(inpath)
            try:
                for line in infile:
                    genomes_fasta.write(line)
                    if line[0] == '>':
                        sid = line.rstrip().lstrip('>').split()[0]
                        genomes_map.write(sid + '\t' + cluster_id + '\n')
                        db_stats['total_seqs'] += 1
                    else:
                        # only sequence lines contribute to the base-pair
                        # total; header text is not sequence
                        db_stats['total_length'] += len(line.rstrip())
            finally:
                # release the per-cluster gzip handle instead of leaking it
                infile.close()
    # print out database stats
    print("  total genomes: %s" % db_stats['genome_clusters'])
    print("  total contigs: %s" % db_stats['total_seqs'])
    print("  total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    outpath = '/'.join([outdir, 'genomes'])
    command = ' '.join([args['bowtie2-build'], fasta_path, outpath])
    process = subprocess.Popen(command,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
Exemple #7
0
def genome_align(args):
    """Map reads with Bowtie2 to the representative genome of each cluster.

    Runs, through the shell,
    ``<bowtie2> ... | <samtools> view -b - | <samtools> sort -f - <bam>``
    where the index prefix is ``<outdir>/db/genomes`` and ``<bam>`` is
    ``<outdir>/genomes.bam``.

    Keys read from ``args``: ``bowtie2``, ``outdir``, ``max_reads``, ``trim``,
    ``speed``, ``threads``, ``file_type``, ``m1``, ``m2``, ``samtools``.

    After the process has been checked by ``utility.check_exit_code``, the
    BAM path is passed to ``utility.check_bamfile``. Nothing is returned.
    """
    index_prefix = '/'.join([args['outdir'], 'db', 'genomes'])
    segments = [
        '%s --no-unal ' % args['bowtie2'],  # bowtie2 executable
        '-x %s ' % index_prefix,            # index
    ]
    # optional cap on the number of reads
    if args['max_reads']:
        segments.append('-u %s ' % args['max_reads'])
    # optional 3' trimming
    if args['trim']:
        segments.append('--trim3 %s ' % args['trim'])
    # speed/sensitivity preset, then thread count
    segments.append('--%s ' % args['speed'])
    segments.append('--threads %s ' % args['threads'])
    # input format: FASTA when requested, FASTQ otherwise
    segments.append('-f ' if args['file_type'] == 'fasta' else '-q ')
    # paired-end when both mates are given, otherwise unpaired
    paired = args['m1'] and args['m2']
    if paired:
        segments.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        segments.append('-U %s' % args['m1'])
    # convert to bam
    segments.append('| %s view -b - ' % args['samtools'])
    # sort bam
    bam_path = os.path.join(args['outdir'], 'genomes.bam')
    segments.append('| %s sort -f - %s ' % (args['samtools'], bam_path))
    command = ''.join(segments)

    # Run command
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # Check for errors
    utility.check_exit_code(process, command)
    utility.check_bamfile(args, bam_path)