def pileup(args):
    """ Filter alignments, pile them up with samtools, and write a VCF file

    One shell pipeline is assembled and executed:
      1. the script at args['stream_bam'] is run with python on
         <outdir>/genomes.bam with output sent to /dev/stdout; it is also
         given args['mapid'] and args['readq'] (percent id filtering)
      2. that stream is piped into '<samtools> mpileup', which reads its
         alignments from stdin ('-') and uses <outdir>/db/genomes.fa as
         the reference
      3. the result is redirected to <outdir>/genomes.vcf

    Keys read from args: stream_bam, outdir, mapid, readq, samtools, baq,
    redo_baq, adjust_mq, mapq, baseq.

    The finished process is handed to utility.check_exit_code together
    with the command string.
    """
    # Each fragment carries its own trailing space, so a plain join
    # yields the final command line
    fragments = [
        # percent id filtering
        'python %s %s %s %s %s | ' % (
            args['stream_bam'],
            '%s/genomes.bam' % args['outdir'],
            '/dev/stdout',
            args['mapid'],
            args['readq']),
        # mpileup
        '%s mpileup -uv -A -d 10000 --skip-indels ' % args['samtools'],
    ]
    # quality filtering: '-B' is added only when args['baq'] is falsy
    if not args['baq']:
        fragments.append('-B ')
    # quality filtering: '-E' is added when args['redo_baq'] is truthy
    if args['redo_baq']:
        fragments.append('-E ')
    # adjust MQ
    if args['adjust_mq']:
        fragments.append('-C 50 ')
    # quality filtering thresholds
    fragments.append('-q %s -Q %s ' % (args['mapq'], args['baseq']))
    # reference fna file
    fragments.append('-f %s ' % ('%s/db/genomes.fa' % args['outdir']))
    # input bam file comes from stdin
    fragments.append('- ')
    # output vcf file
    fragments.append('> %s ' % ('%s/genomes.vcf' % args['outdir']))
    command = ''.join(fragments)

    # Run command and check for errors
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def map_reads_hsblast(args):
    """ Use hs-blastn to map reads in fasta file to marker database

    The script at args['stream_seqs'] is run with python to stream the
    reads, and its stdout is piped into '<hs-blastn> align', which reads
    the query from /dev/stdin and writes its results to '<out>.m8'.

    Keys read from args: stream_seqs, m1, m2, max_reads, read_length, out,
    hs-blastn, word_size, db, db_type, threads.

    NOTE(review): a second function with this same name is defined later
    in this file; at import time the later definition replaces this one.
    """
    # stream sequences
    pieces = ['python %s' % args['stream_seqs']]
    pieces.append(' -1 %s' % args['m1'])  # fasta/fastq
    if args['m2']:
        pieces.append(' -2 %s' % args['m2'])  # mate
    if args['max_reads']:
        pieces.append(' -n %s' % args['max_reads'])  # number of reads
    if args['read_length']:
        pieces.append(' -l %s' % args['read_length'])  # read length
    # stderr of the streaming script goes to a tmpfile that stores
    # the # of reads, bp sampled
    pieces.append(' 2> %s.read_count' % args['out'])

    # hs-blastn
    pieces.append(' | %s align' % args['hs-blastn'])
    pieces.append(' -word_size %s' % args['word_size'])
    pieces.append(' -query /dev/stdin')
    pieces.append(
        ' -db %s/%s/%s' % (args['db'], 'marker_genes', args['db_type']))
    pieces.append(' -outfmt 6')
    pieces.append(' -num_threads %s' % args['threads'])
    pieces.append(' -out %s.m8' % args['out'])
    pieces.append(' -evalue 1e-3')
    command = ''.join(pieces)

    # Run command and check for errors
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def pangenome_align(args):
    """ Use Bowtie2 to map reads to all specified genome clusters

    Runs '<bowtie2> --no-unal' against the index at
    <outdir>/db/pangenomes, pipes the alignments through
    '<samtools> view -b -' and redirects the result to
    <outdir>/pangenome.bam (unsorted).

    Keys read from args: bowtie2, outdir, max_reads, trim, speed, threads,
    file_type, m1, m2, samtools.

    After the run, the exit status is checked with
    utility.check_exit_code and the bam file with utility.check_bamfile.
    """
    # Build command; fragments are concatenated without a separator
    fragments = ['%s --no-unal ' % args['bowtie2']]
    # index
    fragments.append('-x %s ' % '/'.join([args['outdir'], 'db', 'pangenomes']))
    # specify reads
    if args['max_reads']:
        fragments.append('-u %s ' % args['max_reads'])
    # trim reads
    if args['trim']:
        fragments.append('--trim3 %s ' % args['trim'])
    # speed/sensitivity
    fragments.append('--%s-local ' % args['speed'])
    # threads
    fragments.append('--threads %s ' % args['threads'])
    # file type
    fragments.append('-f ' if args['file_type'] == 'fasta' else '-q ')
    # input file: paired when both m1 and m2 are given, unpaired otherwise
    if args['m1'] and args['m2']:
        fragments.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        # no trailing space here; the '|' that follows still ends the word
        fragments.append('-U %s' % args['m1'])
    # output unsorted bam
    bampath = '/'.join([args['outdir'], 'pangenome.bam'])
    fragments.append('| %s view -b - > %s' % (args['samtools'], bampath))
    command = ''.join(fragments)

    # Run command
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # Check for errors
    utility.check_exit_code(process, command)
    utility.check_bamfile(args, bampath)
def build_pangenome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from pangene cluster centroids

    For every cluster id in genome_clusters, the file
    <db>/genome_clusters/<cluster_id>/pangenome.fa.gz is parsed as FASTA.
    Each record is written to <outdir>/db/pangenomes.fa and mapped to its
    cluster in <outdir>/db/pangenome.map (tab-separated: record id,
    cluster id), unless args['tax_mask'] is set and contains the genome id
    (the first two '.'-separated fields of the record id).
    Summary counts are printed and args['bowtie2-build'] is then run on
    the combined FASTA to create the index <outdir>/db/pangenomes.

    Keys read from args: outdir, db, tax_mask, bowtie2-build.

    The <outdir>/db directory is created if missing. The exit status of
    bowtie2-build is checked via utility.check_exit_code.
    """
    import Bio.SeqIO

    # fasta database
    outdir = '/'.join([args['outdir'], 'db'])
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    fasta_path = '/'.join([args['outdir'], 'db/pangenomes.fa'])
    map_path = '/'.join([args['outdir'], 'db/pangenome.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}

    # Context managers guarantee that both outputs are flushed and closed
    # before bowtie2-build runs, and also when an exception interrupts the
    # loop; each gzip input is closed as soon as it has been consumed.
    with open(fasta_path, 'w') as pangenome_fasta, \
            open(map_path, 'w') as pangenome_map:
        for cluster_id in genome_clusters:
            db_stats['genome_clusters'] += 1
            inpath = '/'.join([
                args['db'], 'genome_clusters', cluster_id, 'pangenome.fa.gz'
            ])
            # NOTE(review): gzip.open defaults to binary mode; confirm this
            # matches what Bio.SeqIO expects on the target interpreter.
            with gzip.open(inpath) as infile:
                for r in Bio.SeqIO.parse(infile, 'fasta'):
                    genome_id = '.'.join(r.id.split('.')[0:2])
                    # skip records whose genome is listed in the mask
                    if args['tax_mask'] and genome_id in args['tax_mask']:
                        continue
                    pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq)))
                    pangenome_map.write('%s\t%s\n' % (r.id, cluster_id))
                    db_stats['total_length'] += len(r.seq)
                    db_stats['total_seqs'] += 1

    # print out database stats
    print(" total genome-clusters: %s" % db_stats['genome_clusters'])
    print(" total genes: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])

    # bowtie2 database
    inpath = '/'.join([args['outdir'], 'db/pangenomes.fa'])
    outpath = '/'.join([args['outdir'], 'db/pangenomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def map_reads_hsblast(args):
    """ Use hs-blastn to map reads in fasta file to marker database

    Builds and runs a two-stage shell pipeline: the script at
    args['stream_seqs'] (run with python) streams the reads to stdout,
    and '<hs-blastn> align' reads them from /dev/stdin and writes its
    output to '<out>.m8'.

    Keys read from args: stream_seqs, m1, m2, max_reads, read_length, out,
    hs-blastn, word_size, db, db_type, threads.

    NOTE(review): an earlier function with this same name exists in this
    file; being the later definition, this is the one that stays bound to
    the name.
    """
    # stream sequences; -1 is the fasta/fastq input
    command = 'python %s' % args['stream_seqs'] + ' -1 %s' % args['m1']
    # optional switches, added only when the matching value is truthy:
    # mate file, number of reads, read length
    optional_switches = (
        ('-2', 'm2'),
        ('-n', 'max_reads'),
        ('-l', 'read_length'),
    )
    for switch, key in optional_switches:
        if args[key]:
            command += ' %s %s' % (switch, args[key])
    # tmpfile to store # of reads, bp sampled
    command += ' 2> %s.read_count' % args['out']

    # hs-blastn
    database = '%s/%s/%s' % (args['db'], 'marker_genes', args['db_type'])
    command += ''.join([
        ' | %s align' % args['hs-blastn'],
        ' -word_size %s' % args['word_size'],
        ' -query /dev/stdin',
        ' -db %s' % database,
        ' -outfmt 6',
        ' -num_threads %s' % args['threads'],
        ' -out %s.m8' % args['out'],
        ' -evalue 1e-3',
    ])

    # Run command and check for errors
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def build_genome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from genome cluster centroids

    For every cluster id in genome_clusters, the lines of
    <db>/genome_clusters/<cluster_id>/representative.fna.gz are copied
    into <outdir>/db/genomes.fa, and each header line ('>' first) adds a
    tab-separated '<sequence id>\\t<cluster id>' row to
    <outdir>/db/genomes.map. A cluster is skipped when args['tax_mask'] is
    set and contains the value returned by fetch_centroid(args, cluster_id).
    Summary counts are printed and args['bowtie2-build'] is then run on
    the combined FASTA to create the index <outdir>/db/genomes.

    Keys read from args: outdir, db, tax_mask, bowtie2-build.

    The <outdir>/db directory is created if missing. The exit status of
    bowtie2-build is checked via utility.check_exit_code.
    """
    # fasta database
    outdir = '/'.join([args['outdir'], 'db'])
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    fasta_path = '/'.join([args['outdir'], 'db', 'genomes.fa'])
    map_path = '/'.join([args['outdir'], 'db', 'genomes.map'])
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}

    # Both outputs must be flushed and closed BEFORE bowtie2-build reads
    # genomes.fa; leaving them open can hand the indexer a truncated file.
    with open(fasta_path, 'w') as genomes_fasta, \
            open(map_path, 'w') as genomes_map:
        for cluster_id in genome_clusters:
            if (args['tax_mask']
                    and fetch_centroid(args, cluster_id) in args['tax_mask']):
                continue
            db_stats['genome_clusters'] += 1
            inpath = '/'.join([
                args['db'], 'genome_clusters', cluster_id,
                'representative.fna.gz'
            ])
            # NOTE(review): gzip.open defaults to binary mode, while the
            # outputs are opened in text mode and lines are compared to
            # '>'; confirm this matches the target interpreter.
            with gzip.open(inpath) as infile:
                for line in infile:
                    genomes_fasta.write(line)
                    if line[0] == '>':
                        # sequence id = first whitespace-delimited token
                        # of the header, without the leading '>'
                        sid = line.rstrip().lstrip('>').split()[0]
                        genomes_map.write(sid + '\t' + cluster_id + '\n')
                        db_stats['total_seqs'] += 1
                    else:
                        # only sequence lines count towards base-pairs;
                        # header text is not sequence
                        db_stats['total_length'] += len(line.rstrip())

    # print out database stats
    print(" total genomes: %s" % db_stats['genome_clusters'])
    print(" total contigs: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])

    # bowtie2 database
    inpath = '/'.join([args['outdir'], 'db', 'genomes.fa'])
    outpath = '/'.join([args['outdir'], 'db', 'genomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    utility.check_exit_code(process, command)
def genome_align(args):
    """ Use Bowtie2 to map reads to representative genomes from each genome cluster

    Runs '<bowtie2> --no-unal' against the index at <outdir>/db/genomes,
    pipes the alignments through '<samtools> view -b -' and then through
    '<samtools> sort -f -', which is given <outdir>/genomes.bam as its
    output argument.

    Keys read from args: bowtie2, outdir, max_reads, trim, speed, threads,
    file_type, m1, m2, samtools.

    After the run, the exit status is checked with
    utility.check_exit_code and the bam file with utility.check_bamfile.
    """
    # Build command; fragments are concatenated without a separator
    # bowtie2
    fragments = ['%s --no-unal ' % args['bowtie2']]
    # index
    fragments.append('-x %s ' % '/'.join([args['outdir'], 'db', 'genomes']))
    # specify reads
    if args['max_reads']:
        fragments.append('-u %s ' % args['max_reads'])
    # trim reads
    if args['trim']:
        fragments.append('--trim3 %s ' % args['trim'])
    # speed/sensitivity
    fragments.append('--%s ' % args['speed'])
    # threads
    fragments.append('--threads %s ' % args['threads'])
    # file type
    fragments.append('-f ' if args['file_type'] == 'fasta' else '-q ')
    # input file: paired when both m1 and m2 are given, unpaired otherwise
    if args['m1'] and args['m2']:
        fragments.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        # no trailing space here; the '|' that follows still ends the word
        fragments.append('-U %s' % args['m1'])
    # convert to bam
    fragments.append('| %s view -b - ' % args['samtools'])
    # sort bam
    bam_path = os.path.join(args['outdir'], 'genomes.bam')
    fragments.append('| %s sort -f - %s ' % (args['samtools'], bam_path))
    command = ''.join(fragments)

    # Run command
    process = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # Check for errors
    utility.check_exit_code(process, command)
    utility.check_bamfile(args, bam_path)