def build_pangenome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from pangene cluster centroids

    Concatenates <db>/genome_clusters/<species_id>/pangenome.fa.gz for every
    species id in `genome_clusters` into <outdir>/genes/temp/pangenomes.fa,
    writes a record-id -> species-id table to <outdir>/genes/temp/pangenome.map,
    prints summary counts and finally runs bowtie2-build on the combined FASTA.

    Args:
        args: dict of run settings. Keys read here: 'outdir', 'db',
            'tax_mask', 'bowtie2-build' and 'log' (object with a write()).
        genome_clusters: iterable of species ids to include.
    """
    import Bio.SeqIO
    # fasta database
    outdir = '/'.join([args['outdir'], 'genes/temp'])
    db_stats = {'total_length':0, 'total_seqs':0, 'genome_clusters':0}
    # `with` guarantees both outputs are flushed and closed before
    # bowtie2-build reads them, even if parsing raises part-way through
    with open('/'.join([outdir, 'pangenomes.fa']), 'w') as pangenome_fasta, \
            open('/'.join([outdir, 'pangenome.map']), 'w') as pangenome_map:
        for species_id in genome_clusters:
            db_stats['genome_clusters'] += 1
            inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'pangenome.fa.gz'])
            infile = utility.iopen(inpath)
            try:
                for r in Bio.SeqIO.parse(infile, 'fasta'):
                    # genome id = first two '.'-separated fields of the record id
                    genome_id = '.'.join(r.id.split('.')[0:2])
                    # records from masked genomes are left out of the database
                    if not args['tax_mask'] or genome_id not in args['tax_mask']:
                        pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq)))
                        pangenome_map.write('%s\t%s\n' % (r.id, species_id))
                        db_stats['total_length'] += len(r.seq)
                        db_stats['total_seqs'] += 1
            finally:
                # one handle per species: release it instead of leaking it
                infile.close()
    # print out database stats
    print(" total species: %s" % db_stats['genome_clusters'])
    print(" total genes: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([outdir, 'pangenomes.fa'])
    outpath = '/'.join([outdir, 'pangenomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: '+command+'\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(process, command)
def pangenome_align(args):
    """ Map reads to the pangenome Bowtie2 index and store an unsorted BAM

    The shell pipeline is assembled piece by piece from `args`, logged to
    args['log'], executed, and the resulting BAM handed to
    utility.check_bamfile.
    """
    index_prefix = '/'.join([args['outdir'], 'genes/temp/pangenomes'])
    pieces = ['%s --no-unal ' % args['bowtie2'],
              '-x %s ' % index_prefix]
    # optional cap on the number of reads
    if args['max_reads']:
        pieces.append('-u %s ' % args['max_reads'])
    # optional 3' trimming
    if args['trim']:
        pieces.append('--trim3 %s ' % args['trim'])
    # local-alignment preset and thread count
    pieces.append('--%s-local ' % args['speed'])
    pieces.append('--threads %s ' % args['threads'])
    # input format flag
    if args['file_type'] == 'fasta':
        pieces.append('-f ')
    else:
        pieces.append('-q ')
    # paired input when both mates are given, single-end otherwise
    paired = args['m1'] and args['m2']
    if paired:
        pieces.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        pieces.append('-U %s' % args['m1'])
    # convert SAM on stdout to an unsorted BAM file
    bampath = '/'.join([args['outdir'], 'genes/temp/pangenome.bam'])
    pieces.append('| %s view -b - > %s' % (args['samtools'], bampath))
    cmd = ''.join(pieces)
    # record, then launch
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # validation is delegated to the project's utility helpers
    utility.check_exit_code(proc, cmd)
    utility.check_bamfile(args, bampath)
def build_pangenome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from pangene cluster centroids

    Concatenates <db>/genome_clusters/<species_id>/pangenome.fa.gz for every
    species id in `genome_clusters` into <outdir>/genes/temp/pangenomes.fa,
    writes a record-id -> species-id table to <outdir>/genes/temp/pangenome.map,
    prints summary counts and finally runs bowtie2-build on the combined FASTA.

    Args:
        args: dict of run settings. Keys read here: 'outdir', 'db',
            'tax_mask', 'bowtie2-build' and 'log' (object with a write()).
        genome_clusters: iterable of species ids to include.
    """
    import Bio.SeqIO
    # fasta database
    outdir = '/'.join([args['outdir'], 'genes/temp'])
    db_stats = {'total_length':0, 'total_seqs':0, 'genome_clusters':0}
    # `with` guarantees both outputs are flushed and closed before
    # bowtie2-build reads them, even if parsing raises part-way through
    with open('/'.join([outdir, 'pangenomes.fa']), 'w') as pangenome_fasta, \
            open('/'.join([outdir, 'pangenome.map']), 'w') as pangenome_map:
        for species_id in genome_clusters:
            db_stats['genome_clusters'] += 1
            inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'pangenome.fa.gz'])
            # gzip.open defaults to binary mode; the FASTA parser needs text,
            # so request 'rt' explicitly (and close the handle per species)
            with gzip.open(inpath, 'rt') as infile:
                for r in Bio.SeqIO.parse(infile, 'fasta'):
                    # genome id = first two '.'-separated fields of the record id
                    genome_id = '.'.join(r.id.split('.')[0:2])
                    # records from masked genomes are left out of the database
                    if not args['tax_mask'] or genome_id not in args['tax_mask']:
                        pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq)))
                        pangenome_map.write('%s\t%s\n' % (r.id, species_id))
                        db_stats['total_length'] += len(r.seq)
                        db_stats['total_seqs'] += 1
    # print out database stats
    print(" total species: %s" % db_stats['genome_clusters'])
    print(" total genes: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([outdir, 'pangenomes.fa'])
    outpath = '/'.join([outdir, 'pangenomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: '+command+'\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(process, command)
def pileup(args):
    """ Stream filtered alignments into samtools mpileup and save a VCF

    Alignments from <outdir>/snps/temp/genomes.bam go through the
    args['stream_bam'] script (given the mapid and readq thresholds) and are
    piped to `samtools mpileup`, whose output is redirected to
    <outdir>/snps/temp/genomes.vcf.
    """
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    # stage 1: alignment filter script writing to stdout
    stream_stage = 'python %s %s %s %s %s | ' % (
        args['stream_bam'], bam_path, '/dev/stdout', args['mapid'], args['readq'])
    # stage 2: mpileup with its fixed options
    pieces = [stream_stage,
              '%s mpileup -uv -A -d 10000 --skip-indels ' % args['samtools']]
    # optional quality-related switches
    if not args['baq']:
        pieces.append('-B ')
    if args['redo_baq']:
        pieces.append('-E ')
    if args['adjust_mq']:
        pieces.append('-C 50 ')
    # mapping / base quality thresholds
    pieces.append('-q %s -Q %s ' % (args['mapq'], args['baseq']))
    # reference FASTA, BAM from stdin, VCF destination
    pieces.append('-f %s ' % ('%s/snps/temp/genomes.fa' % args['outdir']))
    pieces.append('- ')
    pieces.append('> %s ' % ('%s/snps/temp/genomes.vcf' % args['outdir']))
    cmd = ''.join(pieces)
    # record, launch, and let the utility helper inspect the exit status
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(proc, cmd)
def map_reads_hsblast(args):
    """ Pipe sampled reads into hs-blastn against the marker-gene database

    The args['stream_seqs'] script emits reads on stdout (its stderr is
    redirected to <outdir>/species/temp/read_count.txt) and `hs-blastn align`
    writes tabular hits to <outdir>/species/temp/alignments.m8.
    """
    # --- read streaming stage ---
    pieces = ['python %s' % args['stream_seqs'],
              ' -1 %s' % args['m1']]                      # fasta/fastq
    if args['m2']:
        pieces.append(' -2 %s' % args['m2'])              # mate
    if args['max_reads']:
        pieces.append(' -n %s' % args['max_reads'])       # number of reads
    if args['read_length']:
        pieces.append(' -l %s' % args['read_length'])     # read length
    # tmpfile to store # of reads, bp sampled
    pieces.append(' 2> %s/species/temp/read_count.txt' % args['outdir'])
    # --- hs-blastn stage ---
    pieces.append(' | %s align' % args['hs-blastn'])
    pieces.append(' -word_size %s' % args['word_size'])
    pieces.append(' -query /dev/stdin')
    pieces.append(' -db %s/marker_genes/phyeco.fa' % args['db'])
    pieces.append(' -outfmt 6')
    pieces.append(' -num_threads %s' % args['threads'])
    pieces.append(' -out %s/species/temp/alignments.m8' % args['outdir'])
    pieces.append(' -evalue 1e-3')
    cmd = ''.join(pieces)
    # record, launch, and let the utility helper inspect the exit status
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(proc, cmd)
def build_pangenome_db(args, species):
    """ Build FASTA and BT2 database from pangene species centroids

    Copies every record of each species' 'centroids.ffn' file (upper-cased)
    into <outdir>/genes/temp/pangenomes.fa, writes a record-id -> species-id
    table to <outdir>/genes/temp/pangenomes.map, prints summary counts and
    runs bowtie2-build on the combined FASTA.

    Args:
        args: dict of run settings. Keys read here: 'outdir',
            'bowtie2-build' and 'log' (object with a write()).
        species: mapping whose values expose `.id` and
            `.paths['centroids.ffn']`.
    """
    import Bio.SeqIO
    # fasta database
    outdir = '/'.join([args['outdir'], 'genes/temp'])
    db_stats = {'total_length':0, 'total_seqs':0, 'species':0}
    # `with` closes both outputs even when reading a species file fails
    with open('/'.join([outdir, 'pangenomes.fa']), 'w') as pangenome_fasta, \
            open('/'.join([outdir, 'pangenomes.map']), 'w') as pangenome_map:
        for sp in species.values():
            db_stats['species'] += 1
            infile = utility.iopen(sp.paths['centroids.ffn'])
            try:
                for r in Bio.SeqIO.parse(infile, 'fasta'):
                    pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq).upper()))
                    pangenome_map.write('%s\t%s\n' % (r.id, sp.id))
                    db_stats['total_length'] += len(r.seq)
                    db_stats['total_seqs'] += 1
            finally:
                # release the input handle even if the parser raises
                infile.close()
    # print out database stats
    print(" total species: %s" % db_stats['species'])
    print(" total genes: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([outdir, 'pangenomes.fa'])
    outpath = '/'.join([outdir, 'pangenomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: '+command+'\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(process, command)
def pileup(args):
    """ Stream filtered alignments into samtools mpileup and gzip the result

    Alignments from <outdir>/snps/temp/genomes.bam go through the
    args['stream_bam'] script (given the mapid, readq and mapq thresholds),
    are piped to `samtools mpileup`, and the pileup is compressed into
    <outdir>/snps/temp/genomes.mpileup.gz.
    """
    # --- alignment filter stage ---
    pieces = ['python %s ' % args['stream_bam'],
              '%s ' % os.path.join(args['outdir'], 'snps/temp/genomes.bam'),
              '/dev/stdout ',
              '%s ' % args['mapid'],
              '%s ' % args['readq'],
              '%s ' % args['mapq']]
    # --- mpileup stage ---
    pieces.append('| %s mpileup ' % args['samtools'])
    pieces.append('-d 10000 ')                    # set max depth
    if not args['baq']:
        pieces.append('-B ')                      # BAQ
    if args['adjust_mq']:
        pieces.append('-C 50 ')                   # adjust MQ
    if not args['discard']:
        pieces.append('-A ')                      # keep discordant read pairs
    pieces.append('-Q %s ' % (args['baseq']))     # base quality filtering
    # reference fna file
    pieces.append('-f %s ' % ('%s/snps/temp/genomes.fa' % args['outdir']))
    pieces.append('- ')                           # input bam file
    # output file
    pieces.append('| gzip > %s ' % ('%s/snps/temp/genomes.mpileup.gz' % args['outdir']))
    cmd = ''.join(pieces)
    # record, launch, and let the utility helper inspect the exit status
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(proc, cmd)
def genome_align(args):
    """ Align reads to the representative-genome index and sort into a BAM

    Bowtie2 output is piped through `samtools view -b` and `samtools sort`
    with <outdir>/snps/temp/genomes.bam passed as the sort target; the BAM is
    then handed to utility.check_bamfile.
    """
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    index_prefix = '/'.join([args['outdir'], 'snps/temp/genomes'])
    # --- bowtie2 stage ---
    pieces = ['%s --no-unal ' % args['bowtie2'],
              '-x %s ' % index_prefix]                        # index
    if args['max_reads']:
        pieces.append('-u %s ' % args['max_reads'])           # max num of reads
    if args['trim']:
        pieces.append('--trim3 %s ' % args['trim'])           # trim 3'
    pieces.append('--%s ' % args['speed'])                    # speed/sensitivity
    pieces.append('--threads %s ' % args['threads'])          # threads
    # input type
    if args['file_type'] == 'fasta':
        pieces.append('-f ')
    else:
        pieces.append('-q ')
    # input reads: paired when a second mate file is given
    if args['m2']:
        pieces.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        pieces.append('-U %s ' % args['m1'])
    # --- samtools stages ---
    pieces.append('| %s view -b - ' % args['samtools'])                    # convert to bam
    pieces.append('| %s sort -f - %s ' % (args['samtools'], bam_path))     # sort bam
    cmd = ''.join(pieces)
    # record, then launch
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # validation is delegated to the project's utility helpers
    utility.check_exit_code(proc, cmd)
    print(" finished aligning")
    print(" checking bamfile integrity")
    utility.check_bamfile(args, bam_path)
def pileup(args):
    """ Stream filtered alignments into samtools mpileup and save a VCF

    Alignments from <outdir>/snps/temp/genomes.bam go through the
    args['stream_bam'] script (given the mapid and readq thresholds) and are
    piped to `samtools mpileup`, whose output is redirected to
    <outdir>/snps/temp/genomes.vcf.
    """
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    # stage 1: alignment filter script writing to stdout
    stream_stage = 'python %s %s %s %s %s | ' % (
        args['stream_bam'], bam_path, '/dev/stdout', args['mapid'], args['readq'])
    # stage 2: mpileup with its fixed options
    pieces = [stream_stage,
              '%s mpileup -uv -A -d 10000 --skip-indels ' % args['samtools']]
    # optional quality-related switches
    if not args['baq']:
        pieces.append('-B ')
    if args['redo_baq']:
        pieces.append('-E ')
    if args['adjust_mq']:
        pieces.append('-C 50 ')
    # mapping / base quality thresholds
    pieces.append('-q %s -Q %s ' % (args['mapq'], args['baseq']))
    # reference FASTA, BAM from stdin, VCF destination
    pieces.append('-f %s ' % ('%s/snps/temp/genomes.fa' % args['outdir']))
    pieces.append('- ')
    pieces.append('> %s ' % ('%s/snps/temp/genomes.vcf' % args['outdir']))
    cmd = ''.join(pieces)
    # record, launch, and let the utility helper inspect the exit status
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(proc, cmd)
def build_genome_db(args, species):
    """ Build FASTA and BT2 database of representative genomes

    Copies every record of each species' 'fna' file (upper-cased) into
    <outdir>/snps/temp/genomes.fa, prints summary counts and runs
    bowtie2-build on the combined FASTA.

    Args:
        args: dict of run settings. Keys read here: 'outdir',
            'bowtie2-build' and 'log' (object with a write()).
        species: mapping whose values expose `.paths['fna']`.
    """
    import Bio.SeqIO
    # fasta database
    db_stats = {'total_length': 0, 'total_seqs': 0, 'species': 0}
    # `with` closes the output even when reading a genome file fails
    with open('/'.join([args['outdir'], 'snps/temp/genomes.fa']), 'w') as outfile:
        for sp in species.values():
            db_stats['species'] += 1
            infile = utility.iopen(sp.paths['fna'])
            try:
                for r in Bio.SeqIO.parse(infile, 'fasta'):
                    outfile.write('>%s\n%s\n' % (r.id, str(r.seq).upper()))
                    db_stats['total_length'] += len(r.seq)
                    db_stats['total_seqs'] += 1
            finally:
                # release the input handle even if the parser raises
                infile.close()
    # print out database stats
    print(" total genomes: %s" % db_stats['species'])
    print(" total contigs: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    outpath = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(process, command)
def build_genome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from genome cluster centroids

    Concatenates <db>/genome_clusters/<species_id>/genome.fna.gz for every
    unmasked species id into <outdir>/snps/temp/genomes.fa, writes a
    sequence-id -> species-id table to <outdir>/snps/temp/genomes.map, prints
    summary counts and runs bowtie2-build on the combined FASTA.

    Args:
        args: dict of run settings. Keys read here: 'outdir', 'db',
            'tax_mask', 'bowtie2-build' and 'log' (object with a write()).
        genome_clusters: iterable of species ids; a species is skipped when
            args['tax_mask'] is set and contains fetch_centroid(args, id).
    """
    # fasta database
    db_stats = {'total_length':0, 'total_seqs':0, 'genome_clusters':0}
    # Both outputs must be flushed and closed BEFORE bowtie2-build reads the
    # FASTA; leaving them open can hand the indexer a truncated file.
    with open('/'.join([args['outdir'], 'snps/temp/genomes.fa']), 'w') as genomes_fasta, \
            open('/'.join([args['outdir'], 'snps/temp/genomes.map']), 'w') as genomes_map:
        for species_id in genome_clusters:
            if args['tax_mask'] and fetch_centroid(args, species_id) in args['tax_mask']:
                continue
            db_stats['genome_clusters'] += 1
            inpath = '/'.join([args['db'], 'genome_clusters', species_id, 'genome.fna.gz'])
            # text mode: the default binary mode yields bytes on Python 3,
            # which can neither be written to a text file nor compared to '>'
            with gzip.open(inpath, 'rt') as infile:
                for line in infile:
                    genomes_fasta.write(line)
                    if line.startswith('>'):
                        # header line: first whitespace-delimited token is the id
                        sid = line.rstrip().lstrip('>').split()[0]
                        genomes_map.write(sid+'\t'+species_id+'\n')
                        db_stats['total_seqs'] += 1
                    else:
                        # only sequence lines count towards base pairs
                        db_stats['total_length'] += len(line.rstrip())
    # print out database stats
    print(" total genomes: %s" % db_stats['genome_clusters'])
    print(" total contigs: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    outpath = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: '+command+'\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(process, command)
def genome_align(args):
    """ Align reads to the genome-cluster representatives and sort into a BAM

    Bowtie2 output is piped through `samtools view -b` and `samtools sort`
    with <outdir>/snps/temp/genomes.bam passed as the sort target; the BAM is
    then handed to utility.check_bamfile.
    """
    index_prefix = '/'.join([args['outdir'], 'snps/temp/genomes'])
    # --- bowtie2 stage ---
    pieces = ['%s --no-unal ' % args['bowtie2'],
              '-x %s ' % index_prefix]
    # optional cap on the number of reads
    if args['max_reads']:
        pieces.append('-u %s ' % args['max_reads'])
    # optional 3' trimming
    if args['trim']:
        pieces.append('--trim3 %s ' % args['trim'])
    # speed/sensitivity preset and thread count
    pieces.append('--%s ' % args['speed'])
    pieces.append('--threads %s ' % args['threads'])
    # input format flag
    if args['file_type'] == 'fasta':
        pieces.append('-f ')
    else:
        pieces.append('-q ')
    # paired input when both mates are given, single-end otherwise
    paired = args['m1'] and args['m2']
    if paired:
        pieces.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        pieces.append('-U %s' % args['m1'])
    # --- samtools stages: convert to bam, then sort ---
    pieces.append('| %s view -b - ' % args['samtools'])
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    pieces.append('| %s sort -f - %s ' % (args['samtools'], bam_path))
    cmd = ''.join(pieces)
    # record, then launch
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # validation is delegated to the project's utility helpers
    utility.check_exit_code(proc, cmd)
    utility.check_bamfile(args, bam_path)
def pangenome_align(args):
    """ Map reads to the pangenome Bowtie2 index and store an unsorted BAM

    The shell pipeline is assembled piece by piece from `args`, logged to
    args['log'], executed, and the resulting BAM handed to
    utility.check_bamfile.
    """
    index_prefix = '/'.join([args['outdir'], 'genes/temp/pangenomes'])
    pieces = ['%s --no-unal ' % args['bowtie2'],
              '-x %s ' % index_prefix]
    # optional cap on the number of reads
    if args['max_reads']:
        pieces.append('-u %s ' % args['max_reads'])
    # optional 3' trimming
    if args['trim']:
        pieces.append('--trim3 %s ' % args['trim'])
    # local-alignment preset and thread count
    pieces.append('--%s-local ' % args['speed'])
    pieces.append('--threads %s ' % args['threads'])
    # input format flag
    if args['file_type'] == 'fasta':
        pieces.append('-f ')
    else:
        pieces.append('-q ')
    # paired input when both mates are given, single-end otherwise
    paired = args['m1'] and args['m2']
    if paired:
        pieces.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    else:
        pieces.append('-U %s ' % args['m1'])
    # convert SAM on stdout to an unsorted BAM file
    bampath = '/'.join([args['outdir'], 'genes/temp/pangenomes.bam'])
    pieces.append('| %s view -b - > %s' % (args['samtools'], bampath))
    cmd = ''.join(pieces)
    # record, then launch
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # validation is delegated to the project's utility helpers
    utility.check_exit_code(proc, cmd)
    print(" finished aligning")
    print(" checking bamfile integrity")
    utility.check_bamfile(args, bampath)
def build_hsblastn_db(self, hsblastn):
    """ Run `<hsblastn> index` on the phyeco.fa file inside self.dir """
    # executable + subcommand, then the FASTA to index
    cmd = ''.join(["%s index " % hsblastn,
                   " %s/phyeco.fa " % self.dir])
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(proc, cmd)
def hsblastn_index(self, fasta):
    """ Index `fasta` by running `hs-blastn index` through the shell

    The child process gets a minimal environment containing only PATH, so
    that the shell can locate the hs-blastn executable.

    Args:
        fasta: path of the FASTA file to index.
    """
    # function-scope import keeps this block self-contained
    import os
    command = "hs-blastn index %s " % fasta
    # Environment values must be strings. sys.path is Python's *module*
    # search list, not the executable search path, and passing that list as
    # an env value makes Popen fail; forward the real PATH instead.
    env = {'PATH': os.environ.get('PATH', os.defpath)}
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(process, command)
def hmmsearch(self, inpath, outpath, threads):
    """ Search `inpath` with the phyeco.hmm file stored next to this module

    The domain table goes to `outpath`; hmmsearch's stdout is discarded.
    """
    # the HMM file is looked up in the directory containing this source file
    hmm_path = "%s/%s " % (os.path.dirname(__file__), 'phyeco.hmm')
    cmd = ''.join(["hmmsearch --noali --cpu %s " % threads,
                   "--domtblout %s " % outpath,
                   hmm_path,
                   "%s > /dev/null" % inpath])
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(proc, cmd)
def build_hsblastn_db(self, hsblastn, resume):
    """ Run `<hsblastn> index` on self.dir/phyeco.fa unless resuming

    Nothing is done when `resume` is truthy and a non-empty
    phyeco.fa.header file already exists in self.dir.
    """
    header = '%s/phyeco.fa.header' % self.dir
    # a present, non-empty header file is treated as "index already built"
    header_ready = os.path.exists(header) and os.stat(header).st_size > 0
    if header_ready and resume:
        return
    # executable + subcommand, then the FASTA to index
    cmd = ''.join(["%s index " % hsblastn,
                   " %s/phyeco.fa " % self.dir])
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(proc, cmd)
def uclust(self, genes, pid, centroids, clusters, threads):
    """ Cluster `genes` by invoking `vsearch -cluster_fast` through the shell

    `pid` is passed as -id, `centroids` as -centroids, `clusters` as -uc and
    `threads` as -threads.
    """
    # (format, value) pairs in the order they appear on the command line
    options = [("-cluster_fast %s ", genes),
               ("-id %s ", pid),
               ("-centroids %s ", centroids),
               ("-uc %s ", clusters),
               ("-threads %s ", threads)]
    cmd = "vsearch "
    for template, value in options:
        cmd += template % value
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(proc, cmd)
def index_bam(args):
    """ Run `samtools index` on <outdir>/snps/temp/genomes.bam

    Progress is announced on stdout and in args['log']; elapsed minutes and
    the value of utility.max_mem_usage() are printed afterwards.
    """
    started = time()
    banner = "\nIndexing bamfile"
    print(banner)
    args['log'].write(banner + "\n")
    # thread count is coerced to int because the template uses %d
    n_threads = int(args['threads'])
    cmd = '%s index -@ %d %s/snps/temp/genomes.bam' % (
        args['samtools'], n_threads, args['outdir'])
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(proc, cmd)
    elapsed_minutes = round((time() - started) / 60, 2)
    print(" %s minutes" % elapsed_minutes)
    print(" %s Gb maximum memory" % utility.max_mem_usage())
def genome_align(args):
    """ Align reads to the representative-genome index and sort into a BAM

    The index comes from args['bowtie-db'] when set, otherwise from
    <outdir>/snps/temp. Bowtie2 output is piped through `samtools view -b`
    and `samtools sort -o <outdir>/snps/temp/genomes.bam`; the BAM is then
    handed to utility.check_bamfile.
    """
    bam_path = os.path.join(args['outdir'], 'snps/temp/genomes.bam')
    # --- bowtie2 stage ---
    pieces = ['%s --no-unal ' % args['bowtie2']]
    # index: prebuilt database directory takes precedence
    if args['bowtie-db']:
        index_prefix = '/'.join([args['bowtie-db'], 'genomes'])
    else:
        index_prefix = '/'.join([args['outdir'], 'snps/temp/genomes'])
    pieces.append('-x %s ' % index_prefix)
    if args['max_reads']:
        pieces.append('-u %s ' % args['max_reads'])       # max num of reads
    if args['trim']:
        pieces.append('--trim3 %s ' % args['trim'])       # trim 3'
    pieces.append('--%s' % args['speed'])                 # alignment speed
    # global/local alignment: '-local' is glued onto the speed preset
    if args['mode'] == 'local':
        pieces.append('-local ')
    else:
        pieces.append(' ')
    pieces.append('--threads %s ' % args['threads'])
    # input type
    if args['file_type'] == 'fasta':
        pieces.append('-f ')
    else:
        pieces.append('-q ')
    if args['m2']:
        # -1 and -2 contain paired reads
        pieces.append('-1 %s -2 %s ' % (args['m1'], args['m2']))
    elif args['interleaved']:
        # -1 contains paired reads
        pieces.append('--interleaved %s ' % args['m1'])
    else:
        # -1 contains unpaired reads
        pieces.append('-U %s ' % args['m1'])
    # --- samtools stages: convert to bam, then sort ---
    pieces.append('| %s view -b - ' % args['samtools'])
    pieces.append('--threads %s ' % args['threads'])
    pieces.append('| %s sort - ' % args['samtools'])
    pieces.append('--threads %s ' % args['threads'])
    pieces.append('-o %s ' % bam_path)
    cmd = ''.join(pieces)
    # record, then launch
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # validation is delegated to the project's utility helpers
    utility.check_exit_code(proc, cmd)
    print(" finished aligning")
    print(" checking bamfile integrity")
    utility.check_bamfile(args, bam_path)
def map_reads_hsblast(args):
    """ Pipe sampled reads into hs-blastn against the marker-gene database

    The args['stream_seqs'] script emits reads on stdout (its stderr is
    redirected to <outdir>/species/temp/read_count.txt) and `hs-blastn align`
    searches <db>/marker_genes/<db_type>, writing tabular hits to
    <outdir>/species/temp/alignments.m8.
    """
    # --- read streaming stage ---
    pieces = ['python %s' % args['stream_seqs'],
              ' -1 %s' % args['m1']]                      # fasta/fastq
    if args['m2']:
        pieces.append(' -2 %s' % args['m2'])              # mate
    if args['max_reads']:
        pieces.append(' -n %s' % args['max_reads'])       # number of reads
    if args['read_length']:
        pieces.append(' -l %s' % args['read_length'])     # read length
    # tmpfile to store # of reads, bp sampled
    pieces.append(' 2> %s/species/temp/read_count.txt' % args['outdir'])
    # --- hs-blastn stage ---
    pieces.append(' | %s align' % args['hs-blastn'])
    pieces.append(' -word_size %s' % args['word_size'])
    pieces.append(' -query /dev/stdin')
    pieces.append(' -db %s/%s/%s' % (args['db'], 'marker_genes', args['db_type']))
    pieces.append(' -outfmt 6')
    pieces.append(' -num_threads %s' % args['threads'])
    pieces.append(' -out %s/species/temp/alignments.m8' % args['outdir'])
    pieces.append(' -evalue 1e-3')
    cmd = ''.join(pieces)
    # record, launch, and let the utility helper inspect the exit status
    args['log'].write('command: %s\n' % cmd)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    utility.check_exit_code(proc, cmd)
def build_genome_db(args, genome_clusters):
    """ Build FASTA and BT2 database from genome cluster centroids

    Concatenates <db>/genome_clusters/<species_id>/genome.fna.gz for every
    unmasked species id into <outdir>/snps/temp/genomes.fa, writes a
    sequence-id -> species-id table to <outdir>/snps/temp/genomes.map, prints
    summary counts and runs bowtie2-build on the combined FASTA.

    Args:
        args: dict of run settings. Keys read here: 'outdir', 'db',
            'tax_mask', 'bowtie2-build' and 'log' (object with a write()).
        genome_clusters: iterable of species ids; a species is skipped when
            args['tax_mask'] is set and contains fetch_centroid(args, id).
    """
    # fasta database
    db_stats = {'total_length': 0, 'total_seqs': 0, 'genome_clusters': 0}
    # Both outputs must be flushed and closed BEFORE bowtie2-build reads the
    # FASTA; leaving them open can hand the indexer a truncated file.
    with open('/'.join([args['outdir'], 'snps/temp/genomes.fa']), 'w') as genomes_fasta, \
            open('/'.join([args['outdir'], 'snps/temp/genomes.map']), 'w') as genomes_map:
        for species_id in genome_clusters:
            if args['tax_mask'] and fetch_centroid(args, species_id) in args['tax_mask']:
                continue
            db_stats['genome_clusters'] += 1
            inpath = '/'.join(
                [args['db'], 'genome_clusters', species_id, 'genome.fna.gz'])
            infile = utility.iopen(inpath)
            try:
                for line in infile:
                    genomes_fasta.write(line)
                    if line.startswith('>'):
                        # header line: first whitespace-delimited token is the id
                        sid = line.rstrip().lstrip('>').split()[0]
                        genomes_map.write(sid + '\t' + species_id + '\n')
                        db_stats['total_seqs'] += 1
                    else:
                        # only sequence lines count towards base pairs
                        db_stats['total_length'] += len(line.rstrip())
            finally:
                # one handle per species: release it instead of leaking it
                infile.close()
    # print out database stats
    print(" total genomes: %s" % db_stats['genome_clusters'])
    print(" total contigs: %s" % db_stats['total_seqs'])
    print(" total base-pairs: %s" % db_stats['total_length'])
    # bowtie2 database
    inpath = '/'.join([args['outdir'], 'snps/temp/genomes.fa'])
    outpath = '/'.join([args['outdir'], 'snps/temp/genomes'])
    command = ' '.join([args['bowtie2-build'], inpath, outpath])
    args['log'].write('command: ' + command + '\n')
    process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # exit-status handling is delegated to the project's utility helper
    utility.check_exit_code(process, command)