def _run_shell_logged(cmd, logfile=None):
    """
    Run a shell command, optionally appending the command line and its stdout/stderr to a log file.
    :param cmd: Command string, executed through the shell.
    :param logfile: Path to a log file to append to. If falsy, output is not redirected.
    :return: Exit status reported by subprocess.call.
    """
    if not logfile:
        return subprocess.call(cmd, shell=True)
    with open(logfile, 'a+') as f:
        f.write('Command: {}\n'.format(cmd))
        # The child process writes directly to the underlying file descriptor, bypassing Python's buffer.
        # Flush first so the command header ends up in the log before the tool's output, not after it.
        f.flush()
        return subprocess.call(cmd, shell=True, stderr=f, stdout=f)


def generate_bedfile(ref_fasta, kmers, output_bedfile, tmpdir='bedgentmp', threads='2', logfile=None):
    """
    Given a reference FASTA file and a fasta-formatted set of kmers, will generate a coverage bedfile for the reference
    FASTA by mapping the kmers back to the FASTA.
    :param ref_fasta: Path to reference FASTA.
    :param kmers: Path to FASTA-formatted kmer file.
    :param output_bedfile: Path to output bedfile.
    :param tmpdir: Temporary directory to store intermediate files. Will be deleted upon method completion, including
    when one of the steps raises. Note that the directory is removed even if it already existed before the call.
    :param threads: Number of threads to use for analysis. Must be a string.
    :param logfile: Optional path to a log file. When given, the samtools and bedtools command lines and their
    output are appended to it. The bedtools stdout is still redirected to output_bedfile by the shell.
    """
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)
    bamfile = os.path.join(tmpdir, 'out.bam')
    sorted_bamfile = os.path.join(tmpdir, 'out_sorted.bam')
    try:
        # First, need to generate a bam file - align the kmers to a reference fasta genome.
        bbtools.bbmap(ref_fasta, kmers, bamfile, threads=threads, ambig='best', perfectmode='true')
        # Once the bam file is generated, turn it into a sorted bamfile so that bedtools can work with it.
        cmd = 'samtools sort {bamfile} -o {sorted_bamfile}'.format(bamfile=bamfile,
                                                                   sorted_bamfile=sorted_bamfile)
        _run_shell_logged(cmd, logfile)
        # Use bedtools to get genome coverage, so that we know what to mask.
        cmd = 'bedtools genomecov -ibam {sorted_bamfile} -bga' \
              ' > {output_bed}'.format(sorted_bamfile=sorted_bamfile,
                                       output_bed=output_bedfile)
        _run_shell_logged(cmd, logfile)
    finally:
        # Always clean up intermediates so a failed run does not leave the temporary directory behind.
        shutil.rmtree(tmpdir)
def test_bbmap_command_paired():
    """
    Check the command string bbmap builds for paired input with no extra options.

    Only the forward read file is passed; the expected command nevertheless contains the matching R2 file
    as in2.
    """
    out, err, cmd = bbtools.bbmap(reference='tests/dummy_fasta/test.fasta',
                                  forward_in='tests/dummy_fastq/test_R1.fastq',
                                  out_bam='tests/out.bam',
                                  returncmd=True)
    try:
        assert cmd == 'bbmap.sh ref=tests/dummy_fasta/test.fasta in=tests/dummy_fastq/test_R1.fastq' \
                      ' in2=tests/dummy_fastq/test_R2.fastq out=tests/out.bam nodisk'
    finally:
        # Clean up even when the assertion fails, so a stale BAM file does not leak into later test runs.
        os.remove('tests/out.bam')
def test_bbmap_command_paired_kwargs():
    """
    Check that extra keyword arguments passed to bbmap show up in the generated command.

    Only the forward read file is passed; the expected command nevertheless contains the matching R2 file
    as in2. The keyword options are checked by substring only, so their position in the command is not
    constrained by this test.
    """
    out, err, cmd = bbtools.bbmap(reference='tests/dummy_fasta/test.fasta',
                                  forward_in='tests/dummy_fastq/test_R1.fastq',
                                  out_bam='tests/out.bam',
                                  returncmd=True,
                                  threads='3',
                                  ordered='t')
    try:
        # Separate assertions so a failure points at the specific fragment that is missing.
        assert 'bbmap.sh ref=tests/dummy_fasta/test.fasta in=tests/dummy_fastq/test_R1.fastq' \
               ' in2=tests/dummy_fastq/test_R2.fastq out=tests/out.bam nodisk' in cmd
        assert 'ordered=t' in cmd
        assert 'threads=3' in cmd
    finally:
        # Clean up even when an assertion fails, so a stale BAM file does not leak into later test runs.
        os.remove('tests/out.bam')
def create_bam(forward_reads, reverse_reads, reference_fasta, outdir):
    """
    Index the reference, align the reads to it with bbmap, then sort and index the resulting BAM file.

    The alignment is written to aligned.bam and its sorted copy to aligned_sorted.bam, both inside outdir.
    Each samtools command and its output is recorded in log.txt inside outdir.
    :param forward_reads: Forward reads, passed to bbmap as forward_in.
    :param reverse_reads: Reverse reads, passed to bbmap as reverse_in.
    :param reference_fasta: Reference FASTA to index and align against.
    :param outdir: Directory that receives the BAM files and the log.
    """

    def _run_and_log(command):
        # Execute one command and record the command line together with what it printed.
        stdout, stderr = run_cmd(command)
        write_to_logfile(logfile=os.path.join(outdir, 'log.txt'), out=stdout, err=stderr, cmd=command)

    _run_and_log('samtools faidx {reference_fasta}'.format(reference_fasta=reference_fasta))

    # Mapping notes: bbmap with default settings gave some odd results, and bowtie2 with default settings
    # was very similar, so parameters still need tuning. Allowing at most one substitution per mapped read
    # (subfilter=1) might help - TBD.
    raw_bam = os.path.join(outdir, 'aligned.bam')
    bbtools.bbmap(reference=reference_fasta,
                  forward_in=forward_reads,
                  reverse_in=reverse_reads,
                  out_bam=raw_bam,
                  subfilter=1)

    # pysam needs the BAM file sorted and indexed.
    sorted_bam = os.path.join(outdir, 'aligned_sorted.bam')
    _run_and_log('samtools sort {bamfile} -o {sorted_bamfile}'.format(bamfile=raw_bam,
                                                                      sorted_bamfile=sorted_bam))
    _run_and_log('samtools index {sorted_bamfile}'.format(sorted_bamfile=sorted_bam))