def faster_split_bam(log, sorted_reduced_bam, sample_dir_iter, iteration):
    """Split a BAM file into one headerless SAM file per reference name.

    The output of ``samtools view`` is piped through ``grep -v ^@`` and into
    ``gawk``, which writes every record to a file named after the record's
    third tab-separated field.  The files land in
    ``<sample_dir_iter>/loci/temp``, which is created by this function.

    Side effect: the process working directory is changed to the temp
    directory while the pipeline runs and to ``sample_dir_iter`` on success.
    On failure the working directory is left in the temp directory.

    Args:
        log: logger used to report how long the split took.
        sorted_reduced_bam: path of the BAM file to split.
        sample_dir_iter: directory under which ``loci/temp`` is created.
        iteration: not used by this function.

    Returns:
        The path of the temp directory holding the per-locus files.

    Raises:
        IOError: if the final ``gawk`` process exits with a non-zero status.
        OSError: from ``os.makedirs`` when the temp directory already exists.
    """
    start_time = time.time()
    # make a temp dir in locus folder in which to store locus-specific SAM data
    sample_dir_iter_locus_temp = os.path.join(sample_dir_iter, "loci", "temp")
    os.makedirs(sample_dir_iter_locus_temp)
    # gawk's `print > $3` writes to relative file names, so run from the
    # directory that should receive them
    os.chdir(sample_dir_iter_locus_temp)
    cmd1 = [
        get_user_path("executables", "samtools"), "view", sorted_reduced_bam
    ]
    cmd2 = [get_user_path("executables", "grep"), "-v", "^@"]
    cmd3 = [get_user_path("executables", "gawk"), "-F\t", '{print > $3}']
    proc1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
    proc2 = subprocess.Popen(cmd2, stdin=proc1.stdout, stdout=subprocess.PIPE)
    proc3 = subprocess.Popen(cmd3, stdin=proc2.stdout, stdout=subprocess.PIPE)
    # close the parent's copies of the pipe ends so that upstream processes
    # are signalled if a downstream one exits early
    proc1.stdout.close()
    proc2.stdout.close()
    proc3.communicate()
    # compare by value: identity comparison against an int literal is
    # implementation-dependent
    if proc3.returncode != 0:
        raise IOError("Splitting BAM file has failed")
    os.chdir(sample_dir_iter)
    end_time = time.time()
    time_delta_sec = round(end_time - start_time, 3)
    log.info("\tSplit SAMs took {} seconds".format(time_delta_sec))
    return sample_dir_iter_locus_temp
def bwa_mem_pe_align(log, sample, sample_dir, ref, cores, r1, r2, iteration=0):
    """Align a read pair with ``bwa mem`` and store the alignments as BAM.

    ``bwa mem`` output is piped directly into ``samtools view -bS -``, whose
    stdout is written to ``iter-<iteration>.bam`` inside ``sample_dir``.  The
    stderr of each tool goes to its own ``iter-<iteration>.pe.*.log`` file in
    the same directory.

    Args:
        log: logger used for the progress message.
        sample: sample name, used only in the log message.
        sample_dir: directory receiving the BAM and the two log files.
        ref: reference passed to ``bwa mem``.
        cores: thread count handed to ``bwa mem -t``.
        r1: object whose ``pth`` attribute is passed to ``bwa mem`` as the
            first read input.
        r2: object whose ``pth`` attribute is passed to ``bwa mem`` as the
            second read input.
        iteration: value used to name the output files.

    Returns:
        The path of the BAM file that was written.
    """
    align_cmd = [
        get_user_path("executables", "bwa"),
        "mem",
        "-t",
        str(cores),
        ref,
        r1.pth,
        r2.pth,
    ]
    to_bam_cmd = [get_user_path("executables", "samtools"), "view", "-bS", "-"]
    bwa_log_path = os.path.join(
        sample_dir, 'iter-{}.pe.bwa.log'.format(iteration))
    samtools_log_path = os.path.join(
        sample_dir, 'iter-{}.pe.samtools.log'.format(iteration))
    bam_path = os.path.join(sample_dir, 'iter-{}.bam'.format(iteration))
    log.info("Building BAM for {}, iteration {}".format(sample, iteration))
    with open(bwa_log_path, 'w') as bwa_log, \
            open(samtools_log_path, 'w') as samtools_log, \
            open(bam_path, 'w') as bam_handle:
        aligner = subprocess.Popen(
            align_cmd, stdout=subprocess.PIPE, stderr=bwa_log)
        converter = subprocess.Popen(
            to_bam_cmd,
            stdin=aligner.stdout,
            stdout=bam_handle,
            stderr=samtools_log,
        )
        # drop the parent's handle on the pipe between the two processes
        aligner.stdout.close()
        converter.communicate()
    return bam_path
def samtools_split_sam(sample, sample_dir_iter_locus, locus, clean, only_single_locus):
    """Split one locus' SAM file into name-sorted paired and singleton BAMs.

    Reads ``<locus>.sam`` from ``sample_dir_iter_locus`` and runs three
    ``samtools`` commands in sequence: extract the paired reads, name-sort
    them, and extract the singleton reads.

    Args:
        sample: not used by this function.
        sample_dir_iter_locus: directory holding ``<locus>.sam``; all outputs
            are written here as well.
        locus: locus name used as the file-name stem.
        clean: when true, delete the input SAM and the unsorted paired BAM
            afterwards.
        only_single_locus: not used by this function.

    Returns:
        A tuple ``(paired_sorted_bam_path, singleton_bam_path)``.
    """
    def run_samtools(*arguments):
        # launch samtools with the given arguments and wait for it to exit
        command = [get_user_path("executables", "samtools")] + list(arguments)
        subprocess.Popen(command).communicate()

    locus_sam = os.path.join(sample_dir_iter_locus, '{}.sam'.format(locus))
    paired_bam = os.path.join(
        sample_dir_iter_locus, '{}.paired.bam'.format(locus))
    paired_sorted_bam = os.path.join(
        sample_dir_iter_locus, '{}.paired.sorted.bam'.format(locus))
    singleton_bam = os.path.join(
        sample_dir_iter_locus, '{}.singleton.bam'.format(locus))

    # split the reduced files into properly paired and singleton reads;
    # -f 2 -F 2048 gets properly paired, non-supplementary alignments
    run_samtools(
        "view", "-f", "2", "-F", "2048", "-b", locus_sam, "-o", paired_bam)
    # sort the paired bam by read name
    run_samtools("sort", "-n", paired_bam, "-o", paired_sorted_bam)
    # -f 8 keeps the reads flagged as having an unmapped mate
    run_samtools("view", "-f", "8", "-b", locus_sam, "-o", singleton_bam)

    if clean:
        for leftover in (locus_sam, paired_bam):
            os.remove(leftover)
    return paired_sorted_bam, singleton_bam
def bedtools_to_fastq(sample, sample_dir, bam_paired, bam_singleton, locus, clean):
    """Convert a locus' paired and singleton BAMs to FASTQ with bedtools.

    Runs ``bedtools bamtofastq`` twice: once on ``bam_paired`` (producing the
    read1/read2 files) and once on ``bam_singleton``.  The captured stdout and
    stderr of both runs are discarded.

    Args:
        sample: not used by this function.
        sample_dir: directory receiving the FASTQ files.
        bam_paired: BAM file converted into the read1/read2 FASTQ files.
        bam_singleton: BAM file converted into the singleton FASTQ file.
        locus: locus name used as the file-name stem.
        clean: when true, delete both input BAM files afterwards.

    Returns:
        A dict mapping ``1``, ``2`` and ``'s'`` to the read1, read2 and
        singleton FASTQ paths.
    """
    fastqs = {
        1: os.path.join(sample_dir, '{}.read1.fastq'.format(locus)),
        2: os.path.join(sample_dir, '{}.read2.fastq'.format(locus)),
        's': os.path.join(sample_dir, '{}.singleton.fastq'.format(locus)),
    }

    paired_cmd = [
        get_user_path("executables", "bedtools"),
        "bamtofastq",
        "-i", bam_paired,
        "-fq", fastqs[1],
        "-fq2", fastqs[2],
    ]
    # stderr may contain entries when chimeric reads are present. these are
    # not included in the output.
    subprocess.Popen(
        paired_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    ).communicate()

    singleton_cmd = [
        get_user_path("executables", "bedtools"),
        "bamtofastq",
        "-i", bam_singleton,
        "-fq", fastqs['s'],
    ]
    subprocess.Popen(
        singleton_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    ).communicate()

    if clean:
        for used_bam in (bam_paired, bam_singleton):
            os.remove(used_bam)
    return fastqs
def bwa_version():
    """Return the version reported by the configured ``bwa`` executable.

    ``bwa`` is run without arguments and its combined stdout/stderr is
    parsed: the version is the second space-separated token on the third
    line of that output.

    Returns:
        The version token as a string.

    Raises:
        IndexError: if the output has fewer than three lines or the third
            line has no second token.
    """
    cmd = [get_user_path("executables", "bwa")]
    # universal_newlines=True makes communicate() return text rather than
    # bytes on Python 3, so the str-based parsing below works on 2 and 3
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    stdout, _ = proc.communicate()
    return stdout.split("\n")[2].split(' ')[1]
def samtools_version():
    """Return the version reported by the configured ``samtools`` executable.

    Runs ``samtools --version`` and takes the second space-separated token
    on the first line of the combined stdout/stderr.

    Returns:
        The version token as a string.

    Raises:
        IndexError: if the first output line has no second token.
    """
    cmd = [get_user_path("executables", "samtools"), '--version']
    # universal_newlines=True makes communicate() return text rather than
    # bytes on Python 3, so the str-based parsing below works on 2 and 3
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    stdout, _ = proc.communicate()
    return stdout.split("\n")[0].split(' ')[1]
def samtools_get_locus_names_from_bam(log, bam, iteration):
    """Return the sorted, unique values of column 3 of a BAM's records.

    ``samtools view`` output is piped into ``gawk '{print $3}'`` and the
    resulting lines are de-duplicated and sorted.

    Args:
        log: logger used to report how many loci were found.
        bam: path of the BAM file to inspect.
        iteration: value used only in the log message.

    Returns:
        A sorted list of the distinct, non-empty names.
    """
    cmd1 = [get_user_path("executables", "samtools"), "view", bam]
    cmd2 = [
        get_user_path("executables", "gawk"),
        '{print $3}',
    ]
    proc1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE)
    # universal_newlines=True yields text (not bytes) on Python 3 so the
    # output can be split on "\n"
    proc2 = subprocess.Popen(
        cmd2,
        stdin=proc1.stdout,
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    proc1.stdout.close()
    stdout, _ = proc2.communicate()
    # return unique list of locus names
    unique_names = set(stdout.split("\n"))
    # make sure empty is removed; discard() does not fail when it is absent
    unique_names.discard('')
    locus_names = sorted(unique_names)
    log.info("Recovered {} loci for iteration {}".format(
        len(locus_names), iteration))
    return locus_names
def samtools_index(log, sample, sample_dir, bam, iteration=0):
    """Index a BAM file with ``samtools index``.

    The combined stdout/stderr of the command is written to
    ``iter-<iteration>.samtools-idx.log`` in ``sample_dir``.

    Args:
        log: logger used for the progress message.
        sample: sample name, used only in the log message.
        sample_dir: directory receiving the log file.
        bam: path of the BAM file to index.
        iteration: value used to name the log file.
    """
    log.info("Indexing BAM for {}".format(sample))
    cmd = [get_user_path("executables", "samtools"), "index", bam]
    # name the log after the iteration, not the sample, so that each
    # iteration keeps its own log
    samtools_out_fname = os.path.join(
        sample_dir, 'iter-{}.samtools-idx.log'.format(iteration))
    with open(samtools_out_fname, 'w') as samtools_out:
        proc = subprocess.Popen(cmd,
                                stdout=samtools_out,
                                stderr=subprocess.STDOUT)
        proc.communicate()
def bwa_index_seeds(seeds, log):
    """Build a bwa index for the seeds file.

    Runs ``bwa index <seeds>`` from inside the directory containing
    ``seeds`` and appends the combined stdout/stderr of the command to
    ``bwa-index-file.log`` in that directory.  The previous working
    directory is restored afterwards, also when an error occurs.

    Args:
        seeds: path of the file to index.
        log: logger used for the progress message.
    """
    log.info("Running bwa indexing against {}".format(os.path.basename(seeds)))
    cwd = os.getcwd()
    # move into reference directory; a bare file name has an empty dirname,
    # which os.chdir() rejects, so fall back to the current directory
    os.chdir(os.path.dirname(seeds) or os.curdir)
    try:
        cmd = [get_user_path("executables", "bwa"), "index", seeds]
        with open('bwa-index-file.log', 'a') as outf:
            proc = subprocess.Popen(cmd, stdout=outf, stderr=subprocess.STDOUT)
            proc.communicate()
    finally:
        # move back to working directory even if the indexing step raised
        os.chdir(cwd)
def samtools_sort(log, sample, sample_dir, bam, iteration=0):
    """Sort a BAM file with ``samtools sort``.

    The sorted output is written to ``iter-<iteration>.reduce.sorted.bam`` in
    ``sample_dir``; the command's combined stdout/stderr goes to
    ``iter-<iteration>.sort.log`` in the same directory.

    Args:
        log: not used by this function.
        sample: not used by this function.
        sample_dir: directory receiving the sorted BAM and the log file.
        bam: path of the BAM file to sort.
        iteration: value used to name the output files.

    Returns:
        The path of the sorted BAM file.
    """
    sorted_bam = os.path.join(
        sample_dir, 'iter-{}.reduce.sorted.bam'.format(iteration))
    sort_cmd = [
        get_user_path("executables", "samtools"),
        "sort",
        bam,
        "-o",
        sorted_bam,
    ]
    sort_log = os.path.join(sample_dir, 'iter-{}.sort.log'.format(iteration))
    with open(sort_log, 'w') as log_handle:
        subprocess.Popen(
            sort_cmd, stdout=log_handle, stderr=subprocess.STDOUT
        ).communicate()
    return sorted_bam
def samtools_reduce(log, sample, sample_dir, bam, iteration=0):
    """Filter a BAM file with ``samtools view -F 4 -bq 1``.

    The filtered output is written to ``iter-<iteration>.reduce.bam`` in
    ``sample_dir``; the command's combined stdout/stderr goes to
    ``iter-<iteration>.reduce.log`` in the same directory.

    Args:
        log: logger used for the progress message.
        sample: sample name, used only in the log message.
        sample_dir: directory receiving the reduced BAM and the log file.
        bam: path of the BAM file to filter.
        iteration: value used in the log message and the output file names.

    Returns:
        The path of the reduced BAM file.
    """
    log.info("Reducing BAM for {}, iteration {}".format(sample, iteration))
    reduced_bam = os.path.join(
        sample_dir, 'iter-{}.reduce.bam'.format(iteration))
    reduce_cmd = [
        get_user_path("executables", "samtools"),
        "view",
        "-F", "4",
        "-bq", "1",
        bam,
        "-o", reduced_bam,
    ]
    reduce_log = os.path.join(
        sample_dir, 'iter-{}.reduce.log'.format(iteration))
    with open(reduce_log, 'w') as log_handle:
        subprocess.Popen(
            reduce_cmd, stdout=log_handle, stderr=subprocess.STDOUT
        ).communicate()
    return reduced_bam
def spades_paired_end_assembly(iteration, sample, sample_dir, fastqs, locus, clean):
    """Assemble one locus' reads with SPAdes.

    Runs the configured ``spades`` executable single-threaded on the paired
    and singleton FASTQ files, writing into ``<locus>-assembly`` inside
    ``sample_dir``.  The k-mer, coverage cutoff and memory settings come from
    the ``spades`` section of the user parameters.

    Args:
        iteration: when equal to ``'final'`` the ``--careful`` option is
            used; for any other value ``--only-assembler`` is used instead.
        sample: not used by this function.
        sample_dir: directory in which the assembly directory is created.
        fastqs: mapping with keys ``1``, ``2`` and ``'s'`` giving the read1,
            read2 and singleton FASTQ paths.
        locus: locus name used to name the assembly directory.
        clean: not used by this function.

    Returns:
        The path of the assembly output directory.
    """
    assembly_out_fname = os.path.join(sample_dir, '{}-assembly'.format(locus))
    # go ahead and assemble without error correction, for speed.
    # explicitly set threads = 1
    cmd1 = [
        get_user_path("executables", "spades"), "-t", "1", "-1", fastqs[1],
        "-2", fastqs[2], "-s", fastqs['s'], "-k",
        get_user_param('spades', 'kmer'), "--cov-cutoff",
        get_user_param('spades', 'coverage_cutoff'), "--memory",
        get_user_param('spades', 'memory'), "-o", assembly_out_fname
    ]
    # turn off error correction for non-final rounds, turn on error-correction
    # for final round and also use --careful assembly option (both of these
    # are slower)
    if iteration == 'final':
        cmd1.append("--careful")
    else:
        cmd1.append("--only-assembler")
    # spades creates its own log file in the assembly dir - redirect to
    # /dev/null; the context manager closes the handle once spades is done
    with open(os.devnull, 'w') as fnull_file:
        proc = subprocess.Popen(cmd1,
                                stdout=fnull_file,
                                stderr=subprocess.STDOUT)
        proc.communicate()
    return assembly_out_fname
def get_bam_header(log, bam, iteration):
    """Return the header of a BAM file as printed by ``samtools view -H``.

    Args:
        log: logger used for the progress message.
        bam: path of the BAM file to read.
        iteration: value used only in the log message.

    Returns:
        The captured stdout of the ``samtools`` command.
    """
    header_cmd = [get_user_path("executables", "samtools"), "view", "-H", bam]
    viewer = subprocess.Popen(header_cmd, stdout=subprocess.PIPE)
    # stderr is not captured, so only the first element carries data
    header, _ = viewer.communicate()
    log.info("Got BAM header for iteration {}".format(iteration))
    return header