def align(pair):
    """Align FASTQ reads with bowtie2, then convert, sort and index with samtools.

    Steps run in order, each skipped when file_exists reports its output
    as already present: bowtie2 -> SAM, ``samtools view`` -> BAM,
    ``samtools sort`` -> sorted BAM, ``samtools index``.  Outputs are
    written under ``align/`` relative to the current working directory.

    pair -- sequence of FASTQ paths; two entries are treated as a paired-end
            set (-1/-2), otherwise only the first entry is aligned.

    Returns the path of the sorted BAM file.
    Raises subprocess.CalledProcessError if an external tool exits non-zero.
    """
    import os
    import subprocess

    from bcbio.utils import (file_exists, replace_suffix, append_stem,
                             safe_makedir)

    safe_makedir("align")
    genome = "/n/hsphS10/hsphfs1/chb/biodata/genomes/Hsapiens/GRCh37/bowtie2/GRCh37"
    out_sam = os.path.join("align",
                           os.path.basename(replace_suffix(pair[0], ".sam")))
    out_bam = replace_suffix(out_sam, ".bam")
    # named sorted_bam (not ``sorted``) so the builtin is not shadowed
    sorted_bam = append_stem(out_bam, "_sorted")
    sorted_prefix = os.path.splitext(sorted_bam)[0]
    out_index = replace_suffix(sorted_bam, ".bai")

    # Commands are passed as argument lists (no shell) so that paths
    # containing spaces or shell metacharacters are handled verbatim.
    if not file_exists(out_sam):
        if len(pair) == 2:
            fq1, fq2 = pair
            cmd = ["bowtie2", "-S", out_sam, genome, "-1", fq1, "-2", fq2]
        else:
            fq1 = pair[0]
            cmd = ["bowtie2", "-S", out_sam, genome, fq1]
        subprocess.check_call(cmd)

    if not file_exists(out_bam):
        subprocess.check_call(
            ["samtools", "view", "-S", out_sam, "-b", "-o", out_bam])

    if not file_exists(sorted_bam):
        subprocess.check_call(["samtools", "sort", out_bam, sorted_prefix])

    # NOTE(review): samtools index names its output <bam>.bai by default,
    # which may not match out_index; if so this step reruns every call --
    # confirm.
    if not file_exists(out_index):
        subprocess.check_call(["samtools", "index", sorted_bam])

    return sorted_bam
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files):
    """Trim adapters and low-quality bases from FASTQ files with cutadapt.

    fastq_files    -- input FASTQ paths, paired positionally with out_files
    quality_format -- "illumina" selects --quality-base=64, anything else 33
    adapters       -- adapter sequences, each passed as --adapter=<sequence>
    out_files      -- output paths, one per input file

    If file_exists reports every entry of out_files as present, nothing is
    run.  Otherwise cutadapt is invoked once per input/output pair and its
    stdout is captured in a ``.trim_stats.txt`` file derived from the output
    path with replace_suffix.

    Returns out_files.  If cutadapt exits non-zero, the failing command is
    logged and the process exits with status 1 (SystemExit).
    """
    import sys

    # Nothing to do when all outputs are already in place.
    if all(map(file_exists, out_files)):
        return out_files

    quality_base = "64" if quality_format == "illumina" else "33"

    # --times=2 tries twice remove adapters which will allow things like:
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    base_cmd = ["cutadapt", "--times=2",
                "--quality-base=" + quality_base,
                "--quality-cutoff=20", "--format=fastq",
                "--minimum-length=0"]
    base_cmd.extend("--adapter=" + adapter for adapter in adapters)

    for in_file, out_file in zip(fastq_files, out_files):
        # if you pass an output filename, cutadapt will write some stats
        # about trimmed adapters to stdout. stat_file captures that.
        stat_file = replace_suffix(out_file, ".trim_stats.txt")
        cmd = base_cmd + ["--output=" + out_file, in_file]
        with open(stat_file, "w") as stat_handle:
            try:
                subprocess.check_call(cmd, stdout=stat_handle)
            except subprocess.CalledProcessError:
                cmd_string = subprocess.list2cmdline(cmd)
                logger.error("Cutadapt returned an error. The command "
                             "used to run cutadapt was: %s." % (cmd_string))
                # sys.exit rather than the site-provided exit() helper,
                # which is meant for interactive use and may be absent.
                sys.exit(1)
    return out_files
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files):
    """Run cutadapt over each FASTQ file, writing trimmed reads to out_files.

    fastq_files    -- input FASTQ paths (matched to out_files by position)
    quality_format -- "illumina" means quality base 64; otherwise 33 is used
    adapters       -- adapter sequences; one --adapter option is added each
    out_files      -- destination paths for the trimmed reads

    When file_exists is true for every output the function returns at once.
    For each pair cutadapt's stdout is redirected into a stats file named by
    replace_suffix(out_file, ".trim_stats.txt").

    Returns out_files.  A non-zero cutadapt exit status is logged together
    with the command line and terminates the process via SystemExit(1).
    """
    import sys

    if all(map(file_exists, out_files)):
        return out_files

    if quality_format == "illumina":
        quality_base = "64"
    else:
        quality_base = "33"

    # --times=2 tries twice remove adapters which will allow things like:
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    base_cmd = ["cutadapt",
                "--times=2",
                "--quality-base=" + quality_base,
                "--quality-cutoff=20",
                "--format=fastq",
                "--minimum-length=0"]
    base_cmd.extend(["--adapter=" + seq for seq in adapters])

    for in_file, out_file in zip(fastq_files, out_files):
        # if you pass an output filename, cutadapt will write some stats
        # about trimmed adapters to stdout. stat_file captures that.
        stat_file = replace_suffix(out_file, ".trim_stats.txt")
        with open(stat_file, "w") as stat_handle:
            cmd = list(base_cmd)
            cmd.extend(["--output=" + out_file, in_file])
            try:
                subprocess.check_call(cmd, stdout=stat_handle)
            except subprocess.CalledProcessError:
                cmd_string = subprocess.list2cmdline(cmd)
                logger.error("Cutadapt returned an error. The command "
                             "used to run cutadapt was: %s." % (cmd_string))
                # use sys.exit; the bare exit() builtin is injected by the
                # site module for interactive sessions only
                sys.exit(1)
    return out_files
def convert_bam_to_sam(in_file):
    """Convert a BAM file to a SAM file using pysam.view.

    in_file -- path to the input file; must satisfy is_bam.

    Returns in_file with its suffix replaced by ".sam".  If file_exists
    reports that path as present it is returned without converting again.
    Raises ValueError if in_file is not a BAM file.
    """
    if not is_bam(in_file):
        # message now names this function (it previously said
        # convert_sam_to_bam, pointing readers at the wrong call site)
        raise ValueError("Non BAM file passed to convert_bam_to_sam: "
                         "%s" % (in_file))
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        # write to the transaction's temporary path, not out_file directly
        pysam.view("-h", "-o" + tmp_out_file, in_file)
    return out_file
def bam2sam(in_file):
    """
    Produce a SAM file from a BAM file via pysam.view.

    bam2sam("file.bam") -> "file.sam"

    When file_exists reports the SAM path as present, it is returned
    without running the conversion again.
    """
    sam_path = replace_suffix(in_file, ".sam")
    if not file_exists(sam_path):
        # pysam writes into the transaction's temporary path
        with file_transaction(sam_path) as tx_sam_path:
            pysam.view("-h", "-o" + tx_sam_path, in_file)
    return sam_path
def split_vcf(in_file, ref_file, config, out_dir=None):
    """Break a VCF file into one output file per chromosome.

    Chromosome names and sizes are read from the first two whitespace
    separated columns of the file returned by ref.fasta_idx.  Each output
    is produced by subset_vcf and placed in out_dir, which defaults to a
    "split" directory beside in_file.

    Returns the list of output paths in index order.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")
    index_path = ref.fasta_idx(ref_file, config)
    per_chrom_files = []
    with open(index_path) as index_handle:
        for record in index_handle:
            chrom, size = record.split()[:2]
            stemmed = append_stem(in_file, "-%s" % chrom)
            vcf_name = os.path.basename(replace_suffix(stemmed, ".vcf"))
            target = os.path.join(out_dir, vcf_name)
            # region covers the whole chromosome: (name, 0, size)
            subset_vcf(in_file, (chrom, 0, size), target, config)
            per_chrom_files.append(target)
    return per_chrom_files
def split_vcf(in_file, ref_file, config, out_dir=None):
    """Create a separate VCF for every chromosome in the reference index.

    in_file  -- VCF file to split
    ref_file -- reference passed to ref.fasta_idx to locate the index file
    config   -- passed through to ref.fasta_idx and subset_vcf
    out_dir  -- destination directory; defaults to "split" next to in_file

    Returns a list with one output path per line of the index file.
    """
    if out_dir is None:
        out_dir = os.path.join(os.path.dirname(in_file), "split")
    split_files = []
    with open(ref.fasta_idx(ref_file, config)) as idx:
        for idx_line in idx:
            # only the first two columns (name, length) are used
            chrom, size = idx_line.split()[:2]
            chrom_vcf = replace_suffix(
                append_stem(in_file, "-%s" % chrom), ".vcf")
            chrom_out = os.path.join(out_dir, os.path.basename(chrom_vcf))
            subset_vcf(in_file, (chrom, 0, size), chrom_out, config)
            split_files.append(chrom_out)
    return split_files
def bamindex(in_file, samtools="samtools"):
    """
    index a bam file
    avoids use of pysam.index which is not working for indexing as of 0.7.4
    with ipython

    in_file  -- path to the BAM file to index (asserted with is_bam)
    samtools -- name or path of the samtools executable to run

    Returns in_file with its suffix replaced by ".bai".  If file_exists
    reports that path as present, samtools is not run.  A non-zero exit
    status from samtools is logged, not raised.
    """
    assert (is_bam(in_file)), "bamindex requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".bai")
    if file_exists(out_file):
        return out_file
    # honour the caller-supplied executable instead of a hard-coded name
    # NOTE(review): samtools index names its output <in_file>.bai by default,
    # which may differ from out_file -- confirm the returned path exists.
    cmd = [samtools, "index", in_file]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        cmd_string = subprocess.list2cmdline(cmd)
        logger.error("bamindex returned an error. The command "
                     "used to run bamindex was: %s." % (cmd_string))
    return out_file
def bamindex(in_file, samtools="samtools"):
    """
    index a bam file
    avoids use of pysam.index which is not working for indexing as of 0.7.4
    with ipython

    in_file  -- BAM file to index; an AssertionError is raised otherwise
    samtools -- samtools executable (name on PATH or full path)

    Returns the ".bai" path derived from in_file with replace_suffix.
    Indexing is skipped when file_exists reports that path as present.
    Failures of the samtools call are logged and not re-raised.
    """
    assert(is_bam(in_file)), "bamindex requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".bai")
    if file_exists(out_file):
        return out_file
    # use the samtools argument; it was previously accepted but ignored
    # NOTE(review): samtools index writes <in_file>.bai unless told
    # otherwise, so out_file may not be the file created -- confirm.
    cmd = [samtools, "index", in_file]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        cmd_string = subprocess.list2cmdline(cmd)
        logger.error("bamindex returned an error. The command "
                     "used to run bamindex was: %s." % (cmd_string))
    return out_file
def bam2sam(in_file, samtools="samtools"):
    """
    converts a bam file to a sam file
    bam2sam("file.bam") -> "file.sam"

    in_file  -- path to the BAM file (asserted with is_bam)
    samtools -- name or path of the samtools executable to run

    Returns in_file with its suffix replaced by ".sam".  If file_exists
    reports that path as present, samtools is not run.  A non-zero exit
    status from samtools is logged, not raised.
    """
    assert(is_bam(in_file)), "bam2sam requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        # The command must be an argument list: check_call without
        # shell=True treats a plain string as the program name, and
        # list2cmdline expects a sequence of arguments.
        cmd = [samtools, "view", "-h", "-o", tmp_out_file, in_file]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError:
            cmd_string = subprocess.list2cmdline(cmd)
            logger.error("bam2sam returned an error. The command "
                         "used to run bam2sam was: %s." % (cmd_string))
    return out_file
def bam2sam(in_file, samtools="samtools"):
    """
    converts a bam file to a sam file
    bam2sam("file.bam") -> "file.sam"

    in_file  -- BAM file to convert; an AssertionError is raised otherwise
    samtools -- samtools executable (name on PATH or full path)

    Returns the ".sam" path derived from in_file with replace_suffix.
    Conversion is skipped when file_exists reports that path as present.
    Failures of the samtools call are logged and not re-raised.
    """
    assert (is_bam(in_file)), "bam2sam requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        # Build argv as a list; a single formatted string would be taken
        # as the executable name by check_call (no shell is involved) and
        # would be split per character by list2cmdline.
        cmd = [samtools, "view", "-h", "-o", tmp_out_file, in_file]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError:
            cmd_string = subprocess.list2cmdline(cmd)
            logger.error("bam2sam returned an error. The command "
                         "used to run bam2sam was: %s." % (cmd_string))
    return out_file
def mark_duplicates(sam_file):
    """Run Picard FixMateInformation, then MarkDuplicates, on an alignment.

    Each step is skipped when file_exists reports its output as present.
    The mate-fixed file is named by append_stem(sam_file, "_matefixed");
    the duplicate-marked file and the metrics file are derived from the
    mate-fixed path ("_dupemarked" stem, and "_stats" stem with ".txt").

    sam_file -- path to the input alignment file.

    Returns the path of the duplicate-marked file.
    Raises subprocess.CalledProcessError if a Picard call exits non-zero.
    """
    import subprocess

    from bcbio.utils import file_exists, replace_suffix, append_stem

    fm = "/n/HSPH/local/share/java/picard/FixMateInformation.jar"
    md = "/n/HSPH/local/share/java/picard/MarkDuplicates.jar"
    # Argument lists (no shell) keep paths with spaces or shell
    # metacharacters intact; the JVM options are separate arguments.
    java_cmd = ["java", "-Xms750m", "-Xmx2000m", "-jar"]

    mate_fixed_file = append_stem(sam_file, "_matefixed")
    if not file_exists(mate_fixed_file):
        subprocess.check_call(java_cmd + [fm,
                                          "INPUT=" + sam_file,
                                          "OUTPUT=" + mate_fixed_file])

    # everything below works on the mate-fixed file
    sam_file = mate_fixed_file
    out_file = append_stem(sam_file, "_dupemarked")
    stats_file = replace_suffix(append_stem(sam_file, "_stats"), ".txt")
    if not file_exists(out_file):
        subprocess.check_call(java_cmd + [md,
                                          "INPUT=" + sam_file,
                                          "OUTPUT=" + out_file,
                                          "METRICS_FILE=" + stats_file,
                                          "VALIDATION_STRINGENCY=LENIENT"])
    return out_file
def _run_cutadapt_on_single_file(base_cmd, fastq_file, out_file):
    """Run cutadapt on one FASTQ file through do.run.

    base_cmd   -- cutadapt command prefix; it is copied, not modified
    fastq_file -- input FASTQ path, appended as the final argument
    out_file   -- passed to cutadapt as --output=<out_file>

    A ``.trim_stats.txt`` file derived from out_file is opened for writing
    (created or truncated) and held open while the command runs.
    """
    stats_path = replace_suffix(out_file, ".trim_stats.txt")
    # NOTE(review): the opened stats file is never handed to do.run, so it
    # stays empty unless do.run redirects output itself -- confirm.
    with open(stats_path, "w"):
        full_cmd = list(base_cmd) + ["--output=" + out_file, fastq_file]
        do.run(full_cmd, "Running cutadapt on %s." % (fastq_file), None)
def test_replace_suffix_of_string(self):
    """replace_suffix swaps the extension of a single path string."""
    test_string = "/string/test/foo.txt"
    correct = "/string/test/foo.bar"
    out_string = utils.replace_suffix(test_string, ".bar")
    # assertEqual: the assertEquals alias is deprecated
    self.assertEqual(correct, out_string)
def test_replace_suffix_of_list(self):
    """replace_suffix swaps the extension of every path in a list."""
    test_list = ["/list/test/foo.txt", "/list/test/foobar.txt"]
    correct = ["/list/test/foo.bar", "/list/test/foobar.bar"]
    out_list = utils.replace_suffix(test_list, ".bar")
    # Compare whole sequences: a zip() loop stops at the shorter input, so
    # a result with missing entries would pass unnoticed.  list() accepts
    # any iterable result.
    self.assertEqual(correct, list(out_list))
def chr_out(chrom):
    """Return the output VCF path for `chrom` inside out_dir.

    in_file and out_dir are not parameters; they are read from the
    surrounding scope.
    """
    stemmed = append_stem(in_file, chrom)
    vcf_name = os.path.basename(replace_suffix(stemmed, ".vcf"))
    return os.path.join(out_dir, vcf_name)
def _run_cutadapt_on_single_file(base_cmd, fastq_file, out_file):
    """Run cutadapt on one FASTQ file, capturing its stdout in a stats file.

    base_cmd   -- cutadapt command prefix; it is copied, not modified
    fastq_file -- input FASTQ path, appended as the final argument
    out_file   -- passed to cutadapt as --output=<out_file>

    The stats file is out_file with its suffix replaced by
    ".trim_stats.txt"; its handle is passed as stdout to
    _run_with_possible_error_message.
    """
    stats_path = replace_suffix(out_file, ".trim_stats.txt")
    with open(stats_path, "w") as stats_out:
        full_cmd = list(base_cmd) + ["--output=" + out_file, fastq_file]
        _run_with_possible_error_message(full_cmd, stdout=stats_out)