def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    """Run ``salmon quant`` for one sample against an existing index.

    The command is built with ``-l A`` and writes into ``<salmon_dir>/quant``
    through a file transaction. Nothing is run when ``quant.sf`` is already
    present there.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        salmon_dir: directory that holds the ``quant`` output directory.
        index: value passed to salmon's ``-i`` option.
        data: sample dictionary used to look up cores and the salmon program.

    Returns:
        Path to ``<salmon_dir>/quant/quant.sf``.
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    # Reads are always handed over through process substitution; gzipped
    # input is decompressed on the fly.
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        # Placeholders in the template are resolved from the local names above.
        do.run(cmd.format(**locals()), message, None)
    return out_file
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    """Quantify reads with ``sailfish quant`` into ``sailfish_dir``.

    Builds (or fetches) the index through ``sailfish_index`` and runs the
    quantification with ``--useVBOpt --numBootstraps 30``. The run is skipped
    when ``quant.sf`` already exists in ``sailfish_dir``.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        sailfish_dir: output directory.
        gtf_file: annotation passed to ``sailfish_index``.
        ref_file: reference passed to ``sailfish_index``.
        strandedness: passed to ``_libtype_string`` to pick the library type.
        data: sample dictionary.

    Returns:
        Path to ``<sailfish_dir>/quant.sf``.
    """
    safe_makedir(sailfish_dir)
    out_file = os.path.join(sailfish_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    # This local shares the function's name because the command template
    # refers to the executable as {sailfish}.
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, sailfish_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    """Quantify reads with ``salmon quant`` directly into ``salmon_dir``.

    The index comes from ``salmon_index`` and the library type from
    ``sailfish._libtype_string``. The command is run with
    ``--numBootstraps 30 --useVBOpt``. Nothing is run when ``quant.sf``
    already exists in ``salmon_dir``.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        salmon_dir: output directory.
        gtf_file: annotation passed to ``salmon_index``.
        ref_file: reference passed to ``salmon_index``.
        data: sample dictionary.

    Returns:
        Path to ``<salmon_dir>/quant.sf``.
    """
    safe_makedir(salmon_dir)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    # Return value is not used here; the call is kept for whatever it
    # prepares in salmon_dir -- presumably the FASTA used for indexing
    # (TODO confirm against sailfish._create_combined_fasta).
    sailfish._create_combined_fasta(data, salmon_dir)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--numBootstraps 30 --useVBOpt "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    """Map reads with RapMap and write a sorted BAM for the sample.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        rapmap_dir: output directory for the index and the BAM file.
        gtf_file: annotation passed to ``rapmap_index``.
        ref_file: reference passed to ``rapmap_index``.
        algorithm: ``"pseudo"`` or ``"quasi"``; selects the ``pseudomap`` or
            ``quasimap`` subcommand.
        data: sample dictionary.

    Returns:
        Path to ``<rapmap_dir>/<samplename>.bam``.

    Raises:
        AssertionError: if ``algorithm`` is not one of the valid choices.
    """
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data,
                                    rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        # Mate 1 must come from fq1; the second mate was previously passed
        # to both -1 and -2.
        cmd += "-1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        # Report the index location rather than the rapmap_index function.
        run_message = ("%smapping %s and %s to %s with Rapmap. "
                       % (algorithm, fq1, fq2, rapmap_index_loc))
        do.run(cmd.format(**locals()), run_message, None)
    return out_file
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    """Quantify reads with ``salmon quant`` into ``<salmon_dir>/quant``.

    Extra command-line options are taken from the ``options`` entry of the
    ``salmon`` resources in the configuration. The run is skipped when
    ``quant.sf`` already exists.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        salmon_dir: directory that holds the ``quant`` output directory.
        gtf_file: annotation passed to ``salmon_index``.
        ref_file: reference passed to ``salmon_index``.
        data: sample dictionary.

    Returns:
        Path to ``<salmon_dir>/quant/quant.sf``.
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    options = resources.get("options")
    params = ""
    if options is not None:
        params = " ".join([str(x) for x in options])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    # Reads are always handed over through process substitution; gzipped
    # input is decompressed on the fly.
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    """Quantify one sample with ``salmon quant -l A`` using a given index.

    Output goes to ``<salmon_dir>/quant`` inside a file transaction; an
    existing ``quant.sf`` short-circuits the run.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        salmon_dir: parent directory of the ``quant`` output directory.
        index: value passed to salmon's ``-i`` option.
        data: sample dictionary used for configuration lookups.

    Returns:
        Path to ``<salmon_dir>/quant/quant.sf``.
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    # Both plain and gzipped reads are passed via process substitution.
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        # Template placeholders are filled from the local names above.
        do.run(cmd.format(**locals()), message, None)
    return out_file
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    """Quantify reads with ``salmon quant`` into ``<salmon_dir>/quant``.

    The index comes from ``salmon_index`` and the library type from
    ``sailfish._libtype_string``. The run is skipped when ``quant.sf``
    already exists.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        salmon_dir: directory that holds the ``quant`` output directory.
        gtf_file: annotation passed to ``salmon_index``.
        ref_file: reference passed to ``salmon_index``.
        data: sample dictionary.

    Returns:
        Path to ``<salmon_dir>/quant/quant.sf``.
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    """Quantify reads with ``sailfish quant`` and post-process the output.

    After a successful run ``_sleuthify_sailfish`` is called on
    ``sailfish_dir``. The whole step is skipped when ``quant.sf`` already
    exists there.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        sailfish_dir: output directory.
        gtf_file: annotation passed to ``sailfish_index``.
        ref_file: reference passed to ``sailfish_index``.
        strandedness: passed to ``_libtype_string`` to pick the library type.
        data: sample dictionary.

    Returns:
        Path to ``<sailfish_dir>/quant.sf``.
    """
    safe_makedir(sailfish_dir)
    out_file = os.path.join(sailfish_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    # This local shares the function's name because the command template
    # refers to the executable as {sailfish}.
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, sailfish_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    _sleuthify_sailfish(sailfish_dir)
    return out_file
def decorate_problem_regions(query_bed, problem_bed_dir):
    """
    Annotate query_bed with the coverage reported by ``bedtools annotate``
    for every ``*.bed`` file found in problem_bed_dir.

    The result is written bgzipped next to query_bed as
    ``<stem>.problem_annotated<ext>.gz``; an existing result is reused.
    Returns the path of that file.
    """
    # Strip a trailing compression suffix before splitting off the extension.
    base = query_bed
    if utils.is_gzipped(query_bed):
        base, _ = os.path.splitext(query_bed)
    stem, ext = os.path.splitext(base)
    annotated = stem + ".problem_annotated" + ext + ".gz"
    if utils.file_exists(annotated):
        return annotated

    problem_beds = glob.glob(os.path.join(problem_bed_dir, "*.bed"))
    labels = [os.path.splitext(os.path.basename(bed))[0]
              for bed in problem_beds]
    columns = ["chr", "start", "end", "name", "coverage", "completeness"]
    template = ("bedtools annotate -i {query_bed} -files {bed_file_string} "
                "-names {names_string} | sed -s 's/^#.*$/#{header}/' | bgzip -c > {tx_out_file}")
    with file_transaction(annotated) as tx_out_file:
        command = template.format(query_bed=query_bed,
                                  bed_file_string=" ".join(problem_beds),
                                  names_string=" ".join(labels),
                                  header="\t".join(columns + labels),
                                  tx_out_file=tx_out_file)
        do.run(command, "Annotate %s with problem regions." % query_bed)
    return annotated
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR and stream the SAM output into a sorted BAM.

    Args:
        fastq_file: first (or only) FASTQ file.
        pair_file: second FASTQ file of a pair; falsy for single-end input.
        ref_file: value passed to STAR's ``--genomeDir``.
        names: dictionary read for the ``lane`` and ``sample`` keys and passed
            to ``_read_group_option``.
        align_dir: directory used for the STAR output prefix.
        data: sample dictionary holding the ``config`` section.

    Returns:
        Path to ``<align_dir>/<lane>_star/<sample>.bam``; returned directly
        when it already exists.
    """
    config = data["config"]
    lane = names["lane"]
    out_prefix = os.path.join(align_dir, lane)
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % lane)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    star_path = config_utils.get_program("STAR", config)
    if pair_file:
        fastq = " ".join([fastq_file, pair_file])
    else:
        fastq = fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)
    safe_makedir(align_dir)

    # Collect the command fragments in order and join them once at the end.
    pieces = [("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
               "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
               "--outStd SAM "
               "--outSAMunmapped Within --outSAMattributes %s"
               % " ".join(ALIGN_TAGS))]
    if is_gzipped(fastq_file):
        pieces.append(" --readFilesCommand zcat ")
    pieces.append(_read_group_option(names))
    if get_in(data, ("config", "algorithm", "fusion_mode"), False):
        pieces.append(" --chimSegmentMin 15 --chimJunctionOverhangMin 15")
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        pieces.append(" --outSAMstrandField intronMotif ")
    sam_to_bam = bam.sam_to_bam_stream_cmd(config)
    sort = bam.sort_cmd(config)
    pieces.append("| {sam_to_bam} | {sort} -o {tx_final_out} ")
    cmd = "".join(pieces)

    run_message = "Running STAR aligner on %s and %s." % (fastq_file, ref_file)
    with file_transaction(final_out) as tx_final_out:
        # Placeholders are resolved from the local names of this function.
        do.run(cmd.format(**locals()), run_message, None)
    return final_out
def decorate_problem_regions(query_bed, problem_bed_dir):
    """
    decorate query_bed with percentage covered by BED files of regions specified
    in the problem_bed_dir

    The header written to the output is the first line of query_bed extended
    with one column per problem BED file (named after the file's basename).
    The result is written bgzipped as ``<stem>.problem_annotated<ext>.gz``
    next to query_bed, and an existing result is reused.
    Returns the path of that file.
    """
    if is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if file_exists(out_file):
        return out_file
    bed_files = _find_bed_files(problem_bed_dir)
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    with open_gzipsafe(query_bed) as in_handle:
        # next() and a real list work on both Python 2 and 3; the previous
        # .next() method and map() + list concatenation only worked on 2.
        header_fields = [str(x) for x in next(in_handle).strip().split()]
    header = "\t".join(header_fields + names)
    cmd = ("bedtools annotate -i {query_bed} -files {bed_file_string} "
           "-names {names_string} | sed -s 's/^#.*$/{header}/' | bgzip -c > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file
def decorate_problem_regions(query_bed, problem_bed_dir):
    """
    decorate query_bed with percentage covered by BED files of regions specified
    in the problem_bed_dir

    The header written to the output is the first line of query_bed extended
    with one column per ``*.bed`` file in problem_bed_dir (named after the
    file's basename). The result is written bgzipped as
    ``<stem>.problem_annotated<ext>.gz`` next to query_bed, and an existing
    result is reused. Returns the path of that file.
    """
    if utils.is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if utils.file_exists(out_file):
        return out_file
    bed_files = glob.glob(os.path.join(problem_bed_dir, "*.bed"))
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    with utils.open_gzipsafe(query_bed) as in_handle:
        # next() and a real list work on both Python 2 and 3; the previous
        # .next() method and map() + list concatenation only worked on 2.
        header_fields = [str(x) for x in next(in_handle).strip().split()]
    header = "\t".join(header_fields + names)
    cmd = ("bedtools annotate -i {query_bed} -files {bed_file_string} "
           "-names {names_string} | sed -s 's/^#.*$/{header}/' | bgzip -c > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR inside a transactional copy of ``align_dir``.

    For analyses whose name starts with ``smallrna-seq`` the multimapping
    limit is raised to 1000, ``--alignIntronMax 1`` is added, and the
    splice-junction index commands, strand field and transcriptome
    quantification options are left out.

    Args:
        fastq_file: first (or only) FASTQ file.
        pair_file: second FASTQ file of a pair; falsy for single-end input.
        ref_file: STAR index location; a path ending in ``chrLength`` is
            replaced by its directory.
        names: naming dictionary passed to the directory and read-group
            helpers.
        align_dir: final alignment directory.
        data: sample dictionary.

    Returns:
        ``data`` as updated by ``_update_data`` with the final BAM.

    Exits the process with status 1 when ``ref_file`` is empty.
    """
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = data["analysis"].lower().startswith("smallrna-seq")
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        # Recompute the STAR directory layout relative to the transaction dir.
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd SAM {srna_opts} "
               "--outSAMunmapped Within --outSAMattributes %s "
               % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        # Placeholders are resolved from the local names of this function.
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
def sailfish(fq1, fq2, align_dir, gtf_file, ref_file, strandedness, data):
    """Quantify reads with ``sailfish quant``, writing into ``align_dir``.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        align_dir: output directory, written through a file transaction.
        gtf_file: annotation passed to ``sailfish_index``.
        ref_file: reference passed to ``sailfish_index``.
        strandedness: passed to ``_libtype_string`` to pick the library type.
        data: sample dictionary.

    Returns:
        ``align_dir``.
    """
    sailfish_idx = sailfish_index(gtf_file, ref_file, data)
    # This local shares the function's name because the command template
    # refers to the executable as {sailfish}.
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        # Append to the command; a plain assignment here used to throw away
        # the program, index and library type for single-end input.
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, align_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return align_dir
def _unpack_fastq(f): """Use process substitution instead of readFilesCommand for gzipped inputs. Prevents issues on shared filesystems that don't support FIFO: https://github.com/alexdobin/STAR/issues/143 """ if f and is_gzipped(f): return "<(gunzip -c %s)" % f else: return f
def install_gtf_file(build_dir, gtf, build):
    """Copy a GTF annotation into the build's RNA-seq directory.

    Gzipped input is decompressed while copying. An existing output file is
    left untouched.

    Args:
        build_dir: base directory of the genome build.
        gtf: path to the source GTF file, plain or gzipped.
        build: not used by this function.

    Returns:
        Path to ``<build_dir>/<RNASEQ_DIR>/ref-transcripts.gtf``.
    """
    out_file = os.path.join(build_dir, RNASEQ_DIR, "ref-transcripts.gtf")
    if not file_exists(out_file):
        if is_gzipped(gtf):
            # Open the ``gtf`` argument; the previous code referenced an
            # undefined ``gtf_file`` name and failed for gzipped input.
            with gzip.open(gtf, 'rb') as in_handle:
                with open(out_file, 'wb') as out_handle:
                    shutil.copyfileobj(in_handle, out_handle)
        else:
            shutil.copyfile(gtf, out_file)
    return out_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR and pipe the SAM stream into a sorted BAM.

    Analyses whose name starts with ``smallrna-seq`` use a multimapping
    limit of 1000 plus ``--alignIntronMax 1`` and never get the strand field
    option. Transcriptome output is requested when
    ``dd.get_transcriptome_align`` is set and ``is_transcriptome_broken``
    reports no problem.

    Args:
        fastq_file: first (or only) FASTQ file.
        pair_file: second FASTQ file of a pair; falsy for single-end input.
        ref_file: value passed to STAR's ``--genomeDir``.
        names: naming dictionary (``sample`` key, read-group information).
        align_dir: directory used for the STAR output prefix.
        data: sample dictionary.

    Returns:
        ``data`` as updated by ``_update_data`` with the final BAM.

    Exits the process with status 1 when ``ref_file`` is empty.
    """
    srna = data["analysis"].lower().startswith("smallrna-seq")
    if srna:
        max_hits, srna_opts = 1000, "--alignIntronMax 1"
    else:
        max_hits, srna_opts = 10, ""
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return _update_data(final_out, out_dir, names, data)

    star_path = config_utils.get_program("STAR", config)
    if pair_file:
        fastq_files = " ".join([fastq_file, pair_file])
    else:
        fastq_files = fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    safe_makedir(align_dir)

    # Collect the command fragments in order and join them once.
    pieces = [("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd SAM {srna_opts} "
               "--outSAMunmapped Within --outSAMattributes %s "
               % " ".join(ALIGN_TAGS))]
    pieces.append(_add_sj_index_commands(fastq_file, ref_file, gtf_file))
    if is_gzipped(fastq_file):
        pieces.append(" --readFilesCommand zcat ")
    pieces.append(_read_group_option(names))
    if utils.get_in(data, ("config", "algorithm", "fusion_mode"), False):
        pieces.append(" --chimSegmentMin 15 --chimJunctionOverhangMin 15")
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        pieces.append(" --outSAMstrandField intronMotif ")
    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        pieces.append(" --quantMode TranscriptomeSAM ")

    with file_transaction(data, final_out) as tx_final_out:
        pieces.append(" | " + postalign.sam_to_sortbam_cl(data, tx_final_out))
        cmd = "".join(pieces)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        # Placeholders are resolved from the local names of this function.
        do.run(cmd.format(**locals()), run_message, None)
    return _update_data(final_out, out_dir, names, data)
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR and stream the output into a sorted BAM.

    Transcriptome output (``--quantMode TranscriptomeSAM``) is requested when
    ``dd.get_rsem`` is set and ``is_transcriptome_broken`` reports no
    problem. Sorting uses a temporary directory from ``tx_tmpdir``.

    Args:
        fastq_file: first (or only) FASTQ file.
        pair_file: second FASTQ file of a pair; falsy for single-end input.
        ref_file: value passed to STAR's ``--genomeDir``.
        names: naming dictionary (``sample`` key, read-group information).
        align_dir: directory used for the STAR output prefix.
        data: sample dictionary.

    Returns:
        ``data`` as updated by ``_update_data`` with the final BAM.

    Exits the process with status 1 when ``ref_file`` is empty.
    """
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return _update_data(final_out, out_dir, names, data)

    star_path = config_utils.get_program("STAR", config)
    if pair_file:
        fastq = " ".join([fastq_file, pair_file])
    else:
        fastq = fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)
    safe_makedir(align_dir)

    # Collect the command fragments in order and join them once.
    pieces = [("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
               "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
               "--outStd SAM "
               "--outSAMunmapped Within --outSAMattributes %s"
               % " ".join(ALIGN_TAGS))]
    if is_gzipped(fastq_file):
        pieces.append(" --readFilesCommand zcat ")
    pieces.append(_read_group_option(names))
    if utils.get_in(data, ("config", "algorithm", "fusion_mode"), False):
        pieces.append(" --chimSegmentMin 15 --chimJunctionOverhangMin 15")
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        pieces.append(" --outSAMstrandField intronMotif ")
    if dd.get_rsem(data) and not is_transcriptome_broken():
        pieces.append(" --quantMode TranscriptomeSAM ")

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        pieces.append("| {sam_to_bam} | {sort} -o {tx_final_out} ")
        cmd = "".join(pieces)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            # Placeholders are resolved from the local names of this function.
            do.run(cmd.format(**locals()), run_message, None)
    return _update_data(final_out, out_dir, names, data)
def filter_barcodes(data):
    """Filter reads by cellular barcode with ``umis cb_filter``.

    Filtering is skipped when the data is marked as demultiplexed or when no
    cellular barcodes are configured. Otherwise the first input FASTQ is
    filtered into ``<work_dir>/umis/<sample>.filtered.fq.gz`` and
    ``data["files"]`` is pointed at that file. Up to three barcode files are
    passed as ``--bc1``/``--bc2``/``--bc3``.

    Args:
        data: sample dictionary.

    Returns:
        ``[[data]]``.
    """
    # if data was pre-demultiplexed, there is no need to filter the barcodes
    if dd.get_demultiplexed(data):
        return [[data]]
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        logger.info("No cellular barcodes found, skipping filtering.")
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    if isinstance(bc, six.string_types):
        # A single barcode file given as a plain string. The length checks
        # below must not see it, or the path would be split into characters.
        bc1 = bc
    else:
        if len(bc) >= 1:
            bc1 = bc[0]
        if len(bc) > 1:
            bc2 = bc[1]
        if len(bc) == 3:
            bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "

    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped

    Returns the path of the gzipped file, or in_file unchanged when it is not
    a FASTQ file or is already gzipped. An existing ``<in_file>.gz`` is
    reused.
    """
    if fastq.is_fastq(in_file) and not utils.is_gzipped(in_file):
        gzipped_file = in_file + ".gz"
        if file_exists(gzipped_file):
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        # Write through a transaction so an interrupted gzip cannot leave a
        # truncated file that a later call would accept as complete.
        with file_transaction(gzipped_file) as tx_gzipped_file:
            do.run("gzip -c {in_file} > {tx_gzipped_file}".format(
                in_file=in_file, tx_gzipped_file=tx_gzipped_file), message)
        return gzipped_file
    return in_file
def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped

    Files whose path starts with one of ``utils.SUPPORTED_REMOTES`` are left
    alone. Returns the path of the gzipped file, or in_file unchanged when no
    compression is needed. An existing ``<in_file>.gz`` is reused.
    """
    if (fastq.is_fastq(in_file) and not utils.is_gzipped(in_file)
            and not in_file.startswith(utils.SUPPORTED_REMOTES)):
        gzipped_file = in_file + ".gz"
        if file_exists(gzipped_file):
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        # Write through a transaction so an interrupted gzip cannot leave a
        # truncated file that a later call would accept as complete.
        with file_transaction(gzipped_file) as tx_gzipped_file:
            do.run("gzip -c {in_file} > {tx_gzipped_file}".format(
                in_file=in_file, tx_gzipped_file=tx_gzipped_file), message)
        return gzipped_file
    return in_file
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run STAR on the given reads and produce a sorted BAM for the sample.

    ``--quantMode TranscriptomeSAM`` is added when ``dd.get_rsem`` is set and
    ``is_transcriptome_broken`` reports no problem. The BAM is sorted using a
    temporary directory obtained from ``tx_tmpdir``.

    Args:
        fastq_file: first (or only) FASTQ file.
        pair_file: second FASTQ file of a pair; falsy for single-end input.
        ref_file: value passed to STAR's ``--genomeDir``.
        names: naming dictionary (``sample`` key, read-group information).
        align_dir: directory used for the STAR output prefix.
        data: sample dictionary.

    Returns:
        ``data`` as updated by ``_update_data`` with the final BAM.

    Exits the process with status 1 when ``ref_file`` is empty.
    """
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return _update_data(final_out, out_dir, names, data)

    star_path = config_utils.get_program("STAR", config)
    fastq = fastq_file
    if pair_file:
        fastq = " ".join([fastq_file, pair_file])
    num_cores = config["algorithm"].get("num_cores", 1)
    safe_makedir(align_dir)

    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s"
           % " ".join(ALIGN_TAGS))
    if is_gzipped(fastq_file):
        cmd += " --readFilesCommand zcat "
    cmd += _read_group_option(names)
    if utils.get_in(data, ("config", "algorithm", "fusion_mode"), False):
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "
    if dd.get_rsem(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            # Placeholders are resolved from the local names of this function.
            do.run(cmd.format(**locals()), run_message, None)
    return _update_data(final_out, out_dir, names, data)
def rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, ref_file, data):
    """Pseudomap reads with ``rapmap pseudomap`` into a sorted BAM.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        rapmap_dir: output directory for the index and the BAM file.
        gtf_file: annotation passed to ``rapmap_index``.
        ref_file: reference passed to ``rapmap_index``.
        data: sample dictionary.

    Returns:
        Path to ``<rapmap_dir>/<samplename>.bam``; returned directly when it
        already exists.
    """
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    # Build the index inside rapmap_dir; the previous code passed an
    # undefined ``sailfish_dir`` name here.
    rapmap_idx = rapmap_index(gtf_file, ref_file, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    rapmap = config_utils.get_program("rapmap", data["config"])
    cmd = ("{rapmap} pseudomap -i {rapmap_idx} -t {num_cores} ")
    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    message = "pseudomapping transcripts in {fq1} and {fq2}."
    with file_transaction(data, out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped

    Files that ``objectstore.is_remote`` reports as remote are left alone.
    Returns the path of the gzipped file, or in_file unchanged when no
    compression is needed. An existing ``<in_file>.gz`` is reused.
    """
    if (fastq.is_fastq(in_file) and not utils.is_gzipped(in_file)
            and not objectstore.is_remote(in_file)):
        gzipped_file = in_file + ".gz"
        if file_exists(gzipped_file):
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        # Write through a transaction so an interrupted gzip cannot leave a
        # truncated file that a later call would accept as complete.
        with file_transaction(gzipped_file) as tx_gzipped_file:
            do.run("gzip -c {in_file} > {tx_gzipped_file}".format(
                in_file=in_file, tx_gzipped_file=tx_gzipped_file), message)
        return gzipped_file
    return in_file
def rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, ref_file, data):
    """Pseudomap reads with ``rapmap pseudomap`` into a sorted BAM.

    Args:
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file of a pair; falsy for single-end input.
        rapmap_dir: output directory for the index and the BAM file.
        gtf_file: annotation passed to ``rapmap_pseudoindex``.
        ref_file: reference passed to ``rapmap_pseudoindex``.
        data: sample dictionary.

    Returns:
        Path to ``<rapmap_dir>/<samplename>.bam``; returned directly when it
        already exists.
    """
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_index = rapmap_pseudoindex(gtf_file, ref_file, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} pseudomap -t {num_cores} -i {rapmap_index} "
    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        # Mate 1 must come from fq1; the second mate was previously passed
        # to both -1 and -2.
        cmd += "-1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("Pseudomapping %s and %s to %s with Rapmap. "
                       % (fq1, fq2, rapmap_index))
        do.run(cmd.format(**locals()), run_message, None)
    return out_file
def barcode_histogram(data):
    """Write cellular barcode counts for the first input FASTQ.

    Runs ``umis cb_histogram`` and stores the result as
    ``<work_dir>/umis/<sample>/cb-histogram.txt``; an existing file is left
    as is. Always returns ``[[data]]``.
    """
    reads = dd.get_input_sequence_files(data)[0]
    sample_dir = os.path.join(dd.get_work_dir(data), "umis",
                              dd.get_sample_name(data))
    umis_exe = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    histogram = os.path.join(sample_dir, "cb-histogram.txt")
    if file_exists(histogram):
        return [[data]]

    # Gzipped input is streamed through process substitution.
    if is_gzipped(reads):
        reads_arg = "<(gzip -cd {fq1})".format(fq1=reads)
    else:
        reads_arg = "{fq1}".format(fq1=reads)
    template = "{umis} cb_histogram {fq1_cmd} > {tx_out_file}"
    with file_transaction(histogram) as tx_out_file:
        command = template.format(umis=umis_exe, fq1_cmd=reads_arg,
                                  tx_out_file=tx_out_file)
        do.run(command, "Computing cellular barcode counts for %s." % reads)
    return [[data]]
def _gzip_fastq(in_file):
    """
    Return a gzipped version of a local fastq file.

    Bzipped files are handed to ``_bzip_gzip``; plain files are compressed to
    ``<in_file>.gz`` inside a file transaction (an existing ``.gz`` is
    reused). Non-fastq, remote and already gzipped inputs are returned
    unchanged.
    """
    if not fastq.is_fastq(in_file) or objectstore.is_remote(in_file):
        return in_file
    if utils.is_bzipped(in_file):
        return _bzip_gzip(in_file)
    if utils.is_gzipped(in_file):
        return in_file

    target = in_file + ".gz"
    if file_exists(target):
        return target
    note = "gzipping {in_file}.".format(in_file=in_file)
    with file_transaction(target) as tx_target:
        command = "gzip -c {in_file} > {tx_gzipped_file}".format(
            in_file=in_file, tx_gzipped_file=tx_target)
        do.run(command, note)
    return target
def filter_barcodes(data):
    """Filter reads by cellular barcode with ``umis cb_filter``.

    Filtering is skipped when no cellular barcodes are configured. Otherwise
    the first input FASTQ is filtered into
    ``<work_dir>/umis/<sample>.filtered.fq.gz`` and ``data["files"]`` is
    pointed at that file. One or two barcode files are passed as
    ``--bc1``/``--bc2``.

    Args:
        data: sample dictionary.

    Returns:
        ``[[data]]``.
    """
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = dd.get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    bc1 = None
    bc2 = None
    if isinstance(bc, basestring):
        # A single barcode file given as a plain string. The length checks
        # below must not see it, or a short path would be split into
        # characters.
        bc1 = bc
    elif len(bc) == 1:
        bc1 = bc[0]
    elif len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "

    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def filter_barcodes(data):
    """Filter reads by cellular barcode with ``umis cb_filter``.

    Barcodes come from ``get_cellular_barcodes``; filtering is skipped when
    there are none. Otherwise the first input FASTQ is filtered into
    ``<work_dir>/umis/<sample>.filtered.fq.gz`` and ``data["files"]`` is
    pointed at that file. One or two barcode files are passed as
    ``--bc1``/``--bc2``.

    Args:
        data: sample dictionary.

    Returns:
        ``[[data]]``.
    """
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    bc1 = None
    bc2 = None
    if isinstance(bc, basestring):
        # A single barcode file given as a plain string. The length checks
        # below must not see it, or a short path would be split into
        # characters.
        bc1 = bc
    elif len(bc) == 1:
        bc1 = bc[0]
    elif len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "

    # Gzipped input is streamed through process substitution.
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run STAR on one or two FASTQ files and return the sorted BAM path.

    STAR writes SAM to stdout, which is piped through the SAM-to-BAM and sort
    commands built by the ``bam`` helpers.

    Args:
        fastq_file: First (or only) FASTQ file; also decides whether STAR
            reads its input through ``zcat``.
        pair_file: Second FASTQ file for paired input, or a falsy value.
        ref_file: Value passed to STAR's ``--genomeDir``.
        names: Mapping providing at least ``"lane"`` and ``"sample"``; also
            handed to ``_read_group_option``.
        align_dir: Directory holding the alignment output.
        data: Sample dictionary; ``data["config"]`` is read directly.

    Returns:
        ``<align_dir>/<lane>_star/<sample>.bam``. If that file already exists
        it is returned without running STAR.
    """
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    star_path = config_utils.get_program("STAR", config)
    if pair_file:
        fastq = " ".join([fastq_file, pair_file])
    else:
        fastq = fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)
    safe_makedir(align_dir)

    # Command fragments are collected in order and joined once at the end.
    pieces = [
        ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
         "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
         "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
         "--outStd SAM "
         "--outSAMunmapped Within --outSAMattributes %s"
         % " ".join(ALIGN_TAGS)),
    ]
    if is_gzipped(fastq_file):
        pieces.append(" --readFilesCommand zcat ")
    pieces.append(_read_group_option(names))

    if utils.get_in(data, ("config", "algorithm", "fusion_mode"), False):
        pieces.append(" --chimSegmentMin 15 --chimJunctionOverhangMin 15")

    strandedness = utils.get_in(
        data, ("config", "algorithm", "strandedness"), "unstranded").lower()
    if strandedness == "unstranded":
        pieces.append(" --outSAMstrandField intronMotif ")

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        pieces.append("| {sam_to_bam} | {sort} -o {tx_final_out} ")
        cmd = "".join(pieces)
        run_message = "Running STAR aligner on %s and %s." % (fastq_file,
                                                              ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            # Placeholders are filled from local variables, so the local
            # names above must match the names used in the template.
            do.run(cmd.format(**locals()), run_message, None)
    return final_out
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with STAR, piping its SAM output into a sorted BAM.

    Analyses whose name starts with ``smallrna-seq`` (case-insensitive) get
    small-RNA settings: up to 1000 multimappers, ``--alignIntronMax 1``, and
    no splice-junction, strand-field or transcriptome-quantification options.

    Args:
        fastq_file: First (or only) FASTQ file; also decides whether STAR
            reads its input through ``zcat``.
        pair_file: Second FASTQ file for paired input, or a falsy value.
        ref_file: STAR genome index location. A path ending in ``chrLength``
            is replaced by its containing directory.
        names: Naming information handed to ``_get_star_dirnames``,
            ``_read_group_option`` and ``_update_data``.
        align_dir: Alignment output directory; the run happens inside a file
            transaction on this directory.
        data: Sample dictionary; ``data["analysis"]`` and ``data["config"]``
            are read directly.

    Returns:
        The result of ``_update_data`` for the final output. If the final
        output already exists, STAR is not run.

    Note:
        Logs an error and calls ``sys.exit(1)`` when ``ref_file`` is falsy.
    """
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    srna = data["analysis"].lower().startswith("smallrna-seq")
    max_hits = 1000 if srna else 10
    srna_opts = "--alignIntronMax 1" if srna else ""

    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq_files = (" ".join([fastq_file, pair_file])
                   if pair_file else fastq_file)
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        # Recompute the output layout relative to the transactional directory.
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd SAM {srna_opts} "
            "--outSAMunmapped Within --outSAMattributes %s "
            % " ".join(ALIGN_TAGS))
        if not srna:
            cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
        if is_gzipped(fastq_file):
            cmd += " --readFilesCommand zcat "
        cmd += _read_group_option(names)

        fusion_mode = utils.get_in(
            data, ("config", "algorithm", "fusion_mode"), False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")

        strandedness = utils.get_in(
            data, ("config", "algorithm", "strandedness"),
            "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        # Placeholders are filled from local variables, so the local names
        # above must match the names used in the template. Helper-built
        # fragments are part of the template too, hence locals() is kept.
        do.run(cmd.format(**locals()), run_message, None)

    # Report the final (non-transactional) locations back on the sample.
    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data