def _get_gtf(config):
    gtf = config["annotation"].get("file", None)
    #gtf = config.get("gtf", None)
    if not gtf or not file_exists(gtf):
        logger.error("genebody_coverage needs a GTF file passed to it.")
        exit(1)
    return gtf
def align(fastq_file, pair_file, ref_file, out_base, align_dir, config,
          names=None):
    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    out_file = os.path.join(out_dir, _out_fnames[0])
    if file_exists(out_file):
        return os.path.join(out_dir, "%s.sam" % out_base)
    if not _bowtie_ref_match(ref_file, config):
        logger.error("Bowtie version %d was detected but the reference "
                     "file %s is built for version %d. Download version "
                     "%d or build it with bowtie-build."
                     % (_bowtie_major_version(config), ref_file,
                        _ref_version(ref_file), _bowtie_major_version(config)))
        exit(1)
    # pass the provided names through rather than discarding them with
    # names=None
    out_files = tophat_align(fastq_file, pair_file, ref_file, out_base,
                             align_dir, config, names=names)
    return out_files
def demultiplex_samples(data):
    """
    demultiplex a fastqtransformed FASTQ file into separate sample barcode files
    """
    work_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
    demulti_dir = os.path.join(sample_dir, "demultiplexed")

    files = data["files"]
    if len(files) == 2:
        logger.error("Sample demultiplexing doesn't handle paired-end reads, "
                     "but we can add it. Open an issue here "
                     "https://github.com/bcbio/bcbio-nextgen/issues if you "
                     "need this and we'll add it.")
        sys.exit(1)
    else:
        fq1 = files[0]
    # check if samples need to be demultiplexed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "SAMPLE_" not in read:
            return [[data]]
    bcfile = get_sample_barcodes(dd.get_sample_barcodes(data), sample_dir)
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    if demultiplexed:
        return [split_demultiplexed_sampledata(data, demultiplexed)]
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} demultiplex_samples --nedit 1 --barcodes {bcfile} "
           "--out_dir {tx_dir} {fq1}")
    msg = "Demultiplexing {fq1}."
    with file_transaction(data, demulti_dir) as tx_dir:
        do.run(cmd.format(**locals()), msg.format(**locals()))
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    return [split_demultiplexed_sampledata(data, demultiplexed)]
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)
    if variantcaller:
        if "gatk-haplotype" in variantcaller:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)
    if dd.get_vrn_file(data):
        ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                          population.do_db_build([data]))
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    return [[data]]
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    demultiplexed = run_parallel("demultiplex_samples", samples)
    # break demultiplexed lanes into their own samples
    samples = []
    for lane in demultiplexed:
        for index in lane:
            samples.append([index])
    samples = run_parallel("run_filter_barcodes", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_index", [samples])
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
        samples = run_parallel("run_concatenate_sparse_counts", [samples])
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error("%s is not supported for singlecell RNA-seq "
                     "quantification." % quantifier)
        sys.exit(1)
    samples = scrnaseq_concatenate_metadata(samples)
    singlecellexperiment.make_scrnaseq_object(samples)
    return samples
def main(config, view):
    # make the needed directories; list() forces evaluation since map is
    # lazy under Python 3
    list(map(safe_makedir, config["dir"].values()))

    # specific for project
    human_input = find_sam_files(config["input_dir_human"])
    mouse_input = find_sam_files(config["input_dir_mouse"])
    if len(human_input) != len(mouse_input):
        logger.error("The number of human SAM files does not match the "
                     "number of mouse SAM files, aborting.")
        sys.exit(1)
    # materialize the pairs so they can be logged and then iterated
    input_files = list(zip(human_input, mouse_input))

    curr_files = input_files

    logger.info("Running disambiguation pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = Disambiguate(config)
            out_files = list(flatten(view.map(disambiguate, curr_files)))
            bam_files = view.map(sam.sam2bam, out_files)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)
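# A minimal, self-contained sketch (not pipeline code) of the Python 3 pitfall
# fixed above: map() and zip() return lazy iterators, so side effects never
# run and a second pass sees nothing unless the result is materialized.
made = []
map(made.append, ["a", "b"])        # lazy: nothing is appended
assert made == []
list(map(made.append, ["a", "b"]))  # forced: side effects happen
assert made == ["a", "b"]
pairs = zip([1, 2], [3, 4])
assert list(pairs) == [(1, 3), (2, 4)] and list(pairs) == []  # exhausted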
def load_summarizedexperiment(samples):
    """ create summarizedexperiment rds object
    fails with n_samples = 1 """
    # using r36 (4.0) - will eventually drop R3.5
    rcmd = Rscript_cmd("r36")
    se_script = os.path.join(os.path.dirname(__file__), os.pardir,
                             "scripts", "R", "bcbio2se.R")
    data = samples[0][0]
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "salmon")
    summarized_experiment = os.path.join(out_dir, "bcbio-se.rds")
    if not file_exists(summarized_experiment):
        with file_transaction(summarized_experiment) as tx_out_file:
            cmd = f"{rcmd} --vanilla {se_script} {work_dir} {tx_out_file}"
            message = "Loading SummarizedExperiment."
            try:
                do.run(cmd, message)
            except Exception:
                logger.error("SE creation failed")
    if file_exists(summarized_experiment):
        try:
            se_qc_report = generate_se_qc_report(work_dir)
        except Exception:
            se_qc_report = None
            logger.error("SE QC failed")
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_summarized_experiment(data, summarized_experiment)
            updated_samples.append([data])
        return updated_samples
    else:
        return samples
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: When a BAM file is given as input, bcbio "
                        "skips removal of multimappers. If the BAM is not "
                        "cleaned before peak calling, this can result in "
                        "downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data),
                                             dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation",
                            "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed,
                                        data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                   dd.get_work_bam(data), data)
    return [[data]]
def make_bcbiornaseq_object(data):
    """
    load the initial bcb.rda object using bcbioRNASeq
    """
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    safe_makedir(report_dir)
    organism = dd.get_bcbiornaseq(data).get("organism", None)
    groups = dd.get_bcbiornaseq(data).get("interesting_groups", None)
    loadstring = create_load_string(upload_dir, groups, organism, "gene")
    r_file = os.path.join(report_dir, "load_bcbioRNAseq.R")
    with file_transaction(r_file) as tmp_file:
        memoize_write_file(loadstring, tmp_file)
    rcmd = Rscript_cmd(env="rbcbiornaseq")
    with chdir(report_dir):
        do.run([rcmd, "--vanilla", r_file], "Loading bcbioRNASeq object.")
        # bcbiornaseq 0.3.44 writes to data/bcb.rds
        write_counts(os.path.join(report_dir, "data", "bcb.rds"), "gene")
    loadstring = create_load_string(upload_dir, groups, organism, "transcript")
    r_file = os.path.join(report_dir, "load_transcript_bcbioRNAseq.R")
    with file_transaction(r_file) as tmp_file:
        memoize_write_file(loadstring, tmp_file)
    rcmd = Rscript_cmd(env="rbcbiornaseq")
    with chdir(report_dir):
        do.run([rcmd, "--vanilla", r_file],
               "Loading transcript-level bcbioRNASeq object.")
        write_counts(os.path.join(report_dir, "data-transcript", "bcb.rds"),
                     "transcript")
    try:
        make_quality_report(data)
    except Exception:
        logger.error("bcbiornaseq error at quality report")
    finally:
        return data
def demultiplex_samples(data):
    """
    demultiplex a fastqtransformed FASTQ file into separate sample barcode files
    """
    files = data["files"]
    if len(files) == 2:
        logger.error("Sample demultiplexing doesn't handle paired-end reads, "
                     "but we can add it. Open an issue here "
                     "https://github.com/chapmanb/bcbio-nextgen/issues if you "
                     "need this and we'll add it.")
        sys.exit(1)
    else:
        fq1 = files[0]
    # check if samples need to be demultiplexed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "SAMPLE_" not in read:
            return [[data]]
    bcfile = dd.get_sample_barcodes(data)
    if not bcfile:
        logger.error("Sample demultiplexing needs a list of known indexes "
                     "provided via the sample_barcodes option in the "
                     "algorithm section.")
        sys.exit(1)
    work_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
    demulti_dir = os.path.join(sample_dir, "demultiplexed")
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    if demultiplexed:
        return [split_demultiplexed_sampledata(data, demultiplexed)]
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} demultiplex_samples --nedit 1 --barcodes {bcfile} "
           "--out_dir {tx_dir} {fq1}")
    msg = "Demultiplexing {fq1}."
    with file_transaction(data, demulti_dir) as tx_dir:
        do.run(cmd.format(**locals()), msg.format(**locals()))
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    return [split_demultiplexed_sampledata(data, demultiplexed)]
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files):
    if quality_format == "illumina":
        quality_base = "64"
    else:
        quality_base = "33"

    # --times=2 tries twice to remove adapters, which will allow things like:
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    base_cmd = ["cutadapt", "--times=" + "2", "--quality-base=" + quality_base,
                "--quality-cutoff=20", "--format=fastq", "--minimum-length=0"]
    adapter_cmd = map(lambda x: "--adapter=" + x, adapters)
    base_cmd.extend(adapter_cmd)

    if all(map(file_exists, out_files)):
        return out_files

    for in_file, out_file in zip(fastq_files, out_files):
        # if you pass an output filename, cutadapt will write some stats
        # about trimmed adapters to stdout. stat_file captures that.
        stat_file = replace_suffix(out_file, ".trim_stats.txt")
        with open(stat_file, "w") as stat_handle:
            cmd = list(base_cmd)
            cmd.extend(["--output=" + out_file, in_file])
            try:
                return_value = subprocess.check_call(cmd, stdout=stat_handle)
            except subprocess.CalledProcessError:
                cmd_string = subprocess.list2cmdline(cmd)
                logger.error("Cutadapt returned an error. The command "
                             "used to run cutadapt was: %s." % (cmd_string))
                exit(1)
    return out_files
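# A self-contained sketch (illustrative paths and adapter, not from the
# pipeline) of the command _cutadapt_trim assembles for one single-end
# sanger-quality (base 33) file.
adapters = ["AGATCGGAAGAG"]
base_cmd = ["cutadapt", "--times=2", "--quality-base=33",
            "--quality-cutoff=20", "--format=fastq", "--minimum-length=0"]
base_cmd.extend("--adapter=" + a for a in adapters)
base_cmd.extend(["--output=sample.trimmed.fastq", "sample.fastq"])
print(" ".join(base_cmd))
# cutadapt --times=2 --quality-base=33 --quality-cutoff=20 --format=fastq \
#   --minimum-length=0 --adapter=AGATCGGAAGAG --output=sample.trimmed.fastq sample.fastq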
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    demultiplexed = run_parallel("demultiplex_samples", samples)
    # break demultiplexed lanes into their own samples
    samples = []
    for lane in demultiplexed:
        for index in lane:
            samples.append([index])
    if not samples:
        logger.error("No samples were found matching the supplied sample "
                     "barcodes. See "
                     "https://github.com/bcbio/bcbio-nextgen/issues/3428#issuecomment-772609904 "
                     "for how to debug this issue.")
        sys.exit(1)
    samples = run_parallel("run_filter_barcodes", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_index", [samples])
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
        samples = run_parallel("run_concatenate_sparse_counts", [samples])
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error("%s is not supported for singlecell RNA-seq "
                     "quantification." % quantifier)
        sys.exit(1)
    samples = scrnaseq_concatenate_metadata(samples)
    singlecellexperiment.make_scrnaseq_object(samples)
    return samples
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = data["analysis"].lower().startswith("smallrna-seq")
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
               "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
               "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
               "--outStd SAM {srna_opts} "
               "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
def _run_with_possible_error_message(cmd, **kwargs):
    try:
        subprocess.check_call(cmd, **kwargs)
    except subprocess.CalledProcessError:
        cmd_string = subprocess.list2cmdline(cmd)
        logger.error("Cutadapt returned an error. The command "
                     "used to run cutadapt was: %s." % (cmd_string))
        exit(1)
def _check_bowtie(ref_file, config):
    if not _bowtie_ref_match(ref_file, config):
        logger.error("Bowtie version %d was detected but the reference "
                     "file %s is built for version %d. Download version "
                     "%d or build it with bowtie-build."
                     % (_bowtie_major_version(config), ref_file,
                        _ref_version(ref_file), _bowtie_major_version(config)))
        exit(1)
def combine_pairs(input_files):
    """
    Call files pairs if they are completely the same except one has _1 and
    the other has _2. Returns a list of tuples of pairs or singles.
    From bipy.utils (https://github.com/roryk/bipy/blob/master/bipy/utils.py)
    Adjusted to allow different input paths or extensions for matching files.
    """
    PAIR_FILE_IDENTIFIERS = set(["1", "2", "3"])

    pairs = []
    used = set([])
    for in_file in input_files:
        if in_file in used:
            continue
        for comp_file in input_files:
            if comp_file in used or comp_file == in_file:
                continue
            a = rstrip_extra(utils.splitext_plus(os.path.basename(in_file))[0])
            b = rstrip_extra(utils.splitext_plus(os.path.basename(comp_file))[0])
            if len(a) != len(b):
                continue
            s = dif(a, b)
            # no differences, then it's the same file stem
            if len(s) == 0:
                logger.error("%s and %s have the same stem, so we don't know "
                             "how to assign it to the sample data in the CSV. To "
                             "get around this you can rename one of the files. "
                             "If they are meant to be the same sample run in two "
                             "lanes, combine them first with the "
                             "bcbio_prepare_samples.py script. "
                             "(http://bcbio-nextgen.readthedocs.io/en/latest/contents/configuration.html#multiple-files-per-sample)"
                             % (in_file, comp_file))
                sys.exit(1)
            if len(s) > 1:
                continue  # there is only 1 difference
            if (a[s[0]] in PAIR_FILE_IDENTIFIERS and
                    b[s[0]] in PAIR_FILE_IDENTIFIERS):
                # if the 1/2 isn't the last digit before a separator, skip
                # this skips stuff like 2P 2A, often denoting replicates, not
                # read pairings
                if len(b) > (s[0] + 1):
                    if b[s[0] + 1] not in ("_", "-", "."):
                        continue
                # only pair when the 1/2 is preceded by R or a separator
                if b[s[0] - 1] in ("R", "_", "-", "."):
                    used.add(in_file)
                    used.add(comp_file)
                    if b[s[0]] > a[s[0]]:
                        pairs.append([in_file, comp_file])
                    else:
                        pairs.append([comp_file, in_file])
                    break
        if in_file not in used:
            pairs.append([in_file])
            used.add(in_file)
    return pairs
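# A self-contained sketch of the pairing rule above: two stems pair when they
# are the same length, differ at exactly one position, both hold a read
# identifier there, and the identifier is preceded by "R" or a separator.
# combine_pairs layers extension stripping and ordering on top of this.
def _diff_positions(a, b):
    return [i for i in range(len(a)) if a[i] != b[i]]

s = _diff_positions("sampleA_R1", "sampleA_R2")
assert s == [9]
assert "sampleA_R1"[s[0] - 1] == "R"   # pairs: R-prefixed read identifier
assert _diff_positions("rep_2P", "rep_2A") == [5]  # replicate tags, not a pair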
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No compatible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
        return data

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = dd.get_num_cores(data)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
    instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
    # override instances if specified in the config
    if resources and resources.get("bismark_threads"):
        instances = resources.get("bismark_threads")
        logger.info(f"Using {instances} bismark instances - overridden by resources")
    bowtie_threads = 1
    if resources and resources.get("bowtie_threads"):
        bowtie_threads = resources.get("bowtie_threads")
    logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")
    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = ("{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} "
           "--gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} "
           "--unmapped {ref_file} {fastq_file}")
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    # don't process bam in the bismark pipeline!
    utils.symlink_plus(raw_bam[0], final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No compatible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. We don't provide the bismark "
                     "indexes by default because they are very large. You can "
                     "install the index for your genome with: "
                     "bcbio_nextgen.py upgrade --aligners bismark "
                     "--genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = resources.get("cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G"))
    n = min(max(int(max_cores / 5), 1),
            max(int(max_mem / config_utils.convert_to_bytes("12G")), 1))

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = ("{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} "
           "--gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}")
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample,
                               dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data
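# A self-contained sketch (hypothetical helper, mirroring the arithmetic
# above) of how the number of bismark instances falls out of cores and
# memory: roughly 5 cores and 12GB per instance, never less than one.
def bismark_instances(cores, mem_gb):
    return min(max(cores // 5, 1), max(mem_gb // 12, 1))

assert bismark_instances(16, 64) == 3   # core-bound: 16 // 5 == 3
assert bismark_instances(32, 24) == 2   # memory-bound: 24 // 12 == 2
assert bismark_instances(2, 4) == 1     # floor of one instance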
def _get_quality_format(data):
    SUPPORTED_FORMATS = ["illumina", "standard"]
    quality_format = dd.get_quality_format(data).lower()
    if quality_format not in SUPPORTED_FORMATS:
        logger.error("quality_format is set to an unsupported format. "
                     "Supported formats are %s."
                     % (", ".join(SUPPORTED_FORMATS)))
        exit(1)
    return quality_format
def _get_pipeline(item):
    from bcbio.log import logger
    analysis_type = item.get("analysis", "").lower()
    if analysis_type not in SUPPORTED_PIPELINES:
        logger.error("Cannot determine which type of analysis to run, "
                     "set in the run_info under details.")
        sys.exit(1)
    else:
        return SUPPORTED_PIPELINES[analysis_type]
def index(data, bam_fpath):
    cmdl = make_command(data, "index", bam_fpath)
    indexed_bam = bam_fpath + ".bai"
    if not utils.file_uptodate(indexed_bam, bam_fpath):
        do.run(cmdl, "Indexing BAM file using sambamba")
    if not utils.file_exists(indexed_bam):
        logger.error("Cannot index BAM file " + bam_fpath + " using sambamba.")
        return None
    return indexed_bam
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return data
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed. Assuming non-umi data." % fq1)
            return data

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error("The UMI transform can be specified as either a file or a "
                         "bcbio-supported transform. Either the file %s does not exist "
                         "or the transform is not supported by bcbio. Supported "
                         "transforms are %s."
                         % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return data
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return data
    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return data
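# Illustrative only: after a umis-style fastqtransform, the cell barcode and
# UMI travel in the read name, which is what the "UMI_" check above detects.
# The exact name layout here is an assumption, not the umis specification.
read_name = "@read1:CELL_ACGTACGT:UMI_TTAGGCCA"
assert "UMI_" in read_name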
def _get_quality_format(config):
    SUPPORTED_FORMATS = ["illumina", "standard"]
    quality_format = config["algorithm"].get("quality_format", "standard").lower()
    if quality_format not in SUPPORTED_FORMATS:
        logger.error("quality_format is set to an unsupported format. "
                     "Supported formats are %s."
                     % (", ".join(SUPPORTED_FORMATS)))
        exit(1)
    return quality_format
def _get_pipeline(item):
    from bcbio.log import logger
    SUPPORTED_PIPELINES = {x.name.lower(): x
                           for x in utils.itersubclasses(AbstractPipeline)}
    analysis_type = item.get("analysis", "").lower()
    if analysis_type not in SUPPORTED_PIPELINES:
        logger.error("Cannot determine which type of analysis to run, "
                     "set in the run_info under details.")
        sys.exit(1)
    else:
        return SUPPORTED_PIPELINES[analysis_type]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" % " ".join(ALIGN_TAGS))
    cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "
    if dd.get_rsem(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "
    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            do.run(cmd.format(**locals()), run_message, None)
    data = _update_data(final_out, out_dir, names, data)
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = data["analysis"].lower().startswith("smallrna-seq")
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "
    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        cmd += " --quantMode TranscriptomeSAM "
    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
    data = _update_data(final_out, out_dir, names, data)
    return data
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples avoiding errors due to samples.
    """
    # output file name needs to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = dd.get_antibody(data)
    if antibody:
        antibody = antibody.lower()
        if antibody not in antibodies.SUPPORTED_ANTIBODIES:
            logger.error(f"{antibody} specified, but not listed as a supported "
                         f"antibody. Valid antibodies are "
                         f"{antibodies.SUPPORTED_ANTIBODIES}. If you know your "
                         f"antibody should be called with narrow or broad peaks, "
                         f"supply 'narrow' or 'broad' as the antibody. "
                         f"It will run 'narrow' if the antibody is not supported.")
            antibody = 'narrow'
        antibody = antibodies.ANTIBODIES[antibody]
        logger.info(f"{antibody.name} specified, using {antibody.peaktype} "
                    f"peak settings.")
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info("ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error. "
                                 "Please, check the message and report "
                                 "error if it is related to bcbio. "
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
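# A self-contained sketch of the genome-size handling above: the computed
# default is dropped whenever the user already passed -g through resources.
options = "--broad -g hs"
genome_size = "" if options.find("-g") > -1 else "-g %s" % 3100000000
assert genome_size == ""
options = "--broad"
genome_size = "" if options.find("-g") > -1 else "-g %s" % 3100000000
assert genome_size == "-g 3100000000"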
def _ref_version(ref_file):
    _, ext = os.path.splitext(glob.glob(ref_file + "*")[0])
    if ext == ".ebwt":
        return 1
    elif ext == ".bt2":
        return 2
    else:
        logger.error("Cannot detect which reference version %s is. "
                     "Should end in either .ebwt (bowtie) or .bt2 "
                     "(bowtie2)." % (ref_file))
        exit(1)
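# Self-contained check of the extension dispatch above: splitext keeps only
# the final extension, so bowtie's "ref.1.ebwt" maps to version 1 and
# bowtie2's "ref.1.bt2" to version 2.
import os
assert os.path.splitext("ref.1.ebwt")[1] == ".ebwt"
assert os.path.splitext("ref.1.bt2")[1] == ".bt2"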
def get_sample_barcodes(fn, out_dir):
    if not fn:
        logger.error("Sample demultiplexing needs a list of known indexes "
                     "provided via the sample_barcodes option in the "
                     "algorithm section.")
        sys.exit(1)
    utils.safe_makedir(out_dir)
    out_fn = os.path.join(out_dir, "barcodes.csv")
    with open(fn) as inh:
        with open(out_fn, 'w') as outh:
            for line in inh:
                outh.write("%s\n" % (line.strip().split(",")[0]))
    return out_fn
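# A minimal sketch of the rewrite above with a hypothetical input line: only
# the first comma-separated column (the index sequence) is kept.
line = "ACGTACGT,sample1\n"
assert line.strip().split(",")[0] == "ACGTACGT"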
def htseq_reader(align_file):
    """
    returns a read-by-read sequence reader for a BAM or SAM file
    """
    if bam.is_sam(align_file):
        read_seq = HTSeq.SAM_Reader(align_file)
    elif bam.is_bam(align_file):
        read_seq = HTSeq.BAM_Reader(align_file)
    else:
        logger.error("%s is not a SAM or BAM file" % (align_file))
        sys.exit(1)
    return read_seq
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4 - len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error("The UMI transform can be specified as either a file or a "
                         "bcbio-supported transform. Either the file %s does not exist "
                         "or the transform is not supported by bcbio. Supported "
                         "transforms are %s."
                         % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4 - len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error("The UMI transform can be specified as either a file or a "
                         "bcbio-supported transform. Either the file %s does not exist "
                         "or the transform is not supported by bcbio. Supported "
                         "transforms are %s."
                         % (transform_file, ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def includes_missingalt(data):
    """
    As of GATK 4.1.0.0, variants with missing alts are generated
    (see https://github.com/broadinstitute/gatk/issues/5650)
    """
    MISSINGALT_VERSION = LooseVersion("4.1.0.0")
    version = LooseVersion(broad.get_gatk_version(config=dd.get_config(data)))
    try:
        return version >= MISSINGALT_VERSION
    except TypeError:
        logger.error(f"LooseVersion failing with {version} as the detected version.")
        sys.exit(1)
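# Self-contained sketch of the version gate above. Note that comparing
# LooseVersion objects can raise TypeError on oddly formed version strings
# (mixed string/int components), which is why the caller traps it.
from distutils.version import LooseVersion
assert LooseVersion("4.1.4.0") >= LooseVersion("4.1.0.0")
assert not (LooseVersion("4.0.12.0") >= LooseVersion("4.1.0.0"))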
def _cut_file(self, in_file):
    """
    run cutadapt on a single file
    """
    adapters = self._get_adapters(self.chemistry)
    out_file = self.in2trimmed(in_file)
    if file_exists(out_file):
        return out_file
    cutadapt = sh.Command(self.stage_config.get("program", "cutadapt"))

    quality_format = self.quality_format
    if not quality_format:
        quality_format = self._detect_fastq_format(in_file)
    if quality_format == "sanger":
        logger.info("Quality format detected as sanger.")
        quality_base = 33
    elif quality_format == "illumina":
        logger.info("Quality format set to illumina 1.5/1.3")
        quality_base = 64
    else:
        logger.error("Quality format could not be detected. Quality "
                     "detected or set as %s. It should be illumina "
                     "or sanger." % (quality_format))
        exit(1)

    # if we want to trim the polya tails we have to first remove
    # the adapters and then trim the tail
    if self.stage_config.get("trim_polya", True):
        temp_cut = tempfile.NamedTemporaryFile(suffix=".fastq",
                                               dir=self.out_dir)
        # trim off adapters
        cmd = str(cutadapt.bake(in_file, self.options, adapters,
                                quality_base=quality_base, out=temp_cut.name))
        do.run(cmd, "Cutadapt trim of adapters of %s." % (in_file), None)
        with file_transaction(out_file) as temp_out:
            polya = ADAPTERS.get("polya")
            # trim off polya
            cmd = str(cutadapt.bake(temp_cut.name, self.options, "-a",
                                    polya, "-a", self._rc_adapters(polya),
                                    quality_base=quality_base, out=temp_out))
            do.run(cmd, "Cutadapt trim of polyA tail of %s." % (temp_cut.name),
                   None)
        return out_file
    else:
        with file_transaction(out_file) as temp_out:
            cmd = str(cutadapt.bake(in_file, self.options, adapters,
                                    out=temp_out))
            do.run(cmd, "Cutadapt trim of %s." % (in_file))
        return out_file
def _fetch_chrom_sizes(config):
    PROGRAM = "fetchChromSizes"

    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/ "
                     "to download it." % (PROGRAM))
        exit(1)

    if "annotation" not in config:
        logger.error("'annotation' must be in the yaml file. See example "
                     "configuration files.")
        exit(1)
    if "name" not in config["annotation"]:
        logger.error("'name' must be in the yaml file under "
                     "'annotation'. See example configuration files.")
        exit(1)
    genome = config["annotation"]["name"]
    chrom_size_file = os.path.join(_results_dir(config),
                                   genome + ".sizes")
    if file_exists(chrom_size_file):
        return chrom_size_file
    with file_transaction(chrom_size_file) as tmp_chrom_size_file:
        sh.fetchChromSizes(genome, _out=tmp_chrom_size_file)
    if not file_exists(chrom_size_file):
        logger.error("chromosome size file does not exist. Check "
                     "'annotation': 'name' to make sure it is valid.")
        exit(1)
    return chrom_size_file
def run_with_config(input_file, config, stage, out_file=None):
    if out_file is None:
        out_dir = os.path.join(config["dir"].get("results", None), stage)
        out_file = os.path.join(out_dir, _get_outfilename(input_file))
        # create the directory inside the branch, so out_dir is always defined
        safe_makedir(out_dir)
    if "annotation" not in config:
        logger.error("annotation must appear in the config file, see example "
                     "configuration files.")
        exit(1)
    ref = prepare_ref_file(config["annotation"], config)
    out_file = run(input_file, ref, out_file)
    return out_file
def run_rnaseq_variant_calling(data):
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)
    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
    return [[data]]
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    samples = run_parallel("run_filter_barcodes", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error("%s is not supported for singlecell RNA-seq "
                     "quantification." % quantifier)
        sys.exit(1)
    return samples
def _check_stems(files):
    """check if stem names are the same and use full path then"""
    used = set()
    for fn in files:
        if os.path.basename(fn) in used:
            logger.error("%s appears multiple times as a stem in your file "
                         "list, so we don't know how to assign it to the "
                         "sample data in the CSV. We will use the full path "
                         "to tell the files apart, which means paired files "
                         "should be in the same folder. If this is a problem, "
                         "rename the files you want to merge."
                         % os.path.basename(fn))
            return True
        used.add(os.path.basename(fn))
    return False
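# Self-contained sketch of the check above: duplicate basenames across
# directories are what trigger the full-path fallback.
import os
files = ["lane1/sample_R1.fq", "lane2/sample_R1.fq"]
assert len({os.path.basename(f) for f in files}) < len(files)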
def bamindex(in_file, samtools="samtools"):
    """
    index a bam file
    avoids use of pysam.index which is not working for indexing as of 0.7.4
    with ipython
    """
    assert is_bam(in_file), "bamindex requires a BAM file, got %s" % in_file
    # samtools index writes the index next to the input as in_file + ".bai",
    # so build the output path by appending rather than swapping the suffix
    out_file = in_file + ".bai"
    if file_exists(out_file):
        return out_file
    cmd = [samtools, "index", in_file]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        cmd_string = subprocess.list2cmdline(cmd)
        logger.error("bamindex returned an error. The command "
                     "used to run bamindex was: %s." % (cmd_string))
    return out_file
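# Illustrative: `samtools index sample.bam` writes its index alongside the
# input as sample.bam.bai, which is why the output path above appends ".bai"
# rather than replacing the suffix.
in_file = "sample.bam"
assert in_file + ".bai" == "sample.bam.bai"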
def bam2sam(in_file, samtools="samtools"):
    """
    converts a bam file to a sam file
    bam2sam("file.bam") -> "file.sam"
    """
    assert is_bam(in_file), "bam2sam requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        #pysam.view("-h", "-o" + tmp_out_file, in_file)
        cmd = "{samtools} view -h -o {tmp_out_file} {in_file}".format(**locals())
        try:
            # cmd is a single shell string, so it must run through the shell
            subprocess.check_call(cmd, shell=True)
        except subprocess.CalledProcessError:
            logger.error("bam2sam returned an error. The command "
                         "used to run bam2sam was: %s." % (cmd))
    return out_file
def wig2bigwig(wiggle_file, chrom_size_file, out_file):
    """
    convert wiggle file to bigwig file using the UCSC tool
    """
    PROGRAM = "wigToBigWig"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/ "
                     "to download it." % (PROGRAM))
        exit(1)
    if file_exists(out_file):
        return out_file
    wigToBigWig = sh.Command(which(PROGRAM))
    with file_transaction(out_file) as tx_out_file:
        cmd = str(wigToBigWig.bake(wiggle_file, chrom_size_file, tx_out_file))
        do.run(cmd, "Converting %s from wig to bigwig." % (wiggle_file), None)
    return out_file