def salmon_index(gtf_file, ref_file, data, out_dir): out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data)) if dd.get_disambiguate(data): out_dir = "-".join([out_dir] + dd.get_disambiguate(data)) salmon = config_utils.get_program("salmon", dd.get_config(data)) num_cores = dd.get_num_cores(data) if dd.get_transcriptome_fasta(data): gtf_fa = dd.get_transcriptome_fasta(data) else: gtf_fa = sailfish.create_combined_fasta(data) assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa tmpdir = dd.get_tmp_dir(data) out_file = os.path.join(out_dir, "versionInfo.json") if file_exists(out_file): logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa) return out_dir files = dd.get_input_sequence_files(data) readlength = bam.fastq.estimate_read_length(files[0]) if readlength % 2 == 0: readlength -= 1 kmersize = min(readlength, 31) with file_transaction(data, out_dir) as tx_out_dir: cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}" message = "Creating Salmon index for {gtf_fa}." do.run(cmd.format(**locals()), message.format(**locals()), None) return out_dir
def generate_transcript_counts(data): """Generate counts per transcript and per exon from an alignment""" data["count_file"] = featureCounts.count(data) if dd.get_fusion_mode(data, False): oncofuse_file = oncofuse.run(data) if oncofuse_file: data = dd.set_oncofuse_file(data, oncofuse_file) if dd.get_transcriptome_align(data) and not dd.get_transcriptome_bam(data): file1, file2 = None, None if dd.get_disambiguate(data): bam_path = data["work_bam"] fastq_paths = alignprep._bgzip_from_bam(bam_path, data["dirs"], data["config"], is_retry=False, output_infix='-transcriptome') if len(fastq_paths) == 2: file1, file2 = fastq_paths else: file1, file2 = fastq_paths[0], None else: file1, file2 = dd.get_input_sequence_files(data) ref_file = dd.get_ref_file(data) logger.info("Transcriptome alignment was flagged to run, but the " "transcriptome BAM file was not found. Aligning to the " "transcriptome with bowtie2.") data = bowtie2.align_transcriptome(file1, file2, ref_file, data) return [[data]]
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. " % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
def salmon_decoy_index(gtf_file, data, out_dir):
    input_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    decoy_transcriptome = os.path.join(input_dir,
                                       sailfish.get_build_string(data) + "-decoy.fa")
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    decoy_sequence_file = get_decoy_sequence_file(data)
    decoy_name_file = get_decoy_name_file(data)
    gtf_fa = create_decoy_transcriptome(gtf_fa, get_decoy_sequence_file(data),
                                        decoy_transcriptome)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = ("{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa} "
               "--decoys {decoy_name_file} ")
        message = "Creating decoy-aware Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
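# create_decoy_transcriptome, get_decoy_sequence_file and get_decoy_name_file are
# used above but not defined in this section. Below is a minimal sketch of
# create_decoy_transcriptome, assuming the decoy sequence file is a plain FASTA of
# genome sequences that gets appended after the transcripts, the layout salmon
# expects for a decoy-aware index; this is an illustrative reconstruction, not the
# verbatim implementation.
def create_decoy_transcriptome(gtf_fa, decoy_sequence_file, out_file):
    """Concatenate the transcriptome FASTA and decoy sequences (sketch)."""
    if file_exists(out_file):
        return out_file
    safe_makedir(os.path.dirname(out_file))
    with open(out_file, "w") as out_handle:
        # transcripts first, then the decoy sequences, as salmon expects
        for fasta in [gtf_fa, decoy_sequence_file]:
            with open(fasta) as in_handle:
                for line in in_handle:
                    out_handle.write(line)
    return out_file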
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def generate_transcript_counts(data): """Generate counts per transcript and per exon from an alignment""" data["count_file"] = featureCounts.count(data) if dd.get_transcriptome_align(data): # to create a disambiguated transcriptome file realign with bowtie2 if dd.get_disambiguate(data): logger.info("Aligning to the transcriptome with bowtie2 using the " "disambiguated reads.") bam_path = data["work_bam"] fastq_paths = alignprep._bgzip_from_bam( bam_path, data["dirs"], data, is_retry=False, output_infix='-transcriptome') if len(fastq_paths) == 2: file1, file2 = fastq_paths else: file1, file2 = fastq_paths[0], None ref_file = dd.get_ref_file(data) data = bowtie2.align_transcriptome(file1, file2, ref_file, data) else: file1, file2 = dd.get_input_sequence_files(data) if not dd.get_transcriptome_bam(data): ref_file = dd.get_ref_file(data) logger.info( "Transcriptome alignment was flagged to run, but the " "transcriptome BAM file was not found. Aligning to the " "transcriptome with bowtie2.") data = bowtie2.align_transcriptome(file1, file2, ref_file, data) data = spikein.counts_spikein(data) return [[data]]
def umi_transform(data):
    """Transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name.
    """
    fq1, fq2 = dd.get_input_sequence_files(data)
    fq2 = fq2 if fq2 else ""
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    transform = dd.get_umi_type(data)
    transform_data = transforms[transform]
    safe_makedir(umi_dir)
    transform_file = os.path.join(umi_dir, transform + ".json")
    transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    index_option = "--dual_index" if transform_data["dual"] else ""
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} fastqtransform {index_option} {split_option} {transform_file} "
           "{fq1} {fq2} "
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def salmon_index(gtf_file, ref_file, data, out_dir): out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data)) if dd.get_disambiguate(data): out_dir = "-".join([out_dir] + dd.get_disambiguate(data)) salmon = config_utils.get_program("salmon", dd.get_config(data)) num_cores = dd.get_num_cores(data) if dd.get_transcriptome_fasta(data): gtf_fa = dd.get_transcriptome_fasta(data) else: gtf_fa = sailfish.create_combined_fasta(data, out_dir) assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa tmpdir = dd.get_tmp_dir(data) out_file = os.path.join(out_dir, "versionInfo.json") if file_exists(out_file): return out_dir files = dd.get_input_sequence_files(data) readlength = bam.fastq.estimate_read_length(files[0]) if readlength % 2 == 0: readlength -= 1 kmersize = min(readlength, 31) with file_transaction(data, out_dir) as tx_out_dir: cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}" message = "Creating Salmon index for {gtf_fa}." do.run(cmd.format(**locals()), message.format(**locals()), None) return out_dir
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    readlength = fastq.estimate_read_length(fq1)
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    kmersize = kmersize if not dd.get_analysis(data).lower() == "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
def umi_transform(data):
    """Transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name.
    """
    fq1, fq2 = dd.get_input_sequence_files(data)
    fq2 = fq2 if fq2 else ""
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    transform = dd.get_umi_type(data)
    transform_data = transforms[transform]
    safe_makedir(umi_dir)
    transform_file = os.path.join(umi_dir, transform + ".json")
    transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    index_option = "--dual_index" if transform_data["dual"] else ""
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} fastqtransform {index_option} {split_option} {transform_file} "
           "{fq1} {fq2} "
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def trim_adapters(data): fq1, fq2 = dd.get_input_sequence_files(data) skewer = config_utils.get_program("skewer", data, default="skewer") nthreads = dd.get_num_cores(data) samplename = dd.get_sample_name(data) out_dir = safe_makedir( os.path.join(dd.get_work_dir(data), "trimmed", samplename)) of1 = os.path.join(out_dir, samplename + "-trimmed-pair1.fastq.gz") of2 = os.path.join(out_dir, samplename + "-trimmed-pair2.fastq.gz") of2 = of2 if fq2 else None if fq1 and fq2: if file_exists(of1) and file_exists(of2): return of1, of2 else: if file_exists(of1): return of1, None safe_makedir(out_dir) file_string = "{fq1} {fq2} " if fq2 else "{fq1} " fw_cmd = _fw_command(data) rv_cmd = _rv_command(data) mode = "tail" if not fq2 else "pe" cmd = ("{skewer} --min 25 --threads {nthreads} -q 5 " "{fw_cmd} " "{rv_cmd} " "-m {mode} " "--compress --output {out_stem} ") + file_string with file_transaction(out_dir) as tx_out_dir: safe_makedir(tx_out_dir) out_stem = os.path.join(tx_out_dir, samplename) message = "Trimming {fq1}, {fq2} with skewer.".format(**locals()) do.run(cmd.format(**locals()), message) return of1, of2
def generate_transcript_counts(data): """Generate counts per transcript and per exon from an alignment""" data["count_file"] = featureCounts.count(data) if dd.get_fusion_mode(data, False): oncofuse_file = oncofuse.run(data) if oncofuse_file: data = dd.set_oncofuse_file(data, oncofuse_file) if dd.get_transcriptome_align(data) and not dd.get_transcriptome_bam(data): file1, file2 = None, None if dd.get_disambiguate(data): bam_path = data["work_bam"] fastq_paths = alignprep._bgzip_from_bam( bam_path, data["dirs"], data["config"], is_retry=False, output_infix='-transcriptome') if len(fastq_paths) == 2: file1, file2 = fastq_paths else: file1, file2 = fastq_paths[0], None else: file1, file2 = dd.get_input_sequence_files(data) ref_file = dd.get_ref_file(data) logger.info("Transcriptome alignment was flagged to run, but the " "transcriptome BAM file was not found. Aligning to the " "transcriptome with bowtie2.") data = bowtie2.align_transcriptome(file1, file2, ref_file, data) return [[data]]
def trim_adapters(data): fq1, fq2 = dd.get_input_sequence_files(data) skewer = config_utils.get_program("skewer", data, default="skewer") nthreads = dd.get_num_cores(data) samplename = dd.get_sample_name(data) out_dir = os.path.join(dd.get_work_dir(data), "trimmed", samplename) of1 = os.path.join(out_dir, samplename + "-trimmed-pair1.fastq.gz") of2 = os.path.join(out_dir, samplename + "-trimmed-pair2.fastq.gz") of2 = of2 if fq2 else None if fq1 and fq2: if file_exists(of1) and file_exists(of2): return of1, of2 else: if file_exists(of1): return of1, None safe_makedir(out_dir) file_string = "{fq1} {fq2} " if fq2 else "{fq1} " fw_cmd = _fw_command(data) rv_cmd = _rv_command(data) mode = "tail" if not fq2 else "pe" cmd = ("{skewer} --min 25 --threads {nthreads} -q 5 " "{fw_cmd} " "{rv_cmd} " "-m {mode} " "--compress --output {out_stem} ") + file_string with file_transaction(out_dir) as tx_out_dir: safe_makedir(tx_out_dir) out_stem = os.path.join(tx_out_dir, samplename) message = "Trimming {fq1}, {fq2} with skewer.".format(**locals()) do.run(cmd.format(**locals()), message) return of1, of2
def process_alignment(data, alt_input=None): """Do an alignment of fastq files, preparing a sorted BAM output file. """ fastq1, fastq2 = dd.get_input_sequence_files(data) if alt_input: fastq1, fastq2 = alt_input config = data["config"] aligner = config["algorithm"].get("aligner", None) if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner: logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner)) data = align_to_sort_bam(fastq1, fastq2, aligner, data) data = _add_supplemental_bams(data) elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".bam"): sort_method = config["algorithm"].get("bam_sort") bamclean = config["algorithm"].get("bam_clean") if bamclean is True or bamclean == "picard": if sort_method and sort_method != "coordinate": raise ValueError( "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" % sort_method) out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"], data) elif sort_method: runner = broad.runner_from_config(config) out_file = os.path.join( data["dirs"]["work"], "{}-sort.bam".format( os.path.splitext(os.path.basename(fastq1))[0])) out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file) else: out_bam = link_bam_file( fastq1, os.path.join(data["dirs"]["work"], "prealign", data["rgnames"]["sample"])) bam.index(out_bam, data["config"]) bam.check_header(out_bam, data["rgnames"], data["sam_ref"], data["config"]) dedup_bam = postalign.dedup_bam(out_bam, data) bam.index(dedup_bam, data["config"]) data["work_bam"] = dedup_bam elif fastq1 and objectstore.file_exists_or_remote( fastq1) and fastq1.endswith(".cram"): data["work_bam"] = fastq1 elif fastq1 is None and "vrn_file" in data: data["config"]["algorithm"]["variantcaller"] = False data["work_bam"] = None elif not fastq1: raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data)) else: raise ValueError( "Could not process input file from sample configuration. \n" + fastq1 + "\nIs the path to the file correct?\n" + "If it is a fastq file (not pre-aligned BAM or CRAM), " "is an aligner specified in the input configuration?") return [[data]]
def process_alignment(data, alt_input=None): """Do an alignment of fastq files, preparing a sorted BAM output file. """ data = utils.to_single_data(data) fastq1, fastq2 = dd.get_input_sequence_files(data) if alt_input: fastq1, fastq2 = alt_input config = data["config"] aligner = config["algorithm"].get("aligner", None) if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner: logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner)) data = align_to_sort_bam(fastq1, fastq2, aligner, data) data = _add_supplemental_bams(data) elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"): sort_method = config["algorithm"].get("bam_sort") bamclean = config["algorithm"].get("bam_clean") if bamclean is True or bamclean == "picard": if sort_method and sort_method != "coordinate": raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" % sort_method) out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data) elif bamclean == "fixrg": out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data) elif sort_method: runner = broad.runner_from_path("picard", config) out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format( os.path.splitext(os.path.basename(fastq1))[0])) out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file) else: out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign", data["rgnames"]["sample"])) bam.index(out_bam, data["config"]) bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"]) dedup_bam = postalign.dedup_bam(out_bam, data) bam.index(dedup_bam, data["config"]) data["work_bam"] = dedup_bam elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"): data["work_bam"] = fastq1 elif fastq1 is None and "vrn_file" in data: data["config"]["algorithm"]["variantcaller"] = False data["work_bam"] = None elif not fastq1: raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data)) else: raise ValueError("Could not process input file from sample configuration. \n" + fastq1 + "\nIs the path to the file correct or is empty?\n" + "If it is a fastq file (not pre-aligned BAM or CRAM), " "is an aligner specified in the input configuration?") if data.get("work_bam"): # Add stable 'align_bam' target to use for retrieving raw alignment data["align_bam"] = data["work_bam"] data = _add_hla_files(data) return [[data]]
def filter_barcodes(data): # if data was pre-demultiplexed, there is no need to filter the barcodes if dd.get_demultiplexed(data): return [[data]] fq1 = dd.get_input_sequence_files(data)[0] umi_dir = os.path.join(dd.get_work_dir(data), "umis") correction = dd.get_cellular_barcode_correction(data) bc = get_cellular_barcodes(data) if not bc: logger.info("No cellular barcodes found, skipping filtering.") return [[data]] bc1 = None bc2 = None bc3 = None umi_dir = os.path.join(dd.get_work_dir(data), "umis") if isinstance(bc, six.string_types): bc1 = bc if len(bc) == 1: bc1 = bc[0] if len(bc) > 1: bc1 = bc[0] bc2 = bc[1] if len(bc) == 3: bc3 = bc[2] out_base = dd.get_sample_name(data) + ".filtered.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] ncores = dd.get_num_cores(data) cmd = "{umis} cb_filter --cores {ncores} " if bc1: cmd += "--bc1 {bc1} " if correction: cmd += "--nedit {correction} " if bc2: cmd += "--bc2 {bc2} " if bc3: cmd += "--bc3 {bc3} " fq1_cmd = "{fq1} " fq1_cmd = fq1_cmd.format(fq1=fq1) cmd += "{fq1_cmd} | gzip -c > {tx_out_file}" sample_dir = os.path.join(umi_dir, dd.get_sample_name(data)) safe_makedir(sample_dir) umis = config_utils.get_program("umis", data, default="umis") with file_transaction(out_file) as tx_out_file: message = "Filtering by cellular barcode." do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]
def filter_barcodes(data): # if data was pre-demultiplexed, there is no need to filter the barcodes if dd.get_demultiplexed(data): return [[data]] fq1 = dd.get_input_sequence_files(data)[0] umi_dir = os.path.join(dd.get_work_dir(data), "umis") correction = dd.get_cellular_barcode_correction(data) bc = get_cellular_barcodes(data) if not bc: logger.info("No cellular barcodes found, skipping filtering.") return [[data]] bc1 = None bc2 = None bc3 = None umi_dir = os.path.join(dd.get_work_dir(data), "umis") if isinstance(bc, six.string_types): bc1 = bc if len(bc) == 1: bc1 = bc[0] if len(bc) > 1: bc1 = bc[0] bc2 = bc[1] if len(bc) == 3: bc3 = bc[2] out_base = dd.get_sample_name(data) + ".filtered.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] ncores = dd.get_num_cores(data) cmd = "{umis} cb_filter --cores {ncores} " if bc1: cmd += "--bc1 {bc1} " if correction: cmd += "--nedit {correction} " if bc2: cmd += "--bc2 {bc2} " if bc3: cmd += "--bc3 {bc3} " fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) " fq1_cmd = fq1_cmd.format(fq1=fq1) cmd += "{fq1_cmd} | gzip > {tx_out_file}" sample_dir = os.path.join(umi_dir, dd.get_sample_name(data)) safe_makedir(sample_dir) umis = config_utils.get_program("umis", data, default="umis") with file_transaction(out_file) as tx_out_file: message = "Filtering by cellular barcode." do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]
def run_sailfish_index(*samples):
    fq1, _ = dd.get_input_sequence_files(samples[0][0])
    kmer_size = estimate_kmer_size(fq1)
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in dd.sample_data_iterator(samples)}
    data = samples[0][0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data, build.build,
                                                kmer_size)
    return samples
def filter_barcodes(data): fq1 = dd.get_input_sequence_files(data)[0] umi_dir = os.path.join(dd.get_work_dir(data), "umis") correction = dd.get_cellular_barcode_correction(data) bc = get_cellular_barcodes(data) if not bc: return [[data]] bc1 = None bc2 = None bc3 = None umi_dir = os.path.join(dd.get_work_dir(data), "umis") if isinstance(bc, basestring): bc1 = bc if len(bc) == 1: bc1 = bc[0] if len(bc) > 1: bc1 = bc[0] bc2 = bc[1] if len(bc) == 3: bc3 = bc[2] out_base = dd.get_sample_name(data) + ".filtered.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] ncores = dd.get_num_cores(data) cmd = "{umis} cb_filter --cores {ncores} " if bc1: cmd += "--bc1 {bc1} " if correction: cmd += "--nedit {correction} " if bc2: cmd += "--bc2 {bc2} " if bc3: cmd += "--bc3 {bc3} " fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) " fq1_cmd = fq1_cmd.format(fq1=fq1) cmd += "{fq1_cmd} | gzip > {tx_out_file}" sample_dir = os.path.join(umi_dir, dd.get_sample_name(data)) safe_makedir(sample_dir) umis = config_utils.get_program("umis", data, default="umis") with file_transaction(out_file) as tx_out_file: message = "Filtering by cellular barcode." do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]
def run_rapmap_align(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    rapmap_dir = os.path.join(work_dir, "rapmap", samplename)
    gtf_file = dd.get_gtf_file(data)
    fasta_file = dd.get_ref_file(data)
    out_file = rapmap_align(fq1, fq2, rapmap_dir, gtf_file, fasta_file, "quasi", data)
    data = dd.set_transcriptome_bam(data, out_file)
    return [[data]]
def prepare_input_data(config):
    """In case of disambiguation, we want to run fusion calling on the
    disambiguated reads, which are in the work_bam file. As EricScript
    accepts 2 fastq files as input, we need to convert the .bam to 2 .fq files.
    """
    if not dd.get_disambiguate(config):
        return dd.get_input_sequence_files(config)
    work_bam = dd.get_work_bam(config)
    logger.info("Converting disambiguated reads to fastq...")
    fq_files = convert_bam_to_fastq(work_bam, dd.get_work_dir(config), None, None, config)
    return fq_files
def barcode_histogram(data): fq1 = dd.get_input_sequence_files(data)[0] umi_dir = os.path.join(dd.get_work_dir(data), "umis") sample_dir = os.path.join(umi_dir, dd.get_sample_name(data)) umis = config_utils.get_program("umis", data, default="umis") safe_makedir(sample_dir) out_file = os.path.join(sample_dir, "cb-histogram.txt") if file_exists(out_file): return [[data]] fq1_cmd = fq1 cmd = "{umis} cb_histogram {fq1_cmd} > {tx_out_file}" with file_transaction(out_file) as tx_out_file: message = "Computing cellular barcode counts for %s." % fq1 do.run(cmd.format(**locals()), message) return [[data]]
def generate_transcript_counts(data): """Generate counts per transcript and per exon from an alignment""" data["count_file"] = featureCounts.count(data) if dd.get_fusion_mode(data, False): oncofuse_file = oncofuse.run(data) if oncofuse_file: data = dd.set_oncofuse_file(data, oncofuse_file) # if RSEM set to run, but the aligner didn't create the transcriptome BAM # file, make one with bwa if dd.get_rsem(data) and not dd.get_transcriptome_bam(data): file1, file2 = dd.get_input_sequence_files(data) ref_file = dd.get_ref_file(data) logger.info("RSEM was flagged to run, but the transcriptome BAM file " "was not found. Aligning to the transcriptome with bowtie2.") data = bowtie2.align_transcriptome(file1, file2, ref_file, data) return [[data]]
def sailfish_index(gtf_file, ref_file, data, build):
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "sailfish", "index", build)
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = create_combined_fasta(data)
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(data, out_dir) as tx_out_dir:
        fq1, _ = dd.get_input_sequence_files(data)
        kmersize = pick_kmersize(fq1)
        cmd = ("{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} "
               "-k {kmersize}")
        message = "Creating sailfish index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
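# pick_kmersize is called by several index builders above, while other versions of
# salmon_index and counts_spikein inline the same calculation. A sketch consistent
# with that inlined logic is below: estimate the read length, force it to be odd,
# and cap it at 31 (the largest k sailfish/salmon accept). This is an illustrative
# reconstruction of the helper, not necessarily its exact implementation.
def pick_kmersize(fq):
    """Pick an odd k-mer size no larger than 31 based on the read length (sketch)."""
    readlength = fastq.estimate_read_length(fq)
    if readlength % 2 == 0:
        readlength -= 1
    return min(readlength, 31)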
def prepare_input_data(config):
    """In case of disambiguation, we want to run fusion calling on the
    disambiguated reads, which are in the work_bam file. As EricScript
    accepts 2 fastq files as input, we need to convert the .bam to 2 .fq files.
    """
    if not dd.get_disambiguate(config):
        return dd.get_input_sequence_files(config)
    work_bam = dd.get_work_bam(config)
    logger.info("Converting disambiguated reads to fastq...")
    fq_files = convert_bam_to_fastq(
        work_bam, dd.get_work_dir(config), None, None, config
    )
    return fq_files
def run_kallisto_singlecell(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
def run_rapmap_pseudoalign(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    rapmap_dir = os.path.join(work_dir, "rapmap", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, fasta_file, data)
    data = dd.set_transcriptome_bam(data, out_file)
    return [[data]]
def run_salmon_reads(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, salmon_dir)
    return [[data]]
def generate_transcript_counts(data): """Generate counts per transcript and per exon from an alignment""" data["count_file"] = featureCounts.count(data) if dd.get_fusion_mode(data, False): oncofuse_file = oncofuse.run(data) if oncofuse_file: data = dd.set_oncofuse_file(data, oncofuse_file) # if RSEM set to run, but the aligner didn't create the transcriptome BAM # file, make one with bwa if dd.get_rsem(data) and not dd.get_transcriptome_bam(data): file1, file2 = dd.get_input_sequence_files(data) ref_file = dd.get_ref_file(data) logger.info( "RSEM was flagged to run, but the transcriptome BAM file " "was not found. Aligning to the transcriptome with bowtie2.") data = bowtie2.align_transcriptome(file1, file2, ref_file, data) return [[data]]
def barcode_histogram(data): fq1 = dd.get_input_sequence_files(data)[0] umi_dir = os.path.join(dd.get_work_dir(data), "umis") sample_dir = os.path.join(umi_dir, dd.get_sample_name(data)) umis = config_utils.get_program("umis", data, default="umis") safe_makedir(sample_dir) out_file = os.path.join(sample_dir, "cb-histogram.txt") filtered_out_file = os.path.join(sample_dir, "cb-histogram-filtered.txt") fq1_cmd = fq1 cmd = "{umis} cb_histogram {fq1_cmd} > {tx_out_file}" if not file_exists(out_file): with file_transaction(out_file) as tx_out_file: message = "Computing cellular barcode counts for %s." % fq1 do.run(cmd.format(**locals()), message) cutoff = dd.get_minimum_barcode_depth(data) filter_barcode_histogram(filtered_out_file, out_file, cutoff) newdata = dd.set_histogram_counts(data, filtered_out_file) return [[newdata]]
def run_kallisto_rnaseq(data): samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) kallisto_dir = os.path.join(work_dir, "kallisto", samplename) gtf_file = dd.get_gtf_file(data) files = dd.get_input_sequence_files(data) if len(files) == 2: fq1, fq2 = files else: fq1, fq2 = files[0], None assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file fasta_file = dd.get_ref_file(data) assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file assert fq2, ("We don't support kallisto for single-end reads and fusion " "calling with pizzly does not accept single end reads.") out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data) data = dd.set_kallisto_quant(data, out_file) return [[data]]
def filter_barcodes(data): fq1 = dd.get_input_sequence_files(data)[0] umi_dir = os.path.join(dd.get_work_dir(data), "umis") correction = dd.get_cellular_barcode_correction(data) bc = dd.get_cellular_barcodes(data) if not bc: return [[data]] bc1 = None bc2 = None umi_dir = os.path.join(dd.get_work_dir(data), "umis") if isinstance(bc, basestring): bc1 = bc if len(bc) == 1: bc1 = bc[0] if len(bc) == 2: bc1 = bc[0] bc2 = bc[1] out_base = dd.get_sample_name(data) + ".filtered.fq.gz" out_file = os.path.join(umi_dir, out_base) if file_exists(out_file): data["files"] = [out_file] return [[data]] ncores = dd.get_num_cores(data) cmd = "{umis} cb_filter --cores {ncores} " if bc1: cmd += "--bc1 {bc1} " if correction: cmd += "--nedit {correction} " if bc2: cmd += "--bc2 {bc2} " fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) " fq1_cmd = fq1_cmd.format(fq1=fq1) cmd += "{fq1_cmd} | gzip > {tx_out_file}" sample_dir = os.path.join(umi_dir, dd.get_sample_name(data)) safe_makedir(sample_dir) umis = config_utils.get_program("umis", data, default="umis") with file_transaction(out_file) as tx_out_file: message = "Filtering by cellular barcode." do.run(cmd.format(**locals()), message) data["files"] = [out_file] return [[data]]
def run_kallisto_rnaseq(data): samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) kallisto_dir = os.path.join(work_dir, "kallisto", samplename) gtf_file = dd.get_gtf_file(data) files = dd.get_input_sequence_files(data) if len(files) == 2: fq1, fq2 = files else: fq1, fq2 = files[0], None assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file fasta_file = dd.get_ref_file(data) assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file assert fq2, ("bcbio doesn't support kallisto for single-end reads, we can " "add support for this if you open up an issue about it here: " "https://github.com/chapmanb/bcbio-nextgen/issues") out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data) data = dd.set_kallisto_quant(data, out_file) return [[data]]
def run_kallisto_rnaseq(data): samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) kallisto_dir = os.path.join(work_dir, "kallisto", samplename) gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data)) files = dd.get_input_sequence_files(data) if len(files) == 2: fq1, fq2 = files else: fq1, fq2 = files[0], None assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file fasta_file = dd.get_ref_file(data) assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file assert fq2, ("We don't support kallisto for single-end reads and fusion " "calling with pizzly does not accept single end reads.") out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data) data = dd.set_kallisto_quant(data, out_file) return [[data]]
def counts_spikein(data): data = utils.to_single_data(data) samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) salmon_dir = os.path.join(work_dir, "spikein", samplename) fasta_file = dd.get_spikein_fasta(data) if not fasta_file: return data files = dd.get_input_sequence_files(data) if len(files) == 2: fq1, fq2 = files else: fq1, fq2 = files[0], None assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file kmer = 31 if not dd.get_analysis(data).lower() == "smallrna-seq" else 15 fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmer) out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data) data = dd.set_spikein_counts(data, out_file) return data
def run_kallisto_rnaseq(data): samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) kallisto_dir = os.path.join(work_dir, "kallisto", samplename) gtf_file = dd.get_gtf_file(data) files = dd.get_input_sequence_files(data) if len(files) == 2: fq1, fq2 = files else: fq1, fq2 = files[0], None assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file fasta_file = dd.get_ref_file(data) assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file assert fq2, ("bcbio doesn't support kallisto for single-end reads, we can " "add support for this if you open up an issue about it here: " "https://github.com/bcbio/bcbio-nextgen/issues") out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data) data = dd.set_kallisto_quant(data, out_file) return [[data]]
def run_sailfish(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    if not fastq.is_fastq(fq1):
        return [[data]]
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
def run_salmon_reads(data): data = utils.to_single_data(data) files = dd.get_input_sequence_files(data) if bam.is_bam(files[0]): files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"], data, data["dirs"], data["config"]) samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) salmon_dir = os.path.join(work_dir, "salmon", samplename) gtf_file = dd.get_gtf_file(data) if len(files) == 2: fq1, fq2 = files else: fq1, fq2 = files[0], None fasta_file = dd.get_ref_file(data) out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data) data = dd.set_salmon(data, out_file) data = dd.set_salmon_dir(data, salmon_dir) return [[data]]
def generate_transcript_counts(data): """Generate counts per transcript and per exon from an alignment""" data["count_file"] = featureCounts.count(data) if dd.get_fusion_mode(data, False): oncofuse_file = oncofuse.run(data) if oncofuse_file: data = dd.set_oncofuse_file(data, oncofuse_file) # if RSEM set to run, but the aligner didn't create the transcriptome BAM # file, make one with bwa if dd.get_disambiguate(data): logger.info("RSEM is not supported yet for disambiguation protocols. " "See https://github.com/chapmanb/bcbio-nextgen/issues/859") return [[data]] if dd.get_rsem(data) and not dd.get_transcriptome_bam(data): file1, file2 = dd.get_input_sequence_files(data) ref_file = dd.get_ref_file(data) logger.info("RSEM was flagged to run, but the transcriptome BAM file " "was not found. Aligning to the transcriptome with bowtie2.") data = bowtie2.align_transcriptome(file1, file2, ref_file, data) return [[data]]
def run_salmon_reads(data): data = utils.to_single_data(data) files = dd.get_input_sequence_files(data) if bam.is_bam(files[0]): files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"], data, data["dirs"], data["config"]) samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) salmon_dir = os.path.join(work_dir, "salmon", samplename) gtf_file = dd.get_gtf_file(data) if len(files) == 2: fq1, fq2 = files else: fq1, fq2 = files[0], None fasta_file = dd.get_ref_file(data) out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data) data = dd.set_salmon(data, out_file) data = dd.set_salmon_dir(data, salmon_dir) data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir)) return [[data]]
def run_salmon_decoy(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    index = salmon_decoy_index(gtf_file, data, os.path.dirname(salmon_dir))
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, data, index)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
def salmon_index(gtf_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = ("{salmon} index --keepDuplicates -k {kmersize} -p {num_cores} "
               "-i {tx_out_dir} -t {gtf_fa}")
        message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def process_alignment(data): """Do an alignment of fastq files, preparing a sorted BAM output file. """ fastq1, fastq2 = dd.get_input_sequence_files(data) config = data["config"] aligner = config["algorithm"].get("aligner", None) if fastq1 and utils.file_exists_or_remote(fastq1) and aligner: logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner)) data = align_to_sort_bam(fastq1, fastq2, aligner, data) data = _add_supplemental_bams(data) elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".bam"): sort_method = config["algorithm"].get("bam_sort") bamclean = config["algorithm"].get("bam_clean") if bamclean is True or bamclean == "picard": if sort_method and sort_method != "coordinate": raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" % sort_method) out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"], data) elif sort_method: runner = broad.runner_from_config(config) out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format( os.path.splitext(os.path.basename(fastq1))[0])) out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file) else: out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign", data["rgnames"]["sample"])) bam.check_header(out_bam, data["rgnames"], data["sam_ref"], data["config"]) dedup_bam = postalign.dedup_bam(out_bam, data) data["work_bam"] = dedup_bam elif fastq1 and utils.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"): data["work_bam"] = fastq1 elif fastq1 is None and "vrn_file" in data: data["config"]["algorithm"]["variantcaller"] = False data["work_bam"] = None else: raise ValueError("Could not process input file: %s" % fastq1) return [[data]]
def salmon_index(gtf_file, ref_file, data, out_dir): out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data)) if dd.get_disambiguate(data): out_dir = "-".join([out_dir] + dd.get_disambiguate(data)) salmon = config_utils.get_program("salmon", dd.get_config(data)) num_cores = dd.get_num_cores(data) if dd.get_transcriptome_fasta(data): gtf_fa = dd.get_transcriptome_fasta(data) else: gtf_fa = sailfish.create_combined_fasta(data) assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa tmpdir = dd.get_tmp_dir(data) out_file = os.path.join(out_dir, "versionInfo.json") if file_exists(out_file): logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa) return out_dir files = dd.get_input_sequence_files(data) kmersize = sailfish.pick_kmersize(files[0]) with file_transaction(data, out_dir) as tx_out_dir: cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}" message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers." do.run(cmd.format(**locals()), message.format(**locals()), None) return out_dir
def counts_spikein(data): data = utils.to_single_data(data) samplename = dd.get_sample_name(data) work_dir = dd.get_work_dir(data) salmon_dir = os.path.join(work_dir, "spikein", samplename) fasta_file = dd.get_spikein_fasta(data) if not fasta_file: return data files = dd.get_input_sequence_files(data) if len(files) == 2: fq1, fq2 = files else: fq1, fq2 = files[0], None assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file readlength = fastq.estimate_read_length(fq1) if readlength % 2 == 0: readlength -= 1 kmersize = min(readlength, 31) logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize) kmersize = kmersize if not dd.get_analysis(data).lower() == "smallrna-seq" else 15 fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize) out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data) data = dd.set_spikein_counts(data, out_file) return data