def salmon_index(gtf_file, ref_file, data, out_dir):
    """Build a Salmon transcriptome index, reusing an existing one when present.

    Returns the index directory. The k-mer size is capped at 31 and forced odd
    based on the estimated read length of the first input sequence file.
    """
    index_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        index_dir = "-".join([index_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    # prefer a user-supplied transcriptome FASTA over building a combined one
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, index_dir)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    sentinel = os.path.join(index_dir, "versionInfo.json")
    if file_exists(sentinel):
        return index_dir
    fq_files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(fq_files[0])
    # k-mer size must be odd and no longer than the reads
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, index_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return index_dir
def _create_combined_fasta(data, out_dir):
    """Build (or reuse) a FASTA of all transcripts across every genome.

    When disambiguation is configured, one per-genome transcript FASTA is
    produced for each disambiguation split and the results are concatenated
    into a single combined file, whose path is returned.
    """
    fasta_files = []
    for item in disambiguate.split([data]):
        odata = item[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if not file_exists(out_file):
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            out_file = _clean_gtf_fa(out_file, out_file)
        fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file
    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
def rapmap_index(gtf_file, ref_file, data, out_dir):
    """Create a RapMap pseudoindex for the transcriptome, reusing one if present.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", data["config"])
    gtf_fa = create_combined_fasta(data, out_dir)
    # bugfix: the sentinel file lives inside out_dir; plain string
    # concatenation checked a non-existent sibling path like
    # ".../buildrapidx.jfhash", so the index was rebuilt every run
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating RapMap pseudoindex for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def sailfish_index(gtf_file, ref_file, data, out_dir):
    """Create a sailfish transcriptome index (k=25), reusing one if present.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = create_combined_fasta(data, out_dir)
    # bugfix: the sentinel file lives inside out_dir; plain string
    # concatenation checked a non-existent sibling path like
    # ".../buildversionInfo.json", so the index was rebuilt every run
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} -k 25"
        message = "Creating sailfish index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def sailfish_index(gtf_file, ref_file, data, out_dir, kmer_size):
    """Create a sailfish transcriptome index with the given k-mer size.

    Reuses an existing index when present. Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + (dd.get_disambiguate(data) or []))
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = create_combined_fasta(data, out_dir)
    # bugfix: the sentinel file lives inside out_dir; plain string
    # concatenation checked a non-existent sibling path, so the
    # memoization never took effect and the index was rebuilt every run
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = ("{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} "
               "-k {kmer_size}")
        message = "Creating sailfish index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if dd.get_transcriptome_align(data) and not dd.get_transcriptome_bam(data):
        # disambiguated samples realign from the disambiguated work BAM;
        # otherwise the original input fastq files feed the realignment
        if dd.get_disambiguate(data):
            fastq_paths = alignprep._bgzip_from_bam(
                data["work_bam"], data["dirs"], data["config"],
                is_retry=False, output_infix='-transcriptome')
            file1 = fastq_paths[0]
            file2 = fastq_paths[1] if len(fastq_paths) == 2 else None
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        ref_file = dd.get_ref_file(data)
        logger.info("Transcriptome alignment was flagged to run, but the "
                    "transcriptome BAM file was not found. Aligning to the "
                    "transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Create a Salmon transcriptome index, reusing an existing one.

    The k-mer size is capped at 31 and forced odd from the estimated read
    length. Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        # bugfix: was dd.get_disambguate, an AttributeError on any
        # disambiguation run
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    # Salmon requires an odd k-mer size no longer than the reads
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    """Create a RapMap pseudo- or quasi-index, reusing an existing one.

    algorithm must be "pseudo" or "quasi"; anything else fails the assertion.
    Returns the index directory.
    """
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        # bugfix: was dd.get_disambguate, an AttributeError on any
        # disambiguation run
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        # bugfix: out_dir was re-joined with index_type and the genome build
        # a second time here, nesting the index under a doubled path
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    # bugfix: the sentinel file lives inside out_dir; plain concatenation
    # checked a non-existent sibling path, defeating the memoization
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    # realign to the transcriptome only when requested and no transcriptome
    # BAM was produced by the aligner
    if not dd.get_transcriptome_align(data) or dd.get_transcriptome_bam(data):
        return [[data]]
    if dd.get_disambiguate(data):
        # regenerate fastq inputs from the disambiguated work BAM
        fastq_paths = alignprep._bgzip_from_bam(
            data["work_bam"], data["dirs"], data["config"],
            is_retry=False, output_infix='-transcriptome')
        file1 = fastq_paths[0]
        file2 = fastq_paths[1] if len(fastq_paths) == 2 else None
    else:
        file1, file2 = dd.get_input_sequence_files(data)
    ref_file = dd.get_ref_file(data)
    logger.info("Transcriptome alignment was flagged to run, but the "
                "transcriptome BAM file was not found. Aligning to the "
                "transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_transcriptome_align(data):
        if dd.get_disambiguate(data):
            # to create a disambiguated transcriptome file realign with bowtie2
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            fastq_paths = alignprep._bgzip_from_bam(
                data["work_bam"], data["dirs"], data,
                is_retry=False, output_infix='-transcriptome')
            file1 = fastq_paths[0]
            file2 = fastq_paths[1] if len(fastq_paths) == 2 else None
            ref_file = dd.get_ref_file(data)
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            ref_file = dd.get_ref_file(data)
            logger.info("Transcriptome alignment was flagged to run, but the "
                        "transcriptome BAM file was not found. Aligning to the "
                        "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    data = spikein.counts_spikein(data)
    return [[data]]
def kallisto_index(gtf_file, ref_file, data, out_dir):
    """Build a Kallisto transcriptome index (k=31), reusing one if present.

    Returns the path of the .idx index file.
    """
    stem = dd.get_genome_build(data)
    if dd.get_disambiguate(data):
        stem = "-".join([stem] + dd.get_disambiguate(data))
    index_dir = os.path.join(out_dir, "index", stem)
    out_file = os.path.join(index_dir, stem + ".idx")
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    # prefer a user-supplied transcriptome FASTA over a combined one
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}"
        message = "Creating Kallisto index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
def determine_indexes_to_make(samples):
    """
    returns a subset of the samples that have different indexes in them to make sure we only
    make each index once
    """
    samples = [to_single_data(x) for x in samples]
    indexes = set()
    tomake = []
    for data in samples:
        out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
        out_stem = os.path.join(out_dir, dd.get_genome_build(data))
        # bugfix: the disambiguate suffix was appended twice (the block was
        # duplicated), producing a stem like build-dis-dis that never matches
        # the index name other functions build from one suffix
        if dd.get_disambiguate(data):
            out_stem = "-".join([out_stem] + (dd.get_disambiguate(data) or []))
        combined_file = out_stem + ".fa"
        if combined_file not in indexes:
            tomake.append(data)
            indexes.add(combined_file)
    return tomake
def rapmap_pseudoindex(gtf_file, ref_file, data, out_dir):
    """Create a RapMap pseudoindex (k=31), reusing an existing one.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "pseudoindex", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        # bugfix: was dd.get_disambguate, an AttributeError on any
        # disambiguation run
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    # bugfix: the sentinel file lives inside out_dir; plain concatenation
    # checked a non-existent sibling path, defeating the memoization
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap pseudoindex for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Create a Salmon transcriptome index (k=31), reusing an existing one.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        # bugfix: was dd.get_disambguate, an AttributeError on any
        # disambiguation run
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish._create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    # memoization (resolves the in-code TODO): Salmon writes
    # versionInfo.json into a finished index, so skip rebuilding if present
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Build a Salmon transcriptome index, skipping work when one exists.

    The k-mer size is picked from the first input sequence file. Returns the
    index directory.
    """
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    # prefer a user-supplied transcriptome FASTA over building a combined one
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    sentinel = os.path.join(out_dir, "versionInfo.json")
    if file_exists(sentinel):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return out_dir
    fq_files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(fq_files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def prepare_input_data(config):
    """Return the fastq inputs to run fusion calling on.

    With disambiguation enabled the reads of interest live in the work BAM,
    and EricScript needs two fastq files as input, so the BAM is converted
    back to fastq first; otherwise the original input files are returned.
    """
    if not dd.get_disambiguate(config):
        return dd.get_input_sequence_files(config)
    work_bam = dd.get_work_bam(config)
    logger.info("Converting disambiguated reads to fastq...")
    return convert_bam_to_fastq(work_bam, dd.get_work_dir(config),
                                None, None, config)
def prepare_input_data(config):
    """Fetch fastq inputs for fusion calling.

    EricScript accepts two fastq files; when disambiguation ran, the
    disambiguated reads are in the work BAM and must be converted to fastq.
    Without disambiguation the original input sequence files are used.
    """
    if dd.get_disambiguate(config):
        logger.info("Converting disambiguated reads to fastq...")
        return convert_bam_to_fastq(
            dd.get_work_bam(config), dd.get_work_dir(config),
            None, None, config)
    return dd.get_input_sequence_files(config)
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    """Create a RapMap pseudo- or quasi-index (k=31), reusing an existing one.

    algorithm must be "pseudo" or "quasi"; anything else fails the assertion.
    Returns the index directory.
    """
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        # bugfix: was dd.get_disambguate, an AttributeError on any
        # disambiguation run
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    # bugfix: the sentinel file lives inside out_dir; plain concatenation
    # checked a non-existent sibling path, so the index was rebuilt every run
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap {index_type} for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Create a Salmon transcriptome index (k=31), reusing an existing one.

    Returns the index directory.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        # bugfix: was dd.get_disambguate, an AttributeError on any
        # disambiguation run
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    # prefer a user-supplied transcriptome FASTA over building a combined one
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if dd.get_disambiguate(data):
        logger.info("RSEM is not supported yet for disambiguation protocols. "
                    "See https://github.com/chapmanb/bcbio-nextgen/issues/859")
        return [[data]]
    # if RSEM set to run, but the aligner didn't create the transcriptome BAM
    # file, make one with bwa
    if dd.get_rsem(data) and not dd.get_transcriptome_bam(data):
        file1, file2 = dd.get_input_sequence_files(data)
        ref_file = dd.get_ref_file(data)
        logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                    "was not found. Aligning to the transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
def get_average_coverage(target_name, bed_file, data, bam_file=None):
    """Return average coverage over the target as an int, using a file cache.

    Falls back to the alignment or work BAM when no BAM file is given, and
    computes BED-region or whole-genome coverage on a cache miss.
    """
    if not bam_file:
        bam_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    cache_file = _get_cache_file(data, target_name)
    # disambiguated runs key the cache on the BED file only
    cache_deps = [bed_file] if dd.get_disambiguate(data) else [bam_file, bed_file]
    cache = _read_cache(cache_file, cache_deps)
    if "avg_coverage" in cache:
        return int(cache["avg_coverage"])
    if bed_file:
        avg_cov = _average_bed_coverage(bed_file, target_name, data)
    else:
        avg_cov = _average_genome_coverage(data, bam_file)
    cache["avg_coverage"] = int(avg_cov)
    _write_cache(cache, cache_file)
    return int(avg_cov)
def get_build_string(data):
    """Return the genome build name, with disambiguation genomes appended.

    Disambiguated runs get a build string like "build-extra1-extra2".
    """
    build = dd.get_genome_build(data)
    if not dd.get_disambiguate(data):
        return build
    return "-".join([build] + (dd.get_disambiguate(data) or []))