def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    """Build a RapMap index of the requested flavour, reusing one if present.

    Args:
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        algorithm: Index flavour without the ``index`` suffix; ``"pseudo"``
            and ``"quasi"`` are the accepted values.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory; the index is placed in
            ``<out_dir>/<index_type>/<genome build>``.

    Returns:
        The index directory path.

    Raises:
        AssertionError: If ``algorithm + "index"`` is not a supported index.
    """
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        # Accessor name fixed: it was misspelled ``get_disambguate`` here.
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        # NOTE(review): this nests <index_type>/<build> a second time below
        # the path built above -- kept as-is, confirm it is intentional.
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    # Look for the marker *inside* the index directory. Plain string
    # concatenation glued the file name onto the directory name, so a marker
    # stored in the directory was never found.
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        # Templates are filled from the local variables defined above.
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    """Quantify transcripts from FASTQ input with ``salmon quant``.

    Args:
        fq1: First (or only) FASTQ file.
        fq2: Second FASTQ file for paired-end input; falsy for single-end.
        salmon_dir: Working directory; results go to ``<salmon_dir>/quant``.
        gtf_file: Forwarded to ``salmon_index``.
        ref_file: Forwarded to ``salmon_index``.
        data: Sample record handed to the ``dd`` accessors.

    Returns:
        Path to ``quant.sf``; returned immediately if it already exists.
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    # The returned path is not needed here; the call itself is kept because
    # the original made it before indexing (presumably it materialises the
    # combined FASTA -- confirm before removing).
    sailfish.create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    # Gzipped inputs are decompressed on the fly via process substitution.
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
        # Single-end: do not report a non-existent second file ("and None").
        message = "Quantifying transcripts in %s with Salmon." % fq1
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        # The command template is filled from the local variables above.
        do.run(cmd.format(**locals()), message, None)
    return out_file
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Create a Salmon index whose k-mer size follows the read length.

    The index directory is ``<out_dir>/index/<build string>``, with any
    disambiguation entries appended using dashes. If ``versionInfo.json``
    already exists there, the directory is returned without rebuilding.

    Args:
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory for the index.

    Returns:
        The index directory path.

    Raises:
        AssertionError: If the transcriptome FASTA does not exist.
    """
    index_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        index_dir = "-".join([index_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)

    # Prefer a user supplied transcriptome FASTA over the generated one.
    if not dd.get_transcriptome_fasta(data):
        gtf_fa = sailfish.create_combined_fasta(data)
    else:
        gtf_fa = dd.get_transcriptome_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa

    # Never read afterwards; the lookup is retained unchanged.
    tmpdir = dd.get_tmp_dir(data)

    marker = os.path.join(index_dir, "versionInfo.json")
    if file_exists(marker):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return index_dir

    first_input = dd.get_input_sequence_files(data)[0]
    read_len = bam.fastq.estimate_read_length(first_input)
    # An even estimate is lowered by one before capping at 31.
    odd_len = read_len - 1 if read_len % 2 == 0 else read_len
    kmersize = min(odd_len, 31)

    with file_transaction(data, index_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(
            cmd.format(salmon=salmon, kmersize=kmersize, num_cores=num_cores,
                       tx_out_dir=tx_out_dir, gtf_fa=gtf_fa),
            message.format(gtf_fa=gtf_fa),
            None)
    return index_dir
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Create a Salmon index, or return the existing one.

    Args:
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory; the index goes to
            ``<out_dir>/index/<build string>`` (plus dash-joined
            disambiguation entries, if any).

    Returns:
        The index directory path.

    Raises:
        AssertionError: If the transcriptome FASTA does not exist.
    """
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))

    fields = {
        "salmon": config_utils.get_program("salmon", dd.get_config(data)),
        "num_cores": dd.get_num_cores(data),
    }

    if not dd.get_transcriptome_fasta(data):
        # The combined FASTA is requested with the index directory computed
        # above, not the caller's base directory.
        fields["gtf_fa"] = sailfish.create_combined_fasta(data, out_dir)
    else:
        fields["gtf_fa"] = dd.get_transcriptome_fasta(data)
    assert file_exists(fields["gtf_fa"]), \
        "%s was not found, exiting." % fields["gtf_fa"]

    # Never read afterwards; the lookup is retained unchanged.
    tmpdir = dd.get_tmp_dir(data)

    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir

    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    # An even estimate is lowered by one before capping at 31.
    readlength -= 1 if readlength % 2 == 0 else 0
    fields["kmersize"] = min(readlength, 31)

    with file_transaction(data, out_dir) as tx_out_dir:
        fields["tx_out_dir"] = tx_out_dir
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**fields), message.format(**fields), None)
    return out_dir
def salmon_decoy_index(gtf_file, data, out_dir):
    """Build a decoy-aware Salmon index, or return the existing one.

    The transcriptome FASTA (user supplied or generated) is combined with the
    decoy sequences by ``create_decoy_transcriptome`` and indexed with the
    ``--decoys`` option pointing at the decoy name file.

    Args:
        gtf_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory; the index goes to
            ``<out_dir>/index/<build string>`` (plus dash-joined
            disambiguation entries, if any).

    Returns:
        The index directory path.

    Raises:
        AssertionError: If the transcriptome FASTA does not exist.
    """
    input_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    decoy_transcriptome = os.path.join(
        input_dir, sailfish.get_build_string(data) + "-decoy.fa")
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    decoy_sequence_file = get_decoy_sequence_file(data)
    decoy_name_file = get_decoy_name_file(data)
    # Reuse the decoy sequence file resolved above instead of resolving it a
    # second time.
    gtf_fa = create_decoy_transcriptome(gtf_fa, decoy_sequence_file,
                                        decoy_transcriptome)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = (
            "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa} "
            "--decoys {decoy_name_file} ")
        message = "Creating decoy-aware Salmon index for {gtf_fa} with {kmersize} bp kmers."
        # Templates are filled from the local variables defined above.
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def rapmap_index(gtf_file, ref_file, data, out_dir):
    """Build a RapMap pseudoindex, or return the existing one.

    Args:
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        data: Sample record; ``data["config"]`` is read directly and the rest
            goes through the ``dd`` accessors.
        out_dir: Base directory; the index goes to
            ``<out_dir>/index/<genome build>`` (plus dash-joined
            disambiguation entries, if any).

    Returns:
        The index directory path.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", data["config"])
    gtf_fa = create_combined_fasta(data, out_dir)
    # Look for the marker *inside* the index directory. Plain string
    # concatenation glued the file name onto the directory name, so a marker
    # stored in the directory was never found.
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} pseudoindex -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating RapMap pseudoindex for {gtf_fa}."
        # Templates are filled from the local variables defined above.
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def run_pizzly(data):
    """Collect the inputs for pizzly from ``data`` and run it.

    Args:
        data: Sample record handed to the ``dd`` accessors.

    Returns:
        Whatever ``pizzly`` returns (bound to ``outdir`` in the original).
    """
    pizzlydir = os.path.join(dd.get_work_dir(data), "pizzly")
    samplename = dd.get_sample_name(data)
    gtf = dd.get_gtf_file(data)

    # Prefer a user supplied transcriptome FASTA over the generated one.
    if not dd.get_transcriptome_fasta(data):
        gtf_fa = sailfish.create_combined_fasta(data)
    else:
        gtf_fa = dd.get_transcriptome_fasta(data)

    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    return pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                  fusions, samplename, data)
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Build a Salmon index with 31 bp k-mers, or return the existing one.

    Args:
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory; the index goes to
            ``<out_dir>/index/<genome build>`` (plus dash-joined
            disambiguation entries, if any).

    Returns:
        The index directory path.
    """
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        # Accessor name fixed: it was misspelled ``get_disambguate`` here.
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    # The combined FASTA is requested with the index directory computed above.
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        # Templates are filled from the local variables defined above.
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    """Build a RapMap index with 31 bp k-mers, or return the existing one.

    Args:
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        algorithm: Index flavour without the ``index`` suffix; ``"pseudo"``
            and ``"quasi"`` are the accepted values.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory; the index goes to
            ``<out_dir>/<index_type>/<genome build>`` (plus dash-joined
            disambiguation entries, if any).

    Returns:
        The index directory path.

    Raises:
        AssertionError: If ``algorithm + "index"`` is not a supported index.
    """
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        # Accessor name fixed: it was misspelled ``get_disambguate`` here.
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # The combined FASTA is requested with the index directory computed above.
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    # Look for the marker *inside* the index directory. Plain string
    # concatenation glued the file name onto the directory name, so a marker
    # stored in the directory was never found.
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k 31 -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating rapmap {index_type} for {gtf_fa}."
        # Templates are filled from the local variables defined above.
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    """Quantify transcripts from an alignment file with ``salmon quant``.

    Args:
        bam_file: Alignment file passed to Salmon with ``-a``.
        salmon_dir: Working directory; results go to ``<salmon_dir>/quant``.
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.

    Returns:
        Path to ``quant.sf``; returned immediately if it already exists.
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish.create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    # Parenthesised so both literals are unambiguously one command string;
    # without the parentheses the ``-o``/``-a`` half depended on the two
    # literals sharing a physical line.
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        # The command template is filled from the local variables above.
        do.run(cmd.format(**locals()), message, None)
    return out_file
def kallisto_index(gtf_file, ref_file, data, out_dir):
    """Create a Kallisto index file with 31 bp k-mers, or return the existing one.

    The index is written to ``<out_dir>/index/<stem>/<stem>.idx`` where
    ``<stem>`` is the genome build, extended with dash-joined disambiguation
    entries when those are set.

    Args:
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory for the index.

    Returns:
        Path to the ``.idx`` file.
    """
    index_root = os.path.join(out_dir, "index")
    stem = dd.get_genome_build(data)
    if dd.get_disambiguate(data):
        stem = "-".join([stem] + dd.get_disambiguate(data))
    idx_path = os.path.join(index_root, stem, stem + ".idx")

    kallisto = config_utils.get_program("kallisto", dd.get_config(data))

    # Prefer a user supplied transcriptome FASTA over the generated one.
    if not dd.get_transcriptome_fasta(data):
        gtf_fa = sailfish.create_combined_fasta(data)
    else:
        gtf_fa = dd.get_transcriptome_fasta(data)

    if not file_exists(idx_path):
        with file_transaction(idx_path) as tx_out_file:
            cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}"
            message = "Creating Kallisto index for {gtf_fa}."
            do.run(
                cmd.format(kallisto=kallisto, tx_out_file=tx_out_file,
                           gtf_fa=gtf_fa),
                message.format(gtf_fa=gtf_fa),
                None)
    return idx_path
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    """Quantify transcripts from an alignment file with ``salmon quant``.

    Args:
        bam_file: Alignment file passed to Salmon with ``-a``.
        salmon_dir: Working directory; results go to ``<salmon_dir>/quant``.
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.

    Returns:
        Path to ``quant.sf``; returned immediately if it already exists.
    """
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish.create_combined_fasta(data, salmon_dir)
    # Looked up once; the original repeated this lookup a few lines later and
    # also fetched a sample name that was never used.
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        # The command template is filled from the local variables above.
        do.run(cmd.format(**locals()), message, None)
    return out_file
def run_pizzly(data):
    """Collect the inputs for pizzly from ``data`` and run it.

    The transcriptome GTF is preferred over the regular GTF file. The
    transcriptome FASTA is passed through ``fasta.strip_transcript_versions``
    with a ``<name>-noversions.fa`` path in the pizzly directory as second
    argument (presumably the output path -- confirm against that helper).

    Args:
        data: Sample record handed to the ``dd`` accessors.

    Returns:
        Whatever ``pizzly`` returns (bound to ``outdir`` in the original).
    """
    samplename = dd.get_sample_name(data)
    pizzlydir = os.path.join(dd.get_work_dir(data), "pizzly")
    # Fall back to the regular GTF only when no transcriptome GTF is set.
    gtf = dd.get_transcriptome_gtf(data) or dd.get_gtf_file(data)

    if not dd.get_transcriptome_fasta(data):
        source_fa = sailfish.create_combined_fasta(data)
    else:
        source_fa = dd.get_transcriptome_fasta(data)

    base, _ext = os.path.splitext(os.path.basename(source_fa))
    stripped_fa = os.path.join(pizzlydir, base + "-noversions.fa")
    gtf_fa = fasta.strip_transcript_versions(source_fa, stripped_fa)

    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    return pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                  fusions, samplename, data)
def salmon_index(gtf_file, data, out_dir):
    """Create a Salmon index that keeps duplicate transcripts, or reuse one.

    The k-mer size comes from ``sailfish.pick_kmersize`` applied to the first
    input sequence file, and ``salmon index`` is run with
    ``--keepDuplicates``.

    Args:
        gtf_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory; the index goes to
            ``<out_dir>/index/<build string>`` (plus dash-joined
            disambiguation entries, if any).

    Returns:
        The index directory path.

    Raises:
        AssertionError: If the transcriptome FASTA does not exist.
    """
    build_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        build_dir = "-".join([build_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)

    # Prefer a user supplied transcriptome FASTA over the generated one.
    if not dd.get_transcriptome_fasta(data):
        gtf_fa = sailfish.create_combined_fasta(data)
    else:
        gtf_fa = dd.get_transcriptome_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa

    # Never read afterwards; the lookup is retained unchanged.
    tmpdir = dd.get_tmp_dir(data)

    if file_exists(os.path.join(build_dir, "versionInfo.json")):
        return build_dir

    kmersize = sailfish.pick_kmersize(dd.get_input_sequence_files(data)[0])
    with file_transaction(data, build_dir) as tx_out_dir:
        cmd = "{salmon} index --keepDuplicates -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(
            cmd.format(salmon=salmon, kmersize=kmersize, num_cores=num_cores,
                       tx_out_dir=tx_out_dir, gtf_fa=gtf_fa),
            message.format(gtf_fa=gtf_fa, kmersize=kmersize),
            None)
    return build_dir
def run_pizzly(data):
    """Assemble pizzly's arguments from the sample record and invoke it.

    Uses the transcriptome GTF when one is configured, otherwise the regular
    GTF file. The transcriptome FASTA (user supplied or generated) is handed
    to ``fasta.strip_transcript_versions`` together with a
    ``<name>-noversions.fa`` path under the pizzly directory.

    Args:
        data: Sample record handed to the ``dd`` accessors.

    Returns:
        Whatever ``pizzly`` returns (bound to ``outdir`` in the original).
    """
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")

    gtf = dd.get_transcriptome_gtf(data)
    gtf = gtf if gtf else dd.get_gtf_file(data)

    if not dd.get_transcriptome_fasta(data):
        gtf_fa = sailfish.create_combined_fasta(data)
    else:
        gtf_fa = dd.get_transcriptome_fasta(data)

    fa_name = os.path.basename(gtf_fa)
    noversion_name = os.path.splitext(fa_name)[0] + "-noversions.fa"
    gtf_fa = fasta.strip_transcript_versions(
        gtf_fa, os.path.join(pizzlydir, noversion_name))

    call_args = [
        gtf,
        gtf_fa,
        get_fragment_length(data),
        os.path.join(pizzlydir, "pizzly.cache"),
        pizzlydir,
        kallisto.get_kallisto_fusions(data),
        samplename,
        data,
    ]
    # The program path is resolved last, as in the original statement order.
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    return pizzly(pizzlypath, *call_args)
def salmon_index(gtf_file, ref_file, data, out_dir):
    """Create a Salmon index, or reuse the one already on disk.

    The k-mer size comes from ``sailfish.pick_kmersize`` applied to the first
    input sequence file. When ``versionInfo.json`` already exists in the
    index directory, a message is logged and nothing is rebuilt.

    Args:
        gtf_file: Not read by this function.
        ref_file: Not read by this function.
        data: Sample record handed to the ``dd`` accessors.
        out_dir: Base directory; the index goes to
            ``<out_dir>/index/<build string>`` (plus dash-joined
            disambiguation entries, if any).

    Returns:
        The index directory path.

    Raises:
        AssertionError: If the transcriptome FASTA does not exist.
    """
    target = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        target = "-".join([target] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)

    # Prefer a user supplied transcriptome FASTA over the generated one.
    if not dd.get_transcriptome_fasta(data):
        gtf_fa = sailfish.create_combined_fasta(data)
    else:
        gtf_fa = dd.get_transcriptome_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa

    # Never read afterwards; the lookup is retained unchanged.
    tmpdir = dd.get_tmp_dir(data)

    version_file = os.path.join(target, "versionInfo.json")
    if file_exists(version_file):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return target

    inputs = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(inputs[0])
    with file_transaction(data, target) as tx_out_dir:
        values = dict(salmon=salmon, kmersize=kmersize, num_cores=num_cores,
                      tx_out_dir=tx_out_dir, gtf_fa=gtf_fa)
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**values), message.format(**values), None)
    return target