def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    """Quantify transcript expression with sailfish.

    Builds (or reuses) a sailfish index, then runs ``sailfish quant`` on the
    FASTQ input. ``fq2`` may be None for single-end data. Returns the path to
    the ``quant.sf`` output file, short-circuiting if it already exists.
    """
    safe_makedir(sailfish_dir)
    quant_dir = os.path.join(sailfish_dir, "quant")
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    # Reuse the shared heuristic instead of duplicating the kmer-size logic
    # (this body previously inlined an identical copy of estimate_kmer_size).
    kmer_size = estimate_kmer_size(fq1)
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir,
                                  kmer_size)
    num_cores = dd.get_num_cores(data)
    # Renamed from `sailfish` so the local no longer shadows this function.
    sailfish_bin = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish_bin} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    # Gzipped input is decompressed on the fly via bash process substitution.
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, quant_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    _sleuthify_sailfish(quant_dir)
    return out_file
def counts_spikein(data):
    """Quantify reads against the spike-in FASTA with salmon.

    Indexes the spike-in sequences (kmer size derived from read length,
    capped at 31 and forced odd; 15 for smallRNA-seq), quantifies the
    sample's FASTQs, and records the counts file in ``data``.

    NOTE(review): this module defines ``counts_spikein`` twice with
    identical bodies; the later definition shadows this one at import time.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    salmon_dir = os.path.join(dd.get_work_dir(data), "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        # No spike-in configured for this sample; nothing to do.
        return data
    files = dd.get_input_sequence_files(data)
    fq1, fq2 = files if len(files) == 2 else (files[0], None)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    readlength = fastq.estimate_read_length(fq1)
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    if dd.get_analysis(data).lower() == "smallrna-seq":
        kmersize = 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
def estimate_kmer_size(fq):
    """Choose an odd kmer size for indexing based on the read length of fq.

    Reads of 30bp or longer use the fixed default of 25; shorter reads back
    off to read length minus 5, decremented to the nearest odd number.
    """
    read_length = int(fastq.estimate_read_length(fq))
    if read_length >= 30:
        return 25
    # Short reads: shrink by 5 and force the result odd.
    ksize = read_length - 5
    if ksize % 2 == 0:
        ksize -= 1
    return ksize
def pick_kmersize(fq):
    """
    pick an appropriate kmer size based off of
    https://www.biostars.org/p/201474/
    tl;dr version: pick 31 unless the reads are very small, if not then
    guess that readlength / 2 is about right.
    """
    readlength = fastq.estimate_read_length(fq)
    # Cap at 31; below that, half the read length is a reasonable guess.
    kmersize = min(31, int(round(readlength / 2)))
    # Kmer sizes must be odd (31 already is, so only the short case bumps).
    if kmersize % 2 == 0:
        kmersize += 1
    return kmersize
def counts_spikein(data):
    """Quantify reads against the spike-in FASTA with salmon.

    Builds a salmon index over the spike-in sequences using a kmer size
    derived from the read length (odd, capped at 31; forced to 15 for
    smallRNA-seq analyses), quantifies the sample's reads against it, and
    stores the resulting counts file in ``data``.

    NOTE(review): an identical ``counts_spikein`` is defined earlier in this
    module; this later definition is the one that takes effect.
    """
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        # Sample has no spike-in reference configured.
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    # Force the read length odd before capping so the kmer size stays odd.
    readlength = fastq.estimate_read_length(fq1)
    readlength -= 1 - readlength % 2
    kmersize = min(readlength, 31)
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    if dd.get_analysis(data).lower() == "smallrna-seq":
        kmersize = 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data