Ejemplo n.º 1
0
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(sailfish_dir, "quant")
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    kmer_size = int(fastq.estimate_read_length(fq1))
    if kmer_size < 30:
        # kmer size must be odd
        kmer_size = kmer_size - 5
        kmer_size = kmer_size if kmer_size % 2 else kmer_size - 1
    else:
        kmer_size = 25
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir, kmer_size)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, quant_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    _sleuthify_sailfish(quant_dir)
    return out_file
Ejemplo n.º 2
0
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    readlength = fastq.estimate_read_length(fq1)
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    logger.info("kmersize used for salmon index at spikein quant: %s" %
                kmersize)
    kmersize = kmersize if not dd.get_analysis(
        data).lower() == "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
Ejemplo n.º 3
0
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(sailfish_dir, "quant")
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    kmer_size = int(fastq.estimate_read_length(fq1))
    if kmer_size < 30:
        # kmer size must be odd
        kmer_size = kmer_size - 5
        kmer_size = kmer_size if kmer_size % 2 else kmer_size - 1
    else:
        kmer_size = 25
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir,
                                  kmer_size)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, quant_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    _sleuthify_sailfish(quant_dir)
    return out_file
Ejemplo n.º 4
0
def estimate_kmer_size(fq):
    kmer_size = int(fastq.estimate_read_length(fq))
    if kmer_size < 30:
        # kmer size must be odd
        kmer_size = kmer_size - 5
        kmer_size = kmer_size if kmer_size % 2 else kmer_size - 1
    else:
        kmer_size = 25
    return kmer_size
Ejemplo n.º 5
0
def estimate_kmer_size(fq):
    kmer_size = int(fastq.estimate_read_length(fq))
    if kmer_size < 30:
        # kmer size must be odd
        kmer_size = kmer_size - 5
        kmer_size = kmer_size if kmer_size % 2 else kmer_size - 1
    else:
        kmer_size = 25
    return kmer_size
Ejemplo n.º 6
0
def pick_kmersize(fq):
    """
    pick an appropriate kmer size based off of https://www.biostars.org/p/201474/
    tl;dr version: pick 31 unless the reads are very small, if not then guess
    that readlength / 2 is about right.
    """
    readlength = fastq.estimate_read_length(fq)
    halfread = int(round(readlength / 2))
    if halfread >= 31:
        kmersize = 31
    else:
        kmersize = halfread
    if kmersize % 2 == 0:
        kmersize += 1
    return kmersize
Ejemplo n.º 7
0
def pick_kmersize(fq):
    """
    pick an appropriate kmer size based off of https://www.biostars.org/p/201474/
    tl;dr version: pick 31 unless the reads are very small, if not then guess
    that readlength / 2 is about right.
    """
    readlength = fastq.estimate_read_length(fq)
    halfread = int(round(readlength / 2))
    if halfread >= 31:
        kmersize = 31
    else:
        kmersize = halfread
    if kmersize % 2 == 0:
        kmersize += 1
    return kmersize
Ejemplo n.º 8
0
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    readlength = fastq.estimate_read_length(fq1)
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    logger.info("kmersize used for salmon index at spikein quant: %s" % kmersize)
    kmersize = kmersize if not dd.get_analysis(data).lower() == "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data