Example #1
0
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(out_dir + "rapidx.jfhash"):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #2
0
def salmon_decoy_index(gtf_file, data, out_dir):
    input_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    decoy_transcriptome = os.path.join(
        input_dir,
        sailfish.get_build_string(data) + "-decoy.fa")
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    decoy_sequence_file = get_decoy_sequence_file(data)
    decoy_name_file = get_decoy_name_file(data)
    gtf_fa = create_decoy_transcriptome(gtf_fa, get_decoy_sequence_file(data),
                                        decoy_transcriptome)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." %
                    gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = (
            "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa} "
            "--decoys {decoy_name_file} ")
        message = "Creating decoy-aware Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #3
0
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." %
                    gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #4
0
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(out_dir + "rapidx.jfhash"):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #5
0
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #6
0
def use_installed_transcriptome(data):
    user_fa = dd.get_transcriptome_fasta(data)
    user_gtf = dd.get_transcriptome_gtf(data)
    if not user_fa and not user_gtf:
        return True
    else:
        return False
Example #7
0
def run_pizzly(data):
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    samplename = dd.get_sample_name(data)
    gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir
Example #8
0
def run_pizzly(data):
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    samplename = dd.get_sample_name(data)
    gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir
Example #9
0
def use_installed_transcriptome(data):
    user_fa = dd.get_transcriptome_fasta(data)
    user_gtf = dd.get_transcriptome_gtf(data)
    if not user_fa and not user_gtf:
        return True
    else:
        return False
Example #10
0
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #11
0
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #12
0
def kallisto_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index")
    out_stem = dd.get_genome_build(data)
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    index_dir = os.path.join(out_dir, out_stem)
    out_file = os.path.join(index_dir, out_stem + ".idx")
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}"
        message = "Creating Kallisto index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
Example #13
0
def kallisto_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index")
    out_stem = dd.get_genome_build(data)
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    index_dir = os.path.join(out_dir, out_stem)
    out_file = os.path.join(index_dir, out_stem + ".idx")
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        cmd = "{kallisto} index -k 31 -i {tx_out_file} {gtf_fa}"
        message = "Creating Kallisto index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
Example #14
0
def run_pizzly(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    gtf = dd.get_transcriptome_gtf(data)
    if not gtf:
        gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    stripped_fa = os.path.splitext(os.path.basename(gtf_fa))[0] + "-noversions.fa"
    stripped_fa = os.path.join(pizzlydir, stripped_fa)
    gtf_fa = fasta.strip_transcript_versions(gtf_fa, stripped_fa)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir
Example #15
0
def run_pizzly(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    pizzlydir = os.path.join(work_dir, "pizzly")
    gtf = dd.get_transcriptome_gtf(data)
    if not gtf:
        gtf = dd.get_gtf_file(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    stripped_fa = os.path.splitext(
        os.path.basename(gtf_fa))[0] + "-noversions.fa"
    stripped_fa = os.path.join(pizzlydir, stripped_fa)
    gtf_fa = fasta.strip_transcript_versions(gtf_fa, stripped_fa)
    fraglength = get_fragment_length(data)
    cachefile = os.path.join(pizzlydir, "pizzly.cache")
    fusions = kallisto.get_kallisto_fusions(data)
    pizzlypath = config_utils.get_program("pizzly", dd.get_config(data))
    outdir = pizzly(pizzlypath, gtf, gtf_fa, fraglength, cachefile, pizzlydir,
                    fusions, samplename, data)
    return outdir
Example #16
0
def salmon_index(gtf_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index --keepDuplicates -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #17
0
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #18
0
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = salmon_dir
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    num_cores = dd.get_num_cores(data)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #19
0
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    num_cores = dd.get_num_cores(data)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        do.run(cmd.format(**locals()), message, None)
    return out_file