Example #1
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." %
                    gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
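A note on the k-mer selection above: Salmon wants an odd k of at most 31, so an even read length is decremented before taking the minimum. Later examples call sailfish.pick_kmersize for the same purpose; a minimal standalone sketch of that logic (the helper body here is an assumption, not bcbio's exact implementation) could look like:

def pick_kmersize(readlength, max_k=31):
    """Pick an odd k-mer size capped at max_k from an estimated read length.

    Mirrors the inline logic above: even lengths are decremented so k stays
    odd, then capped at 31, the largest k Salmon/Sailfish indices support.
    """
    if readlength % 2 == 0:
        readlength -= 1
    return min(readlength, max_k)

# pick_kmersize(75) -> 31, pick_kmersize(25) -> 25, pick_kmersize(24) -> 23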
Example #2
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)

    if dd.get_transcriptome_align(data) and not dd.get_transcriptome_bam(data):
        file1, file2 = None, None

        if dd.get_disambiguate(data):
            bam_path = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(bam_path, data["dirs"], data["config"], is_retry=False, output_infix='-transcriptome')
            if len(fastq_paths) == 2:
                file1, file2 = fastq_paths
            else:
                file1, file2 = fastq_paths[0], None
        else:
            file1, file2 = dd.get_input_sequence_files(data)

        ref_file = dd.get_ref_file(data)
        logger.info("Transcriptome alignment was flagged to run, but the "
                    "transcriptome BAM file was not found. Aligning to the "
                    "transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
Example #3
File: umi.py Project: hliang/bcbio-nextgen
def convert_to_kallisto(data):
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename, "fastq")
    out_file = os.path.join(kallisto_dir, "barcodes.batch")
    umis = config_utils.get_program("umis", dd.get_config(data))
    if file_exists(out_file):
        return out_file
    if dd.get_minimum_barcode_depth(data):
        cb_histogram = os.path.join(work_dir, "umis", samplename, "cb-histogram.txt")
        cb_cutoff = dd.get_minimum_barcode_depth(data)
        cb_options = "--cb_histogram {cb_histogram} --cb_cutoff {cb_cutoff}"
        cb_options = cb_options.format(**locals())
    else:
        cb_options = ""
    cmd = ("{umis} kallisto {cb_options} --out_dir {tx_kallisto_dir} {fq1}")
    with file_transaction(data, kallisto_dir) as tx_kallisto_dir:
        safe_makedir(tx_kallisto_dir)
        message = ("Transforming %s to Kallisto singlecell format. "
                   % fq1)
        do.run(cmd.format(**locals()), message)
    return out_file
Example #4
def salmon_decoy_index(gtf_file, data, out_dir):
    input_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    decoy_transcriptome = os.path.join(
        input_dir,
        sailfish.get_build_string(data) + "-decoy.fa")
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    decoy_sequence_file = get_decoy_sequence_file(data)
    decoy_name_file = get_decoy_name_file(data)
    gtf_fa = create_decoy_transcriptome(gtf_fa, decoy_sequence_file,
                                        decoy_transcriptome)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." %
                    gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = (
            "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa} "
            "--decoys {decoy_name_file} ")
        message = "Creating decoy-aware Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
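create_decoy_transcriptome above is presumably a concatenation step: Salmon's decoy-aware indexing expects a single FASTA with the transcripts first and the decoy sequences (typically the genome) appended, plus a separate file listing the decoy names. A minimal sketch under that assumption (not bcbio's actual helper):

import shutil

def create_decoy_transcriptome(transcriptome_fa, decoy_fa, out_file):
    """Append decoy sequences to the transcriptome FASTA (sketch)."""
    with open(out_file, "wb") as out_handle:
        for fname in (transcriptome_fa, decoy_fa):
            with open(fname, "rb") as in_handle:
                shutil.copyfileobj(in_handle, out_handle)
    return out_file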
Example #5
def rapmap_index(gtf_file, ref_file, algorithm, data, out_dir):
    valid_indexes = ["pseudoindex", "quasiindex"]
    index_type = algorithm + "index"
    assert index_type in valid_indexes, \
        "RapMap only supports %s indices." % valid_indexes
    out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    # use user supplied transcriptome FASTA file if it exists
    if dd.get_transcriptome_fasta(data):
        out_dir = os.path.join(out_dir, index_type, dd.get_genome_build(data))
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "rapidx.jfhash")):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    message = "Creating rapmap {index_type} for {gtf_fa} with {kmersize} bp kmers."
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{rapmap} {index_type} -k {kmersize} -i {tx_out_dir} -t {gtf_fa}"
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #6
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)

    if dd.get_transcriptome_align(data):
        # to create a disambiguated transcriptome file realign with bowtie2
        if dd.get_disambiguate(data):
            logger.info("Aligning to the transcriptome with bowtie2 using the "
                        "disambiguated reads.")
            bam_path = data["work_bam"]
            fastq_paths = alignprep._bgzip_from_bam(
                bam_path,
                data["dirs"],
                data,
                is_retry=False,
                output_infix='-transcriptome')
            if len(fastq_paths) == 2:
                file1, file2 = fastq_paths
            else:
                file1, file2 = fastq_paths[0], None
            ref_file = dd.get_ref_file(data)
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
        else:
            file1, file2 = dd.get_input_sequence_files(data)
        if not dd.get_transcriptome_bam(data):
            ref_file = dd.get_ref_file(data)
            logger.info(
                "Transcriptome alignment was flagged to run, but the "
                "transcriptome BAM file was not found. Aligning to the "
                "transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    data = spikein.counts_spikein(data)
    return [[data]]
Example #7
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fq1, fq2 = dd.get_input_sequence_files(data)
    fq2 = fq2 if fq2 else ""
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    transform = dd.get_umi_type(data)
    transform_data = transforms[transform]
    safe_makedir(umi_dir)
    transform_file = os.path.join(umi_dir, transform + ".json")
    transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    index_option = "--dual_index" if transform_data["dual"] else ""
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = (
        "{umis} fastqtransform {index_option} {split_option} {transform_file} "
        "{fq1} {fq2} "
        "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
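The transform file written above is a JSON description telling umis where the barcode and UMI live in each read; judging by the code, write_transform_file just serializes the transform dictionary (which contains at least a "dual" flag) to disk. A minimal sketch, assuming plain JSON serialization:

import json

def write_transform_file(transform_data, transform_file):
    """Serialize a umis transform description (a plain dict) to JSON (sketch)."""
    with open(transform_file, "w") as out_handle:
        json.dump(transform_data, out_handle, indent=2)
    return transform_file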
Example #8
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #9
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    readlength = fastq.estimate_read_length(fq1)
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    logger.info("kmersize used for salmon index at spikein quant: %s" %
                kmersize)
    kmersize = kmersize if dd.get_analysis(data).lower() != "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmersize)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
Example #10
def trim_adapters(data):
    fq1, fq2 = dd.get_input_sequence_files(data)
    skewer = config_utils.get_program("skewer", data, default="skewer")
    nthreads = dd.get_num_cores(data)
    samplename = dd.get_sample_name(data)
    out_dir = safe_makedir(
        os.path.join(dd.get_work_dir(data), "trimmed", samplename))
    of1 = os.path.join(out_dir, samplename + "-trimmed-pair1.fastq.gz")
    of2 = os.path.join(out_dir, samplename + "-trimmed-pair2.fastq.gz")
    of2 = of2 if fq2 else None
    if fq1 and fq2:
        if file_exists(of1) and file_exists(of2):
            return of1, of2
    else:
        if file_exists(of1):
            return of1, None
    safe_makedir(out_dir)
    file_string = "{fq1} {fq2} " if fq2 else "{fq1} "
    fw_cmd = _fw_command(data)
    rv_cmd = _rv_command(data)
    mode = "tail" if not fq2 else "pe"
    cmd = ("{skewer} --min 25 --threads {nthreads} -q 5 "
           "{fw_cmd} "
           "{rv_cmd} "
           "-m {mode} "
           "--compress --output {out_stem} ") + file_string
    with file_transaction(out_dir) as tx_out_dir:
        safe_makedir(tx_out_dir)
        out_stem = os.path.join(tx_out_dir, samplename)
        message = "Trimming {fq1}, {fq2} with skewer.".format(**locals())
        do.run(cmd.format(**locals()), message)
    return of1, of2
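Nearly every example wraps its output in file_transaction, which yields a temporary path and moves the result into place only after the block completes, so interrupted runs never leave partial outputs behind. A minimal sketch of the idiom for a single output path (bcbio's real implementation also handles directories, multiple files, and remote storage):

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def file_transaction(out_path):
    """Yield a temporary path; move it to out_path only on success (sketch)."""
    tmp_dir = tempfile.mkdtemp(dir=os.path.dirname(out_path) or ".")
    tx_path = os.path.join(tmp_dir, os.path.basename(out_path))
    try:
        yield tx_path
        if os.path.exists(tx_path):
            shutil.move(tx_path, out_path)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)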
Example #11
def trim_adapters(data):
    fq1, fq2 = dd.get_input_sequence_files(data)
    skewer = config_utils.get_program("skewer", data, default="skewer")
    nthreads = dd.get_num_cores(data)
    samplename = dd.get_sample_name(data)
    out_dir = os.path.join(dd.get_work_dir(data), "trimmed", samplename)
    of1 = os.path.join(out_dir, samplename + "-trimmed-pair1.fastq.gz")
    of2 = os.path.join(out_dir, samplename + "-trimmed-pair2.fastq.gz")
    of2 = of2 if fq2 else None
    if fq1 and fq2:
        if file_exists(of1) and file_exists(of2):
            return of1, of2
    else:
        if file_exists(of1):
            return of1, None
    safe_makedir(out_dir)
    file_string = "{fq1} {fq2} " if fq2 else "{fq1} "
    fw_cmd = _fw_command(data)
    rv_cmd = _rv_command(data)
    mode = "tail" if not fq2 else "pe"
    cmd = ("{skewer} --min 25 --threads {nthreads} -q 5 "
           "{fw_cmd} "
           "{rv_cmd} "
           "-m {mode} "
           "--compress --output {out_stem} ") + file_string
    with file_transaction(out_dir) as tx_out_dir:
        safe_makedir(tx_out_dir)
        out_stem = os.path.join(tx_out_dir, samplename)
        message = "Trimming {fq1}, {fq2} with skewer.".format(**locals())
        do.run(cmd.format(**locals()), message)
    return of1, of2
Example #12
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"],
                                           data["sam_ref"], data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = link_bam_file(
                fastq1,
                os.path.join(data["dirs"]["work"], "prealign",
                             data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], data["sam_ref"],
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct?\n" +
            "If it is a fastq file (not pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    return [[data]]
Example #13
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
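For pre-aligned BAM input without bam_sort or bam_clean, both versions fall back to link_bam_file, which by its usage places a link to the original BAM under work/prealign/<sample> and returns the linked path. A hypothetical sketch of such a helper:

import os

def link_bam_file(orig_file, new_dir):
    """Symlink a pre-aligned BAM into the work directory (hypothetical sketch)."""
    if not os.path.exists(new_dir):
        os.makedirs(new_dir)
    linked = os.path.join(new_dir, os.path.basename(orig_file))
    if not os.path.exists(linked):
        os.symlink(os.path.abspath(orig_file), linked)
    return linked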
Example #14
def filter_barcodes(data):
    # if data was pre-demultiplexed, there is no need to filter the barcodes
    if dd.get_demultiplexed(data):
        return [[data]]
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        logger.info("No cellular barcodes found, skipping filtering.")
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, six.string_types):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "

    fq1_cmd = "{fq1} "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip -c > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #15
def filter_barcodes(data):
    # if data was pre-demultiplexed, there is no need to filter the barcodes
    if dd.get_demultiplexed(data):
        return [[data]]
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        logger.info("No cellular barcodes found, skipping filtering.")
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, six.string_types):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
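This version differs from the previous filter_barcodes in how it feeds the FASTQ to umis: gzipped input goes through bash process substitution (<(gzip -cd ...)), which requires the command to run under bash rather than plain sh. is_gzipped is presumably a simple extension check; a sketch under that assumption:

def is_gzipped(fname):
    """Guess gzip compression from the file extension (assumed helper)."""
    return fname.endswith((".gz", ".gzip"))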
Example #16
def run_sailfish_index(*samples):
    fq1, _ = dd.get_input_sequence_files(samples[0][0])
    kmer_size = estimate_kmer_size(fq1)
    Build = namedtuple('Build', ['build', 'ref', 'gtf'])
    builds = {Build(get_build_string(x), dd.get_ref_file(x), dd.get_gtf_file(x))
              for x in dd.sample_data_iterator(samples)}
    data = samples[0][0]
    indexdirs = {}
    for build in builds:
        indexdirs[build.build] = sailfish_index(build.ref, build.gtf, data,
                                                build.build, kmer_size)
    return samples
Example #17
File: umi.py Project: vamst/bcbio-nextgen
def filter_barcodes(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, basestring):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #18
def run_rapmap_align(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    rapmap_dir = os.path.join(work_dir, "rapmap", samplename)
    gtf_file = dd.get_gtf_file(data)
    fasta_file = dd.get_ref_file(data)
    out_file = rapmap_align(fq1, fq2, rapmap_dir, gtf_file, fasta_file,
                            "quasi", data)
    data = dd.set_transcriptome_bam(data, out_file)
    return [[data]]
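The two-or-one unpacking of dd.get_input_sequence_files repeats throughout these examples: two files mean paired-end input, a single file means single-end with fq2 left as None. A small hypothetical helper consolidating the idiom:

def unpack_fastqs(files):
    """Return (fq1, fq2) from a 1- or 2-element list of input sequence files.

    fq2 is None for single-end data, matching the pattern repeated above.
    (Hypothetical helper, not part of bcbio.)
    """
    if len(files) == 2:
        return files[0], files[1]
    return files[0], None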
Example #19
def prepare_input_data(config):
    """ In case of disambiguation, we want to run fusion calling on
    the disambiguated reads, which are in the work_bam file.
    As EricScript accepts 2 fastq files as input, we need to convert
    the .bam to 2 .fq files.
    """

    if not dd.get_disambiguate(config):
        return dd.get_input_sequence_files(config)

    work_bam = dd.get_work_bam(config)
    logger.info("Converting disambiguated reads to fastq...")
    fq_files = convert_bam_to_fastq(work_bam, dd.get_work_dir(config), None,
                                    None, config)
    return fq_files
Example #20
def barcode_histogram(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    out_file = os.path.join(sample_dir, "cb-histogram.txt")
    if file_exists(out_file):
        return [[data]]
    fq1_cmd = fq1
    cmd = "{umis} cb_histogram {fq1_cmd} > {tx_out_file}"
    with file_transaction(out_file) as tx_out_file:
        message = "Computing cellular barcode counts for %s." % fq1
        do.run(cmd.format(**locals()), message)
    return [[data]]
Example #21
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    # if RSEM is set to run but the aligner didn't create the transcriptome
    # BAM file, make one with bowtie2
    if dd.get_rsem(data) and not dd.get_transcriptome_bam(data):
        file1, file2 = dd.get_input_sequence_files(data)
        ref_file = dd.get_ref_file(data)
        logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                    "was not found. Aligning to the transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
Example #22
def sailfish_index(gtf_file, ref_file, data, build):
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "sailfish", "index", build)
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = create_combined_fasta(data)
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(data, out_dir) as tx_out_dir:
        fq1, _ = dd.get_input_sequence_files(data)
        kmersize = pick_kmersize(fq1)
        cmd = ("{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} "
               "-k {kmersize}")
        message = "Creating sailfish index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #23
def run_kallisto_singlecell(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
Example #24
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
Example #25
def run_rapmap_pseudoalign(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    rapmap_dir = os.path.join(work_dir, "rapmap", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, fasta_file, data)
    data = dd.set_transcriptome_bam(data, out_file)
    return [[data]]
Example #26
def run_salmon_reads(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, salmon_dir)
    return [[data]]
Example #27
def barcode_histogram(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    out_file = os.path.join(sample_dir, "cb-histogram.txt")
    filtered_out_file = os.path.join(sample_dir, "cb-histogram-filtered.txt")
    fq1_cmd = fq1
    cmd = "{umis} cb_histogram {fq1_cmd} > {tx_out_file}"
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            message = "Computing cellular barcode counts for %s." % fq1
            do.run(cmd.format(**locals()), message)
    cutoff = dd.get_minimum_barcode_depth(data)
    filter_barcode_histogram(filtered_out_file, out_file, cutoff)
    newdata = dd.set_histogram_counts(data, filtered_out_file)
    return [[newdata]]
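filter_barcode_histogram is not shown here; from its arguments it presumably writes a copy of the cb-histogram keeping only barcodes observed at least cutoff times. A sketch under that assumption, treating the histogram as tab-separated "barcode<TAB>count" lines as produced by umis cb_histogram:

def filter_barcode_histogram(filtered_out_file, in_file, cutoff):
    """Keep histogram lines whose count meets the cutoff (assumed format)."""
    with open(in_file) as in_handle, open(filtered_out_file, "w") as out_handle:
        for line in in_handle:
            barcode, count = line.split()
            if int(count) >= int(cutoff or 0):
                out_handle.write(line)
    return filtered_out_file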
Example #28
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("We don't support kallisto for single-end reads and fusion "
                 "calling with pizzly does not accept single end reads.")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
Example #29
def filter_barcodes(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = dd.get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    bc1 = None
    bc2 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, basestring):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #30
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("bcbio doesn't support kallisto for single-end reads, we can "
                 "add support for this if you open up an issue about it here: "
                 "https://github.com/chapmanb/bcbio-nextgen/issues")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
Example #31
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("We don't support kallisto for single-end reads and fusion "
                 "calling with pizzly does not accept single end reads.")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file,
                               data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
Example #32
def counts_spikein(data):
    data = utils.to_single_data(data)
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "spikein", samplename)
    fasta_file = dd.get_spikein_fasta(data)
    if not fasta_file:
        return data
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    kmer = 31 if not dd.get_analysis(data).lower() == "smallrna-seq" else 15
    fasta_index = _index_spikein(fasta_file, salmon_dir, data, kmer)
    out_file = _salmon_quant_reads(fq1, fq2, salmon_dir, fasta_index, data)
    data = dd.set_spikein_counts(data, out_file)
    return data
Example #33
def run_kallisto_rnaseq(data):
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    kallisto_dir = os.path.join(work_dir, "kallisto", samplename)
    gtf_file = dd.get_gtf_file(data)
    files = dd.get_input_sequence_files(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    assert fq2, ("bcbio doesn't support kallisto for single-end reads, we can "
                 "add support for this if you open up an issue about it here: "
                 "https://github.com/bcbio/bcbio-nextgen/issues")
    out_file = kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data)
    data = dd.set_kallisto_quant(data, out_file)
    return [[data]]
Example #34
def run_sailfish(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    if not fastq.is_fastq(fq1):
        return [[data]]
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
Example #35
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file,
                                  data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    return [[data]]
Example #36
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    # if RSEM is set to run but the aligner didn't create the transcriptome
    # BAM file, make one with bowtie2
    if dd.get_disambiguate(data):
        logger.info("RSEM is not supported yet for disambiguation protocols. "
                    "See https://github.com/chapmanb/bcbio-nextgen/issues/859")
        return [[data]]
    if dd.get_rsem(data) and not dd.get_transcriptome_bam(data):
        file1, file2 = dd.get_input_sequence_files(data)
        ref_file = dd.get_ref_file(data)
        logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                    "was not found. Aligning to the transcriptome with bowtie2.")
        data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
Example #37
def run_salmon_reads(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    fasta_file = dd.get_ref_file(data)
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, fasta_file, data)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    return [[data]]
Example #38
def run_salmon_decoy(data):
    data = utils.to_single_data(data)
    files = dd.get_input_sequence_files(data)
    if bam.is_bam(files[0]):
        files = fastq.convert_bam_to_fastq(files[0], data["dirs"]["work"],
                                           data, data["dirs"], data["config"])
    samplename = dd.get_sample_name(data)
    work_dir = dd.get_work_dir(data)
    salmon_dir = os.path.join(work_dir, "salmon", samplename)
    gtf_file = dd.get_gtf_file(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    index = salmon_decoy_index(gtf_file, data, os.path.dirname(salmon_dir))
    out_file = salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, data, index)
    data = dd.set_salmon(data, out_file)
    data = dd.set_salmon_dir(data, salmon_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(salmon_dir))
    data = dd.update_summary_qc(data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
Example #39
def salmon_index(gtf_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index --keepDuplicates -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #40
def process_alignment(data):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and utils.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"],
                                           data)
        elif sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.check_header(out_bam, data["rgnames"], data["sam_ref"], data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        data["work_bam"] = dedup_bam
    elif fastq1 and utils.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    else:
        raise ValueError("Could not process input file: %s" % fastq1)
    return [[data]]
Example #41
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        logger.info("Transcriptome index for %s detected, skipping building." % gtf_fa)
        return out_dir
    files = dd.get_input_sequence_files(data)
    kmersize = sailfish.pick_kmersize(files[0])
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir