Python is_gzipped Beispiele, bcbio.utils.is_gzipped Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: spikein.py Projekt: zhangj5/bcbio-nextgen

def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} " "-o {tx_out_dir} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(
            fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

Beispiel #2

0

Datei anzeigen

Datei: sailfish.py Projekt: samesun/bcbio-nextgen

def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(sailfish_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
        cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, sailfish_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file

Beispiel #3

0

Datei anzeigen

def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    safe_makedir(salmon_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--numBootstraps 30 --useVBOpt "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

Beispiel #4

0

Datei anzeigen

Datei: rapmap.py Projekt: zhangj5/bcbio-nextgen

def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data,
                                    rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq2_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("%smapping %s and %s to %s with Rapmap. "
                       % (algorithm, fq1, fq2, rapmap_index))
        do.run(cmd.format(**locals()), run_message, None)
    return out_file

Beispiel #5

0

Datei anzeigen

Datei: salmon.py Projekt: unix0000/bcbio-nextgen

def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(
            fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

Beispiel #6

0

Datei anzeigen

Datei: spikein.py Projekt: DoaneAS/bcbio-nextgen

def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

Beispiel #7

0

Datei anzeigen

Datei: salmon.py Projekt: hliang/bcbio-nextgen

def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    num_cores = dd.get_num_cores(data)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file

Beispiel #8

0

Datei anzeigen

Datei: sailfish.py Projekt: dlorson/bcbio-nextgen

def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(sailfish_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, sailfish_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    _sleuthify_sailfish(sailfish_dir)
    return out_file

Beispiel #9

0

Datei anzeigen

def decorate_problem_regions(query_bed, problem_bed_dir):
    """
    decorate query_bed with percentage covered by BED files of regions specified
    in the problem_bed_dir
    """
    if utils.is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if utils.file_exists(out_file):
        return out_file
    bed_files = glob.glob(os.path.join(problem_bed_dir, "*.bed"))
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    header = "\t".join(["chr", "start", "end", "name", "coverage",
                        "completeness"] + names)
    cmd = ("bedtools annotate -i {query_bed} -files {bed_file_string} "
           "-names {names_string} | sed -s 's/^#.*$/#{header}/' | bgzip -c > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file

Beispiel #10

0

Datei anzeigen

Datei: star.py Projekt: Lobage/bcbio-nextgen

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" % " ".join(ALIGN_TAGS))
    cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "
    sam_to_bam = bam.sam_to_bam_stream_cmd(config)
    sort = bam.sort_cmd(config)
    cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
    run_message = "Running STAR aligner on %s and %s." % (fastq_file, ref_file)
    with file_transaction(final_out) as tx_final_out:
        do.run(cmd.format(**locals()), run_message, None)
    return final_out

Beispiel #11

0

Datei anzeigen

Datei: coverage.py Projekt: dlorson/bcbio-nextgen

def decorate_problem_regions(query_bed, problem_bed_dir):
    """
    decorate query_bed with percentage covered by BED files of regions specified
    in the problem_bed_dir
    """
    if is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if file_exists(out_file):
        return out_file
    bed_files = _find_bed_files(problem_bed_dir)
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    with open_gzipsafe(query_bed) as in_handle:
        header = map(str, in_handle.next().strip().split())
    header = "\t".join(header + names)
    cmd = (
        "bedtools annotate -i {query_bed} -files {bed_file_string} "
        "-names {names_string} | sed -s 's/^#.*$/{header}/' | bgzip -c > {tx_out_file}"
    )
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file

Beispiel #12

0

Datei anzeigen

Datei: coverage.py Projekt: crpavia/bcbio-nextgen

def decorate_problem_regions(query_bed, problem_bed_dir):
    """
    decorate query_bed with percentage covered by BED files of regions specified
    in the problem_bed_dir
    """
    if utils.is_gzipped(query_bed):
        stem, _ = os.path.splitext(query_bed)
        stem, ext = os.path.splitext(stem)
    else:
        stem, ext = os.path.splitext(query_bed)
    out_file = stem + ".problem_annotated" + ext + ".gz"
    if utils.file_exists(out_file):
        return out_file
    bed_files = glob.glob(os.path.join(problem_bed_dir, "*.bed"))
    bed_file_string = " ".join(bed_files)
    names = [os.path.splitext(os.path.basename(x))[0] for x in bed_files]
    names_string = " ".join(names)
    with utils.open_gzipsafe(query_bed) as in_handle:
        header = map(str, in_handle.next().strip().split())
    header = "\t".join(header + names)
    cmd = ("bedtools annotate -i {query_bed} -files {bed_file_string} "
           "-names {names_string} | sed -s 's/^#.*$/{header}/' | bgzip -c > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Annotate %s with problem regions." % query_bed
        do.run(cmd.format(**locals()), message)
    return out_file

Beispiel #13

0

Datei anzeigen

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd SAM {srna_opts} "
            "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
        print("hello")

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data

Beispiel #14

0

Datei anzeigen

Datei: sailfish.py Projekt: crpavia/bcbio-nextgen

def sailfish(fq1, fq2, align_dir, gtf_file, ref_file, strandedness, data):
    sailfish_idx = sailfish_index(gtf_file, ref_file, data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd = " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, align_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return align_dir

Beispiel #15

0

Datei anzeigen

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f

Beispiel #16

0

Datei anzeigen

Datei: bcbio_setup_genome.py Projekt: chapmanb/bcbio-nextgen

def install_gtf_file(build_dir, gtf, build):
    out_file = os.path.join(build_dir, RNASEQ_DIR, "ref-transcripts.gtf")
    if not file_exists(out_file):
        if is_gzipped(gtf):
            with gzip.open(gtf_file, 'rb') as in_handle:
                with open(out_file, 'wb') as out_handle:
                    shutil.copyfileobj(in_handle, out_handle)
        else:
            shutil.copyfile(gtf, out_file)
    return out_file

Beispiel #17

0

Datei anzeigen

def install_gtf_file(build_dir, gtf, build):
    out_file = os.path.join(build_dir, RNASEQ_DIR, "ref-transcripts.gtf")
    if not file_exists(out_file):
        if is_gzipped(gtf):
            with gzip.open(gtf_file, 'rb') as in_handle:
                with open(out_file, 'wb') as out_handle:
                    shutil.copyfileobj(in_handle, out_handle)
        else:
            shutil.copyfile(gtf, out_file)
    return out_file

Beispiel #18

0

Datei anzeigen

Datei: star.py Projekt: chapmanb/bcbio-nextgen

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f

Beispiel #19

0

Datei anzeigen

Datei: star.py Projekt: guillermo-carrasco/bcbio-nextgen

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data

Beispiel #20

0

Datei anzeigen

Datei: star.py Projekt: hshujia/bcbio-nextgen

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" %
           " ".join(ALIGN_TAGS))
    cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"),
                               False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_rsem(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data

Beispiel #21

0

Datei anzeigen

def filter_barcodes(data):
    # if data was pre-demultiplexed, there is no need to filter the barcodes
    if dd.get_demultiplexed(data):
        return [[data]]
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        logger.info("No cellular barcodes found, skipping filtering.")
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, six.string_types):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]

Beispiel #22

0

Datei anzeigen

Datei: fastq.py Projekt: GetBen/bcbio-nextgen

def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped
    """
    if fastq.is_fastq(in_file) and not utils.is_gzipped(in_file):
        gzipped_file = in_file + ".gz"
        if file_exists(gzipped_file):
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()), message)
        return gzipped_file
    return in_file

Beispiel #23

0

Datei anzeigen

Datei: umi.py Projekt: vladsaveliev/bcbio-nextgen

def filter_barcodes(data):
    # if data was pre-demultiplexed, there is no need to filter the barcodes
    if dd.get_demultiplexed(data):
        return [[data]]
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        logger.info("No cellular barcodes found, skipping filtering.")
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, six.string_types):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]

Beispiel #24

0

Datei anzeigen

Datei: fastq.py Projekt: hshujia/bcbio-nextgen

def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped
    """
    if (fastq.is_fastq(in_file) and not utils.is_gzipped(in_file)
          and not in_file.startswith(utils.SUPPORTED_REMOTES)):
        gzipped_file = in_file + ".gz"
        if file_exists(gzipped_file):
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()), message)
        return gzipped_file
    return in_file

Beispiel #25

0

Datei anzeigen

Datei: star.py Projekt: curoverse/bcbio-nextgen

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" % " ".join(ALIGN_TAGS))
    cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_rsem(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data

Beispiel #26

0

Datei anzeigen

Datei: rapmap.py Projekt: cbrueffer/bcbio-nextgen

def rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, ref_file, data):
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_idx = rapmap_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    rapmap = config_utils.get_program("rapmap", data["config"])
    cmd = ("{rapmap} pseudomap -i {rapmap_idx} -t {num_cores} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    message = "pseudomapping transcripts in {fq1} and {fq2}."
    with file_transaction(data, out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file

Beispiel #27

0

Datei anzeigen

def rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, ref_file, data):
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_idx = rapmap_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    rapmap = config_utils.get_program("rapmap", data["config"])
    cmd = ("{rapmap} pseudomap -i {rapmap_idx} -t {num_cores} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    message = "pseudomapping transcripts in {fq1} and {fq2}."
    with file_transaction(data, out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file

Beispiel #28

0

Datei anzeigen

def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped
    """
    if (fastq.is_fastq(in_file) and not utils.is_gzipped(in_file)
            and not objectstore.is_remote(in_file)):
        gzipped_file = in_file + ".gz"
        if file_exists(gzipped_file):
            return gzipped_file
        message = "gzipping {in_file}.".format(in_file=in_file)
        do.run("gzip -c {in_file} > {gzipped_file}".format(**locals()),
               message)
        return gzipped_file
    return in_file

Beispiel #29

0

Datei anzeigen

def rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, ref_file, data):
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_index = rapmap_pseudoindex(gtf_file, ref_file, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} pseudomap -t {num_cores} -i {rapmap_index} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq2_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("Pseudomapping %s and %s to %s with Rapmap. "
                       % (fq1, fq2, rapmap_index))
        do.run(cmd.format(**locals()), run_message, None)
    return out_file

Beispiel #30

0

Datei anzeigen

Datei: umi.py Projekt: hliang/bcbio-nextgen

def barcode_histogram(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    out_file = os.path.join(sample_dir, "cb-histogram.txt")
    if file_exists(out_file):
        return [[data]]
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd = "{umis} cb_histogram {fq1_cmd} > {tx_out_file}"
    with file_transaction(out_file) as tx_out_file:
        message = "Computing cellular barcode counts for %s." % fq1
        do.run(cmd.format(**locals()), message)
    return [[data]]

Beispiel #31

0

Datei anzeigen

def barcode_histogram(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    umis = config_utils.get_program("umis", data, default="umis")
    safe_makedir(sample_dir)
    out_file = os.path.join(sample_dir, "cb-histogram.txt")
    if file_exists(out_file):
        return [[data]]
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd = "{umis} cb_histogram {fq1_cmd} > {tx_out_file}"
    with file_transaction(out_file) as tx_out_file:
        message = "Computing cellular barcode counts for %s." % fq1
        do.run(cmd.format(**locals()), message)
    return [[data]]

Beispiel #32

0

Datei anzeigen

Datei: fastq.py Projekt: Cyberbio-Lab/bcbio-nextgen

def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped, handling conversion
    from bzip to gzipped files
    """
    if fastq.is_fastq(in_file) and not objectstore.is_remote(in_file):
        if utils.is_bzipped(in_file):
            return _bzip_gzip(in_file)
        elif not utils.is_gzipped(in_file):
            gzipped_file = in_file + ".gz"
            if file_exists(gzipped_file):
                return gzipped_file
            message = "gzipping {in_file}.".format(in_file=in_file)
            with file_transaction(gzipped_file) as tx_gzipped_file:
                do.run("gzip -c {in_file} > {tx_gzipped_file}".format(**locals()),
                       message)
            return gzipped_file
    return in_file

Beispiel #33

0

Datei anzeigen

def _gzip_fastq(in_file):
    """
    gzip a fastq file if it is not already gzipped, handling conversion
    from bzip to gzipped files
    """
    if fastq.is_fastq(in_file) and not objectstore.is_remote(in_file):
        if utils.is_bzipped(in_file):
            return _bzip_gzip(in_file)
        elif not utils.is_gzipped(in_file):
            gzipped_file = in_file + ".gz"
            if file_exists(gzipped_file):
                return gzipped_file
            message = "gzipping {in_file}.".format(in_file=in_file)
            with file_transaction(gzipped_file) as tx_gzipped_file:
                do.run("gzip -c {in_file} > {tx_gzipped_file}".format(**locals()),
                       message)
            return gzipped_file
    return in_file

Beispiel #34

0

Datei anzeigen

def filter_barcodes(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = dd.get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    bc1 = None
    bc2 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, basestring):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]

Beispiel #35

0

Datei anzeigen

Datei: umi.py Projekt: sdwfrost/bcbio-nextgen

def filter_barcodes(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    bc1 = None
    bc2 = None
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    if isinstance(bc, basestring):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]

Beispiel #36

0

Datei anzeigen

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" %
           " ".join(ALIGN_TAGS))
    cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"),
                               False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "
    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s." % (fastq_file,
                                                              ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            do.run(cmd.format(**locals()), run_message, None)
    return final_out

Beispiel #37

0

Datei anzeigen

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd SAM {srna_opts} "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data,
                                   ("config", "algorithm", "fusion_mode"),
                                   False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)
        print("hello")

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data