Code example #1
File: run_peakseq.py  Project: firebitsbr/changlab
def make_peakseq_command(treat_filename, control_filename, outpath,
                         experiment_name, fragment_length, mappability_file):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    # pypeakseq.py --experiment_name EXPERIMENT_NAME
    #   --fragment_length FRAGMENT_LENGTH
    #   <mappability_file> <treatment_bam> <control_bam> <outpath>
    pypeakseq = filelib.which_assert(config.pypeakseq)

    assert os.path.exists(treat_filename)
    assert os.path.exists(control_filename)
    assert os.path.exists(mappability_file)
    assert fragment_length > 0 and fragment_length < 100000

    sq = parallel.quote
    cmd = [
        sq(pypeakseq),
        "--experiment_name",
        experiment_name,
        "--fragment_length",
        str(fragment_length),
        sq(mappability_file),
        sq(treat_filename),
        sq(control_filename),
        sq(outpath),
    ]
    return " ".join(cmd)
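A minimal usage sketch (hypothetical; the file names and experiment name are placeholders, not taken from the source). Example #6 below shows that these command strings are executed with parallel.sshell:

# Hypothetical usage; all arguments are placeholders.
from genomicode import parallel

x = make_peakseq_command(
    "treat.bam", "control.bam", "peakseq_out", "T_53BP1",
    fragment_length=146, mappability_file="hg19_mappability.txt")
parallel.sshell(x)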
Code example #2
File: run_spp.py  Project: firebitsbr/changlab
def make_pyspp_command(treat_filename,
                       control_filename,
                       outpath,
                       num_procs=None):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    assert num_procs is None or (num_procs >= 1 and num_procs < 256)

    # pyspp.py [-j NUM_PROCS] [--fdr_cutoff FDR_CUTOFF]
    #   <treatment_bam> <control_bam> <outpath>
    pyspp = filelib.which_assert(config.pyspp)

    assert os.path.exists(treat_filename)
    assert os.path.exists(control_filename)

    sq = parallel.quote
    cmd = [
        sq(pyspp),
    ]
    if num_procs:
        cmd += ["-j", str(num_procs)]
    cmd += [
        sq(treat_filename),
        sq(control_filename),
        sq(outpath),
    ]
    return " ".join(cmd)
Code example #3
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        bwa = filelib.which_assert(config.bwa)
        ref = alignlib.standardize_reference_genome(in_data.identifier,
                                                    out_path,
                                                    use_symlinks=True)

        # bwa index <out_stem.fa>
        # Makes files:
        # <out_stem>.fa.amb .ann .bwt .pac .sa

        sq = parallel.quote
        cmd = [
            sq(bwa),
            "index",
            sq(ref.fasta_file_full),
        ]
        parallel.sshell(cmd, path=out_path)

        # Make sure the indexing worked properly.
        EXTENSIONS = [".amb", ".ann", ".bwt", ".pac", ".sa"]
        for ext in EXTENSIONS:
            f = "%s%s" % (ref.fasta_file_full, ext)
            assert filelib.exists_nz(f), "Missing: %s" % f
Code example #4
def make_macs2_command(treat_filename,
                       control_filename=None,
                       genome_size=None,
                       name=None,
                       save_bedgraph_file=False,
                       broad_peak_calling=False,
                       normalize_read_counts=False,
                       paired=False):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    assert genome_size in ["hs", "mm", "ce", "dm"]

    # Regular peak calling:
    # macs2 callpeak -t sample.bam -c control.bam \
    #   -f [BAM,BAMPE] -g hs -n T_53BP1 -B -q 0.01
    #
    # Broad peak calling:
    # macs2 callpeak --broad -t sample.bam -c control.bam \
    #   -f [BAM,BAMPE] -g hs -n T_53BP1 --broad-cutoff 0.1
    #
    # -n  name.  For saving output files.
    # -w  Save extended fragment pileup at every WIGEXTEND bp in wiggle
    #     file.
    # -B  Save extended fragment pileup at every bp in a bedGraph file.
    #     Much smaller than wiggle file.
    # --broad-cutoff  q-value for merging broad regions.
    # --SPMR          Normalize coverage plot by millions of reads.
    macs2 = filelib.which_assert(config.macs2)

    sq = parallel.quote
    cmd = [
        sq(macs2),
        "callpeak",
    ]
    if broad_peak_calling:
        cmd += ["--broad"]
    if normalize_read_counts:
        cmd += ["--SPMR"]
    cmd += ["-t", sq(treat_filename)]
    if control_filename:
        cmd += [
            "-c",
            sq(control_filename),
        ]
    format_ = "BAM"
    if paired:
        format_ = "BAMPE"
    cmd += [
        "-f",
        format_,
        "-g",
        genome_size,
    ]
    if name:
        cmd.extend(["-n", sq(name)])
    if save_bedgraph_file:
        cmd.append("-B")
    return " ".join(cmd)
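A hedged usage sketch showing broad peak calling on paired-end data (file names are placeholders):

# Hypothetical usage; file names are placeholders.
from genomicode import parallel

x = make_macs2_command(
    "sample.bam", control_filename="control.bam", genome_size="hs",
    name="T_53BP1", broad_peak_calling=True, paired=True)
parallel.sshell(x)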
Code example #5
def make_bedtools_genomecov_command(bam_filename, reference_file,
                                    cov_filename):
    import os
    import config
    import filelib
    import parallel

    # Generates a histogram of the counts for each read depth.
    # bedtools genomecov [OPTIONS] -ibam <align.bam> -g <ref.fa>
    bedtools = filelib.which_assert(config.bedtools)
    assert os.path.exists(bam_filename)
    assert os.path.exists(reference_file)

    sq = parallel.quote
    x = [
        sq(bedtools),
        "genomecov",
        "-ibam",
        sq(bam_filename),
        "-g",
        sq(reference_file),
        ">&",
        sq(cov_filename),
    ]
    return " ".join(x)
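The ">&" redirects both stdout and stderr in csh syntax, matching how other commands on this page are written for parallel.sshell. A usage sketch with placeholder file names:

# Hypothetical usage; file names are placeholders.
import parallel

x = make_bedtools_genomecov_command("aligned.bam", "hg19.fa", "coverage.txt")
parallel.sshell(x)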
Code example #6
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import config

        signal_node = in_data
        signal_file = signal_node.identifier
        assert os.path.exists(signal_file)
        
        slice_matrix = filelib.which_assert(config.slice_matrix)

        sq = parallel.quote
        cmd = [
            sq(slice_matrix),
            "--cpm",
            signal_file,
            ]
        cmd = " ".join(cmd)
        cmd = "%s >& %s" % (cmd, outfile)

        parallel.sshell(cmd)
        filelib.assert_exists_nz(outfile)
Code example #7
def make_bedtools_coverage_command(bam_filename, features_bed, cov_filename):
    import os

    import config
    import filelib
    import parallel

    # Computes the coverage of the alignments in the BAM file over
    # each feature in the BED file.
    # bedtools coverage [OPTIONS] -abam <align.bam> -b <features.bed>
    bedtools = filelib.which_assert(config.bedtools)
    assert os.path.exists(bam_filename)
    assert os.path.exists(features_bed)

    sq = parallel.quote
    x = [
        sq(bedtools),
        "coverage",
        "-abam",
        sq(bam_filename),
        "-b",
        sq(features_bed),
        ">&",
        sq(cov_filename),
    ]
    return " ".join(x)
Code example #8
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        bowtie2_build = filelib.which_assert(config.bowtie2_build)
        ref = alignlib.standardize_reference_genome(in_data.identifier,
                                                    out_path,
                                                    use_symlinks=True)

        # bowtie2-build <ref.fa> <output_stem>
        # Makes files:
        # <output_stem>.[1234].bt2
        # <output_stem>.rev.[12].bt2

        sq = parallel.quote
        cmd = [
            sq(bowtie2_build),
            sq(ref.fasta_file_full),
            ref.name,
        ]
        parallel.sshell(cmd, path=out_path)

        # Check to make sure index was created successfully.
        f = os.path.join(out_path, "%s.1.bt2" % ref.name)
        assert filelib.exists_nz(f)
Code example #9
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import itertools
        from genomicode import config
        from genomicode import parallel
        from genomicode import filelib

        signal_node, annotation_node = antecedents
        signal_filename = signal_node.identifier
        annotation_filename = annotation_node.identifier
        filelib.assert_exists_nz(signal_filename)
        filelib.assert_exists_nz(annotation_filename)
        metadata = {}

        align_matrices = filelib.which_assert(config.align_matrices)

        # Make sure the signal_filename has an ID_REF header.
        header = filelib.read_cols(signal_filename).next()
        assert header[0] == "ID_REF", "Missing ID_REF header: %s" % \
               signal_filename

        signal_align_file = "signal.aligned.txt"
        annot_align_file = "annot.aligned.txt"

        # First, align the two files.
        sq = parallel.quote
        cmd = [
            sq(align_matrices),
            "--annot_file",
            signal_filename,
            "--header",
            "ID_REF",
            "--annot_file",
            annotation_filename,
            "--left_join",
            signal_align_file,
            annot_align_file,
        ]
        cmd = " ".join(cmd)
        parallel.sshell(cmd)
        metadata["command"] = cmd

        # Now merge them.  Take the first column of the expression
        # file (should be ID_REF), the whole annotation file, then the
        # remainder of the expression file.
        signal_handle = filelib.read_cols(signal_align_file)
        annot_handle = filelib.read_cols(annot_align_file)
        outhandle = open(outfile, 'w')
        for x1, x2 in itertools.izip(signal_handle, annot_handle):
            x = [x1[0]] + x2 + x1[1:]
            print >> outhandle, "\t".join(x)
        outhandle.close()

        #cmd = "paste %s %s > %s" % (
        #    annot_align_file, signal_align_file, outfile)
        #shell.single(cmd)

        filelib.assert_exists_nz(outfile)
Code example #10
File: pypeakseq.py  Project: firebitsbr/changlab
def make_peakseq_preproc_command(bam_file, out_path):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    # samtools view bam11.bam | PeakSeq -preprocess SAM stdin bam12
    samtools = filelib.which_assert(config.samtools)
    peakseq = filelib.which_assert(config.peakseq)
    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        sq(bam_file),
        "|",
        sq(peakseq),
        "-preprocess",
        "SAM",
        "stdin",
        sq(out_path),
    ]
    return " ".join(cmd)
Code example #11
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        bam_path = in_data.identifier
        assert os.path.exists(bam_path)
        assert os.path.isdir(bam_path)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        # Find all the BAM files.
        bam_filenames = filelib.list_files_in_path(
            bam_path, endswith=".bam", case_insensitive=True)

        jobs = []  # list of in_filename, out_filename
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            assert not os.path.exists(out_filename)
            x = in_filename, out_filename
            jobs.append(x)

        # Symlink the BAM files to the output path.
        for x in jobs:
            in_filename, out_filename = x
            os.symlink(in_filename, out_filename)

        # Index each of the files.
        sq = parallel.quote
        samtools = filelib.which_assert(config.samtools)
        commands = []
        for x in jobs:
            in_filename, out_filename = x
            cmd = [
                sq(samtools),
                "index",
                sq(out_filename),
                ]
            x = " ".join(cmd)
            commands.append(x)
        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores, path=out_path)

        # TODO: Check for output files.
        
        return metadata
Code example #12
def get_bedtools_version():
    import re
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    bedtools = filelib.which_assert(config.bedtools)
    x = parallel.sshell("%s --version" % bedtools, ignore_nonzero_exit=True)
    x = x.strip()
    # Example output:
    # bedtools v2.23.0
    m = re.search(r"v([\w\. ]+)", x)
    assert m, "Missing version string"
    return m.group(1)
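A short usage sketch (the version string depends on the installed bedtools):

# Hypothetical usage.
version = get_bedtools_version()
print "bedtools %s" % version  # e.g. "2.23.0" parsed from "bedtools v2.23.0"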
Code example #13
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import alignlib
        from genomicode import config
        from genomicode import parallel

        log_filenames = _find_output_logs(in_data.identifier)
        assert log_filenames

        results = {}  # dict of sample -> dictionary of output
        for filename in log_filenames:
            # <path>/<sample>.log
            path, file_ = os.path.split(filename)
            f, e = os.path.splitext(file_)
            assert e == ".log"
            sample = f
            results[sample] = alignlib.parse_bowtie1_output(filename)

        # Make table where the rows are the samples and the columns
        # are the statistics.
        all_samples = sorted(results)
        table = []
        header = "Sample", "Aligned Reads", "Total Reads", "Perc Aligned"
        table.append(header)
        for sample in all_samples:
            stats = results[sample]
            total_reads = stats["reads_processed"]
            aligned_reads = stats["aligned_reads"]
            perc_aligned = float(aligned_reads) / total_reads * 100

            x1 = parselib.pretty_int(aligned_reads)
            x2 = parselib.pretty_int(total_reads)
            x3 = "%.2f%%" % perc_aligned
            x = sample, x1, x2, x3
            table.append(x)

        # Write out the table as text file.
        TXT_FILE = "summary.txt"
        handle = open(TXT_FILE, 'w')
        for x in table:
            print >> handle, "\t".join(x)
        handle.close()

        txt2xls = filelib.which_assert(config.txt2xls)
        os.system("%s -b %s > %s" %
                  (parallel.quote(txt2xls), TXT_FILE, outfile))
Code example #14
def get_config(name,
               which_assert_file=False,
               assert_exists=False,
               quote=False):
    from genomicode import filelib
    from genomicode import config
    from genomicode import parallel

    assert hasattr(config, name), "Not configured for genomicode: %s" % name
    x = getattr(config, name)
    if which_assert_file:
        x = filelib.which_assert(x)
    elif assert_exists:
        filelib.assert_exists(x)
    if quote:
        x = parallel.quote(x)
    return x
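A usage sketch, assuming a "samtools" entry in the genomicode config (other examples on this page reference config.samtools):

# Hypothetical usage; assumes config.samtools is defined.
samtools = get_config("samtools", which_assert_file=True, quote=True)
cmd = "%s view -H in.bam" % samtools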
Code example #15
File: pypeakseq.py  Project: firebitsbr/changlab
def make_peakseq_run_command(config_file):
    import os
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    assert os.path.exists(config_file)
    config_file = os.path.realpath(config_file)

    # PeakSeq -peak_select <config_file>
    peakseq = filelib.which_assert(config.peakseq)
    sq = parallel.quote
    cmd = [
        sq(peakseq),
        "-peak_select",
        config_file,
    ]
    return " ".join(cmd)
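The two builders in pypeakseq.py (examples #10 and #15) appear to run in sequence; a sketch under that assumption, with placeholder file names and a PeakSeq configuration file assumed to be written elsewhere in the module:

# Hypothetical pipeline; file names are placeholders.
from genomicode import parallel

x = make_peakseq_preproc_command("sample.bam", "preproc_out")
parallel.sshell(x)
# A config file pointing at the preprocessed data is assumed to be
# written elsewhere before the peak-selection step.
x = make_peakseq_run_command("peakseq.config")
parallel.sshell(x)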
Code example #16
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        import os
        import shutil
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import config

        in_filename = in_data.identifier
        filelib.assert_exists_nz(in_filename)

        vcftools = filelib.which_assert(config.vcftools)

        # vcftools --vcf test31.txt --remove-indels --recode --recode-INFO-all
        #   --out test32
        # Writes stuff to console.  Should capture in log file.
        # Saves file test32.recode.vcf

        p, f = os.path.split(in_filename)
        s, ext = os.path.splitext(f)
        sample = s

        out_stem = "%s.filtered" % sample
        log_filename = "%s.log" % sample
        # Should create file <out_stem>.recode.vcf
        outfile = "%s.recode.vcf" % out_stem

        sq = parallel.quote
        cmd = [
            sq(vcftools),
            "--vcf",
            sq(in_filename),
            "--remove-indels",
            "--recode",
            "--recode-INFO-all",
            "--out",
            out_stem,
        ]
        cmd = " ".join(cmd)
        cmd = "%s >& %s" % (cmd, log_filename)
        parallel.sshell(cmd)

        filelib.assert_exists_nz(outfile)
        shutil.copy2(outfile, out_filename)
Code example #17
def _make_samtools_filter_cmd(in_bamfile, out_bamfile):
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    filelib.assert_exists_nz(in_bamfile)
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote

    cmd = [
        sq(samtools),
        "view",
        "-bF 4",  # -b: output BAM; -F 4: skip unmapped reads (flag 0x4)
        sq(in_bamfile),
        ">",
        sq(out_bamfile),
    ]
    cmd = " ".join(cmd)
    return cmd
Code example #18
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        samtools = filelib.which_assert(config.samtools)
        ref = alignlib.standardize_reference_genome(in_data.identifier,
                                                    out_path,
                                                    use_symlinks=True)

        ## fa_filenames = module_utils.find_fasta_files(out_path)
        ## # Filter out the FASTA files created by RSEM indexing.
        ## # <assembly>.idx.fa
        ## # <assembly>.n2g.idx.fa
        ## # <assembly>.transcripts.fa
        ## # Could these end with ".fasta"?
        ## x = fa_filenames
        ## x = [x for x in x if not x.endswith(".idx.fa")]
        ## x = [x for x in x if not x.endswith(".n2g.idx.fa")]
        ## x = [x for x in x if not x.endswith(".transcripts.fa")]
        ## fa_filenames = x
        ## assert fa_filenames, "Could not find reference genome."
        ## assert len(fa_filenames) == 1, "Found multiple reference genomes."
        ## reference_filename = fa_filenames[0]

        # samtools faidx <ref>.fa
        # Makes files:
        # <ref>.fa.fai

        sq = parallel.quote
        cmd = [
            sq(samtools),
            "faidx",
            sq(ref.fasta_file_full),
        ]
        parallel.sshell(cmd, path=out_path)

        # Check to make sure index was created successfully.
        f = "%s.fai" % ref.fasta_file_full
        assert filelib.exists_nz(f)
Code example #19
File: download_tcga.py  Project: firebitsbr/changlab
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        import subprocess
        from genomicode import config
        from genomicode import filelib
        #out_attributes = set_out_attributes(in_data, out_attributes)

        TCGA_BIN = filelib.which_assert(config.download_tcga)

        assert 'disease' in user_options
        command = [
            'python',
            TCGA_BIN,
            '--disease',
            user_options['disease'],
            '--data',
            out_attributes['preprocess'],
            '--download_only',
        ]
        if 'date' in user_options:
            command += ['--date', user_options['date']]
        # TODO: Need to return results from command.
        #shell.single(command)

        process = subprocess.Popen(command,
                                   shell=False,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        error_message = process.communicate()[1]
        if error_message:
            raise ValueError(error_message)

        result_files = os.listdir(".")
        result_format = 'tar.gz'
        for result_file in result_files:
            if result_file.endswith(result_format):
                os.rename(result_file, outfile)

        assert filelib.exists_nz(outfile), (
            'Missing output file %s for download_tcga' % outfile)
Code example #20
def _make_intervallist_file(intervallist_file, features_bed, bam_filename):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    outhandle = open(intervallist_file, 'w')

    # Add the @HD and @SQ headers from the bam file.
    # samtools view -H <filename>
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        "-H",
        sq(bam_filename),
    ]
    cmd = " ".join(cmd)
    x = parallel.sshell(cmd)
    lines = x.split("\n")
    lines = [x.rstrip() for x in lines]

    for line in lines:
        if line.startswith("@HD") or line.startswith("@SQ"):
            print >> outhandle, line

    # Add the information from the BAM files.
    # BED       chrom chromStart (0-based) chromEnd name score strand
    # Interval  chrom chromStart (1-based) chromEnd strand name
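    # e.g. BED "chr1 99 200 geneA 0 +" -> interval "chr1 100 200 + geneA"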
    for cols in filelib.read_cols(features_bed):
        assert len(cols) >= 6
        chrom, chromStart0, chromEnd, name, score, strand = cols[:6]
        chromStart0, chromEnd = int(chromStart0), int(chromEnd)
        chromStart1 = chromStart0 + 1
        x = chrom, chromStart1, chromEnd, strand, name
        print >> outhandle, "\t".join(map(str, x))
    outhandle.close()
Code example #21
def make_macs14_command(treat_filename,
                        control_filename=None,
                        genome_size=None,
                        name=None,
                        shiftsize=None,
                        save_wiggle_file=False,
                        save_single_wiggle_file=False,
                        save_bedgraph_file=False,
                        call_subpeaks=False):
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    assert genome_size in ["hs", "mm", "ce", "dm"]
    if call_subpeaks:
        save_wiggle_file = True
        save_bedgraph_file = False
    if shiftsize:
        assert shiftsize > 0 and shiftsize < 10000

    #macs14 -t Sample_4_T_53BP1.sorted.bam -c Sample_8_T_input.sorted.bam \
    #   -g hs -n T_53BP1 -B -S --call-subpeaks >& T_53BP1.log
    # -n  name.  For saving output files.
    # -w  Save extended fragment pileup at every WIGEXTEND bp in wiggle
    #     file.
    # -B  Save extended fragment pileup at every bp in a bedGraph file.
    #     Much smaller than wiggle file.
    # -S  A single wiggle file will be saved for treatment and input.
    #     i.e. for whole genome, rather than for each chromosome.
    # --call_subpeaks  Use PeakSplitter algorithm to find subpeaks.
    #                  -w needs to be on, and -B should be off.
    #
    # If estimated fragment size is too short (e.g. 53), then specify
    # your own fragment size.  shiftsize is 1/2 of fragment size.
    # --nomodel --shiftsize 73 (for fragment size of 146)
    # Often fragment size is 150-200 for ChIP-Seq.
    macs14 = filelib.which_assert(config.macs14)

    sq = parallel.quote
    cmd = [
        sq(macs14),
        "-t",
        sq(treat_filename),
    ]
    if control_filename:
        cmd += [
            "-c",
            sq(control_filename),
        ]
    cmd += [
        "-f",
        "BAM",
        "-g",
        genome_size,
    ]
    if name:
        cmd.extend(["-n", sq(name)])
    if shiftsize:
        cmd.extend([
            "--nomodel",
            "--shiftsize",
            str(shiftsize),
        ])
    if save_wiggle_file:
        cmd.append("-w")
    if save_single_wiggle_file:
        cmd.append("-S")
    if save_bedgraph_file:
        cmd.append("-B")
    if call_subpeaks:
        cmd.append("--call_subpeaks")
    return " ".join(cmd)
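A usage sketch based on the comments above (file names are placeholders; shiftsize=73 corresponds to an assumed fragment size of 146):

# Hypothetical usage; file names are placeholders.
from genomicode import parallel

x = make_macs14_command(
    "Sample_4_T_53BP1.sorted.bam", "Sample_8_T_input.sorted.bam",
    genome_size="hs", name="T_53BP1", shiftsize=73,
    save_bedgraph_file=True)
parallel.sshell(x)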
Code example #22
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from genomicode import config
        from Betsy import module_utils

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample")
        genome_size = module_utils.get_user_option(user_options,
                                                   "macs_genome",
                                                   not_empty=True)
        shiftsize = module_utils.get_user_option(user_options,
                                                 "macs_shiftsize")
        if shiftsize:
            shiftsize = int(shiftsize)

        # Set the name.
        name = hashlib.hash_var(treat_sample)
        if control_sample:
            x = hashlib.hash_var(control_sample)
            name = "%s_vs_%s" % (name, x)  # use the hashed treatment name

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        control_filename = None
        if control_sample:
            control_filename = find_bam_file(bam_path, control_sample,
                                             sample_groups)
            assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_macs14_command(treat_filename,
                                  control_filename,
                                  name=name,
                                  genome_size=genome_size,
                                  shiftsize=shiftsize,
                                  save_bedgraph_file=True)
        parallel.sshell(cmd, path=out_path)

        # Run Rscript on the model, if one was generated.
        model_file = os.path.join(out_path, "%s_model.r" % name)
        if os.path.exists(model_file):
            Rscript = filelib.which_assert(config.Rscript)
            cmd = [parallel.quote(Rscript), model_file]
            parallel.sshell(cmd, path=out_path)

        files = [
            "%s_peaks.xls" % name,
            "%s_summits.bed" % name,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
Code example #23
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        # For debugging.
        RUN_VARIANT_CALLING = True
        FILTER_CALLS = True
        MERGE_CALLS = True
        FIX_VCF_FILES = True

        dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
        dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
        assert dna_bam_filenames, "No DNA .bam files."
        rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
        assert rna_bam_filenames, "No RNA .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

        ## Make sure the BAM files do not contain spaces in the
        ## filenames.  Radia doesn't work well with spaces.
        #filenames = dna_bam_filenames + rna_bam_filenames
        #has_spaces = []
        #for filename in filenames:
        #    if filename.find(" ") >= 0:
        #        has_spaces.append(filename)
        #x = has_spaces
        #if len(x) > 5:
        #    x = x[:5] + ["..."]
        #x = ", ".join(x)
        #msg = "Radia breaks if there are spaces in filenames: %s" % x
        #assert not has_spaces, msg

        # sample -> bam filename
        dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
        rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
        # Make sure files exist for all the samples.  The DNA-Seq
        # should have both normal and cancer.  RNA is not needed for
        # normal sample.
        mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
        mlib.assert_normal_cancer_samples(nc_match,
                                          rnasample2bamfile,
                                          ignore_normal_sample=True)

        # Make sure Radia and snpEff are configured.
        radia_genome_assembly = mlib.get_user_option(user_options,
                                                     "radia_genome_assembly",
                                                     not_empty=True)
        assert radia_genome_assembly == "hg19", "Only hg19 handled."
        snp_eff_genome = mlib.get_user_option(user_options,
                                              "snp_eff_genome",
                                              not_empty=True)

        radia_path = mlib.get_config("radia_path", assert_exists=True)
        snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
        radia_files = get_radia_files(radia_path, radia_genome_assembly)

        # Make a list of the chromosomes to use.  Pick an arbitrarily
        # BAM file.  Look at only the chromosomes that are present in
        # all files.
        all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
        chroms = list_common_chromosomes(all_bamfiles)
        assert chroms, "No chromosomes found in all files."
        # Only use the chromosomes that can be filtered by Radia.
        chroms = filter_radia_chromosomes(chroms, radia_files)

        # Make output directories.
        radia_outpath = "radia1.tmp"
        filter_outpath = "radia2.tmp"
        merge_outpath = "radia3.tmp"

        if not os.path.exists(radia_outpath):
            os.mkdir(radia_outpath)
        if not os.path.exists(filter_outpath):
            os.mkdir(filter_outpath)
        if not os.path.exists(merge_outpath):
            os.mkdir(merge_outpath)

        # Steps:
        # 1.  Call variants (radia.py)
        #     -o <file.vcf>
        # 2.  Filter variants (filterRadia.py)
        #     <outpath>
        #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
        # 3.  Merge (mergeChroms.py)
        #     Takes as input: <filter_outpath>
        #     Produces: <merge_outpath>/<patient_id>.vcf

        # list of (normal_sample, cancer_sample, chrom,
        #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
        #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
        #   final_vcf_outfile,
        #   radia_logfile, filter_logfile, merge_logfile)
        opj = os.path.join
        jobs = []
        for i, (normal_sample, cancer_sample) in enumerate(nc_match):
            normal_bamfile = dnasample2bamfile[normal_sample]
            dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
            rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

            merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
            merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
            final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

            for chrom in chroms:
                radia_vcf_outfile = opj(
                    radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                filter_vcf_outfile = opj(
                    filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                radia_logfile = opj(radia_outpath,
                                    "%s_chr%s.log" % (cancer_sample, chrom))
                filter_logfile = opj(filter_outpath,
                                     "%s_chr%s.log" % (cancer_sample, chrom))
                x = normal_sample, cancer_sample, chrom, \
                    normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                    radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                    final_vcf_outfile, \
                    radia_logfile, filter_logfile, merge_logfile
                jobs.append(x)

        # Since Radia doesn't work well if there are spaces in the
        # filenames, symlink these files here to guarantee that there
        # are no spaces.
        normal_path = "normal.bam"
        dna_path = "dna.bam"
        rna_path = "rna.bam"
        if not os.path.exists(normal_path):
            os.mkdir(normal_path)
        if not os.path.exists(dna_path):
            os.mkdir(dna_path)
        if not os.path.exists(rna_path):
            os.mkdir(rna_path)
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
            x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
            x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
            clean_normal, clean_dna, clean_rna = x1, x2, x3
            x = normal_sample, cancer_sample, chrom, \
                clean_normal, clean_dna, clean_rna, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs[i] = x

        # Generate the commands for doing variant calling.
        python = mlib.get_config("python", which_assert_file=True)

        # filterRadia.py calls the "blat" command, and there's no way
        # to set the path.  Make sure "blat" is executable.
        if not filelib.which("blat"):
            # Find "blat" in the configuration and add it to the path.
            x = mlib.get_config("blat", which_assert_file=True)
            path, x = os.path.split(x)
            if os.environ["PATH"]:
                path = "%s:%s" % (os.environ["PATH"], path)
            os.environ["PATH"] = path
            # Make sure it's findable now.
            filelib.which_assert("blat")

        # STEP 1.  Call variants with radia.py.
        # python radia.py test31 5 \
        # -n bam04/PIM001_G.bam \
        # -t bam04/196B-MG.bam \
        # -r bam34/196B-MG.bam \
        # -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        # -o test32.vcf
        # --dnaTumorMitochon MT \
        # --rnaTumorMitochon MT \
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.radia_py),
                cancer_sample,
                chrom,
                "-n",
                sq(normal_bamfile),
                "-t",
                sq(dna_tumor_bamfile),
                "-r",
                sq(rna_tumor_bamfile),
                "-f",
                sq(ref.fasta_file_full),
                "-o",
                radia_vcf_outfile,
            ]
            if "MT" in chroms:
                x += [
                    "--dnaNormalMitochon MT",
                    "--dnaTumorMitochon MT",
                    "--rnaTumorMitochon MT",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, radia_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Only uses ~200 Mb of ram.
        if RUN_VARIANT_CALLING:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure log files are empty.
        logfiles = [x[10] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # STEP 2.  Filter variants with filterRadia.py.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.filterRadia_py),
                cancer_sample,
                chrom,
                sq(radia_vcf_outfile),
                sq(filter_outpath),
                sq(radia_files.scripts_dir),
                "-b",
                sq(radia_files.blacklist_dir),
                "-d",
                sq(radia_files.snp_dir),
                "-r",
                sq(radia_files.retro_dir),
                "-p",
                sq(radia_files.pseudo_dir),
                "-c",
                sq(radia_files.cosmic_dir),
                "-t",
                sq(radia_files.target_dir),
                "-s",
                sq(snp_eff_path),
                "-e",
                snp_eff_genome,
                "--rnaGeneBlckFile",
                sq(radia_files.rnageneblck_file),
                "--rnaGeneFamilyBlckFile",
                sq(radia_files.rnagenefamilyblck_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, filter_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        # Sometimes samtools crashes in the middle of a run.  Detect
        # this case, and re-run the analysis if needed.
        assert len(commands) == len(jobs)
        py_commands = []
        for x, cmd in zip(jobs, commands):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = cmd, cancer_sample, chrom, filter_logfile
            x = _run_filterRadia_with_restart, args, {}
            py_commands.append(x)
        # Takes ~10 Gb each.
        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        if FILTER_CALLS:
            parallel.pyfun(py_commands, num_procs=nc)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[11] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # Make sure filter_vcf_outfile exists.
        outfiles = [x[7] for x in jobs]
        filelib.assert_exists_nz_many(outfiles)

        # STEP 3.  Merge the results.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
            #   radia2.tmp/ radia3.tmp
            # The "/" after radia2.tmp is important.  If not given,
            # will generate some files with only newlines.

            fo = filter_outpath
            if not fo.endswith("/"):
                fo = "%s/" % fo
            x = [
                sq(python),
                sq(radia_files.mergeChroms_py),
                cancer_sample,
                fo,
                merge_outpath,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, merge_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Since the chromosomes were separated for the previous steps,
        # this will generate one merge for each chromosome.  This is
        # unnecessary, since we only need to merge once per sample.
        # Get rid of duplicates.
        commands = sorted({}.fromkeys(commands))
        if MERGE_CALLS:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[12] for x in jobs]
        logfiles = sorted({}.fromkeys(logfiles))
        filelib.assert_exists_z_many(logfiles)

        # Fix the VCF files.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = normal_sample, cancer_sample, \
                   merge_vcf_outfile, final_vcf_outfile
            x = alignlib.clean_radia_vcf, args, {}
            commands.append(x)
        if FIX_VCF_FILES:
            parallel.pyfun(commands, num_procs=num_cores)

        # Make sure output VCF files exist.
        x = [x[9] for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
Code example #24
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        ## Importing pysam is hard!
        #import sys
        #sys_path_old = sys.path[:]
        #sys.path = [x for x in sys.path if x.find("RSeQC") < 0]
        #import pysam
        #sys.path = sys_path_old

        bam_node, ref_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # list of (in_filename, err_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            s, ext = os.path.splitext(f)
            log_filename = os.path.join(out_path, "%s.log" % s)
            out_filename = os.path.join(out_path, f)
            assert in_filename != out_filename
            x = in_filename, log_filename, out_filename
            jobs.append(x)

        # Don't do this.  Need MD, NM, NH in
        # summarize_alignment_cigar.  To be sure, just redo it.
        ## If the files already have MD tags, then just symlink the
        ## files.  Don't add again.
        #i = 0
        #while i < len(jobs):
        #    in_filename, out_filename = jobs[i]
        #
        #    handle = pysam.AlignmentFile(in_filename, "rb")
        #    align = handle.next()
        #    tag_dict = dict(align.tags)
        #    if "MD" not in tag_dict:
        #        i += 1
        #        continue
        #    # Has MD tags.  Just symlink and continue.
        #    os.symlink(in_filename, out_filename)
        #    del jobs[i]

        # Make a list of samtools commands.
        # Takes ~200 Mb per process, so should not be a big issue.
        samtools = filelib.which_assert(config.samtools)
        sq = parallel.quote
        commands = []
        for x in jobs:
            in_filename, log_filename, out_filename = x

            # samtools calmd -b <in.bam> <ref.fasta> > <out.bam>

            # May generate error:
            # [bam_fillmd1] different NM for read
            #   'ST-J00106:118:H75L3BBXX:3:2128:21846:47014': 0 -> 19
            # Pipe stderr to different file.
            x = [
                samtools,
                "calmd",
                "-b",
                sq(in_filename),
                sq(ref.fasta_file_full),
            ]
            x = " ".join(x)
            x = "%s 2> %s 1> %s" % (x, sq(log_filename), sq(out_filename))
            commands.append(x)
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)
Code example #25
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import alignlib
        from genomicode import config
        from genomicode import parallel

        align_node = in_data
        x = filelib.list_files_in_path(align_node.identifier,
                                       endswith="align_summary.txt")
        align_filenames = x
        assert align_filenames, "Missing align_summary.txt"

        results = {}  # dict of sample -> dictionary of output
        for filename in align_filenames:
            # Names must be in the format:
            # <path>/<sample>.tophat/align_summary.txt
            # full_path   <path>/<sample>.tophat
            # path        <path>
            # tophat_dir  <sample>.tophat
            # file_       align_summary.txt
            # sample      <sample>

            full_path, file_ = os.path.split(filename)
            path, tophat_dir = os.path.split(full_path)
            assert file_ == "align_summary.txt"
            assert tophat_dir.endswith(".tophat")
            sample = tophat_dir[:-7]

            x = alignlib.parse_tophat_align_summary(filename)
            results[sample] = x

        # Make table where the rows are the samples and the columns
        # are the statistics.
        all_samples = sorted(results)
        table = []
        header = "Sample", "Aligned Reads", "Total Reads", "Perc Aligned"
        table.append(header)
        for sample in all_samples:
            stats = results[sample]
            total_reads = stats["reads_processed"]
            aligned_reads = stats["aligned_reads"]
            perc_aligned = float(aligned_reads) / total_reads * 100

            x1 = parselib.pretty_int(aligned_reads)
            x2 = parselib.pretty_int(total_reads)
            x3 = "%.2f%%" % perc_aligned
            x = sample, x1, x2, x3
            table.append(x)

        # Write out the table as text file.
        TXT_FILE = "summary.txt"
        handle = open(TXT_FILE, 'w')
        for x in table:
            print >> handle, "\t".join(x)
        handle.close()

        txt2xls = filelib.which_assert(config.txt2xls)
        os.system("%s -b %s > %s" %
                  (parallel.quote(txt2xls), TXT_FILE, outfile))
Code example #26
def which(bin_name):
    from genomicode import filelib
    return filelib.which_assert(bin_name)
Code example #27
File: run_RNA_SeQC.py  Project: firebitsbr/changlab
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # java -jar /usr/local/bin/RNA-SeQC_v1.1.8.jar \
        #   -o <sample> -r <reference_file> -s "<sample>|<in_filename>|NA"
        #   -t <gtf_file> >& <log_filename>"
        # <out_path>        Output directory.  Will be created if not exists.
        # <in_filename>     BAM file
        # <reference_file>  /data/biocore/genomes/UCSC/mm10.fa
        # <gtf_file>   /data/biocore/rsem/mouse_refseq_mm10/UCSC_knownGenes.gtf
        #
        # <reference_file> must be indexed and have a dict file.

        rna_seqc_jar = filelib.which_assert(config.rna_seqc_jar)

        GTF = module_utils.get_user_option(
            user_options, "rna_seqc_gtf_file", not_empty=True)
        assert os.path.exists(GTF), "File not found: %s" % GTF

        # list of infile, out_path, ref_file, gtf_file, sample, log_file
        jobs = []
        for in_filename in bam_filenames:
            p, file_ = os.path.split(in_filename)
            f, e = os.path.splitext(file_)
            sample = hashlib.hash_var(f)
            out_path_rna_seqc = os.path.join(out_path, sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)

            x = in_filename, out_path_rna_seqc, ref.fasta_file_full, GTF, \
                sample, log_filename
            jobs.append(x)

        sq = parallel.quote
        commands = []
        for x in jobs:
            (in_filename, out_path_rna_seqc, ref_filename, gtf_filename, \
             sample, log_filename) = x

            x = [sample, in_filename, "NA"]
            x = "|".join(x)
            x = [
                'java',
                '-jar', rna_seqc_jar,
                '-o', sq(out_path_rna_seqc),
                '-r', sq(ref_filename),
                '-s', "'%s'" % x,
                '-t', gtf_filename,
                ]
            x = " ".join(x)
            cmd = "%s >& %s" % (x, log_filename)
            commands.append(cmd)

        # Gets lots of errors.

        x = parallel.pshell(commands, max_procs=num_cores)
        run_log = os.path.join(out_path, "run.log")
        open(run_log, 'w').write(x)

        # Check for outfile.
        # Make sure the analysis completed successfully.
        for x in jobs:
            (in_filename, out_path_rna_seqc, ref_filename, gtf_filename, \
             sample, log_filename) = x
            filelib.assert_exists_nz(out_path_rna_seqc)
Code example #28
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        #from genomicode import hashlib
        from Betsy import module_utils

        in_filenames = module_utils.find_bam_files(in_data.identifier)
        assert in_filenames, "No .bam files."
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        jobs = []
        #seen = {}
        for i, in_filename in enumerate(in_filenames):
            p, f = os.path.split(in_filename)
            temp_prefix = "temp_%s" % f
            #temp_prefix = "temp_%s" % hashlib.hash_var(f)
            # Make sure no duplicates.
            #assert temp_prefix not in seen
            #seen[temp_prefix] = 1
            #temp_outfilename = "%d.bam" % i
            out_filename = os.path.join(out_path, f)
            x = filelib.GenericObject(
                in_filename=in_filename,
                temp_prefix=temp_prefix,
                #temp_outfilename=temp_outfilename,
                out_filename=out_filename)
            jobs.append(x)

        samtools = filelib.which_assert(config.samtools)

        # Calculate the number of threads per process.
        nc = module_utils.calc_max_procs_from_ram(4, upper_max=num_cores)
        num_threads = max(nc / len(jobs), 1)

        # Make a list of samtools commands.
        # Without -m, takes ~1 Gb per process.
        sq = parallel.quote
        commands = []
        for j in jobs:
            # Usage has changed.  Below no longer valid.
            # samtools sort <in_filename> <out_filestem>
            # .bam automatically added to <out_filestem>, so don't
            # need it.
            #x = out_filename
            #assert x.endswith(".bam")
            #x = x[:-4]
            #out_filestem = x

            x = [
                sq(samtools),
                "sort",
                "-O",
                "bam",
                "-T",
                sq(j.temp_prefix),
                "-m",
                "4G",  # Crashing, so try increasing memory.
                sq(j.in_filename),
                #"-o", sq(j.temp_outfilename),
                "-o",
                sq(j.out_filename),
            ]
            if num_threads > 1:
                x += ["-@", num_threads]
            x = " ".join(map(str, x))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = nc

        parallel.pshell(commands, max_procs=nc)
        #for cmd in commands:
        #    parallel.sshell(cmd)

        #for j in jobs:
        #    # Move the temporary files to the final location.
        #    shutil.move(j.temp_outfilename, j.out_filename)

        # Make sure the analysis completed successfully.
        x = [j.out_filename for j in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
Code example #29
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import config
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import filelib
        from Betsy import module_utils

        bam_node, ref_node, pos_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Positions file has 0-based coordinates (like BAM files).
        # But samtools requires 1-based coordinates.  Convert to
        # 1-based coordinates.
        positions_filename = "positions.txt"
        outhandle = open(positions_filename, 'w')
        for x in filelib.read_cols(pos_node.identifier):
            assert len(x) == 2
            chrom, pos = x
            pos = int(pos) + 1  # convert from 0- to 1-based coords.
            x = chrom, pos
            print >> outhandle, "\t".join(map(str, x))
        outhandle.close()

        # list of (in_filename, err_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            sample, ext = os.path.splitext(f)
            err_filename = os.path.join(out_path, "%s.log" % sample)
            out_filename = os.path.join(out_path, "%s.pileup" % sample)
            x = filelib.GenericObject(in_filename=in_filename,
                                      err_filename=err_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        ## Get possible positions file.
        #positions_filename = module_utils.get_user_option(
        #    user_options, "positions_file", check_file=True)

        # Figure out whether the purpose is to get coverage.  Change
        # the parameters if it is.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["all", "snp", "indel", "consensus"]
        #if cov == "yes":
        #    assert positions_filename, "Missing: positions_file"

        # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \
        #   $i > $j"
        samtools = filelib.which_assert(config.samtools)

        # Get an error if the BAM files are not indexed.
        # [W::bam_hdr_read] EOF marker is absent. The input is probably
        #   truncated.

        #if vartype == "consensus":
        #    args = [
        #        "-R",        # Ignore read group tags.
        #        "-B",        # Disable BAQ (base quality) computation.
        #        "-q", 0,     # Skip bases with mapQ smaller than this.
        #        "-Q", 0,     # Skip bases with BAQ smaller than this.
        #        "-d10000000",  # Allow deep reads.
        #        ]
        #else:
        #    raise NotImplementedError
        args = [
            "-R",  # Ignore read group tags.
            "-B",  # Disable BAQ (base quality) computation.
            "-q",
            0,  # Skip bases with mapQ smaller than this.
            "-Q",
            0,  # Skip bases with BAQ smaller than this.
            "-d10000000",  # Allow deep reads.
        ]

        sq = parallel.quote
        commands = []
        for j in jobs:
            x = [
                sq(samtools),
                "mpileup",
                "-f",
                sq(ref.fasta_file_full),
            ]
            if positions_filename:
                x.extend(["-l", positions_filename])
            x.extend(args)
            x.append(sq(j.in_filename))
            x = " ".join(map(str, x))
            x = "%s 2> %s 1> %s" % (x, j.err_filename, j.out_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] = commands

        # File may be empty if there are no reads.
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_many(x)

        # Make sure there's no errors in the log files.
        for j in jobs:
            check_log_file(j.err_filename)

        return metadata
Code example #30
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import config
        from Betsy import module_utils as mlib

        mpileup_node = in_data
        mpileup_filenames = filelib.list_files_in_path(mpileup_node.identifier,
                                                       endswith=".pileup")
        assert mpileup_filenames, "No .pileup files."
        #nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        #ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # Figure out whether the purpose is to get coverage.  Change
        # the parameters if it is.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]
        tool = "mpileup2snp"
        if vartype == "indel":
            tool = "mpileup2indel"

        # list of (sample, in_filename, tmp1_filename, tmp2_filename,
        #          out_filename)
        jobs = []
        for in_filename in mpileup_filenames:
            p, sample, ext = mlib.splitpath(in_filename)
            tmp1_filename = os.path.join(out_path, "%s.tmp1" % sample)
            tmp2_filename = os.path.join(out_path, "%s.tmp2" % sample)
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = sample, in_filename, tmp1_filename, tmp2_filename, out_filename
            jobs.append(x)

        # VarScan will generate a "Parsing Exception" if there are 0
        # reads in a location.  Filter those out.
        sq = parallel.quote
        commands = []
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            x = "awk -F'\t' '$4 != 0 {print}' %s > %s" % (in_filename,
                                                          tmp1_filename)
            commands.append(x)
        parallel.pshell(commands, max_procs=num_cores)
        x = [x[2] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # java -jar /usr/local/bin/VarScan.jar <tool> $i --output_vcf 1 > $j
        varscan = filelib.which_assert(config.varscan_jar)

        # Make a list of commands.
        commands = []
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            x = [
                "java",
                "-jar",
                sq(varscan),
                tool,
                tmp1_filename,
                "--p-value",
                0.05,
                "--output-vcf",
                1,
            ]
            x = " ".join(map(str, x))
            x = "%s >& %s" % (x, tmp2_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        #import sys; sys.exit(0)

        parallel.pshell(commands, max_procs=num_cores)
        x = [x[3] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Clean up the VCF files.  VarScan leaves extraneous lines
        # there.
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            alignlib.clean_varscan_vcf(sample, tmp2_filename, out_filename)
        x = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)

        # The tmp files are really big.  Don't save those.
        for x in jobs:
            sample, in_filename, tmp1_filename, tmp2_filename, out_filename = x
            filelib.safe_unlink(tmp1_filename)
            filelib.safe_unlink(tmp2_filename)