Esempio n. 1
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Call somatic variants with Radia on matched DNA/RNA BAM files.

        One job is created per (cancer sample, chromosome) and pushed
        through these steps:
          1.  radia.py        -> radia1.tmp/<cancer_sample>_chr<chrom>.vcf
          2.  filterRadia.py  -> radia2.tmp/<cancer_sample>_chr<chrom>.vcf
          3.  mergeChroms.py  -> radia3.tmp/<cancer_sample>.vcf
          4.  alignlib.clean_radia_vcf -> <out_path>/<cancer_sample>.vcf

        antecedents must unpack to (dna_bam_node, rna_bam_node, nc_node,
        ref_node).  The user options "radia_genome_assembly" (only
        "hg19" is accepted) and "snp_eff_genome" are required.  Scratch
        directories (radia1.tmp, radia2.tmp, radia3.tmp, normal.bam,
        dna.bam, rna.bam) are created relative to the current working
        directory.

        Returns a metadata dict with the keys "tool", "num_cores" and
        "commands" (the shell commands of steps 1-3).

        Raises AssertionError when no BAM files are found, when the
        genome assembly is not hg19, or when no chromosome is shared
        by all BAM files.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        # For debugging.  Set a flag to False to skip that step and
        # re-use the files left over from a previous run.
        RUN_VARIANT_CALLING = True
        FILTER_CALLS = True
        MERGE_CALLS = True
        FIX_VCF_FILES = True

        dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
        dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
        assert dna_bam_filenames, "No DNA .bam files."
        rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
        assert rna_bam_filenames, "No RNA .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

        # NOTE: Radia doesn't work well with spaces in the BAM
        # filenames.  Rather than rejecting such files, they are
        # symlinked to clean names below.

        # sample -> bam filename
        dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
        rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
        # Make sure files exist for all the samples.  The DNA-Seq
        # should have both normal and cancer.  RNA is not needed for
        # normal sample.
        mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
        mlib.assert_normal_cancer_samples(nc_match,
                                          rnasample2bamfile,
                                          ignore_normal_sample=True)

        # Make sure Radia and snpEff are configured.
        radia_genome_assembly = mlib.get_user_option(user_options,
                                                     "radia_genome_assembly",
                                                     not_empty=True)
        assert radia_genome_assembly == "hg19", "Only hg19 handled."
        snp_eff_genome = mlib.get_user_option(user_options,
                                              "snp_eff_genome",
                                              not_empty=True)

        radia_path = mlib.get_config("radia_path", assert_exists=True)
        snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
        radia_files = get_radia_files(radia_path, radia_genome_assembly)

        # Make a list of the chromosomes to use.  Look at only the
        # chromosomes that are present in all files.  list() is needed
        # because dict.values() cannot be concatenated with "+" on
        # Python 3.
        all_bamfiles = (list(dnasample2bamfile.values()) +
                        list(rnasample2bamfile.values()))
        chroms = list_common_chromosomes(all_bamfiles)
        assert chroms, "No chromosomes found in all files."
        # Only use the chromosomes that can be filtered by Radia.
        chroms = filter_radia_chromosomes(chroms, radia_files)

        # Make output directories.
        radia_outpath = "radia1.tmp"
        filter_outpath = "radia2.tmp"
        merge_outpath = "radia3.tmp"
        for path in (radia_outpath, filter_outpath, merge_outpath):
            if not os.path.exists(path):
                os.mkdir(path)

        # Steps:
        # 1.  Call variants (radia.py)
        #     -o <file.vcf>
        # 2.  Filter variants (filterRadia.py)
        #     <outpath>
        #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
        # 3.  Merge (mergeChroms.py)
        #     Takes as input: <filter_outpath>
        #     Produces: <merge_outpath>/<patient_id>.vcf

        # list of (normal_sample, cancer_sample, chrom,
        #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
        #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
        #   final_vcf_outfile,
        #   radia_logfile, filter_logfile, merge_logfile)
        # The positions matter: later code indexes into these tuples.
        opj = os.path.join
        jobs = []
        for normal_sample, cancer_sample in nc_match:
            normal_bamfile = dnasample2bamfile[normal_sample]
            dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
            rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

            # The merged and final files are per sample, so they are
            # shared by all the chromosome jobs of that sample.
            merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
            merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
            final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

            for chrom in chroms:
                radia_vcf_outfile = opj(
                    radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                filter_vcf_outfile = opj(
                    filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                radia_logfile = opj(radia_outpath,
                                    "%s_chr%s.log" % (cancer_sample, chrom))
                filter_logfile = opj(filter_outpath,
                                     "%s_chr%s.log" % (cancer_sample, chrom))
                x = normal_sample, cancer_sample, chrom, \
                    normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                    radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                    final_vcf_outfile, \
                    radia_logfile, filter_logfile, merge_logfile
                jobs.append(x)

        # Since Radia doesn't work well if there are spaces in the
        # filenames, symlink these files here to guarantee that there
        # are no spaces.  The path returned by
        # hash_and_symlink_bamfile replaces the original BAM path in
        # each job.
        normal_path = "normal.bam"
        dna_path = "dna.bam"
        rna_path = "rna.bam"
        for path in (normal_path, dna_path, rna_path):
            if not os.path.exists(path):
                os.mkdir(path)
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            clean_normal = hash_and_symlink_bamfile(
                normal_bamfile, normal_path)
            clean_dna = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
            clean_rna = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
            jobs[i] = normal_sample, cancer_sample, chrom, \
                clean_normal, clean_dna, clean_rna, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile

        # Generate the commands for doing variant calling.
        python = mlib.get_config("python", which_assert_file=True)

        # filterRadia.py calls the "blat" command, and there's no way
        # to set the path.  Make sure "blat" is executable.
        if not filelib.which("blat"):
            # Find "blat" in the configuration and add its directory
            # to the end of PATH.  Use .get() so that an unset PATH
            # does not raise a KeyError.
            blat = mlib.get_config("blat", which_assert_file=True)
            path = os.path.dirname(blat)
            current_path = os.environ.get("PATH")
            if current_path:
                path = "%s:%s" % (current_path, path)
            os.environ["PATH"] = path
            # Make sure it's findable now.
            filelib.which_assert("blat")

        # STEP 1.  Call variants with radia.py.
        # python radia.py test31 5 \
        # -n bam04/PIM001_G.bam \
        # -t bam04/196B-MG.bam \
        # -r bam34/196B-MG.bam \
        # -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        # -o test32.vcf
        # --dnaTumorMitochon MT \
        # --rnaTumorMitochon MT \
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.radia_py),
                cancer_sample,
                chrom,
                "-n",
                sq(normal_bamfile),
                "-t",
                sq(dna_tumor_bamfile),
                "-r",
                sq(rna_tumor_bamfile),
                "-f",
                sq(ref.fasta_file_full),
                "-o",
                radia_vcf_outfile,
            ]
            # Tell Radia the name of the mitochondrial chromosome
            # whenever "MT" is one of the chromosomes being analyzed.
            if "MT" in chroms:
                x += [
                    "--dnaNormalMitochon MT",
                    "--dnaTumorMitochon MT",
                    "--rnaTumorMitochon MT",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, radia_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Only uses ~200 Mb of ram.
        if RUN_VARIANT_CALLING:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure log files are empty.
        logfiles = [x[10] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # STEP 2.  Filter variants with filterRadia.py.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.filterRadia_py),
                cancer_sample,
                chrom,
                sq(radia_vcf_outfile),
                sq(filter_outpath),
                sq(radia_files.scripts_dir),
                "-b",
                sq(radia_files.blacklist_dir),
                "-d",
                sq(radia_files.snp_dir),
                "-r",
                sq(radia_files.retro_dir),
                "-p",
                sq(radia_files.pseudo_dir),
                "-c",
                sq(radia_files.cosmic_dir),
                "-t",
                sq(radia_files.target_dir),
                "-s",
                sq(snp_eff_path),
                "-e",
                snp_eff_genome,
                "--rnaGeneBlckFile",
                sq(radia_files.rnageneblck_file),
                "--rnaGeneFamilyBlckFile",
                sq(radia_files.rnagenefamilyblck_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, filter_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        # Sometimes samtools crashes in the middle of a run.  Detect
        # this case, and re-run the analysis if needed.  This is
        # delegated to _run_filterRadia_with_restart, which gets the
        # shell command instead of it being run directly.
        py_commands = []
        for x, cmd in zip(jobs, commands):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = cmd, cancer_sample, chrom, filter_logfile
            x = _run_filterRadia_with_restart, args, {}
            py_commands.append(x)
        # Original note: takes ~10 Gb each.  The budget passed here is
        # 25 (presumably Gb per process; confirm against
        # calc_max_procs_from_ram).
        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        if FILTER_CALLS:
            parallel.pyfun(py_commands, num_procs=nc)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[11] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # Make sure filter_vcf_outfile exists.
        outfiles = [x[7] for x in jobs]
        filelib.assert_exists_nz_many(outfiles)

        # STEP 3.  Merge the results.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
            #   radia2.tmp/ radia3.tmp
            # The "/" after radia2.tmp is important.  If not given,
            # will generate some files with only newlines.

            fo = filter_outpath
            if not fo.endswith("/"):
                fo = "%s/" % fo
            x = [
                sq(python),
                sq(radia_files.mergeChroms_py),
                cancer_sample,
                fo,
                merge_outpath,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, merge_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Since the chromosomes were separated for the previous steps,
        # this will generate one merge for each chromosome.  This is
        # unnecessary, since we only need to merge once per sample.
        # Get rid of duplicates.
        commands = sorted(set(commands))
        if MERGE_CALLS:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = sorted(set(x[12] for x in jobs))
        filelib.assert_exists_z_many(logfiles)

        # Fix the VCF files.  The arguments are the same for every
        # chromosome of a sample, so only queue one call per sample.
        # Otherwise several workers would write the same final VCF
        # file at the same time.
        commands = []
        seen = set()
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = normal_sample, cancer_sample, \
                   merge_vcf_outfile, final_vcf_outfile
            if args in seen:
                continue
            seen.add(args)
            x = alignlib.clean_radia_vcf, args, {}
            commands.append(x)
        if FIX_VCF_FILES:
            parallel.pyfun(commands, num_procs=num_cores)

        # Make sure output VCF files exist.
        x = [x[9] for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
Esempio n. 2
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Call somatic variants with MuTect on normal/cancer BAM pairs.

        antecedents must unpack to (bam_node, nc_node, ref_node,
        interval_node).  The user options "mutect_cosmic_vcf" and
        "mutect_dbsnp_vcf" are required.  For each normal/cancer pair,
        MuTect is run once and these files are written to out_path,
        named after the cancer BAM file:
          <sample>.call_stats.out, <sample>.coverage.wig.txt,
          <sample>.vcf.raw, <sample>.vcf (cleaned), <sample>.log

        Returns a metadata dict with the keys "num_cores" and
        "commands".

        Raises AssertionError when no BAM files are found, when the
        intervals file has an unsupported extension, or when a MuTect
        log file contains "##### ERROR" lines.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out MuTect version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        x, x, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"]

        cosmic_file = mlib.get_user_option(
            user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
        dbsnp_file = mlib.get_user_option(
            user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile,
        #    cancer_bamfile, call_outfile, cov_outfile, raw_vcf_outfile,
        #    vcf_outfile, log_outfile)
        # The positions matter: later code indexes into these tuples.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            # Output files are named after the cancer BAM file.
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            call_outfile = opj(out_path, "%s.call_stats.out" % sample)
            cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile
            jobs.append(x)

        # java -Xmx2g -jar muTect.jar
        #   --analysis_type MuTect
        #   --reference_sequence <reference>
        #   --cosmic <cosmic.vcf>
        #   --dbsnp <dbsnp.vcf>
        #   --intervals <intervals_to_process>
        #   --input_file:normal <normal.bam>
        #   --input_file:tumor <tumor.bam>
        #   --out <call_stats.out>
        #   --coverage_file <coverage.wig.txt>

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x

            # These option names contain ":", so they cannot be given
            # as Python keyword arguments.
            UNHASHABLE = [
                ("input_file:normal", sq(normal_bamfile)),
                ("input_file:tumor", sq(cancer_bamfile)),
                ]
            x = alignlib.make_MuTect_command(
                analysis_type="MuTect",
                reference_sequence=sq(ref.fasta_file_full),
                cosmic=sq(cosmic_file),
                dbsnp=sq(dbsnp_file),
                intervals=sq(interval_node.identifier),
                out=sq(call_outfile),
                coverage_file=sq(cov_outfile),
                vcf=sq(raw_vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
                )
            x = "%s >& %s" % (x, log_outfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for job, command in zip(jobs, commands):
            cancer_sample = job[1]
            log_outfile = job[8]
            # Pull out the error lines.  Use "with" so the log file is
            # closed promptly instead of leaking a handle per job.
            with open(log_outfile) as handle:
                errors = [
                    line for line in handle
                    if line.startswith("##### ERROR")]
            errors = "".join(errors)
            msg = "MuTect error [%s]:\n%s\n%s" % (
                cancer_sample, command, errors)
            assert not errors, msg

        # Make sure output VCF files exist.
        x = [x[6] for x in jobs]
        filelib.assert_exists_many(x)

        # Fix the files.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            alignlib.clean_mutect_vcf(
                normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
                raw_vcf_outfile, vcf_outfile)

        return metadata
Esempio n. 3
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Call somatic variants with museq on normal/cancer BAM pairs.

        antecedents must unpack to (bam_node, nc_node, ref_node).  For
        each normal/cancer pair, museq's classify.py writes
        <out_path>/<sample>.raw, which is then converted by
        fix_vcf_file into <out_path>/<sample>.vcf.  <sample> is taken
        from the name of the cancer BAM file.  A corrected copy of
        museq's config file is written to "fixed.config" in the
        current working directory.

        Returns a metadata dict with the keys "num_cores" and
        "commands".

        Raises AssertionError when no BAM files are found.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of objects with the attributes cancer_sample,
        # normal_bamfile, cancer_bamfile, orig_outfile, fix_outfile.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            # Output files are named after the cancer BAM file.
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            orig_outfile = opj(out_path, "%s.raw" % sample)
            fix_outfile = opj(out_path, "%s.vcf" % sample)
            x = filelib.GenericObject(cancer_sample=cancer_sample,
                                      normal_bamfile=normal_bamfile,
                                      cancer_bamfile=cancer_bamfile,
                                      orig_outfile=orig_outfile,
                                      fix_outfile=fix_outfile)
            jobs.append(x)

        # python /usr/local/museq/classify.py \
        #   normal:test31/normal.bam tumour:test31/tumor.bam \
        #   reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   model:/usr/local/museq/model_v4.1.2.npz \
        #   --config /usr/local/museq/metadata.config \
        #   -o test51.vcf
        museq = mlib.get_config("museq", assert_exists=True)
        classify_py = opj(museq, "classify.py")
        model_file = opj(museq, "model_v4.1.2.npz")
        config_file = opj(museq, "metadata.config")
        filelib.assert_exists_nz(classify_py)
        filelib.assert_exists_nz(model_file)
        filelib.assert_exists_nz(config_file)

        # museq's config file generates a broken VCF file.  Fix it.
        fixed_config_file = "fixed.config"
        fix_config_file(config_file, fixed_config_file)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            x = [
                "python",  # should allow user to specify python
                sq(classify_py),
                sq("normal:%s" % j.normal_bamfile),
                sq("tumour:%s" % j.cancer_bamfile),
                sq("reference:%s" % ref.fasta_file_full),
                sq("model:%s" % model_file),
                "--config",
                sq(fixed_config_file),
                "-o",
                sq(j.orig_outfile),
            ]
            x = " ".join(map(str, x))
            commands.append(x)
        # Not sure how much RAM this takes.  On Thunderbolts test,
        # took < 1 Gb.
        nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # The raw output is not a standard VCF file.  Fix this so it
        # will work with other programs downstream.
        for j in jobs:
            fix_vcf_file(j.cancer_sample, j.orig_outfile, j.fix_outfile)

        # Make sure the output file of every job exists.  (This must
        # iterate with "j"; otherwise only the last job's file gets
        # checked, over and over.)
        x = [j.fix_outfile for j in jobs]
        filelib.assert_exists_many(x)

        return metadata
Esempio n. 4
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Call somatic variants with MuSE on normal/cancer BAM pairs.

        antecedents must unpack to (bam_node, nc_node, ref_node).  The
        user options "wgs_or_wes" ("wgs" or "wes") and "muse_dbsnp_vcf"
        (a .vcf.gz file with a .tbi index next to it) are required.
        For each normal/cancer pair:
          1.  "MuSE call" writes <out_path>/<cancer_sample>.call.MuSE.txt
          2.  "MuSE sump" writes <out_path>/<cancer_sample>.vcf.raw
          3.  alignlib.clean_muse_vcf writes <out_path>/<cancer_sample>.vcf
        The log files of steps 1 and 2 are deleted at the end.

        Returns a metadata dict with the keys "tool", "num_cores" and
        "commands" (the shell commands of steps 1 and 2).

        Raises AssertionError when no BAM files are found or when the
        dbSNP file is not compressed and indexed.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        dbsnp_file = mlib.get_user_option(user_options,
                                          "muse_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # Make sure dbsnp_file is compressed and indexed.
        assert dbsnp_file.endswith(".vcf.gz"), \
               "muse_dbsnp_vcf must be bgzip compressed."
        x = "%s.tbi" % dbsnp_file
        assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #   muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile,
        #   logfile1, logfile2)
        # The positions matter: later code indexes into these tuples.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
            # "MuSE call -O <stem>" writes to <stem>.MuSE.txt.
            muse_call_file = "%s.MuSE.txt" % muse_call_stem
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % cancer_sample)
            vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
            log_outfile1 = opj(out_path, "%s.call.log" % cancer_sample)
            log_outfile2 = opj(out_path, "%s.sump.log" % cancer_sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2
            jobs.append(x)

        # Generate the commands.
        # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
        #   bam04/196B-MG.bam bam04/PIM001_G.bam
        # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
        #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz

        MuSE = mlib.findbin("muse")

        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            # Quote every path, like the other arguments, so that
            # names containing spaces do not break the shell command.
            # The tumor BAM file comes before the normal one.
            x = [
                sq(MuSE),
                "call",
                "-O",
                sq(muse_call_stem),
                "-f",
                sq(ref.fasta_file_full),
                sq(cancer_bamfile),
                sq(normal_bamfile),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, log_outfile1)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[8] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the call files are created and not empty.
        call_files = [x[5] for x in jobs]
        filelib.assert_exists_nz_many(call_files)

        # Run the "sump" step.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            x = [
                sq(MuSE),
                "sump",
                "-I",
                sq(muse_call_file),
            ]
            # -G for whole genome data, -E for whole exome data.
            assert wgs_or_wes in ["wgs", "wes"]
            if wgs_or_wes == "wgs":
                x += ["-G"]
            else:
                x += ["-E"]
            x += [
                "-O",
                sq(raw_vcf_outfile),
                "-D",
                sq(dbsnp_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, log_outfile2)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = metadata["commands"] + commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[9] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the raw files are created and not empty.
        vcf_files = [x[6] for x in jobs]
        filelib.assert_exists_nz_many(vcf_files)

        # Fix the files.  These are Python function calls, not shell
        # commands.
        py_commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            args = normal_sample, cancer_sample, raw_vcf_outfile, vcf_outfile
            x = alignlib.clean_muse_vcf, args, {}
            py_commands.append(x)
        parallel.pyfun(py_commands, num_procs=num_cores)

        # Delete the log files.  They were checked to be empty above,
        # so nothing is lost.
        for x in jobs:
            log_outfile1, log_outfile2 = x[8], x[9]
            if os.path.exists(log_outfile1):
                os.unlink(log_outfile1)
            if os.path.exists(log_outfile2):
                os.unlink(log_outfile2)

        # Make sure output VCF files exist.
        x = [x[7] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Call somatic variants with GATK MuTect2 for each normal/cancer pair.

        antecedents is unpacked as (bam_node, nc_node, ref_node,
        interval_node).  One MuTect2 command is generated for each
        (normal, cancer) pair in the normal/cancer file, and the
        commands are run in parallel.  Each job writes a .vcf and a
        .log file into out_path, named after the cancer BAM file (the
        middle component returned by mlib.splitpath).  Afterwards, the
        "NORMAL" and "TUMOR" sample names in each VCF file are replaced
        with the actual sample names.

        The user options "mutect_cosmic_vcf" and "mutect_dbsnp_vcf" are
        required.

        Returns a metadata dict with the keys "num_cores" and
        "commands".

        Raises AssertionError if no BAM files are found, the intervals
        file has an unexpected extension, a log file contains GATK
        error lines, or an output VCF file is missing.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out GATK version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        x, x, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"
        ]

        cosmic_file = mlib.get_user_option(user_options,
                                           "mutect_cosmic_vcf",
                                           not_empty=True,
                                           check_file=True)
        dbsnp_file = mlib.get_user_option(user_options,
                                          "mutect_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # One job object per (normal, cancer) pair.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(normal_sample=normal_sample,
                                      cancer_sample=cancer_sample,
                                      normal_bamfile=normal_bamfile,
                                      cancer_bamfile=cancer_bamfile,
                                      vcf_outfile=vcf_outfile,
                                      log_outfile=log_outfile)
            jobs.append(x)

        # java -jar GenomeAnalysisTK.jar \
        #   -T MuTect2 \
        #   -R reference.fasta \
        #   -I:tumor tumor.bam \
        #   -I:normal normal.bam \
        #   [--dbsnp dbSNP.vcf] \
        #   [--cosmic COSMIC.vcf] \
        #   [-L targets.interval_list] \
        #   -o output.vcf

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            # Use the BAM files that belong to this job.  The loop
            # variables left over from building the jobs always refer
            # to the last pair.
            UNHASHABLE = [
                ("I:normal", sq(j.normal_bamfile)),
                ("I:tumor", sq(j.cancer_bamfile)),
                # --dbsnp and --cosmic use two dashes, for some
                # reason.  Since make_GATK_command only uses one dash,
                # add one manually.
                ("-dbsnp", sq(dbsnp_file)),
                ("-cosmic", sq(cosmic_file)),
            ]
            x = alignlib.make_GATK_command(
                T="MuTect2",
                R=sq(ref.fasta_file_full),
                L=sq(interval_node.identifier),
                o=sq(j.vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
            )
            x = "%s >& %s" % (x, j.log_outfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for j, command in zip(jobs, commands):
            # Pull out the error lines.
            with open(j.log_outfile) as handle:
                error_lines = [
                    line for line in handle
                    if line.startswith("##### ERROR")
                ]
            errors = "".join(error_lines)
            # Report the sample of the job that failed.
            msg = "MuTect2 error [%s]:\n%s\n%s" % (j.cancer_sample, command,
                                                   errors)
            assert not errors, msg

        # Make sure output VCF files exist.
        vcf_files = [j.vcf_outfile for j in jobs]
        filelib.assert_exists_many(vcf_files)

        # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
        # them with the actual names.
        for j in jobs:
            call_somatic_varscan._fix_normal_cancer_names(
                j.vcf_outfile, j.normal_sample, j.cancer_sample)

        return metadata
Esempio n. 6
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Call somatic variants with Strelka for each normal/cancer pair.

        antecedents is unpacked as (bam_node, nc_node, ref_node).  For
        each (normal, cancer) pair in the normal/cancer file, a config
        file and an analysis directory are created under out_path, and
        "make" is run inside the analysis directory.  The result file
        that matches out_attributes["vartype"] ("snp" or "indel") is
        then cleaned and saved as <out_path>/<cancer_sample>.vcf.

        The user option "strelka_skip_depth_filter" ("no" or "yes") is
        passed on to the config file.

        Returns a metadata dict with the key "num_cores".

        Raises AssertionError if no BAM files are found, "vartype" is
        missing or invalid, a cancer sample is listed more than once,
        or a requested result file is missing or empty.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out Strelka version.

        x = mlib.get_user_option(user_options,
                                 "strelka_skip_depth_filter",
                                 allowed_values=["no", "yes"],
                                 not_empty=True)
        skip_depth_filter = (x == "yes")
        assert "vartype" in out_attributes, "Missing attribute: vartype"
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]

        # vartype -> name of the result file Strelka writes to
        # <analysis_path>/results/
        vartype2file = {
            "snp": "all.somatic.snvs.vcf",
            "indel": "all.somatic.indels.vcf",
        }
        assert vartype in vartype2file
        result_name = vartype2file[vartype]

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # Make sure each cancer sample is unique.  Otherwise, the
        # analysis directories will conflict.
        seen = set()
        dups = set()
        for x in nc_match:
            tumor_sample = x[-1]
            if tumor_sample in seen:
                dups.add(tumor_sample)
            seen.add(tumor_sample)
        assert not dups, "NormalCancerFile contains multiple instances of: %s"\
               % ", ".join(sorted(dups))

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #          config_file, output_dir
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            config_file = opj(out_path, "config.%s.ini" % cancer_sample)
            analysis_path = opj(out_path, "analysis.%s" % cancer_sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                config_file, analysis_path
            jobs.append(x)

        # Make each of the config files.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            _make_config_file(config_file, skip_depth_filter=skip_depth_filter)

        # Make the analysis directories.
        jobs2 = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            fn = _make_analysis_directory
            args = (analysis_path, config_file, ref.fasta_file_full,
                    normal_bamfile, cancer_bamfile)
            keywds = None
            jobs2.append((fn, args, keywds))
        parallel.pyfun(jobs2, num_procs=num_cores)

        # Run the analysis.  Each analysis is run one at a time, with
        # make using all the cores.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            cmd = "make -j %d" % num_cores
            parallel.sshell(cmd, path=analysis_path)
        metadata["num_cores"] = num_cores

        # Make sure the result files that will be read below exist
        # and are not empty.  Check the file for the requested vartype,
        # rather than always checking the SNV file.
        result_files = [opj(x[-1], "results", result_name) for x in jobs]
        filelib.assert_exists_nz_many(result_files)

        # Clean the VCF files and save into the out_path.
        for x, src_file in zip(jobs, result_files):
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           config_file, analysis_path = x
            # <analysis_path>/results/all.somatic.snvs.vcf
            # <analysis_path>/results/all.somatic.indels.vcf
            dst_file = opj(out_path, "%s.vcf" % cancer_sample)
            alignlib.clean_strelka_vcf(normal_sample, cancer_sample, src_file,
                                       dst_file)

        #metadata["commands"] = commands
        return metadata
Esempio n. 7
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Call somatic variants with SomaticSniper for each normal/cancer pair.

        antecedents is unpacked as (bam_node, nc_node, ref_node).  One
        bam-somaticsniper command is generated for each (normal,
        cancer) pair in the normal/cancer file, and the commands are
        run in parallel.  Each job writes a .vcf file into out_path,
        named after the cancer BAM file (the middle component returned
        by mlib.splitpath).  Afterwards, the "NORMAL" and "TUMOR"
        sample names in each VCF file are replaced with the actual
        sample names.

        Returns a metadata dict with the keys "num_cores" and
        "commands".

        Raises AssertionError if no BAM files are found or an output
        VCF file is missing.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #          vcf_outfile)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                vcf_outfile
            jobs.append(x)

        # bam-somaticsniper -q 1 -Q 15 -G -L -F vcf \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   test31/tumor.bam test31/normal.bam test41.vcf
        somaticsniper = mlib.get_config("somaticsniper",
                                        which_assert_file=True)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           vcf_outfile = x

            # The tumor BAM file comes before the normal BAM file.
            x = [
                sq(somaticsniper),
                "-q",
                1,
                "-Q",
                15,
                "-G",
                "-L",
                "-F",
                "vcf",
                "-f",
                sq(ref.fasta_file_full),
                sq(cancer_bamfile),
                sq(normal_bamfile),
                sq(vcf_outfile),
            ]
            x = " ".join(map(str, x))
            commands.append(x)
        # Not sure how much RAM this takes.
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure the output VCF files exist.  Check this before
        # fixing the sample names, so that a failed SomaticSniper run
        # is reported here rather than while rewriting the file.
        vcf_files = [x[-1] for x in jobs]
        filelib.assert_exists_many(vcf_files)

        # SomaticSniper names the samples "NORMAL" and "TUMOR".
        # Replace them with the actual names.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                           vcf_outfile = x
            call_somatic_varscan._fix_normal_cancer_names(
                vcf_outfile, normal_sample, cancer_sample)

        return metadata