def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Merge the VCF files found in each "*.vcf" sub-folder.

        antecedents is a single node whose identifier is a directory.
        Every entry in that directory whose name ends with ".vcf" is
        treated as a folder of VCF files, and the entry name without
        its extension is used as the caller name.  For each folder,
        merge_vcf_folder.merge_vcf_files is called with the VCF files
        directly inside it, writing to <out_path>/<caller>.vcf.

        Returns a metadata dict whose "commands" list accumulates the
        "commands" entry returned by each merge.  Raises
        AssertionError if no folders, or no VCF files within a folder,
        are found.
        """
        import os
        from genomicode import filelib
        from Betsy import module_utils as mlib
        import merge_vcf_folder

        vcffolders_node = antecedents
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Find the per-caller folders.
        top_path = vcffolders_node.identifier
        names = [
            name for name in os.listdir(top_path) if name.endswith(".vcf")]
        assert names, "No VCF folders found: %s" % top_path
        vcf_folders = [os.path.join(top_path, name) for name in names]

        # Describe one merge job per folder.
        jobs = []
        for folder in vcf_folders:
            _, caller, ext = mlib.splitpath(folder)
            assert ext == ".vcf"
            vcf_filenames = filelib.list_files_in_path(
                folder, endswith=".vcf", toplevel_only=True)
            assert vcf_filenames, "No .vcf files: %s" % folder
            job = filelib.GenericObject(
                caller=caller,
                vcf_filenames=vcf_filenames,
                out_filename=os.path.join(out_path, "%s.vcf" % caller),
                # No directory component, so this is relative to the
                # current working directory.
                tmp_path="%s.indexed.vcf" % caller)
            jobs.append(job)

        # Run the merges one at a time; each call is handed num_cores.
        for job in jobs:
            result = merge_vcf_folder.merge_vcf_files(
                job.vcf_filenames, job.out_filename, num_cores, job.tmp_path)
            metadata.setdefault("commands", []).extend(result["commands"])

        filelib.assert_exists_many([job.out_filename for job in jobs])

        return metadata
Esempio n. 2
0
def get_radia_files(radia_path, assembly):
    """Locate the RADIA scripts and data under radia_path.

    Builds the paths of the three RADIA scripts, the two RNA gene
    blacklist files, and the assembly-specific data directories, then
    asserts that they exist (the files via assert_exists_nz_many, the
    directories via assert_exists_many).

    Returns a RadiaFiles object constructed from those paths.
    """
    import os
    from genomicode import filelib

    scripts_dir = os.path.join(radia_path, "scripts")
    radia_py, filterRadia_py, mergeChroms_py = [
        os.path.join(radia_path, "scripts", name)
        for name in ("radia.py", "filterRadia.py", "mergeChroms.py")]

    # For hg19 only.
    data_templates = (
        "data/%s/blacklists/1000Genomes/phase1",
        "data/%s/snp135",
        "data/%s/retroGenes",
        "data/%s/pseudoGenes",
        "data/%s/cosmic",
        "data/%s/gaf/2_1",
    )
    (blacklist_dir, snp_dir, retro_dir, pseudo_dir, cosmic_dir,
     target_dir) = [
        os.path.join(radia_path, template % assembly)
        for template in data_templates]

    rnageneblck_file = os.path.join(radia_path, "data/rnaGeneBlacklist.tab")
    rnagenefamilyblck_file = os.path.join(
        radia_path, "data/rnaGeneFamilyBlacklist.tab")

    # Check the files first, then the directories.
    filelib.assert_exists_nz_many([
        radia_py,
        filterRadia_py,
        mergeChroms_py,
        rnageneblck_file,
        rnagenefamilyblck_file,
    ])
    filelib.assert_exists_many([
        scripts_dir,
        blacklist_dir,
        snp_dir,
        retro_dir,
        pseudo_dir,
        cosmic_dir,
        target_dir,
    ])

    return RadiaFiles(
        radia_py, filterRadia_py, mergeChroms_py, scripts_dir,
        blacklist_dir, snp_dir, retro_dir, pseudo_dir, cosmic_dir,
        target_dir, rnageneblck_file, rnagenefamilyblck_file)
Esempio n. 3
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Filter every VCF file under the input node by variant type.

        Each ".vcf" file (matched case-insensitively) found under
        in_data.identifier is passed to filter_by_vartype together
        with out_attributes["vartype"], which must be one of "all",
        "snp", or "indel".  The result is written to a file of the
        same name in out_path.  The calls are dispatched through
        parallel.pyfun with num_cores processes.

        Returns a metadata dict recording "num_cores".
        """
        import os
        from genomicode import filelib
        from genomicode import parallel

        vcf_node = in_data
        vcf_files = filelib.list_files_in_path(
            vcf_node.identifier, endswith=".vcf", case_insensitive=True)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # (in_vcf_filename, out_vcf_filename); the output keeps the
        # input's base name.
        jobs = [
            (in_file, os.path.join(out_path, os.path.split(in_file)[1]))
            for in_file in vcf_files]

        # Figure out whether the user wants SNPs or INDELs.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["all", "snp", "indel"]

        # One (function, args, keywds) triple per file.
        commands = [
            (filter_by_vartype, (vartype, in_file, out_file), {})
            for (in_file, out_file) in jobs]
        parallel.pyfun(commands, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        filelib.assert_exists_many([out_file for (_, out_file) in jobs])

        return metadata
Esempio n. 4
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        """Run MuTect on every normal/cancer pair of BAM files.

        antecedents is (bam_node, nc_node, ref_node, interval_node):
        a folder of BAM files, a normal/cancer match file, a reference
        genome, and an intervals file.  The user options
        "mutect_cosmic_vcf" and "mutect_dbsnp_vcf" are required.

        For each pair, the following are written to out_path, named
        after the cancer BAM file:
          <sample>.call_stats.out, <sample>.coverage.wig.txt,
          <sample>.vcf.raw, <sample>.vcf (cleaned), <sample>.log

        Returns a metadata dict with "num_cores" and "commands".
        Raises AssertionError if inputs are missing, if a log file
        contains GATK "##### ERROR" lines, or if a raw VCF file was
        not created.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out MuTect version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        _, _, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"]

        cosmic_file = mlib.get_user_option(
            user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
        dbsnp_file = mlib.get_user_option(
            user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # One job per normal/cancer pair.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            cancer_bamfile = sample2bamfile[cancer_sample]
            # Output files are named after the cancer BAM file.
            _, sample, _ = mlib.splitpath(cancer_bamfile)
            job = filelib.GenericObject(
                normal_sample=normal_sample,
                cancer_sample=cancer_sample,
                normal_bamfile=sample2bamfile[normal_sample],
                cancer_bamfile=cancer_bamfile,
                call_outfile=opj(out_path, "%s.call_stats.out" % sample),
                cov_outfile=opj(out_path, "%s.coverage.wig.txt" % sample),
                raw_vcf_outfile=opj(out_path, "%s.vcf.raw" % sample),
                vcf_outfile=opj(out_path, "%s.vcf" % sample),
                log_outfile=opj(out_path, "%s.log" % sample))
            jobs.append(job)

        # java -Xmx2g -jar muTect.jar
        #   --analysis_type MuTect
        #   --reference_sequence <reference>
        #   --cosmic <cosmic.vcf>
        #   --dbsnp <dbsnp.vcf>
        #   --intervals <intervals_to_process>
        #   --input_file:normal <normal.bam>
        #   --input_file:tumor <tumor.bam>
        #   --out <call_stats.out>
        #   --coverage_file <coverage.wig.txt>

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for job in jobs:
            # "input_file:normal" is not a valid Python keyword, so
            # these options are passed as a list of pairs.
            UNHASHABLE = [
                ("input_file:normal", sq(job.normal_bamfile)),
                ("input_file:tumor", sq(job.cancer_bamfile)),
                ]
            cmd = alignlib.make_MuTect_command(
                analysis_type="MuTect",
                reference_sequence=sq(ref.fasta_file_full),
                cosmic=sq(cosmic_file),
                dbsnp=sq(dbsnp_file),
                intervals=sq(interval_node.identifier),
                out=sq(job.call_outfile),
                coverage_file=sq(job.cov_outfile),
                vcf=sq(job.raw_vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
                )
            # Send stdout and stderr to the log file.
            commands.append("%s >& %s" % (cmd, job.log_outfile))
        assert len(commands) == len(jobs)
        # NOTE(review): 15 is presumably the Gb of RAM budgeted per
        # process -- confirm against calc_max_procs_from_ram.
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for job, command in zip(jobs, commands):
            # Pull out the error lines.  Use a context manager so the
            # handle is closed even when the assertion below fires.
            with open(job.log_outfile) as handle:
                error_lines = [
                    line for line in handle
                    if line.startswith("##### ERROR")]
            errors = "".join(error_lines)
            msg = "MuTect error [%s]:\n%s\n%s" % (
                job.cancer_sample, command, errors)
            assert not errors, msg

        # Make sure the raw VCF files written by MuTect exist.
        filelib.assert_exists_many([job.raw_vcf_outfile for job in jobs])

        # Fix the files: raw VCF -> final VCF.
        for job in jobs:
            alignlib.clean_mutect_vcf(
                job.normal_bamfile, job.cancer_bamfile,
                job.normal_sample, job.cancer_sample,
                job.raw_vcf_outfile, job.vcf_outfile)

        return metadata
Esempio n. 5
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run Pindel on every BAM file.

        antecedents is (bam_node, ref_node, insert_size_node,
        alignment_node).  The insert size and alignment nodes must
        each contain a "summary.txt" with an entry for every sample
        (parsed by _read_insert_sizes and _read_fragment_sizes).

        For each sample, a Pindel config file, a log file, and the
        Pindel output files with prefix <out_path>/<sample> are
        written.  The BAM files and their indexes are symlinked into
        a local "bam" directory under names without spaces.

        Returns a metadata dict with "num_cores" and "commands".
        Raises AssertionError if inputs are missing or the expected
        outputs were not created.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node, insert_size_node, alignment_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # ./pindel -f <reference.fa> -i <bam_configuration_file>
        #   -c <chromosome_name> -o <out_prefix>
        #   -T <num threads>
        #
        # Creates files:
        # <out_prefix>_D     Deletion
        # <out_prefix>_SI    Short insertion
        # <out_prefix>_LI    Long insertion
        # <out_prefix>_INV   Inversion
        # <out_prefix>_TD    Tandem duplication
        # <out_prefix>_BP    Breakpoint
        # <out_prefix>_RP    ??? read pair???
        # <out_prefix>_CloseEndMapped   Only one end could be mapped.

        # Pindel cannot handle spaces in the BAM filenames (because of
        # the config file).  Symlink the file to a local directory to make
        # sure there are no spaces.
        bam_path = "bam"

        opj = os.path.join
        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            _, basename = os.path.split(bam_filename)
            sample, _ = os.path.splitext(basename)
            bai_filename = "%s.bai" % bam_filename
            filelib.assert_exists_nz(bai_filename)
            safe_name = sample.replace(" ", "_")
            job = filelib.GenericObject(
                sample=sample,
                bam_filename=bam_filename,
                bai_filename=bai_filename,
                local_bam=opj(bam_path, "%s.bam" % safe_name),
                local_bai=opj(bam_path, "%s.bam.bai" % safe_name),
                config_filename=opj(out_path, "%s.config.txt" % sample),
                out_prefix=opj(out_path, sample),
                log_filename=opj(out_path, "%s.log" % sample))
            jobs.append(job)

        # Link the BAM files and indexes to the space-free local names.
        # NOTE(review): a relative bam_filename would yield a dangling
        # link, since symlink targets resolve relative to the link's
        # directory -- confirm find_bam_files returns absolute paths.
        filelib.safe_mkdir(bam_path)
        for j in jobs:
            assert " " not in j.local_bam
            filelib.assert_exists_nz(j.bam_filename)
            filelib.assert_exists_nz(j.bai_filename)
            if not os.path.exists(j.local_bam):
                os.symlink(j.bam_filename, j.local_bam)
            if not os.path.exists(j.local_bai):
                os.symlink(j.bai_filename, j.local_bai)

        # Read the insert sizes.
        summary_file = opj(insert_size_node.identifier, "summary.txt")
        filelib.assert_exists_nz(summary_file)
        sample2size = _read_insert_sizes(summary_file)
        # Make sure all the samples have inserts.
        for j in jobs:
            assert j.sample in sample2size, \
                   "Missing in insert size file: %s" % j.sample

        # Read the fragment sizes.
        summary_file = opj(alignment_node.identifier, "summary.txt")
        filelib.assert_exists_nz(summary_file)
        sample2readlen = _read_fragment_sizes(summary_file)
        # Make sure all the samples have read lengths.
        for j in jobs:
            assert j.sample in sample2readlen, \
                   "Missing in alignment summary file: %s" % j.sample

        # Make the config file.
        for j in jobs:
            # <insert size> is the whole length to be sequenced, including
            # the length of the pair of reads.  Picard only counts the
            # sequence between the reads.
            size = sample2size[j.sample]
            read_length = sample2readlen[j.sample]
            insert_size = size + read_length * 2
            # write() inside a with-block closes the file even on
            # error and behaves the same under Python 2 and 3.
            with open(j.config_filename, 'w') as handle:
                handle.write(
                    "%s %s %s\n" % (j.local_bam, insert_size, j.sample))

        # Make a list of commands.
        pindel = mlib.get_config("pindel", which_assert_file=True)
        sq = parallel.quote
        commands = []
        for j in jobs:
            cmd = [
                sq(pindel),
                "-f",
                sq(ref.fasta_file_full),
                "-i",
                sq(j.config_filename),
                "-c",
                "ALL",
                # One thread per Pindel process; the samples are run in
                # parallel by pshell below.
                "-T",
                1,
                "-o",
                sq(j.out_prefix),
            ]
            cmd = " ".join(map(str, cmd))
            # Send stdout and stderr to the log file.
            cmd = "%s >& %s" % (cmd, j.log_filename)
            commands.append(cmd)
        parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure the analysis completed successfully.  The log files
        # must be non-empty, and the main output files must exist.
        filelib.assert_exists_nz_many([j.log_filename for j in jobs])
        expected = []
        for suffix in ("D", "SI", "LI", "INV", "TD", "BP"):
            expected.extend(
                "%s_%s" % (j.out_prefix, suffix) for j in jobs)
        filelib.assert_exists_many(expected)

        return metadata
Esempio n. 6
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run museq (classify.py) on every normal/cancer pair of BAM files.

        antecedents is (bam_node, nc_node, ref_node): a folder of BAM
        files, a normal/cancer match file, and a reference genome.

        For each pair, museq writes <out_path>/<sample>.raw, which is
        then passed through fix_vcf_file to produce
        <out_path>/<sample>.vcf.  <sample> is taken from the name of
        the cancer BAM file.  A corrected copy of museq's config file
        is written to "fixed.config" in the current directory.

        Returns a metadata dict with "num_cores" and "commands".
        Raises AssertionError if inputs are missing or an output VCF
        file was not created.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # One job per normal/cancer pair.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            cancer_bamfile = sample2bamfile[cancer_sample]
            # Output files are named after the cancer BAM file.
            _, sample, _ = mlib.splitpath(cancer_bamfile)
            job = filelib.GenericObject(
                cancer_sample=cancer_sample,
                normal_bamfile=sample2bamfile[normal_sample],
                cancer_bamfile=cancer_bamfile,
                orig_outfile=opj(out_path, "%s.raw" % sample),
                fix_outfile=opj(out_path, "%s.vcf" % sample))
            jobs.append(job)

        # python /usr/local/museq/classify.py \
        #   normal:test31/normal.bam tumour:test31/tumor.bam \
        #   reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   model:/usr/local/museq/model_v4.1.2.npz \
        #   --config /usr/local/museq/metadata.config \
        #   -o test51.vcf
        museq = mlib.get_config("museq", assert_exists=True)
        classify_py = opj(museq, "classify.py")
        model_file = opj(museq, "model_v4.1.2.npz")
        config_file = opj(museq, "metadata.config")
        filelib.assert_exists_nz(classify_py)
        filelib.assert_exists_nz(model_file)
        filelib.assert_exists_nz(config_file)

        # museq's config file generates a broken VCF file.  Fix it.
        fixed_config_file = "fixed.config"
        fix_config_file(config_file, fixed_config_file)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            cmd = [
                "python",  # should allow user to specify python
                sq(classify_py),
                sq("normal:%s" % j.normal_bamfile),
                sq("tumour:%s" % j.cancer_bamfile),
                sq("reference:%s" % ref.fasta_file_full),
                sq("model:%s" % model_file),
                "--config",
                sq(fixed_config_file),
                "-o",
                sq(j.orig_outfile),
            ]
            commands.append(" ".join(map(str, cmd)))
        # Not sure how much RAM this takes.  On Thunderbolts test,
        # took < 1 Gb.
        nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Post-process the raw museq output into the final VCF file so
        # it will work with other programs downstream.
        for j in jobs:
            fix_vcf_file(j.cancer_sample, j.orig_outfile, j.fix_outfile)

        # Filtering by variant type is not done here.

        # Check the output of every job.  (Iterating with a different
        # loop variable than the one read would check only the last
        # job's file, once per job.)
        filelib.assert_exists_many([j.fix_outfile for j in jobs])

        return metadata
Esempio n. 7
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run MuSE ("call" then "sump") on every normal/cancer pair.

        antecedents is (bam_node, nc_node, ref_node): a folder of BAM
        files, a normal/cancer match file, and a reference genome.
        Required user options: "wgs_or_wes" ("wgs" or "wes") and
        "muse_dbsnp_vcf" (must end with ".vcf.gz" and have a ".tbi"
        index next to it).

        For each pair, files named after the cancer sample are written
        to out_path: <sample>.call.MuSE.txt, <sample>.vcf.raw, and the
        cleaned <sample>.vcf.  The per-step log files must be empty
        and are deleted at the end.

        Returns a metadata dict with "tool", "num_cores", and
        "commands" (the call commands followed by the sump commands).
        Raises AssertionError if inputs are missing, a log file is not
        empty, or an expected output was not created.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        dbsnp_file = mlib.get_user_option(user_options,
                                          "muse_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # Make sure dbsnp_file is compressed and indexed.
        assert dbsnp_file.endswith(".vcf.gz"), \
               "muse_dbsnp_vcf must be bgzip compressed."
        tbi_file = "%s.tbi" % dbsnp_file
        assert filelib.exists_nz(tbi_file), \
               "muse_dbsnp_vcf must be tabix indexed."

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # One job per normal/cancer pair.  Outputs are named after the
        # cancer sample.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
            job = filelib.GenericObject(
                normal_sample=normal_sample,
                cancer_sample=cancer_sample,
                normal_bamfile=sample2bamfile[normal_sample],
                cancer_bamfile=sample2bamfile[cancer_sample],
                muse_call_stem=muse_call_stem,
                # "MuSE call -O <stem>" writes to <stem>.MuSE.txt.
                muse_call_file="%s.MuSE.txt" % muse_call_stem,
                raw_vcf_outfile=opj(out_path, "%s.vcf.raw" % cancer_sample),
                vcf_outfile=opj(out_path, "%s.vcf" % cancer_sample),
                log_outfile1=opj(out_path, "%s.call.log" % cancer_sample),
                log_outfile2=opj(out_path, "%s.sump.log" % cancer_sample))
            jobs.append(job)

        # Generate the commands.
        # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
        #   bam04/196B-MG.bam bam04/PIM001_G.bam
        # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
        #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz

        MuSE = mlib.findbin("muse")

        # Run the "call" step.
        sq = mlib.sq
        commands = []
        for j in jobs:
            # Quote every path, like the sump step below, so that
            # filenames containing spaces do not break the command.
            cmd = [
                sq(MuSE),
                "call",
                "-O",
                sq(j.muse_call_stem),
                "-f",
                sq(ref.fasta_file_full),
                sq(j.cancer_bamfile),
                sq(j.normal_bamfile),
            ]
            cmd = " ".join(cmd)
            commands.append("%s >& %s" % (cmd, j.log_outfile1))
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        filelib.assert_exists_z_many([j.log_outfile1 for j in jobs])

        # Make sure the call files are created and not empty.
        filelib.assert_exists_nz_many([j.muse_call_file for j in jobs])

        # Run the "sump" step.
        assert wgs_or_wes in ["wgs", "wes"]
        if wgs_or_wes == "wgs":
            data_type_flag = "-G"
        else:
            data_type_flag = "-E"
        commands = []
        for j in jobs:
            cmd = [
                sq(MuSE),
                "sump",
                "-I",
                sq(j.muse_call_file),
                data_type_flag,
                "-O",
                sq(j.raw_vcf_outfile),
                "-D",
                sq(dbsnp_file),
            ]
            cmd = " ".join(cmd)
            commands.append("%s >& %s" % (cmd, j.log_outfile2))
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = metadata["commands"] + commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        filelib.assert_exists_z_many([j.log_outfile2 for j in jobs])

        # Make sure the raw files are created and not empty.
        filelib.assert_exists_nz_many([j.raw_vcf_outfile for j in jobs])

        # Fix the files: raw VCF -> final VCF.  These are Python
        # function calls, given as (function, args, keywds) triples.
        commands = []
        for j in jobs:
            args = (j.normal_sample, j.cancer_sample, j.raw_vcf_outfile,
                    j.vcf_outfile)
            commands.append((alignlib.clean_muse_vcf, args, {}))
        parallel.pyfun(commands, num_procs=num_cores)

        # Delete the log files.  They were verified to be empty above.
        for j in jobs:
            for log_file in (j.log_outfile1, j.log_outfile2):
                if os.path.exists(log_file):
                    os.unlink(log_file)

        # Make sure output VCF files exist.
        filelib.assert_exists_many([j.vcf_outfile for j in jobs])

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run GATK MuTect2 on every normal/cancer pair of BAM files.

        antecedents is (bam_node, nc_node, ref_node, interval_node):
        a folder of BAM files, a normal/cancer match file, a reference
        genome, and an intervals file.  The user options
        "mutect_cosmic_vcf" and "mutect_dbsnp_vcf" are required.

        For each pair, <out_path>/<sample>.vcf and
        <out_path>/<sample>.log are written, where <sample> is taken
        from the name of the cancer BAM file.  The sample names in
        the VCF file are then replaced with the actual sample names.

        Returns a metadata dict with "num_cores" and "commands".
        Raises AssertionError if inputs are missing, if a log file
        contains GATK "##### ERROR" lines, or if a VCF file was not
        created.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out GATK version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        _, _, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"
        ]

        cosmic_file = mlib.get_user_option(user_options,
                                           "mutect_cosmic_vcf",
                                           not_empty=True,
                                           check_file=True)
        dbsnp_file = mlib.get_user_option(user_options,
                                          "mutect_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # One job per normal/cancer pair.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            cancer_bamfile = sample2bamfile[cancer_sample]
            # Output files are named after the cancer BAM file.
            _, sample, _ = mlib.splitpath(cancer_bamfile)
            job = filelib.GenericObject(
                normal_sample=normal_sample,
                cancer_sample=cancer_sample,
                normal_bamfile=sample2bamfile[normal_sample],
                cancer_bamfile=cancer_bamfile,
                vcf_outfile=opj(out_path, "%s.vcf" % sample),
                log_outfile=opj(out_path, "%s.log" % sample))
            jobs.append(job)

        # java -jar GenomeAnalysisTK.jar \
        #   -T MuTect2 \
        #   -R reference.fasta \
        #   -I:tumor tumor.bam \
        #   -I:normal normal.bam \
        #   [--dbsnp dbSNP.vcf] \
        #   [--cosmic COSMIC.vcf] \
        #   [-L targets.interval_list] \
        #   -o output.vcf

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            # Take the BAM files from this job.  The loop variables
            # left over from building the jobs would refer to the last
            # pair only.
            UNHASHABLE = [
                ("I:normal", sq(j.normal_bamfile)),
                ("I:tumor", sq(j.cancer_bamfile)),
                # --dbsnp and --cosmic use two dashes, for some
                # reason.  Since make_GATK_command only uses one dash,
                # add one manually.
                ("-dbsnp", sq(dbsnp_file)),
                ("-cosmic", sq(cosmic_file)),
            ]
            cmd = alignlib.make_GATK_command(
                T="MuTect2",
                R=sq(ref.fasta_file_full),
                L=sq(interval_node.identifier),
                o=sq(j.vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
            )
            # Send stdout and stderr to the log file.
            commands.append("%s >& %s" % (cmd, j.log_outfile))
        assert len(commands) == len(jobs)

        # NOTE(review): 25 is presumably the Gb of RAM budgeted per
        # process -- confirm against calc_max_procs_from_ram.
        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for j, command in zip(jobs, commands):
            # Pull out the error lines.  Use a context manager so the
            # handle is closed even when the assertion below fires.
            with open(j.log_outfile) as handle:
                error_lines = [
                    line for line in handle
                    if line.startswith("##### ERROR")]
            errors = "".join(error_lines)
            # Report the sample of the job that failed.
            msg = "MuTect2 error [%s]:\n%s\n%s" % (
                j.cancer_sample, command, errors)
            assert not errors, msg

        # Make sure output VCF files exist.
        filelib.assert_exists_many([j.vcf_outfile for j in jobs])

        # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
        # them with the actual names.
        for j in jobs:
            call_somatic_varscan._fix_normal_cancer_names(
                j.vcf_outfile, j.normal_sample, j.cancer_sample)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run ``samtools mpileup`` on each BAM file at a set of positions.

        antecedents is unpacked into three nodes: the BAM folder, the
        reference genome, and a two-column (chrom, pos) positions file.
        One "<sample>.pileup" and one "<sample>.log" file is written to
        out_path for every BAM file found.

        Returns a metadata dict whose "commands" entry lists the shell
        commands that were run.

        Raises AssertionError if no BAM files are found, if a row of
        the positions file does not have exactly two columns, or if
        out_attributes["vartype"] is missing or not recognized.
        """
        import os
        from genomicode import config
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import filelib
        from Betsy import module_utils

        bam_node, ref_node, pos_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Positions file has 0-based coordinates (like BAM files).
        # But samtools requires 1-based coordinates.  Write a
        # converted copy.  The context manager makes sure the handle
        # is closed even if a malformed row trips the assertion.
        positions_filename = "positions.txt"
        with open(positions_filename, 'w') as outhandle:
            for cols in filelib.read_cols(pos_node.identifier):
                assert len(cols) == 2, \
                    "Expected <chrom> <pos>: %r" % (cols,)
                chrom, pos = cols
                pos = int(pos) + 1  # convert from 0- to 1-based coords.
                outhandle.write("%s\t%d\n" % (chrom, pos))

        # One job per BAM file.  The sample name is the BAM file name
        # without its directory or extension.
        jobs = []
        for in_filename in bam_filenames:
            sample = os.path.splitext(os.path.basename(in_filename))[0]
            err_filename = os.path.join(out_path, "%s.log" % sample)
            out_filename = os.path.join(out_path, "%s.pileup" % sample)
            job = filelib.GenericObject(in_filename=in_filename,
                                        err_filename=err_filename,
                                        out_filename=out_filename)
            jobs.append(job)

        # vartype is validated here, but the same mpileup arguments
        # are currently used for every value.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["all", "snp", "indel", "consensus"]

        # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \
        #   $i > $j"
        samtools = filelib.which_assert(config.samtools)

        # Get an error if the BAM files are not indexed.
        # [W::bam_hdr_read] EOF marker is absent. The input is probably
        #   truncated.
        args = [
            "-R",  # Ignore read group tags.
            "-B",  # Disable BAQ (base quality) computation.
            "-q",
            0,  # Skip bases with mapQ smaller than this.
            "-Q",
            0,  # Skip bases with BAQ smaller than this.
            "-d10000000",  # Allow deep reads.
        ]

        sq = parallel.quote
        commands = []
        for j in jobs:
            cmd = [
                sq(samtools),
                "mpileup",
                "-f",
                sq(ref.fasta_file_full),
                "-l",
                sq(positions_filename),
            ]
            cmd.extend(args)
            cmd.append(sq(j.in_filename))
            cmd = " ".join(map(str, cmd))
            # The redirect targets are built from the sample name, just
            # like the (quoted) input file, so quote them too.
            cmd = "%s 2> %s 1> %s" % (
                cmd, sq(j.err_filename), sq(j.out_filename))
            commands.append(cmd)

        parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] = commands

        # File may be empty if there are no reads.
        out_filenames = [j.out_filename for j in jobs]
        filelib.assert_exists_many(out_filenames)

        # Make sure there's no errors in the log files.
        for j in jobs:
            check_log_file(j.err_filename)

        return metadata
Esempio n. 10
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run SomaticSniper on every normal/cancer sample pair.

        antecedents is unpacked into three nodes: the BAM folder, the
        normal/cancer match file, and the reference genome.  One
        "<sample>.vcf" file, named after the cancer BAM file, is
        written to out_path for each pair.

        Returns a metadata dict with "num_cores" (the number of
        processes used) and "commands" (the shell commands run).

        Raises AssertionError if no BAM files are found.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #          vcf_outfile)
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            # Name the output after the cancer BAM file.
            sample = mlib.splitpath(cancer_bamfile)[1]
            vcf_outfile = os.path.join(out_path, "%s.vcf" % sample)
            job = (normal_sample, cancer_sample, normal_bamfile,
                   cancer_bamfile, vcf_outfile)
            jobs.append(job)

        # bam-somaticsniper -q 1 -Q 15 -G -L -F vcf \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   test31/tumor.bam test31/normal.bam test41.vcf
        somaticsniper = mlib.get_config("somaticsniper",
                                        which_assert_file=True)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for job in jobs:
            (normal_sample, cancer_sample, normal_bamfile, cancer_bamfile,
             vcf_outfile) = job

            cmd = [
                sq(somaticsniper),
                "-q",
                1,
                "-Q",
                15,
                "-G",
                "-L",
                "-F",
                "vcf",
                "-f",
                sq(ref.fasta_file_full),
                # The tumor BAM file comes before the normal one.
                sq(cancer_bamfile),
                sq(normal_bamfile),
                sq(vcf_outfile),
            ]
            cmd = " ".join(map(str, cmd))
            commands.append(cmd)
        # Not sure how much RAM this takes.
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure output VCF files exist.  Check this before
        # rewriting the sample names, so that a failed run is reported
        # as a missing output file.
        vcf_outfiles = [job[-1] for job in jobs]
        filelib.assert_exists_many(vcf_outfiles)

        # SomaticSniper names the samples "NORMAL" and "TUMOR".
        # Replace them with the actual names.
        for job in jobs:
            (normal_sample, cancer_sample, normal_bamfile, cancer_bamfile,
             vcf_outfile) = job
            call_somatic_varscan._fix_normal_cancer_names(
                vcf_outfile, normal_sample, cancer_sample)

        return metadata