Code example #1
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        #from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        # TODO: Merge with merge_variants_snp.py.
        #CALLERS = [
        #    "gatk", "platypus", "varscan",
        #    ]
        vcf_paths = [x.identifier for x in antecedents]
        nodes = [x.data for x in antecedents]
        CALLERS = [x.attributes["caller"] for x in nodes]
        assert len(CALLERS) == len(vcf_paths)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # list of (sample, caller, out_vcf_path, in_vcf_file, out_vcf_file)
        jobs = []
        for i, caller in enumerate(CALLERS):
            inpath = vcf_paths[i]
            caller_h = hashlib.hash_var(caller)
            
            vcf_files = filelib.list_files_in_path(
                inpath, endswith=".vcf", toplevel_only=True)
            for file_ in vcf_files:
                # IN_FILE:   <inpath>/<sample>.vcf
                # OUT_FILE:  <out_path>/<caller>.vcf/<sample>.vcf
                p, sample, e = mlib.splitpath(file_)
                assert e == ".vcf"
                out_vcf_path = os.path.join(out_path, "%s.vcf" % caller_h)
                out_vcf_file = os.path.join(out_vcf_path, "%s.vcf" % sample)

                x = filelib.GenericObject(
                    sample=sample, caller=caller,
                    out_vcf_path=out_vcf_path, in_vcf_file=file_,
                    out_vcf_file=out_vcf_file)
                jobs.append(x)
                
        # Make sure the same samples are found in all callers.
        caller2samples = {}
        for j in jobs:
            if j.caller not in caller2samples:
                caller2samples[j.caller] = []
            caller2samples[j.caller].append(j.sample)
        comp_samples = None
        for caller, samples in caller2samples.iteritems():
            samples = sorted(samples)
            if comp_samples is None:
                comp_samples = samples
            assert comp_samples == samples, "%s %s" % (comp_samples, samples)

        for j in jobs:
            filelib.safe_mkdir(j.out_vcf_path)
            os.symlink(j.in_vcf_file, j.out_vcf_file)

        return metadata
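
The same-samples check above is a pattern worth isolating. A minimal standalone sketch of the idea, using only the standard library (the helper name is hypothetical):

    import collections

    def assert_same_samples(jobs):
        # Group sample names by caller, then require that every caller
        # produced a VCF for exactly the same set of samples.
        caller2samples = collections.defaultdict(list)
        for j in jobs:
            caller2samples[j.caller].append(j.sample)
        comp_samples = None
        for caller in sorted(caller2samples):
            samples = sorted(caller2samples[caller])
            if comp_samples is None:
                comp_samples = samples
            assert comp_samples == samples, "%s %s" % (comp_samples, samples)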
Code example #2
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils

        # This is I/O heavy, so don't use too many cores.
        MAX_CORES = 4

        filelib.safe_mkdir(out_path)
        filenames = module_utils.find_fastq_files(in_data.identifier)
        assert filenames, "I could not find any FASTQ files."

        REMOVE = [".gz", ".bz2", ".xz"]

        # Uncompress the files to the new directory in parallel.
        commands = []
        for in_filename in filenames:
            in_path, in_file = os.path.split(in_filename)
            x = in_file
            for r in REMOVE:
                if x.lower().endswith(r):
                    x = x[:-len(r)]
            out_file = x
            out_filename = os.path.join(out_path, out_file)

            args = in_filename, out_filename
            keywds = {}
            x = uncompress_file, args, keywds
            commands.append(x)

        nc = min(MAX_CORES, num_cores)
        parallel.pyfun(commands, num_procs=nc)
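
uncompress_file is not defined in this snippet. A minimal sketch of what it plausibly does, dispatching on the extension to the matching codec (lzma is in the Python 3 standard library; on Python 2 it needs the backports.lzma package):

    import bz2
    import gzip
    import lzma   # Python 3 stdlib; backports.lzma on Python 2.
    import shutil

    def uncompress_file(in_filename, out_filename):
        # Pick a decompressor based on the extension; fall back to a
        # plain copy if the file is not compressed.
        ext2open = {".gz": gzip.open, ".bz2": bz2.BZ2File, ".xz": lzma.open}
        for ext, open_fn in ext2open.items():
            if in_filename.lower().endswith(ext):
                in_fh = open_fn(in_filename, "rb")
                out_fh = open(out_filename, "wb")
                shutil.copyfileobj(in_fh, out_fh)
                out_fh.close()
                in_fh.close()
                return
        shutil.copyfile(in_filename, out_filename)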
Code example #3
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        import shutil
        from genomicode import filelib
        from Betsy import module_utils as mlib
        import cluster_genes_by_hierarchical as clust
        
        filelib.safe_mkdir(out_path)
        metadata = {}

        kmeans_k = mlib.get_user_option(
            user_options, "kmeans_k", not_empty=True, type=int)
        assert kmeans_k >= 2 and kmeans_k < 100

        x = clust.run_cluster30(
            in_data.identifier, "kmeans", user_options, kmeans_k=kmeans_k)
        cmd, cluster_files = x
        metadata["command"] = cmd
        
        opj = os.path.join
        out_cdt_file = opj(out_path, "signal.cdt")
        out_kag_file = opj(out_path, "array_cluster.kag")
        out_kgg_file = opj(out_path, "gene_cluster.kgg")

        assert "cdt" in cluster_files
        shutil.copy2(cluster_files["cdt"], out_cdt_file)
        if "kag" in cluster_files:
            shutil.copy2(cluster_files["kag"], out_kag_file)
        if "kgg" in cluster_files:
            shutil.copy2(cluster_files["kgg"], out_kgg_file)
        
        return metadata
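
The three copy branches above follow one pattern (required "cdt", optional "kag" and "kgg"), so they can be driven by a table. A sketch with the same behavior (the helper name is hypothetical):

    import os
    import shutil

    def copy_cluster_files(cluster_files, out_path):
        # Map cluster30 output keys to canonical filenames; "cdt" is
        # required, the others are copied only if present.
        KEY2FILE = [
            ("cdt", "signal.cdt", True),
            ("kag", "array_cluster.kag", False),
            ("kgg", "gene_cluster.kgg", False),
            ]
        for key, out_file, required in KEY2FILE:
            assert not required or key in cluster_files
            if key in cluster_files:
                shutil.copy2(
                    cluster_files[key], os.path.join(out_path, out_file))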
Code example #4
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils

        bam_filenames = module_utils.find_bam_files(in_data.identifier)
        assert bam_filenames, "No .bam files."
        filelib.safe_mkdir(out_path)

        jobs = []  # list of (in_filename, out_filename)
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            s, ext = os.path.splitext(f)
            out_filename = os.path.join(out_path, "%s.matches.txt" % s)
            x = in_filename, out_filename
            jobs.append(x)

        jobs2 = []  # list of (function, args, keywds)
        for x in jobs:
            in_filename, out_filename = x
            x = summarize_bam_file, (in_filename, out_filename), {}
            jobs2.append(x)

        parallel.pyfun(jobs2, num_procs=num_cores, DELAY=0.1)

        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)
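
summarize_bam_file is defined elsewhere in the module, and the exact contents of the .matches.txt output are not shown. A plausible sketch, assuming it tallies mapped and unmapped reads with pysam:

    import pysam

    def summarize_bam_file(in_filename, out_filename):
        # Count mapped and unmapped reads; until_eof=True also visits
        # reads without coordinates, so no index is required.
        n_mapped = n_unmapped = 0
        bam = pysam.AlignmentFile(in_filename, "rb")
        for read in bam.fetch(until_eof=True):
            if read.is_unmapped:
                n_unmapped += 1
            else:
                n_mapped += 1
        bam.close()
        handle = open(out_filename, "w")
        handle.write("mapped\t%d\n" % n_mapped)
        handle.write("unmapped\t%d\n" % n_unmapped)
        handle.close()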
Code example #5
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        MAX_RAM = 64   # maximum amount of ram to use in Gb.

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        jobs = []  # list of (in_filename, log_filename, out_filename)
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            s, ext = os.path.splitext(f)
            log_filename = os.path.join(out_path, "%s.log" % s)
            out_filename = os.path.join(out_path, f)
            x = in_filename, log_filename, out_filename
            jobs.append(x)
        
        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
        #   -T SplitNCigarReads -R ../hg19.fa -I $i -o $j
        #   -rf ReassignOneMappingQuality -RMQF 255 -RMQT 60
        #   -U ALLOW_N_CIGAR_READS

        # Start with 5 Gb RAM.
        commands = make_commands(jobs, ref.fasta_file_full, 5)
        nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = commands
        metadata["num_procs"] = nc

        # If any of the analyses didn't finish, try again with more
        # RAM.
        jobs2 = []
        for x in jobs:
            in_filename, log_filename, out_filename = x
            if filelib.exists_nz(out_filename):
                continue
            jobs2.append(x)
        if jobs2:
            commands = make_commands(jobs2, ref.fasta_file_full, MAX_RAM)
            nc = mlib.calc_max_procs_from_ram(MAX_RAM, upper_max=num_cores)
            parallel.pshell(commands, max_procs=nc)
            metadata["commands"] += commands
            
        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)

        return metadata
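
make_commands is defined elsewhere in this module; the commented command line above pins down its shape. A sketch reconstructing it under that assumption (the real helper more likely goes through alignlib.make_GATK_command, and the jar path is taken verbatim from the comment):

    from genomicode import parallel

    def make_commands(jobs, ref_fasta, ram_gb):
        # One GATK SplitNCigarReads command per job, with a ram_gb heap.
        sq = parallel.quote
        commands = []
        for in_filename, log_filename, out_filename in jobs:
            x = [
                "java", "-Xmx%dg" % ram_gb,
                "-jar", "/usr/local/bin/GATK/GenomeAnalysisTK.jar",
                "-T", "SplitNCigarReads",
                "-R", sq(ref_fasta),
                "-I", sq(in_filename),
                "-o", sq(out_filename),
                "-rf", "ReassignOneMappingQuality",
                "-RMQF", "255",
                "-RMQT", "60",
                "-U", "ALLOW_N_CIGAR_READS",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, sq(log_filename))
            commands.append(x)
        return commands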
Code example #6
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        bam_path = in_data.identifier
        assert os.path.exists(bam_path)
        assert os.path.isdir(bam_path)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        # Find all the BAM files.
        bam_filenames = filelib.list_files_in_path(
            bam_path, endswith=".bam", case_insensitive=True)

        jobs = []  # list of in_filename, out_filename
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            assert not os.path.exists(out_filename)
            x = in_filename, out_filename
            jobs.append(x)

        # Symlink the BAM files to the output path.
        for x in jobs:
            in_filename, out_filename = x
            os.symlink(in_filename, out_filename)

        # Index each of the files.
        sq = parallel.quote
        samtools = filelib.which_assert(config.samtools)
        commands = []
        for x in jobs:
            in_filename, out_filename = x
            cmd = [
                sq(samtools),
                "index",
                sq(out_filename),
                ]
            x = " ".join(cmd)
            commands.append(x)
        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores, path=out_path)

        # TODO: Check for output files.
        
        return metadata
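
The TODO above can be filled with the same check the other modules in this collection use. A sketch for the end of run, assuming samtools index writes each index next to its BAM as <file>.bam.bai:

        # Make sure the indexes were created.
        x = ["%s.bai" % x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)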
Code example #7
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import filelib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        # list of (in_filename, err_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            sample, ext = os.path.splitext(f)
            err_filename = os.path.join(out_path, "%s.log" % sample)
            out_filename = os.path.join(out_path, "%s.pileup" % sample)
            x = in_filename, err_filename, out_filename
            jobs.append(x)

        # samtools mpileup -f [reference sequence] [BAM file(s)]
        #   > myData.mpileup
        samtools = mlib.findbin("samtools")
        sq = mlib.sq
        commands = []
        for x in jobs:
            in_filename, err_filename, out_filename = x

            x = [
                sq(samtools),
                "mpileup",
                "-f",
                sq(ref.fasta_file_full),
            ]
            x.append(sq(in_filename))
            x = " ".join(map(str, x))
            x = "%s 2> %s 1> %s" % (x, err_filename, out_filename)
            commands.append(x)
        parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        x = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
Code example #8
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents

        #in_filenames = filelib.list_files_in_path(
        #    bam_node.identifier, endswith=".bam", case_insensitive=True)
        in_filenames = module_utils.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # java -Xmx5g -jar /usr/local/bin/picard/picard.jar ReorderSam \
        #   I=<input.bam> O=<output.bam> REFERENCE=ucsc.hg19.fasta
        picard_jar = alignlib.find_picard_jar("picard")

        jobs = []  # list of (in_filename, out_filename)
        for in_filename in in_filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            x = in_filename, out_filename
            jobs.append(x)

        # Make a list of commands.
        sq = parallel.quote
        commands = []
        for x in jobs:
            in_filename, out_filename = x

            x = [
                "java",
                "-Xmx5g",
                "-jar",
                sq(picard_jar),
                "ReorderSam",
                "I=%s" % sq(in_filename),
                "O=%s" % sq(out_filename),
                "REFERENCE=%s" % sq(ref.fasta_file_full),
            ]
            x = " ".join(x)
            commands.append(x)

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        for x in jobs:
            in_filename, out_filename = x
            filelib.assert_exists_nz(out_filename)
Code example #9
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

        # Make a list of the jobs to run.
        jobs = []
        for x in fastq_files:
            sample, pair1, pair2 = x
            sam_filename = os.path.join(out_path, "%s.sam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = sample, pair1, pair2, sam_filename, log_filename
            jobs.append(x)

        sq = mlib.sq
        commands = []
        for x in jobs:
            sample, pair1, pair2, sam_filename, log_filename = x
            nc = max(1, num_cores / len(jobs))
            x = alignlib.make_bowtie2_command(ref.fasta_file_full,
                                              pair1,
                                              fastq_file2=pair2,
                                              sam_file=sam_filename,
                                              num_threads=nc)
            x = "%s >& %s" % (x, sq(log_filename))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x = [x[-2] for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
Code example #10
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib

        # If align_with_star is run with two_pass=yes, this will leave
        # two BAM files for every sample.
        # p1.<sample>.Aligned.out.bam    pass 1
        # <sample>.Aligned.out.bam       pass 2
        # Make sure to ignore the pass1 files.
        x = filelib.list_files_in_path(
            in_data.identifier, endswith=".Aligned.out.bam",
            file_not_startswith="p1.")
        bam_filenames = x
        if not bam_filenames:
            x = filelib.list_files_in_path(
                in_data.identifier, endswith=".Aligned.out.sam")
            sam_filenames = x
            if sam_filenames:
                raise AssertionError(
                    "No .Aligned.out.bam files.  Looks like .sam generated.")
            assert bam_filenames, "No .Aligned.out.bam files."
        filelib.safe_mkdir(out_path)

        jobs = []  # list of (in_filename, out_filename)
        for in_filename in bam_filenames:
            # in_filename has format:
            # <path>/<sample>.Aligned.out.bam
            path, f = os.path.split(in_filename)
            sample, x = f.split(".", 1)
            assert x == "Aligned.out.bam", f
            out_filename = os.path.join(out_path, "%s.bam" % sample)
            assert in_filename != out_filename
            jobs.append((in_filename, out_filename))

        # Make sure outfiles are unique.
        x = [x[-1] for x in jobs]
        x = {}.fromkeys(x)
        assert len(jobs) == len(x), "Duplicate sample names."

        for x in jobs:
            in_filename, out_filename = x
            os.symlink(in_filename, out_filename)

        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)
Code example #11
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        sam_filenames = mlib.find_sam_files(in_data.identifier)
        assert sam_filenames, "No .sam files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        samtools = mlib.findbin("samtools")

        jobs = []  # list of (sam_filename, bam_filename)
        for sam_filename in sam_filenames:
            p, f = os.path.split(sam_filename)
            assert f.endswith(".sam")
            f = f[:-len(".sam")] + ".bam"
            bam_filename = os.path.join(out_path, f)
            x = sam_filename, bam_filename
            jobs.append(x)

        # Make a list of samtools commands.
        sq = parallel.quote
        commands = []
        for x in jobs:
            sam_filename, bam_filename = x

            # samtools view -bS -o <bam_filename> <sam_filename>
            x = [
                sq(samtools),
                "view",
                "-bS",
                "-o",
                sq(bam_filename),
                sq(sam_filename),
            ]
            x = " ".join(x)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)
        return metadata
Code example #12
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        vcf_node = in_data
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        filelib.safe_mkdir(out_path)

        buildver = module_utils.get_user_option(user_options,
                                                "buildver",
                                                allowed_values=["hg19"],
                                                not_empty=True)

        jobs = []  # list of (in_filename, log_filename, out_filestem)
        for in_filename in vcf_filenames:
            # Annovar takes a filestem, without the ".vcf".
            p, f = os.path.split(in_filename)
            f, ext = os.path.splitext(f)
            log_filename = os.path.join(out_path, "%s.log" % f)
            out_filestem = os.path.join(out_path, f)
            x = in_filename, log_filename, out_filestem
            jobs.append(x)

        # Make a list of commands.
        commands = []
        for x in jobs:
            in_filename, log_filename, out_filestem = x

            x = alignlib.make_annovar_command(in_filename, log_filename,
                                              out_filestem, buildver)
            commands.append(x)

        #for x in commands:
        #    print x
        #import sys; sys.exit(0)

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x = [x[-1] for x in jobs]  # out_filestems
        x = ["%s.%s_multianno.vcf" % (x, buildver) for x in x]
        filelib.assert_exists_nz_many(x)
Code example #13
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from Betsy import module_utils as mlib
        import merge_vcf_folder

        vcffolders_node = antecedents
        filelib.safe_mkdir(out_path)
        metadata = {}

        x = os.listdir(vcffolders_node.identifier)
        x = [x for x in x if x.endswith(".vcf")]
        assert x, "No VCF folders found: %s" % vcffolders_node.identifier
        x = [os.path.join(vcffolders_node.identifier, x) for x in x]
        vcf_folders = x

        jobs = []
        for folder in vcf_folders:
            path, root, ext = mlib.splitpath(folder)
            assert ext == ".vcf"
            caller = root
            vcf_filenames = filelib.list_files_in_path(folder,
                                                       endswith=".vcf",
                                                       toplevel_only=True)
            assert vcf_filenames, "No .vcf files: %s" % folder
            out_filename = os.path.join(out_path, "%s.vcf" % root)
            tmp_path = "%s.indexed.vcf" % caller
            x = filelib.GenericObject(caller=caller,
                                      vcf_filenames=vcf_filenames,
                                      out_filename=out_filename,
                                      tmp_path=tmp_path)
            jobs.append(x)

        for j in jobs:
            m = merge_vcf_folder.merge_vcf_files(j.vcf_filenames,
                                                 j.out_filename, num_cores,
                                                 j.tmp_path)
            if "commands" not in metadata:
                metadata["commands"] = []
            metadata["commands"].extend(m["commands"])

        x = [x.out_filename for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
Code example #14
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib

        (
            bam_node,
            fastqc_summary1_node,
            fastqc_folder1_node,
            fastqc_summary2_node,
            fastqc_folder2_node,
            rseqc_node,
            signal1_node,  # TPM
            signal2_node,  # TPM, isoform
            aligned_reads_node,
            signal3_node,  # count
            htseq_reads_node) = antecedents
        filelib.safe_mkdir(out_path)

        FILES = [
            (bam_node.identifier, False, "alignment.bam"),
            (fastqc_summary1_node.identifier, True, "fastqc.no_trim.xls"),
            (fastqc_folder1_node.identifier, False, "fastqc.no_trim"),
            (fastqc_summary2_node.identifier, True, "fastqc.trim.xls"),
            (fastqc_folder2_node.identifier, False, "fastqc.trim"),
            (rseqc_node.identifier, False, "RSeQC"),
            (signal1_node.identifier, True, "expression.gene.tpm"),
            (signal2_node.identifier, True, "expression.isoform.tpm"),
            (aligned_reads_node.identifier, True, "aligned.xls"),
            (signal3_node.identifier, True, "expression.counts"),
            (htseq_reads_node.identifier, True, "mapped.htseq.txt"),
        ]

        for x in FILES:
            orig_filename, is_file, new_file = x
            new_filename = os.path.join(out_path, new_file)

            # Copy or link the data into the right place.
            if is_file:
                filelib.assert_exists_nz(orig_filename)
            else:
                assert filelib.dir_exists(orig_filename), \
                       "Directory not found or not directory: %s" % \
                       orig_filename
            os.symlink(orig_filename, new_filename)
Code example #15
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from Betsy import module_utils

        align_node = in_data
        x = module_utils.find_bam_files(align_node.identifier)
        x = [x for x in x if x.endswith("accepted_hits.bam")]
        bam_filenames = x
        assert bam_filenames, "No accepted_hits.bam files."
        filelib.safe_mkdir(out_path)

        jobs = []  # list of (in_filename, out_filename)
        for in_filename in bam_filenames:
            # Names must be in the format:
            # <path>/<sample>.tophat/accepted_hits.bam
            # full_path   <path>/<sample>.tophat
            # path        <path>
            # tophat_dir  <sample>.tophat
            # file_       accepted_hits.bam
            # sample      <sample>

            full_path, file_ = os.path.split(in_filename)
            path, tophat_dir = os.path.split(full_path)

            assert file_ == "accepted_hits.bam"
            assert tophat_dir.endswith(".tophat")
            sample = tophat_dir[:-7]
            out_filename = os.path.join(out_path, "%s.bam" % sample)
            assert in_filename != out_filename
            jobs.append((in_filename, out_filename))

        # Make sure outfiles are unique.
        x = [x[-1] for x in jobs]
        x = {}.fromkeys(x)
        assert len(jobs) == len(x), "Duplicate sample names."

        for x in jobs:
            in_filename, out_filename = x
            os.symlink(in_filename, out_filename)

        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)
Code example #16
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib

        ref_node, gene_node = antecedents
        ref = alignlib.standardize_reference_genome(ref_node.identifier,
                                                    out_path,
                                                    use_symlinks=True)
        filelib.safe_mkdir(out_path)

        x = alignlib.make_STAR_index_command(ref.fasta_file_full,
                                             out_path,
                                             gtf_file=gene_node.identifier,
                                             num_cores=num_cores)
        x = "%s >& out.txt" % x
        parallel.sshell(x, path=out_path)

        # Check to make sure index was created successfully.
        alignlib.assert_is_STAR_reference(out_path)
Code example #17
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel

        vcf_node = in_data
        vcf_files = filelib.list_files_in_path(vcf_node.identifier,
                                               endswith=".vcf",
                                               case_insensitive=True)
        filelib.safe_mkdir(out_path)
        metadata = {}

        jobs = []  # in_vcf_filename, out_vcf_filename
        for vcf_file in vcf_files:
            path, file_ = os.path.split(vcf_file)
            out_vcf_file = os.path.join(out_path, file_)
            x = vcf_file, out_vcf_file
            jobs.append(x)

        # Figure out whether the user wants SNPs or INDELs.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["all", "snp", "indel"]

        # Generate the commands.
        commands = []
        for x in jobs:
            in_vcf_file, out_vcf_file = x

            args = vartype, in_vcf_file, out_vcf_file
            x = filter_by_vartype, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        x = [x[-1] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
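
filter_by_vartype is defined elsewhere in the module. A minimal sketch of one plausible implementation, calling a record a SNP when REF and every ALT allele are single bases (a simplification; real VCFs also have symbolic alleles and other edge cases):

    def filter_by_vartype(vartype, in_vcf_file, out_vcf_file):
        # Pass header lines through; keep a variant line only if it
        # matches the requested type.  "all" keeps everything.
        assert vartype in ["all", "snp", "indel"]
        out = open(out_vcf_file, "w")
        for line in open(in_vcf_file):
            if not line.startswith("#"):
                cols = line.rstrip("\r\n").split("\t")
                alleles = [cols[3]] + cols[4].split(",")
                is_snp = max(len(a) for a in alleles) == 1
                if vartype == "snp" and not is_snp:
                    continue
                if vartype == "indel" and is_snp:
                    continue
            out.write(line)
        out.close()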
Code example #18
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        import filter_variants_GATK

        vcf_node = in_data
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   not_empty=True)
        assert vcf_filenames, "No VCF files found."
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Figure out whether the user wants SNPs or INDELs.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]
        metadata["filter"] = vartype

        jobs = []  # list of filelib.GenericObject
        for in_filename in vcf_filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            x = filelib.GenericObject(in_filename=in_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        # Filter each of the VCF files.
        jobs2 = []
        for j in jobs:
            args = vartype, j.in_filename, j.out_filename
            x = filter_variants_GATK.filter_by_vartype, args, {}
            jobs2.append(x)
        parallel.pyfun(jobs2, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        return metadata
Code example #19
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        # This is I/O heavy, so don't use too many cores.
        MAX_CORES = 2

        filenames = mlib.find_fastq_files(in_data.identifier)
        assert filenames, "I could not find any FASTQ files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        num_samples = mlib.get_user_option(user_options,
                                           "num_samples",
                                           not_empty=True,
                                           type=int)
        metadata["num_samples"] = num_samples

        jobs = []
        for in_filename in filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            x = in_filename, out_filename
            jobs.append(x)

        cmds = []
        for x in jobs:
            in_filename, out_filename = x
            x = copy_fastq_file, (in_filename, out_filename, num_samples), {}
            cmds.append(x)

        nc = min(MAX_CORES, num_cores)
        metadata["num cores"] = nc
        parallel.pyfun(cmds, num_procs=nc)

        return metadata
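
copy_fastq_file is defined elsewhere; given the num_samples user option, it presumably keeps only the first num_samples reads. A sketch under that assumption, for plain-text FASTQ with 4 lines per record:

    def copy_fastq_file(in_filename, out_filename, num_samples):
        # Copy the first num_samples FASTQ records (4 lines each).
        in_fh = open(in_filename)
        out_fh = open(out_filename, "w")
        for i, line in enumerate(in_fh):
            if i >= num_samples * 4:
                break
            out_fh.write(line)
        out_fh.close()
        in_fh.close()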
Code example #20
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        import shutil
        from genomicode import filelib
        from genomicode import cluster30
        from Betsy import module_utils as mlib

        filelib.safe_mkdir(out_path)
        metadata = {}

        LINKAGES = cluster30.METHOD2ID.keys()
        linkage = mlib.get_user_option(user_options,
                                       "linkage",
                                       not_empty=True,
                                       allowed_values=LINKAGES)

        x = run_cluster30(in_data.identifier,
                          "hierarchical",
                          user_options,
                          method=linkage)
        cmd, cluster_files = x
        metadata["command"] = cmd

        opj = os.path.join
        out_cdt_file = opj(out_path, "signal.cdt")
        out_atr_file = opj(out_path, "array_tree.atr")
        out_gtr_file = opj(out_path, "gene_tree.gtr")

        assert "cdt" in cluster_files
        shutil.copy2(cluster_files["cdt"], out_cdt_file)
        if "atr" in cluster_files:
            shutil.copy2(cluster_files["atr"], out_atr_file)
        if "gtr" in cluster_files:
            shutil.copy2(cluster_files["gtr"], out_gtr_file)

        return metadata
Code example #21
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        filenames = mlib.find_fastq_files(in_data.identifier)
        assert filenames, "FASTQ files not found: %s" % in_data.identifier
        filelib.safe_mkdir(out_path)
        metadata = {}

        fastqc = mlib.findbin("fastqc")
        fastqc_q = parallel.quote(fastqc)

        commands = [
            "%s --outdir=%s --extract %s" % (fastqc_q, out_path, x)
            for x in filenames
        ]
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        #commands = ["ls > %s" % x for x in filenames]
        parallel.pshell(commands, max_procs=num_cores)

        # Fastqc generates files:
        # <file>_fastqc/
        # <file>_fastqc.zip
        # The contents of the .zip file are identical to the directory,
        # so delete the redundant .zip files.
        files = os.listdir(out_path)
        filenames = [os.path.join(out_path, x) for x in files]
        for filename in filenames:
            zip_filename = "%s.zip" % filename
            if os.path.exists(zip_filename):
                os.unlink(zip_filename)
Code example #22
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # java -jar picard.jar CollectAlignmentSummaryMetrics \
        #   R=reference_sequence.fasta \
        #   I=input.bam \
        #   O=output.txt
        opj = os.path.join
        jobs = []   # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            # <in_path>/<sample>.bam
            in_path, sample, ext = mlib.splitpath(bam_filename)
            assert ext == ".bam"
            out_filename = opj(out_path, "%s.alignment_metrics.txt" % sample)
            log_filename = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(
                sample=sample,
                bam_filename=bam_filename,
                out_filename=out_filename,
                log_filename=log_filename)
            jobs.append(x)

        # Make the commands to run picard.
        picard_jar = alignlib.find_picard_jar("picard")
        sq = parallel.quote
        commands = []
        for j in jobs:
            # Should have better way of getting java path.
            cmd = [
                "java",
                "-Xmx10g",
                "-jar", sq(picard_jar), "CollectAlignmentSummaryMetrics",
                "I=%s" % sq(j.bam_filename),
                "R=%s" % sq(ref.fasta_file_full),
                "O=%s" % sq(j.out_filename),
                ]
            cmd = " ".join(cmd)
            cmd = "%s >& %s" % (cmd, sq(j.log_filename))
            commands.append(cmd)

        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores)
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Summarize the alignment summary metrics files.
        outfile = opj(out_path, "summary.txt")
        _summarize_alignment_summary_metrics(jobs, outfile)
        filelib.assert_exists_nz(outfile)

        return metadata
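
_summarize_alignment_summary_metrics is defined elsewhere in the module. A sketch of one way to merge the per-sample picard output into a single table, assuming the standard picard metrics layout (comment lines start with "#", the header row starts with "CATEGORY"):

    def _summarize_alignment_summary_metrics(jobs, outfile):
        # Merge each sample's CollectAlignmentSummaryMetrics output into
        # one tab-delimited table, one row per sample and category.
        header = None
        rows = []
        for j in jobs:
            for line in open(j.out_filename):
                line = line.rstrip("\r\n")
                if not line or line.startswith("#"):
                    continue
                if line.startswith("CATEGORY"):
                    if header is None:
                        header = ["SAMPLE"] + line.split("\t")
                    continue
                rows.append([j.sample] + line.split("\t"))
        assert header, "No metrics parsed."
        handle = open(outfile, "w")
        handle.write("\t".join(header) + "\n")
        for row in rows:
            handle.write("\t".join(row) + "\n")
        handle.close()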
Code example #23
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_folder, sample_node, gene_node, strand_node = antecedents
        bam_path = bam_folder.identifier
        assert filelib.dir_exists(bam_path)
        gtf_file = gene_node.identifier
        filelib.assert_exists_nz(gtf_file)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}

        attr2order = {
            "name": "name",
            "coordinate": "pos",
        }
        x = bam_folder.data.attributes["sorted"]
        sort_order = attr2order.get(x)
        assert sort_order, "Cannot handle sorted: %s" % x

        #attr2stranded = {
        #    "single" : "no",
        #    "paired" : "no",
        #    "paired_ff" : None,
        #    "paired_fr" : "yes",
        #    "paired_rf" : "reverse",
        #    }
        #x = sample_node.data.attributes["orientation"]
        #stranded = attr2stranded.get(x)
        #assert stranded, "Cannot handle orientation: %s" % x

        ht_stranded = None
        if stranded.stranded == "unstranded":
            ht_stranded = "no"
        elif stranded.stranded == "firststrand":
            ht_stranded = "reverse"
        elif stranded.stranded == "secondstrand":
            ht_stranded = "yes"
        assert ht_stranded is not None

        #gtf_file = mlib.get_user_option(
        #    user_options, "gtf_file", not_empty=True)
        #assert os.path.exists(gtf_file), "File not found: %s" % gtf_file

        mode = mlib.get_user_option(user_options,
                                    "htseq_count_mode",
                                    allowed_values=[
                                        "union", "intersection-strict",
                                        "intersection-nonempty"
                                    ])

        # Make a list of the jobs to run.
        jobs = []
        for bam_filename in filelib.list_files_in_path(bam_path,
                                                       endswith=".bam",
                                                       case_insensitive=True):
            x = os.path.split(bam_filename)[1]
            x = os.path.splitext(x)[0]
            x = "%s.count" % x
            out_file = x
            x = bam_filename, out_file
            jobs.append(x)

        # Generate commands for each of the files.
        sq = parallel.quote
        commands = []
        for x in jobs:
            bam_filename, out_file = x
            x = alignlib.make_htseq_count_command(bam_filename,
                                                  gtf_file,
                                                  sort_order,
                                                  ht_stranded,
                                                  mode=mode)
            x = "%s >& %s" % (x, sq(out_file))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores, path=out_path)

        # Make sure the analysis completed successfully.
        x = [x[1] for x in jobs]
        x = [os.path.join(out_path, x) for x in x]
        output_filenames = x
        filelib.assert_exists_nz_many(output_filenames)

        return metadata
Code example #24
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        in_filenames = mlib.find_bam_files(bam_node.identifier)
        assert in_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        jobs = []  # list of (in_filename, log_filename, out_filename)
        for in_filename in in_filenames:
            p, f = os.path.split(in_filename)
            f, ext = os.path.splitext(f)
            log_filename = os.path.join(out_path, "%s.log" % f)
            out_filename = os.path.join(out_path, "%s.intervals" % f)
            x = in_filename, log_filename, out_filename
            jobs.append(x)

        filter_reads_with_N_cigar = mlib.get_user_option(
            user_options,
            "filter_reads_with_N_cigar",
            allowed_values=["no", "yes"])

        known_sites = []
        x1 = mlib.get_user_option(user_options,
                                  "realign_known_sites1",
                                  check_file=True)
        x2 = mlib.get_user_option(user_options,
                                  "realign_known_sites2",
                                  check_file=True)
        x3 = mlib.get_user_option(user_options,
                                  "realign_known_sites3",
                                  check_file=True)
        x = [x1, x2, x3]
        x = [x for x in x if x]
        known_sites = x
        assert known_sites

        # I/O bound, so not likely to get a big speedup with nt.

        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar -nt 4
        #   -T RealignerTargetCreator -R ../genome.idx/erdman.fa -I $i -o $j
        #   --known <known_vcf_file>

        # RealignerTargetCreator takes ~10Gb per process.  Each thread
        # takes the full amount of memory.
        nc = mlib.calc_max_procs_from_ram(12, upper_max=num_cores)

        # Make a list of commands.
        commands = []
        for x in jobs:
            in_filename, log_filename, out_filename = x

            n = max(1, nc / len(jobs))
            x = [("-known", x) for x in known_sites]
            if filter_reads_with_N_cigar == "yes":
                x.append(("-filter_reads_with_N_cigar", None))
            x = alignlib.make_GATK_command(nt=n,
                                           T="RealignerTargetCreator",
                                           R=ref.fasta_file_full,
                                           I=in_filename,
                                           o=out_filename,
                                           _UNHASHABLE=x)
            x = "%s >& %s" % (x, log_filename)
            commands.append(x)

        parallel.pshell(commands, max_procs=nc)
        metadata["num_procs"] = nc
        metadata["commands"] = commands

        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)
        return metadata
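
calc_max_procs_from_ram shows up here and in several other modules; it presumably divides available RAM by the per-process requirement and caps the result at upper_max. A sketch of that contract (the sysconf names are Linux-specific; the real Betsy helper may consult configured limits instead):

    import os

    def calc_max_procs_from_ram(gb_per_proc, upper_max=None):
        # How many processes fit in physical RAM if each one needs
        # gb_per_proc Gb, optionally capped at upper_max.
        total_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
        total_gb = total_bytes / float(1024 ** 3)
        nc = max(1, int(total_gb / gb_per_proc))
        if upper_max is not None:
            nc = min(nc, upper_max)
        return nc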
Code example #25
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out MuTect version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        x, x, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"]

        cosmic_file = mlib.get_user_option(
            user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
        dbsnp_file = mlib.get_user_option(
            user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile,
        #   cancer_bamfile, call_outfile, cov_outfile, raw_vcf_outfile,
        #   vcf_outfile, log_outfile)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            call_outfile = opj(out_path, "%s.call_stats.out" % sample)
            cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile
            jobs.append(x)

        # java -Xmx2g -jar muTect.jar
        #   --analysis_type MuTect
        #   --reference_sequence <reference>
        #   --cosmic <cosmic.vcf>
        #   --dbsnp <dbsnp.vcf>
        #   --intervals <intervals_to_process>
        #   --input_file:normal <normal.bam>
        #   --input_file:tumor <tumor.bam>
        #   --out <call_stats.out>
        #   --coverage_file <coverage.wig.txt>

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x

            UNHASHABLE = [
                ("input_file:normal", sq(normal_bamfile)),
                ("input_file:tumor", sq(cancer_bamfile)),
                ]
            x = alignlib.make_MuTect_command(
                analysis_type="MuTect",
                reference_sequence=sq(ref.fasta_file_full),
                cosmic=sq(cosmic_file),
                dbsnp=sq(dbsnp_file),
                intervals=sq(interval_node.identifier),
                out=sq(call_outfile),
                coverage_file=sq(cov_outfile),
                vcf=sq(raw_vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
                )
            x = "%s >& %s" % (x, log_outfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            # Pull out the error lines.
            x = [x for x in open(log_outfile)]
            x = [x for x in x if x.startswith("##### ERROR")]
            x = "".join(x)
            msg = "MuTect error [%s]:\n%s\n%s" % (
                cancer_sample, commands[i], x)
            assert not x, msg

        # Make sure the raw output VCF files exist.
        x = [x[6] for x in jobs]  # raw_vcf_outfile
        filelib.assert_exists_many(x)

        # Fix the files.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            alignlib.clean_mutect_vcf(
                normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
                raw_vcf_outfile, vcf_outfile)
            
        return metadata
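
The nine-way tuple unpacking above is repeated several times and is easy to get out of sync. Other modules in this collection (e.g. examples #18 and #22) use filelib.GenericObject for job records instead; a sketch of the same jobs built that way (the function name is hypothetical):

    import os
    from genomicode import filelib
    from Betsy import module_utils as mlib

    def make_mutect_jobs(nc_match, sample2bamfile, out_path):
        # Build the same job records as above, but with named fields so
        # that adding a field cannot silently shift the others.
        opj = os.path.join
        jobs = []
        for normal_sample, cancer_sample in nc_match:
            cancer_bamfile = sample2bamfile[cancer_sample]
            p, sample, e = mlib.splitpath(cancer_bamfile)
            jobs.append(filelib.GenericObject(
                normal_sample=normal_sample,
                cancer_sample=cancer_sample,
                normal_bamfile=sample2bamfile[normal_sample],
                cancer_bamfile=cancer_bamfile,
                call_outfile=opj(out_path, "%s.call_stats.out" % sample),
                cov_outfile=opj(out_path, "%s.coverage.wig.txt" % sample),
                raw_vcf_outfile=opj(out_path, "%s.vcf.raw" % sample),
                vcf_outfile=opj(out_path, "%s.vcf" % sample),
                log_outfile=opj(out_path, "%s.log" % sample)))
        return jobs

Attribute access (j.vcf_outfile) then replaces the positional unpacking in each downstream loop.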
Code example #26
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        # For debugging.
        RUN_VARIANT_CALLING = True
        FILTER_CALLS = True
        MERGE_CALLS = True
        FIX_VCF_FILES = True

        dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
        dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
        assert dna_bam_filenames, "No DNA .bam files."
        rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
        assert rna_bam_filenames, "No RNA .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

        ## Make sure the BAM files do not contain spaces in the
        ## filenames.  Radia doesn't work well with spaces.
        #filenames = dna_bam_filenames + rna_bam_filenames
        #has_spaces = []
        #for filename in filenames:
        #    if filename.find(" ") >= 0:
        #        has_spaces.append(filename)
        #x = has_spaces
        #if len(x) > 5:
        #    x = x[:5] + ["..."]
        #x = ", ".join(x)
        #msg = "Radia breaks if there are spaces in filenames: %s" % x
        #assert not has_spaces, msg

        # sample -> bam filename
        dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
        rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
        # Make sure files exist for all the samples.  The DNA-Seq
        # should have both normal and cancer.  RNA is not needed for
        # the normal sample.
        mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
        mlib.assert_normal_cancer_samples(nc_match,
                                          rnasample2bamfile,
                                          ignore_normal_sample=True)

        # Make sure Radia and snpEff are configured.
        radia_genome_assembly = mlib.get_user_option(user_options,
                                                     "radia_genome_assembly",
                                                     not_empty=True)
        assert radia_genome_assembly == "hg19", "Only hg19 handled."
        snp_eff_genome = mlib.get_user_option(user_options,
                                              "snp_eff_genome",
                                              not_empty=True)

        radia_path = mlib.get_config("radia_path", assert_exists=True)
        snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
        radia_files = get_radia_files(radia_path, radia_genome_assembly)

        # Make a list of the chromosomes to use.  Use only the
        # chromosomes that are present in all of the BAM files.
        all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
        chroms = list_common_chromosomes(all_bamfiles)
        assert chroms, "No chromosomes found in all files."
        # Only use the chromosomes that can be filtered by Radia.
        chroms = filter_radia_chromosomes(chroms, radia_files)

        # Make output directories.
        radia_outpath = "radia1.tmp"
        filter_outpath = "radia2.tmp"
        merge_outpath = "radia3.tmp"

        if not os.path.exists(radia_outpath):
            os.mkdir(radia_outpath)
        if not os.path.exists(filter_outpath):
            os.mkdir(filter_outpath)
        if not os.path.exists(merge_outpath):
            os.mkdir(merge_outpath)

        # Steps:
        # 1.  Call variants (radia.py)
        #     -o <file.vcf>
        # 2.  Filter variants (filterRadia.py)
        #     <outpath>
        #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
        # 3.  Merge (mergeChroms.py)
        #     Takes as input: <filter_outpath>
        #     Produces: <merge_outpath>/<patient_id>.vcf

        # list of (normal_sample, cancer_sample, chrom,
        #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
        #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
        #   final_vcf_outfile,
        #   radia_logfile, filter_logfile, merge_logfile)
        opj = os.path.join
        jobs = []
        for i, (normal_sample, cancer_sample) in enumerate(nc_match):
            normal_bamfile = dnasample2bamfile[normal_sample]
            dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
            rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

            merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
            merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
            final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

            for chrom in chroms:
                radia_vcf_outfile = opj(
                    radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                filter_vcf_outfile = opj(
                    filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                radia_logfile = opj(radia_outpath,
                                    "%s_chr%s.log" % (cancer_sample, chrom))
                filter_logfile = opj(filter_outpath,
                                     "%s_chr%s.log" % (cancer_sample, chrom))
                x = normal_sample, cancer_sample, chrom, \
                    normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                    radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                    final_vcf_outfile, \
                    radia_logfile, filter_logfile, merge_logfile
                jobs.append(x)

        # Since Radia doesn't work well if there are spaces in the
        # filenames, symlink these files here to guarantee that there
        # are no spaces.
        normal_path = "normal.bam"
        dna_path = "dna.bam"
        rna_path = "rna.bam"
        if not os.path.exists(normal_path):
            os.mkdir(normal_path)
        if not os.path.exists(dna_path):
            os.mkdir(dna_path)
        if not os.path.exists(rna_path):
            os.mkdir(rna_path)
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
            x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
            x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
            clean_normal, clean_dna, clean_rna = x1, x2, x3
            x = normal_sample, cancer_sample, chrom, \
                clean_normal, clean_dna, clean_rna, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs[i] = x

        # Generate the commands for doing variant calling.
        python = mlib.get_config("python", which_assert_file=True)

        # filterRadia.py calls the "blat" command, and there's no way
        # to set the path.  Make sure "blat" is executable.
        if not filelib.which("blat"):
            # Find "blat" in the configuration and add it to the path.
            x = mlib.get_config("blat", which_assert_file=True)
            path, x = os.path.split(x)
            if os.environ.get("PATH"):
                path = "%s:%s" % (os.environ["PATH"], path)
            os.environ["PATH"] = path
            # Make sure it's findable now.
            filelib.which_assert("blat")

        # STEP 1.  Call variants with radia.py.
        # python radia.py test31 5 \
        #   -n bam04/PIM001_G.bam \
        #   -t bam04/196B-MG.bam \
        #   -r bam34/196B-MG.bam \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   -o test32.vcf \
        #   --dnaTumorMitochon MT \
        #   --rnaTumorMitochon MT
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.radia_py),
                cancer_sample,
                chrom,
                "-n",
                sq(normal_bamfile),
                "-t",
                sq(dna_tumor_bamfile),
                "-r",
                sq(rna_tumor_bamfile),
                "-f",
                sq(ref.fasta_file_full),
                "-o",
                radia_vcf_outfile,
            ]
            if "MT" in chroms:
                x += [
                    "--dnaNormalMitochon MT",
                    "--dnaTumorMitochon MT",
                    "--rnaTumorMitochon MT",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, radia_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Each process only uses ~200 Mb of RAM.
        if RUN_VARIANT_CALLING:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure the radia.py log files are empty (i.e. no errors).
        logfiles = [x[10] for x in jobs]  # x[10] is radia_logfile.
        filelib.assert_exists_z_many(logfiles)

        # STEP 2.  Filter variants with filterRadia.py.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.filterRadia_py),
                cancer_sample,
                chrom,
                sq(radia_vcf_outfile),
                sq(filter_outpath),
                sq(radia_files.scripts_dir),
                "-b",
                sq(radia_files.blacklist_dir),
                "-d",
                sq(radia_files.snp_dir),
                "-r",
                sq(radia_files.retro_dir),
                "-p",
                sq(radia_files.pseudo_dir),
                "-c",
                sq(radia_files.cosmic_dir),
                "-t",
                sq(radia_files.target_dir),
                "-s",
                sq(snp_eff_path),
                "-e",
                snp_eff_genome,
                "--rnaGeneBlckFile",
                sq(radia_files.rnageneblck_file),
                "--rnaGeneFamilyBlckFile",
                sq(radia_files.rnagenefamilyblck_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, filter_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        # Sometimes samtools crashes in the middle of a run.  Detect
        # this case, and re-run the analysis if needed.
        py_commands = []
        for x, cmd in zip(jobs, commands):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = cmd, cancer_sample, chrom, filter_logfile
            x = _run_filterRadia_with_restart, args, {}
            py_commands.append(x)
        # Each job takes ~10 Gb of RAM; budget 25 Gb per process to be
        # safe.
        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        if FILTER_CALLS:
            parallel.pyfun(py_commands, num_procs=nc)
        metadata["commands"] += commands

        # Make sure the filterRadia.py log files are empty (i.e. no
        # errors).
        logfiles = [x[11] for x in jobs]  # x[11] is filter_logfile.
        filelib.assert_exists_z_many(logfiles)

        # Make sure each filter_vcf_outfile exists and is non-empty.
        outfiles = [x[7] for x in jobs]  # x[7] is filter_vcf_outfile.
        filelib.assert_exists_nz_many(outfiles)

        # STEP 3.  Merge the results.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
            #   radia2.tmp/ radia3.tmp
            # The "/" after radia2.tmp is important.  If not given,
            # will generate some files with only newlines.

            fo = filter_outpath
            if not fo.endswith("/"):
                fo = "%s/" % fo
            x = [
                sq(python),
                sq(radia_files.mergeChroms_py),
                cancer_sample,
                fo,
                merge_outpath,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, merge_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Since the chromosomes were separated for the previous steps,
        # this will generate one merge for each chromosome.  This is
        # unnecessary, since we only need to merge once per sample.
        # Get rid of duplicates.
        commands = sorted({}.fromkeys(commands))  # Python 2 de-dup idiom.
        if MERGE_CALLS:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] += commands

        # Make sure the mergeChroms.py log files are empty (i.e. no
        # errors).
        logfiles = [x[12] for x in jobs]  # x[12] is merge_logfile.
        logfiles = sorted({}.fromkeys(logfiles))
        filelib.assert_exists_z_many(logfiles)

        # Fix the VCF files.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = normal_sample, cancer_sample, \
                   merge_vcf_outfile, final_vcf_outfile
            x = alignlib.clean_radia_vcf, args, {}
            commands.append(x)
        if FIX_VCF_FILES:
            parallel.pyfun(commands, num_procs=num_cores)

        # Make sure the final VCF files exist and are non-empty.
        x = [x[9] for x in jobs]  # x[9] is final_vcf_outfile.
        filelib.assert_exists_nz_many(x)

        return metadata
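
# _run_filterRadia_with_restart is referenced above but not included in
# this snippet.  A minimal sketch of the idea, not the actual
# implementation: re-run the filter command a few times if the log file
# indicates the run did not finish cleanly (e.g. samtools crashed).
# The retry policy and the use of subprocess are assumptions.
def _run_filterRadia_with_restart(cmd, cancer_sample, chrom,
                                  filter_logfile, max_tries=3):
    import os
    import subprocess

    for i in range(max_tries):
        # cmd already redirects stdout/stderr to filter_logfile with
        # ">&"; assumes the shell understands that redirection.
        subprocess.call(cmd, shell=True)
        # An empty log file means the run completed with no errors.
        if os.path.exists(filter_logfile) and \
           os.stat(filter_logfile).st_size == 0:
            return
    raise AssertionError("filterRadia.py failed: %s chr%s" % (
        cancer_sample, chrom))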
Code example #27
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, group_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(group_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "No FASTQ files found."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bwa %s" % alignlib.get_bwa_version()

        # Make sure there are no duplicate samples.
        x1 = [x[0] for x in fastq_files]
        x2 = {}.fromkeys(x1).keys()  # unique sample names (Python 2 idiom)
        assert len(x1) == len(x2), "dup sample"

        # Make a list of all FASTQ files to align.
        fastq_filenames = []
        for x in fastq_files:
            sample, pair1, pair2 = x
            assert pair1
            fastq_filenames.append(pair1)
            if pair2:
                fastq_filenames.append(pair2)

        # Make a list of all the jobs to do.
        jobs = []  # list of (fastq_filename, sai_filename)
        for in_filename in fastq_filenames:
            in_path, in_file = os.path.split(in_filename)
            x = in_file
            if x.lower().endswith(".fq"):
                x = x[:-3]
            elif x.lower().endswith(".fastq"):
                x = x[:-6]
            sai_filename = os.path.join(out_path, "%s.sai" % x)
            log_filename = os.path.join(out_path, "%s.log" % x)
            x = in_filename, sai_filename, log_filename
            jobs.append(x)

        # Calculate the number of threads per job.  In Python 2,
        # num_cores / len(jobs) is integer division; each job gets at
        # least one thread.
        nc = max(1, num_cores / len(jobs))

        # Make the bwa commands.
        commands = []
        for x in jobs:
            fastq_filename, sai_filename, log_filename = x
            x = alignlib.make_bwa_aln_command(ref.fasta_file_full,
                                              fastq_filename,
                                              sai_filename,
                                              log_filename,
                                              num_threads=nc)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        for x in jobs:
            in_filename, sai_filename, log_filename = x
            assert filelib.exists_nz(sai_filename), \
                   "Missing: %s" % sai_filename
        return metadata
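
# make_bwa_aln_command is defined in genomicode.alignlib and not shown
# here.  A sketch of the shell command it presumably builds, based on
# standard "bwa aln" usage (the exact flags alignlib uses are an
# assumption):
def _sketch_bwa_aln_command(bwa, ref_fa, fastq_file, sai_file, log_file,
                            num_threads=1):
    from genomicode import parallel
    sq = parallel.quote
    x = [sq(bwa), "aln", "-t", str(num_threads), sq(ref_fa),
         sq(fastq_file)]
    # bwa aln writes the alignment to stdout and progress to stderr.
    return "%s > %s 2> %s" % (" ".join(x), sq(sai_file), sq(log_file))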
Code example #28
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        ## Importing pysam is hard!
        #import sys
        #sys_path_old = sys.path[:]
        #sys.path = [x for x in sys.path if x.find("RSeQC") < 0]
        #import pysam
        #sys.path = sys_path_old

        bam_node, ref_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # list of (in_filename, log_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            s, ext = os.path.splitext(f)
            log_filename = os.path.join(out_path, "%s.log" % s)
            out_filename = os.path.join(out_path, f)
            assert in_filename != out_filename
            x = in_filename, log_filename, out_filename
            jobs.append(x)

        # Don't do this.  summarize_alignment_cigar needs the MD, NM,
        # and NH tags, so regenerate them to be sure.
        ## If the files already have MD tags, then just symlink the
        ## files.  Don't add again.
        #i = 0
        #while i < len(jobs):
        #    in_filename, out_filename = jobs[i]
        #
        #    handle = pysam.AlignmentFile(in_filename, "rb")
        #    align = handle.next()
        #    tag_dict = dict(align.tags)
        #    if "MD" not in tag_dict:
        #        i += 1
        #        continue
        #    # Has MD tags.  Just symlink and continue.
        #    os.symlink(in_filename, out_filename)
        #    del jobs[i]

        # Make a list of samtools commands.
        # calmd takes ~200 Mb per process, so memory should not be an
        # issue.
        samtools = filelib.which_assert(config.samtools)
        sq = parallel.quote
        commands = []
        for x in jobs:
            in_filename, log_filename, out_filename = x

            # samtools calmd -b <in.bam> <ref.fasta> > <out.bam>

            # May generate error:
            # [bam_fillmd1] different NM for read
            #   'ST-J00106:118:H75L3BBXX:3:2128:21846:47014': 0 -> 19
            # Pipe stderr to different file.
            x = [
                samtools,
                "calmd",
                "-b",
                sq(in_filename),
                sq(ref.fasta_file_full),
            ]
            x = " ".join(x)
            x = "%s 2> %s 1> %s" % (x, sq(log_filename), sq(out_filename))
            commands.append(x)
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(x)
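
# Optional sanity check, not part of the module above: confirm that
# calmd actually added MD tags, using pysam (the commented-out block in
# the module hints at this approach).  A sketch; assumes pysam is
# importable.
def _first_read_has_md_tag(bam_filename):
    import pysam
    handle = pysam.AlignmentFile(bam_filename, "rb")
    # Only inspect the first alignment; enough for a spot check.
    for align in handle:
        return "MD" in dict(align.tags)
    return False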
Code example #29
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        #import call_variants_GATK

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Figure out whether the user wants SNPs or INDELs.
        #assert "vartype" in out_attributes
        #vartype = out_attributes["vartype"]
        #assert vartype in ["all", "snp", "indel"]

        # Platypus generates an error if there are spaces in the BAM
        # filename.  Symlink the file to a local directory to make
        # sure there are no spaces.
        bam_path = "bam"

        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            p, f = os.path.split(bam_filename)
            sample, ext = os.path.splitext(f)
            bai_filename = "%s.bai" % bam_filename
            filelib.assert_exists_nz(bai_filename)
            x = sample.replace(" ", "_")
            local_bam = os.path.join(bam_path, "%s.bam" % x)
            local_bai = os.path.join(bam_path, "%s.bam.bai" % x)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            err_filename = os.path.join(out_path, "%s.err" % sample)
            # Unfiltered file.
            #raw_filename = os.path.join(out_path, "%s.raw" % sample)
            # Final VCF file.
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = filelib.GenericObject(bam_filename=bam_filename,
                                      bai_filename=bai_filename,
                                      local_bam=local_bam,
                                      local_bai=local_bai,
                                      log_filename=log_filename,
                                      err_filename=err_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        filelib.safe_mkdir(bam_path)
        for j in jobs:
            assert " " not in j.local_bam
            filelib.assert_exists_nz(j.bam_filename)
            filelib.assert_exists_nz(j.bai_filename)
            if not os.path.exists(j.local_bam):
                os.symlink(j.bam_filename, j.local_bam)
            if not os.path.exists(j.local_bai):
                os.symlink(j.bai_filename, j.local_bai)

        # TODO: Keep better track of the metadata.
        buffer_size = 100000
        max_reads = 5000000  # use an int; 5E6 would be a float
        # Running into errors sometimes, so increase these numbers.
        #   WARNING - Too many reads (5000000) in region
        #   1:500000-600000. Quitting now. Either reduce --bufferSize or
        #   increase --maxReads.
        buffer_size = buffer_size * 10
        max_reads = max_reads * 10
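
        # make_platypus_command is defined in alignlib (not shown).
        # The command it presumably builds looks something like this;
        # --bufferSize and --maxReads come from the warning quoted
        # above, the other flags are an assumption based on standard
        # Platypus usage:
        #   python Platypus.py callVariants --bamFiles=<bam> \
        #     --refFile=<ref.fa> --output=<out.vcf> \
        #     --logFileName=<log> --bufferSize=<n> --maxReads=<n>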

        # Make a list of commands.
        commands = []
        for j in jobs:
            #nc = max(1, num_cores/len(jobs))
            x = alignlib.make_platypus_command(bam_file=j.local_bam,
                                               ref_file=ref.fasta_file_full,
                                               log_file=j.log_filename,
                                               out_file=j.out_filename,
                                               buffer_size=buffer_size,
                                               max_reads=max_reads)
            x = "%s >& %s" % (x, j.err_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        #import sys; sys.exit(0)
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.  If not, try
        # to diagnose.
        for j in jobs:
            if filelib.exists_nz(j.out_filename):
                continue
            for line in open(j.err_filename):
                if line.find("WARNING - Too many reads") >= 0:
                    print line,
        x = [j.out_filename for j in jobs]
        filelib.assert_exists_nz_many(x)

        # Filter each of the VCF files.
        #for j in jobs:
        #    call_variants_GATK.filter_by_vartype(
        #        vartype, j.raw_filename, j.out_filename)
        #metadata["filter"] = vartype

        return metadata
Code example #30
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        import shutil
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_filenames = mlib.find_bam_files(in_data.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bam2fastx (unknown version)"

        # Somehow bam2fastx doesn't work if there are spaces in the
        # filename (though this may not actually be bam2fastx's fault).
        # An earlier version worked around this by writing to a
        # temporary filename with no spaces and renaming it afterward;
        # that code is commented out below.

        jobs = []
        for i, bam_filename in enumerate(bam_filenames):
            p, f, e = mlib.splitpath(bam_filename)
            #bai_filename = alignlib.find_bai_file(bam_filename)
            #assert bai_filename, "Missing index for: %s" % bam_filename
            #temp_bam_filename = "%d.bam" % i
            #temp_bai_filename = "%d.bam.bai" % i
            #temp_fa_filename = "%d.fa" % i
            fa_filename = os.path.join(out_path, "%s.fa" % f)
            x = filelib.GenericObject(
                bam_filename=bam_filename,
                #bai_filename=bai_filename,
                #temp_bam_filename=temp_bam_filename,
                #temp_bai_filename=temp_bai_filename,
                #temp_fa_filename=temp_fa_filename,
                fa_filename=fa_filename)
            jobs.append(x)
        bam2fastx = mlib.findbin("bam2fastx")

        # Link all the bam files.
        #for j in jobs:
        #    assert not os.path.exists(j.temp_bam_filename)
        #    #assert not os.path.exists(j.temp_bai_filename)
        #    os.symlink(j.bam_filename, j.temp_bam_filename)
        #    #os.symlink(j.bai_filename, j.temp_bai_filename)

        commands = []
        for j in jobs:
            # bam2fastx -A --fasta -o rqc14.fa rqc11.bam
            x = [
                mlib.sq(bam2fastx),
                "-A",
                "--fasta",
                #"-o", mlib.sq(j.temp_fa_filename),
                #mlib.sq(j.temp_bam_filename),
                "-o", mlib.sq(j.fa_filename),
                mlib.sq(j.bam_filename),
                ]
            x = " ".join(x)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        #for j in jobs:
        #    # Move the temporary files to the final location.
        #    shutil.move(j.temp_fa_filename, j.fa_filename)
        #    # Remove the link to the BAM file.
        #    os.unlink(j.temp_bam_filename)
        
        x = [j.fa_filename for j in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata