Example 1
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        #from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        # TODO: Merge with merge_variants_snp.py.
        #CALLERS = [
        #    "gatk", "platypus", "varscan",
        #    ]
        vcf_paths = [x.identifier for x in antecedents]
        nodes = [x.data for x in antecedents]
        CALLERS = [x.attributes["caller"] for x in nodes]
        assert len(CALLERS) == len(vcf_paths)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # list of (sample, caller, out_vcf_path, in_vcf_file, out_vcf_file)
        jobs = []
        for i, caller in enumerate(CALLERS):
            inpath = vcf_paths[i]
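            # hash_var makes the caller name safe to use in a file path.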
            caller_h = hashlib.hash_var(caller)
            
            vcf_files = filelib.list_files_in_path(
                inpath, endswith=".vcf", toplevel_only=True)
            for file_ in vcf_files:
                # IN_FILE:   <inpath>/<sample>.vcf
                # OUT_FILE:  <out_path>/<caller>.vcf/<sample>.vcf
                p, sample, e = mlib.splitpath(file_)
                assert e == ".vcf"
                out_vcf_path = os.path.join(out_path, "%s.vcf" % caller_h)
                out_vcf_file = os.path.join(out_vcf_path, "%s.vcf" % sample)

                x = filelib.GenericObject(
                    sample=sample, caller=caller,
                    out_vcf_path=out_vcf_path, in_vcf_file=file_,
                    out_vcf_file=out_vcf_file)
                jobs.append(x)
                
        # Make sure the same samples are found in all callers.
        caller2samples = {}
        for j in jobs:
            if j.caller not in caller2samples:
                caller2samples[j.caller] = []
            caller2samples[j.caller].append(j.sample)
        comp_samples = None
        for caller, samples in caller2samples.iteritems():
            samples = sorted(samples)
            if comp_samples is None:
                comp_samples = samples
            assert comp_samples == samples, "%s %s" % (comp_samples, samples)

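        # Create the per-caller output directories and symlink each input VCF into place.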
        for j in jobs:
            filelib.safe_mkdir(j.out_vcf_path)
            os.symlink(j.in_vcf_file, j.out_vcf_file)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from Betsy import module_utils as mlib
        import merge_vcf_folder

        vcffolders_node = antecedents
        filelib.safe_mkdir(out_path)
        metadata = {}

        x = os.listdir(vcffolders_node.identifier)
        x = [x for x in x if x.endswith(".vcf")]
        assert x, "No VCF folders found: %s" % vcffolders_node.identifier
        x = [os.path.join(vcffolders_node.identifier, x) for x in x]
        vcf_folders = x

        jobs = []
        for folder in vcf_folders:
            path, root, ext = mlib.splitpath(folder)
            assert ext == ".vcf"
            caller = root
            vcf_filenames = filelib.list_files_in_path(folder,
                                                       endswith=".vcf",
                                                       toplevel_only=True)
            assert vcf_filenames, "No .vcf files: %s" % folder
            out_filename = os.path.join(out_path, "%s.vcf" % root)
            tmp_path = "%s.indexed.vcf" % caller
            x = filelib.GenericObject(caller=caller,
                                      vcf_filenames=vcf_filenames,
                                      out_filename=out_filename,
                                      tmp_path=tmp_path)
            jobs.append(x)

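        # Merge the per-sample VCFs for each caller into a single VCF, and collect the shell commands that were run.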
        for j in jobs:
            m = merge_vcf_folder.merge_vcf_files(j.vcf_filenames,
                                                 j.out_filename, num_cores,
                                                 j.tmp_path)
            if "commands" not in metadata:
                metadata["commands"] = []
            metadata["commands"].extend(m["commands"])

        x = [x.out_filename for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        import filter_variants_GATK

        vcf_node = in_data
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   not_empty=True)
        assert vcf_filenames, "No VCF files found."
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Figure out whether the user wants SNPs or INDELs.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["snp", "indel"]
        metadata["filter"] = vartype

        jobs = []  # list of filelib.GenericObject
        for in_filename in vcf_filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            x = filelib.GenericObject(in_filename=in_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        # Filter each of the VCF files.
        jobs2 = []
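        # Each entry is a (function, args, keywords) tuple for parallel.pyfun.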
        for j in jobs:
            args = vartype, j.in_filename, j.out_filename
            x = filter_variants_GATK.filter_by_vartype, args, {}
            jobs2.append(x)
        parallel.pyfun(jobs2, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        return metadata
Example 4
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node, insert_size_node, alignment_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # ./pindel -f <reference.fa> -i <bam_configuration_file>
        #   -c <chromosome_name> -o <out_prefix>
        #   -T <num threads>
        #
        # Creates files:
        # <out_prefix>_D     Deletion
        # <out_prefix>_SI    Short insertion
        # <out_prefix>_LI    Long insertion
        # <out_prefix>_INV   Inversion
        # <out_prefix>_TD    Tandem duplication
        # <out_prefix>_BP    Breakpoint
        # <out_prefix>_RP    ??? read pair???
        # <out_prefix>_CloseEndMapped   Only one end could be mapped.

        # Pindel cannot handle spaces in the BAM filenames (because of
        # the config file).  Symlink the file to a local directory to make
        # sure there are no spaces.
        bam_path = "bam"

        opj = os.path.join
        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            p, f = os.path.split(bam_filename)
            sample, ext = os.path.splitext(f)
            bai_filename = "%s.bai" % bam_filename
            filelib.assert_exists_nz(bai_filename)
            x = sample.replace(" ", "_")
            local_bam = opj(bam_path, "%s.bam" % x)
            local_bai = opj(bam_path, "%s.bam.bai" % x)
            config_filename = opj(out_path, "%s.config.txt" % sample)
            out_prefix = opj(out_path, sample)
            log_filename = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(sample=sample,
                                      bam_filename=bam_filename,
                                      bai_filename=bai_filename,
                                      local_bam=local_bam,
                                      local_bai=local_bai,
                                      config_filename=config_filename,
                                      out_prefix=out_prefix,
                                      log_filename=log_filename)
            jobs.append(x)

        filelib.safe_mkdir(bam_path)
        for j in jobs:
            assert " " not in j.local_bam
            filelib.assert_exists_nz(j.bam_filename)
            filelib.assert_exists_nz(j.bai_filename)
            if not os.path.exists(j.local_bam):
                os.symlink(j.bam_filename, j.local_bam)
            if not os.path.exists(j.local_bai):
                os.symlink(j.bai_filename, j.local_bai)

        # Read the insert sizes.
        summary_file = opj(insert_size_node.identifier, "summary.txt")
        filelib.assert_exists_nz(summary_file)
        sample2size = _read_insert_sizes(summary_file)
        # Make sure all the samples have insert sizes.
        for j in jobs:
            assert j.sample in sample2size, \
                   "Missing in insert size file: %s" % j.sample

        # Read the fragment sizes.
        summary_file = opj(alignment_node.identifier, "summary.txt")
        filelib.assert_exists_nz(summary_file)
        sample2readlen = _read_fragment_sizes(summary_file)
        # Make sure all the samples have read lengths.
        for j in jobs:
            assert j.sample in sample2readlen, \
                   "Missing in alignment summary file: %s" % j.sample

        # Make the config file.
        for j in jobs:
            # <insert size> is the whole length to be sequenced, including
            # the length of the pair of reads.  Picard only counts the
            # sequence between the reads.
            size = sample2size[j.sample]
            read_length = sample2readlen[j.sample]
            insert_size = size + read_length * 2
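            # Pindel config file format: one line per sample with <bam file> <insert size> <sample label>.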
            handle = open(j.config_filename, 'w')
            print >> handle, "%s %s %s" % (j.local_bam, insert_size, j.sample)
            handle.close()

        # Make a list of commands.
        pindel = mlib.get_config("pindel", which_assert_file=True)
        sq = parallel.quote
        commands = []
        for j in jobs:
            cmd = [
                sq(pindel),
                "-f",
                sq(ref.fasta_file_full),
                "-i",
                sq(j.config_filename),
                "-c",
                "ALL",
                "-T",
                1,
                "-o",
                sq(j.out_prefix),
            ]
            cmd = " ".join(map(str, cmd))
            cmd = "%s >& %s" % (cmd, j.log_filename)
            commands.append(cmd)
        parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure the analysis completed successfully.  If not, try
        # to diagnose.
        x = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x)
        x1 = ["%s_D" % x.out_prefix for x in jobs]
        x2 = ["%s_SI" % x.out_prefix for x in jobs]
        x3 = ["%s_LI" % x.out_prefix for x in jobs]
        x4 = ["%s_INV" % x.out_prefix for x in jobs]
        x5 = ["%s_TD" % x.out_prefix for x in jobs]
        x6 = ["%s_BP" % x.out_prefix for x in jobs]
        x = x1 + x2 + x3 + x4 + x5 + x6
        filelib.assert_exists_many(x)

        return metadata
Example 5
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out version.

        # Figure out whether the user wants SNPs or INDELs.
        #assert "vartype" in out_attributes
        #vartype = out_attributes["vartype"]
        #assert vartype in ["all", "snp", "indel"]

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # Each job is a filelib.GenericObject with cancer_sample,
        # normal_bamfile, cancer_bamfile, orig_outfile, and fix_outfile.
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            orig_outfile = opj(out_path, "%s.raw" % sample)
            fix_outfile = opj(out_path, "%s.vcf" % sample)
            #filter_outfile = opj(out_path, "%s.vcf" % sample)
            x = filelib.GenericObject(cancer_sample=cancer_sample,
                                      normal_bamfile=normal_bamfile,
                                      cancer_bamfile=cancer_bamfile,
                                      orig_outfile=orig_outfile,
                                      fix_outfile=fix_outfile)
            jobs.append(x)

        # python /usr/local/museq/classify.py \
        #   normal:test31/normal.bam tumour:test31/tumor.bam \
        #   reference:genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        #   model:/usr/local/museq/model_v4.1.2.npz \
        #   --config /usr/local/museq/metadata.config \
        #   -o test51.vcf
        opj = os.path.join
        museq = mlib.get_config("museq", assert_exists=True)
        classify_py = opj(museq, "classify.py")
        model_file = opj(museq, "model_v4.1.2.npz")
        config_file = opj(museq, "metadata.config")
        filelib.assert_exists_nz(classify_py)
        filelib.assert_exists_nz(model_file)
        filelib.assert_exists_nz(config_file)

        # museq's config file generates a broken VCF file.  Fix it.
        fixed_config_file = "fixed.config"
        fix_config_file(config_file, fixed_config_file)

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            #cancer_sample, normal_bamfile, cancer_bamfile, \
            #               raw_outfile, fix_outfile, vcf_outfile = x

            x = [
                "python",  # should allow user to specify python
                sq(classify_py),
                sq("normal:%s" % j.normal_bamfile),
                sq("tumour:%s" % j.cancer_bamfile),
                sq("reference:%s" % ref.fasta_file_full),
                sq("model:%s" % model_file),
                "--config",
                sq(fixed_config_file),
                "-o",
                sq(j.orig_outfile),
            ]
            x = " ".join(map(str, x))
            commands.append(x)
        # Not sure how much RAM this takes.  On Thunderbolts test,
        # took < 1 Gb.
        nc = mlib.calc_max_procs_from_ram(5, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # museq produces non-standard VCF files.  Fix this so it
        # will work with other programs downstream.
        for j in jobs:
            #cancer_sample, normal_bamfile, cancer_bamfile, \
            #               raw_outfile, fix_outfile, vcf_outfile = x
            fix_vcf_file(j.cancer_sample, j.orig_outfile, j.fix_outfile)

        # Filter each of the VCF files.
        #for x in jobs:
        #    cancer_sample, normal_bamfile, cancer_bamfile, \
        #                   raw_outfile, fix_outfile, vcf_outfile = x
        #    filter_by_vartype(vartype, fix_outfile, vcf_outfile)
        #metadata["filter"] = vartype

        #x = [x[-1] for x in jobs]
        x = [j.fix_outfile for j in jobs]
        filelib.assert_exists_many(x)

        return metadata
Example 6
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils
        
        bam_filenames = module_utils.find_bam_files(in_data.identifier)
        assert bam_filenames, "No .bam files."
        filelib.safe_mkdir(out_path)
        metadata = {}
        
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            s, ext = os.path.splitext(f)
            sample = hashlib.hash_var(s)
            log_filename = os.path.join(out_path, "%s.log" % s)
            out_filename = os.path.join(out_path, f)
            x = filelib.GenericObject(
                in_filename=in_filename,
                sample=sample,
                log_filename=log_filename,
                out_filename=out_filename)
            jobs.append(x)
        
        gid = "group1"
        library = "library"
        platform_unit = "platform"
        #sample = "sample"
        platform = "illumina"

        # java -Xmx5g -jar AddOrReplaceReadGroups.jar
        #   I=<input.sam or .bam> O=<output.bam> ID=<group ID>
        #   LB=<group library> PU=<platform unit> SM=<group sample name>
        #   PL=<platform> CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT
        picard_jar = alignlib.find_picard_jar("picard")

        # Make a list of commands.
        sq = parallel.quote
        commands = []
        for j in jobs:
            x = [
                "java", "-Xmx5g",
                "-jar", sq(picard_jar),
                "AddOrReplaceReadGroups", 
                "I=%s" % sq(j.in_filename),
                "O=%s" % sq(j.out_filename),
                "ID=%s" % gid,
                "LB=%s" % library,
                "PU=%s" % platform_unit,
                "SM=%s" % j.sample,
                "PL=%s" % platform,
                #"CREATE_INDEX=true",
                "VALIDATION_STRINGENCY=LENIENT",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, sq(j.log_filename))
            commands.append(x)
            
        parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores

        # Make sure the analysis completed successfully.
        # Make sure outfiles exist.
        out_filenames = [j.out_filename for j in jobs]
        filelib.assert_exists_nz_many(out_filenames)

        # Check the log files to make sure there are no errors.
        for j in jobs:
            check_log_file(j.log_filename)

        return metadata
Example 7
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        #import call_variants_GATK

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Figure out whether the user wants SNPs or INDELs.
        #assert "vartype" in out_attributes
        #vartype = out_attributes["vartype"]
        #assert vartype in ["all", "snp", "indel"]

        # Platypus generates an error if there are spaces in the BAM
        # filename.  Symlink the file to a local directory to make
        # sure there are no spaces.
        bam_path = "bam"

        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            p, f = os.path.split(bam_filename)
            sample, ext = os.path.splitext(f)
            bai_filename = "%s.bai" % bam_filename
            filelib.assert_exists_nz(bai_filename)
            x = sample.replace(" ", "_")
            local_bam = os.path.join(bam_path, "%s.bam" % x)
            local_bai = os.path.join(bam_path, "%s.bam.bai" % x)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            err_filename = os.path.join(out_path, "%s.err" % sample)
            # Unfiltered file.
            #raw_filename = os.path.join(out_path, "%s.raw" % sample)
            # Final VCF file.
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = filelib.GenericObject(bam_filename=bam_filename,
                                      bai_filename=bai_filename,
                                      local_bam=local_bam,
                                      local_bai=local_bai,
                                      log_filename=log_filename,
                                      err_filename=err_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        filelib.safe_mkdir(bam_path)
        for j in jobs:
            assert " " not in j.local_bam
            filelib.assert_exists_nz(j.bam_filename)
            filelib.assert_exists_nz(j.bai_filename)
            if not os.path.exists(j.local_bam):
                os.symlink(j.bam_filename, j.local_bam)
            if not os.path.exists(j.local_bai):
                os.symlink(j.bai_filename, j.local_bai)

        # TODO: Keep better track of the metadata.
        buffer_size = 100000
        max_reads = 5E6
        # Running into errors sometimes, so increase these numbers.
        #   WARNING - Too many reads (5000000) in region
        #   1:500000-600000. Quitting now. Either reduce --bufferSize or
        #   increase --maxReads.
        buffer_size = buffer_size * 10
        max_reads = max_reads * 10

        # Make a list of commands.
        commands = []
        for j in jobs:
            #nc = max(1, num_cores/len(jobs))
            x = alignlib.make_platypus_command(bam_file=j.local_bam,
                                               ref_file=ref.fasta_file_full,
                                               log_file=j.log_filename,
                                               out_file=j.out_filename,
                                               buffer_size=buffer_size,
                                               max_reads=max_reads)
            x = "%s >& %s" % (x, j.err_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        #import sys; sys.exit(0)
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.  If not, try
        # to diagnose.
        for j in jobs:
            if filelib.exists_nz(j.out_filename):
                continue
            for line in open(j.err_filename):
                if line.find("WARNING - Too many reads") >= 0:
                    print line,
        x = [j.out_filename for j in jobs]
        filelib.assert_exists_nz_many(x)

        # Filter each of the VCF files.
        #for j in jobs:
        #    call_variants_GATK.filter_by_vartype(
        #        vartype, j.raw_filename, j.out_filename)
        #metadata["filter"] = vartype

        return metadata
Example 8
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # java -jar picard.jar CollectAlignmentSummaryMetrics \
        #   R=reference_sequence.fasta \
        #   I=input.bam \
        #   O=output.txt
        opj = os.path.join
        jobs = []   # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            # <in_path>/<sample>.bam
            in_path, sample, ext = mlib.splitpath(bam_filename)
            assert ext == ".bam"
            out_filename = opj(out_path, "%s.alignment_metrics.txt" % sample)
            log_filename = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(
                sample=sample,
                bam_filename=bam_filename,
                out_filename=out_filename,
                log_filename=log_filename)
            jobs.append(x)

        # Make the commands to run picard.
        picard_jar = alignlib.find_picard_jar("picard")
        sq = parallel.quote
        commands = []
        for j in jobs:
            # Should have better way of getting java path.
            cmd = [
                "java",
                "-Xmx10g",
                "-jar", sq(picard_jar), "CollectAlignmentSummaryMetrics",
                "I=%s" % sq(j.bam_filename),
                "R=%s" % sq(ref.fasta_file_full),
                "O=%s" % sq(j.out_filename),
                ]
            cmd = " ".join(cmd)
            cmd = "%s >& %s" % (cmd, sq(j.log_filename))
            commands.append(cmd)

        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores)
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Summarize the alignment summary metrics files.
        outfile = opj(out_path, "summary.txt")
        _summarize_alignment_summary_metrics(jobs, outfile)
        filelib.assert_exists_nz(outfile)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, ref_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # Do a quick check to make sure the reference is correct.
        # Otherwise, errors may be hard to diagnose.
        alignlib.assert_is_STAR_reference(ref.path)

        metadata = {}
        metadata["tool"] = "STAR %s" % alignlib.get_STAR_version()

        # Strandedness is not determined here; assume unstranded data.
        is_stranded = False

        # STAR --runThreadN 40 --genomeDir test05 \
        #   --readFilesIn test.fastq/test03_R1_001.fastq \
        #   test.fastq/test03_R2_001.fastq --outFileNamePrefix test06.
        # If unstranded, add --outSAMstrandField intronMotif

        # Make a list of the jobs to run.
        jobs = []  # list of filelib.GenericObject objects
        for x in fastq_files:
            sample, pair1, pair2 = x
            out_prefix = "%s." % sample
            bam_filename = os.path.join(out_path,
                                        "%sAligned.out.bam" % out_prefix)
            log_filename = os.path.join(out_path, "%s.log" % sample)

            x = filelib.GenericObject(
                sample=sample,
                pair1=pair1,
                pair2=pair2,
                out_prefix=out_prefix,
                bam_filename=bam_filename,
                log_filename=log_filename,
            )
            jobs.append(x)

        # Run pass 1.
        commands = []
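        # STAR runs one sample at a time, with all num_cores threads given to each run.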
        for j in jobs:
            x = os.path.join(out_path, j.out_prefix)
            cmd = alignlib.make_STAR_command(ref.path, x, num_cores,
                                             is_stranded, j.pair1, j.pair2,
                                             j.log_filename)
            # For debugging.  If this file already exists, skip it.
            if not filelib.exists_nz(j.bam_filename):
                parallel.sshell(cmd, path=out_path)
            filelib.assert_exists_nz(j.bam_filename)
            commands.append(cmd)

        metadata["commands"] = commands
        metadata["num_cores"] = num_cores

        return metadata
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        #from genomicode import hashlib
        from Betsy import module_utils

        in_filenames = module_utils.find_bam_files(in_data.identifier)
        assert in_filenames, "No .bam files."
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "samtools %s" % alignlib.get_samtools_version()

        jobs = []
        #seen = {}
        for i, in_filename in enumerate(in_filenames):
            p, f = os.path.split(in_filename)
            temp_prefix = "temp_%s" % f
            #temp_prefix = "temp_%s" % hashlib.hash_var(f)
            # Make sure no duplicates.
            #assert temp_prefix not in seen
            #seen[temp_prefix] = 1
            #temp_outfilename = "%d.bam" % i
            out_filename = os.path.join(out_path, f)
            x = filelib.GenericObject(
                in_filename=in_filename,
                temp_prefix=temp_prefix,
                #temp_outfilename=temp_outfilename,
                out_filename=out_filename)
            jobs.append(x)

        samtools = filelib.which_assert(config.samtools)

        # Calculate the number of threads per process.
        nc = module_utils.calc_max_procs_from_ram(4, upper_max=num_cores)
        num_threads = max(nc / len(jobs), 1)

        # Make a list of samtools commands.
        # Without -m, takes ~1 Gb per process.
        sq = parallel.quote
        commands = []
        for j in jobs:
            # Usage has changed.  Below no longer valid.
            # samtools sort <in_filename> <out_filestem>
            # .bam automatically added to <out_filestem>, so don't
            # need it.
            #x = out_filename
            #assert x.endswith(".bam")
            #x = x[:-4]
            #out_filestem = x

            x = [
                sq(samtools),
                "sort",
                "-O",
                "bam",
                "-T",
                sq(j.temp_prefix),
                "-m",
                "4G",  # Crashing, so try increasing memory.
                sq(j.in_filename),
                #"-o", sq(j.temp_outfilename),
                "-o",
                sq(j.out_filename),
            ]
            if num_threads > 1:
                x += ["-@", num_threads]
            x = " ".join(map(str, x))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = nc

        parallel.pshell(commands, max_procs=nc)
        #for cmd in commands:
        #    parallel.sshell(cmd)

        #for j in jobs:
        #    # Move the temporary files to the final location.
        #    shutil.move(j.temp_outfilename, j.out_filename)

        # Make sure the analysis completed successfully.
        x = [j.out_filename for j in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
Example 11
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import vcflib
        from Betsy import module_utils as mlib

        vcf_node, nc_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Filenames:
        # <caller>.vcf

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        genome = mlib.get_user_option(user_options,
                                      "snpeff_genome",
                                      not_empty=True)
        databases = list_snpeff_databases()
        assert genome in databases, "Unknown genome database: %s" % genome

        # For each caller, do the SnpEFF calls.  Some callers include
        # the somatic information, others do not.  If germline samples
        # are present, then do with _cancer.  Otherwise, do not.

        # java -Xmx16g -jar $SNPEFF -v -cancer -cancerSamples vcf03.txt
        #   GRCh37.75 vcf02.txt 1> test03.txt 2> test03.log

        # Don't bother annotating positions that do not pass filter.
        # Filter them out first based on FILTER column.

        opj = os.path.join
        jobs = []
        for in_filename in vcf_filenames:
            path, stem, ext = mlib.splitpath(in_filename)
            samples_file = opj(out_path, "%s.cancerSamples.txt" % stem)
            filtered_filename = opj(out_path, "%s.filtered_input" % stem)
            out_filename = opj(out_path, "%s.vcf" % stem)
            log_filename = opj(out_path, "%s.log" % stem)
            x = filelib.GenericObject(in_filename=in_filename,
                                      stem=stem,
                                      samples_file=samples_file,
                                      filtered_filename=filtered_filename,
                                      out_filename=out_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # First, filter each of the VCF files.
        commands = []
        for j in jobs:
            # For debugging.  If this file exists, don't filter it again.
            if os.path.exists(j.filtered_filename):
                continue
            args = j.in_filename, j.filtered_filename, wgs_or_wes
            x = vcflib.filter_vcf_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Make the cancer_samples files.
        for j in jobs:
            # Will generate this if there are cancer samples.
            make_cancer_samples_file(j.filtered_filename, nc_match,
                                     j.samples_file)

        # Make a list of commands.
        commands = []
        for j in jobs:
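            # Run snpEff in -cancer mode only if a cancerSamples file was generated for this VCF.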
            cancer = False
            if os.path.exists(j.samples_file):
                cancer = True
            x = make_snpeff_command(j.filtered_filename,
                                    genome,
                                    j.out_filename,
                                    j.log_filename,
                                    is_cancer=cancer,
                                    cancer_samples_file=j.samples_file)
            commands.append(x)

        nc = mlib.calc_max_procs_from_ram(16, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = commands
        metadata["num_cores"] = nc

        # Make sure the analysis completed successfully.
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Log files should be empty.
        for j in jobs:
            filelib.assert_exists(j.log_filename)
            assert not filelib.exists_nz(j.log_filename), \
                   "Error with %s.\n%s" % (j.stem, j.log_filename)
            filelib.safe_unlink(j.log_filename)

        return metadata
def calc_gsea(expression_file, class_label_file, user_options, num_cores,
              out_path, permutation_type, database):
    import os
    import arrayio
    from genomicode import parallel
    from genomicode import arraysetlib
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import genesetlib
    from Betsy import module_utils as mlib

    names, classes = arraysetlib.read_cls_file(class_label_file)
    assert names
    assert len(names) >= 2, ("At least 2 classes needed for GSEA analysis.  "
                             "Found only: %s" % (names[0]))
    # Make sure there are the same number of samples in the class
    # label file as in the gene expression file.
    MATRIX = arrayio.read(expression_file)
    assert MATRIX.ncol() == len(classes), (
        "Mismatch: expression (%d) classes (%d)" %
        (MATRIX.ncol(), len(classes)))
    # Make sure classes go from [0, len(names))
    for i in classes:
        assert i >= 0 and i < len(names)

    fdr_cutoff = mlib.get_user_option(user_options,
                                      "gsea_fdr_cutoff",
                                      not_empty=True,
                                      type=float)
    assert fdr_cutoff > 0 and fdr_cutoff <= 1

    # Find all combinations of names and classes.
    opj = os.path.join
    jobs = []
    for i1 in range(len(names) - 1):
        for i2 in range(i1 + 1, len(names)):
            N1 = names[i1]
            N2 = names[i2]
            # Indexes should be 1-based.
            I1 = [i + 1 for i in range(len(classes)) if classes[i] == i1]
            I2 = [i + 1 for i in range(len(classes)) if classes[i] == i2]
            N1_h = hashlib.hash_var(N1)
            N2_h = hashlib.hash_var(N2)
            stem = "%s.vs.%s" % (N1_h, N2_h)

            gsea_path = opj(out_path, "%s.%s.gsea" % (stem, database))

            x = filelib.GenericObject(N1=N1,
                                      N2=N2,
                                      I1=I1,
                                      I2=I2,
                                      stem=stem,
                                      gsea_path=gsea_path)
            jobs.append(x)

    permutation_types = {}
    commands = []
    for j in jobs:
        # Need at least 3 samples for "phenotype" permutations.  If
        # there are fewer samples, then fall back to "gene_set" for
        # this comparison only.
        ptype = permutation_type
        if len(j.I1) < 3 or len(j.I2) < 3:
            ptype = "gene_set"
        permutation_types[ptype] = 1
        cmd = make_gsea_command(expression_file, class_label_file, j.gsea_path,
                                j.N1, j.N2, j.I1, j.I2, ptype, database)
        commands.append(cmd)
    for cmd in commands:
        parallel.sshell(cmd)

    # Summarize results.
    # Make a geneset file.
    significant = []
    for j in jobs:
        x = find_significant_gene_sets(j.gsea_path, j.N1, j.N2, fdr_cutoff)
        significant.append(x)

    genesets = []
    for j, x in zip(jobs, significant):
        genes1, genes2 = x
        gs_name1 = "%s_%s" % (j.stem, j.N1)
        gs_name2 = "%s_%s" % (j.stem, j.N2)
        gs1 = genesetlib.GeneSet(gs_name1, "", genes1)
        gs2 = genesetlib.GeneSet(gs_name2, "", genes2)
        genesets.extend([gs1, gs2])
    x = "genesets.fdr_%g.gmt" % fdr_cutoff
    geneset_file = opj(out_path, x)
    genesetlib.write_gmt(geneset_file, genesets)

    # Count the number of significant gene sets.
    x = "num_genesets.fdr_%g.txt" % fdr_cutoff
    summary_file = opj(out_path, x)
    handle = open(summary_file, 'w')
    header = "Group 1", "Group 2", "Gene Sets in Group 1", \
             "Gene Sets in Group 2"
    print >> handle, "\t".join(header)
    for j, x in zip(jobs, significant):
        genes1, genes2 = x
        x = j.N1, j.N2, len(genes1), len(genes2)
        assert len(x) == len(header)
        print >> handle, "\t".join(map(str, x))
    handle.close()

    return commands, sorted(permutation_types)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        svm_node, vcf_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf",
                                                   not_empty=True)
        metadata = {}

        # 1.  vcf_filenames
        # 2.  parsed_snpeff_files   one for each VCF file
        # 3.  merged_snpeff_file    just one file
        # 4.  clean_snpeff_file     clean up the annotations to final form
        # 5.  outfile

        merged_snpeff_file = "snpeff.merged.txt"
        cleaned_snpeff_file = "snpeff.clean.txt"

        jobs = []
        for vcf_filename in vcf_filenames:
            path, caller, ext = mlib.splitpath(vcf_filename)
            parsed_snpeff_file = "%s.parsed.txt" % caller
            j = filelib.GenericObject(
                caller=caller,
                vcf_filename=vcf_filename,
                parsed_snpeff_file=parsed_snpeff_file,
            )
            jobs.append(j)

        # Parse each of the snpeff files.
        commands = []
        for j in jobs:
            args = j.vcf_filename, j.parsed_snpeff_file
            # Debugging.  If this file exists, do not generate it
            # again.
            if os.path.exists(j.parsed_snpeff_file):
                continue
            x = parse_snpeff_file, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        # Merge the parsed files.
        x = [j.parsed_snpeff_file for j in jobs]
        x = [x for x in x if os.path.exists(x)]
        parsed_files = x
        # For debugging, don't regenerate if I don't need to.
        if not filelib.exists_nz(merged_snpeff_file):
            merge_parsed_files(parsed_files, merged_snpeff_file)

        # Clean up the snpEff file.  Coordinates should be unique.
        # For debugging, don't regenerate if I don't need to.
        if not filelib.exists_nz(cleaned_snpeff_file):
            clean_snpeff_file(merged_snpeff_file, cleaned_snpeff_file)

        # Merge the snpEff annotations into the SimpleVariantMatrix.
        add_snpeff_to_svm(svm_node.identifier, cleaned_snpeff_file, outfile)

        return metadata
Example 14
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, orient_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(reference_node.identifier)
        assert os.path.exists(ref.fasta_file_full)
        orient = mlib.read_orientation(orient_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bowtie2 %s" % alignlib.get_bowtie2_version()

        # Bowtie2 doesn't handle files with spaces in them.  Make
        # temporary files without spaces.

        # Make a list of the jobs to run.
        jobs = []
        for i, x in enumerate(fastq_files):
            sample, pair1, pair2 = x
            bam_filename = os.path.join(out_path, "%s.bam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            sample_h = hashlib.hash_var(sample)
            temp_pair1 = "%d_%s_1.fa" % (i, sample_h)
            temp_pair2 = None
            if pair2:
                temp_pair2 = "%d_%s_2.fa" % (i, sample_h)
            j = filelib.GenericObject(sample=sample,
                                      pair1=pair1,
                                      pair2=pair2,
                                      temp_pair1=temp_pair1,
                                      temp_pair2=temp_pair2,
                                      bam_filename=bam_filename,
                                      log_filename=log_filename)
            jobs.append(j)

        for j in jobs:
            os.symlink(j.pair1, j.temp_pair1)
            if j.pair2:
                os.symlink(j.pair2, j.temp_pair2)

        # Generate bowtie2 commands for each of the files.
        attr2orient = {
            "single": None,
            "paired_fr": "fr",
            "paired_rf": "rf",
            "paired_ff": "ff",
        }
        orientation = attr2orient[orient.orientation]
        #x = sample_node.data.attributes["orientation"]
        #orientation = attr2orient[x]

        # Takes ~4 Gb per job.
        samtools = mlib.findbin("samtools")
        sq = parallel.quote
        commands = []
        for j in jobs:
            #sample, pair1, pair2, bam_filename, log_filename = x
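            # Divide the cores evenly across samples; each bowtie2 job gets at least one thread.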
            nc = max(1, num_cores / len(jobs))

            # bowtie2 -p 8 -x <genome> -1 <.fq> -2 <.fq> --fr
            #  2> test.log | samtools view -bS -o test.bam -
            x1 = alignlib.make_bowtie2_command(ref.fasta_file_full,
                                               j.temp_pair1,
                                               fastq_file2=j.temp_pair2,
                                               orientation=orientation,
                                               num_threads=nc)
            x2 = [
                sq(samtools),
                "view",
                "-bS",
                "-o",
                sq(j.bam_filename),
                "-",
            ]
            x2 = " ".join(x2)
            x = "%s 2> %s | %s" % (x1, sq(j.log_filename), x2)
            #x = "%s >& %s" % (x, sq(log_filename))
            commands.append(x)
        metadata["commands"] = commands
        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x1 = [x.bam_filename for x in jobs]
        x2 = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

        return metadata
Example 15
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import ngslib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        features_bed = mlib.get_user_option(user_options,
                                            "features_bed",
                                            check_file=True)
        if features_bed:
            metadata["features_bed"] = features_bed

        # Applies to genomecov.
        min_coverage = user_options.get("ignore_coverage_below")
        if min_coverage == "":
            min_coverage = None
        if min_coverage is not None:
            min_coverage = int(min_coverage)
            assert min_coverage >= 0

        metadata["tool"] = "bedtools %s" % ngslib.get_bedtools_version()
        metadata["num_cores"] = num_cores
        metadata["commands"] = []

        # Set up the filenames.
        # list of (
        #   sample,
        #   orig_bam_filename,    Original bam filename.
        #   bam_filename,         bam file, after filtering out unmapped reads.
        #   genomecov_filename,   Generated by genomecov.  Histogram.
        #   histo_datafile,       Data file to generate histogram (from cov).
        #   histo_plotfile,       Histogram plot.
        #   histo_prismfile,      To make histogram in PRISM.
        #
        #   ONLY USED IF features_bed
        #   intervallist_file,    Made from BED file.
        #   cov_filename,         Generated by Picard.
        #   targetcov_filename,   Generated by Picard.  Per target coverage.
        #   log_filename,         Output from Picard.
        #   )
        opj = os.path.join
        jobs = []  # list of filelib.GenericObject
        for bam_filename in bam_filenames:
            # <in_path>/<sample>.bam
            in_path, sample, ext = mlib.splitpath(bam_filename)
            assert ext == ".bam"
            clean_bam_filename = opj(out_path, "%s.bam" % sample)
            assert clean_bam_filename != bam_filename
            genomecov_filename = opj(out_path, "%s.genomecov.txt" % sample)
            histo_datafile = opj(out_path, "%s.histo.txt" % sample)
            histo_plotfile = opj(out_path, "%s.histo.png" % sample)
            histo_prismfile = opj(out_path, "%s.prism.txt" % sample)

            intervallist_file = opj(out_path, "%s.interval.txt" % sample)
            cov_filename = opj(out_path, "%s.coverage.txt" % sample)
            targetcov_filename = opj(out_path, "%s.targetcov.txt" % sample)
            log_filename = opj(out_path, "%s.picard.log" % sample)

            x = filelib.GenericObject(sample=sample,
                                      orig_bam_filename=bam_filename,
                                      bam_filename=clean_bam_filename,
                                      genomecov_filename=genomecov_filename,
                                      histo_datafile=histo_datafile,
                                      histo_plotfile=histo_plotfile,
                                      histo_prismfile=histo_prismfile,
                                      intervallist_file=intervallist_file,
                                      cov_filename=cov_filename,
                                      targetcov_filename=targetcov_filename,
                                      log_filename=log_filename)
            #x = sample, bam_filename, genomecov_filename, \
            #    histo_datafile, histo_plotfile, histo_prismfile, \
            #    intervallist_file, cov_filename, targetcov_filename, \
            #    log_filename
            jobs.append(x)

        # Remove unmapped reads from the BAM files.
        # Need to remove the unmapped reads or Picard might complain:
        # Exception in thread "main"
        # htsjdk.samtools.SAMFormatException: SAM validation error:
        # ERROR: Record 154286082, Read name
        # DF9F08P1:326:C5KJFACXX:5:1304:12068:90850, MAPQ should be 0
        # for unmapped read.
        #
        # This can happen with BWA generated alignments.
        cmds = []
        for x in jobs:
            x = _make_samtools_filter_cmd(x.orig_bam_filename, x.bam_filename)
            cmds.append(x)
        parallel.pshell(cmds, max_procs=num_cores)
        x = [x.bam_filename for x in jobs]
        filelib.assert_exists_nz_many(x)

        # Generate the intervallist_file(s).
        if features_bed:
            cmds = []
            for x in jobs:
                args = x.intervallist_file, features_bed, x.bam_filename
                x = _make_intervallist_file, args, {}
                cmds.append(x)
            parallel.pyfun(cmds, num_procs=num_cores)

        # Make the commands to run picard.
        if features_bed:
            commands = []
            for x in jobs:
                x = _make_calculatehsmetrics_command(
                    x.intervallist_file, x.bam_filename, x.cov_filename,
                    x.targetcov_filename, ref.fasta_file_full, x.log_filename)
                commands.append(x)
            metadata["commands"].append(commands)
            parallel.pshell(commands, max_procs=num_cores)

            x1 = [x.cov_filename for x in jobs]
            x2 = [x.targetcov_filename for x in jobs]
            filelib.assert_exists_nz_many(x1 + x2)

        # Use genomecov to count read depth.
        x = _run_genomecov(jobs, ref_node.identifier, num_cores)
        metadata["commands"].append(x)

        # Summarize the average read depth.
        summary_file = opj(out_path, "summary.xls")
        _summarize_average_read_depth(jobs, min_coverage, summary_file)

        # Make histograms of the distribution of the read depth for
        # each sample.
        for x in jobs:
            _make_histo_file(x.genomecov_filename, x.histo_datafile)

        # Delete the filtered BAM files to save space.
        for x in jobs:
            filelib.assert_exists_nz(x.bam_filename)
            os.unlink(x.bam_filename)
        return metadata
Example 16
def merge_vcf_files(vcf_filenames, out_filename, num_cores, tmp_path):
    # Put indexed files in tmp_path.
    import os
    import stat
    import shutil
    from genomicode import filelib
    from genomicode import hashlib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    # TODO: find the version number of these tools.
    bgzip = mlib.findbin("bgzip")
    tabix = mlib.findbin("tabix")
    bcftools = mlib.findbin("bcftools")
    sq = parallel.quote

    tmp_path = os.path.realpath(tmp_path)
    filelib.safe_mkdir(tmp_path)

    # Keep track of all commands run.
    metadata = {}
    metadata["commands"] = []

    # Ignore VCF files that don't have any variants.
    vcf_filenames = [x for x in vcf_filenames if os.stat(x)[stat.ST_SIZE] > 0]

    # If there are no VCF files with any variants, then just create an
    # empty outfile and return.
    if not vcf_filenames:
        open(out_filename, 'w').close()
        return metadata

    # 1.  Copy VCF files to temporary directory.             tmp_filename
    # 2.  Fix VCF files (e.g. NextGENe, JointSNVMix broken)
    # 3.  Sort the VCF files (needed for tabix)
    # 4.  Compress  (bgzip)
    # 5.  Index     (tabix)
    # 6.  Merge

    jobs = []
    for in_filename in vcf_filenames:
        path, root, ext = mlib.splitpath(in_filename)
        sample = root
        x = "%s%s" % (hashlib.hash_var(root), ext)
        tmp_filename = os.path.join(tmp_path, x)
        x = filelib.GenericObject(
            sample=sample,
            in_filename=in_filename,
            tmp_filename=tmp_filename,
        )
        jobs.append(x)

    # Make sure temporary files are unique.
    seen = {}
    for j in jobs:
        assert j.tmp_filename not in seen
        seen[j.tmp_filename] = 1

    # Merge them in order of sample.  The germline sample will be
    # duplicated, and we will know the order of the germline sample.
    schwartz = [(x.sample, x) for x in jobs]
    schwartz.sort()
    jobs = [x[-1] for x in schwartz]

    # Copy all the VCF files to a temporary directory.
    for j in jobs:
        shutil.copy2(j.in_filename, j.tmp_filename)

    #for j in jobs:
    #    make_file_smaller(j.tmp_filename, 1000)

    for j in jobs:
        # NextGENe creates broken VCF files.  Fix them.
        fix_nextgene_vcf(j.tmp_filename)
        # JointSNVMix creates broken VCF files.  Fix them.
        fix_jointsnvmix_vcf(j.tmp_filename)

    for j in jobs:
        sort_vcf_file(j.tmp_filename)

    ## # Since we are merging the files, we need to make sure that
    ## # each file has a unique name.  If the names aren't unique,
    ## # then make them unique by adding the name of the file.
    ## all_unique = True
    ## seen = {}
    ## for x in jobs:
    ##     sample, in_filename, tmp_filename = x
    ##     samples = _get_samples_from_vcf(tmp_filename)
    ##     for s in samples:
    ##         if s in seen:
    ##             all_unique = False
    ##             break
    ##         seen[s] = 1
    ##     if not all_unique:
    ##         break
    ## if not all_unique:
    ##     for x in jobs:
    ##         sample, in_filename, tmp_filename = x
    ##         _uniquify_samples_in_vcf(tmp_filename, sample)

    # Compress the VCF files.
    # bgzip file.vcf
    commands = []
    for j in jobs:
        x = "%s %s" % (sq(bgzip), sq(j.tmp_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores, path=tmp_path)
    metadata["commands"].extend(commands)
    metadata["num_cores"] = num_cores
    x = ["%s.gz" % x.tmp_filename for x in jobs]
    filelib.assert_exists_nz_many(x)

    # Index the VCF files.
    # tabix -p vcf file.vcf.gz
    commands = []
    for j in jobs:
        x = "%s -p vcf %s.gz" % (sq(tabix), sq(j.tmp_filename))
        commands.append(x)
    parallel.pshell(commands, max_procs=num_cores, path=tmp_path)
    metadata["commands"].extend(commands)
    x = ["%s.gz.tbi" % j.tmp_filename for j in jobs]
    filelib.assert_exists_nz_many(x)

    # Run bcftools
    ## For VCF files from somatic calls, the germline sample will
    ## be duplicated.  Add --force-samples to make sure this is
    ## still merged.

    # Since all the VCF files are appended to the command line, it's
    # easy to run into this error:
    # OSError: [Errno 7] Argument list too long
    #
    # To reduce the chance of this, figure out the path of the
    # tmp_filename, and run the analysis in that path so we can
    # use relative filenames.
    tmp_path = None
    for j in jobs:
        path, file_ = os.path.split(j.tmp_filename)
        if tmp_path is None:
            tmp_path = path
        assert path == tmp_path

    cmd = [
        sq(bcftools),
        "merge",
        "-o %s" % sq(out_filename),
        "-O v",
        "--force-samples",
    ]
    for j in jobs:
        path, file_ = os.path.split(j.tmp_filename)
        assert path == tmp_path
        cmd.append("%s.gz" % file_)
    x = " ".join(cmd)
    parallel.sshell(x, path=tmp_path)
    metadata["commands"].append(x)

    return metadata
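
The compress/index/merge sequence above reduces to three external tools: bgzip and tabix once per sorted VCF, then a single bcftools merge over all of them.  A minimal standalone sketch of the same pipeline, assuming bgzip, tabix, and bcftools are on the PATH (the module above resolves them through its configuration and parallelizes the per-file steps with parallel.pshell):

import os
import subprocess

def merge_sorted_vcfs(vcf_filenames, out_filename, tmp_path):
    # Compress and index each (already sorted) VCF, then merge them.
    # Assumes every file in vcf_filenames lives inside tmp_path and that
    # out_filename is an absolute path, since bcftools runs in tmp_path.
    gz_files = []
    for filename in vcf_filenames:
        subprocess.check_call(["bgzip", "-f", filename])
        subprocess.check_call(["tabix", "-p", "vcf", filename + ".gz"])
        gz_files.append(os.path.basename(filename) + ".gz")
    # Run in tmp_path with relative filenames to keep the command line
    # short (avoids "OSError: [Errno 7] Argument list too long").
    cmd = ["bcftools", "merge", "--force-samples", "-O", "v",
           "-o", out_filename] + gz_files
    subprocess.check_call(cmd, cwd=tmp_path)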
Example n. 17
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, ref_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        # Do a quick check to make sure the reference is correct.
        # Otherwise, the error may be hard to diagnose.
        alignlib.assert_is_STAR_reference(ref.path)

        metadata = {}
        metadata["tool"] = "STAR %s" % alignlib.get_STAR_version()

        x = mlib.get_user_option(user_options,
                                 "two_pass",
                                 allowed_values=["no", "yes"])
        two_pass = (x == "yes")

        # Figure out the strandedness.
        is_stranded = stranded.stranded != "unstranded"

        # STAR --runThreadN 40 --genomeDir test05 \
        #   --readFilesIn test.fastq/test03_R1_001.fastq \
        #   test.fastq/test03_R2_001.fastq --outFileNamePrefix test06.
        # If unstranded, add --outSAMstrandField intronMotif

        # Make a list of the jobs to run.
        jobs = []  # list of filelib.GenericObject objects
        for x in fastq_files:
            sample, pair1, pair2 = x
            pass1_out_prefix = "p1.%s." % sample
            pass2_out_prefix = "%s." % sample
            pass1_bam_filename = os.path.join(
                out_path, "%sAligned.out.bam" % pass1_out_prefix)
            pass2_bam_filename = os.path.join(
                out_path, "%sAligned.out.bam" % pass2_out_prefix)
            sjdb_filename = os.path.join(out_path, "p1.%s.SJ.out.tab" % sample)
            log1_filename = os.path.join(out_path, "p1.%s.log" % sample)
            log2_filename = os.path.join(out_path, "%s.log" % sample)

            x = filelib.GenericObject(
                sample=sample,
                pair1=pair1,
                pair2=pair2,
                pass1_out_prefix=pass1_out_prefix,
                pass2_out_prefix=pass2_out_prefix,
                pass1_bam_filename=pass1_bam_filename,
                pass2_bam_filename=pass2_bam_filename,
                sjdb_filename=sjdb_filename,
                log1_filename=log1_filename,
                log2_filename=log2_filename,
            )
            jobs.append(x)

        # Run pass 1.
        commands = []
        for j in jobs:
            x = os.path.join(out_path, j.pass1_out_prefix)
            cmd = alignlib.make_STAR_command(ref.path, x, num_cores,
                                             is_stranded, j.pair1, j.pair2,
                                             j.log1_filename)
            # For debugging.  If this file already exists, skip it.
            if not filelib.exists_nz(j.pass1_bam_filename):
                parallel.sshell(cmd, path=out_path)
            filelib.assert_exists_nz(j.pass1_bam_filename)
            commands.append(cmd)

        if two_pass:
            # Make a new index with the splice junction information.
            sj_index = os.path.join(out_path, "genome.2pass")
            x = [x.sjdb_filename for x in jobs]
            filelib.assert_exists_nz_many(x)
            x = alignlib.make_STAR_index_command(ref.fasta_file_full,
                                                 sj_index,
                                                 sjdb_files=x,
                                                 num_cores=num_cores)
            x = "%s >& genome.2pass.log" % x
            commands.append(x)

            # For debugging.  If this file already exists, skip it.
            if not filelib.exists_nz("genome.2pass.log"):
                parallel.sshell(x, path=out_path)
            alignlib.assert_is_STAR_reference(sj_index)

        # Run pass 2.
        for j in jobs:
            # For debugging.  If this file already exists, skip it.
            if os.path.exists(j.pass2_bam_filename):
                continue
            if two_pass:
                x = os.path.join(out_path, j.pass2_out_prefix)
                cmd = alignlib.make_STAR_command(sj_index, x, num_cores,
                                                 is_stranded, j.pair1, j.pair2,
                                                 j.log2_filename)
                parallel.sshell(cmd, path=out_path)
                commands.append(cmd)
            else:
                # link pass1_bam_filename to pass2_bam_filename
                os.symlink(j.pass1_bam_filename, j.pass2_bam_filename)
                continue
            filelib.assert_exists_nz(j.pass2_bam_filename)

        metadata["commands"] = commands
        metadata["num_cores"] = num_cores

        # STAR takes 28 Gb per process.  Make sure we don't use up
        # more memory than is available on the machine.
        # Defaults:
        # --limitGenomeGenerateRAM   31000000000
        # --outFilterMismatchNmax    10             Num mismatches.
        #nc = mlib.calc_max_procs_from_ram(50, buffer=100, upper_max=num_cores)
        #metadata["num_cores"] = nc
        #parallel.pshell(commands, max_procs=nc, path=out_path)

        # Make sure the analysis completed successfully.
        #x = [x[-2] for x in jobs]  # sam_filename
        #filelib.assert_exists_nz_many(x)
        return metadata
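
The commented-out calc_max_procs_from_ram call reflects the note above that STAR needs roughly 28 Gb per process.  A minimal sketch of that kind of RAM-based throttling, assuming a Linux /proc/meminfo; the real mlib.calc_max_procs_from_ram may measure memory differently and take different arguments:

def max_procs_from_ram(gb_per_proc, buffer_gb=32, upper_max=None):
    # Cap the number of concurrent processes by the machine's total RAM,
    # leaving buffer_gb free for everything else.
    total_gb = 0.0
    for line in open("/proc/meminfo"):
        if line.startswith("MemTotal:"):
            total_gb = int(line.split()[1]) / (1024.0 * 1024)  # kB -> Gb
            break
    nc = int((total_gb - buffer_gb) / gb_per_proc)
    nc = max(nc, 1)
    if upper_max is not None:
        nc = min(nc, upper_max)
    return nc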
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import config
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import filelib
        from Betsy import module_utils

        bam_node, ref_node, pos_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        # Positions file has 0-based coordinates (like BAM files).
        # But samtools requires 1-based coordinates.  Convert to
        # 1-based coordinates.
        positions_filename = "positions.txt"
        outhandle = open(positions_filename, 'w')
        for x in filelib.read_cols(pos_node.identifier):
            assert len(x) == 2
            chrom, pos = x
            pos = int(pos) + 1  # convert from 0- to 1-based coords.
            x = chrom, pos
            print >> outhandle, "\t".join(map(str, x))
        outhandle.close()

        # list of (in_filename, err_filename, out_filename)
        jobs = []
        for in_filename in bam_filenames:
            p, f = os.path.split(in_filename)
            sample, ext = os.path.splitext(f)
            err_filename = os.path.join(out_path, "%s.log" % sample)
            out_filename = os.path.join(out_path, "%s.pileup" % sample)
            x = filelib.GenericObject(in_filename=in_filename,
                                      err_filename=err_filename,
                                      out_filename=out_filename)
            jobs.append(x)

        ## Get possible positions file.
        #positions_filename = module_utils.get_user_option(
        #    user_options, "positions_file", check_file=True)

        # Figure out whether the purpose is to get coverage.  Change
        # the parameters if it is.
        assert "vartype" in out_attributes
        vartype = out_attributes["vartype"]
        assert vartype in ["all", "snp", "indel", "consensus"]
        #if cov == "yes":
        #    assert positions_filename, "Missing: positions_file"

        # samtools mpileup -l freq04.txt -R -B -q 0 -Q 0 -d10000000 \
        #   -f genomes/Broad.hg19/Homo_sapiens_assembly19.fasta \
        #   $i > $j"
        samtools = filelib.which_assert(config.samtools)

        # Get an error if the BAM files are not indexed.
        # [W::bam_hdr_read] EOF marker is absent. The input is probably
        #   truncated.

        #if vartype == "consensus":
        #    args = [
        #        "-R",        # Ignore read group tags.
        #        "-B",        # Disable BAQ (base quality) computation.
        #        "-q", 0,     # Skip bases with mapQ smaller than this.
        #        "-Q", 0,     # Skip bases with BAQ smaller than this.
        #        "-d10000000",  # Allow deep reads.
        #        ]
        #else:
        #    raise NotImplementedError
        args = [
            "-R",  # Ignore read group tags.
            "-B",  # Disable BAQ (base quality) computation.
            "-q",
            0,  # Skip bases with mapQ smaller than this.
            "-Q",
            0,  # Skip bases with BAQ smaller than this.
            "-d10000000",  # Allow deep reads.
        ]

        sq = parallel.quote
        commands = []
        for j in jobs:
            x = [
                sq(samtools),
                "mpileup",
                "-f",
                sq(ref.fasta_file_full),
            ]
            if positions_filename:
                x.extend(["-l", positions_filename])
            x.extend(args)
            x.append(sq(j.in_filename))
            x = " ".join(map(str, x))
            x = "%s 2> %s 1> %s" % (x, j.err_filename, j.out_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] = commands

        # File may be empty if there are no reads.
        x = [x.out_filename for x in jobs]
        filelib.assert_exists_many(x)

        # Make sure there's no errors in the log files.
        for j in jobs:
            check_log_file(j.err_filename)

        return metadata
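
check_log_file is defined elsewhere in this module.  A hypothetical sketch of what such a check could look like for samtools mpileup stderr output; the patterns are guesses and the real helper may look for different ones:

def check_log_file(filename):
    # Fail if the samtools log contains anything that looks like a real
    # error; plain progress or warning lines are allowed through.
    for line in open(filename):
        x = line.lower()
        if line.startswith("[E::") or "error" in x or "abort" in x:
            raise AssertionError(
                "Error in %s: %s" % (filename, line.strip()))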
Example n. 19
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import alignlib
        from genomicode import parallel
        from genomicode import hashlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, reference_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        assert fastq_files, "I could not find any FASTQ files."
        ref = alignlib.create_reference_genome(reference_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

        # Figure out whether to align to genome or transcriptome.
        x = out_attributes["align_to"]
        assert x in ["genome", "transcriptome"]
        align_to_genome = (x == "genome")

        # RSEM makes files:
        # <sample_name>.genome.bam
        # <sample_name>.transcript.bam
        # <sample_name>.genes.results
        # <sample_name>.isoforms.results
        # <sample_name>.stat
        #
        # Does not work right if there is a space in the sample name.
        # Therefore, use a hashed sample name and rename the files
        # later.

        # Make a list of the jobs to run.
        jobs = []
        for x in fastq_files:
            sample, pair1, pair2 = x
            sample_h = hashlib.hash_var(sample)

            x1, x2, x3 = mlib.splitpath(pair1)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair1_h = os.path.join(out_path, x)
            # pair2_h stays None for single-end samples.
            pair2_h = None
            if pair2:
                x1, x2, x3 = mlib.splitpath(pair2)
                x = "%s%s" % (hashlib.hash_var(x2), x3)
                pair2_h = os.path.join(out_path, x)
            results_filename = os.path.join(out_path,
                                            "%s.genes.results" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = filelib.GenericObject(sample=sample,
                                      sample_h=sample_h,
                                      pair1=pair1,
                                      pair2=pair2,
                                      pair1_h=pair1_h,
                                      pair2_h=pair2_h,
                                      results_filename=results_filename,
                                      log_filename=log_filename)
            jobs.append(x)

        # Make sure hashed samples are unique.
        seen = {}
        for j in jobs:
            assert j.sample_h not in seen, \
                   "Dup (%d): %s" % (len(jobs), j.sample_h)
            assert j.pair1_h not in seen
            seen[j.sample_h] = 1
            seen[j.pair1_h] = 1
            if j.pair2_h:
                assert j.pair2_h not in seen
                seen[j.pair2_h] = 1

        # Symlink the fastq files.
        for j in jobs:
            os.symlink(j.pair1, j.pair1_h)
            if j.pair2:
                os.symlink(j.pair2, j.pair2_h)

        s2fprob = {
            "unstranded": None,
            "firststrand": 0.0,
            "secondstrand": 1.0,
        }
        assert stranded.stranded in s2fprob, "Unknown stranded: %s" % \
               stranded.stranded
        forward_prob = s2fprob[stranded.stranded]

        # How much memory for bowtie.  May need to increase this if
        # there are lots of memory warnings in the log files:
        #   Warning: Exhausted best-first chunk memory for read
        #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
        #   (patid 2076693); skipping read
        # Default is 64.
        # Seems like too high a value can cause problems.
        #chunkmbs = 4*1024   # Generates warnings.
        chunkmbs = 512

        # Get lots of warnings with bowtie:
        # Warning: Detected a read pair whose two mates have different names

        # Use STAR aligner instead.
        use_STAR = True

        sq = parallel.quote
        commands = []
        for j in jobs:
            # Debug: If the results file exists, don't run it again.
            if filelib.exists_nz(j.results_filename) and \
                   filelib.exists(j.log_filename):
                continue
            # If using the STAR aligner, the most memory-efficient way
            # is to let STAR handle the multiprocessing itself.
            nc = max(1, num_cores / len(jobs))
            if use_STAR:
                nc = num_cores

            keywds = {}
            if use_STAR:
                keywds["align_with_star"] = True
            else:
                keywds["align_with_bowtie2"] = True
            x = alignlib.make_rsem_command(ref.fasta_file_full,
                                           j.sample_h,
                                           j.pair1_h,
                                           fastq_file2=j.pair2_h,
                                           forward_prob=forward_prob,
                                           output_genome_bam=align_to_genome,
                                           bowtie_chunkmbs=chunkmbs,
                                           num_threads=nc,
                                           **keywds)
            x = "%s >& %s" % (x, sq(j.log_filename))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num cores"] = num_cores
        # Need to run in out_path.  Otherwise, files will be everywhere.
        nc = num_cores
        if use_STAR:
            nc = 1
        parallel.pshell(commands, max_procs=nc, path=out_path)

        # Rename the hashed sample names back to the original unhashed
        # ones.
        files = os.listdir(out_path)
        rename_files = []  # list of (src, dst)
        for j in jobs:
            if j.sample == j.sample_h:
                continue
            for f in files:
                if not f.startswith(j.sample_h):
                    continue
                src = os.path.join(out_path, f)
                x = j.sample + f[len(j.sample_h):]
                dst = os.path.join(out_path, x)
                rename_files.append((src, dst))
        for src, dst in rename_files:
            filelib.assert_exists(src)
            os.rename(src, dst)

        # Delete the symlinked fastq files.
        for j in jobs:
            filelib.safe_unlink(j.pair1_h)
            if j.pair2_h:
                filelib.safe_unlink(j.pair2_h)

        # Make sure the analysis completed successfully.
        x1 = [x.results_filename for x in jobs]
        x2 = [x.log_filename for x in jobs]
        filelib.assert_exists_nz_many(x1 + x2)

        return metadata
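
The exact command line built by alignlib.make_rsem_command is not shown in this module.  Assuming it wraps rsem-calculate-expression, the command for one paired-end, first-strand sample aligned with STAR would look roughly like the sketch below; the flags shown are standard RSEM options, but the wrapper may add or rename some, and the reference argument is the RSEM index prefix rather than the raw FASTA:

# Hypothetical reconstruction of one RSEM command.
cmd = " ".join([
    "rsem-calculate-expression",
    "--paired-end",
    "--star",                    # or --bowtie2 when use_STAR is False
    "--forward-prob", "0.0",     # firststrand; 1.0 secondstrand; omit if unstranded
    "--output-genome-bam",       # only when aligning to the genome
    "-p", "8",                   # num_threads
    "sample_R1.fastq", "sample_R2.fastq",
    "/path/to/rsem_reference",   # index prefix derived from ref.fasta_file_full
    "sample_hashed_name",
])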
Example n. 20
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out GATK version.

        ## Figure out whether the user wants SNPs or INDELs.
        #assert "vartype" in out_attributes
        #vartype = out_attributes["vartype"]
        #assert vartype in ["all", "snp", "indel"]

        jobs = []
        for bam_filename in bam_filenames:
            p, f = os.path.split(bam_filename)
            sample, ext = os.path.splitext(f)
            #raw_outfile = os.path.join(out_path, "%s.raw" % sample)
            vcf_outfile = os.path.join(out_path, "%s.vcf" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = filelib.GenericObject(bam_filename=bam_filename,
                                      vcf_outfile=vcf_outfile,
                                      log_filename=log_filename)
            jobs.append(x)

        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar
        #   -T HaplotypeCaller -R ucsc.hg19.fasta
        #   -dontUseSoftClippedBases -stand_call_conf 20.0
        #   -stand_emit_conf 20.0 -I $i -o $j

        # Make a list of commands.
        commands = []
        for j in jobs:
            # For debugging.  If exists, don't do it again.
            #if filelib.exists_nz(j.raw_outfile):
            if filelib.exists_nz(j.vcf_outfile):
                continue
            x = alignlib.make_GATK_command(T="HaplotypeCaller",
                                           R=ref.fasta_file_full,
                                           dontUseSoftClippedBases=None,
                                           stand_call_conf=20.0,
                                           stand_emit_conf=20.0,
                                           I=j.bam_filename,
                                           o=j.vcf_outfile)
            x = "%s >& %s" % (x, j.log_filename)
            commands.append(x)

        parallel.pshell(commands, max_procs=num_cores)

        # Filter each of the VCF files.
        #for j in jobs:
        #    filter_by_vartype(vartype, j.raw_outfile, j.vcf_outfile)
        #metadata["filter"] = vartype

        # Make sure the analysis completed successfully.
        x = [j.vcf_outfile for j in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
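
alignlib.make_GATK_command turns keyword arguments into GATK flags; a value of None becomes a bare flag (as with dontUseSoftClippedBases above).  A minimal sketch of that convention, under the assumption that the real helper also prepends the java -jar invocation and handles quoting:

def make_gatk_command_sketch(gatk_jar, **keywds):
    # Hypothetical reconstruction; the real alignlib.make_GATK_command
    # may locate the jar, order the flags, and quote values differently.
    cmd = ["java", "-Xmx5g", "-jar", gatk_jar]
    for name, value in sorted(keywds.items()):
        cmd.append("-%s" % name)
        if value is not None:
            cmd.append(str(value))
    return " ".join(cmd)

# e.g. make_gatk_command_sketch(
#     "GenomeAnalysisTK.jar", T="HaplotypeCaller", R="ref.fasta",
#     dontUseSoftClippedBases=None, stand_call_conf=20.0,
#     I="sample.bam", o="sample.vcf")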
Example n. 21
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        import shutil
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_filenames = mlib.find_bam_files(in_data.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}
        metadata["tool"] = "bam2fastx (unknown version)"

        # Somehow bam2fastx doesn't work if there are spaces in the
        # filename.  Make a temporary filename with no spaces, and
        # then rename it later.
        # Actually, may not be bam2fastx's fault.

        jobs = []
        for i, bam_filename in enumerate(bam_filenames):
            p, f, e = mlib.splitpath(bam_filename)
            #bai_filename = alignlib.find_bai_file(bam_filename)
            #assert bai_filename, "Missing index for: %s" % bam_filename
            #temp_bam_filename = "%d.bam" % i
            #temp_bai_filename = "%d.bam.bai" % i
            #temp_fa_filename = "%d.fa" % i
            fa_filename = os.path.join(out_path, "%s.fa" % f)
            x = filelib.GenericObject(
                bam_filename=bam_filename,
                #bai_filename=bai_filename,
                #temp_bam_filename=temp_bam_filename,
                #temp_bai_filename=temp_bai_filename,
                #temp_fa_filename=temp_fa_filename,
                fa_filename=fa_filename)
            jobs.append(x)
        bam2fastx = mlib.findbin("bam2fastx")

        # Link all the bam files.
        #for j in jobs:
        #    assert not os.path.exists(j.temp_bam_filename)
        #    #assert not os.path.exists(j.temp_bai_filename)
        #    os.symlink(j.bam_filename, j.temp_bam_filename)
        #    #os.symlink(j.bai_filename, j.temp_bai_filename)

        commands = []
        for j in jobs:
            # bam2fastx -A --fasta -o rqc14.fa rqc11.bam
            x = [
                mlib.sq(bam2fastx),
                "-A",
                "--fasta",
                #"-o", mlib.sq(j.temp_fa_filename),
                #mlib.sq(j.temp_bam_filename),
                "-o", mlib.sq(j.fa_filename),
                mlib.sq(j.bam_filename),
                ]
            x = " ".join(x)
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores)

        #for j in jobs:
        #    # Move the temporary files to the final location.
        #    shutil.move(j.temp_fa_filename, j.fa_filename)
        #    # Remove the link to the BAM file.
        #    os.unlink(j.temp_bam_filename)
        
        x = [j.fa_filename for j in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
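
The commented-out lines above outline the workaround for spaces in filenames: symlink each BAM to a space-free temporary name, run the tool on the link, and move the output into place afterwards.  A minimal standalone version of that pattern (the helper name is a placeholder):

import os
import shutil
import subprocess

def bam2fastx_spacefree(i, bam_filename, fa_filename):
    # Give bam2fastx a space-free symlink, then rename the output and
    # remove the link.
    temp_bam = "%d.bam" % i
    temp_fa = "%d.fa" % i
    os.symlink(bam_filename, temp_bam)
    try:
        subprocess.check_call(
            ["bam2fastx", "-A", "--fasta", "-o", temp_fa, temp_bam])
        shutil.move(temp_fa, fa_filename)
    finally:
        os.unlink(temp_bam)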
Example n. 22
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out GATK version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        x, x, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"
        ]

        cosmic_file = mlib.get_user_option(user_options,
                                           "mutect_cosmic_vcf",
                                           not_empty=True,
                                           check_file=True)
        dbsnp_file = mlib.get_user_option(user_options,
                                          "mutect_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(normal_sample=normal_sample,
                                      cancer_sample=cancer_sample,
                                      normal_bamfile=normal_bamfile,
                                      cancer_bamfile=cancer_bamfile,
                                      vcf_outfile=vcf_outfile,
                                      log_outfile=log_outfile)
            jobs.append(x)

        # java -jar GenomeAnalysisTK.jar \
        #   -T MuTect2 \
        #   -R reference.fasta \
        #   -I:tumor tumor.bam \
        #   -I:normal normal.bam \
        #   [--dbsnp dbSNP.vcf] \
        #   [--cosmic COSMIC.vcf] \
        #   [-L targets.interval_list] \
        #   -o output.vcf

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            UNHASHABLE = [
                ("I:normal", sq(j.normal_bamfile)),
                ("I:tumor", sq(j.cancer_bamfile)),
                # --dbsnp and --cosmic use two dashes, for some
                # reason.  Since make_GATK_command only uses one dash,
                # add one manually.
                ("-dbsnp", sq(dbsnp_file)),
                ("-cosmic", sq(cosmic_file)),
            ]
            x = alignlib.make_GATK_command(
                T="MuTect2",
                R=sq(ref.fasta_file_full),
                L=sq(interval_node.identifier),
                o=sq(j.vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
            )
            x = "%s >& %s" % (x, j.log_outfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for i, j in enumerate(jobs):
            # Pull out the error lines.
            x = [x for x in open(j.log_outfile)]
            x = [x for x in x if x.startswith("##### ERROR")]
            x = "".join(x)
            msg = "MuTect2 error [%s]:\n%s\n%s" % (cancer_sample, commands[i],
                                                   x)
            assert not x, msg

        # Make sure output VCF files exist.
        x = [x.vcf_outfile for x in jobs]
        filelib.assert_exists_many(x)

        # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
        # them with the actual names.
        for j in jobs:
            call_somatic_varscan._fix_normal_cancer_names(
                j.vcf_outfile, j.normal_sample, j.cancer_sample)

        return metadata
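
The log check above can be factored into a small helper that pulls out the "##### ERROR" block GATK writes on failure, which keeps the assert short; a sketch (the module above keeps the check inline):

def read_gatk_errors(log_filename):
    # Return the "##### ERROR" lines from a GATK log, or "" if clean.
    x = [x for x in open(log_filename) if x.startswith("##### ERROR")]
    return "".join(x)

# Usage, mirroring the inline check:
# for i, j in enumerate(jobs):
#     err = read_gatk_errors(j.log_outfile)
#     assert not err, "MuTect2 error [%s]:\n%s\n%s" % (
#         j.cancer_sample, commands[i], err)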
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        import os
        import arrayio
        from genomicode import filelib
        from Betsy import bie3
        from Betsy import rulebase
        from Betsy import read_label_file

        cls_node, data_node = antecedents
        M = arrayio.read(data_node.identifier)
        x = read_label_file.read(cls_node.identifier)
        a, training_label, second_line = x

        predict_model = __import__(
            'Betsy.modules.' + 'classify_with_random_forest', globals(),
            locals(), ['classify_with_random_forest'], -2)
        evaluate_model = __import__('Betsy.modules.' + 'evaluate_prediction',
                                    globals(), locals(),
                                    ['evaluate_prediction'], -2)

        full_index = range(M.ncol())

        f = file(outfile, 'w')
        f.write('\t'.join([
            'sample_name', 'Predicted_class', 'Confidence', 'Actual_class',
            'Correct?'
        ]))
        f.write('\n')
        for i in range(M.ncol()):
            # Make filenames
            # gene expression for N samples.
            merge_file = 'merge' + '_' + str(i)
            # class label file for the training samples (samples 1-(N-1)).
            train_label = 'train_label' + '_' + str(i)
            # class label file for the test sample (sample N).
            test_label = 'test_label' + '_' + str(i)
            # Save the output of the prediction and evaluation.
            predict_file = "predict.txt"
            evaluate_file = "evaluate.txt"

            test_index = i
            train_index = full_index[:]
            train_index.remove(test_index)
            merge_index = train_index + [test_index]
            y_training = [training_label[x] for x in train_index]
            y_test = [training_label[test_index]]

            # Write the files for this iteration.
            M_merge = M.matrix(None, merge_index)
            arrayio.gct_format.write(M_merge, open(merge_file, 'w'))
            read_label_file.write(train_label, second_line, y_training)
            read_label_file.write(test_label, second_line, y_test[0])

            # Make objects to be used in this analysis.
            x = rulebase.SignalFile.output(format='gct',
                                           contents='class0,class1,test')
            merge_data = bie3.IdentifiedDataNode(x, identifier=merge_file)
            x = rulebase.ClassLabelFile.output(contents='class0,class1')
            train_label_data = bie3.IdentifiedDataNode(x,
                                                       identifier=train_label)
            x = rulebase.ClassLabelFile.output(contents='test')
            test_label_data = bie3.IdentifiedDataNode(x, identifier=test_label)

            # Make a fake object to pass to evaluate_model.run.
            out_node = filelib.GenericObject()
            out_node.identifier = predict_file

            # Run the predictions.
            x = train_label_data, merge_data
            predict_model.Module().run(network, x, out_attributes,
                                       user_options, num_cores, predict_file)

            # Run the evaluation.
            new_parameters = out_attributes.copy()
            x = test_label_data, out_node
            evaluate_model.Module().run(network, x, new_parameters,
                                        user_options, num_cores, evaluate_file)

            # Is this the right line?
            lines = open(evaluate_file).readlines()

            f.write(lines[1])
            os.remove(merge_file)
            os.remove(train_label)
            os.remove(test_label)
            os.remove(predict_file)
            os.remove(evaluate_file)

        f.close()
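
The loop above is leave-one-out cross-validation: each iteration holds out one column as the test sample, trains on the remaining columns, and appends the held-out column last so the merged matrix has a known layout.  The index bookkeeping in isolation:

def loo_splits(ncol):
    # Yield (train_index, merge_index, test_index) for leave-one-out CV.
    # merge_index lists the training columns followed by the test column,
    # matching the column order written to the merged GCT file above.
    full_index = range(ncol)
    for test_index in full_index:
        train_index = [i for i in full_index if i != test_index]
        yield train_index, train_index + [test_index], test_index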
Example n. 24
0
def main():
    import os
    import sys
    import time
    import argparse
    import shutil
    from genomicode import parselib
    from genomicode import filelib
    from Betsy import config
    from Betsy import rule_engine
    from Betsy import module_utils as mlib

    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", default=0, action="count")
    parser.add_argument("--running",
                        "--run",
                        dest="running",
                        action="store_true",
                        help="Show only running processes.")
    parser.add_argument("--broken",
                        action="store_true",
                        help="Show only broken processes.")
    #parser.add_argument(
    #    "--clean_broken", action="store_true",
    #    help="Remove all broken analyses.")
    parser.add_argument(
        "--clear_cache",
        help="Clear out old analyses in the cache.  Argument is the "
        "amount of bytes to be cleared.  Examples: 1000, 1Tb, 500G, 1024Mb.")
    parser.add_argument(
        "--dry_run",
        action="store_true",
        help="Used for --clear_acche.  Just show the directories to clear "
        "rather than actually clearing them.")
    parser.add_argument(
        "--ls",
        action="store_true",
        help="Show the modules in the BETSY cache, sorted by decreasing "
        "modification time.")
    parser.add_argument(
        "--cd",
        help="Show a current working directory to a module in the BETSY "
        "cache.  If the argument is a number (e.g. --goto <num>), will set "
        "the directory to the <num>th most recently created module.  "
        "If the argument is a string, will set the directory to the most "
        "recently created module whose directory name contains that string.")

    args = parser.parse_args()
    args.clean_broken = False

    output_path = config.CACHE_PATH
    if not os.path.exists(output_path):
        return
    output_path = os.path.realpath(output_path)

    assert not (args.ls and args.cd)
    if args.ls:
        list_directory(output_path)
        return
    if args.cd:
        change_directory(output_path, args.cd)
        return

    print "BETSY cache path: %s" % output_path
    print

    bytes_to_clear = None
    if args.clear_cache:
        bytes_to_clear = parse_clear_cache(args.clear_cache)
        #print "Clearing %d bytes" % bytes_to_clear

    # GenericObject with path (full path), size, status, last_accessed.
    path_info = []

    # Don't sort.  Just print as it comes out for speed.
    for x in os.listdir(output_path):
        if x.startswith("tmp"):  # OBSOLETE
            continue
        x = os.path.join(output_path, x)
        if not os.path.isdir(x):
            continue
        path = x

        p, f = os.path.split(path)
        # index_bam_folder__B006__617b92ee4d313bcd0148b1ab6a91b12f
        x = f.split("__")
        if len(x) != 3:
            print "Unrecognized path: %s" % path
            continue
        module_name, version, hash_ = x

        # Get the size of this directory.
        size = mlib.get_dirsize(path)

        # See if this module is still running.
        f = os.path.join(path, rule_engine.IN_PROGRESS_FILE)
        IN_PROGRESS = os.path.exists(f)

        if args.running and not IN_PROGRESS:
            continue

        # Read the parameter file.
        params = {}
        x = os.path.join(path, rule_engine.BETSY_PARAMETER_FILE)
        if os.path.exists(x):
            params = rule_engine._read_parameter_file(x)
        assert params.get("module_name", module_name) == module_name

        # Figure out the state of this module.
        status = None
        start_time = None
        if params:
            status = S_DONE
            start_time = params.get("start_time")
            assert start_time, "Missing: start_time"
            time_ = time.strptime(start_time, rule_engine.TIME_FMT)
            #time_str = time.strftime("%a %m/%d %I:%M %p", start_time)
            run_time = params.get("elapsed_pretty")
            if not run_time:
                run_time = "unknown"
            #assert run_time, "Missing elapsed_pretty: %s" % path
            #if run_time == "instant":
            #    x = "ran instantly"
            #else:
            #    x = "took %s" % run_time
        elif IN_PROGRESS:
            status = S_RUNNING
            # Get time that path was created.
            time_ = time.localtime(os.path.getctime(path))
            run_time = None
            #time_str = time.strftime("%a %m/%d %I:%M %p", x)
        else:
            # Get time that path was created.
            status = S_BROKEN
            time_ = time.localtime(os.path.getctime(path))
            #time_ = time.localtime(create_time)
            #time_ = time.strftime("%a %m/%d %I:%M %p", x)
            run_time = None

        if args.broken and status != S_BROKEN:
            continue

        # Figure out the last accessed time.
        last_accessed = None  # seconds since epoch
        x = os.path.join(path, rule_engine.LAST_ACCESSED_FILE)
        if os.path.exists(x):
            last_accessed = os.path.getmtime(x)
        # If I can't find the LAST_ACCESSED_FILE, then use the
        # parameters file.
        x = os.path.join(path, rule_engine.BETSY_PARAMETER_FILE)
        if not last_accessed and os.path.exists(x):
            last_accessed = os.path.getmtime(x)
        # Otherwise, use the path time.
        if not last_accessed:
            last_accessed = os.path.getmtime(path)

        # Record the information for this path.
        x = filelib.GenericObject(module_name=module_name,
                                  path=path,
                                  time_=time_,
                                  size=size,
                                  status=status,
                                  last_accessed=last_accessed,
                                  hash_=hash_,
                                  run_time=run_time)
        path_info.append(x)

        # Print out the time stamp and state.
        if not args.clear_cache:
            x = format_module_summary(x)
            parselib.print_split(x, prefixn=2)

        if status == S_DONE and args.verbose >= 1:
            # Print out the hash components.
            hash_lines = []
            for name, value in params["hash"]:
                x = "%s=%s" % (name, value)
                hash_lines.append(x)
            if hash_lines:
                print "  HASH:"
                for x in hash_lines:
                    parselib.print_split(x, prefix1=4, prefixn=6)
        if status == S_RUNNING and args.verbose >= 1:
            # Print out the files in the directory.
            for x in os.walk(path):
                dirpath, dirnames, filenames = x
                filenames = [os.path.join(dirpath, x) for x in filenames]

                all_files = []  # tuple of (mod time, relative_file, filename)
                for filename in filenames:
                    file_ = os.path.relpath(filename, path)
                    if file_ == rule_engine.IN_PROGRESS_FILE:
                        continue
                    mtime = os.path.getmtime(filename)
                    all_files.append((mtime, file_, filename))
                # Sort by decreasing modification time.
                schwartz = [(-x[0], x) for x in all_files]
                schwartz.sort()
                all_files = [x[-1] for x in schwartz]

                for (mtime, relfile, filename) in all_files:
                    x = time.localtime(mtime)
                    mtime = time.strftime("%a %m/%d %I:%M %p", x)
                    x = os.path.getsize(filename)
                    size = parselib.pretty_filesize(x)
                    x = "[%s]  %s (%s)" % (mtime, relfile, size)
                    parselib.print_split(x, prefix1=2, prefixn=4)

        # Print out the metadata.
        metadata = params.get("metadata", {})
        if args.verbose >= 1:
            for key, value in metadata.iteritems():
                if key in ["commands"]:
                    continue
                x = "%s: %s" % (key.upper(), value)
                parselib.print_split(x, prefix1=2, prefixn=4)
        if args.verbose >= 2:
            for x in metadata.get("commands", []):
                x = "COMMAND: %s" % x
                parselib.print_split(x, prefix1=2, prefixn=4)
                #print "  %s" % x

        if status == S_BROKEN and args.clean_broken:
            shutil.rmtree(path)

        sys.stdout.flush()

    # Figure out which paths to delete.
    if args.clear_cache:
        assert bytes_to_clear

        # Figure out which paths symlink into other paths.
        real2links = {}  # real path -> list of symlinks that point to it
        for p in path_info:
            # Make a list of all the files under this path.
            all_filenames = []
            for x in os.walk(p.path):
                dirpath, dirnames, files = x
                x = [os.path.join(dirpath, x) for x in files]
                all_filenames.extend(x)
            # Follow the symlinks.
            all_filenames = [x for x in all_filenames if os.path.islink(x)]
            all_filenames = [os.path.realpath(x) for x in all_filenames]
            # Look at whether any of these files are in other paths.
            for filename in all_filenames:
                for x in path_info:
                    if x == p:
                        continue
                    if not filename.startswith(x.path):
                        continue
                    if x.path not in real2links:
                        real2links[x.path] = []
                    if p.path not in real2links[x.path]:
                        real2links[x.path].append(p.path)

        # Make a list of the paths that we can't delete.
        # Don't delete any path that is running.
        cant_delete = [x for x in path_info if x.status == S_RUNNING]
        # If we can't delete a path, then we also can't delete any
        # path with a real file that it symlinks into (because then
        # this path would be broken).
        for real_path, linked_paths in real2links.iteritems():
            if real_path in cant_delete:
                continue
            p = [x for x in linked_paths if x in cant_delete]
            if p:
                cant_delete.append(real_path)
        # Sort the paths by priority.
        x = path_info
        x = [x for x in x if x not in cant_delete]
        schwartz = [(get_clear_priority(x), x) for x in x]
        schwartz.sort()
        x = [x[-1] for x in schwartz]
        prioritized = x
        # Add up the sizes until I reach the desired output.
        to_delete = []
        num_bytes = 0
        for i in range(len(prioritized)):
            if num_bytes >= bytes_to_clear:
                break
            to_delete.append(prioritized[i])
            num_bytes += prioritized[i].size
        # Delete the directories.
        paths_to_delete = []
        for info in to_delete:
            x = format_module_summary(info)
            parselib.print_split(x, prefixn=2)
            if not args.dry_run:
                shutil.rmtree(info.path)
            i = path_info.index(info)
            path_info.pop(i)
            # Also delete any paths that symlink into this one.
            x = real2links.get(info.path, [])
            paths_to_delete.extend(x)
        # Delete any of the extra paths (from symlinks).
        for path in paths_to_delete:
            found = False
            for i in range(len(path_info)):
                if path_info[i].path == path:
                    found = True
                    break
            # If already deleted, then ignore.
            if not found:
                continue
            if not args.dry_run:
                shutil.rmtree(path_info[i].path)
            path_info.pop(i)

    # BUG: Does not account for size in tmp directories.
    x = [x.size for x in path_info]
    total_size = sum(x)
    x = parselib.pretty_filesize(total_size)
    print "Used: %s" % x

    x = os.statvfs(output_path)
    free_size = x.f_bavail * x.f_frsize
    x = parselib.pretty_filesize(free_size)
    print "Free: %s" % x