Code Example #1
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        import shutil
        from genomicode import filelib
        from genomicode import vcflib
        from Betsy import module_utils as mlib

        simple_file = in_data.identifier
        metadata = {}

        x = mlib.get_user_option(user_options, "remove_samples")
        x = x.split(",")
        x = [x.strip() for x in x]
        remove_samples = x

        x = mlib.get_user_option(user_options,
                                 "apply_filter",
                                 allowed_values=["no", "yes"])
        apply_filter = (x == "yes")

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])

        name2caller = {}  # name -> Caller object
        for caller in vcflib.CALLERS:
            caller = caller()
            assert caller.name not in name2caller
            name2caller[caller.name] = caller

        TEMPFILE = "temp.txt"
        handle = open(TEMPFILE, 'w')
        it = filelib.read_row(simple_file, header=1)
        print >> handle, "\t".join(it._header)
        for d in it:
            # Find the caller.
            assert d.Caller in name2caller, "Unknown caller: %s" % d.Caller
            caller = name2caller[d.Caller]

            # remove_sample
            if d.Sample in remove_samples:
                continue
            #if remove_radia_rna_samples and d.Sample.endswith("_RNA"):
            #    continue

            # apply_filter
            if apply_filter:
                args = (d.Filter,)
                if d.Caller == "MuSE":
                    args = (d.Filter, wgs_or_wes)
                if not caller.is_pass(*args):
                    continue

            print >> handle, "\t".join(d._cols)
        handle.close()

        shutil.move(TEMPFILE, out_filename)

        return metadata
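
Each Caller object in vcflib exposes an is_pass method that decides whether a row survives filtering; the extra wgs_or_wes argument is only needed for MuSE. The real implementations live in genomicode. A hypothetical sketch of the calling convention, for orientation only:

class MuSECaller:
    name = "MuSE"

    def is_pass(self, filter_value, wgs_or_wes):
        # Hypothetical sketch.  MuSE tiers its calls, and which tiers
        # count as passing typically depends on WGS vs WES data.
        assert wgs_or_wes in ["wgs", "wes"]
        if filter_value == "PASS":
            return True
        # Assumption: accept tiered calls only for WES data.
        return wgs_or_wes == "wes" and filter_value.startswith("Tier")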
Code Example #2
File: run_spp.py Project: firebitsbr/changlab
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from Betsy import module_utils
        import run_MACS14

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample",
                                                      not_empty=True)

        # Set the experiment name.
        name1 = hashlib.hash_var(treat_sample)
        name2 = hashlib.hash_var(control_sample)
        experiment_name = "%s_vs_%s" % (name1, name2)

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        assert control_sample in samples, "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = run_MACS14.find_bam_file(bam_path, treat_sample,
                                                  sample_groups)
        control_filename = run_MACS14.find_bam_file(bam_path, control_sample,
                                                    sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_pyspp_command(treat_filename,
                                 control_filename,
                                 out_path,
                                 num_procs=num_cores)
        log_file = "%s.log" % experiment_name
        cmd = "%s >& %s" % (cmd, log_file)
        parallel.sshell(cmd, path=out_path)

        files = [
            "binding.positions.txt",
            #"broadPeak",
            "crosscorrelation.pdf",
            "density.wig",
            "enrichment.estimates.wig",
            "enrichment.wig",
            #"narrowPeak",   # might be empty if no peaks found
            log_file,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
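
make_pyspp_command is defined elsewhere in this module. A minimal sketch of its likely shape, assuming a pyspp driver script registered in the genomicode config (the script name and flags are assumptions, not the project's actual interface):

def make_pyspp_command(treat_filename, control_filename, out_path,
                       num_procs=1):
    # Hypothetical sketch; the real script name and flags live in the
    # module that defines this helper.
    from genomicode import parallel
    sq = parallel.quote
    cmd = [
        "pyspp.py",            # assumed driver script
        "-p", str(num_procs),  # assumed flag for parallel processes
        sq(treat_filename),
        sq(control_filename),
        sq(out_path),
        ]
    return " ".join(cmd)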
Code Example #3
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
        import os
        import shutil
        from genomicode import filelib
        from Betsy import module_utils as mlib
        import cluster_genes_by_hierarchical as clust
        
        filelib.safe_mkdir(out_path)
        metadata = {}

        kmeans_k = mlib.get_user_option(
            user_options, "kmeans_k", not_empty=True, type=int)
        assert kmeans_k >= 2 and kmeans_k < 100

        x = clust.run_cluster30(
            in_data.identifier, "kmeans", user_options, kmeans_k=kmeans_k)
        cmd, cluster_files = x
        metadata["command"] = cmd
        
        opj = os.path.join
        out_cdt_file = opj(out_path, "signal.cdt")
        out_kag_file = opj(out_path, "array_cluster.kag")
        out_kgg_file = opj(out_path, "gene_cluster.kgg")

        assert "cdt" in cluster_files
        shutil.copy2(cluster_files["cdt"], out_cdt_file)
        if "kag" in cluster_files:
            shutil.copy2(cluster_files["kag"], out_kag_file)
        if "kgg" in cluster_files:
            shutil.copy2(cluster_files["kgg"], out_kgg_file)
        
        return metadata
Code Example #4
def relabel(data_file, rename_file, outfile, user_options):
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sample_header = mlib.get_user_option(
        user_options, "sample_labels_header", not_empty=True)
    # Make sure sample_header in rename file.
    x = open(rename_file).readline()
    x = x.rstrip("\r\n").split("\t")
    assert sample_header in x, "Missing header (%s): %s" % (
        sample_header, rename_file)

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    x = "'%s,%s'" % (rename_file, sample_header)
    cmd = [
        "python",
        sq(slice_matrix),
        '--relabel_col_ids', x,
        sq(data_file),
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    return cmd
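
A usage sketch (file names are hypothetical). relabel shells out to slice_matrix, redirecting both stdout and stderr into outfile, and returns the command string so it can be logged:

# Hypothetical invocation.
user_options = {"sample_labels_header": "SampleName"}
cmd = relabel("signal.txt", "rename.txt", "signal.relabeled.txt",
              user_options)
print cmd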
Code Example #5
def run_cluster30(filename, algorithm, user_options, **more_args):
    import arrayio
    from genomicode import cluster30
    from Betsy import module_utils as mlib

    MATRIX_FILE = "data.pcl"

    DISTANCE_MEASURES = cluster30.DIST2ID.keys()
    YESNO = ["yes", "no"]

    cluster_genes = mlib.get_user_option(user_options,
                                         "cluster_genes",
                                         not_empty=True,
                                         allowed_values=YESNO)
    cluster_arrays = mlib.get_user_option(user_options,
                                          "cluster_arrays",
                                          not_empty=True,
                                          allowed_values=YESNO)
    distance_metric = mlib.get_user_option(user_options,
                                           "distance_measure",
                                           not_empty=True,
                                           allowed_values=DISTANCE_MEASURES)

    # Make a PCL-formatted file for cluster 3.0.  It might
    # misinterpret the columns of a tab-delimited file.
    matrix = arrayio.read(filename)
    matrix = arrayio.convert(matrix, to_format=arrayio.pcl_format)
    arrayio.write(matrix, open(MATRIX_FILE, 'w'))

    jobname = "cluster"
    cmd = cluster30.cluster30_file(MATRIX_FILE, (cluster_genes == "yes"),
                                   (cluster_arrays == "yes"),
                                   algorithm,
                                   distance=distance_metric,
                                   jobname=jobname,
                                   **more_args)

    # Find the output files and name them appropriately.
    cluster_files = cluster30._find_cluster_files(jobname)
    fix_cluster30_dup_header(cluster_files["cdt"])

    return cmd, cluster_files
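
fix_cluster30_dup_header repairs a quirk of the Cluster 3.0 output: the CDT header row can contain duplicated column names. A minimal sketch of one way to de-duplicate the header in place (the renaming scheme is an assumption):

def fix_cluster30_dup_header(cdt_filename):
    # Hypothetical sketch: append a counter to repeated header names,
    # then rewrite the file in place.
    lines = open(cdt_filename).readlines()
    header = lines[0].rstrip("\r\n").split("\t")
    seen = {}
    for i, h in enumerate(header):
        if h in seen:
            seen[h] += 1
            header[i] = "%s_%d" % (h, seen[h])
        else:
            seen[h] = 0
    lines[0] = "\t".join(header) + "\n"
    open(cdt_filename, 'w').writelines(lines)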
Code Example #6
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        vcf_node = in_data
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        filelib.safe_mkdir(out_path)

        buildver = module_utils.get_user_option(user_options,
                                                "buildver",
                                                allowed_values=["hg19"],
                                                not_empty=True)

        jobs = []  # list of (in_filename, log_filename, out_filestem)
        for in_filename in vcf_filenames:
            # Annovar takes a filestem, without the ".vcf".
            p, f = os.path.split(in_filename)
            f, exp = os.path.splitext(f)
            log_filename = os.path.join(out_path, "%s.log" % f)
            out_filestem = os.path.join(out_path, f)
            x = in_filename, log_filename, out_filestem
            jobs.append(x)

        # Make a list of commands.
        commands = []
        for x in jobs:
            in_filename, log_filename, out_filestem = x

            x = alignlib.make_annovar_command(in_filename, log_filename,
                                              out_filestem, buildver)
            commands.append(x)

        #for x in commands:
        #    print x
        #import sys; sys.exit(0)

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        x = [x[-1] for x in jobs]  # out_filestems
        x = ["%s.%s_multianno.vcf" % (x, buildver) for x in x]
        filelib.assert_exists_nz_many(x)
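
make_annovar_command comes from genomicode's alignlib. For orientation, the generated command is presumably a table_annovar.pl call along these lines (protocols and paths are assumptions; only the output naming is confirmed by the check above):

# Hypothetical shape of one generated command:
# table_annovar.pl <in.vcf> humandb/ -buildver hg19 -out <out_filestem> \
#   -vcfinput >& <log_filename>
# With -vcfinput, Annovar writes <out_filestem>.hg19_multianno.vcf,
# which is exactly the file asserted on above.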
Code Example #7
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        # Since this is I/O heavy, don't use too many cores.
        MAX_CORES = 2

        filenames = mlib.find_fastq_files(in_data.identifier)
        assert filenames, "I could not find any FASTQ files."
        filelib.safe_mkdir(out_path)
        metadata = {}

        num_samples = mlib.get_user_option(user_options,
                                           "num_samples",
                                           not_empty=True,
                                           type=int)
        metadata["num_samples"] = num_samples

        jobs = []
        for in_filename in filenames:
            p, f = os.path.split(in_filename)
            out_filename = os.path.join(out_path, f)
            x = in_filename, out_filename
            jobs.append(x)

        cmds = []
        for x in jobs:
            in_filename, out_filename = x
            x = copy_fastq_file, (in_filename, out_filename, num_samples), {}
            cmds.append(x)

        nc = min(MAX_CORES, num_cores)
        metadata["num cores"] = nc
        parallel.pyfun(cmds, num_procs=nc)

        return metadata
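
copy_fastq_file is the worker handed to parallel.pyfun. A minimal sketch, assuming it keeps only the first num_samples reads (a FASTQ record is exactly 4 lines) and that gzipped files keep their extension in the output path:

def copy_fastq_file(in_filename, out_filename, num_samples):
    # Hypothetical sketch of the worker.
    import gzip
    opener = gzip.open if in_filename.endswith(".gz") else open
    inhandle = opener(in_filename)
    outhandle = opener(out_filename, 'w')
    max_lines = num_samples * 4   # 4 lines per FASTQ record
    for i, line in enumerate(inhandle):
        if i >= max_lines:
            break
        outhandle.write(line)
    outhandle.close()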
Code Example #8
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        import shutil
        from genomicode import filelib
        from genomicode import cluster30
        from Betsy import module_utils as mlib

        filelib.safe_mkdir(out_path)
        metadata = {}

        LINKAGES = cluster30.METHOD2ID.keys()
        linkage = mlib.get_user_option(user_options,
                                       "linkage",
                                       not_empty=True,
                                       allowed_values=LINKAGES)

        x = run_cluster30(in_data.identifier,
                          "hierarchical",
                          user_options,
                          method=linkage)
        cmd, cluster_files = x
        metadata["command"] = cmd

        opj = os.path.join
        out_cdt_file = opj(out_path, "signal.cdt")
        out_atr_file = opj(out_path, "array_tree.atr")
        out_gtr_file = opj(out_path, "gene_tree.gtr")

        assert "cdt" in cluster_files
        shutil.copy2(cluster_files["cdt"], out_cdt_file)
        if "atr" in cluster_files:
            shutil.copy2(cluster_files["atr"], out_atr_file)
        if "gtr" in cluster_files:
            shutil.copy2(cluster_files["gtr"], out_gtr_file)

        return metadata
Code Example #9
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        import os
        import shutil
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        mvcf_node = in_data
        in_filename = mvcf_node.identifier
        filelib.assert_exists_nz(in_filename)

        buildver = module_utils.get_user_option(user_options,
                                                "buildver",
                                                allowed_values=["hg19"],
                                                not_empty=True)

        # Annovar takes a filestem, without the ".vcf".
        p, f = os.path.split(in_filename)
        f, exp = os.path.splitext(f)
        log_filename = "%s.log" % f

        p, f = os.path.split(out_filename)
        f, exp = os.path.splitext(f)
        out_filestem = f

        cmd = alignlib.make_annovar_command(in_filename, log_filename,
                                            out_filestem, buildver)
        parallel.sshell(cmd)

        # Make sure the analysis completed successfully.
        x = "%s.%s_multianno.vcf" % (out_filestem, buildver)
        filelib.assert_exists_nz(x)
        if os.path.realpath(x) != os.path.realpath(out_filename):
            shutil.copy2(x, out_filename)
Code Example #10
def plot_heatmap(filename, outfile, cluster_files, user_options):
    from genomicode import parallel
    from genomicode import graphlib
    from Betsy import module_utils as mlib
    
    python = mlib.get_config(
        "python", which_assert_file=True, assert_exists=True)
    arrayplot = mlib.get_config(
        "arrayplot", which_assert_file=True, assert_exists=True)

    COLORS = [
        "red", "white", "red-green", "blue-yellow", "red-green-soft",
        "red-blue-soft", "matlab", "bild", "genepattern", "genespring",
        "yahoo", "brewer-prgn-div", "brewer-rdbu-div", 
        "brewer-rdylbu-div", "brewer-rdylgn-div", "brewer-spectral-div",
        "brewer-blues-seq", "brewer-greens-seq", "brewer-reds-seq",
        "brewer-ylorbr-seq", "brewer-qual-set",
        ]
    YESNO = ["no", "yes"]

    hm_width = mlib.get_user_option(user_options, "hm_width", type=int)
    hm_height = mlib.get_user_option(user_options, "hm_height", type=int)
    hm_color = mlib.get_user_option(
        user_options, "hm_color", allowed_values=COLORS, not_empty=True)

    hm_colorbar = mlib.get_user_option(
        user_options, "hm_colorbar", not_empty=True, allowed_values=YESNO)
    hm_colorbar_horizontal = mlib.get_user_option(
        user_options, "hm_colorbar_horizontal", not_empty=True,
        allowed_values=YESNO)
    hm_colorbar_height = mlib.get_user_option(
        user_options, "hm_colorbar_height", not_empty=True, type=float)
    hm_colorbar_width = mlib.get_user_option(
        user_options, "hm_colorbar_width", not_empty=True, type=float)
    hm_colorbar_font = mlib.get_user_option(
        user_options, "hm_colorbar_font", not_empty=True, type=float)

    hm_label_genes = mlib.get_user_option(
        user_options, "hm_label_genes", allowed_values=YESNO)
    hm_scale_gene_labels = mlib.get_user_option(
        user_options, "hm_scale_gene_labels", not_empty=True, type=float)
    hm_label_arrays = mlib.get_user_option(
        user_options, "hm_label_arrays", allowed_values=YESNO)
    hm_scale_array_labels = mlib.get_user_option(
        user_options, "hm_scale_array_labels", not_empty=True, type=float)

    hm_show_gene_tree = None
    hm_show_array_tree = None
    hm_show_gene_cluster = None
    hm_show_array_cluster = None
    if "hm_show_gene_tree" in user_options:
        hm_show_gene_tree = mlib.get_user_option(
            user_options, "hm_show_gene_tree", allowed_values=YESNO,
            not_empty=True)
        hm_show_array_tree = mlib.get_user_option(
            user_options, "hm_show_array_tree", allowed_values=YESNO,
            not_empty=True)
        hm_show_gene_cluster = mlib.get_user_option(
            user_options, "hm_show_gene_cluster", allowed_values=YESNO,
            not_empty=True)
        hm_show_array_cluster = mlib.get_user_option(
            user_options, "hm_show_array_cluster", allowed_values=YESNO,
            not_empty=True)

    # Set default values.
    if not hm_width or not hm_height:
        nrow, ncol = get_matrix_size(filename)
        fn = graphlib.find_wide_heatmap_size
        if nrow > ncol:
            fn = graphlib.find_tall_heatmap_size
        x = fn(
            nrow, ncol, max_total_height=4096, max_total_width=4096,
            max_box_height=200, max_box_width=200)
        hm_width, hm_height = x

    if not hm_label_genes:
        nrow, ncol = get_matrix_size(filename)
        hm_label_genes = "no"
        if nrow <= 50:
            hm_label_genes = "yes"
    if not hm_label_arrays:
        nrow, ncol = get_matrix_size(filename)
        hm_label_arrays = "no"
        if ncol <= 50:
            hm_label_arrays = "yes"
    
        
    # Check values.
    assert hm_width >= 1 and hm_width <= 256, "Invalid width: %s" % hm_width
    assert hm_height >= 1 and hm_height <= 256, \
           "Invalid height: %s" % hm_height
    assert hm_scale_gene_labels > 0 and hm_scale_gene_labels < 10
    assert hm_scale_array_labels > 0 and hm_scale_array_labels < 10

    sq = parallel.quote
    cmd = [
        sq(python),
        sq(arrayplot),
        "--grid",
        "-x", hm_width,
        "-y", hm_height,
        "--color", hm_color,
        ]
    if hm_colorbar == "yes":
        cmd += [
            "--colorbar",
            "--cb_height", hm_colorbar_height,
            "--cb_width", hm_colorbar_width,
            "--cb_font", hm_colorbar_font,
            ]
        if hm_colorbar_horizontal == "yes":
            cmd += ["--cb_horizontal"]

    if hm_label_genes == "yes":
        cmd += [
            "--label_genes",
            "--scale_gene_labels", hm_scale_gene_labels,
            ]
    if hm_label_arrays == "yes":
        cmd += [
            "--label_arrays",
            "--scale_array_labels", hm_scale_array_labels,
            ]
    if hm_show_gene_tree == "yes" and "gtr" in cluster_files:
        cmd += ["--gene_tree_file", cluster_files["gtr"]]
    if hm_show_array_tree == "yes" and "atr" in cluster_files:
        cmd += ["--array_tree_file", cluster_files["atr"]]
    if hm_show_gene_cluster == "yes" and "kgg" in cluster_files:
        cmd += ["--gene_cluster_file", cluster_files["kgg"]]
    if hm_show_array_cluster == "yes" and "kag" in cluster_files:
        cmd += ["--array_cluster_file", cluster_files["kag"]]
    cmd += [
        sq(filename),
        sq(outfile),
        ]
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)

    return cmd
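
get_matrix_size is assumed to return (nrow, ncol) for the expression file. A minimal sketch using arrayio (assuming arrayio matrices expose dim(), as used elsewhere in genomicode):

def get_matrix_size(filename):
    # Hypothetical sketch: read the matrix, return its dimensions.
    import arrayio
    matrix = arrayio.read(filename)
    nrow, ncol = matrix.dim()
    return nrow, ncol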
Code Example #11
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import genesetlib
        from genomicode import parallel
        from genomicode import filelib
        from Betsy import module_utils as mlib

        in_data = antecedents
        if not os.path.exists(out_path):
            os.mkdir(out_path)
        metadata = {}

        merge = mlib.get_user_option(user_options,
                                     "merge_up_and_down_genes",
                                     not_empty=True,
                                     allowed_values=["yes", "no"])
        merge = (merge == "yes")

        opj = os.path.join
        gs_filename = opj(out_path, "gene_sets.gmt")
        intersect_filename = opj(out_path, "intersection.gmt")
        count_filename = opj(out_path, "pairwise_count_matrix.txt")
        venn_plot_file = opj(out_path, "venn.tiff")

        # Make a list of all the data sets in the antecedents.
        # <stem>.nocutoff.txt
        # <stem>.<cutoff>.txt
        # <stem>.<cutoff>.gmt
        # <stem>.<cutoff>.heatmap.png
        x = os.listdir(in_data.identifier)
        x = sorted(x)
        x = [x for x in x if x.endswith(".gmt")]
        x = [x for x in x if x.find("nocutoff") < 0]
        x = [opj(in_data.identifier, x) for x in x]
        filtered_geneset_files = x
        assert filtered_geneset_files, "Missing: filtered geneset files"

        # For each of the filtered_geneset_files, figure out the
        # <stem>.  This is tricky because <cutoff> may contain
        # multiple dots.
        # <stem>.fdr_0.05.p_0.05.fc_1.5.gmt
        stems = [None] * len(filtered_geneset_files)
        for i, x in enumerate(filtered_geneset_files):
            x = filtered_geneset_files[i]
            x = os.path.split(x)[1]
            x = x.split(".")
            j = 0
            while j < len(x):
                if x[j].startswith("fdr_") or x[j].startswith("fc_") or \
                       x[j].startswith("p_") or x[j] == "gmt":
                    x = x[:j]
                else:
                    j += 1
            x = ".".join(x)
            stems[i] = x

        genesets = []
        geneset_stems = []
        for i, filename in enumerate(filtered_geneset_files):
            for x in genesetlib.read_genesets(filename):
                name, description, genes = x
                x = genesetlib.GeneSet(name, description, genes)
                genesets.append(x)
                geneset_stems.append(stems[i])
        assert genesets, "I could not find any gene sets"

        # Should contain gene sets whose name fits the pattern:
        # <NAME>_ID_UP
        # <NAME>_ID_DN
        # <NAME>_NAME_UP
        # <NAME>_NAME_DN
        # Only want the _ID_ gene sets for comparison.
        I = [i for (i, x) in enumerate(genesets) if x.name.find("_ID_") >= 0]
        genesets = [genesets[i] for i in I]
        geneset_stems = [geneset_stems[i] for i in I]
        assert genesets, "I could not find any '_ID_' gene sets"

        # Rename each of the gene sets.
        for i, gs in enumerate(genesets):
            name = gs.name
            n = "%s_%s" % (geneset_stems[i], name)
            if merge:
                # If I'm merging, then which gene set is UP or DN
                # doesn't matter.
                suffix = name[-6:]
                assert suffix in ["_ID_UP", "_ID_DN"]
                n = "%s%s" % (geneset_stems[i], suffix)
            gs.name = n

        # Write out the gene sets.
        genesetlib.write_gmt(gs_filename, genesets)

        # Count the number of gene sets.
        x = [x.name for x in genesets]
        if merge:
            x = [x.replace("_DN", "_UP") for x in x]
            x = [x.replace("_DOWN", "_UP") for x in x]
        num_genesets = len({}.fromkeys(x))

        calc_venn = mlib.get_config("calc_venn", which_assert_file=True)
        sq = parallel.quote
        cmd = [
            sq(calc_venn),
            "-o",
            sq(intersect_filename),
            "--all_genesets",
            "--num_to_compare",
            2,
        ]
        if num_genesets <= 5:
            # Can only plot up to 5 circles.
            cmd += ["--plotfile", sq(venn_plot_file)]
        if merge:
            cmd += ["--automatch"]
        cmd.append(sq(gs_filename))
        cmd = " ".join(map(str, cmd))
        cmd = "%s >& %s" % (cmd, sq(count_filename))
        parallel.sshell(cmd)
        metadata["commands"] = [cmd]

        # Make a heatmap of the counts.
        UNCLUSTERED_FILE = "unclustered.txt"
        CLUSTERED_FILE = "clustered.txt"
        COL_TREE_FILE = "col_tree.txt"
        ROW_TREE_FILE = "row_tree.txt"
        HEATMAP_FILE = opj(out_path, "heatmap.counts.png")

        # Make a file with the counts.
        outhandle = open(UNCLUSTERED_FILE, 'w')
        for line in open(count_filename):
            if not line.strip():
                break
            outhandle.write(line)
        outhandle.close()

        # Cluster the counts.
        slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
        arrayplot = mlib.get_config("arrayplot", which_assert_file=True)
        cmd = [
            sq(slice_matrix),
            "--reorder_col_cluster",
            "--col_tree_file",
            sq(COL_TREE_FILE),
            "--reorder_row_cluster",
            "--row_tree_file",
            sq(ROW_TREE_FILE),
            sq(UNCLUSTERED_FILE),
        ]
        cmd = "%s > %s" % (" ".join(cmd), sq(CLUSTERED_FILE))
        parallel.sshell(cmd)
        metadata["commands"].append(cmd)
        filelib.assert_exists_nz(CLUSTERED_FILE)

        # Draw the heatmap.
        cmd = [
            sq(arrayplot),
            "--grid",
            "--array_tree_file",
            sq(COL_TREE_FILE),
            "--al",
            "--gene_tree_file",
            sq(ROW_TREE_FILE),
            "--gl",
            "--colorbar",
            "--color",
            "brewer-greens-seq",
            sq(CLUSTERED_FILE),
            sq(HEATMAP_FILE),
        ]
        cmd = " ".join(cmd)
        parallel.sshell(cmd)
        metadata["commands"].append(cmd)
        filelib.assert_exists_nz(HEATMAP_FILE)

        mlib.txt2xls(count_filename)

        return metadata
Code Example #12
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        fastq_node, sample_node, strand_node, ref_node = antecedents
        fastq_files = mlib.find_merged_fastq_files(sample_node.identifier,
                                                   fastq_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        # Do a quick check to make sure the reference is correct.
        # Otherwise, errors may be hard to diagnose.
        alignlib.assert_is_STAR_reference(ref.path)

        metadata = {}
        metadata["tool"] = "STAR %s" % alignlib.get_STAR_version()

        x = mlib.get_user_option(user_options,
                                 "two_pass",
                                 allowed_values=["no", "yes"])
        two_pass = (x == "yes")

        # Figure out the strandedness.
        is_stranded = stranded.stranded != "unstranded"

        # STAR --runThreadN 40 --genomeDir test05 \
        #   --readFilesIn test.fastq/test03_R1_001.fastq \
        #   test.fastq/test03_R2_001.fastq --outFileNamePrefix test06.
        # If unstranded, add --outSAMstrandField intronMotif

        # Make a list of the jobs to run.
        jobs = []  # list of filelib.GenericObject objects
        for x in fastq_files:
            sample, pair1, pair2 = x
            pass1_out_prefix = "p1.%s." % sample
            pass2_out_prefix = "%s." % sample
            pass1_bam_filename = os.path.join(
                out_path, "%sAligned.out.bam" % pass1_out_prefix)
            pass2_bam_filename = os.path.join(
                out_path, "%sAligned.out.bam" % pass2_out_prefix)
            sjdb_filename = os.path.join(out_path, "p1.%s.SJ.out.tab" % sample)
            log1_filename = os.path.join(out_path, "p1.%s.log" % sample)
            log2_filename = os.path.join(out_path, "%s.log" % sample)

            x = filelib.GenericObject(
                sample=sample,
                pair1=pair1,
                pair2=pair2,
                pass1_out_prefix=pass1_out_prefix,
                pass2_out_prefix=pass2_out_prefix,
                pass1_bam_filename=pass1_bam_filename,
                pass2_bam_filename=pass2_bam_filename,
                sjdb_filename=sjdb_filename,
                log1_filename=log1_filename,
                log2_filename=log2_filename,
            )
            jobs.append(x)

        # Run pass 1.
        commands = []
        for j in jobs:
            x = os.path.join(out_path, j.pass1_out_prefix)
            cmd = alignlib.make_STAR_command(ref.path, x, num_cores,
                                             is_stranded, j.pair1, j.pair2,
                                             j.log1_filename)
            # For debugging.  If this file already exists, skip it.
            if not filelib.exists_nz(j.pass1_bam_filename):
                parallel.sshell(cmd, path=out_path)
            filelib.assert_exists_nz(j.pass1_bam_filename)
            commands.append(cmd)

        if two_pass:
            # Make a new index with the splice junction information.
            sj_index = os.path.join(out_path, "genome.2pass")
            x = [x.sjdb_filename for x in jobs]
            filelib.assert_exists_nz_many(x)
            x = alignlib.make_STAR_index_command(ref.fasta_file_full,
                                                 sj_index,
                                                 sjdb_files=x,
                                                 num_cores=num_cores)
            x = "%s >& genome.2pass.log" % x
            commands.append(x)

            # For debugging.  If this file already exists, skip it.
            if not filelib.exists_nz("genome.2pass.log"):
                parallel.sshell(x, path=out_path)
            alignlib.assert_is_STAR_reference(sj_index)

        # Run pass 2.
        for j in jobs:
            # For debugging.  If this file already exists, skip it.
            if os.path.exists(j.pass2_bam_filename):
                continue
            if two_pass:
                x = os.path.join(out_path, j.pass2_out_prefix)
                cmd = alignlib.make_STAR_command(sj_index, x, num_cores,
                                                 is_stranded, j.pair1, j.pair2,
                                                 j.log2_filename)
                parallel.sshell(cmd, path=out_path)
                commands.append(cmd)
            else:
                # link pass1_bam_filename to pass2_bam_filename
                os.symlink(j.pass1_bam_filename, j.pass2_bam_filename)
                continue
            filelib.assert_exists_nz(j.pass2_bam_filename)

        metadata["commands"] = commands
        metadata["num_cores"] = num_cores

        # STAR takes 28 Gb per process.  Make sure we don't use up
        # more memory than is available on the machine.
        # Defaults:
        # --limitGenomeGenerateRAM   31000000000
        # --outFilterMismatchNmax    10             Num mismatches.
        #nc = mlib.calc_max_procs_from_ram(50, buffer=100, upper_max=num_cores)
        #metadata["num_cores"] = nc
        #parallel.pshell(commands, max_procs=nc, path=out_path)

        # Make sure the analysis completed successfully.
        #x = [x[-2] for x in jobs]  # sam_filename
        #filelib.assert_exists_nz_many(x)
        return metadata
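
The STAR invocation that alignlib.make_STAR_command assembles is outlined in the comment inside run above. A sketch mirroring that commented example (the helper's real signature lives in alignlib; this is illustration only):

def sketch_STAR_command(genome_dir, out_prefix, num_cores, is_stranded,
                        pair1, pair2, log_filename):
    # Hypothetical mirror of the commented STAR example.
    cmd = [
        "STAR",
        "--runThreadN", str(num_cores),
        "--genomeDir", genome_dir,
        "--readFilesIn", pair1,
        ]
    if pair2:
        cmd.append(pair2)
    cmd += ["--outFileNamePrefix", out_prefix]
    if not is_stranded:
        # Per the comment above: unstranded data needs this flag.
        cmd += ["--outSAMstrandField", "intronMotif"]
    return "%s >& %s" % (" ".join(cmd), log_filename)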
Code Example #13
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_filename):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        fastq_node, sample_node, align_node = antecedents
        fastq_data = mlib.find_merged_fastq_files(sample_node.identifier,
                                                  fastq_node.identifier)
        assert fastq_data, "I could not find any FASTQ files."
        align_filenames = filelib.list_files_in_path(align_node.identifier,
                                                     endswith=".matches.txt")
        assert align_filenames, "No .matches.txt files."
        align_filenames.sort()
        metadata = {}

        assert len(fastq_data) == len(align_filenames), \
               "Mismatch: num samples %d %d" % (
            len(fastq_data), len(align_filenames))

        num_mismatches = mlib.get_user_option(user_options,
                                              "num_mismatches",
                                              type=int)
        assert num_mismatches >= 0 and num_mismatches < 25
        metadata["num_mismatches"] = num_mismatches

        sample2fastqdata = {}
        for x in fastq_data:
            sample, f1, f2 = x
            sample2fastqdata[sample] = x

        # list of (sample, align_filename, summary_filename,
        #   fastq_filename1, fastq_filename2)
        jobs = []
        for in_filename in align_filenames:
            p, f = os.path.split(in_filename)
            # <sample>.matches.txt
            ext = ".matches.txt"
            assert f.endswith(ext)
            sample = f[:-len(ext)]
            assert sample in sample2fastqdata, "Missing FASTQ: %s" % sample
            summary_filename = "%s.summary.txt" % sample
            x, fastq_filename1, fastq_filename2 = sample2fastqdata[sample]
            x = sample, in_filename, summary_filename, \
                fastq_filename1, fastq_filename2
            jobs.append(x)

        jobs2 = []  # list of (function, args, keywds)
        for x in jobs:
            sample, align_filename, summary_filename, \
                    fastq_file1, fastq_file2 = x
            args = align_filename, fastq_file1, fastq_file2, num_mismatches
            keywds = {
                "temp_path": ".",
                "outfile": summary_filename,
            }
            x = summarize_matches_file, args, keywds
            jobs2.append(x)

        # Since this can take a lot of memory (depending on the number
        # of reads, can easily take 8 Gb), do just 1 process at a
        # time.  Also, I/O intensive.  Don't do too many at a time.
        #MAX_PROCS = 1
        MAX_PROCS = 4
        nc = mlib.calc_max_procs_from_ram(30, upper_max=MAX_PROCS)
        #nc = min(MAX_PROCS, num_cores)
        results = parallel.pyfun(jobs2, num_procs=nc, DELAY=0.1)
        metadata["num_cores"] = nc
        assert len(results) == len(jobs2)

        # Put together the results in a table.
        handle = open(out_filename, 'w')
        header = "sample", "match", "total", "RPM", "match", "mismatch"
        print >> handle, "\t".join(header)
        for x in zip(jobs, results):
            x, d = x
            sample, in_filename, summary_filename, \
                    fastq_filename1, fastq_filename2 = x
            match = d["perfect_alignments"]
            total = d["total_alignments"]
            rpm = int(float(match) / total * 1E6)
            perc_match = d["perc_perfect"]
            perc_mismatch = 1 - d["perc_perfect"]
            x = sample, match, total, rpm, perc_match, perc_mismatch
            assert len(x) == len(header)
            print >> handle, "\t".join(map(str, x))
        handle.close()
        return metadata
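
summarize_matches_file is the worker; the keys it returns are fixed by the table-writing code above. A minimal sketch, under the assumption that each line of the .matches.txt file describes one alignment with a mismatch count in its last column:

def summarize_matches_file(align_filename, fastq_file1, fastq_file2,
                           num_mismatches, temp_path=".", outfile=None):
    # Hypothetical sketch; the real worker may also use the FASTQ
    # files and temp_path.
    perfect = total = 0
    for line in open(align_filename):
        cols = line.rstrip("\r\n").split("\t")
        total += 1
        if int(cols[-1]) <= num_mismatches:  # assumed column layout
            perfect += 1
    d = {
        "perfect_alignments": perfect,
        "total_alignments": total,
        "perc_perfect": float(perfect) / max(total, 1),
    }
    if outfile:
        open(outfile, 'w').write(repr(d))
    return d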
Code Example #14
File: run_peakseq.py Project: firebitsbr/changlab
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from Betsy import module_utils
        import run_MACS14

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample")
        fragment_length = module_utils.get_user_option(
            user_options, "peakseq_fragment_length", not_empty=True, type=int)
        mappability_file = module_utils.get_user_option(user_options,
                                                        "mappability_file",
                                                        not_empty=True,
                                                        check_file=True)
        assert fragment_length > 0 and fragment_length < 1000

        # Set the experiment name.
        name1 = hashlib.hash_var(treat_sample)
        name2 = hashlib.hash_var(control_sample)
        experiment_name = "%s_vs_%s" % (name1, name2)

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = run_MACS14.find_bam_file(bam_path, treat_sample,
                                                  sample_groups)
        control_filename = run_MACS14.find_bam_file(bam_path, control_sample,
                                                    sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_peakseq_command(treat_filename, control_filename, out_path,
                                   experiment_name, fragment_length,
                                   mappability_file)
        log_file = "%s.log" % experiment_name
        cmd = "%s >& %s" % (cmd, log_file)
        parallel.sshell(cmd, path=out_path)

        files = [
            "config.dat",
            log_file,
            "%s.txt" % experiment_name,
            # Can be length 0, if no peaks found.
            #"%s_narrowPeak.txt" % experiment_name,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
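
make_peakseq_command is defined elsewhere; PeakSeq is driven by a configuration file, which matches the config.dat asserted on above. A hypothetical sketch of such a helper (config keys and binary invocation are assumptions):

def make_peakseq_command(treat_filename, control_filename, out_path,
                         experiment_name, fragment_length,
                         mappability_file):
    # Hypothetical sketch: write config.dat, then run PeakSeq on it.
    import os
    config_file = os.path.join(out_path, "config.dat")
    handle = open(config_file, 'w')
    print >> handle, "Experiment_id %s" % experiment_name
    print >> handle, "Enrichment_mapped_fragment_length %d" % fragment_length
    print >> handle, "Mappability_map_file %s" % mappability_file
    print >> handle, "ChIP_Seq_reads_data_dirs %s" % treat_filename
    print >> handle, "Input_reads_data_dirs %s" % control_filename
    handle.close()
    return "PeakSeq -peak_select %s" % config_file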
Code Example #15
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_folder, sample_node, gene_node, strand_node = antecedents
        bam_path = bam_folder.identifier
        assert filelib.dir_exists(bam_path)
        gtf_file = gene_node.identifier
        filelib.assert_exists_nz(gtf_file)
        stranded = mlib.read_stranded(strand_node.identifier)
        filelib.safe_mkdir(out_path)

        metadata = {}

        attr2order = {
            "name": "name",
            "coordinate": "pos",
        }
        x = bam_folder.data.attributes["sorted"]
        sort_order = attr2order.get(x)
        assert sort_order, "Cannot handle sorted: %s" % x

        #attr2stranded = {
        #    "single" : "no",
        #    "paired" : "no",
        #    "paired_ff" : None,
        #    "paired_fr" : "yes",
        #    "paired_rf" : "reverse",
        #    }
        #x = sample_node.data.attributes["orientation"]
        #stranded = attr2stranded.get(x)
        #assert stranded, "Cannot handle orientation: %s" % x

        ht_stranded = None
        if stranded.stranded == "unstranded":
            ht_stranded = "no"
        elif stranded.stranded == "firststrand":
            ht_stranded = "reverse"
        elif stranded.stranded == "secondstrand":
            ht_stranded = "yes"
        assert ht_stranded is not None

        #gtf_file = mlib.get_user_option(
        #    user_options, "gtf_file", not_empty=True)
        #assert os.path.exists(gtf_file), "File not found: %s" % gtf_file

        mode = mlib.get_user_option(user_options,
                                    "htseq_count_mode",
                                    allowed_values=[
                                        "union", "intersection-strict",
                                        "intersection-nonempty"
                                    ])

        # Make a list of the jobs to run.
        jobs = []
        for bam_filename in filelib.list_files_in_path(bam_path,
                                                       endswith=".bam",
                                                       case_insensitive=True):
            x = os.path.split(bam_filename)[1]
            x = os.path.splitext(x)[0]
            x = "%s.count" % x
            out_file = x
            x = bam_filename, out_file
            jobs.append(x)

        # Generate commands for each of the files.
        sq = parallel.quote
        commands = []
        for x in jobs:
            bam_filename, out_file = x
            x = alignlib.make_htseq_count_command(bam_filename,
                                                  gtf_file,
                                                  sort_order,
                                                  ht_stranded,
                                                  mode=mode)
            x = "%s >& %s" % (x, sq(out_file))
            commands.append(x)
        metadata["commands"] = commands
        metadata["num_cores"] = num_cores
        parallel.pshell(commands, max_procs=num_cores, path=out_path)

        # Make sure the analysis completed successfully.
        x = [x[1] for x in jobs]
        x = [os.path.join(out_path, x) for x in x]
        output_filenames = x
        filelib.assert_exists_nz_many(output_filenames)

        return metadata
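
make_htseq_count_command wraps the htseq-count CLI. A sketch of the equivalent command, assuming alignlib maps its arguments onto the standard htseq-count flags (-r for BAM sort order, -s for strandedness, -m for counting mode):

def sketch_htseq_count_command(bam_file, gtf_file, sort_order, stranded,
                               mode=None):
    # Hypothetical mirror of the alignlib helper.
    cmd = ["htseq-count", "-f", "bam", "-r", sort_order, "-s", stranded]
    if mode:
        cmd += ["-m", mode]
    cmd += [bam_file, gtf_file]
    return " ".join(cmd)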
Code Example #16
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out MuTect version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        x, x, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"]

        cosmic_file = mlib.get_user_option(
            user_options, "mutect_cosmic_vcf", not_empty=True, check_file=True)
        dbsnp_file = mlib.get_user_option(
            user_options, "mutect_dbsnp_vcf", not_empty=True, check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (cancer_sample, normal_bamfile, tumor_bamfile, call_outfile,
        #    coverage_outfile, vcf_outfile, logfile)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            call_outfile = opj(out_path, "%s.call_stats.out" % sample)
            cov_outfile = opj(out_path, "%s.coverage.wig.txt" % sample)
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % sample)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile
            jobs.append(x)

        # java -Xmx2g -jar muTect.jar
        #   --analysis_type MuTect
        #   --reference_sequence <reference>
        #   --cosmic <cosmic.vcf>
        #   --dbsnp <dbsnp.vcf>
        #   --intervals <intervals_to_process>
        #   --input_file:normal <normal.bam>
        #   --input_file:tumor <tumor.bam>
        #   --out <call_stats.out>
        #   --coverage_file <coverage.wig.txt>

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x

            UNHASHABLE = [
                ("input_file:normal", sq(normal_bamfile)),
                ("input_file:tumor", sq(cancer_bamfile)),
                ]
            x = alignlib.make_MuTect_command(
                analysis_type="MuTect",
                reference_sequence=sq(ref.fasta_file_full),
                cosmic=sq(cosmic_file),
                dbsnp=sq(dbsnp_file),
                intervals=sq(interval_node.identifier),
                out=sq(call_outfile),
                coverage_file=sq(cov_outfile),
                vcf=sq(raw_vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
                )
            x = "%s >& %s" % (x, log_outfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        nc = mlib.calc_max_procs_from_ram(15, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            # Pull out the error lines.
            x = [x for x in open(log_outfile)]
            x = [x for x in x if x.startswith("##### ERROR")]
            x = "".join(x)
            msg = "MuTect error [%s]:\n%s\n%s" % (
                cancer_sample, commands[i], x)
            assert not x, msg

        # Make sure output VCF files exist.
        x = [x[6] for x in jobs]
        filelib.assert_exists_many(x)

        # Fix the files.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                call_outfile, cov_outfile, raw_vcf_outfile, vcf_outfile, \
                log_outfile = x
            alignlib.clean_mutect_vcf(
                normal_bamfile, cancer_bamfile, normal_sample, cancer_sample,
                raw_vcf_outfile, vcf_outfile)
            
        return metadata
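
The MuTect invocation that alignlib.make_MuTect_command assembles is outlined in the comment inside run above; for orientation:

# Hypothetical mirror of one generated command:
# java -Xmx2g -jar muTect.jar --analysis_type MuTect \
#   --reference_sequence ref.fa --cosmic cosmic.vcf --dbsnp dbsnp.vcf \
#   --intervals targets.interval_list \
#   --input_file:normal normal.bam --input_file:tumor tumor.bam \
#   --out call_stats.out --coverage_file coverage.wig.txt \
#   --vcf out.vcf.raw >& out.log
# Note that "input_file:normal" and "input_file:tumor" contain colons,
# so they cannot be passed as Python keyword arguments; that is why the
# code above routes them through the _UNHASHABLE list.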
Code Example #17
File: run_RNA_SeQC.py Project: firebitsbr/changlab
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
        import os
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from genomicode import hashlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents
        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # java -jar /usr/local/bin/RNA-SeQC_v1.1.8.jar \
        #   -o <sample> -r <reference_file> -s "<sample>|<in_filename>|NA"
        #   -t <gtf_file> >& <log_filename>"
        # <out_path>        Output directory.  Will be created if not exists.
        # <in_filename>     BAM file
        # <reference_file>  /data/biocore/genomes/UCSC/mm10.fa
        # <gtf_file>   /data/biocore/rsem/mouse_refseq_mm10/UCSC_knownGenes.gtf
        #
        # <reference_file> must be indexed and have a dict file.

        rna_seqc_jar = filelib.which_assert(config.rna_seqc_jar)

        GTF = module_utils.get_user_option(
            user_options, "rna_seqc_gtf_file", not_empty=True)
        assert os.path.exists(GTF), "File not found: %s" % GTF

        # list of infile, out_path, ref_file, gtf_file, sample, log_file
        jobs = []
        for in_filename in bam_filenames:
            p, file_ = os.path.split(in_filename)
            f, e = os.path.splitext(file_)
            sample = hashlib.hash_var(f)
            out_path_rna_seqc = os.path.join(out_path, sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)

            x = in_filename, out_path_rna_seqc, ref.fasta_file_full, GTF, \
                sample, log_filename
            jobs.append(x)

        sq = parallel.quote
        commands = []
        for x in jobs:
            (in_filename, out_path_rna_seqc, ref_filename, gtf_filename, \
             sample, log_filename) = x

            x = [sample, in_filename, "NA"]
            x = "|".join(x)
            x = [
                'java',
                '-jar', rna_seqc_jar,
                '-o', sq(out_path_rna_seqc),
                '-r', sq(ref_filename),
                '-s', "'%s'" % x,
                '-t', gtf_filename,
                ]
            x = " ".join(x)
            cmd = "%s >& %s" % (x, log_filename)
            commands.append(cmd)

        # Gets lots of errors.

        x = parallel.pshell(commands, max_procs=num_cores)
        run_log = os.path.join(out_path, "run.log")
        open(run_log, 'w').write(x)

        # Check for outfile.
        # Make sure the analysis completed successfully.
        for x in jobs:
            (in_filename, out_path_rna_seqc, ref_filename, gtf_filename, \
             sample, log_filename) = x
            filelib.assert_exists_nz(out_path_rna_seqc)
Code Example #18
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parselib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        in_vcf_node, bf_vcf_node = antecedents
        in_vcf_filenames = filelib.list_files_in_path(in_vcf_node.identifier,
                                                      endswith=".vcf",
                                                      toplevel_only=True)
        bf_vcf_filenames = filelib.list_files_in_path(bf_vcf_node.identifier,
                                                      endswith=".vcf",
                                                      toplevel_only=True)
        filelib.safe_mkdir(out_path)
        metadata = {}

        common_only = mlib.get_user_option(user_options,
                                           "backfill_common_only",
                                           allowed_values=["no", "yes"],
                                           not_empty=True)

        in_vcf_samples = [mlib.splitpath(x)[1] for x in in_vcf_filenames]
        bf_vcf_samples = [mlib.splitpath(x)[1] for x in bf_vcf_filenames]

        # Make sure there are no duplicate sample names.
        x1 = {}.fromkeys(in_vcf_samples).keys()
        x2 = {}.fromkeys(bf_vcf_samples).keys()
        assert len(in_vcf_samples) == len(x1), "Duplicate samples"
        assert len(bf_vcf_samples) == len(x2), "Duplicate samples"

        # Find the samples.
        common = [x for x in in_vcf_samples if x in bf_vcf_samples]
        in_only = [x for x in in_vcf_samples if x not in common]
        bf_only = [x for x in bf_vcf_samples if x not in common]
        assert common, "No common samples."

        pretty_in = parselib.pretty_list(in_only, max_items=5)
        pretty_bf = parselib.pretty_list(bf_only, max_items=5)
        if common_only == "no":
            assert not (in_only and bf_only), \
                   "Extra samples in both sets:\n%s\n%s" % (
                pretty_in, pretty_bf)
            assert not in_only, "Target VCF file has extra samples: %s" % \
                   pretty_in
            assert not bf_only, "Source VCF file has extra samples: %s." % \
                   pretty_bf
        SAMPLES = common

        # list of sample, in_vcf_filename, bf_vcf_filename, out_filename
        jobs = []
        for sample in SAMPLES:
            assert sample in in_vcf_samples
            assert sample in bf_vcf_samples
            i = in_vcf_samples.index(sample)
            j = bf_vcf_samples.index(sample)
            in_filename = in_vcf_filenames[i]
            bf_filename = bf_vcf_filenames[j]
            out_filename = os.path.join(out_path, "%s.vcf" % sample)
            x = sample, in_filename, bf_filename, out_filename
            jobs.append(x)

        jobs2 = []
        for x in jobs:
            sample, in_filename, bf_filename, out_filename = x
            fn = backfill_vcf
            args = in_filename, bf_filename, out_filename
            keywds = {}
            jobs2.append((fn, args, keywds))
        #num_cores = 1
        parallel.pyfun(jobs2, num_procs=num_cores)
        metadata["num_cores"] = num_cores

        return metadata
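
backfill_vcf is the per-sample worker; its exact semantics are not shown here. A minimal sketch, under the assumption that it replaces records in the target VCF with matching records from the backfill VCF, keyed by (CHROM, POS):

def backfill_vcf(in_filename, bf_filename, out_filename):
    # Hypothetical sketch only.
    bf = {}
    for line in open(bf_filename):
        if line.startswith("#"):
            continue
        cols = line.rstrip("\r\n").split("\t")
        bf[(cols[0], cols[1])] = line
    outhandle = open(out_filename, 'w')
    for line in open(in_filename):
        if not line.startswith("#"):
            cols = line.rstrip("\r\n").split("\t")
            line = bf.get((cols[0], cols[1]), line)
        outhandle.write(line)
    outhandle.close()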
Code Example #19
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, nc_node, ref_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "MuSE %s" % alignlib.get_muse_version()

        wgs_or_wes = mlib.get_user_option(user_options,
                                          "wgs_or_wes",
                                          not_empty=True,
                                          allowed_values=["wgs", "wes"])
        dbsnp_file = mlib.get_user_option(user_options,
                                          "muse_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # Make sure dbsnp_file is compressed and indexed.
        assert dbsnp_file.endswith(".vcf.gz"), \
               "muse_dbsnp_vcf must be bgzip compressed."
        x = "%s.tbi" % dbsnp_file
        assert filelib.exists_nz(x), "muse_dbsnp_vcf must be tabix indexed."

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        # list of (normal_sample, cancer_sample, normal_bamfile, tumor_bamfile,
        #   muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile,
        #   logfile1, logfile2)
        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            muse_call_stem = opj(out_path, "%s.call" % cancer_sample)
            muse_call_file = "%s.MuSE.txt" % muse_call_stem
            raw_vcf_outfile = opj(out_path, "%s.vcf.raw" % cancer_sample)
            vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)
            log_outfile1 = opj(out_path, "%s.call.log" % cancer_sample)
            log_outfile2 = opj(out_path, "%s.sump.log" % cancer_sample)
            x = normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2
            jobs.append(x)

        # Generate the commands.
        # MuSE call -O test11 -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa\
        #   bam04/196B-MG.bam bam04/PIM001_G.bam
        # MuSE sump -I test11.MuSE.txt -E -O test12.vcf \
        #   -D MuSE/dbsnp_132_b37.leftAligned.vcf.gz

        MuSE = mlib.findbin("muse")

        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            x = [
                sq(MuSE),
                "call",
                "-O",
                muse_call_stem,
                "-f",
                sq(ref.fasta_file_full),
                cancer_bamfile,
                normal_bamfile,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, log_outfile1)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[8] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the call files are created and not empty.
        call_files = [x[5] for x in jobs]
        filelib.assert_exists_nz_many(call_files)

        # Run the "sump" step.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x

            x = [
                sq(MuSE),
                "sump",
                "-I",
                sq(muse_call_file),
            ]
            assert wgs_or_wes in ["wgs", "wes"]
            if wgs_or_wes == "wgs":
                x += ["-G"]
            else:
                x += ["-E"]
            x += [
                "-O",
                sq(raw_vcf_outfile),
                "-D",
                sq(dbsnp_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, log_outfile2)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Not sure about RAM.
        nc = mlib.calc_max_procs_from_ram(10, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["commands"] = metadata["commands"] + commands

        # Make sure the log files have no errors.  The files should be
        # empty.
        log_files = [x[9] for x in jobs]
        filelib.assert_exists_z_many(log_files)

        # Make sure the raw VCF files were created and are not empty.
        raw_vcf_files = [x[6] for x in jobs]
        filelib.assert_exists_nz_many(raw_vcf_files)

        # Fix the files.
        commands = []  # Should be python commands.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            args = normal_sample, cancer_sample, raw_vcf_outfile, vcf_outfile
            x = alignlib.clean_muse_vcf, args, {}
            commands.append(x)
        parallel.pyfun(commands, num_procs=num_cores)

        # Delete the log files; they were verified above to be empty.
        for x in jobs:
            normal_sample, cancer_sample, normal_bamfile, cancer_bamfile, \
                muse_call_stem, muse_call_file, raw_vcf_outfile, vcf_outfile, \
                log_outfile1, log_outfile2 = x
            if os.path.exists(log_outfile1):
                os.unlink(log_outfile1)
            if os.path.exists(log_outfile2):
                os.unlink(log_outfile2)

        # Make sure output VCF files exist.
        x = [x[7] for x in jobs]
        filelib.assert_exists_many(x)

        return metadata
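
The two shell commands built above follow the templates in the comments. Condensed into one helper for a single tumor/normal pair (a sketch; muse_commands and its arguments are illustrative names, not part of the module):

import pipes  # shlex.quote on Python 3

def muse_commands(muse, ref_fa, tumor_bam, normal_bam, dbsnp_vcf_gz,
                  call_stem, raw_vcf, wgs_or_wes="wes"):
    sq = pipes.quote
    # Step 1: "MuSE call" writes <call_stem>.MuSE.txt.
    call = " ".join([
        sq(muse), "call", "-O", sq(call_stem),
        "-f", sq(ref_fa), sq(tumor_bam), sq(normal_bam)])
    # Step 2: "MuSE sump" converts the call file to a VCF.
    # -G is for whole genome data, -E for whole exome data.
    mode = "-G" if wgs_or_wes == "wgs" else "-E"
    sump = " ".join([
        sq(muse), "sump", "-I", sq("%s.MuSE.txt" % call_stem),
        mode, "-O", sq(raw_vcf), "-D", sq(dbsnp_vcf_gz)])
    return call, sump
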
Code Example #20
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib
        import call_somatic_varscan

        bam_node, nc_node, ref_node, interval_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.assert_exists_nz(interval_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        # TODO: Figure out GATK version.

        # Make sure intervals file ends with:
        # .bed, .list, .picard, .interval_list, or .intervals
        x, x, ext = mlib.splitpath(interval_node.identifier)
        assert ext in [
            ".bed", ".list", ".picard", ".interval_list", ".intervals"
        ]

        cosmic_file = mlib.get_user_option(user_options,
                                           "mutect_cosmic_vcf",
                                           not_empty=True,
                                           check_file=True)
        dbsnp_file = mlib.get_user_option(user_options,
                                          "mutect_dbsnp_vcf",
                                          not_empty=True,
                                          check_file=True)

        # sample -> bam filename
        sample2bamfile = mlib.root2filename(bam_filenames)
        # Make sure files exist for all the samples.
        mlib.assert_normal_cancer_samples(nc_match, sample2bamfile)

        opj = os.path.join
        jobs = []
        for (normal_sample, cancer_sample) in nc_match:
            normal_bamfile = sample2bamfile[normal_sample]
            cancer_bamfile = sample2bamfile[cancer_sample]
            path, sample, ext = mlib.splitpath(cancer_bamfile)
            vcf_outfile = opj(out_path, "%s.vcf" % sample)
            log_outfile = opj(out_path, "%s.log" % sample)
            x = filelib.GenericObject(normal_sample=normal_sample,
                                      cancer_sample=cancer_sample,
                                      normal_bamfile=normal_bamfile,
                                      cancer_bamfile=cancer_bamfile,
                                      vcf_outfile=vcf_outfile,
                                      log_outfile=log_outfile)
            jobs.append(x)

        # java -jar GenomeAnalysisTK.jar \
        #   -T MuTect2 \
        #   -R reference.fasta \
        #   -I:tumor tumor.bam \
        #   -I:normal normal.bam \
        #   [--dbsnp dbSNP.vcf] \
        #   [--cosmic COSMIC.vcf] \
        #   [-L targets.interval_list] \
        #   -o output.vcf

        # Generate the commands.
        sq = mlib.sq
        commands = []
        for j in jobs:
            UNHASHABLE = [
                ("I:normal", sq(normal_bamfile)),
                ("I:tumor", sq(cancer_bamfile)),
                # --dbsnp and --cosmic use two dashes, for some
                # reason.  Since make_GATK_command only uses one dash,
                # add one manually.
                ("-dbsnp", sq(dbsnp_file)),
                ("-cosmic", sq(cosmic_file)),
            ]
            x = alignlib.make_GATK_command(
                T="MuTect2",
                R=sq(ref.fasta_file_full),
                L=sq(interval_node.identifier),
                o=sq(j.vcf_outfile),
                _UNHASHABLE=UNHASHABLE,
            )
            x = "%s >& %s" % (x, j.log_outfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        parallel.pshell(commands, max_procs=nc)
        metadata["num_cores"] = nc
        metadata["commands"] = commands

        # Make sure log files have no errors.  Check the log files
        # before the VCF files.  If there's an error, the VCF files
        # may not be created.
        # ##### ERROR -------------------------------------------------------
        # ##### ERROR A GATK RUNTIME ERROR has occurred (version 2.2-25-g2a68
        # ##### ERROR
        # ##### ERROR Please visit the wiki to see if this is a known problem
        # ##### ERROR If not, please post the error, with stack trace, to the
        # ##### ERROR Visit our website and forum for extensive documentation
        # ##### ERROR commonly asked questions http://www.broadinstitute.org/
        # ##### ERROR
        # ##### ERROR MESSAGE: java.lang.IllegalArgumentException: Comparison
        # ##### ERROR -------------------------------------------------------
        for i, j in enumerate(jobs):
            # Pull out the error lines.
            x = [x for x in open(j.log_outfile)]
            x = [x for x in x if x.startswith("##### ERROR")]
            x = "".join(x)
            msg = "MuTect2 error [%s]:\n%s\n%s" % (cancer_sample, commands[i],
                                                   x)
            assert not x, msg

        # Make sure output VCF files exist.
        x = [x.vcf_outfile for x in jobs]
        filelib.assert_exists_many(x)

        # Mutect2 names the samples "NORMAL" and "TUMOR".  Replace
        # them with the actual names.
        for j in jobs:
            call_somatic_varscan._fix_normal_cancer_names(
                j.vcf_outfile, j.normal_sample, j.cancer_sample)

        return metadata
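
The log scan above looks for GATK's "##### ERROR" banner. Isolated into a reusable helper (a sketch; check_gatk_log is a hypothetical name):

def check_gatk_log(log_filename, context=""):
    # Pull out GATK's error banner lines and fail loudly if any exist.
    error_lines = [x for x in open(log_filename)
                   if x.startswith("##### ERROR")]
    assert not error_lines, "GATK error [%s]:\n%s" % (
        context, "".join(error_lines))
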
Code Example #21
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        #from genomicode import filelib
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        simple_file = in_data.identifier
        metadata = {}

        num_callers = mlib.get_user_option(user_options,
                                           "num_callers",
                                           not_empty=True,
                                           type=int)
        assert num_callers >= 0 and num_callers < 100

        var_matrix = SimpleVariantMatrix.read(simple_file)
        annot_matrix = var_matrix.annot_matrix
        call_matrix = var_matrix.call_matrix

        # For each coord and sample, count the number of callers.
        coord2sample2nc = {}  # (chrom, pos, ref, alt) -> sample -> num callers
        for x in call_matrix.coord2samplecaller2call.iteritems():
            coord, samplecaller2call = x
            if coord not in coord2sample2nc:
                coord2sample2nc[coord] = {}
            sample2nc = coord2sample2nc[coord]
            for (sample, caller), call in samplecaller2call.iteritems():
                # Make sure this is a real call.
                if not (call.num_ref or call.num_alt or call.total
                        or call.vaf):
                    continue
                sample2nc[sample] = sample2nc.get(sample, 0) + 1

        # Make a list of the coordinates that have the right number of calls.
        calls = {}  # coord -> sample -> nc
        for coord, sample2nc in coord2sample2nc.iteritems():
            for sample, nc in sample2nc.iteritems():
                if nc < num_callers:
                    continue
                if coord not in calls:
                    calls[coord] = {}
                calls[coord][sample] = nc

        handle = open(out_filename, 'w')

        # Print out the matrix.
        header = annot_matrix.headers + var_matrix.samples
        print >> handle, "\t".join(header)

        # Cache for convenience.
        j2annots = {}
        for j, h in enumerate(annot_matrix.headers_h):
            annots = annot_matrix.header2annots[h]
            j2annots[j] = annots
        num_annots = len(j2annots)

        chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
        ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
        pos = [int(x) for x in pos]
        for i, coord in enumerate(zip(chrom, pos, ref, alt)):
            if coord not in calls:
                continue

            row0 = [None] * num_annots
            for j in range(num_annots):
                row0[j] = j2annots[j][i]
            row1 = [""] * len(var_matrix.samples)
            for j, sample in enumerate(var_matrix.samples):
                if sample in calls[coord]:
                    row1[j] = coord2sample2nc[coord][sample]

            row = row0 + row1
            assert len(row) == len(header)
            print >> handle, "\t".join(map(str, row))

        handle.close()

        return metadata
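
The nested-dictionary tally above is the core of this module. The same logic, reduced to one function (a sketch; count_callers is a hypothetical name, and Call objects are assumed to carry num_ref/num_alt/total/vaf attributes, as they do here):

def count_callers(coord2samplecaller2call):
    # (chrom, pos, ref, alt) -> sample -> number of callers that made
    # a real (non-empty) call.
    coord2sample2nc = {}
    for coord, sc2call in coord2samplecaller2call.items():
        sample2nc = coord2sample2nc.setdefault(coord, {})
        for (sample, caller), call in sc2call.items():
            # Skip placeholder calls with no supporting evidence.
            if not (call.num_ref or call.num_alt or call.total or
                    call.vaf):
                continue
            sample2nc[sample] = sample2nc.get(sample, 0) + 1
    return coord2sample2nc
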
Code Example #22
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        import itertools
        from genomicode import SimpleVariantMatrix
        from genomicode import AnnotationMatrix
        from Betsy import module_utils as mlib

        summary_file = in_data.identifier
        metadata = {}

        #x = mlib.get_user_option(
        #    user_options, "nonsynonymous_and_stopgain_only",
        #    allowed_values=["no", "yes"])
        #nonsynonymous_and_stopgain_only = (x == "yes")

        min_alt_reads = mlib.get_user_option(user_options,
                                             "filter_by_min_alt_reads",
                                             not_empty=True,
                                             type=int)
        assert min_alt_reads >= 0 and min_alt_reads < 10000

        min_total_reads = mlib.get_user_option(user_options,
                                               "filter_by_min_total_reads",
                                               not_empty=True,
                                               type=int)
        assert min_total_reads >= 0 and min_total_reads < 10000

        min_vaf = mlib.get_user_option(user_options,
                                       "filter_by_min_vaf",
                                       not_empty=True,
                                       type=float)
        assert min_vaf >= 0.0 and min_vaf < 1.0

        #min_gq = mlib.get_user_option(
        #    user_options, "filter_by_min_GQ", not_empty=True, type=float)
        #assert min_gq >= 0 and min_gq < 1000

        assert min_total_reads or min_alt_reads, "No filter"

        matrix = SimpleVariantMatrix.read_as_am(summary_file)
        #var_matrix = SimpleVariantMatrix.read(summary_file)
        #call_matrix = var_matrix.call_matrix
        #annot_matrix = var_matrix.annot_matrix

        #annovar_matrix = None
        #for (name, matrix) in var_matrix.named_matrices:
        #    if "ExonicFunc.refGene" in matrix.headers:
        #        annovar_matrix = matrix
        #        break
        #assert annovar_matrix, "Missing annotation: ExonicFunc.refGene"

        # copy.deepcopy is very slow.  Try to avoid it.
        # Strategy:
        # 1.  Make a list of the changes to be made.
        # 2.  Save the filtered rows.
        # 3.  Make the changes.
        # 4.  Save the non-filtered rows.
        I_remove = {}  # i -> 1
        call_remove = {}  # i -> (sample, caller) -> 1

        #CHROM = matrix.header2annots["______Chrom"]
        #POS = matrix.header2annots["______Pos"]
        #POS = [int(x) for x in POS]
        #REF = matrix.header2annots["______Ref"]
        #ALT = matrix.header2annots["______Alt"]

        # Optimization: normalize the headers for the samples and callers.
        sc2header = {}  # (sample, caller) -> header_h
        for sc in itertools.product(matrix.samples, matrix.callers):
            sample, caller = sc
            header = "%s___%s___Ref/Alt/VAF" % (sample, caller)
            header_h = matrix.normalize_header(header)
            assert header_h
            sc2header[sc] = header_h

        for i in range(matrix.num_annots()):
            has_calls = False  # whether this row has any calls.
            for sc in itertools.product(matrix.samples, matrix.callers):
                sample, caller = sc

                header_h = sc2header[sc]
                call_str = matrix.header2annots[header_h][i]
                if not call_str:
                    continue
                call = SimpleVariantMatrix._parse_call(call_str)

                filt = False
                # filter_by_min_alt_reads
                if min_alt_reads > 0 and \
                   (call.num_alt is None or call.num_alt < min_alt_reads):
                    filt = True
                # filter_by_min_total_reads
                if min_total_reads > 0 and (call.total is None
                                            or call.total < min_total_reads):
                    filt = True

                # filter_by_min_vaf
                if min_vaf >= 1E-6 and (call.vaf is None
                                        or call.vaf < min_vaf):
                    filt = True

                if filt:
                    if i not in call_remove:
                        call_remove[i] = {}
                    call_remove[i][sc] = 1
                else:
                    has_calls = True

            # If this coordinate has no more calls, then remove the
            # whole row.
            if not has_calls:
                I_remove[i] = 1
        I_remove = sorted(I_remove)

        # Write out a matrix of the discarded rows.
        filtered_matrix = AnnotationMatrix.rowslice(matrix, I_remove)
        SimpleVariantMatrix.write_from_am("discarded.txt", filtered_matrix)

        # Remove the calls.
        for i in call_remove:
            for sc in call_remove[i]:
                header_h = sc2header[sc]
                call_str = matrix.header2annots[header_h][i]
                assert call_str
                matrix.header2annots[header_h][i] = ""

        # Which rows to keep.
        I_remove_dict = {}.fromkeys(I_remove)
        I_keep = [
            i for i in range(matrix.num_annots()) if i not in I_remove_dict
        ]
        filtered_matrix = AnnotationMatrix.rowslice(matrix, I_keep)
        SimpleVariantMatrix.write_from_am(out_filename, filtered_matrix)

        ## ## Filter out synonymous variants.
        ## #if nonsynonymous_and_stopgain_only:
        ## #    # Make sure annotated with Annovar.
        ## #    assert "ExonicFunc.refGene" in annovar_matrix.headers
        ## #    exonic_func = annovar_matrix["ExonicFunc.refGene"]
        ## #    for i, efunc in enumerate(exonic_func):
        ## #        efunc = exonic_func[i]
        ## #        assert efunc in [
        ## #            "", "nonsynonymous SNV", "synonymous SNV",
        ## #            "stopgain", "stoploss",
        ## #            "frameshift substitution", "nonframeshift substitution",
        ## #            "unknown"], \
        ## #            "Unknown exonic_func: %s" % efunc
        ## #        if efunc not in ["nonsynonymous SNV", "stopgain"]:
        ## #            I_remove[i] = 1
        ## #            continue

        ## # Filter based on the calls.
        ## if min_alt_reads > 0 or min_total_reads > 0:
        ##     all_coord = call_matrix.coord2samplecaller2call.keys()
        ##     for coord in all_coord:
        ##         all_sc = call_matrix.coord2samplecaller2call[coord].keys()
        ##         for sc in all_sc:
        ##             # SimpleVariantMatrix.Call object.
        ##             call = call_matrix.coord2samplecaller2call[coord][sc]

        ##             # filter_by_min_alt_reads
        ##             if min_alt_reads > 0 and \
        ##                (call.num_alt is None or call.num_alt < min_alt_reads):
        ##                 if coord not in call_remove:
        ##                     call_remove[coord] = {}
        ##                 call_remove[coord][sc] = 1

        ##             # filter_by_min_total_reads
        ##             if min_total_reads > 0 and (
        ##                 call.total is None or call.total < min_total_reads):
        ##                 if coord not in call_remove:
        ##                     call_remove[coord] = {}
        ##                 call_remove[coord][sc] = 1

        ## # Filter based on VAF.
        ## if min_vaf >= 1E-6:
        ##     all_coord = call_matrix.coord2samplecaller2call.keys()
        ##     for coord in all_coord:
        ##         all_sc = call_matrix.coord2samplecaller2call[coord].keys()
        ##         for sc in all_sc:
        ##             call = call_matrix.coord2samplecaller2call[coord][sc]

        ##             # filter_by_min_vaf
        ##             if call.vaf is None or call.vaf < min_vaf:
        ##                 if coord not in call_remove:
        ##                     call_remove[coord] = {}
        ##                 call_remove[coord][sc] = 1

        ## # If any of these coordinates have no more variants, then
        ## # remove the whole row.
        ## if call_remove:
        ##     chrom, pos = annot_matrix["Chrom"], annot_matrix["Pos"]
        ##     ref, alt = annot_matrix["Ref"], annot_matrix["Alt"]
        ##     pos = [int(x) for x in pos]
        ##     coord2i = {}
        ##     for i, coord in enumerate(zip(chrom, pos, ref, alt)):
        ##         coord2i[coord] = i

        ##     for coord in call_remove:
        ##         num_remove = len(call_remove[coord])
        ##         num_calls = len(call_matrix.coord2samplecaller2call[coord])
        ##         assert num_remove <= num_calls
        ##         if num_remove == num_calls:
        ##             i = coord2i[coord]
        ##             I_remove[i] = 1

        ## # Make a matrix of the discarded rows.
        ## old_annot_matrix = var_matrix.annot_matrix
        ## old_named_matrices = var_matrix.named_matrices
        ## filtered_matrix = var_matrix
        ## x = AnnotationMatrix.rowslice(var_matrix.annot_matrix, I_remove)
        ## filtered_matrix.annot_matrix = x
        ## named_matrices = []
        ## for (name, matrix) in var_matrix.named_matrices:
        ##     matrix = AnnotationMatrix.rowslice(matrix, I_remove)
        ##     named_matrices.append((name, matrix))
        ## filtered_matrix.named_matrices = named_matrices
        ## SimpleVariantMatrix.write("discarded.txt", filtered_matrix)
        ## var_matrix.annot_matrix = old_annot_matrix
        ## var_matrix.named_matrices = old_named_matrices

        ## # Remove the calls.
        ## for coord in call_remove:
        ##     chrom, pos, ref, alt = coord
        ##     for (sample, caller) in call_remove[coord]:
        ##         var_matrix.call_matrix.set_call(
        ##             chrom, pos, ref, alt, sample, caller, None)

        ## # Which rows to keep.
        ## I_keep = [
        ##     i for i in range(var_matrix.num_variants()) if i not in I_remove]
        ## # Filter annotation matrix
        ## var_matrix.annot_matrix = AnnotationMatrix.rowslice(
        ##     var_matrix.annot_matrix, I_keep)
        ## # Filter named matrices.
        ## for i, (name, matrix) in enumerate(var_matrix.named_matrices):
        ##     matrix = AnnotationMatrix.rowslice(matrix, I_keep)
        ##     var_matrix.named_matrices[i] = (name, matrix)

        ## SimpleVariantMatrix.write(out_filename, var_matrix)

        return metadata
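
The three threshold checks above decide whether a single call is filtered. Pulled out into one predicate (a sketch; fails_filters is a hypothetical name):

def fails_filters(call, min_alt_reads, min_total_reads, min_vaf):
    # A call is dropped if any enabled threshold is not met; a
    # threshold of 0 (or a VAF below 1E-6) is treated as disabled.
    if min_alt_reads > 0 and (call.num_alt is None or
                              call.num_alt < min_alt_reads):
        return True
    if min_total_reads > 0 and (call.total is None or
                                call.total < min_total_reads):
        return True
    if min_vaf >= 1E-6 and (call.vaf is None or call.vaf < min_vaf):
        return True
    return False
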
Code Example #23
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        # For debugging.
        RUN_VARIANT_CALLING = True
        FILTER_CALLS = True
        MERGE_CALLS = True
        FIX_VCF_FILES = True

        dna_bam_node, rna_bam_node, nc_node, ref_node = antecedents
        dna_bam_filenames = mlib.find_bam_files(dna_bam_node.identifier)
        assert dna_bam_filenames, "No DNA .bam files."
        rna_bam_filenames = mlib.find_bam_files(rna_bam_node.identifier)
        assert rna_bam_filenames, "No RNA .bam files."
        nc_match = mlib.read_normal_cancer_file(nc_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}
        metadata["tool"] = "Radia %s" % alignlib.get_radia_version()

        ## Make sure the BAM files do not contain spaces in the
        ## filenames.  Radia doesn't work well with spaces.
        #filenames = dna_bam_filenames + rna_bam_filenames
        #has_spaces = []
        #for filename in filenames:
        #    if filename.find(" ") >= 0:
        #        has_spaces.append(filename)
        #x = has_spaces
        #if len(x) > 5:
        #    x = x[:5] + ["..."]
        #x = ", ".join(x)
        #msg = "Radia breaks if there are spaces in filenames: %s" % x
        #assert not has_spaces, msg

        # sample -> bam filename
        dnasample2bamfile = mlib.root2filename(dna_bam_filenames)
        rnasample2bamfile = mlib.root2filename(rna_bam_filenames)
        # Make sure files exist for all the samples.  The DNA-Seq
        # should have both normal and cancer.  RNA is not needed for
        # normal sample.
        mlib.assert_normal_cancer_samples(nc_match, dnasample2bamfile)
        mlib.assert_normal_cancer_samples(nc_match,
                                          rnasample2bamfile,
                                          ignore_normal_sample=True)

        # Make sure Radia and snpEff are configured.
        radia_genome_assembly = mlib.get_user_option(user_options,
                                                     "radia_genome_assembly",
                                                     not_empty=True)
        assert radia_genome_assembly == "hg19", "Only hg19 handled."
        snp_eff_genome = mlib.get_user_option(user_options,
                                              "snp_eff_genome",
                                              not_empty=True)

        radia_path = mlib.get_config("radia_path", assert_exists=True)
        snp_eff_path = mlib.get_config("snp_eff_path", assert_exists=True)
        radia_files = get_radia_files(radia_path, radia_genome_assembly)

        # Make a list of the chromosomes to use.  Pick an arbitrary
        # BAM file, and look at only the chromosomes that are present
        # in all files.
        all_bamfiles = dnasample2bamfile.values() + rnasample2bamfile.values()
        chroms = list_common_chromosomes(all_bamfiles)
        assert chroms, "No chromosomes found in all files."
        # Only use the chromosomes that can be filtered by Radia.
        chroms = filter_radia_chromosomes(chroms, radia_files)

        # Make output directories.
        radia_outpath = "radia1.tmp"
        filter_outpath = "radia2.tmp"
        merge_outpath = "radia3.tmp"

        if not os.path.exists(radia_outpath):
            os.mkdir(radia_outpath)
        if not os.path.exists(filter_outpath):
            os.mkdir(filter_outpath)
        if not os.path.exists(merge_outpath):
            os.mkdir(merge_outpath)

        # Steps:
        # 1.  Call variants (radia.py)
        #     -o <file.vcf>
        # 2.  Filter variants (filterRadia.py)
        #     <outpath>
        #     Creates a file: <filter_outpath>/<patient_id>_chr<chrom>.vcf
        # 3.  Merge (mergeChroms.py)
        #     Takes as input: <filter_outpath>
        #     Produces: <merge_outpath>/<patient_id>.vcf

        # list of (normal_sample, cancer_sample, chrom,
        #   normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile,
        #   radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile,
        #   final_vcf_outfile,
        #   radia_logfile, filter_logfile, merge_logfile)
        opj = os.path.join
        jobs = []
        for i, (normal_sample, cancer_sample) in enumerate(nc_match):
            normal_bamfile = dnasample2bamfile[normal_sample]
            dna_tumor_bamfile = dnasample2bamfile[cancer_sample]
            rna_tumor_bamfile = rnasample2bamfile[cancer_sample]

            merge_vcf_outfile = opj(merge_outpath, "%s.vcf" % cancer_sample)
            merge_logfile = opj(merge_outpath, "%s.log" % cancer_sample)
            final_vcf_outfile = opj(out_path, "%s.vcf" % cancer_sample)

            for chrom in chroms:
                radia_vcf_outfile = opj(
                    radia_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                filter_vcf_outfile = opj(
                    filter_outpath, "%s_chr%s.vcf" % (cancer_sample, chrom))
                radia_logfile = opj(radia_outpath,
                                    "%s_chr%s.log" % (cancer_sample, chrom))
                filter_logfile = opj(filter_outpath,
                                     "%s_chr%s.log" % (cancer_sample, chrom))
                x = normal_sample, cancer_sample, chrom, \
                    normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                    radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                    final_vcf_outfile, \
                    radia_logfile, filter_logfile, merge_logfile
                jobs.append(x)

        # Since Radia doesn't work well if there are spaces in the
        # filenames, symlink these files here to guarantee that there
        # are no spaces.
        # Despite the ".bam" names, these are directories that will
        # hold space-free symlinks to the BAM files.
        normal_path = "normal.bam"
        dna_path = "dna.bam"
        rna_path = "rna.bam"
        if not os.path.exists(normal_path):
            os.mkdir(normal_path)
        if not os.path.exists(dna_path):
            os.mkdir(dna_path)
        if not os.path.exists(rna_path):
            os.mkdir(rna_path)
        for i, x in enumerate(jobs):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            x1 = hash_and_symlink_bamfile(normal_bamfile, normal_path)
            x2 = hash_and_symlink_bamfile(dna_tumor_bamfile, dna_path)
            x3 = hash_and_symlink_bamfile(rna_tumor_bamfile, rna_path)
            clean_normal, clean_dna, clean_rna = x1, x2, x3
            x = normal_sample, cancer_sample, chrom, \
                clean_normal, clean_dna, clean_rna, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile
            jobs[i] = x

        # Generate the commands for doing variant calling.
        python = mlib.get_config("python", which_assert_file=True)

        # filterRadia.py calls the "blat" command, and there's no way
        # to set the path.  Make sure "blat" is executable.
        if not filelib.which("blat"):
            # Find "blat" in the configuration and add it to the path.
            x = mlib.get_config("blat", which_assert_file=True)
            path, x = os.path.split(x)
            if os.environ["PATH"]:
                path = "%s:%s" % (os.environ["PATH"], path)
            os.environ["PATH"] = path
            # Make sure it's findable now.
            filelib.which_assert("blat")

        # STEP 1.  Call variants with radia.py.
        # python radia.py test31 5 \
        # -n bam04/PIM001_G.bam \
        # -t bam04/196B-MG.bam \
        # -r bam34/196B-MG.bam \
        # -f genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
        # -o test32.vcf
        # --dnaTumorMitochon MT \
        # --rnaTumorMitochon MT \
        sq = mlib.sq
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.radia_py),
                cancer_sample,
                chrom,
                "-n",
                sq(normal_bamfile),
                "-t",
                sq(dna_tumor_bamfile),
                "-r",
                sq(rna_tumor_bamfile),
                "-f",
                sq(ref.fasta_file_full),
                "-o",
                radia_vcf_outfile,
            ]
            if "MT" in chroms:
                x += [
                    "--dnaNormalMitochon MT",
                    "--dnaTumorMitochon MT",
                    "--rnaTumorMitochon MT",
                ]
            x = " ".join(x)
            x = "%s >& %s" % (x, radia_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Only uses ~200 Mb of ram.
        if RUN_VARIANT_CALLING:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["num_cores"] = num_cores
        metadata["commands"] = commands

        # Make sure log files are empty.
        logfiles = [x[10] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # STEP 2.  Filter variants with filterRadia.py.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            x = [
                sq(python),
                sq(radia_files.filterRadia_py),
                cancer_sample,
                chrom,
                sq(radia_vcf_outfile),
                sq(filter_outpath),
                sq(radia_files.scripts_dir),
                "-b",
                sq(radia_files.blacklist_dir),
                "-d",
                sq(radia_files.snp_dir),
                "-r",
                sq(radia_files.retro_dir),
                "-p",
                sq(radia_files.pseudo_dir),
                "-c",
                sq(radia_files.cosmic_dir),
                "-t",
                sq(radia_files.target_dir),
                "-s",
                sq(snp_eff_path),
                "-e",
                snp_eff_genome,
                "--rnaGeneBlckFile",
                sq(radia_files.rnageneblck_file),
                "--rnaGeneFamilyBlckFile",
                sq(radia_files.rnagenefamilyblck_file),
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, filter_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)

        # Sometimes samtools crashes in the middle of a run.  Detect
        # this case, and re-run the analysis if needed.
        assert len(commands) == len(jobs)
        py_commands = []
        for x, cmd in zip(jobs, commands):
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = cmd, cancer_sample, chrom, filter_logfile
            x = _run_filterRadia_with_restart, args, {}
            py_commands.append(x)
        # Takes ~10 Gb each; budget 25 Gb per process to be safe.
        nc = mlib.calc_max_procs_from_ram(25, upper_max=num_cores)
        if FILTER_CALLS:
            parallel.pyfun(py_commands, num_procs=nc)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[11] for x in jobs]
        filelib.assert_exists_z_many(logfiles)

        # Make sure filter_vcf_outfile exists.
        outfiles = [x[7] for x in jobs]
        filelib.assert_exists_nz_many(outfiles)

        # STEP 3.  Merge the results.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x

            # python /usr/local/radia/scripts/mergeChroms.py 196B-MG \
            #   radia2.tmp/ radia3.tmp
            # The "/" after radia2.tmp is important.  If not given,
            # will generate some files with only newlines.

            fo = filter_outpath
            if not fo.endswith("/"):
                fo = "%s/" % fo
            x = [
                sq(python),
                sq(radia_files.mergeChroms_py),
                cancer_sample,
                fo,
                merge_outpath,
            ]
            x = " ".join(x)
            x = "%s >& %s" % (x, merge_logfile)
            commands.append(x)
        assert len(commands) == len(jobs)
        # Since the chromosomes were separated for the previous steps,
        # this will generate one merge for each chromosome.  This is
        # unnecessary, since we only need to merge once per sample.
        # Get rid of duplicates.
        commands = sorted({}.fromkeys(commands))
        if MERGE_CALLS:
            parallel.pshell(commands, max_procs=num_cores)
        metadata["commands"] += commands

        # Make sure log files are empty.
        logfiles = [x[12] for x in jobs]
        logfiles = sorted({}.fromkeys(logfiles))
        filelib.assert_exists_z_many(logfiles)

        # Fix the VCF files.
        commands = []
        for x in jobs:
            normal_sample, cancer_sample, chrom, \
                normal_bamfile, dna_tumor_bamfile, rna_tumor_bamfile, \
                radia_vcf_outfile, filter_vcf_outfile, merge_vcf_outfile, \
                final_vcf_outfile, \
                radia_logfile, filter_logfile, merge_logfile = x
            args = normal_sample, cancer_sample, \
                   merge_vcf_outfile, final_vcf_outfile
            x = alignlib.clean_radia_vcf, args, {}
            commands.append(x)
        if FIX_VCF_FILES:
            parallel.pyfun(commands, num_procs=num_cores)

        # Make sure output VCF files exist.
        x = [x[9] for x in jobs]
        filelib.assert_exists_nz_many(x)

        return metadata
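
_run_filterRadia_with_restart is referenced above but not shown in this excerpt. A plausible sketch of a retry wrapper with the same intent, assuming an empty log file signals success (as the assertions here do); the name, argument shape, and retry count are assumptions:

import os
import subprocess

def run_with_restart(args, log_filename, max_tries=3):
    # Re-run a command that sometimes dies mid-run; stdout and stderr
    # go to the log file, and an empty log is taken as a clean run.
    for _ in range(max_tries):
        with open(log_filename, "w") as log:
            subprocess.call(args, stdout=log, stderr=log)
        if os.path.getsize(log_filename) == 0:
            return
    raise AssertionError(
        "failed after %d tries: %r" % (max_tries, args))
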
Code Example #24
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        bam_node, ref_node, target_node = antecedents

        bam_filenames = module_utils.find_bam_files(bam_node.identifier)
        assert bam_filenames, "No .bam files."
        target_filenames = filelib.list_files_in_path(target_node.identifier,
                                                      endswith=".intervals")
        assert target_filenames, "No .intervals files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        assert len(bam_filenames) == len(target_filenames), \
               "Should have an .intervals file for each bam file."
        sample2bamfilename = {}
        for filename in bam_filenames:
            p, f = os.path.split(filename)
            sample, ext = os.path.splitext(f)
            assert sample not in sample2bamfilename
            sample2bamfilename[sample] = filename
        sample2targetfilename = {}
        for filename in target_filenames:
            p, f = os.path.split(filename)
            sample, ext = os.path.splitext(f)
            assert sample not in sample2targetfilename
            sample2targetfilename[sample] = filename
        assert len(sample2bamfilename) == len(sample2targetfilename)

        missing = [
            x for x in sample2bamfilename if x not in sample2targetfilename
        ]
        assert not missing, "Missing interval files for %d bam files." % \
               len(missing)

        # list of (bam_filename, target_filename, log_filename, out_filename)
        jobs = []
        for sample in sample2bamfilename:
            bam_filename = sample2bamfilename[sample]
            target_filename = sample2targetfilename[sample]

            p, f = os.path.split(bam_filename)
            sample, ext = os.path.splitext(f)
            out_filename = os.path.join(out_path, "%s.bam" % sample)
            log_filename = os.path.join(out_path, "%s.log" % sample)
            x = bam_filename, target_filename, log_filename, out_filename
            jobs.append(x)

        known_sites = []
        x1 = module_utils.get_user_option(user_options,
                                          "realign_known_sites1",
                                          check_file=True)
        x2 = module_utils.get_user_option(user_options,
                                          "realign_known_sites2",
                                          check_file=True)
        x3 = module_utils.get_user_option(user_options,
                                          "realign_known_sites3",
                                          check_file=True)
        x = [x1, x2, x3]
        x = [x for x in x if x]
        known_sites = x
        assert known_sites

        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar \
        #   -T IndelRealigner -R <ref.fa> \
        #   -I <bam_file> -targetIntervals <target_file> -o <bam_file>

        # Make a list of commands.
        commands = []
        for x in jobs:
            bam_filename, target_filename, log_filename, out_filename = x
            x = [("known", x) for x in known_sites]
            x = alignlib.make_GATK_command(T="IndelRealigner",
                                           R=ref.fasta_file_full,
                                           I=bam_filename,
                                           targetIntervals=target_filename,
                                           o=out_filename,
                                           _UNHASHABLE=x)
            x = "%s >& %s" % (x, log_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        #import sys; sys.exit(0)

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)
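
make_GATK_command hides the command assembly; based on the template in the comment above, the command it produces here should look roughly like this (a sketch; the jar path and helper name are assumptions):

def indel_realigner_command(gatk_jar, ref_fa, bam_file, target_file,
                            out_file, known_sites):
    # Build the java command, one -known flag per known-sites VCF.
    cmd = ["java", "-Xmx5g", "-jar", gatk_jar,
           "-T", "IndelRealigner",
           "-R", ref_fa,
           "-I", bam_file,
           "-targetIntervals", target_file,
           "-o", out_file]
    for vcf in known_sites:
        cmd.extend(["-known", vcf])
    return " ".join(cmd)
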
Code Example #25
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, ref_node = antecedents
        in_filenames = mlib.find_bam_files(bam_node.identifier)
        assert in_filenames, "No .bam files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)
        metadata = {}

        jobs = []  # list of (in_filename, log_filename, out_filename)
        for in_filename in in_filenames:
            p, f = os.path.split(in_filename)
            f, ext = os.path.splitext(f)
            log_filename = os.path.join(out_path, "%s.log" % f)
            out_filename = os.path.join(out_path, "%s.intervals" % f)
            x = in_filename, log_filename, out_filename
            jobs.append(x)

        filter_reads_with_N_cigar = mlib.get_user_option(
            user_options,
            "filter_reads_with_N_cigar",
            allowed_values=["no", "yes"])

        known_sites = []
        x1 = mlib.get_user_option(user_options,
                                  "realign_known_sites1",
                                  check_file=True)
        x2 = mlib.get_user_option(user_options,
                                  "realign_known_sites2",
                                  check_file=True)
        x3 = mlib.get_user_option(user_options,
                                  "realign_known_sites3",
                                  check_file=True)
        x = [x1, x2, x3]
        x = [x for x in x if x]
        known_sites = x
        assert known_sites

        # I/O bound, so not likely to get a big speedup with nt.

        # java -Xmx5g -jar /usr/local/bin/GATK/GenomeAnalysisTK.jar -nt 4
        #   -T RealignerTargetCreator -R ../genome.idx/erdman.fa -I $i -o $j
        #   --known <known_vcf_file>

        # RealignerTargetCreator takes ~10Gb per process.  Each thread
        # takes the full amount of memory.
        nc = mlib.calc_max_procs_from_ram(12, upper_max=num_cores)

        # Make a list of commands.
        commands = []
        for x in jobs:
            in_filename, log_filename, out_filename = x

            n = max(1, nc / len(jobs))
            x = [("-known", x) for x in known_sites]
            if filter_reads_with_N_cigar == "yes":
                x.append(("-filter_reads_with_N_cigar", None))
            x = alignlib.make_GATK_command(nt=n,
                                           T="RealignerTargetCreator",
                                           R=ref.fasta_file_full,
                                           I=in_filename,
                                           o=out_filename,
                                           _UNHASHABLE=x)
            x = "%s >& %s" % (x, log_filename)
            commands.append(x)

        parallel.pshell(commands, max_procs=nc)
        metadata["num_procs"] = nc
        metadata["commands"] = commands

        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)
        return metadata
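
calc_max_procs_from_ram caps the worker count by memory rather than cores. A minimal sketch of the idea, assuming total physical RAM (via Linux-style sysconf) is the budget; the real helper may measure free memory or use a configured limit:

import os

def max_procs_from_ram(gb_per_proc, upper_max):
    # How many gb_per_proc-sized processes fit in physical RAM,
    # clamped to the range [1, upper_max].
    total = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
    n = int(total / (gb_per_proc * 1024.0 ** 3))
    return max(1, min(n, upper_max))
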
Code Example #26
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        import os
        import shutil
        from genomicode import filelib
        from genomicode import cluster30
        from Betsy import module_utils as mlib
        import cluster_genes_by_hierarchical

        filelib.safe_mkdir(out_path)
        metadata = {}

        # Not implemented: everything below this raise is unreachable
        # until the module is finished.
        raise NotImplementedError

        DISTANCE_MEASURES = cluster30.DIST2ID.keys()
        YESNO = ["yes", "no"]

        cluster_genes = mlib.get_user_option(user_options,
                                             "cluster_genes",
                                             not_empty=True,
                                             allowed_values=YESNO)
        cluster_arrays = mlib.get_user_option(user_options,
                                              "cluster_arrays",
                                              not_empty=True,
                                              allowed_values=YESNO)
        distance_metric = mlib.get_user_option(
            user_options,
            "distance_measure",
            not_empty=True,
            allowed_values=DISTANCE_MEASURES)
        som_rows = mlib.get_user_option(user_options,
                                        "som_rows",
                                        not_empty=True,
                                        type=int)
        som_cols = mlib.get_user_option(user_options,
                                        "som_cols",
                                        not_empty=True,
                                        type=int)
        assert som_rows >= 1 and som_rows < 100
        assert som_cols >= 1 and som_cols < 100

        jobname = "cluster"
        cmd = cluster30.cluster30_file(in_data.identifier,
                                       (cluster_genes == "yes"),
                                       (cluster_arrays == "yes"),
                                       "som",
                                       distance=distance_metric,
                                       som_rows=som_rows,
                                       som_cols=som_cols,
                                       jobname=jobname)
        metadata["command"] = cmd

        # Find the output files and name them appropriately.
        cluster_files = cluster30._find_cluster_files(jobname)
        cluster_genes_by_hierarchical.fix_cluster30_dup_header(
            cluster_files["cdt"])

        opj = os.path.join
        out_cdt_file = opj(out_path, "signal.cdt")
        #out_kag_file = opj(out_path, "array_cluster.kag")
        #out_kgg_file = opj(out_path, "gene_cluster.kgg")

        assert "txt" in cluster_files
        shutil.copy2(cluster_files["txt"], out_cdt_file)
        #if "kag" in cluster_files:
        #    shutil.copy2(cluster_files["kag"], out_kag_file)
        #if "kgg" in cluster_files:
        #    shutil.copy2(cluster_files["kgg"], out_kgg_file)

        return metadata
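
get_user_option appears in nearly every module here with the same keyword arguments. Its behavior can be inferred from the call sites; a compatible sketch (not the actual Betsy implementation) might be:

def get_user_option(user_options, name, not_empty=False, type=None,
                    allowed_values=None, check_file=False):
    import os
    x = user_options.get(name, "")
    if not_empty:
        assert x != "", "Missing option: %s" % name
    if allowed_values is not None and x != "":
        assert x in allowed_values, "Invalid %s: %s" % (name, x)
    if check_file and x != "":
        assert os.path.exists(x), "File not found: %s" % x
    if type is not None and x != "":
        x = type(x)  # "type" is the converter passed in, e.g. int.
    return x
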
Code Example #27
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        vcf_node, ref_node = antecedents
        vcf_filenames = filelib.list_files_in_path(vcf_node.identifier,
                                                   endswith=".vcf")
        assert vcf_filenames, "No .vcf files."
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        jobs = []
        for in_filename in vcf_filenames:
            p, f = os.path.split(in_filename)
            f, ext = os.path.splitext(f)
            out_filename = os.path.join(out_path, "%s.grp" % f)
            log_filename = os.path.join(out_path, "%s.log" % f)
            recal_filename = os.path.join(out_path,
                                          "%s.recalibrate_SNP.recal" % f)
            tranches_filename = os.path.join(out_path,
                                             "%s.recalibrate_SNP.tranches" % f)
            rscript_filename = os.path.join(out_path,
                                            "%s.recalibrate_SNP_plots.R" % f)
            assert in_filename != out_filename
            x = (in_filename, log_filename, recal_filename, tranches_filename,
                 rscript_filename)
            jobs.append(x)

        # -resource:dbsnp,known=true,training=false,truth=false,prior=6.0
        #    dbsnp_135.b37.vcf
        # -resource:hapmap,known=false,training=true,truth=true,prior=15.0
        #    hapmap_3.3.b37.sites.vcf
        # -resource:1000G,known=false,training=true,truth=false,prior=10.0
        #    1000G_phase1.snps.high_confidence.vcf
        # -resource:omni,known=false,training=true,truth=false,prior=12.0
        #    1000G_omni2.5.b37.sites.vcf
        known_sites = []
        x1 = module_utils.get_user_option(user_options,
                                          "vcf_recal_dbsnp",
                                          not_empty=True,
                                          check_file=True)
        x2 = module_utils.get_user_option(user_options,
                                          "vcf_recal_mills_indels",
                                          not_empty=True,
                                          check_file=True)
        x3 = module_utils.get_user_option(user_options,
                                          "vcf_recal_1kg_indels",
                                          not_empty=True,
                                          check_file=True)
        x4 = module_utils.get_user_option(user_options,
                                          "vcf_recal_omni",
                                          not_empty=True,
                                          check_file=True)
        y1 = "resource:dbsnp,known=true,training=false,truth=false,prior=6.0"
        y2 = "resource:hapmap,known=false,training=true,truth=true,prior=15.0"
        y3 = "resource:1000G,known=false,training=true,truth=false,prior=10.0"
        y4 = "resource:omni,known=false,training=true,truth=false,prior=12.0"
        known_sites = [(y1, x1), (y2, x2), (y3, x3), (y4, x4)]

        # Names of the annotations to use for the recalibration model.
        AN = [
            "DP", "QD", "FS", "SOR", "MQ", "MQRankSum", "ReadPosRankSum",
            "InbreedingCoeff"
        ]
        TRANCHE = ["100.0", "99.9", "99.0", "90.0"]

        # Make a list of commands.
        commands = []
        for x in jobs:
            (in_filename, log_filename, recal_filename, tranches_filename,
             rscript_filename) = x
            x1 = known_sites
            x2 = [("an", x) for x in AN]
            x3 = [("tranche", x) for x in TRANCHE]
            unhash = x1 + x2 + x3
            x = alignlib.make_GATK_command(T="VariantRecalibrator",
                                           R=ref.fasta_file_full,
                                           input=in_filename,
                                           mode="SNP",
                                           recalFile=recal_filename,
                                           tranchesFile=tranches_filename,
                                           rscriptFile=rscript_filename,
                                           _UNHASHABLE=unhash)
            x = "%s >& %s" % (x, log_filename)
            commands.append(x)

        #for x in commands:
        #    print x
        #import sys; sys.exit(0)

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        out_filenames = [x[-1] for x in jobs]
        filelib.assert_exists_nz_many(out_filenames)
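
The _UNHASHABLE list above turns into repeated command-line arguments: one -resource per training set, one -an per annotation, and one -tranche per sensitivity level (make_GATK_command prepends the single dash). Expanded by hand (a sketch; expand_vqsr_args is a hypothetical name):

def expand_vqsr_args(known_sites, annotations, tranches):
    # known_sites: list of (resource_tag, vcf_filename) pairs, where a
    # tag looks like
    # "resource:dbsnp,known=true,training=false,truth=false,prior=6.0".
    x1 = ["-%s %s" % (tag, fn) for tag, fn in known_sites]
    x2 = ["-an %s" % a for a in annotations]
    x3 = ["-tranche %s" % t for t in tranches]
    return " ".join(x1 + x2 + x3)
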
Code Example #28
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        from genomicode import filelib
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        simplematrix_file = in_data.identifier
        filelib.assert_exists_nz(simplematrix_file)
        metadata = {}

        x = mlib.get_user_option(user_options,
                                 "nonsynonymous_and_stopgain_only",
                                 allowed_values=["no", "yes"])
        nonsynonymous_and_stopgain_only = (x == "yes")

        x = mlib.get_user_option(user_options,
                                 "sift_polyphen_damaging",
                                 allowed_values=["no", "yes"])
        sift_polyphen_damaging = (x == "yes")

        min_coverage_in_every_sample = None
        min_callers_in_every_sample = None
        min_callers_in_any_sample = None
        min_gene_expression_in_every_sample = None
        x = mlib.get_user_option(user_options,
                                 "min_coverage_in_every_sample",
                                 type=int)
        if x != "":
            min_coverage_in_every_sample = x
        x = mlib.get_user_option(user_options,
                                 "min_callers_in_every_sample",
                                 type=int)
        if x != "":
            min_callers_in_every_sample = x
        x = mlib.get_user_option(user_options,
                                 "min_callers_in_any_sample",
                                 type=int)
        if x != "":
            min_callers_in_any_sample = x
        x = mlib.get_user_option(user_options,
                                 "min_gene_expression_in_every_sample",
                                 type=float)
        if x != "":
            min_gene_expression_in_every_sample = x

        assert not (min_callers_in_every_sample and min_callers_in_any_sample)
        assert nonsynonymous_and_stopgain_only or \
               sift_polyphen_damaging or \
               min_callers_in_every_sample or \
               min_callers_in_any_sample or \
               min_gene_expression_in_every_sample or \
               min_coverage_in_every_sample, \
               "No filters"

        MATRIX = SimpleVariantMatrix.read_as_am(simplematrix_file)

        commands = []
        #in_attrs = in_data.data.attributes
        if nonsynonymous_and_stopgain_only:
            # Actually, just look into the file instead.
            #assert in_attrs["annotated"] == "yes"
            MATRIX = filter_nonsynonymous(MATRIX)
            commands.append("Keep only nonsynonymous and stopgain variants.")
        if sift_polyphen_damaging:
            MATRIX = filter_sift_polyphen_damaging(MATRIX)
            commands.append("Keep only if predicted to be damaging by "
                            "SIFT or Polyphen2.")
        if min_coverage_in_every_sample is not None:
            MATRIX = filter_min_coverage_in_every_sample(
                MATRIX, min_coverage_in_every_sample)
            commands.append("Keep only variants with coverage >= %d "
                            "in every sample." % min_coverage_in_every_sample)
        if min_callers_in_every_sample is not None:
            MATRIX = filter_min_callers_in_every_sample(
                MATRIX, min_callers_in_every_sample)
            commands.append("Keep only variants called with >= %d callers "
                            "in every sample." % min_callers_in_every_sample)
        if min_callers_in_any_sample is not None:
            MATRIX = filter_min_callers_in_any_sample(
                MATRIX, min_callers_in_any_sample)
            commands.append("Keep only variants called with >= %d callers "
                            "in at least one sample." %
                            min_callers_in_any_sample)
        if min_gene_expression_in_every_sample is not None:
            # Actually, just look into the file instead.
            #assert in_attrs["with_gxp"] == "yes"
            MATRIX = filter_min_gene_expression_in_every_sample(
                MATRIX, min_gene_expression_in_every_sample)
            commands.append("Keep only variants with gene expression >= %g "
                            "in every sample." %
                            min_gene_expression_in_every_sample)
        metadata["commands"] = commands

        SimpleVariantMatrix.write_from_am(out_filename, MATRIX)

        return metadata
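Each filter_* helper used above is defined elsewhere in the same module and follows the same shape: walk the variants, test one condition per row, and keep the rows that pass. A minimal sketch of filter_min_callers_in_any_sample, assuming the matrix has been reduced to per-sample caller counts per variant (this row-of-dicts layout is hypothetical; the real SimpleVariantMatrix is column-oriented):

def filter_min_callers_in_any_sample(rows, min_callers):
    # rows -- list of dicts mapping sample name -> number of callers
    #         that called the variant in that sample.
    # Keep a variant if at least one sample reaches the threshold; the
    # "every sample" version of this filter would use all() instead.
    return [row for row in rows
            if any(n >= min_callers for n in row.values())]

The any()/all() distinction is also why the module asserts that min_callers_in_every_sample and min_callers_in_any_sample are not used together: they are two strictness levels of the same test.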
Code Example #29
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from genomicode import config
        from Betsy import module_utils

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample")
        genome_size = module_utils.get_user_option(user_options,
                                                   "macs_genome",
                                                   not_empty=True)
        shiftsize = module_utils.get_user_option(user_options,
                                                 "macs_shiftsize")
        if shiftsize:
            shiftsize = int(shiftsize)

        # Set the name.
        name = hashlib.hash_var(treat_sample)
        if control_sample:
            x = hashlib.hash_var(control_sample)
            # Use the hashed treatment name on both sides, so the
            # experiment name is consistently filesystem-safe.
            name = "%s_vs_%s" % (name, x)

        # Make sure the samples exist.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        control_filename = None
        if control_sample:
            control_filename = find_bam_file(bam_path, control_sample,
                                             sample_groups)
            assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_macs14_command(treat_filename,
                                  control_filename,
                                  name=name,
                                  genome_size=genome_size,
                                  shiftsize=shiftsize,
                                  save_bedgraph_file=True)
        parallel.sshell(cmd, path=out_path)

        # Run Rscript on the model, if one was generated.
        model_file = os.path.join(out_path, "%s_model.r" % name)
        if os.path.exists(model_file):
            Rscript = filelib.which_assert(config.Rscript)
            cmd = [parallel.quote(Rscript), model_file]
            parallel.sshell(cmd, path=out_path)

        files = [
            "%s_peaks.xls" % name,
            "%s_summits.bed" % name,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
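make_macs14_command is defined elsewhere in this module; the sketch below shows a plausible construction, assuming the documented macs14 (MACS 1.4) flags: -t/-c for the treatment and control BAMs, -n for the run name, -g for genome size, --nomodel/--shiftsize for a fixed shift, and -B to save the bedGraph pileup. Treat it as an approximation, not the real helper:

def make_macs14_command(treat_filename, control_filename=None, name=None,
                        genome_size=None, shiftsize=None,
                        save_bedgraph_file=False):
    cmd = ["macs14", "-t", treat_filename]
    if control_filename:
        cmd.extend(["-c", control_filename])
    if name:
        cmd.extend(["-n", name])
    if genome_size:
        cmd.extend(["-g", genome_size])
    if shiftsize:
        # --shiftsize only takes effect when model building is disabled.
        cmd.extend(["--nomodel", "--shiftsize", str(shiftsize)])
    if save_bedgraph_file:
        cmd.append("-B")
    return " ".join(cmd)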
Code Example #30
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        import math
        from genomicode import filelib
        from genomicode import jmath
        from genomicode import AnnotationMatrix
        from genomicode import SimpleVariantMatrix
        from Betsy import module_utils as mlib

        svm_node = in_data
        filelib.assert_exists_nz(svm_node.identifier)

        linked_file = mlib.get_user_option(user_options,
                                           "linked_variants_file",
                                           not_empty=True,
                                           check_file=True)

        # Read the variant file.
        SVM = SimpleVariantMatrix.read_as_am(svm_node.identifier)
        CHROM = SVM["______Chrom"]
        POS = SVM["______Pos"]
        POS = [int(x) for x in POS]
        all_coords = {}  # (chrom, pos) -> 1
        for x in zip(CHROM, POS):
            all_coords[x] = 1

        # Read the linked variant file.
        # Chrom  Pos  Perc Linked  p
        coord2info = {}  # (chrom, pos) -> d
        for d in filelib.read_row(linked_file, header=1):
            pos = int(d.Pos)
            if (d.Chrom, pos) not in all_coords:
                continue
            coord2info[(d.Chrom, pos)] = d

        # Align the linked annotations to the matrix.
        MAX_SCORE = 1000
        min_p = 10**-(MAX_SCORE / 10)
        linked_headers = ["Perc Linked", "Score"]
        annotations = []
        for (chrom, pos) in zip(CHROM, POS):
            if (chrom, pos) not in coord2info:
                x = [""] * len(linked_headers)
                annotations.append(x)
                continue
            d = coord2info[(chrom, pos)]
            score = MAX_SCORE
            if float(d.p) >= min_p:
                score = -10 * math.log(float(d.p), 10)
            x = d.Perc_Linked, score
            assert len(x) == len(linked_headers)
            annotations.append(x)
        # Convert the headers and annotations to SVM format.
        linked_headers = ["Linkage______%s" % x for x in linked_headers]
        linked_annotations = jmath.transpose(annotations)

        # Make the new SimpleVariantMatrix.
        # Figure out where to put these annotations.
        INDEX = 4
        ## If Annovar exists, put after.
        #I = [i for (i, x) in enumerate(SVM.headers)
        #     if x.upper().startswith("ANNOVAR")]
        #if I:
        #    INDEX = max(INDEX, max(I)+1)
        headers = SVM.headers[:INDEX] + linked_headers + SVM.headers[INDEX:]
        annots = [SVM.header2annots[h] for h in SVM.headers_h]
        all_annots = annots[:INDEX] + linked_annotations + annots[INDEX:]
        merged = AnnotationMatrix.create_from_annotations(
            headers, all_annots, headerlines=SVM.headerlines)

        SimpleVariantMatrix.write_from_am(outfile, merged)
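The Score column computed above is a Phred-style transform of the linkage p-value: -10 * log10(p), capped at MAX_SCORE = 1000, so any p below 1e-100 maps to 1000. A quick standalone check of the arithmetic (linkage_score is a hypothetical name for illustration):

import math

MAX_SCORE = 1000
min_p = 10 ** -(MAX_SCORE / 10.0)  # 1e-100

def linkage_score(p):
    # Phred-style score, clamped at MAX_SCORE for vanishingly small p.
    if p < min_p:
        return MAX_SCORE
    return -10 * math.log(p, 10)

assert abs(linkage_score(0.05) - 13.01) < 0.01  # modest linkage evidence
assert linkage_score(1e-200) == MAX_SCORE       # capped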