def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Draw a line plot of the "biotin" gene with the lineplot script.

        antecedents is a single data node; its identifier is the input
        file handed to lineplot, and the plot is written to outfile.

        Returns a metadata dict whose "commands" entry lists the shell
        command that was executed.
        """
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        quote = parallel.quote
        lineplot_script = mlib.get_config("lineplot", which_assert_file=True)

        # Non-string arguments (the margin) are stringified when joining.
        args = [quote(lineplot_script)]
        args += ["--gene_names", "biotin"]
        args += ["--mar_bottom", 1.50]
        args += ["--yaxis_starts_at_0"]
        args += [quote(antecedents.identifier), quote(outfile)]
        command = " ".join(str(arg) for arg in args)

        parallel.sshell(command)
        result = {"commands": [command]}
        # The plot must have been produced and be non-empty.
        filelib.assert_exists_nz(outfile)
        return result
Beispiel #2
0
def check_log_file(filename):
    """Check a Picard log file and fail if it records a Java exception.

    filename is the path to the log file.  It must exist and be
    non-empty (checked with filelib.assert_exists_nz).

    Raises AssertionError if any line starts with
    "Exception in thread "; the message contains the log text from that
    line to the end of the file.  Returns None otherwise.
    """
    from genomicode import filelib
    # Log file format:
    # [Sat Dec 31 19:29:27 CST 2016] picard.sam.AddOrReplaceReadGroups INPUT=
    # [Sat Dec 31 19:29:27 CST 2016] Executing as <user>@<host>
    # INFO    2016-12-31 19:29:27     AddOrReplaceReadGroups  Created read gr
    # INFO    2016-12-31 19:29:42     AddOrReplaceReadGroups  Processed     1
    # INFO    2016-12-31 19:29:58     AddOrReplaceReadGroups  Processed     2
    # [...]
    # [Sat Dec 31 19:48:14 CST 2016] picard.sam.AddOrReplaceReadGroups done.
    # Runtime.totalMemory()=1609564160
    #
    # Sometimes these lines are interspersed.  Probably OK not to flag.
    # Ignoring SAM validation error: ERROR: Read name HWI-ST1120:331:C6VW5ACX
    #
    # ERROR: Sometimes see exceptions.
    # Exception in thread "main" java.lang.RuntimeException: BGZF file has in
    #         at htsjdk.samtools.util.BlockGunzipper.unzipBlock(BlockGunzippe
    # [...]

    # The log file should not be empty.
    filelib.assert_exists_nz(filename)

    # Close the file promptly instead of leaking the handle.
    with open(filename) as handle:
        lines = handle.readlines()

    # Make sure there's no exception.  Report everything from the
    # first exception line onwards (this includes the stack trace).
    for i, line in enumerate(lines):
        if line.startswith("Exception in thread "):
            x = "".join(lines[i:]).strip()
            raise AssertionError("Exception in Picard output:\n%s" % x)
Beispiel #3
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Infer read pairing and strandedness of an alignment with RSeQC.

        antecedents is a (bam_node, gene_node) pair: bam_node.identifier
        is searched for BAM files and gene_node.identifier is a GTF
        file.  The GTF is converted to a BED file (written with a
        relative name, "<gtf root>.bed"), and the first BAM file is
        analyzed by get_paired_stranded_rseqc.  The result is saved to
        outfile with mlib.write_stranded.

        Returns an empty metadata dict.
        """
        from genomicode import filelib
        from genomicode import alignlib
        from Betsy import module_utils as mlib

        bam_node, gene_node = antecedents
        bam_filenames = mlib.find_bam_files(bam_node.identifier)
        gtf_file = gene_node.identifier
        filelib.assert_exists_nz(gtf_file)
        assert bam_filenames, "No bam files found."

        # Name the BED file after the root of the GTF file name.
        _, gtf_root, _ = mlib.splitpath(gtf_file)
        bed_file = "%s.bed" % gtf_root
        alignlib.gtf_to_bed(gtf_file, bed_file)

        # Only the first BAM file is used to figure out the orientation.
        (single_or_paired, stranded, frac_failed, frac_first,
         frac_second) = get_paired_stranded_rseqc(bed_file, bam_filenames[0])

        stranded_info = mlib.Stranded(
            single_or_paired, stranded, frac_failed, frac_first, frac_second)
        mlib.write_stranded(stranded_info, outfile)
        return {}
Beispiel #4
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Collect the antecedent result files into one output directory.

        antecedents must contain exactly one node per entry of the
        table below, in the same order.  Each node's file must exist,
        be non-empty and have the listed data type; it is then copied
        into out_path under the listed file name.

        Returns an empty metadata dict.
        """
        import os
        import shutil
        from genomicode import filelib

        # (expected data type, name of the file inside out_path)
        expected = [
            ("SignalFile", "gene_expression.nonorm.txt"),
            ("SignalFile", "gene_expression.normalized.txt"),
            ("SignalDistributionBoxplot", "signal_distribution.png"),
            ("ActbPlot", "ACTB.nonorm.png"),  # beta-actin expression
            ("ActbPlot", "ACTB.normalized.png"),  # beta-actin expression
            ("PCAPlot", "pca.nonorm.png"),  # No normalization
            ("PCAPlot", "pca.normalized.png"),  # Normalized
            ("Heatmap", "heatmap.nonorm.png"),
            ("Heatmap", "heatmap.normalized.png"),
        ]
        assert len(antecedents) == len(expected)

        # Validate every input before anything is written.
        for index in range(len(expected)):
            node = antecedents[index]
            wanted_type = expected[index][0]
            filelib.assert_exists_nz(node.identifier)
            assert node.data.datatype.name == wanted_type, \
                "Mismatch: %s %s" % (node.data.datatype.name, wanted_type)

        if not os.path.exists(out_path):
            os.mkdir(out_path)

        # Copy the files over.
        for index in range(len(expected)):
            destination = os.path.join(out_path, expected[index][1])
            shutil.copy2(antecedents[index].identifier, destination)

        return {}
Beispiel #5
0
def relabel(data_file, rename_file, outfile, user_options):
    """Relabel the columns of data_file by running slice_matrix.

    The "sample_labels_header" user option names a column of
    rename_file; it must appear in the tab-delimited first line of
    that file.  slice_matrix is run with --relabel_col_ids
    '<rename_file>,<header>' and its output (stdout and stderr, via
    ">&") is redirected to outfile.

    Returns the shell command that was run.  Raises AssertionError if
    the header is missing from rename_file.
    """
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sample_header = mlib.get_user_option(
        user_options, "sample_labels_header", not_empty=True)
    # Make sure sample_header in rename file.  Only the header line is
    # needed; close the file right away instead of leaking the handle.
    with open(rename_file) as handle:
        first_line = handle.readline()
    headers = first_line.rstrip("\r\n").split("\t")
    assert sample_header in headers, "Missing header (%s): %s" % (
        sample_header, rename_file)

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    x = "'%s,%s'" % (rename_file, sample_header)
    cmd = [
        "python",
        sq(slice_matrix),
        '--relabel_col_ids', x,
        sq(data_file),
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    return cmd
Beispiel #6
0
def get_paired_stranded_rseqc(reference_bed, bam_filename):
    """Run RSeQC's infer_experiment.py on a BAM file and parse its output.

    reference_bed and bam_filename must both exist and be non-empty.
    Returns whatever parse_rseqc_infer_experiment produces from the
    script's output (callers in this file unpack it as
    single_or_paired, stranded, frac_failed, frac_first, frac_second).
    """
    from genomicode import alignlib
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    infer_script = alignlib.find_rseqc_script("infer_experiment.py")
    for required in (reference_bed, bam_filename):
        filelib.assert_exists_nz(required)

    # RSeQC scripts use #!/usr/bin/python, which may not be the right
    # one, so run the script with the python found on the path.
    pieces = ("python", mlib.sq(infer_script))
    pieces += ("-r", mlib.sq(reference_bed))
    pieces += ("-i", mlib.sq(bam_filename))
    output = parallel.sshell(" ".join(pieces))
    return parse_rseqc_infer_experiment(output)
Beispiel #7
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Log2-transform a signal matrix and write it as tab-delimited text.

        in_data.identifier is the input matrix file; it must exist, be
        non-empty, and must not already look logged according to
        binreg.is_logged_array_data.  Missing values (None) are left
        alone.  Values below 1 are raised to 1 before taking the log,
        so they become 0.  The result is written to outfile.
        """
        import math
        import arrayio
        from genomicode import filelib
        from genomicode import binreg

        signal_file = in_data.identifier
        filelib.assert_exists_nz(signal_file)

        M = arrayio.read(signal_file)
        assert not binreg.is_logged_array_data(M), 'the file is logged'
        # Change the matrix in place.
        for row in M._X:
            for j, x in enumerate(row):
                if x is None:
                    continue
                x = float(x)
                if x < 1:
                    x = 1
                row[j] = math.log(x, 2)

        M_c = arrayio.convert(M, to_format=arrayio.tab_delimited_format)

        # Close the handle so the output is flushed to disk; the
        # original never closed it.
        with open(outfile, 'w') as handle:
            arrayio.tab_delimited_format.write(M_c, handle)
Beispiel #8
0
def read_stranded(filename):
    """Load a Stranded object from a JSON file.

    filename must exist and be non-empty.  The file holds a JSON
    object whose members are passed as keyword arguments to Stranded.
    """
    import json
    from genomicode import filelib
    filelib.assert_exists_nz(filename)
    # Close the file promptly instead of leaking the handle.
    with open(filename) as handle:
        x = json.load(handle)
    return Stranded(**x)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Download the GEO series matrix file for a GSE (and optional GPL).

        user_options must contain "GSEID" (starting with "GSE") and may
        contain "GPLID" (starting with "GPL" when given).  The file is
        written to outfile by geolib.download_seriesmatrix_file.

        Returns an empty metadata dict.  Raises AssertionError on
        malformed IDs or when outfile is missing or empty afterwards.
        """
        # Given a GEOID and GPLID, get the series matrix file.
        from genomicode import geolib
        from genomicode import filelib

        metadata = {}

        GSEID = user_options['GSEID']
        GPLID = user_options.get("GPLID")
        assert GSEID.startswith('GSE'), 'GSEID %s is not correct' % GSEID
        assert not GPLID or GPLID.startswith('GPL'), \
               'GPLID %s is not correct' % GPLID
        # Don't need to save user_options (GSEID, GPLID) in the metadata.

        # Use a context manager so the handle is closed even if the
        # download raises.
        with open(outfile, 'w') as outhandle:
            geolib.download_seriesmatrix_file(outhandle, GSEID, GPLID)
        filelib.assert_exists_nz(outfile)
        return metadata
Beispiel #10
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Create the plot with plot_hyb_bar and verify it was written.

        antecedents is a single data node; its identifier is the input
        file and outfile receives the plot.
        """
        from genomicode import filelib

        source_file = antecedents.identifier
        plot_hyb_bar(source_file, outfile)
        # The plot must exist and be non-empty.
        filelib.assert_exists_nz(outfile)
Beispiel #11
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Run slice_matrix --cpm on a signal file.

        in_data.identifier is the input file and must exist.  The
        output of slice_matrix (stdout and stderr, via ">&") is
        redirected to outfile, which must be non-empty afterwards.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import config

        signal_node = in_data
        signal_file = signal_node.identifier
        assert os.path.exists(signal_file)

        slice_matrix = filelib.which_assert(config.slice_matrix)

        sq = parallel.quote
        cmd = [
            sq(slice_matrix),
            "--cpm",
            # Quote the input path like every other path put on a
            # command line in this file; an unquoted path containing
            # spaces or shell metacharacters would break the command.
            sq(signal_file),
            ]
        cmd = " ".join(cmd)
        cmd = "%s >& %s" % (cmd, outfile)

        parallel.sshell(cmd)
        filelib.assert_exists_nz(outfile)
Beispiel #12
0
def read_orientation(filename):
    """Load an Orientation object from a JSON file.

    filename must exist and be non-empty.  The file holds a JSON
    object whose members are passed as keyword arguments to
    Orientation.
    """
    import json
    from genomicode import filelib
    filelib.assert_exists_nz(filename)
    # Close the file promptly instead of leaking the handle.
    with open(filename) as handle:
        x = json.load(handle)
    return Orientation(**x)
Beispiel #13
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Write the signal extracted from in_data.identifier to outfile.

        The extraction itself is done by extract_signal, which receives
        the input path and an open output handle.  outfile must be
        non-empty afterwards.
        """
        from genomicode import filelib

        # Use a context manager so the handle is closed even if
        # extract_signal raises.
        with open(outfile, 'w') as outhandle:
            extract_signal(in_data.identifier, outhandle)
        filelib.assert_exists_nz(outfile)
Beispiel #14
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Summarize a folder of FASTQC results as a spreadsheet.

        in_data.identifier is a directory that is searched for files
        ending in "summary.txt"; the directory holding each one is
        parsed with read_fastqc_results.  A table with one row per
        sample is written to "fastqc_summary.txt" (relative path) and
        converted to outfile with the configured txt2xls program.

        Raises AssertionError if no results are found, if a sample
        occurs twice, or if outfile is missing or empty afterwards.
        """
        import os
        from genomicode import filelib
        from genomicode import sortlib
        from Betsy import module_utils as mlib

        # Should be a folder of fastqc results.
        fastqc_path = in_data.identifier

        # Find all the FASTQC results.
        summary_files = filelib.list_files_in_path(
            fastqc_path, endswith="summary.txt")
        paths = [os.path.split(f)[0] for f in summary_files]
        assert paths, "No FASTQC files found."

        # Read the results.
        all_results = [read_fastqc_results(path) for path in paths]
        assert all_results

        # Make table where the rows are the samples and the columns
        # are the statistics.
        sample2results = {}
        for result in all_results:
            assert result.sample not in sample2results
            sample2results[result.sample] = result
        # The column order is taken from the first result.
        all_statistics = all_results[0].statistics_order
        all_samples = sortlib.sort_natural(sample2results)

        header = [
            "Sample", "Total Sequences", "Filtered Sequences",
            "Sequence length", "GC"
        ] + all_statistics
        table = [header]
        for sample in all_samples:
            results = sample2results[sample]
            row = [
                sample,
                results.total_sequences, results.filtered_sequences,
                results.sequence_length, results.percent_gc
            ]
            row += [results.statistics[name] for name in all_statistics]
            assert len(row) == len(header)
            table.append(row)

        # Write out the table as text file.  handle.write works on both
        # Python 2 and 3 (the original used "print >> handle"), and the
        # context manager closes the file even on error.
        TXT_FILE = "fastqc_summary.txt"
        with open(TXT_FILE, 'w') as handle:
            for row in table:
                handle.write("\t".join(map(str, row)) + "\n")

        x = mlib.get_config("txt2xls", which_assert_file=True, quote=True)
        # Quote the output path so spaces or shell metacharacters in it
        # do not break the redirection.
        os.system("%s -b %s > %s" % (x, TXT_FILE, mlib.sq(outfile)))
        filelib.assert_exists_nz(outfile)
Beispiel #15
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        """Draw a PCA plot of a signal file with the pcaplot script.

        antecedents is a single data node; its identifier is the input
        matrix.  pcaplot is run with --label and is also asked to write
        "prism.txt", "row_components.txt" and "col_components.txt"
        (relative paths).  The plot is written to outfile.

        Returns a metadata dict whose "commands" entry lists the shell
        command that was executed.
        """
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        # Earlier, commented-out versions of this method colored the
        # samples from a class label file (plot_pca with matplotlib
        # colormaps) and passed a "pca_num_genes" user option to the
        # script as "-g"; neither is active.

        pcaplot_script = mlib.get_config("pcaplot", which_assert_file=True)
        quote = parallel.quote

        args = [quote(pcaplot_script), "--label"]
        args += ["--prism_file", "prism.txt"]
        args += ["--row_pc_file", "row_components.txt"]
        args += ["--col_pc_file", "col_components.txt"]
        args += [quote(antecedents.identifier), quote(outfile)]
        command = " ".join(str(arg) for arg in args)

        parallel.sshell(command)
        result = {"commands": [command]}
        filelib.assert_exists_nz(outfile)
        return result
Beispiel #16
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Reorder BAM files against a reference genome with Picard.

        antecedents is a (bam_node, ref_node) pair.  Every BAM file
        found under bam_node.identifier is processed with Picard
        ReorderSam and written to out_path under the same file name.
        The commands are run in parallel on up to num_cores processes.

        Raises AssertionError if an output file is missing or empty.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        bam_node, ref_node = antecedents

        in_filenames = module_utils.find_bam_files(bam_node.identifier)
        ref = alignlib.create_reference_genome(ref_node.identifier)
        filelib.safe_mkdir(out_path)

        # java -Xmx5g -jar /usr/local/bin/picard/picard.jar ReorderSam \
        #   I=<input.bam> O=<output.bam> REFERENCE=ucsc.hg19.fasta
        picard_jar = alignlib.find_picard_jar("picard")

        # list of (in_filename, out_filename)
        jobs = [
            (in_filename,
             os.path.join(out_path, os.path.split(in_filename)[1]))
            for in_filename in in_filenames]

        # Make a list of commands.
        sq = parallel.quote
        commands = []
        for in_filename, out_filename in jobs:
            x = [
                "java",
                "-Xmx5g",
                "-jar",
                sq(picard_jar),
                "ReorderSam",
                "I=%s" % sq(in_filename),
                "O=%s" % sq(out_filename),
                # Quote the reference path like the I= and O= paths so
                # a path with spaces does not split the argument.
                "REFERENCE=%s" % sq(ref.fasta_file_full),
            ]
            commands.append(" ".join(x))

        parallel.pshell(commands, max_procs=num_cores)

        # Make sure the analysis completed successfully.
        for in_filename, out_filename in jobs:
            filelib.assert_exists_nz(out_filename)
Beispiel #17
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Given a GEO series ID, download its family SOFT file.

        user_options must contain "GSEID" (starting with "GSE").  The
        download is done by download_series_family, which is given the
        ID, the value 300 and an open handle on outfile.

        Returns a metadata dict recording the "GSEID".
        """
        from genomicode import filelib

        metadata = {}
        GSEID = user_options['GSEID']
        assert GSEID.startswith('GSE'), 'GSEID %s is not correct' % GSEID
        metadata["GSEID"] = GSEID

        # Close the handle before checking the file so that buffered
        # data is flushed; the original never closed it.
        with open(outfile, 'w') as outhandle:
            download_series_family(GSEID, 300, outhandle)
        filelib.assert_exists_nz(outfile)
        return metadata
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Plot a heatmap of in_data.identifier into outfile.

        The work is delegated to plot_heatmap, which is called with an
        empty dict as its third argument and with user_options.

        Returns a metadata dict whose "command" entry is the value
        returned by plot_heatmap.
        """
        from genomicode import filelib

        # An earlier, commented-out implementation sized the heatmap
        # here (graphlib.find_tall_heatmap_size /
        # find_wide_heatmap_size, driven by the 'hm_width' and
        # 'hm_height' user options) and ran the plotting command
        # through subprocess.Popen, raising ValueError on any stderr
        # output.  None of that is active.
        command = plot_heatmap(in_data.identifier, outfile, {}, user_options)
        result = {"command": command}

        filelib.assert_exists_nz(outfile)
        return result
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Plot a heatmap from clustering results.

        The cluster files are located under in_data.identifier with
        mlib.find_cluster_files; a "cdt" entry is required.  The cdt
        file is plotted into outfile by plot_heatmap, which also
        receives the full cluster file mapping and user_options.

        Returns a metadata dict whose "command" entry is the value
        returned by plot_heatmap.
        """
        from genomicode import filelib
        from Betsy import module_utils as mlib
        from plot_signal_heatmap import plot_heatmap

        cluster_files = mlib.find_cluster_files(in_data.identifier)
        assert "cdt" in cluster_files
        cdt_file = cluster_files["cdt"]

        command = plot_heatmap(cdt_file, outfile, cluster_files, user_options)
        result = {"command": command}
        filelib.assert_exists_nz(outfile)
        return result
Beispiel #20
0
def main():
    """Command-line entry point: run the spp R script on two BAM files.

    Positional arguments are the treatment BAM, the control BAM and an
    output directory (created if missing).  -j sets the number of
    processes (1-99, default 1) and --fdr_cutoff the FDR cutoff
    (strictly between 0 and 1, default 0.05).  The script located by
    find_sppscript is piped into "R --vanilla" with the quoted BAM
    paths, the cutoff and the process count appended, and is run
    inside the output directory.  The command's output is printed.
    """
    import os
    import argparse
    from genomicode import filelib
    from genomicode import parallel

    p = filelib.tswrite
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("treatment_bam", help="BAM file of treated sample.")
    parser.add_argument("control_bam", help="BAM file of background sample.")
    parser.add_argument("outpath", help="Directory to store the results.")

    parser.add_argument("-j",
                        dest="num_procs",
                        type=int,
                        default=1,
                        help="Number of jobs to run in parallel.")
    parser.add_argument("--fdr_cutoff", default=0.05, type=float, help="")

    args = parser.parse_args()
    filelib.assert_exists_nz(args.treatment_bam)
    filelib.assert_exists_nz(args.control_bam)
    # The command runs inside outpath, so the BAM paths must not be
    # relative to the current directory.
    args.treatment_bam = os.path.realpath(args.treatment_bam)
    args.control_bam = os.path.realpath(args.control_bam)

    # The accepted range is 1..99; the message used to claim 100 was
    # allowed even though the check rejects it.
    assert args.num_procs >= 1 and args.num_procs < 100, \
           "Please specify between 1 and 99 processes."
    assert args.fdr_cutoff > 0.0 and args.fdr_cutoff < 1.0

    # Set up directories to run it on.
    p("Setting up directories.\n")
    if not os.path.exists(args.outpath):
        os.mkdir(args.outpath)

    # Run SPP.
    p("Running spp in %s.\n" % args.outpath)
    sq = parallel.quote
    sppscript = find_sppscript()
    x = sq(args.treatment_bam), sq(args.control_bam), args.fdr_cutoff, \
        args.num_procs
    x = " ".join(map(str, x))
    # Quote the script path as well so that a path with spaces works.
    cmd = "cat %s | R --vanilla %s" % (sq(sppscript), x)
    x = parallel.sshell(cmd, path=args.outpath)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(x)

    p("Done.\n")
Beispiel #21
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Collapse duplicate genes in a signal matrix with slice_matrix.

        antecedents is a single data node whose identifier is the input
        matrix.  Rows are de-duplicated on the gene symbol header, or
        on the gene ID header when no symbol header is found.  Only the
        "high_var" value of out_attributes['unique_genes'] is
        implemented (slice_matrix --dedup_row_by_var).  The output of
        slice_matrix (stdout and stderr, via ">&") goes to outfile.

        Returns a metadata dict with the header used ("dedup_header").
        Raises NotImplementedError for "average_genes" and
        "first_gene", and AssertionError for unknown algorithms or when
        no suitable header exists.
        """
        import arrayio
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import arrayplatformlib as apl
        from Betsy import module_utils as mlib

        in_data = antecedents
        metadata = {}

        M = arrayio.read(in_data.identifier)
        cat2header = apl.categorize_headers(M)
        header = cat2header.get(apl.GENE_SYMBOL)
        if header is None:
            header = cat2header.get(apl.GENE_ID)
        assert header is not None, "I could not find gene IDs or symbols: %s" \
               % in_data.identifier
        metadata["dedup_header"] = header

        slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)

        sq = parallel.quote
        algorithm = out_attributes['unique_genes']
        if algorithm == "high_var":
            dedup_cmd = ["--dedup_row_by_var", sq(header)]
        elif algorithm in ("average_genes", "first_gene"):
            raise NotImplementedError
        else:
            # Call form of raise is valid on both Python 2 and 3.
            raise AssertionError("Unknown algorithm: %s" % algorithm)

        cmd = [sq(slice_matrix)] + dedup_cmd + [sq(in_data.identifier)]
        cmd = " ".join(cmd)
        cmd = "%s >& %s" % (cmd, outfile)
        parallel.sshell(cmd)

        filelib.assert_exists_nz(outfile)

        return metadata
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Gather RNA-seq results into out_path as symbolic links.

        antecedents must unpack into the eleven nodes listed below, in
        that order.  Each node's identifier is checked (non-empty file
        or existing directory, as flagged in FILES) and then linked
        into out_path under a fixed name.

        Raises AssertionError if an input is missing or of the wrong
        kind.
        """
        import os
        from genomicode import filelib

        (
            bam_node,
            fastqc_summary1_node,
            fastqc_folder1_node,
            fastqc_summary2_node,
            fastqc_folder2_node,
            rseqc_node,
            signal1_node,  # TPM
            signal2_node,  # TPM, isoform
            aligned_reads_node,
            signal3_node,  # count
            htseq_reads_node) = antecedents
        filelib.safe_mkdir(out_path)

        # (source path, is a regular file?, name inside out_path)
        FILES = [
            (bam_node.identifier, False, "alignment.bam"),
            (fastqc_summary1_node.identifier, True, "fastqc.no_trim.xls"),
            (fastqc_folder1_node.identifier, False, "fastqc.no_trim"),
            (fastqc_summary2_node.identifier, True, "fastqc.trim.xls"),
            (fastqc_folder2_node.identifier, False, "fastqc.trim"),
            (rseqc_node.identifier, False, "RSeQC"),
            (signal1_node.identifier, True, "expression.gene.tpm"),
            (signal2_node.identifier, True, "expression.isoform.tpm"),
            (aligned_reads_node.identifier, True, "aligned.xls"),
            (signal3_node.identifier, True, "expression.counts"),
            (htseq_reads_node.identifier, True, "mapped.htseq.txt"),
        ]

        for orig_filename, is_file, new_file in FILES:
            new_filename = os.path.join(out_path, new_file)

            # Link the data into the right place.
            if is_file:
                filelib.assert_exists_nz(orig_filename)
            else:
                assert filelib.dir_exists(orig_filename), \
                       "Directory not found or not directory: %s" % \
                       orig_filename
            # A relative symlink target is resolved relative to the
            # directory holding the link (out_path), not the current
            # directory, so link to the absolute path to avoid creating
            # a dangling link.
            os.symlink(os.path.abspath(orig_filename), new_filename)
def make_gsea_command(expression_file, class_label_file, gsea_path, name1,
                      name2, indexes1, indexes2, permutation_type, database):
    """Build the shell command that runs the configured gsea program.

    expression_file and class_label_file must exist and be non-empty
    (class_label_file is only checked, it is not put on the command
    line).  name1/name2 are the class names and indexes1/indexes2 the
    sample indexes of each class; indexes should be 1-based, not
    including headers.  permutation_type and database must be members
    of GSEAAnalysis.GSEA_PERMUTATION and GSEAAnalysis.GSEA_DATABASE.
    gsea_path is passed as the last argument of the command.

    Returns the command as a single string; nothing is executed.
    """
    from genomicode import parallel
    from genomicode import filelib
    from genomicode import parselib
    from Betsy import module_utils as mlib
    from Betsy.rules import GSEAAnalysis

    filelib.assert_exists_nz(expression_file)
    filelib.assert_exists_nz(class_label_file)
    assert permutation_type in GSEAAnalysis.GSEA_PERMUTATION
    assert database in GSEAAnalysis.GSEA_DATABASE

    # Each index i is expressed as the range (i, i + 1) for
    # parselib.unparse_ranges.
    ranges1 = [(i, i + 1) for i in indexes1]
    ranges2 = [(i, i + 1) for i in indexes2]
    indexes1_str = parselib.unparse_ranges(ranges1)
    indexes2_str = parselib.unparse_ranges(ranges2)

    gsea = mlib.get_config("gsea", which_assert_file=True)

    sq = parallel.quote
    cmd = [
        sq(gsea),
        # Class names are free text and may contain spaces or shell
        # metacharacters, so quote them like the other arguments.
        "--name1",
        sq(name1),
        "--name2",
        sq(name2),
        "--indexes1",
        indexes1_str,
        "--indexes2",
        indexes2_str,
        "--permutation_type",
        sq(permutation_type),
        "--database",
        sq(database),
        "--min_match_score",
        0.80,
        "--clobber",
        sq(expression_file),
        sq(gsea_path),
    ]
    cmd = " ".join(map(str, cmd))
    return cmd
Beispiel #24
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Normalize the genes (rows) of a PCL matrix.

        out_attributes["gene_normalize"] selects the method:
        "variance" normalizes in Python with jmath.safe_norm_mv and
        writes a PCL file; "sum_of_squares" runs the configured
        cluster program with "-ng", which writes "<outfile>.nrm" that
        is then renamed to outfile.

        Returns a metadata dict; for "sum_of_squares" it holds the
        shell command under "command".
        """
        import os
        import arrayio
        from genomicode import jmath
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        metadata = {}

        norm_para = ["variance", "sum_of_squares"]
        assert "gene_normalize" in out_attributes
        normalize = out_attributes["gene_normalize"]
        assert normalize in norm_para, \
               "Invalid normalize option: %s" % normalize

        if normalize == "variance":
            M = arrayio.read(in_data.identifier, format=arrayio.pcl_format)
            M_n = jmath.safe_norm_mv(M.slice())
            M._X = M_n
            M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
            # open() replaces the Python-2-only file() builtin, and the
            # context manager closes the handle even on error.
            with open(outfile, 'w') as f:
                arrayio.pcl_format.write(M_c, f)
        elif normalize == "sum_of_squares":
            cluster = mlib.get_config("cluster", which_assert_file=True)
            sq = parallel.quote
            cmd = [
                sq(cluster),
                "-f",
                sq(in_data.identifier),
                "-ng",
                "-u",
                sq(outfile),
            ]
            # sshell is given a command string everywhere else in this
            # file; the list used to be passed without being joined.
            cmd = " ".join(cmd)
            parallel.sshell(cmd)
            metadata["command"] = cmd
            outputfile = outfile + '.nrm'
            filelib.assert_exists_nz(outputfile)
            os.rename(outputfile, outfile)

        filelib.assert_exists_nz(outfile)
        return metadata
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Remove indels from a VCF file with vcftools.

        in_data.identifier is the input VCF and must be non-empty.
        vcftools writes "<sample>.filtered.recode.vcf" and its console
        output is captured in "<sample>.log", where <sample> is the
        input file name without directory and extension; both are
        created in the current directory.  The recoded VCF is then
        copied to out_filename.
        """
        import os
        import shutil
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import config

        in_filename = in_data.identifier
        filelib.assert_exists_nz(in_filename)

        vcftools = filelib.which_assert(config.vcftools)

        # vcftools --vcf test31.txt --remove-indels --recode --recode-INFO-all
        #   --out test32
        # Writes stuff to console.  Should capture in log file.
        # Saves file test32.recode.vcf

        # Derive the sample name from the base name only.  Splitting
        # the extension off the full path (as was done before) kept the
        # input directory in the stem, so the intermediate and log
        # files were written next to the input file.
        p, f = os.path.split(in_filename)
        sample, ext = os.path.splitext(f)

        out_stem = "%s.filtered" % sample
        log_filename = "%s.log" % sample
        # Should create file <out_stem>.recode.vcf
        outfile = "%s.recode.vcf" % out_stem

        sq = parallel.quote
        cmd = [
            sq(vcftools),
            "--vcf",
            sq(in_filename),
            "--remove-indels",
            "--recode",
            "--recode-INFO-all",
            "--out",
            sq(out_stem),
        ]
        cmd = " ".join(cmd)
        cmd = "%s >& %s" % (cmd, sq(log_filename))
        parallel.sshell(cmd)

        filelib.assert_exists_nz(outfile)
        shutil.copy2(outfile, out_filename)
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """De-duplicate the rows of a matrix on every annotation header.

        in_data.identifier is the input matrix.  If no row annotation
        column contains repeated values the file is copied to outfile
        unchanged.  Otherwise slice_matrix --dedup_row_by_var is run
        once per column that has duplicates, each pass reading the
        output of the previous one ("outfile.<i>.txt", relative
        paths), and the final result is copied to outfile.
        """
        import shutil
        import arrayio
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        filename = in_data.identifier
        filelib.assert_exists_nz(filename)

        # De-duplicate by every single header.  Not sure if this is
        # right.
        MATRIX = arrayio.read(filename)
        # Figure out which columns has duplicates.
        has_dup = []
        for name in MATRIX.row_names():
            annots = MATRIX.row_names(name)
            assert name not in has_dup
            seen = set()
            for annot in annots:
                if annot in seen:
                    has_dup.append(name)
                    break
                seen.add(annot)
        if not has_dup:
            shutil.copy2(filename, outfile)
            return

        sq = parallel.quote
        slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
        # Chain the passes: each one starts from the result of the
        # previous pass.  Previously every pass re-read the original
        # file, so only the last header's de-duplication survived.
        current = filename
        for i, name in enumerate(has_dup):
            f = "outfile.%d.txt" % i
            x = [
                sq(slice_matrix),
                "--dedup_row_by_var",
                sq(name),
                sq(current),
                ">&",
                sq(f),
            ]
            x = " ".join(map(str, x))
            parallel.sshell(x)
            # Fail early instead of feeding an empty file to the next pass.
            filelib.assert_exists_nz(f)
            current = f
        shutil.copy2(current, outfile)
def _make_samtools_filter_cmd(in_bamfile, out_bamfile):
    """Build a shell command that filters a BAM file with samtools.

    The command is "samtools view -bF 4 <in_bamfile> > <out_bamfile>"
    with the program and both paths shell-quoted.  in_bamfile must
    exist and be non-empty.  Returns the command string; nothing is
    executed.
    """
    from genomicode import filelib
    from genomicode import parallel
    from genomicode import config

    filelib.assert_exists_nz(in_bamfile)
    samtools_path = filelib.which_assert(config.samtools)
    quote = parallel.quote

    pieces = (
        quote(samtools_path), "view", "-bF 4",
        quote(in_bamfile), ">", quote(out_bamfile),
    )
    return " ".join(pieces)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Replace missing values in a matrix with '0'.

        antecedents is a single data node whose identifier is the input
        matrix; module_utils.is_missing must report that it has missing
        values.  Every None entry is set to the string '0' and the
        matrix is written to outfile in tab-delimited format.
        """
        import arrayio
        from genomicode import filelib
        from Betsy import module_utils

        in_data = antecedents
        assert module_utils.is_missing(in_data.identifier), 'no missing values'

        M = arrayio.read(in_data.identifier)
        # Look the dimensions up once rather than on every iteration.
        dims = M.dim()
        for i in range(dims[0]):
            for j in range(dims[1]):
                if M._X[i][j] is None:
                    M._X[i][j] = '0'

        # open() replaces the Python-2-only file() builtin, and the
        # context manager closes the handle even if writing fails.
        with open(outfile, 'w') as f_out:
            arrayio.tab_delimited_format.write(M, f_out)

        filelib.assert_exists_nz(outfile)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_filename):
        """Add coverage data to a simple variant matrix.

        antecedents is a (simple_node, coverage_node) pair whose
        identifiers are non-empty files.  The merge is delegated to
        add_coverage_to_simplevariantmatrix.add_coverage_to_svm, which
        writes out_filename.
        """
        from genomicode import filelib
        import add_coverage_to_simplevariantmatrix as coverage_module

        simple_node, coverage_node = antecedents
        for node in (simple_node, coverage_node):
            filelib.assert_exists_nz(node.identifier)

        # A commented-out earlier version worked out an "is_rna_cov"
        # flag from the "with_rna_coverage" attribute of the input and
        # output.  The last argument is now always True -- presumably
        # that flag; confirm against add_coverage_to_svm.
        coverage_module.add_coverage_to_svm(
            simple_node.identifier, coverage_node.identifier, out_filename,
            True)
Beispiel #30
0
def _convert_gene_ids_local(in_platform, out_platform):
    """Load a local gene ID conversion table for a pair of platforms.

    The table is the file "<in_platform>___<out_platform>.txt" inside
    config.convert_platform.  Each row is
    <in_id>  <out_id1> ... <out_idn>.

    Returns a dictionary of gene_id -> list of converted_ids, or None
    if the file does not exist (these platforms cannot be converted).
    Raises AssertionError if a row has fewer than two columns.
    """
    import os
    from genomicode import config
    from genomicode import filelib

    filelib.assert_exists_nz(config.convert_platform)
    table_name = "%s___%s.txt" % (in_platform, out_platform)
    table_path = os.path.join(config.convert_platform, table_name)
    if not os.path.exists(table_path):
        return None

    mapping = {}
    for row in filelib.read_cols(table_path):
        assert len(row) >= 2
        # A repeated in_id keeps the last row seen.
        mapping[row[0]] = row[1:]
    return mapping