Beispiel #1
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Build a BWA index for the reference genome in `in_data`.

        The reference is standardized into `out_path` (with symlinks) and
        `bwa index` is run through parallel.sshell with path=out_path.
        Raises AssertionError if any expected index file fails the
        `filelib.exists_nz` check.
        """
        from genomicode import alignlib
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel

        bwa_exe = filelib.which_assert(config.bwa)
        genome = alignlib.standardize_reference_genome(
            in_data.identifier, out_path, use_symlinks=True)

        # Command: bwa index <fasta>
        # Produces <fasta>.amb, .ann, .bwt, .pac and .sa.
        index_cmd = [
            parallel.quote(bwa_exe),
            "index",
            parallel.quote(genome.fasta_file_full),
        ]
        parallel.sshell(index_cmd, path=out_path)

        # Verify each of the index files that bwa should have written.
        for suffix in (".amb", ".ann", ".bwt", ".pac", ".sa"):
            index_file = "%s%s" % (genome.fasta_file_full, suffix)
            assert filelib.exists_nz(index_file), "Missing: %s" % index_file
Beispiel #2
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run the pyspp command on a treatment/control pair of BAM files.

        `antecedents` is a (BAM node, sample group node) pair.  The two
        samples are taken from the "treatment_sample" and "control_sample"
        user options.  The command is run with path=out_path and its
        output is redirected to "<experiment>.log".  Raises AssertionError
        if a sample is unknown or has no BAM file; the expected result
        files are checked with filelib.assert_exists_nz_many.
        """
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from Betsy import module_utils
        import run_MACS14

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Both sample names are required user options.
        treat_sample = module_utils.get_user_option(
            user_options, "treatment_sample", not_empty=True)
        control_sample = module_utils.get_user_option(
            user_options, "control_sample", not_empty=True)

        # The experiment name is built from the hashed sample names.
        experiment_name = "%s_vs_%s" % (
            hashlib.hash_var(treat_sample), hashlib.hash_var(control_sample))

        # The sample name is the second field of each sample group entry.
        known_samples = [group[1] for group in sample_groups]
        requested = (treat_sample, control_sample)
        for sample in requested:
            assert sample in known_samples, "Unknown sample: %s" % sample

        # Locate the BAM file of each sample, then check both were found.
        bam_files = [
            run_MACS14.find_bam_file(bam_path, sample, sample_groups)
            for sample in requested]
        for sample, bam_file in zip(requested, bam_files):
            assert bam_file, "Missing bam file for %s" % sample
        treat_filename, control_filename = bam_files

        log_file = "%s.log" % experiment_name
        spp_cmd = make_pyspp_command(
            treat_filename, control_filename, out_path, num_procs=num_cores)
        parallel.sshell("%s >& %s" % (spp_cmd, log_file), path=out_path)

        # broadPeak and narrowPeak are not checked; narrowPeak might be
        # empty if no peaks are found.
        expected = [
            "binding.positions.txt",
            "crosscorrelation.pdf",
            "density.wig",
            "enrichment.estimates.wig",
            "enrichment.wig",
            log_file,
        ]
        filelib.assert_exists_nz_many(
            [os.path.join(out_path, name) for name in expected])
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Build a bowtie2 index for the reference genome in `in_data`.

        The reference is standardized into `out_path` (with symlinks) and
        bowtie2-build is run through parallel.sshell with path=out_path,
        using the reference name as the index stem.  Raises AssertionError
        if <name>.1.bt2 fails the `filelib.exists_nz` check.
        """
        import os
        from genomicode import alignlib
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel

        builder = filelib.which_assert(config.bowtie2_build)
        genome = alignlib.standardize_reference_genome(
            in_data.identifier, out_path, use_symlinks=True)

        # Command: bowtie2-build <ref.fa> <output_stem>
        # Produces <output_stem>.[1234].bt2 and <output_stem>.rev.[12].bt2.
        build_cmd = [
            parallel.quote(builder),
            parallel.quote(genome.fasta_file_full),
            genome.name,
        ]
        parallel.sshell(build_cmd, path=out_path)

        # Only the first index file is checked.
        first_index = os.path.join(out_path, "%s.1.bt2" % genome.name)
        assert filelib.exists_nz(first_index)
Beispiel #4
0
def relabel(data_file, rename_file, outfile, user_options):
    """Relabel the columns of `data_file` using `rename_file`.

    The "sample_labels_header" user option names the column of
    `rename_file` to use; it must appear in the first (tab-delimited)
    line of that file.  The configured slice_matrix script is run with
    --relabel_col_ids and its output is redirected to `outfile` with
    `>&`.

    Returns the shell command string that was run.  Raises
    AssertionError if the header is missing from `rename_file`.
    """
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    sample_header = mlib.get_user_option(
        user_options, "sample_labels_header", not_empty=True)

    # Make sure sample_header is in the rename file.  Read the header
    # line inside a context manager so the handle is always closed.
    with open(rename_file) as handle:
        header_line = handle.readline()
    headers = header_line.rstrip("\r\n").split("\t")
    assert sample_header in headers, "Missing header (%s): %s" % (
        sample_header, rename_file)

    sq = parallel.quote
    slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
    # slice_matrix takes "<rename file>,<header>" as a single argument.
    relabel_arg = "'%s,%s'" % (rename_file, sample_header)
    cmd = [
        "python",
        sq(slice_matrix),
        '--relabel_col_ids', relabel_arg,
        sq(data_file),
        ]
    cmd = " ".join(cmd)
    cmd = "%s >& %s" % (cmd, outfile)
    parallel.sshell(cmd)

    filelib.assert_exists_nz(outfile)
    return cmd
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Plot the "biotin" gene with the configured lineplot script.

        Reads the file at antecedents.identifier and writes the plot to
        `outfile`.  Returns a metadata dict whose "commands" entry holds
        the shell command that was run.
        """
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        in_data = antecedents
        lineplot = mlib.get_config("lineplot", which_assert_file=True)

        args = [
            parallel.quote(lineplot),
            "--gene_names", "biotin",
            "--mar_bottom", 1.50,
            "--yaxis_starts_at_0",
            parallel.quote(in_data.identifier),
            parallel.quote(outfile),
        ]
        # The list mixes strings and a float, so stringify before joining.
        command = " ".join(str(arg) for arg in args)
        parallel.sshell(command)

        filelib.assert_exists_nz(outfile)
        return {"commands": [command]}
Beispiel #6
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Run the slice_matrix script with --cpm on the signal file.

        The input is the file at in_data.identifier; the output of the
        command is redirected to `outfile` with `>&`.  Raises
        AssertionError if the input does not exist.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import config

        signal_node = in_data
        signal_file = signal_node.identifier
        assert os.path.exists(signal_file)

        slice_matrix = filelib.which_assert(config.slice_matrix)

        sq = parallel.quote
        cmd = [
            sq(slice_matrix),
            "--cpm",
            # Quote the input path just like the script path, so file
            # names containing shell-special characters do not break the
            # command.
            sq(signal_file),
            ]
        cmd = " ".join(cmd)
        cmd = "%s >& %s" % (cmd, outfile)

        parallel.sshell(cmd)
        filelib.assert_exists_nz(outfile)
Beispiel #7
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Merge an annotation file into a signal file.

        `antecedents` is a (signal node, annotation node) pair.  The two
        files are first aligned with the align_matrices script (left
        join on the signal file's ID_REF column), then merged row by row
        into `outfile`.

        Returns a metadata dict whose "command" entry holds the
        align_matrices command.  Raises AssertionError if the signal
        file does not start with an ID_REF header.
        """
        import itertools
        from genomicode import config
        from genomicode import parallel
        from genomicode import filelib

        signal_node, annotation_node = antecedents
        signal_filename = signal_node.identifier
        annotation_filename = annotation_node.identifier
        filelib.assert_exists_nz(signal_filename)
        filelib.assert_exists_nz(annotation_filename)
        metadata = {}

        align_matrices = filelib.which_assert(config.align_matrices)

        # Make sure the signal_filename has an ID_REF header.
        header = next(filelib.read_cols(signal_filename))
        assert header[0] == "ID_REF", "Missing ID_REF header: %s" % \
               signal_filename

        signal_align_file = "signal.aligned.txt"
        annot_align_file = "annot.aligned.txt"

        # First, align the two files.  The input paths are quoted so
        # that file names with shell-special characters are handled.
        sq = parallel.quote
        cmd = [
            sq(align_matrices),
            "--annot_file",
            sq(signal_filename),
            "--header",
            "ID_REF",
            "--annot_file",
            sq(annotation_filename),
            "--left_join",
            signal_align_file,
            annot_align_file,
        ]
        cmd = " ".join(cmd)
        parallel.sshell(cmd)
        metadata["command"] = cmd

        # Now merge them.  Take the first column of the expression
        # file (should be ID_REF), the whole annotation file, then the
        # remainder of the expression file.
        signal_handle = filelib.read_cols(signal_align_file)
        annot_handle = filelib.read_cols(annot_align_file)
        with open(outfile, 'w') as outhandle:
            for x1, x2 in itertools.izip(signal_handle, annot_handle):
                x = [x1[0]] + x2 + x1[1:]
                outhandle.write("\t".join(x) + "\n")

        filelib.assert_exists_nz(outfile)
        # Hand the recorded command back to the caller; previously the
        # metadata was built but never returned.
        return metadata
Beispiel #8
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        """Draw a PCA plot with the configured pcaplot script.

        Reads the file at antecedents.identifier and writes the plot to
        `outfile`.  The script is also asked to write prism.txt,
        row_components.txt and col_components.txt (relative paths).
        Returns a metadata dict whose "commands" entry holds the shell
        command that was run.
        """
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        in_data = antecedents
        pcaplot = mlib.get_config("pcaplot", which_assert_file=True)

        args = [
            parallel.quote(pcaplot),
            "--label",
            "--prism_file", "prism.txt",
            "--row_pc_file", "row_components.txt",
            "--col_pc_file", "col_components.txt",
            parallel.quote(in_data.identifier),
            parallel.quote(outfile),
        ]
        command = " ".join(str(arg) for arg in args)
        parallel.sshell(command)

        filelib.assert_exists_nz(outfile)
        return {"commands": [command]}
Beispiel #9
0
def get_paired_stranded_rseqc(reference_bed, bam_filename):
    """Run RSeQC's infer_experiment.py and return its parsed output.

    Both input files must pass filelib.assert_exists_nz.  The raw output
    of the script is handed to parse_rseqc_infer_experiment and that
    function's result is returned unchanged.
    """
    from genomicode import alignlib
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    script = alignlib.find_rseqc_script("infer_experiment.py")
    for required in (reference_bed, bam_filename):
        filelib.assert_exists_nz(required)

    # The RSeQC scripts have a #!/usr/bin/python line, which may not be
    # the right interpreter, so run them with the python on the path.
    command = " ".join([
        "python", mlib.sq(script),
        "-r", mlib.sq(reference_bed),
        "-i", mlib.sq(bam_filename),
    ])
    output = parallel.sshell(command)
    return parse_rseqc_infer_experiment(output)
Beispiel #10
0
def count_reads(fastq_filename):
    """Return the number of reads in an uncompressed FASTQ file.

    The first four lines are checked to look like a FASTQ record, then
    the lines are counted with `wc -l` and divided by four.  Raises
    AssertionError if the first record is malformed or the `wc` output
    cannot be parsed.
    """
    from genomicode import filelib
    from genomicode import parallel

    sq = parallel.quote

    # Make sure it's a fastq file.
    # @M03807:17:000000000-AHGYH:1:1101:20554:1508 1:N:0:16
    # CTTTACACCCAGTGGAGAAGCTCCCAACCAAGCTCTCTTGAGGATCTTGAAGGAAACTGA
    # +
    # <BCC@FAFEC8,C<8968<@EEEFFCCFEC@EDEFGGGGA,@,@EFGGF9,,88,@FFA<
    handle = filelib.openfh(fastq_filename)
    record = [handle.readline().strip() for i in range(4)]
    # Sequence and quality strings must have the same length.
    assert len(record[1]) == len(record[3])
    assert record[2] == "+"

    wc_out = parallel.sshell("wc -l %s" % sq(fastq_filename))
    # velocitron:biocore$ wc -l test01.txt
    # 22278 test01.txt
    # 0 test 1.txt
    fields = wc_out.strip().split()
    # The message needs a %s placeholder; without it a failing check
    # raised TypeError instead of reporting the unexpected output.
    assert len(fields) >= 2, "Unknown format from wc -l\n%s" % wc_out
    num_lines = int(fields[0])
    # Four lines per read; floor division keeps the result an integer.
    return num_lines // 4
Beispiel #11
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """De-duplicate the rows of a matrix by gene symbol or gene ID.

        The header to de-duplicate on is the gene symbol header if one
        is found, otherwise the gene ID header.  Only the "high_var"
        value of out_attributes['unique_genes'] is implemented;
        "average_genes" and "first_gene" raise NotImplementedError and
        anything else raises AssertionError.  The output of slice_matrix
        is redirected to `outfile` with `>&`.

        Returns a metadata dict with the chosen "dedup_header".
        """
        import arrayio
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import arrayplatformlib as apl
        from Betsy import module_utils as mlib

        in_data = antecedents

        # Prefer gene symbols and fall back on gene IDs.
        matrix = arrayio.read(in_data.identifier)
        cat2header = apl.categorize_headers(matrix)
        header = cat2header.get(apl.GENE_SYMBOL)
        if header is None:
            header = cat2header.get(apl.GENE_ID)
        assert header is not None, "I could not find gene IDs or symbols: %s" \
               % in_data.identifier
        metadata = {"dedup_header": header}

        slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)

        algorithm = out_attributes['unique_genes']
        if algorithm in ("average_genes", "first_gene"):
            raise NotImplementedError
        if algorithm != "high_var":
            raise AssertionError("Unknown algorithm: %s" % algorithm)
        dedup_args = ["--dedup_row_by_var", parallel.quote(header)]

        args = [parallel.quote(slice_matrix)]
        args.extend(dedup_args)
        args.append(parallel.quote(in_data.identifier))
        command = "%s >& %s" % (" ".join(args), outfile)
        parallel.sshell(command)

        filelib.assert_exists_nz(outfile)

        return metadata
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """De-duplicate a matrix by every row header that has duplicates.

        If no row header contains duplicate annotations, the input file
        is copied to `outfile` unchanged.  Otherwise slice_matrix
        --dedup_row_by_var is run once per header with duplicates,
        writing intermediate files outfile.<i>.txt, and the last
        intermediate file is copied to `outfile`.
        """
        import shutil
        import arrayio
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        filename = in_data.identifier
        filelib.assert_exists_nz(filename)

        # De-duplicate by every single header.  Not sure if this is
        # right.
        MATRIX = arrayio.read(filename)
        # Figure out which headers have duplicate annotations.
        has_dup = []
        for name in MATRIX.row_names():
            annots = MATRIX.row_names(name)
            assert name not in has_dup
            if len(set(annots)) != len(annots):
                has_dup.append(name)
        if not has_dup:
            shutil.copy2(filename, outfile)
            return

        sq = parallel.quote
        slice_matrix = mlib.get_config("slice_matrix", which_assert_file=True)
        # Chain the passes: each one reads the output of the previous
        # pass.  Reading the original file every time would discard all
        # de-duplication except that of the last header.
        current = filename
        for i, name in enumerate(has_dup):
            f = "outfile.%d.txt" % i
            x = [
                sq(slice_matrix),
                "--dedup_row_by_var",
                sq(name),
                sq(current),
                ">&",
                sq(f),
            ]
            x = " ".join(map(str, x))
            parallel.sshell(x)
            current = f
        shutil.copy2(current, outfile)
Beispiel #13
0
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Normalize the genes of a matrix and write the result to `outfile`.

        out_attributes["gene_normalize"] selects the method:
          "variance"        normalize in Python with jmath.safe_norm_mv.
          "sum_of_squares"  run the cluster program with -ng; the
                            <outfile>.nrm file is then renamed to
                            `outfile`.

        Returns a metadata dict; for "sum_of_squares" its "command" entry
        holds the command (as a list).  Raises AssertionError for a
        missing or invalid normalize option.
        """
        import os
        import arrayio
        from genomicode import jmath
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        metadata = {}

        norm_para = ["variance", "sum_of_squares"]
        assert "gene_normalize" in out_attributes
        normalize = out_attributes["gene_normalize"]
        assert normalize in norm_para, \
               "Invalid normalize option: %s" % normalize

        if normalize == "variance":
            # Read and normalize before opening the output, so a failure
            # while reading does not leave a truncated outfile behind.
            M = arrayio.read(in_data.identifier, format=arrayio.pcl_format)
            M_n = jmath.safe_norm_mv(M.slice())
            M._X = M_n
            M_c = arrayio.convert(M, to_format=arrayio.pcl_format)
            # The context manager closes the handle even if writing fails.
            with open(outfile, 'w') as f:
                arrayio.pcl_format.write(M_c, f)
        elif normalize == "sum_of_squares":
            cluster = mlib.get_config("cluster", which_assert_file=True)
            sq = parallel.quote
            cmd = [
                sq(cluster),
                "-f",
                sq(in_data.identifier),
                "-ng",
                "-u",
                outfile,
            ]
            parallel.sshell(cmd)
            metadata["command"] = cmd
            # The result is expected in <outfile>.nrm.
            outputfile = outfile + '.nrm'
            filelib.assert_exists_nz(outputfile)
            os.rename(outputfile, outfile)

        filelib.assert_exists_nz(outfile)
        return metadata
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Remove indels from a VCF file with vcftools.

        Runs vcftools --remove-indels --recode --recode-INFO-all on the
        file at in_data.identifier.  Console output is redirected to
        <sample>.log, and the resulting <sample>.filtered.recode.vcf is
        copied to `out_filename`.  <sample> is the input file name
        without its directory and extension.
        """
        import os
        import shutil
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import config

        in_filename = in_data.identifier
        filelib.assert_exists_nz(in_filename)

        vcftools = filelib.which_assert(config.vcftools)

        # vcftools --vcf test31.txt --remove-indels --recode --recode-INFO-all
        #   --out test32
        # Writes stuff to console, which is captured in the log file.
        # Saves file test32.recode.vcf

        # Build the sample name from the file name alone.  Taking the
        # extension off the full path would place the intermediate and
        # log files next to the input file.
        p, f = os.path.split(in_filename)
        sample, ext = os.path.splitext(f)

        out_stem = "%s.filtered" % sample
        log_filename = "%s.log" % sample
        # Should create file <out_stem>.recode.vcf
        outfile = "%s.recode.vcf" % out_stem

        sq = parallel.quote
        cmd = [
            sq(vcftools),
            "--vcf",
            sq(in_filename),
            "--remove-indels",
            "--recode",
            "--recode-INFO-all",
            "--out",
            sq(out_stem),
        ]
        cmd = " ".join(cmd)
        cmd = "%s >& %s" % (cmd, sq(log_filename))
        parallel.sshell(cmd)

        filelib.assert_exists_nz(outfile)
        # Copying a file onto itself is an error, so skip that case.
        if os.path.realpath(outfile) != os.path.realpath(out_filename):
            shutil.copy2(outfile, out_filename)
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_path):
        """Index the reference genome in `in_data` with samtools faidx.

        The reference is standardized into `out_path` (with symlinks) and
        `samtools faidx` is run through parallel.sshell with
        path=out_path.  Raises AssertionError if <fasta>.fai fails the
        `filelib.exists_nz` check.
        """
        from genomicode import alignlib
        from genomicode import config
        from genomicode import filelib
        from genomicode import parallel

        samtools_exe = filelib.which_assert(config.samtools)
        genome = alignlib.standardize_reference_genome(
            in_data.identifier, out_path, use_symlinks=True)

        # Command: samtools faidx <ref>.fa
        # Produces <ref>.fa.fai.
        faidx_cmd = [
            parallel.quote(samtools_exe),
            "faidx",
            parallel.quote(genome.fasta_file_full),
        ]
        parallel.sshell(faidx_cmd, path=out_path)

        # Check that the index file was created.
        index_file = "%s.fai" % genome.fasta_file_full
        assert filelib.exists_nz(index_file)
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Build a STAR index for a reference genome and a gene model.

        `antecedents` is a (reference node, gene model node) pair.  The
        reference is standardized into `out_path` (with symlinks), the
        STAR index command is run with path=out_path with its output
        redirected to out.txt, and the result is validated with
        alignlib.assert_is_STAR_reference.
        """
        from genomicode import alignlib
        from genomicode import filelib
        from genomicode import parallel

        ref_node, gene_node = antecedents
        genome = alignlib.standardize_reference_genome(
            ref_node.identifier, out_path, use_symlinks=True)
        filelib.safe_mkdir(out_path)

        index_cmd = alignlib.make_STAR_index_command(
            genome.fasta_file_full,
            out_path,
            gtf_file=gene_node.identifier,
            num_cores=num_cores)
        parallel.sshell("%s >& out.txt" % index_cmd, path=out_path)

        # Check that the index was created successfully.
        alignlib.assert_is_STAR_reference(out_path)
Beispiel #17
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, outfile):
        """Plot actin and tubulin genes with the configured lineplot script.

        Reads the file at antecedents.identifier and writes the plot to
        `outfile`.  Returns a metadata dict whose "commands" entry holds
        the shell command that was run.
        """
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        in_data = antecedents
        lineplot = mlib.get_config("lineplot", which_assert_file=True)

        # Each gene is given by name and by numeric ID.
        genes = [
            ("ACTB", 60),  # Human beta actin.
            ("TUBB", 203068),  # Human beta tubulin.
            ("Actb", 22461),  # Mouse beta actin.
            ("Tubb4a", 22153),  # Mouse beta tubulin.
        ]
        gene_names = ",".join("%s,%d" % gene for gene in genes)

        infile = in_data.identifier
        args = [
            parallel.quote(lineplot),
            "--gene_names", gene_names,
            "--mar_bottom", 1.50,
            parallel.quote(infile),
            parallel.quote(outfile),
        ]
        command = " ".join(str(arg) for arg in args)
        parallel.sshell(command)

        filelib.assert_exists_nz(outfile)
        return {"commands": [command]}
Beispiel #18
0
    def run(
        self, network, antecedents, out_attributes, user_options, num_cores,
        outfile):
        """Draw a box plot with the configured boxplot script.

        Reads the file at antecedents.identifier and writes the plot to
        `outfile`.  Returns a metadata dict whose "commands" entry holds
        the shell command that was run.
        """
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        in_data = antecedents
        boxplot = mlib.get_config("boxplot", which_assert_file=True)

        args = [boxplot, in_data.identifier, outfile]
        command = " ".join(str(parallel.quote(arg)) for arg in args)
        parallel.sshell(command)

        filelib.assert_exists_nz(outfile)
        return {"commands": [command]}
Beispiel #19
0
def get_bedtools_version():
    """Return the version reported by `bedtools --version`.

    The version is the text matched after the first "v" in the output
    (e.g. "bedtools v2.23.0").  Raises AssertionError if no such text is
    found.
    """
    import re
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    bedtools = filelib.which_assert(config.bedtools)
    # A non-zero exit status from bedtools is tolerated here.
    output = parallel.sshell(
        "%s --version" % bedtools, ignore_nonzero_exit=True)
    match = re.search(r"v([\w\. ]+)", output.strip())
    assert match, "Missing version string"
    return match.group(1)
Beispiel #20
0
def _make_analysis_directory(analysis_path, config_file, reference_fa,
                             normal_bam, tumor_bam):
    """Run Strelka's configureStrelkaWorkflow.pl for a tumor/normal pair.

    All four input files must pass filelib.assert_exists_nz.  The script
    is looked up under bin/ of the configured "strelka" path and is told
    to use `analysis_path` as its output directory.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    for required in (config_file, reference_fa, normal_bam, tumor_bam):
        filelib.assert_exists_nz(required)

    strelka_path = mlib.get_config("strelka", assert_exists=True)
    config_pl = os.path.join(
        strelka_path, "bin", "configureStrelkaWorkflow.pl")
    filelib.assert_exists_nz(config_pl)

    # Example invocation:
    # $STRELKA/bin/configureStrelkaWorkflow.pl \
    #   --normal=../test31.bam --tumor=../test32.bam \
    #   --ref=../genomes/Broad.hg19/Homo_sapiens_assembly19.fa \
    #   --config=./config.ini --output-dir=./myAnalysis
    options = [
        ("--normal", normal_bam),
        ("--tumor", tumor_bam),
        ("--ref", reference_fa),
        ("--config", config_file),
        ("--output-dir", analysis_path),
    ]
    parts = [mlib.sq(config_pl)]
    for flag, value in options:
        parts.append(flag)
        parts.append(mlib.sq(value))
    parallel.sshell(" ".join(parts))
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            out_filename):
        """Annotate a VCF file with Annovar.

        The genome build comes from the "buildver" user option (only
        "hg19" is allowed).  Annovar is expected to write
        <out stem>.<buildver>_multianno.vcf, which is copied to
        `out_filename` unless it already is that file.
        """
        import os
        import shutil
        from genomicode import filelib
        from genomicode import parallel
        from genomicode import alignlib
        from Betsy import module_utils

        in_filename = in_data.identifier
        filelib.assert_exists_nz(in_filename)

        buildver = module_utils.get_user_option(
            user_options, "buildver", allowed_values=["hg19"],
            not_empty=True)

        # The log file is named after the input file, without directory
        # or extension.
        in_stem = os.path.splitext(os.path.basename(in_filename))[0]
        log_filename = "%s.log" % in_stem

        # Annovar takes a filestem, without the ".vcf".
        out_filestem = os.path.splitext(os.path.basename(out_filename))[0]

        annovar_cmd = alignlib.make_annovar_command(
            in_filename, log_filename, out_filestem, buildver)
        parallel.sshell(annovar_cmd)

        # Make sure the analysis completed successfully.
        result = "%s.%s_multianno.vcf" % (out_filestem, buildver)
        filelib.assert_exists_nz(result)
        if os.path.realpath(result) != os.path.realpath(out_filename):
            shutil.copy2(result, out_filename)
Beispiel #22
0
    def run(
        self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
        """Center the genes of a matrix with the cluster program.

        out_attributes['gene_center'] must be "mean" or "median"; it is
        translated to the "a" or "m" argument of the -cg option.  The
        result is expected in <outfile>.nrm, which is renamed to
        `outfile`.  Returns a metadata dict whose "commands" entry holds
        the shell command that was run.
        """
        import os
        from genomicode import filelib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        # Maps the centering method to the argument for -cg.
        center_alg = {'mean': 'a', 'median': 'm'}
        assert "gene_center" in out_attributes
        center = out_attributes['gene_center']
        assert center in center_alg, "Invalid center option: %s" % center

        cluster = mlib.get_config("cluster", which_assert_file=True)
        args = [
            parallel.quote(cluster),
            "-f", parallel.quote(in_data.identifier),
            "-cg", center_alg[center],
            "-u", outfile,
        ]
        command = " ".join(str(arg) for arg in args)
        parallel.sshell(command)

        normalized = outfile + '.nrm'
        filelib.assert_exists_nz(normalized)
        os.rename(normalized, outfile)

        return {"commands": [command]}
Beispiel #23
0
def _run_filterRadia_with_restart(cmd, cancer_sample, chrom, logfile):
    """Run `cmd`, retrying when the log shows that samtools crashed.

    Sometimes samtools crashes in the middle of a run.  After each run
    `logfile` is read: an empty log means success, a log containing the
    samtools crash signature triggers another attempt, and any other
    content is treated as a failure.

    Raises AssertionError if the run fails for another reason, or if the
    samtools crash is still seen after the last attempt.
    """
    from genomicode import parallel
    from genomicode import filelib

    # Same number of attempts as before (one run plus three retries).
    MAX_TRIES = 4

    for num_tries in range(MAX_TRIES):
        parallel.sshell(cmd, ignore_nonzero_exit=True)
        filelib.assert_exists(logfile)
        with open(logfile) as handle:
            log = handle.read()
        # Empty logfile means cmd completed successfully.
        if not log.strip():
            return
        # Look for evidence that samtools died.  If this occurs, try again.
        # 06/29/2016 09:57:16 AM  ERROR   The return code of '1' from the
        #   following filter command indicates an error.
        # 06/29/2016 09:57:16 AM  ERROR   Error from /usr/bin/python
        #   /usr/local/radia/scripts/createBlatFile.pyc 196C-lung2
        #   radia2.tmp/196C-lung2_dnaFiltered_chr1.vcf
        #   radia2.tmp/196C-lung2_mpileup_rna_origin_chr1.vcf
        #   -o radia2.tmp/196C-lung2_blatInput_chr1.fa
        #   --allVCFCalls --blatRnaNormalReads --blatRnaTumorReads:
        # <Traceback>
        # [...]
        #   samtoolsCall.kill()
        # [...]
        # OSError: [Errno 3] No such process
        if "samtoolsCall.kill" in log and "No such process" in log:
            continue
        # Otherwise, the process failed for some other reason.  Raise
        # an exception.
        raise AssertionError("Problem filtering: %s %s\n%s" % (
            cancer_sample, chrom, log))

    # Every attempt hit the samtools crash.  Report it instead of
    # returning as if the run had succeeded.
    raise AssertionError(
        "Problem filtering: %s %s\nsamtools crashed in %d attempts.\n%s" % (
            cancer_sample, chrom, MAX_TRIES, log))
Beispiel #24
0
def main():
    """Command-line entry point: run the SPP R script on two BAM files.

    Parses the treatment BAM, control BAM and output directory from the
    command line, creates the output directory if needed, pipes the SPP
    script into R with path set to the output directory, and prints R's
    output.
    """
    import os
    import argparse
    from genomicode import filelib
    from genomicode import parallel

    log = filelib.tswrite
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("treatment_bam", help="BAM file of treated sample.")
    parser.add_argument("control_bam", help="BAM file of background sample.")
    parser.add_argument("outpath", help="Directory to store the results.")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of jobs to run in parallel.")
    parser.add_argument("--fdr_cutoff", default=0.05, type=float, help="")

    args = parser.parse_args()
    for bam in (args.treatment_bam, args.control_bam):
        filelib.assert_exists_nz(bam)
    # Use absolute paths, because R is run from the output directory.
    args.treatment_bam = os.path.realpath(args.treatment_bam)
    args.control_bam = os.path.realpath(args.control_bam)

    assert 1 <= args.num_procs < 100, \
           "Please specify between 1 and 100 processes."
    assert 0.0 < args.fdr_cutoff < 1.0

    # Set up directories to run it on.
    log("Setting up directories.\n")
    if not os.path.exists(args.outpath):
        os.mkdir(args.outpath)

    # Run SPP.
    log("Running spp in %s.\n" % args.outpath)
    sppscript = find_sppscript()
    spp_args = [
        parallel.quote(args.treatment_bam),
        parallel.quote(args.control_bam),
        args.fdr_cutoff,
        args.num_procs,
    ]
    cmd = "cat %s | R --vanilla %s" % (
        sppscript, " ".join(str(arg) for arg in spp_args))
    output = parallel.sshell(cmd, path=args.outpath)
    print(output)

    log("Done.\n")
def get_paired_orientation_rseqc(reference_bed, bam_filename):
    """Run RSeQC's infer_experiment.py, print its output, and exit.

    NOTE(review): this looks like an unfinished debugging stub.  It
    prints the raw script output and then ends the process with exit
    status 0 instead of returning a value.
    """
    from genomicode import alignlib
    from genomicode import filelib
    from genomicode import parallel
    # NOTE(review): confirm that Betsy.module_lib (rather than
    # Betsy.module_utils) is the intended module here.
    from Betsy import module_lib as mlib

    script = alignlib.find_rseqc_script("infer_experiment.py")
    filelib.assert_exists_nz(reference_bed)
    filelib.assert_exists_nz(bam_filename)

    # RSeQC scripts use #!/usr/bin/python, which may not be the right
    # one.  Use the python on the path.
    cmd = [
        "python",
        mlib.sq(script),
        # The trailing commas were missing here, which made the whole
        # function a syntax error.
        "-r", mlib.sq(reference_bed),
        "-i", mlib.sq(bam_filename),
        ]
    cmd = " ".join(cmd)
    x = parallel.sshell(cmd)

    print(x)
    import sys; sys.exit(0)
def _make_intervallist_file(intervallist_file, features_bed, bam_filename):
    """Write an interval list file built from a BAM header and a BED file.

    intervallist_file  Path of the file to write (opened with mode 'w').
    features_bed       BED file; every line needs at least 6 columns
                       (chrom, chromStart, chromEnd, name, score, strand).
    bam_filename       BAM file whose @HD and @SQ header lines are copied
                       to the top of the output.

    Each BED line becomes one tab-delimited line:
      chrom  chromStart+1  chromEnd  strand  name

    Raises an AssertionError if a BED line has fewer than 6 columns.

    """
    from genomicode import config
    from genomicode import filelib
    from genomicode import parallel

    # Read the @HD and @SQ headers from the bam file first, so that a
    # failing samtools call does not leave an empty output file behind.
    # samtools view -H <filename>
    samtools = filelib.which_assert(config.samtools)
    sq = parallel.quote
    cmd = [
        sq(samtools),
        "view",
        "-H",
        sq(bam_filename),
    ]
    output = parallel.sshell(" ".join(cmd))
    header_lines = [x.rstrip() for x in output.split("\n")]
    header_lines = [
        x for x in header_lines if x.startswith(("@HD", "@SQ"))]

    # The with-block guarantees the handle is closed even if one of the
    # BED lines turns out to be malformed.
    with open(intervallist_file, 'w') as outhandle:
        for line in header_lines:
            outhandle.write("%s\n" % line)

        # Add the information from the BED file.
        # BED       chrom chromStart (0-based) chromEnd name score strand
        # Interval  chrom chromStart (1-based) chromEnd strand name
        for cols in filelib.read_cols(features_bed):
            assert len(cols) >= 6, \
                   "BED line needs at least 6 columns: %s" % "\t".join(cols)
            chrom, chromStart0, chromEnd, name, _score, strand = cols[:6]
            chromStart0, chromEnd = int(chromStart0), int(chromEnd)
            # Only the start shifts; the end coordinate is copied as is.
            chromStart1 = chromStart0 + 1
            x = chrom, chromStart1, chromEnd, strand, name
            outhandle.write("%s\n" % "\t".join(map(str, x)))
Beispiel #27
0
def list_snpeff_databases():
    """Return the "Genome" column of the table printed by snpEff.

    Runs "java -Xmx16g -jar <snp_eff_path>/snpEff.jar databases" and
    parses its output as a delimited table.  The first row is taken as
    the header and must contain a "Genome" column.  Rows whose first
    cell starts with "---" (the underline below the header) are
    skipped.  Every other row must have as many cells as the header.

    Returns a list of the values in the "Genome" column, in the order
    they were printed.

    """
    import os
    import StringIO
    from genomicode import parallel
    from genomicode import filelib
    from Betsy import module_utils as mlib

    snpeff_dir = mlib.get_config("snp_eff_path", which_assert_file=True)
    jar_file = os.path.join(snpeff_dir, "snpEff.jar")
    filelib.assert_exists_nz(jar_file)

    # Expected layout of the output:
    # Genome    Organism    Status    Bundle    Database download link
    # ------    --------    ------    ------    ----------------------
    cmd = ["java", "-Xmx16g", "-jar", parallel.quote(jar_file), "databases"]
    output = parallel.sshell(cmd)

    genome_col = None   # index of the "Genome" column, set by the header
    num_cols = None     # width of the header row
    names = []
    for row in filelib.read_cols(StringIO.StringIO(output)):
        row = [cell.strip() for cell in row]
        if genome_col is None:
            # First row: this is the header.
            assert "Genome" in row
            genome_col = row.index("Genome")
            num_cols = len(row)
            continue
        assert len(row) == num_cols
        if not row[0].startswith("---"):
            names.append(row[genome_col])
    return names
Beispiel #28
0
    def run(self, network, antecedents, out_attributes, user_options,
            num_cores, out_path):
        """Run the command built by make_macs14_command on one sample.

        antecedents is unpacked into (bam_node, group_node); their
        identifiers give the path holding the BAM files and the sample
        group file.

        User options read:
          treatment_sample  required
          control_sample    optional; no control is used when empty
          macs_genome       required, passed on as genome_size
          macs_shiftsize    optional, converted to int when given

        The command runs in out_path.  If it leaves a <name>_model.r
        file there, that file is run through Rscript.  Finally,
        <name>_peaks.xls and <name>_summits.bed are checked with
        filelib.assert_exists_nz_many.

        Raises an AssertionError for an unknown sample or a sample
        without a BAM file.

        """
        import os
        from genomicode import parallel
        from genomicode import hashlib
        from genomicode import filelib
        from genomicode import config
        from Betsy import module_utils

        bam_node, group_node = antecedents
        bam_path = module_utils.check_inpath(bam_node.identifier)
        sample_groups = module_utils.read_sample_group_file(
            group_node.identifier)

        # Get options.
        treat_sample = module_utils.get_user_option(user_options,
                                                    "treatment_sample",
                                                    not_empty=True)
        control_sample = module_utils.get_user_option(user_options,
                                                      "control_sample")
        genome_size = module_utils.get_user_option(user_options,
                                                   "macs_genome",
                                                   not_empty=True)
        shiftsize = module_utils.get_user_option(user_options,
                                                 "macs_shiftsize")
        if shiftsize:
            shiftsize = int(shiftsize)

        # Set the name.  Both sample names go through hash_var
        # (presumably to make them safe as part of a file name --
        # TODO confirm).  Build the combined name from the hashed
        # treatment name, not from the raw one.
        name = hashlib.hash_var(treat_sample)
        if control_sample:
            x = hashlib.hash_var(control_sample)
            name = "%s_vs_%s" % (name, x)

        # Make sure the samples exist.  The second field of each entry
        # in sample_groups is used as the sample name.
        samples = [x[1] for x in sample_groups]
        assert treat_sample in samples, "Unknown sample: %s" % treat_sample
        if control_sample:
            assert control_sample in samples, \
                   "Unknown sample: %s" % control_sample

        # Find the BAM files.
        treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
        assert treat_filename, "Missing bam file for %s" % treat_sample
        control_filename = None
        if control_sample:
            control_filename = find_bam_file(bam_path, control_sample,
                                             sample_groups)
            assert control_filename, "Missing bam file for %s" % control_sample

        cmd = make_macs14_command(treat_filename,
                                  control_filename,
                                  name=name,
                                  genome_size=genome_size,
                                  shiftsize=shiftsize,
                                  save_bedgraph_file=True)
        parallel.sshell(cmd, path=out_path)

        # Run Rscript on the model, if one was generated.
        model_file = os.path.join(out_path, "%s_model.r" % name)
        if os.path.exists(model_file):
            Rscript = filelib.which_assert(config.Rscript)
            sq = parallel.quote
            # Quote the script path as well; out_path may contain
            # characters that are special to the shell.
            cmd = [sq(Rscript), sq(model_file)]
            parallel.sshell(cmd, path=out_path)

        files = [
            "%s_peaks.xls" % name,
            "%s_summits.bed" % name,
        ]
        filenames = [os.path.join(out_path, x) for x in files]
        filelib.assert_exists_nz_many(filenames)
def plot_heatmap(filename, outfile, cluster_files, user_options):
    """Build and run the arrayplot command that draws a heatmap.

    filename       Matrix file to plot.
    outfile        File that arrayplot should write.
    cluster_files  Dictionary that may hold paths under the keys "gtr",
                   "atr", "kgg" and "kag" (gene tree, array tree, gene
                   cluster and array cluster file).
    user_options   Dictionary of user options; the hm_* options are read
                   through mlib.get_user_option.

    Defaults that depend on the matrix:
      - If hm_width or hm_height is not given, both are taken from
        graphlib.find_tall_heatmap_size (more rows than columns) or
        graphlib.find_wide_heatmap_size (otherwise).
      - If hm_label_genes is not given, genes are labelled when there
        are at most 50 rows.
      - If hm_label_arrays is not given, arrays are labelled when there
        are at most 50 columns.

    Returns the command line (a string) that was run.

    Raises an AssertionError if the width, height or label scales are
    out of range.

    """
    from genomicode import parallel
    from genomicode import graphlib
    from Betsy import module_utils as mlib

    python = mlib.get_config(
        "python", which_assert_file=True, assert_exists=True)
    arrayplot = mlib.get_config(
        "arrayplot", which_assert_file=True, assert_exists=True)

    COLORS = [
        "red", "white", "red-green", "blue-yellow", "red-green-soft",
        "red-blue-soft", "matlab", "bild", "genepattern", "genespring",
        "yahoo", "brewer-prgn-div", "brewer-rdbu-div",
        "brewer-rdylbu-div", "brewer-rdylgn-div", "brewer-spectral-div",
        "brewer-blues-seq", "brewer-greens-seq", "brewer-reds-seq",
        "brewer-ylorbr-seq", "brewer-qual-set",
        ]
    YESNO = ["no", "yes"]

    hm_width = mlib.get_user_option(user_options, "hm_width", type=int)
    hm_height = mlib.get_user_option(user_options, "hm_height", type=int)
    hm_color = mlib.get_user_option(
        user_options, "hm_color", allowed_values=COLORS, not_empty=True)

    hm_colorbar = mlib.get_user_option(
        user_options, "hm_colorbar", not_empty=True, allowed_values=YESNO)
    hm_colorbar_horizontal = mlib.get_user_option(
        user_options, "hm_colorbar_horizontal", not_empty=True,
        allowed_values=YESNO)
    hm_colorbar_height = mlib.get_user_option(
        user_options, "hm_colorbar_height", not_empty=True, type=float)
    hm_colorbar_width = mlib.get_user_option(
        user_options, "hm_colorbar_width", not_empty=True, type=float)
    hm_colorbar_font = mlib.get_user_option(
        user_options, "hm_colorbar_font", not_empty=True, type=float)

    hm_label_genes = mlib.get_user_option(
        user_options, "hm_label_genes", allowed_values=YESNO)
    hm_scale_gene_labels = mlib.get_user_option(
        user_options, "hm_scale_gene_labels", not_empty=True, type=float)
    hm_label_arrays = mlib.get_user_option(
        user_options, "hm_label_arrays", allowed_values=YESNO)
    hm_scale_array_labels = mlib.get_user_option(
        user_options, "hm_scale_array_labels", not_empty=True, type=float)

    # The tree and cluster options are only read when the caller
    # provides them; the presence of hm_show_gene_tree stands for all
    # four.
    hm_show_gene_tree = None
    hm_show_array_tree = None
    hm_show_gene_cluster = None
    hm_show_array_cluster = None
    if "hm_show_gene_tree" in user_options:
        hm_show_gene_tree = mlib.get_user_option(
            user_options, "hm_show_gene_tree", allowed_values=YESNO,
            not_empty=True)
        hm_show_array_tree = mlib.get_user_option(
            user_options, "hm_show_array_tree", allowed_values=YESNO,
            not_empty=True)
        hm_show_gene_cluster = mlib.get_user_option(
            user_options, "hm_show_gene_cluster", allowed_values=YESNO,
            not_empty=True)
        hm_show_array_cluster = mlib.get_user_option(
            user_options, "hm_show_array_cluster", allowed_values=YESNO,
            not_empty=True)

    # Set default values.  All of the defaults depend on the size of
    # the matrix, so look it up once, and only if a default is needed.
    need_size = not (hm_width and hm_height) or \
                not hm_label_genes or not hm_label_arrays
    nrow = ncol = None
    if need_size:
        nrow, ncol = get_matrix_size(filename)

    if not hm_width or not hm_height:
        fn = graphlib.find_wide_heatmap_size
        if nrow > ncol:
            fn = graphlib.find_tall_heatmap_size
        x = fn(
            nrow, ncol, max_total_height=4096, max_total_width=4096,
            max_box_height=200, max_box_width=200)
        hm_width, hm_height = x

    if not hm_label_genes:
        hm_label_genes = "no"
        if nrow <= 50:
            hm_label_genes = "yes"
    if not hm_label_arrays:
        hm_label_arrays = "no"
        if ncol <= 50:
            hm_label_arrays = "yes"

    # Check values.
    assert hm_width >= 1 and hm_width <= 256, "Invalid width: %s" % hm_width
    assert hm_height >= 1 and hm_height <= 256, \
           "Invalid height: %s" % hm_height
    assert hm_scale_gene_labels > 0 and hm_scale_gene_labels < 10, \
           "Invalid gene label scale: %s" % hm_scale_gene_labels
    assert hm_scale_array_labels > 0 and hm_scale_array_labels < 10, \
           "Invalid array label scale: %s" % hm_scale_array_labels

    sq = parallel.quote
    cmd = [
        sq(python),
        sq(arrayplot),
        "--grid",
        "-x", hm_width,
        "-y", hm_height,
        "--color", hm_color,
        ]
    if hm_colorbar == "yes":
        cmd += [
            "--colorbar",
            "--cb_height", hm_colorbar_height,
            "--cb_width", hm_colorbar_width,
            "--cb_font", hm_colorbar_font,
            ]
        if hm_colorbar_horizontal == "yes":
            cmd += ["--cb_horizontal"]

    if hm_label_genes == "yes":
        cmd += [
            "--label_genes",
            "--scale_gene_labels", hm_scale_gene_labels,
            ]
    if hm_label_arrays == "yes":
        cmd += [
            "--label_arrays",
            "--scale_array_labels", hm_scale_array_labels,
            ]

    # (user choice, key in cluster_files, arrayplot option)
    # A file is passed along only if the user asked for it and the
    # caller supplied it.  The paths are quoted like the other file
    # arguments because the command is run as a single shell string.
    optional_files = [
        (hm_show_gene_tree, "gtr", "--gene_tree_file"),
        (hm_show_array_tree, "atr", "--array_tree_file"),
        (hm_show_gene_cluster, "kgg", "--gene_cluster_file"),
        (hm_show_array_cluster, "kag", "--array_cluster_file"),
        ]
    for show, key, option in optional_files:
        if show == "yes" and key in cluster_files:
            cmd += [option, sq(cluster_files[key])]

    cmd += [
        sq(filename),
        sq(outfile),
        ]
    # The list mixes strings and numbers.
    cmd = " ".join(map(str, cmd))
    parallel.sshell(cmd)

    return cmd
    def run(self, network, in_data, out_attributes, user_options, num_cores,
            outfile):
        """Write in_data to outfile with the HG_U133A column in front.

        The matrix in in_data.identifier is scored with
        arrayplatformlib.score_matrix.  If the first score equals
        "HG_U133A", the input file is copied to outfile unchanged.
        Otherwise the "annotate_matrix" script is run with
        --platform HG_U133A --min_match_score 0.8 and its output is
        redirected to outfile.

        In both cases outfile is then rewritten so that the column
        whose header is "HG_U133A" becomes the first column.  The
        relative order of the other columns and the line endings are
        kept.

        Raises an AssertionError if the platform cannot be identified,
        if outfile is empty, or if it has no "HG_U133A" column.

        """
        import shutil
        import arrayio
        from genomicode import arrayplatformlib
        from genomicode import parallel
        from Betsy import module_utils as mlib

        DATA = arrayio.read(in_data.identifier)

        scores = arrayplatformlib.score_matrix(DATA)
        assert scores, "Unable to identify platform: %s" % in_data.identifier
        # NOTE(review): scores[0] is compared to the platform name below;
        # confirm that score_matrix returns names rather than objects.
        chipname = scores[0]

        platform = "HG_U133A"
        assert arrayplatformlib.get_bm_attribute(platform), \
               "Unrecognized platform: %s" % platform

        if chipname == platform:
            shutil.copyfile(in_data.identifier, outfile)
        else:
            Annot_BIN = mlib.get_config("annotate_matrix",
                                        which_assert_file=True)
            sq = parallel.quote
            cmd = [
                "python",
                sq(Annot_BIN),
                sq(in_data.identifier),
                "--platform",
                sq(platform),
                '--min_match_score',
                0.80,
            ]
            cmd = " ".join(map(str, cmd))
            # ">&" is csh/bash syntax that redirects stdout and stderr
            # together, so anything the script prints ends up in outfile.
            cmd = "%s >& %s" % (cmd, sq(outfile))
            parallel.sshell(cmd)

        # Move the HG_U133A column to the front of every line.
        with open(outfile, 'r') as handle:
            lines = handle.readlines()
        assert lines, "Empty file: %s" % outfile

        def split_line(line):
            # Separate the cells from the line terminator.  Otherwise
            # the terminator sticks to the last cell, which breaks both
            # the header lookup and the reordering when HG_U133A is the
            # last column.
            body = line.rstrip("\r\n")
            return body.split('\t'), line[len(body):]

        header, _eol = split_line(lines[0])
        assert platform in header, \
               "Missing %s column: %s" % (platform, outfile)
        index = header.index(platform)

        with open(outfile, 'w') as handle:
            for line in lines:
                cols, eol = split_line(line)
                newline = [cols[index]] + cols[0:index] + cols[index + 1:]
                handle.write('\t'.join(newline) + eol)