def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run spp (pyspp) peak calling on a treatment/control pair of BAM files.

    Requires both a treatment and a control sample.  Writes the spp
    output files plus a <experiment_name>.log file into out_path and
    verifies that they exist and are non-empty.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Both a treatment and a control sample are required here.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample", not_empty=True)

    # Build the experiment name from the sanitized sample names.
    experiment_name = "%s_vs_%s" % (
        hashlib.hash_var(treat_sample), hashlib.hash_var(control_sample))

    # Every requested sample must appear in the sample group file.
    known_samples = [x[1] for x in sample_groups]
    for sample in [treat_sample, control_sample]:
        assert sample in known_samples, "Unknown sample: %s" % sample

    # Locate the BAM file for each sample.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    control_filename = run_MACS14.find_bam_file(
        bam_path, control_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    assert control_filename, "Missing bam file for %s" % control_sample

    # Run the command, capturing stdout+stderr into the log file.
    log_file = "%s.log" % experiment_name
    x = make_pyspp_command(
        treat_filename, control_filename, out_path, num_procs=num_cores)
    cmd = "%s >& %s" % (x, log_file)
    parallel.sshell(cmd, path=out_path)

    # Verify that the expected output files were generated.
    files = [
        "binding.positions.txt",
        #"broadPeak",
        "crosscorrelation.pdf",
        "density.wig",
        "enrichment.estimates.wig",
        "enrichment.wig",
        #"narrowPeak",   # might be empty if no peaks found
        log_file,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run MACS 1.4 peak calling on a BAM file, with an optional control.

    Writes <name>_peaks.xls and <name>_summits.bed (plus bedgraph
    output) into out_path, where <name> is derived from the sanitized
    sample names, and verifies those files exist and are non-empty.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import config
    from Betsy import module_utils

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Get options.  The control sample is optional.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")
    genome_size = module_utils.get_user_option(
        user_options, "macs_genome", not_empty=True)
    shiftsize = module_utils.get_user_option(user_options, "macs_shiftsize")
    if shiftsize:
        shiftsize = int(shiftsize)

    # Set the name.  BUGFIX: use the sanitized (hashed) treatment name
    # in the "<treat>_vs_<control>" form.  Previously the raw
    # treat_sample was interpolated while the control half was hashed,
    # which was inconsistent and could put unsafe characters into the
    # output file names.
    name = hashlib.hash_var(treat_sample)
    if control_sample:
        x = hashlib.hash_var(control_sample)
        name = "%s_vs_%s" % (name, x)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    control_filename = None
    if control_sample:
        control_filename = find_bam_file(
            bam_path, control_sample, sample_groups)
        assert control_filename, "Missing bam file for %s" % control_sample

    cmd = make_macs14_command(
        treat_filename, control_filename, name=name,
        genome_size=genome_size, shiftsize=shiftsize,
        save_bedgraph_file=True)
    parallel.sshell(cmd, path=out_path)

    # Run Rscript on the model, if one was generated.
    model_file = os.path.join(out_path, "%s_model.r" % name)
    if os.path.exists(model_file):
        Rscript = filelib.which_assert(config.Rscript)
        cmd = [parallel.quote(Rscript), model_file]
        parallel.sshell(cmd, path=out_path)

    # Verify the key output files were generated.
    files = [
        "%s_peaks.xls" % name,
        "%s_summits.bed" % name,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run PeakSeq on a treatment/control pair of BAM files.

    Both the treatment and control samples are required: the code
    unconditionally builds "<treat>_vs_<control>" names and asserts on
    the control BAM file, so a missing control can only ever fail --
    now it fails early with a clear message instead of a confusing
    "Missing bam file" assertion downstream.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Get options.  BUGFIX: control_sample is required (everything
    # below uses it unconditionally), so demand it up front.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample", not_empty=True)
    fragment_length = module_utils.get_user_option(
        user_options, "peakseq_fragment_length", not_empty=True, type=int)
    mappability_file = module_utils.get_user_option(
        user_options, "mappability_file", not_empty=True, check_file=True)
    assert fragment_length > 0 and fragment_length < 1000

    # Set the experiment name from the sanitized sample names.
    name1 = hashlib.hash_var(treat_sample)
    name2 = hashlib.hash_var(control_sample)
    experiment_name = "%s_vs_%s" % (name1, name2)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    assert control_sample in samples, "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    control_filename = run_MACS14.find_bam_file(
        bam_path, control_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    assert control_filename, "Missing bam file for %s" % control_sample

    # Run PeakSeq, capturing stdout+stderr into the log file.
    cmd = make_peakseq_command(
        treat_filename, control_filename, out_path, experiment_name,
        fragment_length, mappability_file)
    log_file = "%s.log" % experiment_name
    cmd = "%s >& %s" % (cmd, log_file)
    parallel.sshell(cmd, path=out_path)

    # Verify the expected output files were generated.
    files = [
        "config.dat",
        log_file,
        "%s.txt" % experiment_name,
        # Can be length 0, if no peaks found.
        #"%s_narrowPeak.txt" % experiment_name,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Merge per-lane fastq files into one fastq per sample (per pair).

    Output files in out_path are named:
      <Sample>.fastq          if single end
      <Sample>_<Pair>.fastq   if paired end
    Returns a metadata dict (records the number of cores used).
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils
    # This is I/O heavy, don't use so many cores.
    MAX_CORES = 2
    fastq_node, group_node = antecedents
    fastq_path = fastq_node.identifier
    sample_group_file = group_node.identifier
    filelib.safe_mkdir(out_path)
    metadata = {}
    module_utils.assert_sample_group_file(sample_group_file, fastq_path)
    x = module_utils.read_sample_group_file(group_node.identifier)
    # Resolve the file names in the sample group table against fastq_path.
    x = module_utils.fix_sample_group_filenames(x, fastq_path)
    sample_groups = x

    # For merging, the order of the files in the sample_group_file
    # must be maintained.  Otherwise, will be merged out of order.
    # The new files should be named:
    # <Sample>.fastq          # if single end
    # <Sample>_<Pair>.fastq   # if paired end
    # jobs holds tuples of (in_filename, sample, pair, out_filename).
    jobs = []
    for x in sample_groups:
        in_filename, sample, pair = x
        #in_filename = os.path.join(fastq_path, file_)
        assert os.path.exists(in_filename)
        out_file = "%s.fastq" % sample
        if pair:
            out_file = "%s_%s.fastq" % (sample, pair)
        out_filename = os.path.join(out_path, out_file)
        x = in_filename, sample, pair, out_filename
        jobs.append(x)

    # Group the input files by the output file they merge into.
    # Within each list, input order follows the sample group file.
    out2ins = {}  # out_filename -> list of in_filenames
    for x in jobs:
        in_filename, sample, pair, out_filename = x
        if out_filename not in out2ins:
            out2ins[out_filename] = []
        out2ins[out_filename].append(in_filename)

    # Build one merge task per output file and run them in parallel.
    commands = []
    for out_filename, in_filenames in out2ins.iteritems():
        # Debugging.  Don't merge again if it already exists.
        if os.path.exists(out_filename):
            continue
        args = in_filenames, out_filename
        keywds = {}
        x = merge_or_symlink_files, args, keywds
        commands.append(x)
    commands.sort()
    nc = min(MAX_CORES, num_cores)
    parallel.pyfun(commands, nc)
    metadata["num_cores"] = nc

    # If the files are paired, make sure they are paired correctly.
    sample2outfiles = {}  # sample -> list of out filenames
    for x in jobs:
        in_filename, sample, pair, out_filename = x
        if sample not in sample2outfiles:
            sample2outfiles[sample] = []
        if out_filename not in sample2outfiles[sample]:
            sample2outfiles[sample].append(out_filename)
    commands = []
    all_samples = sorted(sample2outfiles)
    for sample in all_samples:
        out_filenames = sorted(sample2outfiles[sample])
        # A single output file means single-end data; nothing to check.
        if len(out_filenames) == 1:
            continue
        # Make sure they are aligned.
        x = check_fastq_alignment, (sample, out_filenames), {}
        commands.append(x)
    commands.sort()
    # check_fastq_alignment returns a falsy value on success, or an
    # error string on failure; collect and report all failures.
    retvals = parallel.pyfun(commands, nc)
    assert len(retvals) == len(commands)
    errors = [x for x in retvals if x]
    assert not errors, "\n".join(errors)
    return metadata
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run HOMER findPeaks on a tag directory, with an optional control.

    Writes <experiment_name>.peaks.txt (and a .log file) into out_path
    and verifies the peak file exists and is non-empty.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import config
    from Betsy import module_utils

    tag_node, group_node = antecedents
    tag_path = module_utils.check_inpath(tag_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Get options.  The control sample is optional.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")

    # Set the experiment name.
    experiment_name = treat_sample
    if control_sample:
        name1 = hashlib.hash_var(treat_sample)
        name2 = hashlib.hash_var(control_sample)
        experiment_name = "%s_vs_%s" % (name1, name2)

    # Make sure the samples exist.  BUGFIX: only check the control
    # sample if one was given.  Previously the assertion ran
    # unconditionally, so a run without a control always failed even
    # though the control is optional everywhere else in this module.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the tag directories.
    treat_path = os.path.join(tag_path, treat_sample)
    assert os.path.exists(treat_path)
    if control_sample:
        control_path = os.path.join(tag_path, control_sample)
        assert os.path.exists(control_path)

    # Get the command.
    homer_path = filelib.which_assert(config.homer_path)
    x = os.path.join(homer_path, "bin", "findPeaks")
    assert filelib.exists_nz(x)
    find_peaks = x

    log_file = "%s.log" % experiment_name
    peak_file = "%s.peaks.txt" % experiment_name
    sq = parallel.quote
    cmd = [
        sq(find_peaks),
        sq(treat_path),
        "-style", "factor",
        ]
    if control_sample:
        cmd += ["-i", control_path]
    cmd = " ".join(cmd)
    # findPeaks writes peaks to stdout; send stderr to the log file.
    cmd = "%s 2> %s 1> %s" % (cmd, log_file, peak_file)
    parallel.sshell(cmd, path=out_path)

    x = os.path.join(out_path, peak_file)
    filelib.assert_exists_nz(x)
def run(self, network, antecedents, out_attributes, user_options, num_cores, out_path):
    """Run MACS2 peak calling on a BAM file, with an optional control.

    Writes <name>_peaks.xls (plus bedgraph output) into out_path,
    where <name> is derived from the sanitized sample names, and
    verifies that file exists and is non-empty.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from Betsy import module_utils
    import run_MACS14

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(group_node.identifier)

    # Get options.  The control sample is optional.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")
    genome_size = module_utils.get_user_option(
        user_options, "macs_genome", not_empty=True)
    x = module_utils.get_user_option(
        user_options, "broad_peaks", allowed_values=["no", "yes"])
    broad_peaks = (x == "yes")
    x = module_utils.get_user_option(
        user_options, "macs_paired", allowed_values=["no", "yes"])
    is_paired = (x == "yes")

    # Set the name.  BUGFIX: use the sanitized (hashed) treatment name
    # in the "<treat>_vs_<control>" form.  Previously the raw
    # treat_sample was interpolated while the control half was hashed,
    # which was inconsistent and could put unsafe characters into the
    # output file names.
    name = hashlib.hash_var(treat_sample)
    if control_sample:
        x = hashlib.hash_var(control_sample)
        name = "%s_vs_%s" % (name, x)

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = run_MACS14.find_bam_file(
        bam_path, treat_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    control_filename = None
    if control_sample:
        control_filename = run_MACS14.find_bam_file(
            bam_path, control_sample, sample_groups)
        assert control_filename, "Missing bam file for %s" % control_sample

    cmd = make_macs2_command(
        treat_filename, control_filename=control_filename,
        genome_size=genome_size, save_bedgraph_file=True, name=name,
        normalize_read_counts=True, paired=is_paired,
        broad_peak_calling=broad_peaks)
    parallel.sshell(cmd, path=out_path)

    # Verify the key output file was generated.
    files = [
        "%s_peaks.xls" % name,
        ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)