Example #1
File: trim.py Project: roryk/bipy
 def _get_adapters(self, chemistry):
     adapters = [ADAPTERS.get(x, []) for x in chemistry]
     adapters += self.user_adapters
     adapters = list(flatten(adapters))
     adapters += self._rc_adapters(adapters)
     adapter_args = [["-a", adapter] for adapter in adapters]
     return list(flatten(adapter_args))
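All of these snippets come from roryk/bipy and lean on its flatten helper to collapse nested lists of command-line pieces into one flat list. A minimal sketch of what such a helper presumably looks like (an assumption based on how it is called on this page, not the project's actual source):

def flatten(items):
    # recursively yield leaf items; strings stay whole even though they are iterable
    for item in items:
        if isinstance(item, (list, tuple)):
            for sub in flatten(item):
                yield sub
        else:
            yield item

# list(flatten([["-a", "AACCGGTT"], ["-a", "TTGGCCAA"]]))
# -> ['-a', 'AACCGGTT', '-a', 'TTGGCCAA']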
Example #2
File: fastqc.py Project: roryk/bipy
def _build_command(input_file, fastqc_config, config):
    program = fastqc_config["program"]
    options = map(str, list(flatten(fastqc_config["options"])))
    outdir = _make_outdir(config)
    options += ["--outdir", outdir, "--kmers", "6"]
    cmd = list(flatten([program, options, input_file]))
    return cmd
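The command built here is a flat list of strings, which is the form subprocess expects. A hedged usage sketch (the binary name and paths are illustrative, not from the project):

import subprocess

# roughly what _build_command might return for one input file
cmd = ["fastqc", "--outdir", "results/fastqc", "--kmers", "6", "lane1.fastq"]
subprocess.check_call(cmd)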
Example #3
File: macs.py Project: roryk/bipy
def _build_command(input_file, options, control_file=None, out_dir=None):
    name = remove_suffix(os.path.basename(input_file))
    #if out_dir:
    #    name = os.path.join(out_dir, name)

    options = ["=".join(map(str, x)) for x in options]

    cmd = [
        "macs14", "--treatment=" + input_file,
        flatten(options), "--name=" + name
    ]
    if control_file:
        cmd += ["--control=" + control_file]

    return map(str, flatten(cmd))
Example #4
def _build_command(input_file, ref, novoalign_config):
    cmd = [
        which("novoalign"),
        flatten_options(novoalign_config), "-o", "SAM", "-d", ref, "-f",
        input_file
    ]
    return list(map(str, flatten(cmd)))
Example #5
def main(config, view):

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    human_input = find_sam_files(config["input_dir_human"])
    mouse_input = find_sam_files(config["input_dir_mouse"])
    if len(human_input) != len(mouse_input):
        logger.error("The length of the number of human SAM files does "
                     "not match the length of the number of mouse SAM "
                     "files, aborting.")
        sys.exit(1)
    input_files = zip(human_input, mouse_input)

    curr_files = input_files

    logger.info("Running disambiguation pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = Disambiguate(config)
            out_files = list(flatten(view.map(disambiguate, curr_files)))
            bam_files = view.map(sam.sam2bam, out_files)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)
Example #6
 def _disambiguate_out(self, in_tuple):
     """
     returns the set of output filenames that will be made from
     running disambiguate on the tuple of input files
     """
     return list(flatten(map(self._organism_files,
                             in_tuple, self.organisms)))
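Note that map is called with two sequences here, pairing each input file with its organism before flatten merges the per-pair output lists. Schematically, with a stand-in for _organism_files (names and suffixes invented):

def organism_files(in_file, organism):
    # hypothetical: each (file, organism) pair yields two outputs
    return [in_file + "." + organism + ".bam",
            in_file + "." + organism + ".sum"]

list(flatten(map(organism_files, ("a.sam", "b.sam"), ("human", "mouse"))))
# -> ['a.sam.human.bam', 'a.sam.human.sum', 'b.sam.mouse.bam', 'b.sam.mouse.sum']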
Example #7
def input_files_from_dir(in_dir, id_file):

    with open(os.path.join(in_dir, id_file)) as in_handle:
        ids = yaml.load(in_handle)

    sample_names = [x for x in ids]
    samples = [glob.glob(in_dir + "/*_%s.R1.fastq" % (x)) for x in sample_names]
    return list(flatten(samples))
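Each glob.glob call returns a (possibly empty) list, so samples is a list of lists; flatten removes the nesting, and samples with no matching file silently drop out of the result. For illustration (paths hypothetical):

samples = [["data/ctrl_s1.R1.fastq"], [], ["data/ko_s3.R1.fastq"]]
list(flatten(samples))
# -> ['data/ctrl_s1.R1.fastq', 'data/ko_s3.R1.fastq']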
Example #8
def _build_command(input_file, tagdust_config, config):
    cl = [tagdust_config["program"], flatten_options(tagdust_config)]
    if "clean" in tagdust_config["keep"]:
        cl += ["-o", _build_output_file(input_file, "clean", config)]
    if "dirty" in tagdust_config["keep"]:
        cl += ["-a", _build_output_file(input_file, "dirty", config)]
    cl += [tagdust_config["contaminants"], input_file]
    return list(map(str, flatten(cl)))
Example #9
def _find_input_files(config):
    """ find all of the fastq files by identifier """
    input_dirs = config["input_dirs"]
    identifier = config["sample_parse"]["identifier"]
    input_files = [
        glob.glob(os.path.join(config["dir"]["data"], input_dir, identifier))
        for input_dir in input_dirs
    ]
    return list(flatten(input_files))
Example #10
def _parse(config):
    # handle the adapters, defaulting to illumina and a poly-a trimmer
    # if none are provided
    adapters = []
    adapters += flatten(map(_get_adapter, config.get("adapters", [])))
    # add built in platform if available
    platform = config.get("platform", None)
    if platform:
        adapters += flatten(map(_get_platform_adapters, [p for p in platform if p in ADAPTERS]))
    # default to illumina and poly A
    if not adapters:
        adapters += flatten(map(_get_platform_adapters, ["illumina", "polya"]))

    arguments = []
    arguments += adapters
    # grab everything else
    arguments += config.get("options", [])
    return map(str, list(flatten(arguments)))
Example #11
def multi_intersect(in_files, options=None, out_file=None):
    """ reports the intersection of multiple bed files """

    if options is None:
        options = []

    cmd = ["multiIntersectBed", options, "-i", in_files]
    cmd = flatten(cmd)
    cmd = map(str, cmd)

    out_file = _run_command(in_files, cmd, suffix=".intersect.bed",
                            out_file=out_file)
    return out_file
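flatten does double duty here: it splices options into the command and expands in_files when several BED files follow "-i". A small illustration (file names hypothetical):

list(flatten(["multiIntersectBed", [], "-i", ["a.bed", "b.bed", "c.bed"]]))
# -> ['multiIntersectBed', '-i', 'a.bed', 'b.bed', 'c.bed']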
Example #12
def _build_command(in_file, stage_config, config):
    cmd = ["java", "-jar", stage_config["program"]]
    out_dir = os.path.join(config["dir"]["results"],
                           stage_config.get("name", "rna_seqc"),
                           get_stem(in_file))
    safe_makedir(out_dir)
    cmd += ["-o", out_dir]
    sample = "|".join([get_stem(in_file), in_file, "rna_seqc"])
    cmd += ["-s", sample]
    cmd += ["-r", config["ref_fasta"]]
    cmd += ["-t", config["gtf"]]
    cmd += [stage_config.get("options", [])]
    return list(flatten(cmd))
Example #13
def run(input_file, gtf_file, options=None, out_file=None):
    if options is None:
        options = []
    if out_file is None:
        out_file = _get_outfilename(input_file)

    safe_makedir(os.path.dirname(out_file))

    if file_exists(out_file):
        return out_file

    cmd = map(str, flatten(["htseq-count", options, input_file, gtf_file]))
    with open(out_file, "w") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)

    return out_file
Example #14
File: trim.py Project: roryk/bipy
 def __call__(self, in_file):
     raise NotImplementedError("Waiting to hear back from maintainer to "
                               "handle multiple adapters before finishing.")
     adapters = list(flatten(map(self.get_adapters, self.chemistry)))
     # if it is a list assume these are pairs
     if isinstance(in_file, list):
         out_files = map(self._in2out, in_file)
         if all(map(file_exists, out_files)):
             return out_files
         self.trim_galore(in_file, self.options, adapters, paired=True)
         return out_files
     # if it is only one file just run it
     else:
         out_file = self._in2out(in_file)
         if file_exists(out_file):
             return out_file
         self.trim_galore(in_file, self.options, adapters)
         return out_file
Example #15
def _build_command(input_file, novoindex_config, output_file):
    options = novoindex_config["options"].items()
    cmd = map(str, flatten(["novoindex", options, output_file, input_file]))
    return cmd
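The .items() call turns the options mapping into (flag, value) pairs, flatten interleaves them into the command, and map(str, ...) converts non-string values such as integers. A worked illustration (option names invented):

options = [("-k", 14), ("-s", 1)]  # what novoindex_config["options"].items() might yield
list(map(str, flatten(["novoindex", options, "out.nix", "ref.fa"])))
# -> ['novoindex', '-k', '14', '-s', '1', 'out.nix', 'ref.fa']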
Example #16
File: trim.py Project: roryk/bipy
 def get_adapters(self, chemistry):
     return list(flatten([["-a", x] for x in ADAPTERS.get(chemistry, [])]))
Example #17
def main(config_file):
    if config_file:
        with open(config_file) as in_handle:
            config = yaml.load(in_handle)

    dirs = config["in_dir"]
    conditions = config["conditions"]
    glob_string = config["glob_string"]

    files = list(flatten([glob.glob(os.path.join(x, glob_string)) for x in dirs]))
    out_dir = config["dir"]["results"]
    safe_makedir(out_dir)

    curr_files = []
    for condition in conditions:
        condition_files = [x for x in files if condition in x]
        out_file = os.path.join(out_dir, condition + "_v2_v3.bam")
        print "Combining %s into %s." % (condition_files, out_file)
        sh.samtools.merge(list(flatten([out_file, condition_files])))
        #        bsub_call = list(flatten(["-q", "hsph", "-o", "out" + condition, "-e", "err" + condition, "samtools", "merge", out_file, condition_files]))
        #sh.bsub(bsub_call)
        sorted_prefix = remove_suffix(out_file) + ".sorted"
        sorted_file = sorted_prefix + ".bam"
        sh.samtools.sort(out_file, sorted_prefix)
        sh.samtools.index(sorted_file)
        mapped_file = append_stem(sorted_file, "mapped")
        sh.samtools.view(sorted_file, F=4, b=True, o=mapped_file)
        sh.samtools.index(mapped_file)

        # find the reads that don't intersect with the rrna
        in_file = mapped_file
        out_file = os.path.join(out_dir, condition + "_noribo" + "_v2_v3.bam")
        ribo = config["ribo"]
        print "Filtering %s for rRNA in %s into %s." % (in_file, ribo, out_file)
        sh.bedtools.intersect("-abam", in_file, "-v", "-b", ribo, _out=out_file)
        filtered_file = out_file

        print "Calculating RNASeq metrics on %s." % (out_file)
        in_file = out_file
        ref = blastn.prepare_ref_file(config["stage"]["new_coverage"]["ref"],
                                      config)
        ribo = config["stage"]["new_coverage"]["ribo"]
        picard = BroadRunner(config["program"]["picard"])
        out_dir = os.path.join(config["dir"]["results"], "new_coverage")
        safe_makedir(out_dir)
        out_file = replace_suffix(os.path.basename(in_file), "metrics")
        out_file = os.path.join(out_dir, out_file)
        metrics_file = picardrun.picard_rnaseq_metrics(picard, in_file, ref,
                                                       ribo, out_file)

        jelly_dir = os.path.join(config["dir"]["results"], "jellyfish")
        safe_makedir(jelly_dir)
        # convert the filtered file to fastq for jellyfish counting
        fastq_file = os.path.join(jelly_dir,
                                  os.path.basename(replace_suffix(filtered_file,
                                                                  "fastq")))
        sh.bam2fastx(filtered_file, fastq=True, _out=fastq_file)
        for mer in config["stage"]["jellyfish"]["mer_lengths"]:
            base, _ = os.path.splitext(os.path.basename(fastq_file))
            out_prefix = base + "_%dmer" % (mer)
            out_file = os.path.join(jelly_dir, out_prefix)
            if not file_exists(out_file):
                sh.jellyfish.count(fastq_file,
                                   config["stage"]["jellyfish"]["options"],
                                   m=mer, o=out_file)
Example #18
def main(config_file):
    # load yaml config file
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    from bipy.cluster import view

    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    in_dirs = map(os.path.dirname, var_dirs)
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing only load 3
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs

    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [
            genotype.combine_variant_files(curr_files, out_file,
                                           config["ref"]["fasta"], config)
        ]

    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))

    curr_files = filter(file_exists, curr_files)

    # load the files into gemini not in parallel
    # don't run in parallel

    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    curr_files = view.map(sort_vcf, curr_files)
    # don't run the rest of this in parallel, so take the cluster down
    stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [
            genotype.combine_variant_files(curr_files, out_file,
                                           config["ref"]["fasta"], config)
        ]

    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    curr_files = map(geminiloader, curr_files)
    logger.info("Run complete.")
Example #19
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    input_dirs = config["input_dirs"]

    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input_files: %s" % (input_files))
    logger.info("Condition groups %s" %(conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                _emit_stage_message(stage, curr_files)
                fastqc_config = _get_stage_config(config, stage)
                fastqc_args = zip(*product(curr_files, [fastqc_config],
                                           [config]))
                view.map(fastqc.run, *fastqc_args)

            if stage == "cutadapt":
                _emit_stage_message(stage, curr_files)
                cutadapt_config = _get_stage_config(config, stage)
                cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                             [config]))
                cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
                curr_files = cutadapt_outputs
                logger.info("Fixing mate pair information.")
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("Forward: %s" % (first))
                logger.info("Reverse: %s" % (second))
                fixed = view.map(fastq.fix_mate_pairs_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "sickle":
                _emit_stage_message(stage, curr_files)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                fixed = view.map(sickle.run_with_config,
                                 first, second, [config] * len(first))
                curr_files = list(flatten(fixed))

            if stage == "tophat":
                _emit_stage_message(stage, curr_files)
                tophat_config = _get_stage_config(config, stage)
                pairs = combine_pairs(curr_files)
                first = [x[0] for x in pairs]
                second = [x[1] for x in pairs]
                logger.info("first %s" % (first))
                logger.info("second %s" % (second))

                #tophat_args = zip(*product(first, second, [config["ref"]],
                #                           ["tophat"], [config]))
                tophat_outputs = view.map(tophat.run_with_config,
                                          first, second,
                                          [config["ref"]] * len(first),
                                          ["tophat"] * len(first),
                                          [config] * len(first))
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." % (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"], config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x),
                                            "metrics") for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun,
                                     curr_files,
                                     [ref] * nrun,
                                     [ribo] * nrun,
                                     out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                """
                                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart,
                         *annotate_args)
                         """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir, comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f,
                                                      None,
                                                      out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing ot %s" % (combined_out,
                                               conditions,
                                               deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c], [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0],
                                                 "id",
                                                 "ensembl_gene_id",
                                                 "human")
            #annotated_file = view.map(annotate.annotate_table_with_biomart,
            #                          [deseq_out],
            #                          ["id"],
            #                          ["ensembl_gene_id"],
            #                          ["human"])

    # end gracefully
    stop_cluster()
Example #20
def _build_command(input_file, out_prefix, jellyfish_config):
    cmd = ["jellyfish", jellyfish_config["task"], jellyfish_config["options"]]
    cmd += ["-o", out_prefix, input_file]
    return list(flatten(cmd))
Example #21
def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)

    data_dir = config["dir"]["data"]
    from bipy.cluster import view
    input_files = [glob.glob(os.path.join(data_dir, x, "*_rep*")) for x in
                   config["input_dirs"]]
    input_files = list(flatten(input_files))
    logger.info("Input files to process: %s" % (input_files))
    results_dir = config["dir"]["results"]

    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    for stage in config["run"]:
        if stage == "fastqc":
            nfiles = len(curr_files)
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = _get_stage_config(config, stage)
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * nfiles,
                                      [config] * nfiles)

        if stage == "cutadapt":
            nfiles = len(curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_outputs = view.map(cutadapt_tool.run,
                                        curr_files,
                                        [cutadapt_config] * nfiles,
                                        [config] * nfiles)
            curr_files = cutadapt_outputs

        if stage == "novoalign":
            nfiles = len(curr_files)
            novoalign_config = _get_stage_config(config, stage)
            #db = novoindex.run(config["ref"],
            #                   _get_stage_config(config, "novoindex"),
            #                   config)
            db = config["genome"]["file"]
            novoalign_outputs = view.map(novoalign.run, curr_files,
                                         [db] * nfiles,
                                         [novoalign_config] * nfiles,
                                         [config] * nfiles)
            picard = BroadRunner(config["program"]["picard"])
            args = zip(*itertools.product([picard], novoalign_outputs))
            # convert to bam
            bamfiles = view.map(picardrun.picard_formatconverter,
                                *args)
            args = zip(*itertools.product([picard], bamfiles))
            # sort bam
            sorted_bf = view.map(picardrun.picard_sort, *args)
            # index bam
            args = zip(*itertools.product([picard], sorted_bf))
            view.map(picardrun.picard_index, *args)
            curr_files = novoalign_outputs

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s" %(curr_files))
            htseq_outputs = curr_files
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)
            rpkm = htseq_count.calculate_rpkm(combined_out,
                                              config["annotation"]["file"])
            rpkm_file = os.path.join(config["dir"]["results"], stage,
                                     "rpkm.txt")
            rpkm.to_csv(rpkm_file, sep="\t")

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x),
                                        "metrics") for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun,
                                 curr_files,
                                 [ref] * nrun,
                                 [ribo] * nrun,
                                 out_files)

        if stage == "deseq":
            conditions = [os.path.basename(x).split("_")[0] for x in
                          input_files]
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in deseq_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions) if
                           y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                logger.info(htseq_files)
                logger.info(htseq_columns)
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                deseq_conds = [conditions[index] for index in indexes]
                deseq_prefix = os.path.join(out_dir, comparison_name)

                deseq_out = view.map(deseq.run, [combined_out],
                                     [deseq_conds], [deseq_prefix])
                logger.info("Annotating %s." % (deseq_out))
                annotated_file = view.map(annotate.annotate_table_with_biomart,
                                          deseq_out,
                                          ["id"],
                                          ["ensembl_gene_id"],
                                          ["human"])

        if stage == "dss":
            conditions = [os.path.basename(x).split("_")[0] for x in
                          input_files]
            dss_config = _get_stage_config(config, stage)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for comparison in dss_config["comparisons"]:
                comparison_name = "_vs_".join(comparison)
                out_dir = os.path.join(results_dir, stage, comparison_name)
                safe_makedir(out_dir)
                # get the indexes of the conditions that match this comparison
                indexes = [x for x, y in enumerate(conditions) if
                           y in comparison]
                # find the htseq_files to combine and combine them
                htseq_files = [htseq_outputs[index] for index in indexes]
                htseq_columns = [column_names[index] for index in indexes]
                out_file = os.path.join(out_dir,
                                        comparison_name + ".counts.txt")
                combined_out = htseq_count.combine_counts(htseq_files,
                                                          htseq_columns,
                                                          out_file)
                dss_conds = [conditions[index] for index in indexes]
                dss_prefix = os.path.join(out_dir, comparison_name)
                logger.info("Running DSS on %s with conditions %s and comparison %s." % (combined_out, dss_conds, comparison))

                dss_out = dss.run(combined_out, dss_conds, comparison,
                                  dss_prefix)

    stop_cluster()