Ejemplo n.º 1
0
    def run_single(self, reads_ref, params):
        """
        Performs a single run of HISAT2 against a single reads reference. The rest of the info
        is taken from the params dict - see the spec for details.
        """
        # 1. Get hisat2 index from genome.
        #    a. If it exists in cache, use that.
        #    b. Otherwise, build it
        idx_prefix = self.build_index(params["genome_ref"])

        # 2. Fetch the reads file and deal make sure input params are correct.
        reads = fetch_reads_from_reference(reads_ref["ref"], self.callback_url)
        # if the reads ref came from a different sample set, then we need to drop that
        # reference inside the reads info object so it can be linked in the alignment
        if reads_ref["ref"] != params["sampleset_ref"]:
            reads["sampleset_ref"] = params["sampleset_ref"]
        # make sure condition info carries over if we have it
        if "condition" in reads_ref:
            reads["condition"] = reads_ref["condition"]
        elif "condition" in params:
            reads["condition"] = params["condition"]
        reads["name"] = reads_ref["name"]
        output_file = "accepted_hits"

        # 3. Finally all set, do the alignment and upload the output.
        alignment_file = self.run_hisat2(idx_prefix,
                                         reads,
                                         params,
                                         output_file=output_file)
        alignment_name = reads["name"] + params["alignment_suffix"]
        output_ref = self.upload_alignment(params, reads, alignment_name,
                                           alignment_file)
        alignment_set_ref = None
        if is_set(params["sampleset_ref"], self.workspace_url):
            # alignment_items, alignmentset_name, ws_name
            set_name = get_object_names(
                [params["sampleset_ref"]],
                self.workspace_url)[params["sampleset_ref"]]
            alignment_set_name = set_name + params["alignmentset_suffix"]
            alignment_set_ref = self.upload_alignment_set(
                [{
                    "ref": output_ref,
                    "label": reads["condition"]
                }], alignment_set_name, params["ws_name"])
        alignments = dict()
        alignments[reads_ref["ref"]] = {
            "ref": output_ref,
            "name": alignment_name
        }
        os.remove(reads["file_fwd"])
        if "file_rev" in reads:
            os.remove(reads["file_rev"])
        return (alignments, output_ref, alignment_set_ref)
Ejemplo n.º 2
0
    def run_batch(self, reads_refs, params):
        """
        Runs HISAT2 in batch mode.
        reads_refs should be a list of dicts, where each looks like the following:
        {
            "ref": reads object reference,
            "condition": condition for that ref (string)
        }
        """
        # build task list and send it to KBParallel
        tasks = list()
        set_name = get_object_names(
            [params["sampleset_ref"]],
            self.workspace_url)[params["sampleset_ref"]]
        for idx, reads_ref in enumerate(reads_refs):
            single_param = dict(params)  # need a copy of the params
            single_param["build_report"] = 0
            single_param["sampleset_ref"] = reads_ref["ref"]
            if "condition" in reads_ref:
                single_param["condition"] = reads_ref["condition"]
            else:
                single_param["condition"] = "unspecified"

            tasks.append({
                "module_name": "kb_hisat2",
                "function_name": "run_hisat2",
                "version": self.my_version,
                "parameters": single_param
            })
        # UNCOMMENT BELOW FOR LOCAL TESTING
        batch_run_params = {
            "tasks": tasks,
            "runner": "parallel",
            # "concurrent_local_tasks": 3,
            # "concurrent_njsw_tasks": 0,
            "max_retries": 2
        }
        parallel_runner = KBParallel(self.callback_url)
        results = parallel_runner.run_batch(batch_run_params)["results"]
        alignment_items = list()
        alignments = dict()
        for idx, result in enumerate(results):
            # idx of the result is the same as the idx of the inputs AND reads_refs
            if result["is_error"] != 0:
                raise RuntimeError(
                    "Failed a parallel run of HISAT2! {}".format(
                        result["result_package"]["error"]))
            reads_ref = tasks[idx]["parameters"]["sampleset_ref"]
            alignment_items.append({
                "ref":
                result["result_package"]["result"][0]["alignment_objs"]
                [reads_ref]["ref"],
                "label":
                reads_refs[idx].get("condition",
                                    params.get("condition", "unspecified"))
            })
            alignments[reads_ref] = result["result_package"]["result"][0][
                "alignment_objs"][reads_ref]
        # build the final alignment set
        output_ref = self.upload_alignment_set(
            alignment_items, set_name + params["alignmentset_suffix"],
            params["ws_name"])
        return (alignments, output_ref)