Esempio n. 1
0
    def _filter(self, data):
        """Filter data to contain the required fields."""
        if self.select_expr is None:
            return data

        opts = jmespath.Options(custom_functions=JMESExtensions(data))
        return jmespath.search(self.select_expr, data, opts)
Esempio n. 2
0
    def run(self, data, config=None, pipeline=None):
        """
        Read metadata fields from the mmCIF file of a structure.

        The identifier returned by `self.get_vals` is used to look up a
        ``.cif`` or ``.cif.gz`` file below `self.mmcif_dir`. The file is
        parsed into a dictionary, and every JMESPath expression in
        `self.fields` is evaluated against that dictionary. Each result is
        stored in ``data["metadata"]`` under the corresponding field name.

        :raises FileNotFoundError: If no mmCIF file could be located.
        """
        pdb_id = self.get_vals(data)
        mmcif_file = phyre_engine.tools.pdb.find_pdb(
            pdb_id,
            suffix_list=(".cif", ".cif.gz"),
            base_dir=self.mmcif_dir)

        # The "metadata" dictionary is created even when the lookup below
        # fails, so it is present in `data` before any error is raised.
        data.setdefault("metadata", {})
        if mmcif_file is None:
            message = "Could not find mmCIF file {} in {}".format(
                pdb_id, self.mmcif_dir)
            raise FileNotFoundError(message)

        with phyre_engine.tools.pdb.open_pdb(mmcif_file) as mmcif_in:
            # Optionally pass the open handle through the prefilter first.
            handle = self._prefilter(mmcif_in) if self.prefilter else mmcif_in
            mmcif_dict = Bio.PDB.MMCIF2Dict.MMCIF2Dict(handle)

            # Extension functions are rooted at the parsed mmCIF dictionary,
            # not at the pipeline state.
            opts = jmespath.Options(
                custom_functions=JMESExtensions(mmcif_dict))
            for field, expression in self.fields.items():
                data["metadata"][field] = jmespath.search(
                    expression, mmcif_dict, opts)

        return data
Esempio n. 3
0
    def run(self, data, config=None, pipeline=None):
        """
        Run ``ffindex_modify`` on the indices of an hhsuite database.

        The ``name`` of every template selected by `self.select_expr` is
        written, one per line, to a temporary file. That file is then handed
        to ``ffindex_modify`` (with the ``sort`` and ``unlink`` flags) once
        for each of the ``a3m``, ``hhm`` and ``cs219`` index files derived
        from `self.db_prefix`. The pipeline state is returned unchanged.
        """
        extensions = JMESExtensions(data)
        selected = jmespath.search(
            self.select_expr, data,
            jmespath.Options(custom_functions=extensions))

        with tempfile.NamedTemporaryFile("w") as name_list:
            # List the IDs to remove, then flush so that the external tool
            # sees the complete list when it opens the file by name.
            for entry in selected:
                print(entry["name"], file=name_list)
            name_list.flush()

            prefix = Path(self.db_prefix)
            for suffix in ("a3m", "hhm", "cs219"):
                index_path = Path("{}_{}.ffindex".format(prefix, suffix))
                command = tools.ffindex_modify(
                    (self.bin_dir, "ffindex_modify"),
                    options={"file_list": name_list.name},
                    flags=["sort", "unlink"],
                    positional=[index_path])
                self.logger.debug("Running command %s", command)
                tools.run(command, check=True)
        return data
Esempio n. 4
0
 def key_fn(datum):
     """
     Getter closure evaluating `jmespath_key` against `datum`.

     When `allow_none` is set, the result is wrapped in an
     ``(is_none, value)`` tuple; otherwise the bare value is returned.
     """
     extensions = JMESExtensions(root)
     value = jmespath.search(
         jmespath_key, datum,
         jmespath.Options(custom_functions=extensions))
     # The leading flag makes tuples for ``None`` results compare greater
     # than tuples for any other result.
     return (value is None, value) if allow_none else value
Esempio n. 5
0
    def run(self, data, config=None, pipeline=None):
        """
        Replace the result of `self.select_expr` with `self.value_expr`.

        `self.select_expr` is evaluated against the pipeline state, and
        `self.value_expr` is evaluated against that selection. A selected
        mapping is emptied and refilled from the new mapping; a selected
        sequence has its contents replaced by slice assignment. Mismatched
        types are reported through `self._type_error`, and any other kind
        of selection through `self._invalid_selection`.
        """
        extensions = JMESExtensions(data)
        opts = jmespath.Options(custom_functions=extensions)

        target = jmespath.search(self.select_expr, data, opts)
        replacement = jmespath.search(self.value_expr, target, opts)

        if isinstance(target, collections.abc.Mapping):
            if not isinstance(replacement, collections.abc.Mapping):
                self._type_error(target, replacement)
            # If both names refer to the same object, clearing it would
            # also wipe the replacement, so it is left as it is.
            if target is not replacement:
                target.clear()
                target.update(replacement)
            return data

        if not isinstance(target, collections.abc.Sequence):
            self._invalid_selection(target)
            return data

        if not isinstance(replacement, collections.abc.Sequence):
            self._type_error(target, replacement)
        self.logger.info(
            ("Replacing result of '%s' (a list of length %d) "
             "with a %d-element list"),
            self.select_expr, len(target), len(replacement))
        # Slice assignment changes the selected list in place.
        target[:] = replacement
        return data
Esempio n. 6
0
 def generate_config(self, data, config):
     """
     Build the child pipeline configuration.

     Each key of `self.mapping` is a JMESPath expression evaluated against
     the pipeline state `data`; the result is written into `config` at the
     dotted location given by the corresponding value. A `config` of
     ``None`` is replaced by a new, empty dictionary. The (possibly new)
     configuration is returned.
     """
     if config is None:
         config = {}
     extensions = JMESExtensions(data)
     opts = jmespath.Options(custom_functions=extensions)
     for expression, destination in self.mapping.items():
         found = jmespath.search(expression, data, opts)
         apply_dotted_key(config, destination, found)
     return config
Esempio n. 7
0
    def run(self, data, config=None, pipeline=None):
        """
        Run em4gmm for automatic clustering.

        Samples are selected from the pipeline state with
        `self.select_expr`, and the coordinates of each sample are taken by
        evaluating `self.dimensions_expr` on it. The points are written to a
        temporary file which is passed to the ``gmmtrain`` and ``gmmclass``
        tools in turn. Afterwards the JSON file named by
        ``self.gmmtrain_opts["model_details"]`` is stored in
        ``data["clusters"]``, and the ``cluster`` and ``lprob`` fields of
        each selected sample are filled in from the JSON file named by
        ``self.gmmclass_opts["sample_details"]``.

        :raises IndexError: If no samples are selected, because the number
            of dimensions is read from the first data point.
        """
        extensions = JMESExtensions(data)
        opts = jmespath.Options(custom_functions=extensions)

        # Pick the samples, then the coordinates of every sample.
        samples = jmespath.search(self.select_expr, data, opts)
        points = []
        for sample in samples:
            points.append(
                jmespath.search(self.dimensions_expr, sample, opts))

        point_count = len(points)
        # Dimensionality is read from the first point only.
        dimensions = len(points[0])

        with tempfile.NamedTemporaryFile("w") as sample_file:
            # Header line holds "<dimensions> <samples>"; each following
            # line holds the space-separated coordinates of one point.
            print("{} {}".format(dimensions, point_count), file=sample_file)
            for point in points:
                print(" ".join(map(str, point)), file=sample_file)
            sample_file.flush()

            # Training step. Configured options are applied after the
            # sample file option and so may override it.
            train_opts = dict(samples=sample_file.name)
            train_opts.update(self.gmmtrain_opts)
            train_cmd = self.GMMTRAIN((self.bin_dir, "gmmtrain"),
                                      options=train_opts)
            self.logger.debug("Running %s", train_cmd)
            subprocess.run(train_cmd, check=True)

            # Classification step, reading the same sample file.
            class_opts = dict(samples=sample_file.name)
            class_opts.update(self.gmmclass_opts)
            class_cmd = self.GMMCLASS((self.bin_dir, "gmmclass"),
                                      options=class_opts)
            self.logger.debug("Running %s", class_cmd)
            subprocess.run(class_cmd, check=True)

        # Cluster definitions come from the trainer's model details file.
        with open(self.gmmtrain_opts["model_details"], "r") as model_in:
            data["clusters"] = json.load(model_in)

        # Per-sample results are matched to the selected samples by index.
        with open(self.gmmclass_opts["sample_details"], "r") as samples_in:
            for details in json.load(samples_in)["samples_results"]:
                annotated = samples[details["sample"]]
                annotated["cluster"] = details["class"]
                annotated["lprob"] = details["lprob"]
        return data
Esempio n. 8
0
    def run(self, data, config=None, pipeline=None):
        """
        Collect and index the files that form an hhsuite database.

        The templates selected by `self.select_expr` are sorted in place by
        the length of their ``sequence``. For each of the ``a3m``, ``hhm``
        and ``cs219`` file types, the per-template file paths are listed in
        a temporary file and passed to ``ffindex_build`` (flags ``sort`` and
        ``append``), producing ``<db_prefix>_<type>.ffdata`` and
        ``<db_prefix>_<type>.ffindex``. If `self.overwrite` is set, existing
        files of those names are deleted first. Finally each index is
        trimmed with `self._trim_index_names`, and the database prefix is
        stored in ``data["database"]``.
        """
        extensions = JMESExtensions(data)
        opts = jmespath.Options(custom_functions=extensions)
        templates = jmespath.search(self.select_expr, data, opts)

        # Shortest sequences first; this sorts the selected list itself.
        templates.sort(key=lambda entry: len(entry["sequence"]))

        # Ensure the directory that will hold the database exists.
        Path(self.db_prefix).parent.mkdir(parents=True, exist_ok=True)

        db_prefix = Path(self.db_prefix)
        built = {}
        for kind in ("a3m", "hhm", "cs219"):
            db_name = Path(str(db_prefix) + "_" + kind)
            ffindex = Path(str(db_name) + ".ffindex")
            ffdata = Path(str(db_name) + ".ffdata")

            if self.overwrite:
                # Start from scratch: drop any previous index and data file.
                for stale in (ffindex, ffdata):
                    if stale.exists():
                        stale.unlink()

            with tempfile.NamedTemporaryFile("w") as listing:
                # One path per template for the current file type.
                for entry in templates:
                    print(entry[kind], file=listing)
                listing.flush()

                # The listing tells ffindex_build which files go into the
                # database.
                command = tools.ffindex_build(
                    (self.bin_dir, "ffindex_build"),
                    positional=[ffdata, ffindex],
                    flags=["sort", "append"],
                    options={"file_list": listing.name})
                self.logger.debug("Running command %s", command)
                tools.run(command, check=True)
                built[kind] = db_name

        # Cut useless information from the indices of each database.
        for db_name in built.values():
            self._trim_index_names(db_name)

        data["database"] = str(db_prefix)
        return data
Esempio n. 9
0
    def run(self, data, config=None, pipeline=None):
        """
        Sort the list selected by `self.field` within the pipeline state.

        Every entry of `self.keys` supplies a ``key`` expression together
        with optional ``reverse`` and ``allow_none`` settings, both of which
        default to ``False``. The sorted result is written back into the
        selected list by slice assignment.
        """
        extensions = JMESExtensions(data)
        opts = jmespath.Options(custom_functions=extensions)
        ordered = jmespath.search(self.field, data, opts)

        # Keys are applied from last to first: sorting is stable, so the
        # first key ends up as the most significant one.
        for spec in reversed(self.keys):
            ordered = jmes_sort(
                ordered,
                spec["key"],
                root=data,
                reverse=spec.get("reverse", False),
                allow_none=spec.get("allow_none", False))

        # Look the list up again and overwrite its contents in place.
        destination = jmespath.search(self.field, data, opts)
        destination[:] = ordered
        return data
Esempio n. 10
0
    def run(self, data, config=None, pipeline=None):
        """
        Update the result of `self.select_expr` with `self.value_expr`.

        If the selection is a mapping, `self.value_expr` is evaluated
        against it and the resulting mapping is merged into it. If the
        selection is a sequence, the same is done separately for each of
        its items. Any other selection is passed to
        `self._invalid_selection`.

        NOTE(review): unlike the mapping case, the per-item values of a
        sequence are not type-checked before being passed to ``update``.
        """
        extensions = JMESExtensions(data)
        opts = jmespath.Options(custom_functions=extensions)
        selection = jmespath.search(self.select_expr, data, opts)

        if isinstance(selection, collections.abc.Mapping):
            addition = jmespath.search(self.value_expr, selection, opts)
            if not isinstance(addition, collections.abc.Mapping):
                self._type_error(selection, addition)
            selection.update(addition)
            return data

        if isinstance(selection, collections.abc.Sequence):
            for element in selection:
                element.update(
                    jmespath.search(self.value_expr, element, opts))
            return data

        self._invalid_selection(selection)
        return data
Esempio n. 11
0
    def run(self, data, config=None, pipeline=None):
        """
        Write the records selected by `self.select_expr` as a CSV file.

        Column names are the sorted keys of the first record; fields missing
        from later records are written as `self.null_placeholder`. A header
        row is emitted only if `self.header` is set. When nothing is
        selected, a single ``# No results`` line is written instead. Every
        record is passed through `self._fill_placeholders` before it is
        written. The pipeline state is returned.
        """
        extensions = JMESExtensions(data)
        opts = jmespath.Options(custom_functions=extensions)
        records = jmespath.search(self.select_expr, data, opts)

        with Stream(self.output, "w") as csv_out:
            if records:
                # The first record alone decides which columns exist.
                columns = sorted(records[0])
                writer = csv.DictWriter(
                    csv_out,
                    fieldnames=columns,
                    restval=self.null_placeholder)
                if self.header:
                    writer.writeheader()
                for row in records:
                    self._fill_placeholders(row)
                    writer.writerow(row)
            else:
                print("# No results", file=csv_out)

        return data
Esempio n. 12
0
    def _sets(self, root):
        """
        Build one set of identifiers per expression in `self.jmespath_sets`.

        Every expression is evaluated against `root`, and
        `self.jmespath_key` is evaluated on each resulting item to obtain
        its identifier. Returns a ``(key_sets, key_map)`` tuple: `key_sets`
        holds the identifier sets in the same order as
        `self.jmespath_sets`, and `key_map` maps each identifier to one of
        the items that produced it.

        NOTE(review): identifiers that are sequences are converted to
        tuples so they can be hashed. Strings are sequences as well, so a
        string identifier becomes a tuple of its characters.
        """
        opts = jmespath.Options(custom_functions=JMESExtensions(root))
        key_map = {}
        sets_in_reverse = []

        # Expressions are handled last-to-first so that entries written to
        # key_map by earlier sets overwrite those from later sets: elements
        # of the first set are preferred over elements of the last.
        for set_expr in reversed(self.jmespath_sets):
            seen = set()
            for item in jmespath.search(set_expr, root, opts):
                ident = jmespath.search(self.jmespath_key, item, opts)
                if isinstance(ident, collections.abc.Sequence):
                    ident = tuple(ident)
                # Within one set, the first item carrying an identifier is
                # the one recorded for it.
                if ident not in seen:
                    key_map[ident] = item
                    seen.add(ident)
            sets_in_reverse.append(seen)

        # Restore the original ordering of the sets.
        return sets_in_reverse[::-1], key_map
Esempio n. 13
0
 def search(self, expr, data):
     """
     Evaluate `expr` against `data` via :py:func:`jmespath.search`, with
     the extended functions (rooted at `data`) enabled.
     """
     return jmespath.search(
         expr, data,
         jmespath.Options(custom_functions=JMESExtensions(data)))
Esempio n. 14
0
 def run(self, data, config=None, pipeline=None):
     """
     Shuffle, in place, the list selected by `self.field` from the
     pipeline state, using `self.random`. The pipeline state is returned.
     """
     extensions = JMESExtensions(data)
     opts = jmespath.Options(custom_functions=extensions)
     self.random.shuffle(jmespath.search(self.field, data, opts))
     return data