Example #1
0
class SMARTSGraph(nx.Graph):
    """A graph representation of a SMARTS pattern.

    Every ``atom`` subtree of the parsed SMARTS string becomes a node
    (numbered in the order ``ast.iter_subtrees_topdown()`` yields them) and
    every bond, branch attachment and ring closure becomes an edge.

    Parameters
    ----------
    smarts_string : str
        The SMARTS string outlined in the force field
    parser : foyer.smarts.SMARTS, optional
        The parser whose grammar rules convert the SMARTS string
        into the AST. A fresh ``SMARTS()`` parser is used when omitted.
    name : str, optional
    overrides : set, optional
        Rules or SMARTSGraph over which this SMARTSGraph takes precedence
    typemap : dict, optional
        Mapping indexed by atom index; entries are read through their
        ``"whitelist"`` and ``"cycles"`` keys while evaluating ``has_label``,
        ``ring_size`` and ``ring_count`` primitives.
    *args, **kwargs
        Forwarded unchanged to the ``nx.Graph`` constructor.

    Attributes
    ----------
    ast
        The tree returned by ``parser.parse(smarts_string)``.
    _graph_matcher : smarts_graph.SMARTSMatcher
        implementation of VF2 that handles subgraph matching. Created lazily
        on the first call to `find_matches`.

    Notes
    -----
    SMARTSGraph inherits from networkx.Graph, available features can be found
    at networkx.org/documentation/stable/reference/classes/graph.html
    """

    # Because the first atom in a SMARTS string is always the one we want to
    # type, the graph's nodes need to be ordered.

    def __init__(self,
                 smarts_string,
                 parser=None,
                 name=None,
                 overrides=None,
                 typemap=None,
                 *args,
                 **kwargs):
        super(SMARTSGraph, self).__init__(*args, **kwargs)

        self.smarts_string = smarts_string
        self.name = name
        self.overrides = overrides
        self.typemap = typemap

        active_parser = SMARTS() if parser is None else parser
        self.ast = active_parser.parse(smarts_string)

        # Maps id(atom subtree) -> node index so edges can be added later
        # from AST nodes alone.
        self._atom_indices = OrderedDict()
        self._add_nodes()
        self._add_edges(self.ast)
        self._add_label_edges()
        # Built on first use in `find_matches`.
        self._graph_matcher = None

    def _add_nodes(self):
        """Add all atoms in the SMARTS string as nodes in the graph."""
        atom_trees = (tree for tree in self.ast.iter_subtrees_topdown()
                      if tree.data == "atom")
        for index, atom in enumerate(atom_trees):
            self.add_node(index, atom=atom)
            self._atom_indices[id(atom)] = index

    def _add_edges(self, ast_node, trunk=None):
        """Add all bonds in the SMARTS string as edges in the graph.

        Parameters
        ----------
        ast_node
            Subtree whose direct children are scanned for atoms and branches.
        trunk
            The atom subtree that the next atom found should be bonded to, or
            None when no atom has been seen yet.
        """
        atom_indices = self._atom_indices
        for ast_child in ast_node.children:
            if ast_child.data == "atom":
                if trunk is not None:
                    self.add_edge(atom_indices[id(ast_child)],
                                  atom_indices[id(trunk)])
                # Subsequent atoms at this level chain off the latest atom.
                trunk = ast_child
            elif ast_child.data == "branch":
                # A branch hangs off the current trunk; the rebinding of
                # `trunk` inside the recursive call does not leak back here.
                self._add_edges(ast_child, trunk)

    def _add_label_edges(self):
        """Add edges between all atoms with the same atom_label in rings.

        Raises
        ------
        ValueError
            If a ring label digit does not occur exactly twice.
        """
        # We need each individual label and atoms with multiple ring labels
        # would yield e.g. the string '12' so split those up.
        label_digits = defaultdict(list)
        for _, attr in self.nodes(data=True):
            atom = attr["atom"]
            for label in atom.find_data("atom_label"):
                for digit in list(label.children[0]):
                    label_digits[digit].append(atom)

        for label, atoms in label_digits.items():
            if len(atoms) != 2:
                # Previously surfaced as an opaque tuple-unpacking ValueError.
                raise ValueError(
                    "Ring label '{}' occurs {} time(s) in SMARTS string "
                    "'{}'; every ring label must occur exactly twice.".format(
                        label, len(atoms), self.smarts_string))
            atom1, atom2 = atoms
            self.add_edge(self._atom_indices[id(atom1)],
                          self._atom_indices[id(atom2)])

    def _node_match(self, host, pattern):
        """Determine if two graph nodes are equal.

        ``host`` is the attribute dict of a topology node (its ``"atom_data"``
        and ``"bond_partners"`` entries are read); ``pattern`` is the
        attribute dict of a node of this graph.
        """
        atom_expr = pattern["atom"].children[0]
        return self._atom_expr_matches(atom_expr, host["atom_data"],
                                       host["bond_partners"])

    def _atom_expr_matches(self, atom_expr, atom, bond_partners):
        """Evaluate SMARTS string expressions.

        Recursively evaluates not/and/or expressions and delegates primitive
        tests to `_atom_id_matches`.

        Raises
        ------
        TypeError
            If ``atom_expr.data`` is not one of the handled expression kinds.
        """
        kind = atom_expr.data
        operands = atom_expr.children
        if kind == "not_expression":
            return not self._atom_expr_matches(operands[0], atom,
                                               bond_partners)
        if kind in ("and_expression", "weak_and_expression"):
            return (self._atom_expr_matches(operands[0], atom, bond_partners)
                    and self._atom_expr_matches(operands[1], atom,
                                                bond_partners))
        if kind == "or_expression":
            return (self._atom_expr_matches(operands[0], atom, bond_partners)
                    or self._atom_expr_matches(operands[1], atom,
                                               bond_partners))
        if kind == "atom_id":
            return self._atom_id_matches(operands[0], atom, bond_partners,
                                         self.typemap)
        if kind == "atom_symbol":
            return self._atom_id_matches(atom_expr, atom, bond_partners,
                                         self.typemap)
        raise TypeError("Expected atom_id, atom_symbol, and_expression, "
                        "or_expression, or not_expression. "
                        "Got {}".format(atom_expr.data))

    @staticmethod
    def _atom_id_matches(atom_id, atom, bond_partners, typemap):
        """Compare atomic indices, symbols, neighbors, rings.

        Returns the truth value of the primitive test described by
        ``atom_id`` for ``atom``. Returns None when ``atom_id.data`` is not a
        recognized primitive.

        Raises
        ------
        NotImplementedError
            For ``matches_string`` primitives.
        """
        atomic_num = atom.atomic_number
        atom_name = atom.name
        atom_idx = atom.index

        kind = atom_id.data
        if kind == "atomic_num":
            return atomic_num == int(atom_id.children[0])
        if kind == "atom_symbol":
            symbol = str(atom_id.children[0])
            if symbol == "*":
                return True
            if symbol.startswith("_"):
                # Store non-element elements in .name
                return atom_name == symbol
            return atomic_num == pt.AtomicNum[symbol]
        if kind == "has_label":
            # Strip the % sign from the beginning.
            label = atom_id.children[0][1:]
            return label in typemap[atom_idx]["whitelist"]
        if kind == "neighbor_count":
            return len(bond_partners) == int(atom_id.children[0])
        if kind == "ring_size":
            cycle_len = int(atom_id.children[0])
            return any(len(cycle) == cycle_len
                       for cycle in typemap[atom_idx]["cycles"])
        if kind == "ring_count":
            return len(typemap[atom_idx]["cycles"]) == int(
                atom_id.children[0])
        if kind == "matches_string":
            raise NotImplementedError("matches_string is not yet implemented")
        # Unrecognized primitives keep the historical (falsy) None result.
        return None

    def _first_atom_element(self):
        """Return the element hint for the first pattern atom, or None.

        A hint is only produced when the first atom contains exactly one
        ``atom_symbol`` and no ``not_expression``; otherwise None is returned.
        """
        atom = nx.get_node_attributes(self, name="atom")[0]
        symbols = list(atom.find_data("atom_symbol"))
        if len(symbols) != 1 or list(atom.find_data("not_expression")):
            return None
        try:
            return symbols[0].children[0]
        except IndexError:
            pass
        # Fallback for a childless atom_symbol. `next` gets a default so an
        # exhausted iterator cannot leak StopIteration into the calling
        # generator (which would become a RuntimeError under PEP 479).
        atomic_num_tree = next(atom.find_data("atomic_num"), None)
        if atomic_num_tree is None:
            return None
        try:
            return pt.Element[int(atomic_num_tree.children[0])]
        except IndexError:
            return None

    def find_matches(self, topology_graph, typemap):
        """Yield the atoms in a topology that match this SMARTS pattern.

        Parameters
        ----------
        topology_graph : TopologyGraph
            The topology that we are trying to atomtype.
        typemap : dict
            The target typemap being used/edited

        Yields
        ------
        The topology node matched to the first atom of the pattern, once per
        distinct node.

        Notes
        -----
        When this function gets used in atomtyper.py, we actively modify the
        white- and blacklists of the atoms in `topology` after finding a match.
        This means that between every successive call of
        `subgraph_isomorphisms_iter()`, the topology against which we are
        matching may have actually changed. Currently, we take advantage of this
        behavior in some edges cases (e.g. see `test_hexa_coordinated` in
        `test_smarts.py`).

        The matcher is created on the first call and then reused, so it stays
        bound to the `topology_graph` and `typemap` of that first call.

        """
        # Note: Needs to be updated in sync with the grammar in `smarts.py`.
        ring_tokens = ["ring_size", "ring_count"]
        has_ring_rules = any(
            list(self.ast.find_data(token)) for token in ring_tokens)
        topology_graph.add_bond_partners()
        _prepare_atoms(topology_graph, typemap, compute_cycles=has_ring_rules)

        if self._graph_matcher is None:
            self._graph_matcher = SMARTSMatcher(
                topology_graph,
                self,
                node_match=self._node_match,
                element=self._first_atom_element(),
                typemap=typemap,
            )

        matched_atoms = set()
        for mapping in self._graph_matcher.subgraph_isomorphisms_iter():
            # The matcher maps host atom -> pattern node; invert it so the
            # pattern node can be looked up.
            pattern_to_host = {
                node_id: atom_id
                for atom_id, node_id in mapping.items()
            }
            # The first node in the smarts graph always corresponds to the atom
            # that we are trying to match.
            atom_index = pattern_to_host[0]
            # Don't yield duplicate matches found via matching the pattern in a
            # different order.
            if atom_index not in matched_atoms:
                matched_atoms.add(atom_index)
                yield atom_index
Example #2
0
class SMARTSGraph(nx.Graph):
    """A graph representation of a SMARTS pattern.

    Every ``atom`` subtree of the parsed SMARTS string becomes a node
    (numbered in the order ``ast.iter_subtrees_topdown()`` yields them) and
    every bond, branch attachment and ring closure becomes an edge.

    Parameters
    ----------
    smarts_string : str
        The SMARTS string outlined in the force field
    parser : foyer.smarts.SMARTS, optional
        The parser whose grammar rules convert the SMARTS string
        into the AST. A fresh ``SMARTS()`` parser is used when omitted.
    name : str, optional
    overrides : set, optional
        Rules or SMARTSGraph over which this SMARTSGraph takes precedence
    typemap : dict, optional
        Mapping indexed by atom index; entries are read through their
        ``'whitelist'`` and ``'cycles'`` keys while evaluating ``has_label``,
        ``ring_size`` and ``ring_count`` primitives.
    *args, **kwargs
        Forwarded unchanged to the ``nx.Graph`` constructor.

    Attributes
    ----------
    ast
        The tree returned by ``parser.parse(smarts_string)``.
    _graph_matcher : smarts_graph.SMARTSMatcher
        implementation of VF2 that handles subgraph matching. Created lazily
        on the first call to `find_matches`.
    """

    # Because the first atom in a SMARTS string is always the one we want to
    # type, the graph's nodes need to be ordered.

    def __init__(self,
                 smarts_string,
                 parser=None,
                 name=None,
                 overrides=None,
                 typemap=None,
                 *args,
                 **kwargs):
        super(SMARTSGraph, self).__init__(*args, **kwargs)

        self.smarts_string = smarts_string
        self.name = name
        self.overrides = overrides
        self.typemap = typemap

        active_parser = SMARTS() if parser is None else parser
        self.ast = active_parser.parse(smarts_string)

        # Maps id(atom subtree) -> node index so edges can be added later
        # from AST nodes alone.
        self._atom_indices = OrderedDict()
        self._add_nodes()
        self._add_edges(self.ast)
        self._add_label_edges()
        # Built on first use in `find_matches`.
        self._graph_matcher = None

    def _add_nodes(self):
        """Add all atoms in the SMARTS string as nodes in the graph."""
        atom_trees = (tree for tree in self.ast.iter_subtrees_topdown()
                      if tree.data == 'atom')
        for index, atom in enumerate(atom_trees):
            self.add_node(index, atom=atom)
            self._atom_indices[id(atom)] = index

    def _add_edges(self, ast_node, trunk=None):
        """Add all bonds in the SMARTS string as edges in the graph.

        Parameters
        ----------
        ast_node
            Subtree whose direct children are scanned for atoms and branches.
        trunk
            The atom subtree that the next atom found should be bonded to, or
            None when no atom has been seen yet.
        """
        atom_indices = self._atom_indices
        for ast_child in ast_node.children:
            if ast_child.data == 'atom':
                if trunk is not None:
                    self.add_edge(atom_indices[id(ast_child)],
                                  atom_indices[id(trunk)])
                # Subsequent atoms at this level chain off the latest atom.
                trunk = ast_child
            elif ast_child.data == 'branch':
                # A branch hangs off the current trunk; the rebinding of
                # `trunk` inside the recursive call does not leak back here.
                self._add_edges(ast_child, trunk)

    def _add_label_edges(self):
        """Add edges between all atoms with the same atom_label in rings.

        Raises
        ------
        ValueError
            If a ring label digit does not occur exactly twice.
        """
        # We need each individual label and atoms with multiple ring labels
        # would yield e.g. the string '12' so split those up.
        label_digits = defaultdict(list)
        for _, attr in self.nodes(data=True):
            atom = attr["atom"]
            for label in atom.find_data("atom_label"):
                for digit in list(label.children[0]):
                    label_digits[digit].append(atom)

        for label, atoms in label_digits.items():
            if len(atoms) != 2:
                # Previously surfaced as an opaque tuple-unpacking ValueError.
                raise ValueError(
                    "Ring label '{}' occurs {} time(s) in SMARTS string "
                    "'{}'; every ring label must occur exactly twice.".format(
                        label, len(atoms), self.smarts_string))
            atom1, atom2 = atoms
            self.add_edge(self._atom_indices[id(atom1)],
                          self._atom_indices[id(atom2)])

    def _node_match(self, host, pattern):
        """Determine if two graph nodes are equal.

        ``host`` is the attribute dict of a topology node (its ``'atom'``
        entry is read); ``pattern`` is the attribute dict of a node of this
        graph.
        """
        atom_expr = pattern['atom'].children[0]
        return self._atom_expr_matches(atom_expr, host['atom'])

    def _atom_expr_matches(self, atom_expr, atom):
        """Evaluate a SMARTS atom expression against ``atom``.

        Recursively evaluates not/and/or expressions and delegates primitive
        tests to `_atom_id_matches`.

        Raises
        ------
        TypeError
            If ``atom_expr.data`` is not one of the handled expression kinds.
        """
        kind = atom_expr.data
        operands = atom_expr.children
        if kind == 'not_expression':
            return not self._atom_expr_matches(operands[0], atom)
        if kind in ('and_expression', 'weak_and_expression'):
            return (self._atom_expr_matches(operands[0], atom)
                    and self._atom_expr_matches(operands[1], atom))
        if kind == 'or_expression':
            return (self._atom_expr_matches(operands[0], atom)
                    or self._atom_expr_matches(operands[1], atom))
        if kind == 'atom_id':
            return self._atom_id_matches(operands[0], atom, self.typemap)
        if kind == 'atom_symbol':
            return self._atom_id_matches(atom_expr, atom, self.typemap)
        raise TypeError('Expected atom_id, atom_symbol, and_expression, '
                        'or_expression, or not_expression. '
                        'Got {}'.format(atom_expr.data))

    @staticmethod
    def _atom_id_matches(atom_id, atom, typemap):
        """Compare atomic indices, symbols, neighbors, rings.

        Returns the truth value of the primitive test described by
        ``atom_id`` for ``atom``. Returns None when ``atom_id.data`` is not a
        recognized primitive.

        Raises
        ------
        NotImplementedError
            For ``matches_string`` primitives.
        """
        # NOTE(review): `atom.element` is compared against atomic numbers
        # below, so it is presumably numeric - confirm against the atom type.
        atomic_num = atom.element

        kind = atom_id.data
        if kind == 'atomic_num':
            return atomic_num == int(atom_id.children[0])
        if kind == 'atom_symbol':
            symbol = str(atom_id.children[0])
            if symbol == '*':
                return True
            if symbol.startswith('_'):
                # Store non-element elements in .name
                return atom.name == symbol
            return atomic_num == pt.AtomicNum[symbol]
        if kind == 'has_label':
            # Strip the % sign from the beginning.
            label = atom_id.children[0][1:]
            return label in typemap[atom.idx]['whitelist']
        if kind == 'neighbor_count':
            return len(atom.bond_partners) == int(atom_id.children[0])
        if kind == 'ring_size':
            cycle_len = int(atom_id.children[0])
            return any(len(cycle) == cycle_len
                       for cycle in typemap[atom.idx]['cycles'])
        if kind == 'ring_count':
            return len(typemap[atom.idx]['cycles']) == int(
                atom_id.children[0])
        if kind == 'matches_string':
            raise NotImplementedError('matches_string is not yet implemented')
        # Unrecognized primitives keep the historical (falsy) None result.
        return None

    def _first_atom_element(self):
        """Return the element hint for the first pattern atom, or None.

        A hint is only produced when the first atom contains exactly one
        ``atom_symbol`` and no ``not_expression``; otherwise None is returned.
        """
        atom = nx.get_node_attributes(self, name='atom')[0]
        symbols = list(atom.find_data('atom_symbol'))
        if len(symbols) != 1 or list(atom.find_data('not_expression')):
            return None
        try:
            return symbols[0].children[0]
        except IndexError:
            pass
        # Fallback for a childless atom_symbol. `next` gets a default so an
        # exhausted iterator cannot leak StopIteration into the calling
        # generator (which would become a RuntimeError under PEP 479).
        atomic_num_tree = next(atom.find_data('atomic_num'), None)
        if atomic_num_tree is None:
            return None
        try:
            return pt.Element[int(atomic_num_tree.children[0])]
        except IndexError:
            return None

    def find_matches(self, structure, typemap):
        """Yield the atoms in a structure that match this SMARTS pattern.

        Parameters
        ----------
        structure
            Object exposing ``atoms`` (each with an ``idx``) and ``bonds``
            (each with ``atom1`` and ``atom2``); the topology being typed.
        typemap : dict
            The target typemap being used/edited

        Yields
        ------
        The ``idx`` of the atom matched to the first atom of the pattern,
        once per distinct atom.

        Notes
        -----
        When this function gets used in atomtyper.py, we actively modify the
        white- and blacklists of the atoms in `topology` after finding a match.
        This means that between every successive call of
        `subgraph_isomorphisms_iter()`, the topology against which we are
        matching may have actually changed. Currently, we take advantage of this
        behavior in some edges cases (e.g. see `test_hexa_coordinated` in
        `test_smarts.py`).

        The matcher is created on the first call and then reused, so it stays
        bound to the `structure` and `typemap` of that first call.

        """
        # Note: Needs to be updated in sync with the grammar in `smarts.py`.
        ring_tokens = ['ring_size', 'ring_count']
        has_ring_rules = any(
            list(self.ast.find_data(token)) for token in ring_tokens)
        _prepare_atoms(structure, typemap, compute_cycles=has_ring_rules)

        if self._graph_matcher is None:
            # The host graph is only consumed when the matcher is created, so
            # it is built here instead of on every call.
            top_graph = nx.Graph()
            top_graph.add_nodes_from(((a.idx, {
                'atom': a
            }) for a in structure.atoms))
            top_graph.add_edges_from(
                ((b.atom1.idx, b.atom2.idx) for b in structure.bonds))
            self._graph_matcher = SMARTSMatcher(
                top_graph,
                self,
                node_match=self._node_match,
                element=self._first_atom_element(),
                typemap=typemap)

        matched_atoms = set()
        for mapping in self._graph_matcher.subgraph_isomorphisms_iter():
            # The matcher maps host atom -> pattern node; invert it so the
            # pattern node can be looked up.
            pattern_to_host = {
                node_id: atom_id
                for atom_id, node_id in mapping.items()
            }
            # The first node in the smarts graph always corresponds to the atom
            # that we are trying to match.
            atom_index = pattern_to_host[0]
            # Don't yield duplicate matches found via matching the pattern in a
            # different order.
            if atom_index not in matched_atoms:
                matched_atoms.add(atom_index)
                yield atom_index