def _in_pi_subgraph(atom_symbol: str, bonds: Tuple[str]) -> bool:
    """Checks whether a SMILES atom symbol should be a node in the pi
    subgraph, based on its bonds.

    More specifically, an atom should be a node in the pi subgraph if it has
    an unpaired valence electron, and thus, is able to make a double bond.
    This is decided by the parity of the electrons left over once the
    atom's bonds, any bonded hydrogen, and its charge are accounted for.

    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
               -treatment-of-aromaticity-in-the-smiles-language/

    :param atom_symbol: a SMILES atom symbol representing an atom.
    :param bonds: the bonds connected to ``atom_symbol``.
    :return: True if ``atom_symbol`` should be included in the pi subgraph,
        and False otherwise.
    :raises ValueError: if the atom ends up with more than one hydrogen
        (explicit plus implied), which is not supported.
    """

    atom, h_count, charge = parse_atom_symbol(atom_symbol)

    used_electrons = sum(get_num_from_bond(b) for b in bonds)

    # e.g. c1ccccc1: a bare aromatic carbon with exactly two bonds carries
    # one implied hydrogen. The exception is a carbon whose two bonds
    # include a triple bond: a triple bond plus one more bond already
    # accounts for all four of carbon's valence electrons, so adding a
    # hydrogen there would overshoot the valence and flip the parity below.
    if (atom == 'c') and (h_count == charge == 0) \
            and (len(bonds) == 2) and ('#' not in bonds):
        h_count += 1  # implied bonded hydrogen

    if h_count > 1:
        raise ValueError(f"Kekulization Failed: {atom_symbol} not supported.")
    elif h_count == 1:  # e.g. [nH]
        used_electrons += 1

    # a positive charge removes an electron from the available valence,
    # a negative charge adds one
    valence = _aromatic_valences[atom] - charge
    free_electrons = valence - used_electrons
    return free_electrons % 2 != 0
def _in_pi_subgraph(atom_symbol: str, bonds: Tuple[str]) -> bool:
    """Decides whether an aromatic SMILES atom belongs to the pi subgraph.

    An atom is a node of the pi subgraph when it is left with an unpaired
    valence electron, and thus, is able to take part in a double bond. The
    decision is the parity of the electrons remaining after the atom's
    bonds, a bonded hydrogen (if any), and its charge are subtracted from
    its aromatic valence.

    Reference: https://depth-first.com/articles/2020/02/10/a-comprehensive
               -treatment-of-aromaticity-in-the-smiles-language/

    :param atom_symbol: a SMILES atom symbol representing an atom.
    :param bonds: the bonds connected to ``atom_symbol``.
    :return: True if ``atom_symbol`` should be included in the pi subgraph,
        and False otherwise.
    :raises ValueError: if the atom ends up with more than one hydrogen
        (explicit plus implied).
    """

    atom, h_count, charge = parse_atom_symbol(atom_symbol)

    bonded_electrons = sum(get_num_from_bond(bond) for bond in bonds)

    # e.g. c1ccccc1
    # This also covers the neutral carbon radical case (e.g. C1=[C]NC=C1),
    # which is treated equivalently to a 1-H carbon (e.g. C1=[CH]NC=C1).
    has_implied_hydrogen = (
        atom == 'c'
        and h_count == charge == 0
        and len(bonds) == 2
        and '#' not in bonds
    )
    if has_implied_hydrogen:
        h_count += 1

    if h_count > 1:
        raise ValueError(
            "unrecognized aromatic symbol '{}'".format(atom_symbol))
    if h_count == 1:  # e.g. [nH]
        bonded_electrons += 1

    unpaired = _aromatic_valences[atom] - charge - bonded_electrons
    return unpaired % 2 != 0
def _translate_smiles_derive(smiles_gen: Iterable[Tuple[str, str, int]],
                             rings: Dict[int, Tuple[str, int]],
                             counter: List[int]) -> Tuple[str, int]:
    """Recursive helper for _translate_smiles. Walks the parsed SMILES
    symbols and builds the corresponding SELFIES, returning both the SELFIES
    string and the number of SELFIES symbols it contains.

    Each call translates one chain: the top-level call handles the main
    chain, and every ``(`` triggers a nested call that translates the branch
    and returns when it meets the matching ``)``.

    :param smiles_gen: an iterable of the symbols (and their types) of the
        SMILES to be translated, created by ``_parse_smiles``. It is shared
        by all recursive calls.
    :param rings: maps the id of each currently open ring to the bond and
        atom index recorded where it was opened. Entries are added when a
        ring id is first seen and removed when it is closed. See ``rings``
        in ``_translate_smiles``.
    :param counter: a one-element list that serves as a mutable counter of
        the atoms derived so far. See ``derived_counter`` in
        ``_translate_smiles``.
    :return: A tuple of the translated SELFIES and its symbol length.
    """

    fragments = []  # SELFIES pieces in output order, joined on return
    total_len = 0
    last_atom_idx = -1

    for bond, symbol, symbol_type in smiles_gen:

        if bond == '-':
            bond = ''  # an explicit single bond is treated as no bond

        if symbol_type == ATOM_TYPE:
            if symbol[0] == '[':
                # bracket atom: drop the brackets and tag it as explicit
                atom = "[{}{}expl]".format(bond, symbol[1:-1])
            else:
                atom = "[{}{}]".format(bond, symbol)
            fragments.append(atom)

            last_atom_idx = counter[0]
            counter[0] += 1
            total_len += 1
            continue

        if symbol_type == BRANCH_TYPE:
            if symbol != '(':
                break  # ')' ends the chain this call is translating

            # The nested call pulls from the same generator, so when it
            # returns, this loop resumes right after the matching ')'.
            branch, branch_len = \
                _translate_smiles_derive(smiles_gen, rings, counter)

            length_symbols = get_symbols_from_n(branch_len - 1)
            bond_num = get_num_from_bond(bond)

            fragments.append(
                "[Branch{}_{}]".format(len(length_symbols), bond_num))
            fragments.append(''.join(length_symbols) + branch)
            total_len += 1 + len(length_symbols) + branch_len
            continue

        # otherwise, symbol_type == RING_TYPE
        ring_id = int(symbol)

        if ring_id not in rings:
            # first occurrence: remember where (and how) the ring opens
            rings[ring_id] = (bond, last_atom_idx)
            continue

        # second occurrence: close the ring back to where it was opened
        left_bond, left_end = rings.pop(ring_id)
        ring_len = last_atom_idx - left_end
        length_symbols = get_symbols_from_n(ring_len - 1)

        # the bond written at the opening end wins over the closing end
        ring_bond = left_bond if left_bond != '' else bond
        if ring_bond != '':
            fragments.append(
                "[Expl{}Ring{}]".format(ring_bond, len(length_symbols)))
        else:
            fragments.append("[Ring{}]".format(len(length_symbols)))

        fragments.append(''.join(length_symbols))
        total_len += 1 + len(length_symbols)

    return ''.join(fragments), total_len
def _form_rings_bilocally(derived: List[List[Union[str, int]]],
                          rings: List[Tuple[int, int, str]]) -> None:
    """Forms all the rings specified by the rings list, in first-to-last
    order, by updating derived in place.

    As used here, each entry of ``derived`` is a mutable list whose item 0
    is the SMILES text of an atom, item 1 is the number of bonds the atom
    can still make, and item 2 is the index of the atom it was bonded to
    when it was derived.

    :param derived: see ``derived`` in ``_translate_selfies``.
    :param rings: see ``rings`` in ``_translate_selfies``. Each item is a
        ``(left_idx, right_idx, bond_symbol)`` tuple.
    :return: ``None``.
    """

    bond_chars = {'-', '/', '\\', '=', '#'}

    # Multiple rings between the same atom pair, and rings between atoms
    # that are already bonded, are allowed. So the requests are resolved
    # first, and only the ring bonds that must really be written as SMILES
    # ring closures are collected in ring_bonds (in order of first
    # appearance).
    ring_bonds = OrderedDict()

    for left_idx, right_idx, bond_symbol in rings:

        if left_idx == right_idx:  # ring to the same atom forbidden
            continue

        left_atom, right_atom = derived[left_idx], derived[right_idx]
        bond_num = get_num_from_bond(bond_symbol)

        if left_atom[1] <= 0 or right_atom[1] <= 0:
            continue  # no room for bond

        # cap the bond order at what both atoms can still accommodate
        capacity = min(left_atom[1], right_atom[1])
        if bond_num > capacity:
            bond_num = capacity
            bond_symbol = get_bond_from_num(bond_num)

        if left_idx == right_atom[2]:
            # The two atoms are already bonded, e.g. CC1C1C --> CC=CC.
            # Instead of a ring closure, raise the order of the existing
            # bond (at most to a triple bond) by rewriting the bond prefix
            # of the right atom's symbol.
            symbol = right_atom[0]
            old_bond = symbol[0] if symbol[0] in bond_chars else ''

            merged_num = min(bond_num + get_num_from_bond(old_bond), 3)
            right_atom[0] = \
                get_bond_from_num(merged_num) + symbol[len(old_bond):]

        else:
            # The two atoms are not directly bonded, e.g. C1CC1C.
            loc = (left_idx, right_idx)

            if loc in ring_bonds:
                # a ring already joins this pair, so only the multiplicity
                # of that ring bond is raised (at most to a triple bond)
                merged_num = min(
                    bond_num + get_num_from_bond(ring_bonds[loc]), 3)
                ring_bonds[loc] = get_bond_from_num(merged_num)
            else:
                ring_bonds[loc] = bond_symbol

        # either way, both atoms have spent bond_num of their capacity
        left_atom[1] -= bond_num
        right_atom[1] -= bond_num

    # finally, write every collected ring bond onto both of its atoms
    numbered = enumerate(ring_bonds.items(), start=1)
    for ring_number, ((left_idx, right_idx), bond_symbol) in numbered:
        ring_id = str(ring_number)
        if len(ring_id) == 2:
            ring_id = "%" + ring_id  # two-digit ids take a '%' prefix

        closure = bond_symbol + ring_id
        derived[left_idx][0] += closure
        derived[right_idx][0] += closure
def _translate_selfies_derive(selfies_gen: Iterable[str],
                              init_state: int,
                              derived: List[List[Union[str, int]]],
                              prev_idx: int,
                              branches: Dict[int, int],
                              rings: List[Tuple[int, int, str]]) -> None:
    """Recursive helper for _translate_selfies. Consumes the SELFIES symbols
    one at a time and fills in derived, branches, and rings along the way.

    The main chain and each side branch are handled by their own call: a
    branch symbol collects the symbols belonging to the branch and hands
    them to a nested call. Ring symbols are not translated here; they are
    only recorded in the rings list so that they can be added later.

    Derivation stops at the first empty symbol, or as soon as the derivation
    state becomes negative.

    :param selfies_gen: an iterable of the symbols of the SELFIES to be
        translated, created by ``_parse_selfies``.
    :param init_state: the initial derivation state.
    :param derived: see ``derived`` in ``_translate_selfies``.
    :param prev_idx: the index of the previously derived atom, or -1, if no
        atoms have been derived yet.
    :param branches: see ``branches`` in ``_translate_selfies``.
    :param rings: see ``rings`` in ``_translate_selfies``.
    :return: ``None``.
    """

    symbol = next(selfies_gen)
    state = init_state

    while symbol != '' and state >= 0:

        if 'Branch' in symbol:
            # Case 1: Branch symbol (e.g. [Branch1_2])
            branch_init_state, next_state = \
                get_next_branch_state(symbol, state)

            # in states 0 and 1 the branch symbol is skipped, and none of
            # the symbols after it are consumed as part of a branch
            if state > 1:
                num_len_symbols = int(symbol[-4])  # the L of [BranchL_X]
                len_symbols = [next(selfies_gen)
                               for _ in range(num_len_symbols)]
                n = get_n_from_symbols(*len_symbols)

                # the branch itself is made of the next n + 1 symbols
                branch_symbols = [next(selfies_gen) for _ in range(n + 1)]
                branch_gen = _parse_selfies_symbols(branch_symbols)

                branch_start = len(derived)
                _translate_selfies_derive(branch_gen, branch_init_state,
                                          derived, prev_idx, branches, rings)
                branch_end = len(derived) - 1

                # resolve C((C)Cl)C --> C(C)(Cl)C by skipping past any
                # branch that already starts at the same position
                while branch_start in branches:
                    branch_start = branches[branch_start] + 1

                # register the branch, unless nothing is left of it
                if branch_start <= branch_end:
                    branches[branch_start] = branch_end

        elif 'Ring' in symbol:
            # Case 2: Ring symbol (e.g. [Ring2])
            next_state = state

            # in state 0 the ring symbol is skipped, and none of the
            # symbols after it are consumed as its length
            if state != 0:
                num_len_symbols = int(symbol[-2])  # the L of [RingL]
                len_symbols = [next(selfies_gen)
                               for _ in range(num_len_symbols)]
                n = get_n_from_symbols(*len_symbols)

                # the ring reaches n + 1 atoms back, but never before the
                # first derived atom
                left_idx = max(0, prev_idx - (n + 1))
                right_idx = prev_idx

                # e.g. [Expl=Ring1] carries its bond right after 'Expl'
                bond_symbol = symbol[5] if symbol[1:5] == 'Expl' else ''

                rings.append((left_idx, right_idx, bond_symbol))

        else:
            # Case 3: regular symbol (e.g. [N], [=C], [F])
            atom, next_state = get_next_state(symbol, state)

            if atom != '':  # nothing is derived in case of [epsilon]
                derived.append([atom, next_state, prev_idx])

                if prev_idx >= 0:
                    # the new bond uses up capacity of the previous atom
                    bond_num = get_num_from_bond(atom[0])
                    derived[prev_idx][1] -= bond_num

                prev_idx = len(derived) - 1

        # move on to the next symbol and state
        symbol = next(selfies_gen)
        state = next_state