def get_all(self):
    """Tag each substrate with its reacting atoms and write it out.

    Iterates over the substrate/product pairs returned by
    ``self.get_mol()``.  For each pair the bond-order adjacency matrices
    and the lookup tables produced by the ``map2id`` / ``id2map`` helpers
    are passed to ``self.compare_adj``.  When that call returns a
    non-empty collection, its items are stored on the substrate as the
    ``SOM`` property (every item preceded by one space), the matching
    entry of ``name`` is stored as ``_Name`` and the substrate is written
    through the module-level writer ``w``.  Prints ``done`` at the end.
    """
    sub_mol, pro_mol, name = self.get_mol()
    for i in tqdm(range(len(sub_mol))):
        substrate = sub_mol[i]
        product = pro_mol[i]

        # Adjacency matrices carry bond orders (useBO=1), cast to int.
        sub_adj = rdmolops.GetAdjacencyMatrix(substrate, useBO=1).astype(int)
        pro_adj = rdmolops.GetAdjacencyMatrix(product, useBO=1).astype(int)

        # Presumably atom-map-number <-> atom-index tables; the helpers
        # are defined elsewhere on this object.
        sub_map2id = self.map2id(substrate)
        pro_map2id = self.map2id(product)
        sub_id2map = self.id2map(substrate)
        pro_id2map = self.id2map(product)
        sub2pro_id = self.sub_map2pro_map2pro_id(substrate, product)
        pro2sub_id = self.pro_map2sub_map2sub_id(substrate, product)

        # compare_adj receives a fresh, empty set as its accumulator.
        ra = self.compare_adj(set(), sub_adj, pro_adj, sub_id2map,
                              sub_map2id, pro_map2id, pro_id2map,
                              sub2pro_id, pro2sub_id)
        if len(ra) == 0:
            # Nothing detected for this pair: the substrate is not written.
            continue

        substrate.SetProp('SOM', ''.join(' ' + str(j) for j in ra))
        substrate.SetProp('_Name', name[i])
        w.write(substrate)
    print('done')
Beispiel #2
0
    def score(self, smiles):
        """Return the normalized SA + logP + cycle score of a SMILES string.

        The three raw terms are:

        * logP from ``MolLogP`` (``-1000`` when that call raises),
        * the negated ``sascorer.calculateScore`` value,
        * the negated number of atoms by which the longest cycle in the
          ``networkx`` cycle basis exceeds six (``0`` when none does).

        Each term is standardized with the matching ``self._*_mean`` /
        ``self._*_std`` attributes and the three results are summed.

        Args:
            smiles: SMILES string to score.

        Returns:
            The sum of the three standardized terms.
        """
        mol = Chem.MolFromSmiles(smiles)

        try:
            logp = MolLogP(mol)
        except Exception:
            # Best-effort sentinel, but let KeyboardInterrupt/SystemExit
            # propagate instead of swallowing them with a bare except.
            logp = -1000

        sa_score = -sascorer.calculateScore(mol)

        cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        longest_cycle = max(len(c) for c in cycle_list) if cycle_list else 0
        # Only the part of the cycle length above six atoms is penalized.
        cycle_score = -max(longest_cycle - 6, 0)

        sa_score_norm = (sa_score - self._sa_mean) / self._sa_std
        logp_norm = (logp - self._logp_mean) / self._logp_std
        cycle_score_norm = (cycle_score - self._cycle_mean) / self._cycle_std

        return sa_score_norm + logp_norm + cycle_score_norm
Beispiel #3
0
def calc_score(smiles):
    """Score a SMILES string by standardized SA, logP and cycle terms.

    Returns ``-1e10`` when ``verify_sequence`` rejects the string, when the
    molecular weight is above 500, or when any step of the computation
    raises.  Otherwise returns the sum of the three terms, each
    standardized with the module-level ``*_mean`` / ``*_std`` constants.
    """
    if not verify_sequence(smiles):
        return -1e10

    try:
        molecule = MolFromSmiles(smiles)
        if Descriptors.MolWt(molecule) > 500:
            return -1e10

        log_p = Descriptors.MolLogP(molecule)
        sa = -sascorer.calculateScore(molecule)

        cycles = nx.cycle_basis(
            nx.Graph(rdmolops.GetAdjacencyMatrix(molecule)))
        longest = max(len(cycle) for cycle in cycles) if cycles else 0
        # Cycles of up to six atoms are free; larger ones are penalized by
        # the number of atoms beyond six.
        cycle_term = -max(longest - 6, 0)

        sa_normalized = (sa - SA_mean) / SA_std
        log_p_normalized = (log_p - logP_mean) / logP_std
        cycle_normalized = (cycle_term - cycle_mean) / cycle_std
        return sa_normalized + log_p_normalized + cycle_normalized
    except Exception:
        return -1e10
    def _tensorize(self, batch_x):
        """Pack a batch of molecules into zero-padded feature tensors.

        Args:
            batch_x: Sequence of molecules exposing ``GetNumAtoms()``.

        Returns:
            ``[atom_tensor, adjm_tensor]`` where ``atom_tensor`` has shape
            ``(len(batch_x), self.num_atoms, self.get_num_features())`` and
            ``adjm_tensor`` has shape
            ``(len(batch_x), self.num_atoms, self.num_atoms)``.  Rows and
            columns beyond a molecule's atom count stay zero.
        """
        batch_size = len(batch_x)
        atom_tensor = np.zeros(
            (batch_size, self.num_atoms, self.get_num_features()))
        adjm_tensor = np.zeros((batch_size, self.num_atoms, self.num_atoms))

        for row, mol in enumerate(batch_x):
            n_atoms = mol.GetNumAtoms()

            # Per-atom features fill the leading rows of this molecule.
            atom_tensor[row, :n_atoms, :] = self.get_atom_features(mol)

            # A_hat = A + I (adjacency with self-loops).
            a_hat = np.array(rdmolops.GetAdjacencyMatrix(mol), dtype="float")
            a_hat += np.eye(n_atoms)

            # Symmetric normalization D^(-1/2) * A_hat * D^(-1/2),
            # Kipf et al. 2016.
            inv_sqrt_degree = np.power(np.array(a_hat.sum(1)), -0.5)
            inv_sqrt_degree[np.isinf(inv_sqrt_degree)] = 0.
            d_mat = np.diag(inv_sqrt_degree)
            left_scaled = np.matmul(d_mat, a_hat)

            adjm_tensor[row, :n_atoms, :n_atoms] = np.matmul(left_scaled, d_mat)

        return [atom_tensor, adjm_tensor]
Beispiel #5
0
 def logp_evaluator(self, new_compound, rank):
     """Score the first SMILES in ``new_compound`` and squash it to (-1, 1).

     The raw score is logP (``-1000`` if ``MolLogP`` raises) plus the
     negated ``sascorer`` score plus the negated number of atoms by which
     the longest cycle exceeds six.  No mean/std normalization is applied;
     the raw sum ``x`` is mapped to ``x / (1 + |x|)``.

     Args:
         new_compound: Indexable whose first element is the SMILES to score.
         rank: Accepted for interface compatibility; not used here.

     Returns:
         ``(score, new_compound[0])``.  When the SMILES cannot be parsed the
         score is ``-1000 / (1 + 1000)``.
     """
     smiles = new_compound[0]
     try:
         m = Chem.MolFromSmiles(str(smiles))
     except Exception:
         # Exception (not BaseException) so Ctrl-C / SystemExit still work.
         m = None
     if m is None:
         return -1000 / (1 + 1000), smiles

     try:
         logp = Descriptors.MolLogP(m)
     except Exception:
         logp = -1000
     # Reuse the molecule parsed above instead of re-parsing the SMILES.
     SA_score = -sascorer.calculateScore(m)
     cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(m)))
     longest_cycle = max(len(j) for j in cycle_list) if cycle_list else 0
     # Only the part of the cycle length above six atoms is penalized.
     cycle_score = -max(longest_cycle - 6, 0)

     score_one = SA_score + logp + cycle_score
     score = score_one / (1 + abs(score_one))
     return score, smiles
def compute_mol_score(s):
    """Return the standardized SA + logP + cycle score of SMILES ``s``.

    The reference distributions come from ``get_rdkit_score()``; each term
    is standardized with the mean and standard deviation of the matching
    reference array.

    NOTE(review): ``get_rdkit_score()`` is invoked on every call — confirm
    that it is cheap or cached.

    Args:
        s: SMILES string to score.

    Returns:
        Sum of the three standardized terms.
    """
    # Only the three raw reference arrays are needed; the pre-normalized
    # ones returned alongside are ignored.
    logP_values, SA_scores, cycle_scores, _, _, _ = get_rdkit_score()

    # Parse once and reuse the molecule for all three descriptors.
    molecule = MolFromSmiles(s)
    current_log_P_value = Descriptors.MolLogP(molecule)
    current_SA_score = -sascorer.calculateScore(molecule)
    cycle_list = nx.cycle_basis(
        nx.Graph(rdmolops.GetAdjacencyMatrix(molecule)))
    longest_cycle = max(len(j) for j in cycle_list) if cycle_list else 0
    # Only the part of the cycle length above six atoms is penalized.
    current_cycle_score = -max(longest_cycle - 6, 0)

    current_SA_score_normalized = (current_SA_score -
                                   np.mean(SA_scores)) / np.std(SA_scores)
    current_log_P_value_normalized = (
        current_log_P_value - np.mean(logP_values)) / np.std(logP_values)
    current_cycle_score_normalized = (
        current_cycle_score - np.mean(cycle_scores)) / np.std(cycle_scores)

    return (current_SA_score_normalized + current_log_P_value_normalized +
            current_cycle_score_normalized)
Beispiel #7
0
def load_bbbp(N=40):
    """Load ``bbbp/BBBP.csv`` as a list of graph ``Data`` objects.

    Rows whose SMILES cannot be parsed, or whose molecule has more than
    ``N`` atoms, are skipped.  For every kept molecule the bond-order
    adjacency matrix is zero-padded to ``(N, N)`` and the atomic numbers
    are zero-padded to length ``N`` and one-hot encoded.

    Args:
        N: Maximum number of atoms per molecule (padding size).

    Returns:
        List of ``Data`` objects with fields ``x``, ``edge_index``,
        ``edge_attr``, ``y``, ``A`` and ``mol_num``.
    """
    print('Loading data...')
    df = pd.read_csv('bbbp/BBBP.csv')
    feature_matrices = []
    adj_matrices = []
    labels = []
    smiles_list = []
    nums = []
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        mol = Chem.MolFromSmiles(row.smiles)
        if mol is None:
            continue

        # Adjacency matrix (bond orders as weights), padded to (N, N).
        adj = rdmolops.GetAdjacencyMatrix(mol, useBO=True)
        s0, s1 = adj.shape
        if s0 > N:
            continue
        adj_matrix = np.zeros((N, N))
        adj_matrix[:s0, :s1] = adj
        adj_matrices.append(adj_matrix)

        # Feature vector: atomic numbers padded with zeros.
        atomic_nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        padded_atomic_nums = [0] * N
        padded_atomic_nums[:len(atomic_nums)] = atomic_nums
        feature_matrices.append(padded_atomic_nums)

        # Label and bookkeeping.  These are recorded only for molecules
        # that passed both filters so every per-molecule list stays
        # index-aligned; appending them before the `continue` checks
        # attached the wrong `mol_num` to every graph following a skip.
        labels.append(row.p_np)
        nums.append(row.num)
        smiles_list.append(row.smiles)

    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    one_hot_feature_matrices = enc.fit_transform(feature_matrices)
    # NOTE(review): the reshape assumes the encoder emits exactly 8 columns
    # per atom slot — confirm this holds for the dataset and chosen N.
    one_hot_feature_matrices = np.reshape(one_hot_feature_matrices, (-1, N, 8))

    dataset = []
    for features, adj_matrix, label, num in zip(
            one_hot_feature_matrices, adj_matrices, labels, nums):
        X = torch.from_numpy(features).float()
        A = torch.from_numpy(adj_matrix).float()
        y = torch.Tensor([[label]]).float()
        mol_num = torch.Tensor([num])
        # Sparse COO view of the adjacency gives the edge list and weights.
        A_coo = coo_matrix(A)
        edge_index = torch.from_numpy(np.vstack([A_coo.row, A_coo.col])).long()
        edge_weight = torch.from_numpy(A_coo.data).float()
        dataset.append(
            Data(
                x=X,
                edge_index=edge_index,
                edge_attr=edge_weight,
                y=y,
                A=A,
                mol_num=mol_num))

    return dataset
Beispiel #8
0
def get_atom_features(mol, dist_matrix):
    """
    Build the per-atom feature matrix of 'mol'.

    For each atom the following values are concatenated, in this order:
        - atom symbol, one-hot over C.SYMBOLS
        - degree, one-hot over C.DEGREES
        - hybridization, one-hot over C.HYBRIDIZATIONS
        - is aromatic
        - formal charge
        - atomic number
        - average bond length (mean of 'dist_matrix' over bonded atoms)
        - average atomic number of the neighboring atoms

    Returns an array of shape (n_atoms, C.N_ATOM_FEATURES); columns beyond
    the values listed above are left at zero.
    """
    n_atoms = mol.GetNumAtoms()  # number of atoms
    features = np.zeros((n_atoms, C.N_ATOM_FEATURES))  # zero-initialized output
    adj_matrix = rdmolops.GetAdjacencyMatrix(mol)  # adjacency matrix of the molecule
    for atom in mol.GetAtoms():
        idx = atom.GetIdx()  # index of this atom within the molecule
        bonded = adj_matrix[idx]
        if sum(bonded) > 0:
            # Mean distance to bonded atoms.
            ave_bond_length = np.mean(dist_matrix[idx][bonded == 1])
            # Mean atomic number of the neighbors.
            ave_neighbor_wt = np.mean(
                [nbr.GetAtomicNum() for nbr in atom.GetNeighbors()])
        else:
            # Atom without neighbors: both averages default to zero.
            ave_bond_length = 0.0
            ave_neighbor_wt = 0.0

        sym = atom.GetSymbol()  # element symbol
        one_hot_part = one_hot_encoding(sym, C.SYMBOLS) \
            + one_hot_encoding(atom.GetDegree(), C.DEGREES) \
            + one_hot_encoding(atom.GetHybridization(), C.HYBRIDIZATIONS)
        scalar_part = [atom.GetIsAromatic(), atom.GetFormalCharge(),
                       atom.GetAtomicNum(), ave_bond_length, ave_neighbor_wt]
        a_feats = one_hot_part + scalar_part
        features[idx, :len(a_feats)] = np.array(a_feats)  # fill this atom's row
    return features
def mol2graph_igraph(mol):
    """
    Convert a molecule to an undirected igraph graph.

    'mol' must expose 'to_rdkit()'; the adjacency matrix of the resulting
    RDKit molecule defines the edges.  No vertex or edge attributes are
    attached to the returned graph.

    Adapted from
    https://iwatobipen.wordpress.com/2016/12/30/convert-rdkit-molecule-object-to-igraph-graph-object/
    """
    mol = mol.to_rdkit()
    admatrix = rdmolops.GetAdjacencyMatrix(mol)
    adlist = np.ndarray.tolist(admatrix)
    # Adjacency builds a new graph from the matrix, so it is called on the
    # class rather than on a throwaway Graph() instance.
    return igraph.Graph.Adjacency(adlist).as_undirected()
Beispiel #10
0
def calc_score(mol):
    """Return the standardized SA + logP + cycle score of a molecule.

    Molecules with a molecular weight above 500 get ``-1e10``.  Otherwise
    the negated ``sascorer`` score, logP and the negated number of atoms
    by which the longest cycle exceeds six are each standardized with the
    constants below and summed.

    Args:
        mol: Molecule object accepted by the RDKit descriptor functions.

    Returns:
        The summed score, or ``-1e10`` for molecules heavier than 500.
    """
    # Fixed statistics; the trailing comments show how each was derived.
    logP_mean = 2.457  # np.mean(logP_values)
    logP_std = 1.434  # np.std(logP_values)
    SA_mean = -3.053  # np.mean(SA_scores)
    SA_std = 0.834  # np.std(SA_scores)
    cycle_mean = -0.048  # np.mean(cycle_scores)
    cycle_std = 0.287  # np.std(cycle_scores)

    molecule = mol
    if Descriptors.MolWt(molecule) > 500:
        return -1e10
    current_log_P_value = Descriptors.MolLogP(molecule)
    current_SA_score = -sascorer.calculateScore(molecule)
    cycle_list = nx.cycle_basis(nx.Graph(
        rdmolops.GetAdjacencyMatrix(molecule)))
    longest_cycle = max(len(j) for j in cycle_list) if cycle_list else 0
    # Only the part of the cycle length above six atoms is penalized.
    current_cycle_score = -max(longest_cycle - 6, 0)

    current_SA_score_normalized = (current_SA_score - SA_mean) / SA_std
    current_log_P_value_normalized = (current_log_P_value -
                                      logP_mean) / logP_std
    current_cycle_score_normalized = (current_cycle_score -
                                      cycle_mean) / cycle_std

    score = (current_SA_score_normalized + current_log_P_value_normalized +
             current_cycle_score_normalized)
    # The score was computed but never returned, so callers received None.
    return score
Beispiel #11
0
def gaussion_workers(chem_model, val):
    """Worker loop: receive tasks from rank 0, score one compound, reply.

    Each iteration blocks on ``comm.recv`` from rank 0.  On a ``START`` tag
    the task's state is rolled out with ``chem_kn_simulation``, converted
    to SMILES, and the first generated compound is scored as the sum of
    the standardized SA, logP and cycle terms (``-1000`` when the SMILES
    cannot be parsed).  The reply ``[score, smiles, rank]`` is sent back
    with tag ``DONE``.  On an ``EXIT`` tag ``MPI.Abort`` is called.

    Args:
        chem_model: Model forwarded to ``chem_kn_simulation``.
        val: Token vocabulary forwarded to the simulation helpers.
    """
    while True:
        simulation_time = time.time()
        task = comm.recv(source=0, tag=MPI.ANY_TAG, status=status)
        tag = status.Get_tag()
        if tag == START:
            state = task[0]
            m = task[1]
            all_posible = chem_kn_simulation(chem_model, state, val, m)
            generate_smile = predict_smile(all_posible, val)
            new_compound = make_input_smile(generate_smile)
            score = []

            try:
                mol = Chem.MolFromSmiles(str(new_compound[0]))
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit are not swallowed.
                mol = None
            if mol is not None:
                try:
                    logp = Descriptors.MolLogP(mol)
                except Exception:
                    logp = -1000
                # Reuse the parsed molecule instead of re-parsing the SMILES.
                SA_score = -sascorer.calculateScore(mol)
                cycle_list = nx.cycle_basis(
                    nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
                if cycle_list:
                    longest_cycle = max(len(j) for j in cycle_list)
                else:
                    longest_cycle = 0
                # Only the cycle length above six atoms is penalized.
                cycle_score = -max(longest_cycle - 6, 0)
                SA_score_norm = (SA_score - SA_mean) / SA_std
                logp_norm = (logp - logP_mean) / logP_std
                cycle_score_norm = (cycle_score - cycle_mean) / cycle_std
                score_one = SA_score_norm + logp_norm + cycle_score_norm
                score.append(score_one)

            else:
                score.append(-1000)
            score.append(new_compound[0])
            score.append(rank)

            comm.send(score, dest=0, tag=DONE)
            simulation_fi_time = time.time() - simulation_time
            # Single pre-formatted argument: prints the same text under
            # Python 2 and Python 3 (the print statement was Python 2 only).
            print("simulation_fi_time: %s" % simulation_fi_time)
        if tag == EXIT:
            MPI.Abort(MPI.COMM_WORLD)

    # NOTE(review): unreachable as written — the loop above has no `break`.
    comm.send(None, dest=0, tag=EXIT)
Beispiel #12
0
def calc_score(smiles):
    """Return the standardized SA + logP + cycle score of ``smiles``.

    Each term is standardized with the mean and standard deviation of the
    module-level reference arrays ``SA_scores``, ``logP_values`` and
    ``cycle_scores``.

    Raises:
        ValueError: If ``verify_sequence`` rejects ``smiles``.
    """
    if not verify_sequence(smiles):
        raise ValueError("Error in calc_score: smiles is invalid.")

    molecule = MolFromSmiles(smiles)
    log_p = Descriptors.MolLogP(molecule)
    sa = -sascorer.calculateScore(molecule)

    cycles = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(molecule)))
    longest = max(len(cycle) for cycle in cycles) if cycles else 0
    # Cycles of up to six atoms are free; larger ones are penalized by the
    # number of atoms beyond six.
    cycle_term = -max(longest - 6, 0)

    sa_normalized = (sa - np.mean(SA_scores)) / np.std(SA_scores)
    log_p_normalized = (log_p - np.mean(logP_values)) / np.std(logP_values)
    cycle_normalized = (
        cycle_term - np.mean(cycle_scores)) / np.std(cycle_scores)

    return sa_normalized + log_p_normalized + cycle_normalized
Beispiel #13
0
def check_node_type(new_compound):
    """Keep parseable compounds without large cycles and score them.

    A compound is considered when its SMILES parses and the longest cycle
    in its cycle basis has at most six atoms.  It is kept when
    ``rdock_score`` returns a value below ``10**10``.

    Args:
        new_compound: Sequence of SMILES strings.

    Returns:
        ``(node_index, score, valid_compound)``: the positions, the
        ``rdock_score`` values and the SMILES of the kept compounds.
    """
    node_index = []
    valid_compound = []
    score = []
    for i, smiles in enumerate(new_compound):
        # Parse once; the molecule was previously re-parsed for every
        # descriptor, and an SA score was computed but never used.
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue

        cycle_list = nx.cycle_basis(
            nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
        longest_cycle = max(len(j) for j in cycle_list) if cycle_list else 0
        if longest_cycle > 6:
            # Compounds containing a cycle of more than six atoms are dropped.
            continue

        m = rdock_score(smiles)
        if m < 10**10:
            node_index.append(i)
            valid_compound.append(smiles)
            score.append(m)

    return node_index, score, valid_compound
def is_ts_correct(rsmi, psmi, irc_start_xyz, irc_end_xyz):
    """
    Compare the input SMILES with the SMILES of the IRC endpoints.

    The transition state counts as found when the two endpoint SMILES
    equal the reactant/product SMILES in either direction.  The adjacency
    matrices of the same four molecules are also compared, but only to
    print diagnostics; they do not influence the return value.

    Returns True when the SMILES check succeeded, False otherwise.
    """
    print(rsmi, psmi)
    rmol = smiles_to_mol(rsmi)
    pmol = smiles_to_mol(psmi)

    charge = GetFormalCharge(rmol)

    ts_found = False

    # SMILES check
    irc_start_smi, _, _ = get_smiles(irc_start_xyz, charge)
    print("reverse SMILES: ", irc_start_smi)
    irc_end_smi, _, _ = get_smiles(irc_end_xyz, charge)
    print("forward smiles: ", irc_end_smi)
    if irc_start_smi == rsmi and irc_end_smi == psmi:
        ts_found = True
        print("SMILES MATCH: TS FOUND: reactant = reverse")

    if irc_start_smi == psmi and irc_end_smi == rsmi:
        ts_found = True
        print("SMILES MATCH: TS FOUND: reactant = forward")

    # Adjacency-matrix (AC) check
    r_ac = rdmolops.GetAdjacencyMatrix(rmol)
    p_ac = rdmolops.GetAdjacencyMatrix(pmol)

    irc_start_mol = smiles_to_mol(irc_start_smi)
    irc_end_mol = smiles_to_mol(irc_end_smi)

    irc_start_ac = rdmolops.GetAdjacencyMatrix(irc_start_mol)
    irc_end_ac = rdmolops.GetAdjacencyMatrix(irc_end_mol)

    # np.array_equal is False for matrices of different size, whereas
    # np.all(a == b) would broadcast or raise on recent NumPy versions.
    if np.array_equal(irc_start_ac, irc_end_ac):
        print("found TS for conformational change")
    else:
        print("found non-coonformational change")

    if np.array_equal(r_ac, irc_start_ac) and np.array_equal(p_ac, irc_end_ac):
        print("AC MATCH: reactant = reverse")
    if np.array_equal(p_ac, irc_start_ac) and np.array_equal(r_ac, irc_end_ac):
        print("AC MATCH: reactant = forward")

    return ts_found
Beispiel #15
0
    def compute(self, mol):
        """Return how many atoms the longest basis cycle has beyond six.

        The result is 0 when the molecule has no cycle or when its longest
        cycle has six atoms or fewer.
        """
        adjacency = rdmolops.GetAdjacencyMatrix(mol)
        longest = 0
        for cycle in nx.cycle_basis(nx.Graph(adjacency)):
            longest = max(longest, len(cycle))

        return max(0, longest - 6)
Beispiel #16
0
def max_pair_distance_pairs(mol: RDKitMol,
                            max_pair_distance: Optional[int]) -> np.ndarray:
  """Find all atom pairs within `max_pair_distance` bonds of each other.

  The powers of an adjacency matrix encode path connectivity: if `adj` is
  the adjacency matrix, `adj**k` is nonzero at `(i, j)` exactly when a
  path of graph distance `k` connects `i` and `j`.  Because all entries
  are non-negative, the pairs within `max_pair_distance` of one another
  are the nonzero entries of
  `I + adj + adj**2 + ... + adj**max_pair_distance`, where the identity
  contributes the self-pairs `(i, i)`.

  Parameters
  ----------
  mol: rdkit.Chem.rdchem.Mol
    RDKit molecule.
  max_pair_distance: Optional[int], (default None)
    A positive integer or None.  Pairs are reported only for atoms at most
    this graph distance apart.  None, or any value at least as large as
    the number of atoms, means the number of atoms is used as the limit.

  Returns
  -------
  np.ndarray
    Of shape `(2, num_pairs)`; row 0 holds the first atom index and row 1
    the second atom index of every pair.

  Raises
  ------
  ValueError
    If `max_pair_distance` is smaller than the number of atoms and is
    zero or negative.
  """
  from rdkit import Chem
  from rdkit.Chem import rdmolops
  N = len(mol.GetAtoms())
  if max_pair_distance is not None and max_pair_distance < N:
    if max_pair_distance <= 0:
      raise ValueError(
          "max_pair_distance must either be a positive integer or None")
    max_distance = max_pair_distance
  else:
    # No limit given, or a limit that cannot be exceeded in this molecule.
    max_distance = N
  adj = rdmolops.GetAdjacencyMatrix(mol)
  # The identity seeds the sum with the self-pairs (i, i).
  sum_adj = np.eye(N)
  for power in range(1, max_distance + 1):
    sum_adj += np.linalg.matrix_power(adj, power)
  # np.where yields one index array per axis; stacking them gives the
  # (2, num_pairs) result directly.
  rows, cols = np.where(sum_adj != 0)
  pair_edges = np.array((rows, cols))
  return pair_edges
Beispiel #17
0
def check_node_type(new_compound, SA_mean, SA_std, logP_mean, logP_std,
                    cycle_mean, cycle_std):
    """Score every parseable compound of at most 81 characters.

    For each accepted compound the negated ``sascorer`` score, logP
    (``-1000`` if ``MolLogP`` raises) and the negated number of atoms by
    which the longest cycle exceeds six are standardized with the given
    means and standard deviations and summed.

    Args:
        new_compound: Sequence of SMILES strings.
        SA_mean, SA_std: Standardization constants for the SA term.
        logP_mean, logP_std: Standardization constants for the logP term.
        cycle_mean, cycle_std: Standardization constants for the cycle term.

    Returns:
        ``(node_index, score, valid_compound, all_smile)``: positions,
        scores and SMILES of the accepted compounds, plus every input
        SMILES in order.
    """
    node_index = []
    valid_compound = []
    all_smile = []
    score = []

    for i in range(len(new_compound)):
        smiles = new_compound[i]
        try:
            m = Chem.MolFromSmiles(str(smiles))
        except Exception:
            print(None)
            # Without this reset `m` was undefined on the first iteration
            # (NameError) or still held the previous compound's molecule.
            m = None
        if m is not None and len(smiles) <= 81:
            try:
                logp = Descriptors.MolLogP(m)
            except Exception:
                logp = -1000
            node_index.append(i)
            valid_compound.append(smiles)
            # Reuse the parsed molecule instead of re-parsing the SMILES.
            SA_score = -sascorer.calculateScore(m)
            cycle_list = nx.cycle_basis(
                nx.Graph(rdmolops.GetAdjacencyMatrix(m)))
            if cycle_list:
                longest_cycle = max(len(j) for j in cycle_list)
            else:
                longest_cycle = 0
            # Only the part of the cycle length above six atoms is penalized.
            cycle_score = -max(longest_cycle - 6, 0)
            SA_score_norm = (SA_score - SA_mean) / SA_std
            logp_norm = (logp - logP_mean) / logP_std
            cycle_score_norm = (cycle_score - cycle_mean) / cycle_std
            score_one = SA_score_norm + logp_norm + cycle_score_norm
            score.append(score_one)

        all_smile.append(smiles)

    return node_index, score, valid_compound, all_smile
def add_sc_angle_features(df, xyzs, dist_matrices):
    """
    Add angle-based scalar coupling (sc) features to 'df' and return it.

    Columns written:
    - diangle: absolute dihedral angle, set when the shortest path between
        the two sc atoms has four atoms (3J couplings)
    - cos_angle: cosine angle along the shortest path when it has three
        atoms (2J couplings)
    - cos_angle0: cosine angle between the sc atoms and the bonded atom
        closest to atom 0; skipped for rows whose 'type' is 0 or 2
    - cos_angle1: cosine angle between the sc atoms and the bonded atom
        closest to atom 1 (atom 0 itself excluded); computed for all rows

    Rows without a value get 0; note that the final fillna applies to every
    column of 'df'.  Molecules are looked up in the module-level 'mols'.
    """
    for column in ('diangle', 'cos_angle', 'cos_angle0', 'cos_angle1'):
        df[column] = 0.0
    diangles, cos_angles, cos_angles0, cos_angles1 = {}, {}, {}, {}
    print('Add scalar coupling angle based features.')
    n_rows = len(df)
    for i, (idx, row) in enumerate(df.iterrows()):
        print_progress(i, n_rows, 500000)
        mol_name = row['molecule_name']
        mol, xyz = mols[mol_name], xyzs[mol_name]
        dist_matrix = dist_matrices[mol_name]
        adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
        idx0, idx1 = row['atom_index_0'], row['atom_index_1']
        atom_ids = rdmolops.GetShortestPath(mol, idx0, idx1)

        path_length = len(atom_ids)
        if path_length == 4:
            diangles[idx] = dihedral(xyz[atom_ids,:])
        elif path_length == 3:
            cos_angles[idx] = cosine_angle(xyz[atom_ids,:])

        if row['type'] not in [0, 2]:
            # Bonded atom nearest to atom 0.
            neighbors0 = np.where(adj_matrix[idx0]==1)[0]
            if len(neighbors0) > 0:
                nearest0 = dist_matrix[idx0][neighbors0].argmin()
                idx0_closest = neighbors0[nearest0]
                cos_angles0[idx] = cosine_angle(
                    xyz[[idx0_closest, idx0, idx1],:])

        # Bonded atom nearest to atom 1, never atom 0 itself.
        bonded_to_1 = np.where(adj_matrix[idx1]==1)[0]
        neighbors1 = np.setdiff1d(bonded_to_1, [idx0])
        if len(neighbors1) > 0:
            nearest1 = dist_matrix[idx1][neighbors1].argmin()
            idx1_closest = neighbors1[nearest1]
            cos_angles1[idx] = cosine_angle(
                xyz[[idx0, idx1, idx1_closest],:])

    df['diangle'] = pd.Series(diangles).abs()
    df['cos_angle'] = pd.Series(cos_angles)
    df['cos_angle0'] = pd.Series(cos_angles0)
    df['cos_angle1'] = pd.Series(cos_angles1)
    df.fillna(0., inplace=True)
    return df
Beispiel #19
0
def _cycle_score(mol):
    """Return how many atoms the longest basis cycle of ``mol`` has beyond six.

    The result is 0 when there is no cycle or when the longest cycle has
    six atoms or fewer.  The value is returned as a non-negative count; it
    is not negated here.
    """
    cycles = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(mol)))
    longest = max(len(cycle) for cycle in cycles) if cycles else 0
    return longest - 6 if longest > 6 else 0
Beispiel #20
0
def simulation(chem_model, state, node):
    """Roll out one compound from ``state`` and return its squashed score.

    The state is expanded with ``chem_kn_simulation`` over the fixed token
    vocabulary below and converted to SMILES.  The first generated
    compound is scored as logP (``-1000`` if ``MolLogP`` raises) plus the
    negated ``sascorer`` score plus the negated number of atoms by which
    the longest cycle exceeds six.  No mean/std normalization is applied;
    the raw sum ``x`` is mapped to ``x / (1 + |x|)``.

    Args:
        chem_model: Model forwarded to ``chem_kn_simulation``.
        state: Current state forwarded to ``chem_kn_simulation``.
        node: Accepted for interface compatibility; not used here.

    Returns:
        The squashed score, or ``-1000 / (1 + 1000)`` when the generated
        SMILES cannot be parsed.
    """
    val = [
        '\n', '&', 'C', '(', ')', 'c', '1', '2', 'o', '=', 'O', 'N', '3', 'F',
        '[C@@H]', 'n', '-', '#', 'S', 'Cl', '[O-]', '[C@H]', '[NH+]', '[C@]',
        's', 'Br', '/', '[nH]', '[NH3+]', '4', '[NH2+]', '[C@@]', '[N+]',
        '[nH+]', '\\', '[S@]', '5', '[N-]', '[n+]', '[S@@]', '[S-]', '6', '7',
        'I', '[n-]', 'P', '[OH+]', '[NH-]', '[P@@H]', '[P@@]', '[PH2]', '[P@]',
        '[P+]', '[S+]', '[o+]', '[CH2-]', '[CH-]', '[SH+]', '[O+]', '[s+]',
        '[PH+]', '[PH]', '8', '[S@@+]'
    ]
    all_posible = chem_kn_simulation(chem_model, state, val)
    generate_smile = predict_smile(all_posible, val)
    new_compound = make_input_smile(generate_smile)
    try:
        m = Chem.MolFromSmiles(str(new_compound[0]))
    except Exception:
        # Exception rather than a bare except, so Ctrl-C and SystemExit
        # are not swallowed.
        m = None
    if m is None:
        return -1000 / (1 + 1000)

    try:
        logp = Descriptors.MolLogP(m)
    except Exception:
        logp = -1000
    # Reuse the parsed molecule instead of re-parsing the SMILES.
    SA_score = -sascorer.calculateScore(m)
    cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(m)))
    if cycle_list:
        longest_cycle = max(len(j) for j in cycle_list)
    else:
        longest_cycle = 0
    # Only the part of the cycle length above six atoms is penalized.
    cycle_score = -max(longest_cycle - 6, 0)

    score_one = SA_score + logp + cycle_score
    score = score_one / (1 + abs(score_one))

    return score
Beispiel #21
0
def construct_adj_matrix(mol, out_size=-1, self_connection=True):
    """Returns the adjacent matrix of the given molecule.

    This function returns the adjacent matrix of the given molecule.
    Contrary to the specification of
    :func:`rdkit.Chem.rdmolops.GetAdjacencyMatrix`,
    the diagonal entries of the returned matrix are all-one when
    ``self_connection`` is True.

    Args:
        mol (rdkit.Chem.Mol): Input molecule.
        out_size (int): The size of the returned matrix.
            If this option is negative, it does not take any effect.
            Otherwise, it must be larger than or equal to the number of
            atoms in the input molecule. In that case, the adjacent
            matrix is expanded and zeros are padded to right
            columns and bottom rows.
        self_connection (bool): Add self connection or not.
            If True, diagonal element of adjacency matrix is filled with 1.

    Returns:
        adj_array (numpy.ndarray): The adjacent matrix of the input molecule,
            as ``numpy.float32``.
            It is 2-dimensional array with shape (atoms1, atoms2), where
            atoms1 & atoms2 represent from and to of the edge respectively.
            If ``out_size`` is non-negative, its size is equal to that
            value. Otherwise, it is equal to the number of atoms in the
            molecule.

    Raises:
        ValueError: If the adjacency matrix is not square, or if
            ``out_size`` is non-negative but smaller than the number of
            atoms.
    """

    adj = rdmolops.GetAdjacencyMatrix(mol)
    s0, s1 = adj.shape
    if s0 != s1:
        # The first fragment ends with a space; previously the message read
        # "...input moleculehas an invalid shape".
        raise ValueError('The adjacent matrix of the input molecule '
                         'has an invalid shape: ({}, {}). '
                         'It must be square.'.format(s0, s1))

    if self_connection:
        adj = adj + numpy.eye(s0)
    if out_size < 0:
        adj_array = adj.astype(numpy.float32)
    elif out_size >= s0:
        # Zero-pad to the requested size; the molecule occupies the
        # top-left corner.
        adj_array = numpy.zeros((out_size, out_size),
                                dtype=numpy.float32)
        adj_array[:s0, :s1] = adj
    else:
        raise ValueError(
            '`out_size` (={}) must be negative or larger than or equal to the '
            'number of atoms in the input molecules (={}).'
            .format(out_size, s0))
    return adj_array
Beispiel #22
0
def mol2graph(mol):
    """Convert ``mol`` to an undirected igraph graph with chemistry attributes.

    Every vertex gets ``AtomicNum`` and ``AtomicSymbole``; every edge that
    corresponds to a bond gets ``BondType`` (the bond type as a double).
    """
    adjacency = rdmolops.GetAdjacencyMatrix(mol)
    bond_ends = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
                 for bond in mol.GetBonds()]
    g = igraph.Graph.Adjacency(adjacency.tolist()).as_undirected()

    # Vertex indices coincide with the atom indices of the molecule.
    for idx in g.vs.indices:
        atom = mol.GetAtomWithIdx(idx)
        vertex = g.vs[idx]
        vertex["AtomicNum"] = atom.GetAtomicNum()
        vertex["AtomicSymbole"] = atom.GetSymbol()

    for begin, end in bond_ends:
        bond = mol.GetBondBetweenAtoms(begin, end)
        edge = g.es[g.get_eid(begin, end)]
        edge["BondType"] = bond.GetBondTypeAsDouble()
    return g
Beispiel #23
0
def get_atom_features(mol, dist_matrix):
    """
    Build the per-atom feature matrix for 'mol'.

    Each row (one per atom, indexed by atom idx) starts with, in order:
        - one-hot atom symbol over C.SYMBOLS
        - one-hot degree over C.DEGREES
        - one-hot hybridization over C.HYBRIDIZATIONS
        - is aromatic, formal charge, atomic number
        - mean of 'dist_matrix' entries towards bonded atoms (0.0 if none)
        - mean atomic number of the bonded atoms (0.0 if none)
    The last two columns are set to 1 for atoms belonging to a 'Donor' /
    'Acceptor' chemical feature respectively.

    Returns a float array of shape (n_atoms, C.N_ATOM_FEATURES).
    """
    n_atoms = mol.GetNumAtoms()
    features = np.zeros((n_atoms, C.N_ATOM_FEATURES))
    adj_matrix = rdmolops.GetAdjacencyMatrix(mol)

    for atom in mol.GetAtoms():
        row = atom.GetIdx()
        bonded = adj_matrix[row]
        if bonded.sum() > 0:
            ave_bond_length = np.mean(dist_matrix[row][bonded == 1])
            ave_neighbor_wt = np.mean(
                [nbr.GetAtomicNum() for nbr in atom.GetNeighbors()])
        else:
            # Isolated atom: there is nothing to average over.
            ave_bond_length, ave_neighbor_wt = 0.0, 0.0

        scalar_feats = [atom.GetIsAromatic(), atom.GetFormalCharge(),
                        atom.GetAtomicNum(), ave_bond_length, ave_neighbor_wt]
        a_feats = (one_hot_encoding(atom.GetSymbol(), C.SYMBOLS)
                   + one_hot_encoding(atom.GetDegree(), C.DEGREES)
                   + one_hot_encoding(atom.GetHybridization(),
                                      C.HYBRIDIZATIONS)
                   + scalar_feats)
        # Only the leading columns are written; the trailing donor/acceptor
        # columns are filled in below.
        features[row, :len(a_feats)] = np.array(a_feats)

    # Column (counted from the end) flagged for each chemical feature family.
    family_column = {'Donor': -2, 'Acceptor': -1}
    feat_factory = ChemicalFeatures.BuildFeatureFactory(C.FDEF)
    try:
        for chem_feat in feat_factory.GetFeaturesForMol(mol):
            column = family_column.get(chem_feat.GetFamily())
            if column is None:
                continue
            for atom_id in chem_feat.GetAtomIds():
                features[atom_id, column] = 1
    except RuntimeError as e:
        # Best effort: report the failure and keep what was computed so far.
        print(e)

    return features
Beispiel #24
0
def mol_to_graph_data(mol):
    """Extract adjacency matrix and labelled node/edge features from a mol.

    Args:
        mol: Molecule exposing the RDKit ``Mol`` API used below.

    Returns:
        tuple: ``(A, node_features, edge_features)`` where

        * ``A`` is the adjacency matrix returned by
          ``rdmolops.GetAdjacencyMatrix``;
        * ``node_features`` maps each atom index to
          ``{"label": <atomic number as int>}``;
        * ``edge_features`` maps each ``(begin_idx, end_idx)`` bond pair to
          ``{"label": <bond type as int>}``.
    """
    A = rdmolops.GetAdjacencyMatrix(mol)

    # The inner dict is created on assignment: subscripting a key that was
    # never inserted into a plain dict raises KeyError.
    node_features = {
        idx: {"label": int(mol.GetAtomWithIdx(idx).GetAtomicNum())}
        for idx in range(A.shape[0])
    }

    edge_features = {}
    for bond in mol.GetBonds():
        b1, b2 = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        btype = mol.GetBondBetweenAtoms(b1, b2).GetBondTypeAsDouble()
        # int() truncates fractional bond types (e.g. 1.5 becomes 1).
        edge_features[(b1, b2)] = {"label": int(btype)}

    return A, node_features, edge_features
Beispiel #25
0
def cycle_score(m):
    """
    Input : a mol object
    Output : cycle score penalty (scalar float)

    The penalty is the number of atoms by which the largest cycle in the
    cycle basis of the molecular graph exceeds 6, and 0.0 when there is no
    cycle or no cycle is larger than 6.
    """
    bond_graph = nx.Graph(rdmolops.GetAdjacencyMatrix(m))
    ring_sizes = (len(ring) for ring in nx.cycle_basis(bond_graph))
    largest_ring = max(ring_sizes, default=0)
    return float(max(largest_ring - 6, 0))
Beispiel #26
0
 def _get_valid_mols(self):
     """Read ``self.filename`` as an SD file and return the usable molecules.

     A molecule is dropped when the supplier yields ``None`` for it, when
     computing its adjacency matrix raises (the error is printed), or when
     ``get_mol_edge_index`` returns an edge index with zero elements.
     """
     valid = []
     for candidate in Chem.SDMolSupplier(self.filename):
         if candidate is None:
             continue
         try:
             # Called only as a sanity check; the matrix itself is not used.
             rdmolops.GetAdjacencyMatrix(candidate)
         except Exception as err:
             print(err)
             continue
         edges, _ = get_mol_edge_index(candidate, self.edge_types)
         if edges.nelement() != 0:
             valid.append(candidate)
     return valid
Beispiel #27
0
def simulation(chem_model, state, node):
    """Roll out one compound from ``state`` and store its reward on ``node``.

    The rollout is produced by ``chem_kn_simulation`` / ``predict_smile`` /
    ``make_input_smile``; only the first generated compound is scored. For a
    parsable compound the reward combines three normalised terms (negated SA
    score, logP, negated large-ring penalty) and is squashed into (-1, 1)
    with ``x / (1 + |x|)``. An unparsable compound gets ``-1000 / 1001``.

    Args:
        chem_model: Model handed through to ``chem_kn_simulation``.
        state: Current partial sequence handed to ``chem_kn_simulation``.
        node: Object whose ``reward`` attribute is overwritten.

    Returns:
        The same ``node``, with ``node.reward`` set.
    """
    val = [
        '\n', '&', 'C', 'O', '(', 'F', ')', '1', '2', '=', '#', '[C@H]',
        '[C@@H]', '3', '[O-]', '[C@@]', '[C]', '[CH]', '/', '[C@]', '[CH2]',
        '4', '[O+]', '[O]', '5'
    ]
    all_posible = chem_kn_simulation(chem_model, state)
    generate_smile = predict_smile(all_posible, val)
    new_compound = make_input_smile(generate_smile)

    try:
        m = Chem.MolFromSmiles(str(new_compound[0]))
    except Exception:
        # Covers e.g. an empty rollout (IndexError); unlike a bare except,
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        m = None

    if m is not None:
        try:
            logp = Descriptors.MolLogP(m)
        except Exception:
            logp = -1000
        # Reuse the molecule parsed above instead of re-parsing the SMILES.
        SA_score = -sascorer.calculateScore(m)
        cycle_list = nx.cycle_basis(nx.Graph(rdmolops.GetAdjacencyMatrix(m)))
        largest_ring = max((len(ring) for ring in cycle_list), default=0)
        # Only rings with more than 6 atoms are penalised.
        cycle_score = -max(largest_ring - 6, 0)

        # NOTE(review): SA_mean/SA_std, logP_mean/logP_std and
        # cycle_mean/cycle_std are module-level names defined elsewhere;
        # presumably dataset statistics - confirm.
        SA_score_norm = (SA_score - SA_mean) / SA_std
        logp_norm = (logp - logP_mean) / logP_std
        cycle_score_norm = (cycle_score - cycle_mean) / cycle_std
        score_one = SA_score_norm + logp_norm + cycle_score_norm
        score = score_one / (1 + abs(score_one))
    else:
        score = -1000 / (1 + 1000)

    node.reward = score
    return node
Beispiel #28
0
def get_molecules():
    """
    Build rdkit mol objects from the .xyz files listed in
    'xyz_filepath_list', together with, per molecule:
        - mol id (position of the molecule in the file list)
        - molecule level features (mean and std of bonded-atom distances,
          mean atomic number)
        - array of xyz coordinates
        - euclidean distance matrix
        - graph distance matrix (padded to C.MAX_N_ATOMS columns, plus a
          'molecule_id' column).
    All objects are returned in dictionaries with 'mol_name' as keys.
    """
    mols, mol_ids, mol_feats = {}, {}, {}
    xyzs, dist_matrices, graph_dist_matrices = {}, {}, {}
    print('Create molecules and distance matrices.')
    for mol_index in range(C.N_MOLS):
        print_progress(mol_index, C.N_MOLS)
        filepath = xyz_filepath_list[mol_index]
        # File name without directory and without its 4-character extension.
        mol_name = filepath.split('/')[-1][:-4]

        # Read the XYZ file: molecule structure, coordinates, distance matrix.
        mol, xyz, dist_matrix = mol_from_xyz(filepath)
        mols[mol_name] = mol
        xyzs[mol_name] = xyz
        dist_matrices[mol_name] = dist_matrix
        # The molecule's position in the dataset serves as its id.
        mol_ids[mol_name] = mol_index

        # Graph distance matrix, zero-padded on the column axis only, wrapped
        # in a dataframe with one row per atom.
        n_atoms = len(xyz)
        padded_graph_dists = np.pad(
            rdmolops.GetDistanceMatrix(mol),
            [(0, 0), (0, C.MAX_N_ATOMS - n_atoms)],
            'constant')
        graph_dist_matrix = pd.DataFrame(padded_graph_dists)
        # e.g. CH4 with id 0 -> [0, 0, 0, 0, 0]; a plain list can be assigned
        # as a dataframe column.
        graph_dist_matrix['molecule_id'] = [mol_index] * n_atoms
        graph_dist_matrices[mol_name] = graph_dist_matrix

        # Molecule level features.
        adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
        # Re-read the XYZ file for the atomic numbers of the atoms.
        atomic_num_list, _, _ = read_xyz_file(filepath)
        # The lower triangle of the adjacency matrix selects every bonded
        # pair exactly once.
        bonded_mask = np.tril(adj_matrix).ravel() == 1
        dists = dist_matrix.ravel()[bonded_mask]
        mol_feats[mol_name] = pd.Series(
            [np.mean(dists), np.std(dists), np.mean(atomic_num_list)],
            index=mol_feat_columns)
    return mols, mol_ids, mol_feats, xyzs, dist_matrices, graph_dist_matrices
Beispiel #29
0
def get_score_components_from_mol(this_mol):
    """Return the raw ``(logP, SA_score, cycle_score)`` terms for a molecule.

    Args:
        this_mol: Molecule exposing the RDKit ``Mol`` API used below.

    Returns:
        tuple: ``(logP, SA_score, cycle_score)`` where

        * ``logP`` is ``Descriptors.MolLogP(this_mol)``, or ``0.0`` if that
          computation raises;
        * ``SA_score`` is the negated ``sascorer.calculateScore`` value;
        * ``cycle_score`` is minus the number of atoms by which the largest
          cycle in the cycle basis exceeds 6 (``0`` if none does).
    """
    try:
        logP = Descriptors.MolLogP(this_mol)
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        logP = 0.0
    SA_score = -sascorer.calculateScore(this_mol)

    bond_graph = nx.Graph(rdmolops.GetAdjacencyMatrix(this_mol))
    cycle_list = nx.cycle_basis(bond_graph)
    largest_ring = max((len(ring) for ring in cycle_list), default=0)
    cycle_score = -max(largest_ring - 6, 0)
    return logP, SA_score, cycle_score
Beispiel #30
0
    def tensorize(self, batch_x, batch_c):
        """Pack a batch of molecules into zero-padded dense arrays.

        Args:
            batch_x: Sequence of molecules exposing the RDKit ``Mol`` API.
            batch_c: Per-molecule, per-atom coordinates, indexed as
                ``batch_c[mol_idx][atom_idx][0..2]``.

        Returns:
            list: ``[atom_tensor, adjm_tensor, posn_tensor]`` with shapes
            ``(batch, num_atoms, num_features)``,
            ``(batch, num_atoms, num_atoms)`` and
            ``(batch, num_atoms, num_atoms, 3)``. Entries beyond a molecule's
            own atom count stay zero.
        """
        batch_size = len(batch_x)
        atom_tensor = np.zeros(
            (batch_size, self.num_atoms, self.get_num_features()))
        adjm_tensor = np.zeros((batch_size, self.num_atoms, self.num_atoms))
        posn_tensor = np.zeros((batch_size, self.num_atoms, self.num_atoms, 3))

        for mol_idx, mol in enumerate(batch_x):
            # NOTE(review): the return value is discarded. RDKit's RemoveHs
            # returns a new molecule, so `mol` itself is presumably left
            # unchanged here - confirm the intent before relying on
            # hydrogens having been stripped.
            Chem.RemoveHs(mol)
            mol_atoms = mol.GetNumAtoms()

            # Atom features
            atom_tensor[mol_idx, :mol_atoms, :] = self.get_atom_features(mol)

            # Adjacency matrix with self-connections (A_hat = A + I)
            adjms = np.array(rdmolops.GetAdjacencyMatrix(mol), dtype="float")
            adjms += np.eye(mol_atoms)

            # Symmetric normalisation D^(-1/2) * A_hat * D^(-1/2),
            # Kipf et al. 2016
            degree = np.array(adjms.sum(1))
            deg_inv_sqrt = np.power(degree, -0.5)
            deg_inv_sqrt[np.isinf(deg_inv_sqrt)] = 0.
            deg_inv_sqrt = np.diag(deg_inv_sqrt)
            adjm_tensor[mol_idx, :mol_atoms, :mol_atoms] = (
                deg_inv_sqrt @ adjms @ deg_inv_sqrt)

            # Relative position matrix
            for atom_idx in range(mol_atoms):
                center = batch_c[mol_idx][atom_idx]
                for neighbor_idx in range(mol_atoms):
                    neighbor = batch_c[mol_idx][neighbor_idx]
                    # Direction should be Neighbor -> Center
                    posn_tensor[mol_idx, atom_idx, neighbor_idx, :] = [
                        center[axis] - neighbor[axis] for axis in range(3)
                    ]

        return [atom_tensor, adjm_tensor, posn_tensor]