def mol_from_xyz(filepath, add_hs=True, compute_dist_centre=False):
    """Read an XYZ file and turn it into a molecule through xyz2mol.

    Args:
        filepath: Location of the XYZ file; handed unchanged to
            read_xyz_file.
        add_hs: Not referenced anywhere in this function.
        compute_dist_centre: Not referenced anywhere in this function.

    Returns:
        A 3-tuple ``(mol, coordinates, dMat)`` where ``mol`` and ``dMat`` are
        the two values returned by xyz2mol (the molecule and, per the
        original note, its distance matrix) and ``coordinates`` is the
        coordinate list from the file converted with ``np.array``.
    """
    # Build charged fragments; the alternative is that radicals are made.
    use_charged_fragments = True
    # The quick mode is faster for large systems but requires networkx.
    use_quick = True

    atomic_numbers, total_charge, coords = read_xyz_file(filepath)
    molecule, distance_matrix = xyz2mol(
        atomic_numbers,
        total_charge,
        coords,
        use_charged_fragments,
        use_quick,
        check_chiral_stereo=False,
    )
    return molecule, np.array(coords), distance_matrix
def test_smiles_from_xyz_files(filename, charge, answer):
    """Check that `answer` is among the SMILES generated from an XYZ file.

    Args:
        filename: XYZ file passed to x2m.read_xyz_file.
        charge: Charge forwarded to x2m.xyz2mol; the charge read from the
            file itself is ignored.
        answer: SMILES string expected to appear among the results.
    """
    atoms, _, coordinates = x2m.read_xyz_file(filename)
    candidates = x2m.xyz2mol(atoms, coordinates, charge=charge)

    # Hydrogens are stripped from every candidate before writing its SMILES.
    generated = [
        Chem.MolToSmiles(Chem.RemoveHs(candidate)) for candidate in candidates
    ]
    assert answer in generated
def MolFromXYZ(filename):
    """Return the xyz2mol result for an XYZ file, cached as a pickle beside it.

    The cache lives in the same directory as `filename` and shares its stem
    with a ``.pkl`` suffix. When that file exists it is unpickled and
    returned without touching the XYZ file.

    Args:
        filename: Path object of the XYZ file (``.parent`` and ``.stem`` are
            used, so a ``pathlib.Path``-like object is required).

    Returns:
        The cached object, or the freshly computed xyz2mol result. ``None``
        when reading or converting the XYZ file fails; the offending
        filename is printed in that case.
    """
    charged_fragments = True
    quick = True
    cache_filename = filename.parent / f'{filename.stem}.pkl'

    if cache_filename.exists():
        # NOTE(security): unpickling can execute arbitrary code, so only
        # point this function at directories whose .pkl files are trusted.
        with open(cache_filename, 'rb') as cache_file:
            return pickle.load(cache_file)

    try:
        atomicNumList, charge, xyz_coordinates = read_xyz_file(filename)
        mol = xyz2mol(atomicNumList, charge, xyz_coordinates,
                      charged_fragments, quick)
    except Exception:
        # Best effort: report which file failed and give the caller an
        # explicit None instead of failing on an unbound local.
        print(filename)
        return None

    try:
        # Serialise first so a pickling failure never leaves a truncated
        # cache file behind that would break the next load.
        payload = pickle.dumps(mol)
        with open(cache_filename, 'wb') as cache_file:
            cache_file.write(payload)
    except Exception:
        # Caching is optional; the computed molecule is still returned.
        print(filename)

    return mol
def get_molecules():
    """
    Build rdkit mol objects from the .xyz files together with derived data.

    Processes the first C.N_MOLS entries of xyz_filepath_list and returns six
    dictionaries, all keyed by molecule name (the file name with its
    directory part and last four characters removed):
        - mols: rdkit mol objects
        - mol_ids: the molecule's position in the file list
        - mol_feats: molecule level features (pd.Series)
        - xyzs: arrays of xyz coordinates
        - dist_matrices: euclidean distance matrices
        - graph_dist_matrices: padded graph distance matrices (pd.DataFrame)
    """
    mols, mol_ids, mol_feats = {}, {}, {}
    xyzs, dist_matrices, graph_dist_matrices = {}, {}, {}

    print('Create molecules and distance matrices.')
    for idx in range(C.N_MOLS):
        print_progress(idx, C.N_MOLS)
        path = xyz_filepath_list[idx]
        name = path.rsplit('/', 1)[-1][:-4]

        # Read the XYZ file: molecule, coordinates and distance matrix.
        molecule, coords, eucl_dists = mol_from_xyz(path)
        mols[name] = molecule
        xyzs[name] = coords
        dist_matrices[name] = eucl_dists
        # The molecule's index in the data set doubles as its id.
        mol_ids[name] = idx

        # Graph distance matrix, zero-padded on the right so that every
        # molecule ends up with the same number of columns.
        atom_count = len(coords)
        right_padding = C.MAX_N_ATOMS - atom_count
        padded = np.pad(rdmolops.GetDistanceMatrix(molecule),
                        [(0, 0), (0, right_padding)], 'constant')
        graph_frame = pd.DataFrame(padded)
        # One id per row, e.g. five rows of 0 for a five-atom first molecule.
        graph_frame['molecule_id'] = [idx] * atom_count
        graph_dist_matrices[name] = graph_frame

        # Molecule level features.
        adjacency = rdmolops.GetAdjacencyMatrix(molecule)
        atomic_numbers, _, _ = read_xyz_file(path)
        # The lower triangle keeps each adjacent atom pair exactly once.
        adjacent_mask = np.tril(adjacency).ravel() == 1
        adjacent_dists = eucl_dists.ravel()[adjacent_mask]
        # Mean / std of adjacent-atom distances and mean atomic number.
        mol_feats[name] = pd.Series(
            [np.mean(adjacent_dists), np.std(adjacent_dists),
             np.mean(atomic_numbers)],
            index=mol_feat_columns)

    return mols, mol_ids, mol_feats, xyzs, dist_matrices, graph_dist_matrices
def _padded_graph_distances(mol, n_atoms, mol_id):
    """Return the graph distance matrix as a right-padded, id-tagged frame."""
    pad_spec = [(0, 0), (0, C.MAX_N_ATOMS - n_atoms)]
    frame = pd.DataFrame(
        np.pad(rdmolops.GetDistanceMatrix(mol), pad_spec, 'constant'))
    frame['molecule_id'] = [mol_id] * n_atoms
    return frame


def _molecule_level_features(mol, dist_matrix, filepath):
    """Return mean/std of adjacent-atom distances and mean atomic number."""
    adj_matrix = rdmolops.GetAdjacencyMatrix(mol)
    atomic_num_list, _, _ = read_xyz_file(filepath)
    # Lower triangle only, so every adjacent pair is counted once.
    keep = np.tril(adj_matrix).ravel() == 1
    bonded = dist_matrix.ravel()[keep]
    values = [np.mean(bonded), np.std(bonded), np.mean(atomic_num_list)]
    return pd.Series(values, index=mol_feat_columns)


def get_molecules():
    """
    Create rdkit mol objects from the .xyz files plus per-molecule data.

    Six dictionaries are returned, each keyed by 'mol_name' (file name
    without directory and without its last four characters):
        - rdkit mol objects
        - mol ids (index of the molecule in xyz_filepath_list)
        - molecule level features
        - arrays of xyz coordinates
        - euclidean distance matrices
        - graph distance matrices (padded to C.MAX_N_ATOMS columns)
    """
    mols = {}
    mol_ids = {}
    mol_feats = {}
    xyzs = {}
    dist_matrices = {}
    graph_dist_matrices = {}

    print('Create molecules and distance matrices.')
    for mol_id in range(C.N_MOLS):
        print_progress(mol_id, C.N_MOLS)
        filepath = xyz_filepath_list[mol_id]
        basename = filepath.split('/')[-1]
        mol_name = basename[:-4]

        mol, xyz, dist_matrix = mol_from_xyz(filepath)
        graph_dists = _padded_graph_distances(mol, len(xyz), mol_id)
        features = _molecule_level_features(mol, dist_matrix, filepath)

        mols[mol_name] = mol
        mol_ids[mol_name] = mol_id
        mol_feats[mol_name] = features
        xyzs[mol_name] = xyz
        dist_matrices[mol_name] = dist_matrix
        graph_dist_matrices[mol_name] = graph_dists

    return mols, mol_ids, mol_feats, xyzs, dist_matrices, graph_dist_matrices
import xyz2mol as x2m

if __name__ == "__main__":
    # Ad-hoc driver: convert one XYZ file with xyz2mol and print its
    # isomeric SMILES.
    # NOTE(review): `Chem` is not imported in the visible part of this
    # script - presumably `from rdkit import Chem` exists elsewhere; confirm.

    #print(rdBase.rdkitVersion)

    # Only the last assignment takes effect; the earlier file names are
    # presumably kept as quick alternatives to switch between.
    filename = "ethane.xyz"
    filename = "acetate.xyz"
    filename = "chiral_stereo_test.xyz"

    # Options passed positionally to x2m.xyz2mol below.
    charged_fragments = True
    quick = True
    huckel = True

    atomicNumList,charge,xyz_coordinates = x2m.read_xyz_file(filename)
    mol = x2m.xyz2mol(atomicNumList,charge,xyz_coordinates,charged_fragments,quick,huckel)

    print(Chem.MolToSmiles(mol, isomericSmiles=True))

    # code to test using SMILES instead of xyz file
    # NOTE(review): smiles_list is not consumed in the visible part of the
    # script - presumably it is used further down; confirm.
    smiles_list = ['C=C([O-])CC','C=C([NH3+])CC','CC(=O)[O-]','C[N+](=O)[O-]','CS(CC)(=O)=O','CS([O-])(=O)=O',
                   'C=C(C)CC', 'CC(C)CC','C=C(N)CC','C=C(C)C=C','C#CC=C','c1ccccc1','c1ccccc1c1ccccc1',
                   '[NH3+]CS([O-])(=O)=O','CC(NC)=O','[O-]c1ccccc1','O=C(C=C1)C=CC1=CCC([O-])=O',
                   'C#CC#C','Cc1ccc(cc1)C1C=CC2C(C=CC2(C#N)C#N)=CC=1']

    # Alternative SMILES sets, left disabled:
    #smiles_list = ['C[NH+]=C([O-])CC[NH+]=C([O-])C','C[NH+]=CC=C([O-])C',
    #               "[C+](C)(C)CC[C-](C)(C)",'O=C(C=C1)C=CC1=CCC([O-])=O',
    #               'O=C([CH-]C=CC(C([O-])=O)=O)[O-]','[O-]c1ccccc1','CNC(C(C)=[NH+][CH-]CC(O)=O)=O',"[CH2][CH2][CH]=[CH][CH2]"]
    #smiles_list = ['Cc1ccc(cc1)C1C=CC2C(C=CC2(C#N)C#N)=CC=1']
    #smiles_list = ['CC1C=CC2C(C=CC2(C)C)=CC=1']
    #smiles_list = ['CC1=CC=C(C=CC2)C2C=C1']