def mol_from_xyz(filepath, add_hs=True, compute_dist_centre=False):
    """Read an .xyz file and convert it to a molecule through ``xyz2mol``.

    ``add_hs`` and ``compute_dist_centre`` are accepted for interface
    compatibility; this body does not use them.

    Returns a 3-tuple: the molecule produced by ``xyz2mol``, the coordinates
    read from the file wrapped in a numpy array, and the second value
    returned by ``xyz2mol`` (the distance matrix).
    """
    atomic_numbers, total_charge, coords = read_xyz_file(filepath)

    # Both switches are fixed for every call:
    #   charged_fragments=True -> charged fragments rather than radicals
    #   quick=True             -> faster for large systems, needs networkx
    #                             (pass False here to avoid that dependency)
    use_charged_fragments = True
    use_quick = True

    molecule, distance_matrix = xyz2mol(
        atomic_numbers,
        total_charge,
        coords,
        use_charged_fragments,
        use_quick,
        check_chiral_stereo=False,
    )
    return molecule, np.array(coords), distance_matrix
Exemple #2
0
def test_smiles_from_xyz_files(filename, charge, answer):
    """Check that ``answer`` is among the SMILES generated for ``filename``.

    The atoms and coordinates come from the .xyz file; the charge handed to
    ``xyz2mol`` is the ``charge`` argument, not the one read from the file.
    Hydrogens are removed from every candidate molecule before it is
    converted to SMILES.
    """
    atoms, _charge_from_file, coordinates = x2m.read_xyz_file(filename)
    candidates = x2m.xyz2mol(atoms, coordinates, charge=charge)

    smiles_list = [
        Chem.MolToSmiles(Chem.RemoveHs(candidate)) for candidate in candidates
    ]

    assert answer in smiles_list
def MolFromXYZ(filename):
    """Return the molecule for an .xyz file, using a pickle cache beside it.

    The cache lives next to the input as ``<stem>.pkl``. When it exists it is
    loaded and returned without touching the .xyz file; otherwise the file is
    read, converted with ``xyz2mol`` and the result is written to the cache.

    Args:
        filename: path of the .xyz file (``pathlib.Path`` or plain string).

    Returns:
        The cached or freshly built result of ``xyz2mol``. If reading or
        converting the file fails, the filename is printed and ``None`` is
        returned. If only writing the cache fails, the filename is printed
        and the freshly built molecule is still returned.
    """
    from pathlib import Path

    charged_fragments = True
    quick = True
    source_path = Path(filename)
    cache_filename = source_path.parent / f'{source_path.stem}.pkl'
    if cache_filename.exists():
        # NOTE: unpickling can execute arbitrary code; the cache file is
        # assumed to have been written by this function and to be trusted.
        with open(cache_filename, 'rb') as cache_file:
            return pickle.load(cache_file)

    # Bound up front so a failure below yields None instead of an
    # UnboundLocalError at the return statement.
    mol = None
    try:
        atomicNumList, charge, xyz_coordinates = read_xyz_file(filename)
        mol = xyz2mol(atomicNumList, charge, xyz_coordinates,
                      charged_fragments, quick)
        with open(cache_filename, 'wb') as cache_file:
            pickle.dump(mol, cache_file)
    except Exception:
        # Best effort: report which file failed and carry on. Unlike a bare
        # ``except``, this lets KeyboardInterrupt / SystemExit propagate.
        print(filename)
    return mol
Exemple #4
0
def get_molecules():
    """
    Build rdkit mol objects from the .xyz files, plus per-molecule data:
        - mol ids (the molecule's index in the file list)
        - molecule level features
        - arrays of xyz coordinates
        - euclidean distance matrices
        - graph distance matrices (padded DataFrames).
    Every returned object is a dictionary keyed by 'mol_name'.
    """
    mols = {}
    mol_ids = {}
    mol_feats = {}
    xyzs = {}
    dist_matrices = {}
    graph_dist_matrices = {}

    print('Create molecules and distance matrices.')
    for idx in range(C.N_MOLS):
        print_progress(idx, C.N_MOLS)
        path = xyz_filepath_list[idx]
        # last path component with its final four characters dropped
        name = path.split('/')[-1][:-4]

        # read the XYZ file: molecule, coordinates and distance matrix
        mol, coords, dist_matrix = mol_from_xyz(path)
        mols[name] = mol
        xyzs[name] = coords
        dist_matrices[name] = dist_matrix
        # the molecule's position in the dataset doubles as its id
        mol_ids[name] = idx

        # graph distance matrix, zero-padded on the right so that it has
        # C.MAX_N_ATOMS columns, stored as a DataFrame
        n_atoms = len(coords)
        graph_dists = rdmolops.GetDistanceMatrix(mol)
        column_padding = (0, C.MAX_N_ATOMS - n_atoms)
        padded = np.pad(graph_dists, [(0, 0), column_padding], 'constant')
        graph_frame = pd.DataFrame(padded)
        # one id entry per atom row, e.g. CH4 -> [0, 0, 0, 0, 0]
        graph_frame['molecule_id'] = [idx] * n_atoms
        graph_dist_matrices[name] = graph_frame

        # molecule level features
        adjacency = rdmolops.GetAdjacencyMatrix(mol)
        # only the atomic numbers are needed from this second read
        atomic_numbers, _, _ = read_xyz_file(path)
        # the lower triangle of the adjacency matrix selects each
        # adjacent pair of atoms exactly once
        is_bonded = np.tril(adjacency).ravel() == 1
        bond_lengths = dist_matrix.ravel()[is_bonded]
        # mean / std of the adjacent-atom distances and mean atomic number
        feature_values = [
            np.mean(bond_lengths),
            np.std(bond_lengths),
            np.mean(atomic_numbers),
        ]
        mol_feats[name] = pd.Series(feature_values, index=mol_feat_columns)

    return (mols, mol_ids, mol_feats, xyzs, dist_matrices,
            graph_dist_matrices)
Exemple #5
0
def get_molecules():
    """
    Create an rdkit mol object for each .xyz file and collect, per molecule:
        - a numerical mol id (its index in the file list)
        - a Series of molecule level features
        - the array of xyz coordinates
        - the euclidean distance matrix
        - the padded graph distance matrix.
    Returns six dictionaries, all keyed by 'mol_name'.
    """
    mols, mol_ids, mol_feats = {}, {}, {}
    xyzs, dist_matrices, graph_dist_matrices = {}, {}, {}
    print('Create molecules and distance matrices.')

    n_mols = C.N_MOLS
    for mol_index in range(n_mols):
        print_progress(mol_index, C.N_MOLS)
        xyz_path = xyz_filepath_list[mol_index]
        # last path component with its final four characters dropped
        mol_name = xyz_path.split('/')[-1][:-4]

        molecule, coordinates, euclid_dists = mol_from_xyz(xyz_path)
        mols[mol_name] = molecule
        xyzs[mol_name] = coordinates
        dist_matrices[mol_name] = euclid_dists
        mol_ids[mol_name] = mol_index

        # graph distances, zero-padded on the right up to C.MAX_N_ATOMS
        # columns, with the molecule id repeated on every atom row
        atom_count = len(coordinates)
        pad_widths = [(0, 0), (0, C.MAX_N_ATOMS - atom_count)]
        graph_df = pd.DataFrame(
            np.pad(rdmolops.GetDistanceMatrix(molecule), pad_widths,
                   'constant'))
        graph_df['molecule_id'] = [mol_index] * atom_count
        graph_dist_matrices[mol_name] = graph_df

        # molecule level features: statistics over the distances between
        # adjacent atoms (each pair taken once via the lower triangle of
        # the adjacency matrix) and the mean atomic number
        adjacency = rdmolops.GetAdjacencyMatrix(molecule)
        atomic_numbers = read_xyz_file(xyz_path)[0]
        bonded_mask = np.tril(adjacency).ravel() == 1
        bonded_dists = euclid_dists.ravel()[bonded_mask]
        features = [
            np.mean(bonded_dists),
            np.std(bonded_dists),
            np.mean(atomic_numbers),
        ]
        mol_feats[mol_name] = pd.Series(features, index=mol_feat_columns)

    return mols, mol_ids, mol_feats, xyzs, dist_matrices, graph_dist_matrices
Exemple #6
0
import xyz2mol as x2m


# Script entry point: convert one .xyz file to a molecule with xyz2mol and
# print its isomeric SMILES, then define SMILES strings for further testing.
if __name__ == "__main__":

    #print(rdBase.rdkitVersion)

    # Each assignment overwrites the previous one, so only the last file
    # name ("chiral_stereo_test.xyz") is actually used below.
    filename = "ethane.xyz"
    filename = "acetate.xyz"
    filename = "chiral_stereo_test.xyz"

    # Options handed positionally to x2m.xyz2mol below; their exact meaning
    # is defined by that function, which is not visible here.
    charged_fragments = True
    quick = True
    huckel = True

    atomicNumList,charge,xyz_coordinates = x2m.read_xyz_file(filename)
    mol = x2m.xyz2mol(atomicNumList,charge,xyz_coordinates,charged_fragments,quick,huckel)

    # NOTE(review): `Chem` is not imported in the visible part of this
    # file — confirm it is imported elsewhere (presumably rdkit's Chem).
    print(Chem.MolToSmiles(mol, isomericSmiles=True))

    # code to test using SMILES instead of xyz file
    smiles_list = ['C=C([O-])CC','C=C([NH3+])CC','CC(=O)[O-]','C[N+](=O)[O-]','CS(CC)(=O)=O','CS([O-])(=O)=O',
                'C=C(C)CC', 'CC(C)CC','C=C(N)CC','C=C(C)C=C','C#CC=C','c1ccccc1','c1ccccc1c1ccccc1',
                '[NH3+]CS([O-])(=O)=O','CC(NC)=O','[O-]c1ccccc1','O=C(C=C1)C=CC1=CCC([O-])=O',
                'C#CC#C','Cc1ccc(cc1)C1C=CC2C(C=CC2(C#N)C#N)=CC=1']
    # Alternative SMILES sets, kept commented out for manual experiments.
    #smiles_list = ['C[NH+]=C([O-])CC[NH+]=C([O-])C','C[NH+]=CC=C([O-])C',
    #            "[C+](C)(C)CC[C-](C)(C)",'O=C(C=C1)C=CC1=CCC([O-])=O',
    #            'O=C([CH-]C=CC(C([O-])=O)=O)[O-]','[O-]c1ccccc1','CNC(C(C)=[NH+][CH-]CC(O)=O)=O',"[CH2][CH2][CH]=[CH][CH2]"]
    #smiles_list = ['Cc1ccc(cc1)C1C=CC2C(C=CC2(C#N)C#N)=CC=1']
    #smiles_list = ['CC1C=CC2C(C=CC2(C)C)=CC=1']
    #smiles_list = ['CC1=CC=C(C=CC2)C2C=C1']