def pair_features(mol, idx1, idx2, max_distance=7):
    """Build the feature vector describing the atom pair (idx1, idx2) in ``mol``.

    Layout of the returned 1-D array (length ``6 + max_distance + 1``):
      * ``[0:6]`` bond flags: single, double, triple, aromatic, conjugated,
        in-ring. They stay zero when the two atoms are not bonded.
      * ``[6]``   1 when the two distinct atoms appear in a common ring.
      * ``[7:]``  one-hot encoding of the graph distance between the atoms,
        where distances >= ``max_distance`` are mapped to ``max_distance - 1``.
    """
    n_bond_flags = 6
    vec = np.zeros((n_bond_flags + max_distance + 1))

    # bond type
    link = mol.GetBondBetweenAtoms(idx1, idx2)
    if link is not None:
        kind = link.GetBondType()
        bond_flags = [
            kind == Chem.rdchem.BondType.SINGLE,
            kind == Chem.rdchem.BondType.DOUBLE,
            kind == Chem.rdchem.BondType.TRIPLE,
            kind == Chem.rdchem.BondType.AROMATIC,
            link.GetIsConjugated(),
            link.IsInRing(),
        ]
        vec[:n_bond_flags] = np.array(bond_flags, dtype=int)

    # whether two atoms are in same ring
    atom_rings = mol.GetRingInfo().AtomRings()
    distinct_atoms = idx1 != idx2
    for members in atom_rings:
        if distinct_atoms and idx1 in members and idx2 in members:
            vec[n_bond_flags] = 1

    # graph distance between two atoms, capped and one-hot encoded
    hops = rdmolops.GetDistanceMatrix(mol)
    capped = np.where(hops < max_distance, hops, max_distance - 1)
    pair_hops = capped[idx1][idx2]
    vec[n_bond_flags + 1:] = to_categorical(pair_hops, num_classes=max_distance)

    return vec
def get_molecules():
    """
    Constructs rdkit mol objects derived from the .xyz files. Also returns:
        - mol ids (unique numerical ids)
        - set of molecule level features
        - arrays of xyz coordinates
        - euclidean distance matrices
        - graph distance matrices.
    All objects are returned in dictionaries with 'mol_name' as keys.
    """
    mols = {}
    mol_ids = {}
    mol_feats = {}
    xyzs = {}
    dist_matrices = {}
    graph_dist_matrices = {}

    print('Create molecules and distance matrices.')
    for mol_idx in range(C.N_MOLS):
        print_progress(mol_idx, C.N_MOLS)
        path = xyz_filepath_list[mol_idx]
        # file name without directory and without the 4-character extension
        name = path.split('/')[-1][:-4]

        # read the XYZ file: mol structure, coordinates and distance matrix
        mol, coords, euclid = mol_from_xyz(path)
        mols[name] = mol
        xyzs[name] = coords
        dist_matrices[name] = euclid
        # the molecule's position in the dataset serves as its id
        mol_ids[name] = mol_idx

        # make padded graph distance matrix dataframes
        atom_count = len(coords)
        extra_cols = C.MAX_N_ATOMS - atom_count
        # graph distance matrix from rdmolops.GetDistanceMatrix, zero-padded
        # on the right so it has C.MAX_N_ATOMS columns
        padded = np.pad(rdmolops.GetDistanceMatrix(mol),
                        [(0, 0), (0, extra_cols)], 'constant')
        frame = pd.DataFrame(padded)
        # one id entry per atom row, e.g. CH4: 5 * [0] = [0, 0, 0, 0, 0]
        frame['molecule_id'] = atom_count * [mol_idx]
        graph_dist_matrices[name] = frame

        # compute molecule level features
        # graph adjacency matrix from rdmolops.GetAdjacencyMatrix
        adjacency = rdmolops.GetAdjacencyMatrix(mol)
        # atomic numbers of the atoms, re-read from the XYZ file
        atomic_numbers, _, _ = read_xyz_file(path)
        # lower triangle of the adjacency matrix selects each bonded pair once
        bonded_mask = np.tril(adjacency).ravel() == 1
        bond_lengths = euclid.ravel()[bonded_mask]
        # mean / std of bonded-atom distances and mean atomic number
        summary = [
            np.mean(bond_lengths),
            np.std(bond_lengths),
            np.mean(atomic_numbers),
        ]
        mol_feats[name] = pd.Series(summary, index=mol_feat_columns)

    # mols, ids, molecule-level features, coordinates, distance matrices,
    # graph distance matrices
    return mols, mol_ids, mol_feats, xyzs, dist_matrices, graph_dist_matrices
def get_molecules():
    """
    Constructs rdkit mol objects derived from the .xyz files. Also returns:
        - mol ids (unique numerical ids)
        - set of molecule level features
        - arrays of xyz coordinates
        - euclidean distance matrices
        - graph distance matrices.
    All objects are returned in dictionaries with 'mol_name' as keys.
    """
    mols, mol_ids, mol_feats = {}, {}, {}
    xyzs, dist_matrices, graph_dist_matrices = {}, {}, {}

    print('Create molecules and distance matrices.')
    for index in range(C.N_MOLS):
        print_progress(index, C.N_MOLS)
        xyz_path = xyz_filepath_list[index]
        basename = xyz_path.split('/')[-1]
        key = basename[:-4]  # drop the 4-character extension

        structure, positions, pairwise = mol_from_xyz(xyz_path)
        mols[key] = structure
        xyzs[key] = positions
        dist_matrices[key] = pairwise
        mol_ids[key] = index

        # make padded graph distance matrix dataframes
        n_rows = len(positions)
        pad_spec = [(0, 0), (0, C.MAX_N_ATOMS - n_rows)]
        topo = rdmolops.GetDistanceMatrix(structure)
        topo_frame = pd.DataFrame(np.pad(topo, pad_spec, 'constant'))
        topo_frame['molecule_id'] = n_rows * [index]
        graph_dist_matrices[key] = topo_frame

        # compute molecule level features
        adjacency = rdmolops.GetAdjacencyMatrix(structure)
        numbers, _, _ = read_xyz_file(xyz_path)
        # each bond is counted once via the lower triangle of the adjacency
        is_bonded = np.tril(adjacency).ravel() == 1
        bonded = pairwise.ravel()[is_bonded]
        mol_feats[key] = pd.Series(
            [np.mean(bonded), np.std(bonded), np.mean(numbers)],
            index=mol_feat_columns,
        )

    return (mols, mol_ids, mol_feats, xyzs, dist_matrices,
            graph_dist_matrices)
def mol_to_vec(smifile, shared_fg, voltype, max_path_length, verbose=False):
    """
    mol_to_vec function

    Allows for calculating a graph-based sum of atomic contributions in
    "layers" based off atomic connectivity ranging away from a specified
    common functional group.

    arguments:
        smifile: a file listing SMILES strings, one per line; a property value
            can optionally be provided after each string. Blank lines are
            ignored.
        shared_fg: a SMILES/SMARTS pattern shared by the provided SMILES
            strings, to be used as a reference. The first atom of the match is
            the base atom. Passing ``False`` terminates the program.
        voltype: type of contributions to measure (case-insensitive):
            "crippen", "mcgowan" or "degree".
        max_path_length: number of layers to include contributions from,
            counted in bonds away from the base atom.
        verbose: print information to terminal.

    returns:
        Pandas DataFrame with one row per input structure and the columns
        ["0_<voltype>", ..., "<max_path_length-1>_<voltype>",
         "Structure" (SMILES string), "Property" (only if any were given)].
        Structures that could not be parsed or matched get an empty row.

    raises:
        SystemExit: if ``shared_fg`` is ``False``.
        ValueError: if ``voltype`` is not one of the supported options.
    """
    vol_kind = voltype.lower()
    columns = [str(col) + '_' + vol_kind for col in range(0, max_path_length)]

    if shared_fg == False:  # noqa: E712 - loose comparison kept on purpose
        sys.exit(
            "Please specify a common functional group pattern as a SMILES string using the --fg argument. Exiting."
        )

    mollist, y_val, vec_df = [], [], []

    # The origin is defined by a particular functional group/atom of interest
    # shared by all structures; the pattern does not change per molecule.
    patt = Chem.MolFromSmarts(shared_fg)

    # `with` guarantees the input file is closed even if parsing raises.
    with open(smifile) as handle:
        for line in handle:
            toks = line.split()
            if not toks:
                # Blank line: nothing to parse. Without this guard the SMILES
                # of the previous line would be silently reused.
                continue

            # expects a smiles string, optionally followed by a property value
            smi = toks[0]
            if len(toks) > 1:
                y_val.append(toks[1])
            mollist.append(smi)

            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                print("Warning - Could not parse SMILES for", smi,
                      "\nEvaluating with looser constrictions.")
                mol = Chem.MolFromSmiles(smi, sanitize=False)
                if mol is None:
                    print("Parsing Failed. Skipping this structure")
                    vec_df.append(pd.Series())
                    continue
                # Only touch the molecule once we know the parse succeeded.
                mol.UpdatePropertyCache(strict=False)

            # if a functional group pattern is specified, the base atom is
            # expected to be first in the pattern
            fg_atoms = mol.GetSubstructMatch(patt)
            if len(fg_atoms) == 0:
                print("ERR: Functional group", shared_fg,
                      "not found in molecule:", smi)
                # retry with the lower-cased pattern
                lower_fg = shared_fg.lower()
                fg_atoms = mol.GetSubstructMatch(Chem.MolFromSmarts(lower_fg))
                if len(fg_atoms) == 0:
                    print(
                        "Parsing functional group from this structure failed. Skipping this structure"
                    )
                    vec_df.append(pd.Series())
                    continue
                if verbose:
                    print("Found by parsing functional group as", lower_fg)

            base_id = fg_atoms[0]
            dist_from_base = rdmolops.GetDistanceMatrix(mol)[base_id]

            if vol_kind == 'crippen':
                # This uses Crippen's atomic contributions to molecular
                # refractivity as volumes. Atomic contributions to logP are
                # also available but unused here.
                molH = Chem.AddHs(mol)
                mrContribs = Crippen.rdMolDescriptors._CalcCrippenContribs(molH)
                _logps, mrs = zip(*mrContribs)
                # condense H atom contributions to attached heavy atom
                _mr, vols = crippenHContribs(molH, mrs)
            elif vol_kind == 'mcgowan':
                # grab mcgowan volumes
                molH = Chem.AddHs(mol)
                vols = mcgowanHContribs(molH)
            elif vol_kind == 'degree':
                vols = degreeContribs(mol)
            else:
                raise ValueError(
                    "Unsupported voltype %r; expected 'crippen', 'mcgowan' "
                    "or 'degree'" % (voltype,))

            # this is the radial count up to the max_path_length
            mol2vec = []
            for level in range(0, max_path_length):
                layer_total = 0
                for at, dist in enumerate(dist_from_base):
                    if dist == level and at not in fg_atoms:
                        # excludes the other atoms of the functional group
                        layer_total += vols[at]
                    elif at == base_id and level == 0:
                        # The base atom itself contributes at level 0; the
                        # other functional-group atoms are not counted at all.
                        layer_total += vols[at]
                mol2vec.append(layer_total)

            # create the vector from the successive graph levels
            vec_df.append(mol2vec)

    vec_df = pd.DataFrame(vec_df, columns=columns)
    vec_df['Structure'] = np.array(mollist)
    if len(y_val) > 0:
        vec_df['Property'] = np.array(y_val)
    return vec_df
def add_sample(url, features, target, As, sizes, num_molecules):
    """Extracts information from .xyz file and returns the features array,
    target array, array of adjacency matrices, array of molecule sizes, and
    total number of molecules.

    Args:
        url: path of the .xyz file; it is read as one whitespace-separated
            token stream.
        features: list of per-atom feature rows; extended in place.
        target: list of per-molecule target rows; appended to in place.
        As: sequence of three lists receiving, per molecule, sparse COO
            versions of the graph distance matrix, the bond-order matrix and
            the aromatic-bond matrix.
        sizes: list of molecule sizes; a new list is returned, the argument
            itself is not modified.
        num_molecules: number of molecules processed so far.

    Returns:
        ``(features, target, As, sizes, num_molecules)``. If anything goes
        wrong the error is printed and the inputs are returned without this
        molecule having been added to any of them.
    """
    try:
        properties = []
        with open(url, 'r') as file:
            for row in file:
                properties += row.split()

        SMILES = properties[-4]
        m = Chem.MolFromSmiles(SMILES)
        # m = Chem.AddHs(m)  # uncomment to include hydrogen atoms
        vertices = m.GetAtoms()
        d = len(vertices)
        rdPartialCharges.ComputeGasteigerCharges(m)

        # Targets: tokens 6..17 of the file, in this order.
        target_names = (
            "dipole_moment", "polarizability", "homo", "lumo", "gap", "r2",
            "zpve", "U0", "internal_energy", "enthalpy", "free_nrg",
            "heat_capacity",
        )
        raw_targets = properties[6:18]
        if len(raw_targets) != len(target_names):
            raise ValueError("expected %d target values, found %d"
                             % (len(target_names), len(raw_targets)))
        targets = dict(zip(target_names, map(float, raw_targets)))

        # This is where you decide which properties you would like to predict
        # (list several names here to predict several properties at once).
        target_row = [targets["heat_capacity"]]

        # populate the matrices and write features
        tempA = rdmolops.GetDistanceMatrix(m)  # graph distance matrix
        tempBO = np.zeros((d, d))  # bond orders
        tempAR = np.zeros((d, d))  # aromatic-bond flags
        tempfeatures = []
        for atom in vertices:
            v_i = atom.GetIdx()
            atomic_num = atom.GetAtomicNum()

            # arbitrarily large number to indicate connectivity to self
            tempBO[v_i][v_i] = 10
            for neighbor in atom.GetNeighbors():
                v_j = neighbor.GetIdx()
                bond = m.GetBondBetweenAtoms(v_i, v_j)
                tempBO[v_i][v_j] = bond.GetBondTypeAsDouble()
                tempAR[v_i][v_j] = bond.GetIsAromatic()

            # Write features (11 per atom)
            tempfeatures.append([
                atomic_num,
                int(atomic_num == 1),  # H
                int(atomic_num == 6),  # C
                int(atomic_num == 7),  # N
                int(atomic_num == 8),  # O
                int(atomic_num == 9),  # F
                int(atom.GetIsAromatic()),
                int(atom.GetHybridization()),
                atom.GetDegree(),
                atom.GetTotalValence(),
                atom.GetDoubleProp("_GasteigerCharge"),
            ])

        sparse_mats = [sp.coo_matrix(mat) for mat in (tempA, tempBO, tempAR)]

        # Everything that can fail has run; only now touch the caller's
        # containers so a bad file cannot leave them out of step.
        for bucket, mat in zip(As, sparse_mats):
            bucket.append(mat)
        target.append(target_row)

        # The first molecule is recorded with size d - 1, as in the original
        # code. NOTE(review): presumably an index offset - confirm with the
        # consumer of `sizes`.
        if num_molecules == 0:
            sizes = sizes + [d - 1]
        else:
            sizes = sizes + [d]

        num_molecules = num_molecules + 1
        if num_molecules % 5000 == 0:
            print("On the " + str(num_molecules) + "th molecule")

        features += tempfeatures
        return features, target, As, sizes, num_molecules

    except Exception as e:
        # Best-effort: report the problem and skip this file.
        print(str(e))
        return features, target, As, sizes, num_molecules
def add_sample(url, features, target, As, sizes, num_molecules,
               elements_info):
    """Extract one molecule from an .xyz file and add it to the dataset.

    Args:
        url: path of the .xyz file; it is read as one whitespace-separated
            token stream.
        features: list of per-atom feature rows; extended in place.
        target: list of per-molecule target rows; appended to in place.
        As: sequence of three lists receiving, per molecule, sparse COO
            versions of the graph distance matrix, the bond-order matrix and
            the aromatic-bond matrix.
        sizes: list of molecule sizes; a new list is returned, the argument
            itself is not modified.
        num_molecules: number of molecules processed so far.
        elements_info: indexable; item 1 is used as the mean and item 2 as the
            standard deviation for normalizing atomic numbers.

    Returns:
        ``(features, target, As, sizes, num_molecules)`` including the new
        molecule (hydrogens are added explicitly).

    Raises:
        Errors are not caught here (e.g. ``FileNotFoundError`` for a missing
        file, ``ValueError`` for malformed target values). The caller's
        containers are only modified after all parsing has succeeded.
    """
    properties = []
    with open(url, 'r') as file:
        for row in file:
            properties += row.split()

    SMILES = properties[-4]
    m = Chem.AddHs(Chem.MolFromSmiles(SMILES))
    vertices = m.GetAtoms()
    d = len(vertices)

    atomic_number_mean = elements_info[1]
    atomic_number_stdev = elements_info[2]

    # Targets: tokens 6..17 of the file, in this order.
    target_names = (
        "dipole_moment", "polarizability", "homo", "lumo", "gap", "r2",
        "zpve", "U0", "internal_energy", "enthalpy", "free_nrg",
        "heat_capacity",
    )
    raw_targets = properties[6:18]
    if len(raw_targets) != len(target_names):
        raise ValueError("expected %d target values, found %d"
                         % (len(target_names), len(raw_targets)))
    targets = dict(zip(target_names, map(float, raw_targets)))

    # Property to predict (list several names to predict several at once).
    target_row = [targets["heat_capacity"]]

    # populate the matrices and write features
    tempA = rdmolops.GetDistanceMatrix(m)  # graph distance matrix
    tempBO = np.zeros((d, d))  # bond orders
    tempAR = np.zeros((d, d))  # aromatic-bond flags
    tempfeatures = []
    for atom in vertices:
        v_i = atom.GetIdx()
        atomic_num = atom.GetAtomicNum()

        # arbitrarily large number to indicate connectivity to self
        tempBO[v_i][v_i] = 10
        for neighbor in atom.GetNeighbors():
            v_j = neighbor.GetIdx()
            bond = m.GetBondBetweenAtoms(v_i, v_j)
            tempBO[v_i][v_j] = bond.GetBondTypeAsDouble()
            tempAR[v_i][v_j] = bond.GetIsAromatic()

        # Write features (8 per atom)
        tempfeatures.append([
            (atomic_num - atomic_number_mean) / atomic_number_stdev,
            int(atomic_num == 1),  # H
            int(atomic_num == 6),  # C
            int(atomic_num == 7),  # N
            int(atomic_num == 8),  # O
            int(atomic_num == 9),  # F
            int(atom.GetIsAromatic()),
            int(atom.GetHybridization()),
        ])

    sparse_mats = [sp.coo_matrix(mat) for mat in (tempA, tempBO, tempAR)]

    # Everything that can fail has run; only now touch the caller's
    # containers so an exception cannot leave them out of step.
    for bucket, mat in zip(As, sparse_mats):
        bucket.append(mat)
    target.append(target_row)

    # The first molecule is recorded with size d - 1, as in the original
    # code. NOTE(review): presumably an index offset - confirm with the
    # consumer of `sizes`.
    if num_molecules == 0:
        sizes = sizes + [d - 1]
    else:
        sizes = sizes + [d]

    num_molecules = num_molecules + 1
    if num_molecules % 5000 == 0:
        print("On the " + str(num_molecules) + "th molecule")

    features += tempfeatures
    return features, target, As, sizes, num_molecules
def distance_matrix(mol):
    """
    Return the topological distance matrix of ``mol`` as computed by
    ``rdmolops.GetDistanceMatrix``.
    """
    topological = rdmolops.GetDistanceMatrix(mol)
    return topological