def test_rmsd():
    """Check `PandasMol2.rmsd` between `1b5e_1.mol2` and `1b5e_2.mol2`."""
    data_dir = os.path.join(this_dir, 'data')
    first = PandasMol2().read_mol2(os.path.join(data_dir, '1b5e_1.mol2'))
    second = PandasMol2().read_mol2(os.path.join(data_dir, '1b5e_2.mol2'))

    # Explicitly include every atom (heavy_only switched off).
    rmsd_all_atoms = first.rmsd(first.df, second.df, heavy_only=False)
    assert rmsd_all_atoms == 1.5523

    # Same comparison with the method's default arguments.
    rmsd_default = first.rmsd(first.df, second.df)
    assert rmsd_default == 1.1609
def data_processor(mol2s):
    """Parse a pair of mol2 entries and compute their atom matches.

    Parameters
    ----------
    mol2s : sequence
        Two entries; each entry is indexed as ``entry[0]`` (mol2 code) and
        ``entry[1]`` (mol2 lines). ``mol2s[0]`` is loaded as ``d_pdmol`` and
        ``mol2s[1]`` as ``q_pdmol`` (presumably database and query molecule;
        verify against caller).

    Returns
    -------
    tuple
        ``(code of mol2s[0], code of mol2s[1], atoms, charges)`` where
        ``atoms`` and ``charges`` come from ``get_atom_matches``.
    """
    d_entry = mol2s[0]
    q_entry = mol2s[1]

    d_pdmol = PandasMol2()
    d_pdmol.read_mol2_from_list(mol2_code=d_entry[0], mol2_lines=d_entry[1])

    q_pdmol = PandasMol2()
    q_pdmol.read_mol2_from_list(mol2_code=q_entry[0], mol2_lines=q_entry[1])

    atoms, charges = get_atom_matches(q_pdmol, d_pdmol)
    return d_entry[0], q_entry[0], atoms, charges
def data_processor_gz(mol2s_gz):
    """Parse a pair of mol2 entries with byte-string codes and match atoms.

    Works like the plain-text processor, except that the two mol2 codes are
    decoded from UTF-8 bytes before being returned.

    Parameters
    ----------
    mol2s_gz : sequence
        Two entries; each entry is indexed as ``entry[0]`` (mol2 code, an
        object with a ``decode`` method) and ``entry[1]`` (mol2 lines).

    Returns
    -------
    tuple
        ``(decoded code of entry 0, decoded code of entry 1, atoms, charges)``
        where ``atoms`` and ``charges`` come from ``get_atom_matches``.
    """
    d_entry = mol2s_gz[0]
    q_entry = mol2s_gz[1]

    d_pdmol = PandasMol2()
    d_pdmol.read_mol2_from_list(mol2_code=d_entry[0], mol2_lines=d_entry[1])

    q_pdmol = PandasMol2()
    q_pdmol.read_mol2_from_list(mol2_code=q_entry[0], mol2_lines=q_entry[1])

    atoms, charges = get_atom_matches(q_pdmol, d_pdmol)

    d_code = d_entry[0].decode('utf-8')
    q_code = q_entry[0].decode('utf-8')
    return (d_code, q_code, atoms, charges)
def SYBYL(self, atomtype_set=['Al', 'B', 'Br', 'C.1', 'C.2', 'C.3', 'C.ar', 'C.cat',
                              'Ca', 'Cl', 'F', 'H', 'Li', 'Mg', 'N.1', 'N.2', 'N.3',
                              'N.4', 'N.am', 'N.ar', 'N.pl3', 'Na', 'O.2', 'O.3',
                              'O.co2', 'P.3', 'S.2', 'S.3', 'S.O2', 'S.O', 'Si', 'Zn']):
    '''
    Convert the SMILES strings in ``self.ls_smiles`` into a SYBYL atom-type
    count array.

    Each SMILES string is converted to mol2 text with Open Babel (hydrogens
    added), written to ``molecule.mol2`` in the current working directory,
    read back with biopandas, and its ``atom_type`` column is counted.

    Parameters
    ----------
    atomtype_set : list of str
        Atom types to count; the position in this list is the column index
        of the output. The default list is only read, never mutated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(self.ls_smiles), len(atomtype_set))``. Rows of
        molecules whose conversion or parsing failed stay all-zero. Atom
        types that are not in ``atomtype_set`` are ignored.
    '''
    atomtype_to_int = {atomtype: column for column, atomtype in enumerate(atomtype_set)}
    array_fp = np.zeros((len(self.ls_smiles), len(atomtype_set)))

    for i, smi in enumerate(self.ls_smiles):
        try:
            obconversion = openbabel.OBConversion()
            obconversion.SetInAndOutFormats("smi", "mol2")
            mol = openbabel.OBMol()
            obconversion.ReadString(mol, smi)  # read molecule from database
            mol.AddHydrogens()
            output_mol2 = obconversion.WriteString(mol)  # transform smiles into mol2

            # biopandas reads from a path, so round-trip through a scratch file
            with open("molecule.mol2", "w+") as file:
                file.write(output_mol2)
            molecule_mol2 = PandasMol2().read_mol2("molecule.mol2")

            # Count once instead of recounting for every atom type.
            atomtype_counts = molecule_mol2.df['atom_type'].value_counts()
        except Exception:
            # Best effort: a molecule that cannot be converted keeps a zero
            # row. Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue

        for atomtype in atomtype_counts.index:
            column = atomtype_to_int.get(atomtype)
            if column is None:
                # Previously an unknown type raised KeyError mid-row and left
                # the row partially filled; skip it instead.
                continue
            array_fp[i, column] = atomtype_counts[atomtype]

    return array_fp
def get_mol_feature(mol2file, agl_class):
    """Build the concatenated element-pair feature vector of a mol2 file.

    For every ordered pair of elements in a fixed 10-element list, the atoms
    whose ``atom_name`` is the element symbol optionally followed by digits
    are selected and passed to ``agl_class.graph_features``. The per-pair
    results are flattened and concatenated in pair order.

    Parameters
    ----------
    mol2file : str
        Path to the mol2 file, read with biopandas.
    agl_class : object
        Must provide ``graph_features(cloudpoint1, cloudpoint2, ele1, ele2)``.

    Returns
    -------
    numpy.ndarray
        1-D array with the features of all 100 element pairs. A pair with no
        atoms for either element contributes nine zeros.
    """
    solvation = PandasMol2().read_mol2(mol2file).df
    solvation_e = ['H', 'C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br', 'I']

    # The coordinate selection depends only on the element, so compute it
    # once per element instead of once per pair.
    coordinates = solvation[['x', 'y', 'z']]
    atom_names = solvation['atom_name']
    cloudpoints = {
        ele: coordinates[atom_names.str.contains('^' + ele + '[0-9]*$')].values
        for ele in solvation_e
    }

    # Collect the pieces and concatenate once; repeated np.append copies the
    # whole array on every pair. The leading empty float array keeps the
    # dtype promotion of the original np.append chain.
    pieces = [np.array([], dtype=float)]
    for ele1 in solvation_e:
        cloudpoint1 = cloudpoints[ele1]
        for ele2 in solvation_e:  # ligand's element
            cloudpoint2 = cloudpoints[ele2]
            if cloudpoint1.shape[0] == 0 or cloudpoint2.shape[0] == 0:
                # agl_features have 9 features
                agl_features = np.zeros((9, ))
            else:
                # each pair atoms feature
                agl_features = agl_class.graph_features(cloudpoint1, cloudpoint2, ele1, ele2)
            pieces.append(np.ravel(agl_features))

    return np.concatenate(pieces)
def extract_centerdistance_data(mol, proj_direction):
    '''Extract each atom's distance to the origin after alignment.

    The mol2 file is read, its atoms are transformed with
    ``alignment(atoms, proj_direction)`` (which, per the original comments,
    aligns to the principal axes so the origin is the pocket centre), and the
    Euclidean norm of the transformed x/y/z coordinates is computed.

    :param mol: path to the mol2 file
    :param proj_direction: passed through to ``alignment``
    :return: dict mapping ``atom_id`` to its distance from the origin
    '''
    # Kept for backward compatibility: callers may rely on this global
    # pandas warning suppression being switched on here.
    pd.options.mode.chained_assignment = None

    mol2 = PandasMol2().read_mol2(mol)
    atoms = mol2.df[['atom_id', 'x', 'y', 'z']].copy()

    # Aligning to principal axes so that origin is the center of pocket
    trans_coords = alignment(atoms, proj_direction)
    atoms['x'] = trans_coords[:, 0]
    atoms['y'] = trans_coords[:, 1]
    atoms['z'] = trans_coords[:, 2]

    atomid_list = atoms['atom_id'].tolist()

    # Use only the coordinate columns. The previous version iterated over
    # rows that still contained atom_id, so it computed
    # sqrt(atom_id**2 + x**2 + y**2) and never looked at z.
    center_dist_list = [
        (x ** 2 + y ** 2 + z ** 2) ** .5
        for x, y, z in atoms[['x', 'y', 'z']].values.tolist()
    ]

    return dict(zip(atomid_list, center_dist_list))
def parseMol2(self):
    """Parse the ligand file into ``self.lig_data``, converting if needed.

    Steps:

    1. If the ligand was already parsed, return immediately.
    2. If ``self.lig_file`` does not end in ``mol2``, convert it with
       ``self._format_convert`` and point ``self.lig_file`` at the result.
    3. Read the mol2 file with biopandas. If that raises ``ValueError``,
       convert to a temporary PDB file, load it with mdtraj, build the atom
       table from its topology (coordinates scaled by 10.0) and delete the
       temporary file.

    Returns
    -------
    self or None
        ``self`` on success; ``None`` when the ligand file does not exist or
        the fallback PDB conversion produced no file.
    """
    if self.mol2_parsed_:
        return self

    if self.lig_file.split(".")[-1] != "mol2":
        out_file = self.lig_file + ".mol2"
        self._format_convert(self.lig_file, out_file)
        self.lig_file = out_file

    # Nothing to parse: report failure instead of falling through to
    # ``self.lig.df`` with no ligand loaded.
    if not os.path.exists(self.lig_file):
        return None

    try:
        self.lig = PandasMol2().read_mol2(self.lig_file)
    except ValueError:
        templ_ligfile = self.lig_file + "templ.pdb"
        self._format_convert(self.lig_file, templ_ligfile)
        if not os.path.exists(templ_ligfile):
            return None

        self.lig = mt.load_pdb(templ_ligfile)
        # Fixed attribute name: it was misspelled ``topolgy``, which raised
        # AttributeError on every use of this fallback path.
        top = self.lig.topology
        table, bond = top.to_dataframe()
        self.lig_ele = list(table['element'])
        self.coordinates_ = self.lig.xyz[0] * 10.0
        self.lig_data = table
        self.lig_data['x'] = self.coordinates_[:, 0]
        self.lig_data['y'] = self.coordinates_[:, 1]
        self.lig_data['z'] = self.coordinates_[:, 2]
        self.mol2_parsed_ = True
        os.remove(templ_ligfile)
        return self

    self.lig_data = self.lig.df
    self.get_element()
    self.get_coordinates()
    self.mol2_parsed_ = True
    return self
def voronoi_atoms_coords(bs, bs_out=None, projection=miller, proDirct=None):
    """Write the aligned atom table of a mol2 file as a tab-separated file.

    The mol2 file is read, its atoms are transformed with
    ``alignment(atoms, proDirct)``, the transformed coordinates replace the
    x/y/z columns of the mol2 table, and the table is written with
    ``float_format="%10.4f"``.

    Parameters
    ----------
    bs : str
        Path to the mol2 file. Its base name is printed.
    bs_out : str, optional
        Prefix that is string-concatenated with the file name stem (the part
        of the base name before the first dot) to form the output path. When
        ``None`` the stem alone is used, i.e. the file is written relative to
        the current working directory.
    projection : callable
        Unused here; kept for signature compatibility.
    proDirct
        Passed through to ``alignment``.

    Returns
    -------
    None
    """
    # Suppresses warning (global pandas option, kept for compatibility)
    pd.options.mode.chained_assignment = None

    filename = os.path.basename(bs)
    print(filename)

    # Read molecules in mol2 format
    mol2 = PandasMol2().read_mol2(bs)
    atoms = mol2.df[[
        'subst_id', 'subst_name', 'atom_type', 'atom_name', 'x', 'y', 'z'
    ]].copy()
    atoms.columns = [
        'res_id', 'residue_type', 'atom_type', 'atom_name', 'x', 'y', 'z'
    ]
    atoms['residue_type'] = atoms['residue_type'].apply(lambda x: x[0:3])

    # Align to principal axis
    trans_coords = alignment(atoms, proDirct)
    mol2.df['x'] = trans_coords[:, 0]
    mol2.df['y'] = trans_coords[:, 1]
    mol2.df['z'] = trans_coords[:, 2]

    # The declared default ``bs_out=None`` used to fail with
    # ``TypeError: unsupported operand`` on the concatenation below.
    out_prefix = '' if bs_out is None else bs_out
    filename_without_tail = filename.split('.')[0]
    mol2.df.to_csv(out_prefix + filename_without_tail,
                   float_format="%10.4f",
                   sep='\t',
                   index=False)
    return
def __read_mol(self, mol_path):
    """
    Load a mol2 file and turn it into an atom graph.

    The atom table is extended with a three-character residue name, the
    per-residue hydrophobicity and binding probability looked up on ``self``,
    and each atom's distance to the centre. Bonds are parsed separately with
    ``self.bond_parser`` and the graph is built by ``self.__form_graph``.

    May include pop_path and profile_path in the future (SASA and sequence
    entropy features are not attached yet).
    """
    wanted_columns = [
        'atom_id', 'subst_name', 'atom_type', 'atom_name', 'x', 'y', 'z',
        'charge'
    ]
    atoms = PandasMol2().read_mol2(mol_path).df[wanted_columns]

    def residue_of(subst_name):
        # first three characters of the substructure name
        return subst_name[0:3]

    def hydrophobicity_of(residue):
        return self.hydrophobicity[residue]

    def binding_probability_of(residue):
        return self.binding_probability[residue]

    atoms['residue'] = atoms['subst_name'].apply(residue_of)
    atoms['hydrophobicity'] = atoms['residue'].apply(hydrophobicity_of)
    atoms['binding_probability'] = atoms['residue'].apply(
        binding_probability_of)

    xyz = atoms[['x', 'y', 'z']].to_numpy()
    atoms['distance_to_center'] = self.__compute_dist_to_center(xyz)

    # Only report missing values; processing continues regardless.
    has_nan = atoms.isnull().values.any()
    if has_nan:
        print('invalid input data (containing nan):')
        print(mol_path)

    bonds = self.bond_parser(mol_path)
    return self.__form_graph(atoms, bonds, self.threshold)
def _record_problem_file(log_path, pdb_file):
    """Append ``pdb_file`` as one line to the problem log at ``log_path``."""
    with open(log_path, 'a') as outfile:
        outfile.write(pdb_file + '\n')


def transform_pdb_to_numpy(pdb_file: str,
                           experiment_type: str,
                           center: bool = False) -> Mapping[str, np.array]:
    """
    adapted in part from dMaSIF – https://github.com/FreyrS/dMaSIF/blob/master/data_preprocessing/convert_pdb2npy.py

    Read a structure file and return coordinates plus one-hot atom types.

    Files whose name ends in ``mol2`` are read with ``PandasMol2``; anything
    else is read with ``PandasPdb`` (``ATOM`` records only).

    Parameters
    ----------
    pdb_file : str
        Path to the mol2 or pdb file.
    experiment_type : str
        One of ``'pdbbind'`` or ``'scpdb'``; selects the problem-log
        directory used for mol2 files.
    center : bool
        If true, subtract the mean coordinate from all coordinates.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_atoms, 3 + len(atom_label_to_num))``: x, y, z
        followed by the one-hot atom type. NOTE(review): the return
        annotation says ``Mapping`` but the code returns an array. When the
        file cannot be read, or its first atom label is the empty string,
        the file name is appended to the problem log and ``np.zeros((1, 13))``
        is returned instead.

    Raises
    ------
    AssertionError
        If ``experiment_type`` is not one of the two accepted values.
    """
    assert experiment_type in ['pdbbind', 'scpdb']

    # Module-level mapping from element symbol to column index.
    # atom_label_to_num = {}

    if pdb_file[-4:] == 'mol2':
        problem_log = '../data/logs/' + experiment_type + '/problem_files.txt'
    else:
        problem_log = '../data/logs/problems_files.txt'

    try:
        if pdb_file[-4:] == 'mol2':
            df = PandasMol2().read_mol2(pdb_file).df
            coords = df[['x', 'y', 'z']].values
            atoms = df['atom_type'].values
        else:
            df = PandasPdb().read_pdb(pdb_file).df['ATOM']
            coords = df[['x_coord', 'y_coord', 'z_coord']].values
            atoms = df['element_symbol'].values

        if atoms[0] == '':
            atoms = None  # unusable labels, handled like a read failure
        else:
            atoms = np.vectorize(get_element_symbols)(atoms)
    except Exception:
        # Any parsing problem marks the file as bad. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt/SystemExit through.
        atoms = None

    if atoms is None:
        _record_problem_file(problem_log, pdb_file)
        return np.zeros((1, 13))

    types = np.vectorize(atom_label_to_num.__getitem__)(atoms)

    # One-hot encode the atom types in a single indexed assignment.
    types_array = np.zeros((len(types), len(atom_label_to_num)))
    types_array[np.arange(len(types)), types] = 1.0

    if center:
        coords = coords - np.mean(coords, axis=0, keepdims=True)

    combined_array = np.concatenate((coords, types_array), axis=1)
    return combined_array
def get_coords(ac_mol2_file):
    """Return the x, y, z coordinates of every atom in a mol2 file.

    The file is read with biopandas and one ``[x, y, z]`` row per atom is
    collected, in table order, into a NumPy array.
    """
    atom_table = PandasMol2().read_mol2(ac_mol2_file).df
    xyz_rows = [[row.x, row.y, row.z] for row in atom_table.itertuples()]
    return np.array(xyz_rows)
def __init__(self, ligand_fn):
    """Read the ligand mol2 file and initialise the parser state.

    ``self.lig`` holds the biopandas object and ``self.lig_data`` its atom
    table; elements and coordinates start out unset and the parsed flag is
    ``False``.
    """
    # State filled in later by the parsing methods.
    self.lig_ele = None
    self.coordinates = None
    self.mol2_parsed_ = False

    ligand = PandasMol2().read_mol2(ligand_fn)
    self.lig = ligand
    self.lig_data = ligand.df
def test_read_mol2_from_list():
    """Parse the first molecule of a multi-mol2 file from its line list."""
    data_path = os.path.join(this_dir, 'data', '40_mol2_files.mol2')

    # split_multimol2 yields entries indexed as [0] = code, [1] = lines
    first_entry = next(split_multimol2(data_path))
    code = first_entry[0]
    lines = first_entry[1]

    pdmol = PandasMol2().read_mol2_from_list(mol2_lines=lines, mol2_code=code)

    assert pdmol.df.shape == (65, 9)
    assert pdmol.code == 'ZINC38611810'
def extract_seq_entropy_data(profile, mol):
    '''Extract per-residue sequence entropy from a .profile file.

    Every non-blank line of the profile is split on whitespace: the first
    token is the single-letter residue code, the remaining tokens are
    probabilities. The first parsed line is discarded. Residues are
    renamed to their three-letter code (via ``amino_single_to_triple``) plus
    their 1-based position, and the entropy ``-sum(p * log2(p))`` is computed
    per residue, with non-finite logarithms treated as 0.

    :param profile: path to the .profile file
    :param mol: path to the mol2 file whose ``subst_name`` values select
        the residues to return
    :return: dict mapping each ``subst_name`` that also occurs in the
        profile to its entropy (float)
    '''
    # Kept for backward compatibility: global pandas warning suppression.
    pd.options.mode.chained_assignment = None

    mol2 = PandasMol2().read_mol2(mol)
    siteresidue_list = mol2.df['subst_name'].tolist()

    # Opening and formatting lists of the probabilities and residues.
    # A distinct handle name avoids shadowing the ``profile`` argument.
    ressingle_list = []
    probdata_list = []
    with open(profile) as profile_file:
        for line in profile_file:
            line_list = line.split()
            if not line_list:
                # A blank line used to raise IndexError on line_list[0].
                continue
            ressingle_list.append(line_list[0])
            probdata_list.append(list(map(float, line_list[1:])))

    # Drop the first parsed line
    ressingle_list = ressingle_list[1:]
    probdata_list = probdata_list[1:]

    # Changing single letter amino acid to triple letter with
    # its corresponding number
    restriple_list = [
        amino_single_to_triple(res) + str(position)
        for position, res in enumerate(ressingle_list, start=1)
    ]

    # Calculating information entropy
    with np.errstate(divide='ignore'):
        prob_array = np.asarray(probdata_list)
        log_array = np.log2(prob_array)
    # change all infinite values to 0
    log_array[~np.isfinite(log_array)] = 0
    entropy_array = log_array * prob_array
    entropydata_list = (np.sum(a=entropy_array, axis=1) * -1).tolist()

    # Matching amino acids from .mol2 and .profile files and creating dictionary
    fullprotein_data = dict(zip(restriple_list, entropydata_list))
    seq_entropy_data = {
        k: float(fullprotein_data[k])
        for k in siteresidue_list if k in fullprotein_data
    }
    return seq_entropy_data
def test_overwrite_df():
    """Assigning to `PandasMol2.df` must raise AttributeError with a hint."""
    data_path = os.path.join(this_dir, 'data', '1b5e_1.mol2')
    pdmol = PandasMol2().read_mol2(data_path)

    expected_message = ('Please use `PandasMol2._df = ... ` instead\n'
                        'of `PandasMol2.df = ... ` if you are sure that\n'
                        'you want to overwrite the `df` attribute.')

    def assign_filtered_frame():
        # Attempt to replace the public attribute with a filtered table.
        not_hydrogen = pdmol.df['atom_type'] != 'H'
        pdmol.df = pdmol.df[not_hydrogen]

    assert_raises(AttributeError, expected_message, assign_filtered_frame)
def check_charge(filename, charge):
    """
    Compare the summed atom charges of a mol2 file with an expected value.

    The ``charge`` column is summed and rounded to five decimals; the result
    of the comparison is printed, nothing is returned.

    Parameters
    ----------
    filename : str
        Path to the mol2 file.
    charge : float
        Expected net charge.

    """
    atom_table = PandasMol2().read_mol2(filename)._df
    sum_charge = round(atom_table.charge.sum(), 5)

    if sum_charge != charge:
        print('Check failed! The charge is: {:0.4f}'.format(sum_charge))
    else:
        print('Check passed!')
def extract_charge_data(mol):
    '''Return a dict mapping each atom_id of a mol2 file to its charge.'''
    # Switches off pandas' chained-assignment warning globally.
    pd.options.mode.chained_assignment = None

    # Only the atom_id and charge columns are needed.
    charge_table = PandasMol2().read_mol2(mol).df[['atom_id', 'charge']]

    charges = charge_table['charge'].tolist()
    atom_ids = charge_table['atom_id'].tolist()
    return dict(zip(atom_ids, charges))
def test_read_mol2():
    """Read the plain and the gzipped multi-mol2 file and check the result."""
    expected_columns = ['atom_id', 'atom_name', 'x', 'y', 'z',
                        'atom_type', 'subst_id', 'subst_name', 'charge']

    for file_name in ('40_mol2_files.mol2', '40_mol2_files.mol2.gz'):
        data_path = os.path.join(this_dir, 'data', file_name)
        pdmol = PandasMol2().read_mol2(data_path)

        # table contents
        assert pdmol.df.shape == (65, 9)
        assert pdmol.code == 'ZINC38611810'
        assert expected_columns == list(pdmol.df.columns)

        # bookkeeping attributes
        assert len(pdmol.mol2_text) == 6469
        assert pdmol.mol2_path == data_path
def _from_mol2_text(cls, mol2_text, verbose=False):
    """
    Get structural data from mol2 text as DataFrame.

    The text is first parsed with the 10-column layout
    (``MOL2_COLUMNS["n_cols_10"]``); if pandas reports that the data only
    has 9 columns, parsing is retried with ``MOL2_COLUMNS["n_cols_9"]``.

    Parameters
    ----------
    mol2_text : str
        Mol2 file content from KLIFS database.
    verbose : bool
        Show only default columns (False) or additionally input-format specific columns (True).

    Returns
    -------
    pandas.DataFrame
        Structural data.

    Raises
    ------
    ValueError
        If no structural data could be loaded from the text, or if parsing
        fails for a reason other than the 10-vs-9 column mismatch.
    """
    mol2_lines = mol2_text.split("\n")

    # Use biopandas to parse the mol2 format and return a DataFrame
    try:
        pmol = PandasMol2()
        try:
            mol2_df = pmol.read_mol2_from_list(
                mol2_lines, "mol", columns=MOL2_COLUMNS["n_cols_10"]).df
        except ValueError as e:
            if str(e) != "10 columns passed, passed data had 9 columns":
                raise
            mol2_df = pmol.read_mol2_from_list(
                mol2_lines, "mol", columns=MOL2_COLUMNS["n_cols_9"]).df
    except UnboundLocalError as e:
        # biopandas fails on ``first_idx`` when the text has no atom section.
        # Match on the variable name only: the interpreter's wording of this
        # message changed in Python 3.11, so an exact string comparison no
        # longer recognises it there.
        if "first_idx" not in str(e):
            raise
        raise ValueError(
            "No structural data could be loaded. Is the input text in mol2 format?"
        ) from e

    # Infer residue PDB ID and name from substructure name
    mol2_df = cls._split_mol2_subst_names(mol2_df)

    # Format DataFrame
    mol2_df = cls._format_dataframe(mol2_df, verbose)
    return mol2_df
def extract_sasa_data(mol, pop):
    """Extract accessible surface area data from a POPSlegacy .out file.

    The .out file (generated by POPSlegacy,
    https://github.com/Fraternalilab/POPSlegacy) is matched against the
    binding site in the mol2 file: a line with exactly 12 whitespace-separated
    fields is used when fields 2 and 4 concatenated equal one of the mol2
    ``subst_name`` values; field 7 is then taken as the SASA value.

    Values that parse to NaN (e.g. ``-nan``) are replaced by the median of
    the remaining values. If every matched value is NaN they are left as NaN.

    Parameters
    ----------
    mol : str
        Path to the mol2 file.
    pop : str
        Path to the POPSlegacy .out file.

    Returns
    -------
    dict
        Maps the 1-based position of each matched line to its SASA value.

    Raises
    ------
    statistics.StatisticsError
        If no line of the .out file matches a binding-site residue.
    """
    import math

    # Kept for backward compatibility: global pandas warning suppression.
    pd.options.mode.chained_assignment = None

    # Only subst_name is needed for matching; a set gives O(1) lookups.
    mol2 = PandasMol2().read_mol2(mol)
    siteresidues = set(mol2.df['subst_name'].tolist())

    # Extracting sasa data from .out file
    qsasa_list = []
    with open(pop) as popsa:
        for line in popsa:
            line_list = line.split()
            if len(line_list) != 12:
                continue
            residue_type = line_list[2] + line_list[4]
            if residue_type in siteresidues:
                qsasa_list.append(float(line_list[7]))

    # The old code compared the already-converted floats with the string
    # '-nan', which never matched, so NaNs were neither replaced nor kept
    # out of the median. Detect them numerically instead.
    finite_values = [value for value in qsasa_list if not math.isnan(value)]
    if finite_values or not qsasa_list:
        # With no matched lines this raises StatisticsError, as before.
        median = statistics.median(finite_values)
    else:
        median = float('nan')

    return {
        position: float(median if math.isnan(value) else value)
        for position, value in enumerate(qsasa_list, start=1)
    }
def __read_mol(self, mol_path, label):
    """
    Load a mol2 file and build the labelled atom graph.

    Adds a three-character residue name plus the per-residue hydrophobicity
    and binding probability looked up on ``self``, keeps only the feature
    columns, and hands them to ``self.__form_graph`` together with
    ``self.threshold`` and ``label``.
    """
    source_columns = [
        'atom_id', 'subst_name', 'atom_type', 'atom_name', 'x', 'y', 'z',
        'charge'
    ]
    feature_columns = [
        'atom_type', 'residue', 'x', 'y', 'z', 'charge', 'hydrophobicity',
        'binding_probability'
    ]

    def residue_of(subst_name):
        # first three characters of the substructure name
        return subst_name[0:3]

    def hydrophobicity_of(residue):
        return self.hydrophobicity[residue]

    def binding_probability_of(residue):
        return self.binding_probability[residue]

    atoms = PandasMol2().read_mol2(mol_path).df[source_columns]
    atoms['residue'] = atoms['subst_name'].apply(residue_of)
    atoms['hydrophobicity'] = atoms['residue'].apply(hydrophobicity_of)
    atoms['binding_probability'] = atoms['residue'].apply(
        binding_probability_of)

    features = atoms[feature_columns]
    return self.__form_graph(features, self.threshold, label)
def data_processor(mol2):
    """Return the mol2 code if two atom selections lie within a distance range.

    ``SELECTION[0]`` picks the reference atoms and ``SELECTION[1]`` the
    target atoms; both are expression strings evaluated with ``pd.eval``.
    The molecule matches when at least one target atom is at a distance
    ``d`` from at least one reference atom with
    ``DISTANCE[0] <= d <= DISTANCE[1]``.

    Parameters
    ----------
    mol2 : sequence
        ``mol2[0]`` is the mol2 code, ``mol2[1]`` the mol2 lines.

    Returns
    -------
    str
        ``mol2[0]`` on a match, otherwise the empty string.
    """
    # NOTE: the local name ``pdmol`` and the in-function ``pd.eval`` calls
    # must stay as they are; the SELECTION expressions presumably refer to
    # ``pdmol`` and pd.eval resolves names in the calling frame.
    # NOTE(review): SELECTION is evaluated as an expression; confirm it never
    # comes from untrusted input.
    pdmol = PandasMol2().read_mol2_from_list(mol2_lines=mol2[1],
                                             mol2_code=mol2[0])

    coordinates = pdmol.df.loc[pd.eval(SELECTION[0]), ['x', 'y', 'z']].values

    pdmol._df = pdmol._df[pd.eval(SELECTION[1])]

    lower, upper = DISTANCE[0], DISTANCE[1]
    for xyz in coordinates:
        distances = pdmol.distance(xyz).values

        # Both bounds must hold for the same atom. The previous
        # ``(d >= lower).any() and (d <= upper).any()`` also matched when one
        # atom was too far and a different atom too close, with none in range.
        if ((distances >= lower) & (distances <= upper)).any():
            return mol2[0]

    return ''
def parseMol2(self):
    """Parse ``self.lig_file`` as mol2, falling back to PDB on a parse error.

    On success the atom table, elements and coordinates are populated and
    ``self.ligand_parsed_`` is set. If biopandas raises ``ValueError``, the
    file is converted to a temporary PDB file, which is parsed with
    ``self.parsePDB`` and then deleted.

    Returns ``self`` in either case.
    """
    try:
        self.lig = PandasMol2().read_mol2(self.lig_file)
    except ValueError:
        print(
            "INFO: Warning, parse mol2 file error, converting to PDB instead ......"
        )
        # convert mol2 format to pdb format with rdkit
        templ_ligfile = self.lig_file + "templ.pdb"
        self._format_convert(self.lig_file, templ_ligfile)

        converted = os.path.exists(templ_ligfile)
        if converted:
            self.parsePDB(templ_ligfile)
            os.remove(templ_ligfile)
        return self
    else:
        self.lig_data = self.lig.df
        self.get_element()
        self.get_coordinates()
        self.ligand_parsed_ = True
        return self
def _mol2_text_to_dataframe(mol2_text):
    """
    Get structural data from mol2 text.

    The text is first parsed with ten columns (the standard nine plus
    ``backbone``); if that raises ``ValueError`` it is parsed again with the
    nine standard columns only.

    Parameters
    ----------
    mol2_text : str
        Mol2 file content from KLIFS database.

    Returns
    -------
    pandas.DataFrame
        Structural data.
    """
    standard_spec = [
        ('atom_id', int), ('atom_name', str), ('x', float), ('y', float),
        ('z', float), ('atom_type', str), ('subst_id', int),
        ('subst_name', str), ('charge', float)
    ]
    extended_spec = standard_spec + [('backbone', str)]

    pmol = PandasMol2()

    def construct(spec):
        # split the (name, type) pairs into the two parallel lists
        return pmol._construct_df(mol2_text.splitlines(True),
                                  col_names=[name for name, _ in spec],
                                  col_types=[kind for _, kind in spec])

    try:
        mol2_df = construct(extended_spec)
    except ValueError:
        mol2_df = construct(standard_spec)

    return mol2_df
def _mol2_file_to_dataframe(mol2_file):
    """
    Get structural data from mol2 file.

    The file is first read with a ten-column layout (the standard nine
    columns plus ``backbone``); if that raises ``ValueError`` it is read
    again with biopandas' default columns.

    Parameters
    ----------
    mol2_file : pathlib.Path or str
        Path to mol2 file.

    Returns
    -------
    object
        Whatever ``PandasMol2.read_mol2`` returns. NOTE(review): no ``.df``
        is taken here, so this is presumably the ``PandasMol2`` object
        rather than a bare DataFrame; verify against callers.
    """
    path_text = str(Path(mol2_file))

    column_spec = [
        ('atom_id', int), ('atom_name', str), ('x', float), ('y', float),
        ('z', float), ('atom_type', str), ('subst_id', int),
        ('subst_name', str), ('charge', float), ('backbone', str)
    ]
    # biopandas expects {column index: (name, type)}
    columns_by_index = dict(enumerate(column_spec))

    pmol = PandasMol2()
    try:
        mol2_df = pmol.read_mol2(path_text, columns=columns_by_index)
    except ValueError:
        mol2_df = pmol.read_mol2(path_text)

    return mol2_df
def load_mol2(path):
    """Load a mol2 file into the dictionary produced by ``generate_dict``.

    Coordinates, atom names and residue names from the biopandas atom table
    are passed to ``generate_dict``; the partial charges are stored under
    ``'charge'`` and the first molecule read by pybel under ``'smarts'``.
    """
    atom_table = PandasMol2().read_mol2(path).df

    x_coords = atom_table['x'].values
    y_coords = atom_table['y'].values
    z_coords = atom_table['z'].values
    atom_names = atom_table['atom_name'].values
    residue_names = atom_table['subst_name'].values
    partial_charge = atom_table['charge'].values

    # first molecule of the file as read by pybel
    first_molecule = next(pybel.readfile('mol2', path))

    pro_dict = generate_dict(x_coords, y_coords, z_coords, atom_names,
                             residue_names)
    pro_dict.update({'charge': partial_charge, 'smarts': first_molecule})
    return pro_dict
def ligands_reader():
    '''
    Parses selected MOL2 file with structures of previously docked ligands
    using BioPandas module. Lists all atoms from all ligands with their
    coordinates.

    A file dialog is opened in the ``files`` directory next to the current
    working directory's parent.

    :return: one ``[atom_name, atom_id, x, y, z, model_number]`` row per
        atom, where ``model_number`` is the 1-based position of the ligand
        in the file; an empty list if the dialog was cancelled
    :rtype: list of lists
    '''
    window = Tk()
    path = os.path.normpath(os.getcwd() + os.sep + os.pardir)
    path = os.path.join(path, 'files')
    # Pass the computed directory; the literal string 'path' was passed
    # before, so the dialog never opened in the intended folder.
    ligands_path_string = filedialog.askopenfilename(
        initialdir=path,
        title="SELECT LIGANDS STRUCTURE:",
        filetypes=(("MOL2 files", "*.mol2"), ("all files", "*.*")))
    window.destroy()

    # Cancelling the dialog yields an empty value; there is nothing to read.
    if not ligands_path_string:
        return []

    ligands_data = []
    # split_multimol2 reads the file itself, so no separate open() is needed.
    for model_number, ligand in enumerate(
            split_multimol2(ligands_path_string), start=1):
        pmol = PandasMol2().read_mol2_from_list(mol2_lines=ligand[1],
                                                mol2_code=ligand[0])
        atom_coord = pmol.df[['atom_name', 'atom_id', 'x', 'y', 'z']]
        atom_coord = atom_coord.assign(column=model_number)
        # extend in place instead of rebuilding the list for every ligand
        ligands_data.extend(atom_coord.values.tolist())

    return ligands_data
# SYBYL atom types that ECFP_SYBYL writes into the row; others are ignored.
_ECFP_SYBYL_ATOM_TYPES = frozenset([
    'Al', 'B', 'Br', 'C.1', 'C.2', 'C.3', 'C.ar', 'C.cat', 'Ca', 'Cl', 'F',
    'H', 'Li', 'Mg', 'N.1', 'N.2', 'N.3', 'N.4', 'N.am', 'N.ar', 'N.pl3',
    'Na', 'O.2', 'O.3', 'O.co2', 'P.3', 'S.2', 'S.3', 'S.O2', 'S.O', 'Si',
    'Zn'
])


def ECFP_SYBYL(row):
    """Fill a row with SYBYL atom-type counts and Morgan fingerprint bits.

    The SMILES string in ``row["SMILES"]`` is converted to mol2 text with
    Open Babel (hydrogens added), written to ``molecule.mol2`` in the current
    working directory and read back with biopandas. For every known SYBYL
    atom type present in the molecule, ``row[atom_type]`` is set to its
    count. Then the RDKit Morgan fingerprint is computed with the
    module-level ``radius`` and ``nbits`` and each bit character is stored
    under ``row[str(i)]``.

    Parameters
    ----------
    row : mapping
        Must contain ``"SMILES"``; it is modified in place.

    Returns
    -------
    The same ``row`` on success, ``None`` if any step failed (a message is
    printed in that case).
    """
    try:
        obconversion = openbabel.OBConversion()
        obconversion.SetInAndOutFormats("smi", "mol2")
        mol = openbabel.OBMol()
        obconversion.ReadString(mol, row["SMILES"])  # read molecule from database
        mol.AddHydrogens()
        output_mol2 = obconversion.WriteString(mol)  # transform smiles into mol2

        # biopandas reads from a path, so round-trip through a scratch file;
        # ``with`` closes the handle even when the write fails.
        with open("molecule.mol2", "w+") as file:
            file.write(output_mol2)
        molecule_mol2 = PandasMol2().read_mol2("molecule.mol2")

        # Count once; the old if-chain recounted for every atom type.
        atom_type_counts = molecule_mol2.df['atom_type'].value_counts()
        for element in atom_type_counts.index:
            if element in _ECFP_SYBYL_ATOM_TYPES:
                row[element] = atom_type_counts[element]

        mol = Chem.MolFromSmiles(row['SMILES'])
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius,
                                                   nBits=nbits).ToBitString()
        for i in range(nbits):
            row[str(i)] = fp[i]
        return row
    except Exception:
        # Best effort per row. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate.
        print(row["SMILES"], "ECFP feature something is wrong!!")
        return None
# SYBYL atom types that SYBYL(row) writes into the row; others are ignored.
_SYBYL_ROW_ATOM_TYPES = frozenset([
    'Al', 'B', 'Br', 'C.1', 'C.2', 'C.3', 'C.ar', 'C.cat', 'Ca', 'Cl', 'F',
    'H', 'Li', 'Mg', 'N.1', 'N.2', 'N.3', 'N.4', 'N.am', 'N.ar', 'N.pl3',
    'Na', 'O.2', 'O.3', 'O.co2', 'P.3', 'S.2', 'S.3', 'S.O2', 'S.O', 'Si',
    'Zn'
])


def SYBYL(row):
    """Fill a row with the SYBYL atom-type counts of its SMILES string.

    The SMILES string in ``row["SMILES"]`` is converted to mol2 text with
    Open Babel (hydrogens added), written to ``molecule.mol2`` in the current
    working directory and read back with biopandas. For every known SYBYL
    atom type present in the molecule, ``row[atom_type]`` is set to its
    count.

    Parameters
    ----------
    row : mapping
        Must contain ``"SMILES"``; it is modified in place.

    Returns
    -------
    The same ``row`` on success, ``None`` if any step failed (a message is
    printed in that case).
    """
    try:
        obconversion = openbabel.OBConversion()
        obconversion.SetInAndOutFormats("smi", "mol2")
        mol = openbabel.OBMol()
        obconversion.ReadString(mol, row["SMILES"])  # read molecule from database
        mol.AddHydrogens()
        output_mol2 = obconversion.WriteString(mol)  # transform smiles into mol2

        # biopandas reads from a path, so round-trip through a scratch file;
        # ``with`` closes the handle even when the write fails.
        with open("molecule.mol2", "w+") as file:
            file.write(output_mol2)
        molecule_mol2 = PandasMol2().read_mol2("molecule.mol2")

        # Count once; the old if-chain recounted for every atom type.
        atom_type_counts = molecule_mol2.df['atom_type'].value_counts()
        for element in atom_type_counts.index:
            if element in _SYBYL_ROW_ATOM_TYPES:
                row[element] = atom_type_counts[element]
        return row
    except Exception:
        # Best effort per row. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt and SystemExit propagate.
        print(row["SMILES"], "SYBYL something is wrong!!")
        return None
def voronoi_atoms(bs,
                  cmap,
                  colorby,
                  bs_out=None,
                  size=None,
                  dpi=None,
                  alpha=1,
                  save_fig=True,
                  projection=miller,
                  proDirct=None):
    """Draw the 2-D Voronoi diagram of the atoms of a mol2 file.

    The atoms are aligned with ``alignment(atoms, proDirct)``, projected to
    2-D with ``projection``, tessellated, and every Voronoi cell is drawn as
    a polygon coloured according to ``colorby``.

    Parameters
    ----------
    bs : str
        Path to the mol2 file.
    cmap : dict
        For ``colorby`` in ``"atom_type"``/``"residue_type"``: maps each type
        to a dict with a ``"color"`` entry. Ignored (rebuilt) for
        ``"residue_num"``.
    colorby : str
        ``"atom_type"``, ``"residue_type"`` or ``"residue_num"``.
    bs_out : str, optional
        Output image path; defaults to ``'out.jpg'``.
    size : int, optional
        Image edge length in pixels; defaults to 128.
    dpi : int, optional
        Figure resolution; defaults to 120.
    alpha : float
        Opacity of the cells.
    save_fig : bool
        Whether to write the image to ``bs_out``.
    projection : callable
        Called as ``projection(x, y, z)``; elements 0 and 1 of its result are
        used as the 2-D coordinates.
    proDirct
        Passed through to ``alignment``.

    Returns
    -------
    tuple
        ``(atoms, vor, img)``: the annotated atom table, the
        ``Voronoi`` object and the image from ``fig_to_numpy``.

    Raises
    ------
    ValueError
        If ``colorby`` is not one of the supported values.
    """
    # Suppresses warning (global pandas option, kept for compatibility)
    pd.options.mode.chained_assignment = None

    # Read molecules in mol2 format
    mol2 = PandasMol2().read_mol2(bs)
    atoms = mol2.df[[
        'subst_id', 'subst_name', 'atom_type', 'atom_name', 'x', 'y', 'z'
    ]].copy()
    atoms.columns = [
        'res_id', 'residue_type', 'atom_type', 'atom_name', 'x', 'y', 'z'
    ]
    atoms['residue_type'] = atoms['residue_type'].apply(lambda x: x[0:3])

    # Align to principal axis
    trans_coords = alignment(atoms, proDirct)
    atoms['x'] = trans_coords[:, 0]
    atoms['y'] = trans_coords[:, 1]
    atoms['z'] = trans_coords[:, 2]

    # convert 3D to 2D
    atoms["P(x)"] = atoms[['x', 'y', 'z']].apply(
        lambda coord: projection(coord.x, coord.y, coord.z)[0], axis=1)
    atoms["P(y)"] = atoms[['x', 'y', 'z']].apply(
        lambda coord: projection(coord.x, coord.y, coord.z)[1], axis=1)

    # setting output image size, labels off, set 120 dpi w x h
    size = 128 if size is None else size
    dpi = 120 if dpi is None else dpi

    # figsize is in inches, dpi is the resolution of the figure
    figure = plt.figure(figsize=(int(size) / int(dpi), int(size) / int(dpi)),
                        dpi=int(dpi))
    ax = figure.add_subplot(111)
    ax.axis('off')
    ax.tick_params(axis='both',
                   bottom=False,
                   left=False,
                   right=False,
                   labelleft=False,
                   labeltop=False,
                   labelright=False,
                   labelbottom=False)

    # Compute Voronoi tesselation
    vor = Voronoi(atoms[['P(x)', 'P(y)']])
    regions, vertices = voronoi_finite_polygons_2d(vor)
    polygons = [vertices[reg] for reg in regions]
    atoms.loc[:, 'polygons'] = polygons

    # Check alpha
    alpha = float(alpha)

    # Color by colorby
    if colorby in ["atom_type", "residue_type"]:
        colors = [cmap[_type]["color"] for _type in atoms[colorby]]
    elif colorby == "residue_num":
        residue_ids = set(atoms["res_id"])
        cmap = k_different_colors(len(residue_ids))
        cmap = {res_num: color for res_num, color in zip(residue_ids, cmap)}
        colors = atoms["res_id"].apply(lambda x: cmap[x])
    else:
        raise ValueError(
            "colorby must be 'atom_type', 'residue_type' or 'residue_num'")
    atoms["color"] = colors

    for i, row in atoms.iterrows():
        colored_cell = matplotlib.patches.Polygon(row["polygons"],
                                                  facecolor=row['color'],
                                                  edgecolor=row['color'],
                                                  alpha=alpha,
                                                  linewidth=0.2)
        ax.add_patch(colored_cell)

    ax.set_xlim(vor.min_bound[0], vor.max_bound[0])
    ax.set_ylim(vor.min_bound[1], vor.max_bound[1])

    # Output image saving in any format; default jpg
    bs_out = 'out.jpg' if bs_out is None else bs_out

    # Get image as numpy array
    figure.tight_layout(pad=0)
    img = fig_to_numpy(figure, alpha=alpha)

    if save_fig:
        # Operate on this figure explicitly instead of pyplot's implicit
        # "current figure".
        figure.subplots_adjust(bottom=0, top=1, left=0, right=1)
        # ``frameon`` was deprecated for savefig in matplotlib 3.1 and later
        # removed; a fully transparent facecolor is the documented way to
        # emulate ``frameon=False`` and also works on older versions.
        figure.savefig(bs_out, facecolor='none', pad_inches=0)

    plt.close(figure)
    del figure

    return atoms, vor, img