def get_smiles_from_csd():
    """Collect the SMILES of the two co-formers of every listed co-crystal.

    Reads ``datasets/train_data/cocrystals2020.csv`` (latin1 encoded) and, for
    every value of its ``csd_id`` column, looks the identifier up in the CSD.
    The molecule SMILES is split on ``'.'``, passed through ``Remove`` and the
    first two remaining components are recorded.

    Returns:
        tuple: ``(co_crystals, smiles1, smiles2)`` where ``co_crystals`` is
        the DataFrame read from the CSV and ``smiles1`` / ``smiles2`` are
        lists with one SMILES string per row of that frame.

    Note:
        ``Remove`` is defined elsewhere; presumably it filters or
        de-duplicates the SMILES components -- TODO confirm. Indexing its
        result with ``[1]`` fails if fewer than two components remain.
    """
    co_crystals = pd.read_csv('datasets/train_data/cocrystals2020.csv',
                              encoding='latin1')

    # Open both CSD readers once; re-opening them for every identifier
    # repeats the same set-up work on each pass through the loop.
    csd = MoleculeReader('CSD')
    csd_reader = io.EntryReader('CSD')

    smiles1 = []
    smiles2 = []
    # Only consumed by the disabled CSV export below; kept so that export can
    # be re-enabled without further changes.
    year = []
    for refcode in co_crystals.csd_id.values:
        year.append(csd_reader.entry(refcode).publication.year)
        components = Remove(csd.molecule(refcode).smiles.split('.'))
        smiles1.append(components[0])
        smiles2.append(components[1])

    # Disabled export of the combined table:
    # cocrystal_data = pd.concat([co_crystals,
    #                             pd.DataFrame(smiles1, columns=['smiles1']),
    #                             pd.DataFrame(smiles2, columns=['smiles2']),
    #                             pd.DataFrame(year, columns=['year'])], axis=1)
    # cocrystal_data.to_csv('datasets/train_data/all_cocrystals_info.csv')
    return co_crystals, smiles1, smiles2
def create_het_chembl_mapping_dict(filename=None):
    """Map ChEMBL ids to the PDB het codes found in an overlay SDF file.

    Every entry of the SDF file contributes the third ``_``-separated field
    of its molecule identifier as a het code. For each het code the PDBe
    compound-mappings endpoint is queried through ``SearchAPI`` and every
    returned record that carries a non-empty ``chembl_id`` is stored.

    :param filename: sdf file for overlays
    :return: dictionary keyed by ChEMBL id with the het code as value; when
        the same ChEMBL id is returned for several het codes the last one
        processed wins
    """
    identifiers = [entry.molecule.identifier
                   for entry in io.EntryReader(filename)]
    het_codes = [identifier.split("_")[2] for identifier in identifiers]

    mapping = {}
    for het_code in het_codes:
        url = 'https://www.ebi.ac.uk/pdbe/api/pdb/compound/mappings/{}'.format(het_code)
        response = SearchAPI(search_url=url).run_search()
        if not response:
            # Nothing came back for this het code.
            continue
        for records in response.values():
            for record in records:
                chembl_id = record.get('chembl_id')
                if chembl_id:
                    mapping[chembl_id] = het_code
    return mapping
def __init__(self, cds_ids: List[str]) -> None:
    """Prepare lookup tables, regular expressions and a CSD reader.

    Args:
        cds_ids (List[str]): list of CSD database identifiers; stored
            unchanged as ``self.csd_ids``

    Returns:
        None
    """
    # Forward and reverse lookup between the keys and values supplied by
    # SymbolNameDict (presumably element symbol -> element name; confirm).
    self.symbol_name_dict = SymbolNameDict().get_symbol_name_dict()
    self.name_symbol_dict = {}
    for symbol, name in self.symbol_name_dict.items():
        self.name_symbol_dict[name] = symbol

    # One alternation over all dictionary values, shared by every pattern.
    symbol_regex = "|".join(self.symbol_name_dict.values())
    ignore_case = re.IGNORECASE

    self.symbol_regex = re.compile(symbol_regex)
    # A name directly followed by a parenthesised run of i / v / 0.
    self.regex = re.compile(
        "((?:{})\\([iv0]+\\))".format(symbol_regex), ignore_case)
    # A name with no opening parenthesis anywhere after it.
    self.not_ox_regex = re.compile(
        "((?:{})[^\\(]*$)".format(symbol_regex), ignore_case)
    # A name directly followed by a parenthesised negative integer.
    self.negative_regex = re.compile(
        "((?:{})\\(-[1234567890]+\\))".format(symbol_regex), ignore_case)

    self.csd_ids = cds_ids
    self.csd_reader = io.EntryReader("CSD")
def generate_id_list(num_samples=1009141):
    """Sample some random entries from the CSD.

    Args:
        num_samples: number of distinct entries to draw (without
            replacement).

    Returns:
        list: the ``identifier`` of each sampled entry, in sampling order.

    Raises:
        ValueError: raised by ``random.sample`` when ``num_samples`` is
            larger than the number of entries in the reader.
    """
    csd_reader = io.EntryReader('CSD')
    # Sample straight from the range object: it is a sequence, so there is no
    # need to build a throw-away list with one int per database entry first.
    sampled_indices = random.sample(range(len(csd_reader)), num_samples)
    return [csd_reader[index].identifier for index in sampled_indices]
def extractCrystalCif(filepath):
    """Return the crystal of the first entry in ``<filepath>.cif``.

    The crystal is also written to the module logger at INFO level and
    handed to ``displayCrystal`` before it is returned.

    TODO (from the original author): can molecule information be stored
    inside so that it ends up in the crystal?

    :param filepath: path of the CIF file without its ``.cif`` extension
    :return: the ``crystal`` attribute of the first entry read from the file
    """
    cif_path = filepath + ".cif"
    first_entry = io.EntryReader(cif_path)[0]

    logger.info(first_entry.crystal)
    displayCrystal(first_entry.crystal)
    return first_entry.crystal
def get_neighbor_atoms(self, query_atom, path_mol2_file_dir):
    """Count which elements are bonded to a given metal across mol2 files.

    Every ``*.mol2`` file in ``path_mol2_file_dir`` is opened and its first
    entry kept; ``self.entry_reader`` is replaced by that list of entries.
    For each entry, the neighbours of every atom matching ``query_atom`` are
    collected, and each distinct neighbouring element is counted once per
    entry.

    :param query_atom: metal element symbol such as Sr, K or Na, type: string
    :param path_mol2_file_dir: absolute path of the directory holding the
        ``*.mol2`` files, type: string
    :return: mapping of neighbouring element symbol to the number of entries
        in which that element neighbours a matching atom, type: dict
    """
    # Rebuild self.entry_reader from the first entry of every mol2 file.
    mol2_paths = glob.glob(os.path.join(path_mol2_file_dir, '*.mol2'))
    entries = [io.EntryReader(mol2_path)[0] for mol2_path in mol2_paths]
    self.entry_reader = entries.copy()

    # Substructure query consisting of just the requested atom (Na/Mg/.../Sr).
    metal_query = search.QuerySubstructure()
    metal_query.add_atom(QueryAtom(query_atom))

    # Ligating element symbol -> number of entries it occurs in.
    ligating_atom_counts = dict()
    pbar = tqdm(self.entry_reader)
    for entry in pbar:
        # Distinct ligating elements of this entry; a set so that an element
        # is counted at most once per entry.
        ligating_elements = set()
        molecule = entry.crystal.molecule
        for atom in molecule.atoms:
            if not metal_query.match_atom(atom):
                continue
            # An atom without neighbours simply contributes nothing.
            for neighbour in atom.neighbours:
                ligating_elements.add(neighbour.atomic_symbol)

        # dict.get replaces the former try/except BaseException, which was
        # only meant to detect a missing key but would also have swallowed
        # KeyboardInterrupt and SystemExit.
        for element in ligating_elements:
            ligating_atom_counts[element] = ligating_atom_counts.get(element, 0) + 1
        pbar.set_description('正在统计配位原子:')
    return ligating_atom_counts
def get_smile_from_CSD(dataframe):
    """Add a ``smile`` column holding the CSD SMILES of every row.

    The first column of ``dataframe`` is read as the CSD identifier. Each
    identifier and the SMILES found for it are printed as they are processed.

    The SMILES are written back by position, so the result is correct for any
    index. The previous version read with ``iloc[i, 0]`` (position) but wrote
    with ``at[i, "smile"]`` (label), which targets the wrong rows, or appends
    new ones, whenever the index is not ``0..n-1`` (for example after
    filtering).

    :param dataframe: pandas DataFrame whose first column holds CSD
        identifiers; modified in place
    :return: the same DataFrame object, with the ``smile`` column set; a
        frame without rows is returned untouched
    """
    row_count = dataframe.shape[0]
    from ccdc import io
    csd_reader = io.EntryReader('CSD')
    if row_count == 0:
        return dataframe

    smiles = []
    for filename in dataframe.iloc[:, 0]:
        print(filename)
        smile = csd_reader.molecule(filename).smiles
        print(smile)
        smiles.append(smile)

    # Assigning a plain list aligns by position, independent of the index.
    dataframe["smile"] = smiles
    return dataframe
def __get_crystal(self):
    """Return CSD entries, optionally limited to the first N.

    Returns:
        The ``EntryReader`` itself when ``self.crystal_number`` is ``None``;
        otherwise a list with at most ``self.crystal_number`` entries, taken
        from the start of the reader.

    Note:
        The previous loop tested ``count > crystal_number`` after appending
        and therefore returned one entry more than requested (and one entry
        when zero was requested).
    """
    entry_reader = io.EntryReader('CSD')
    limit = self.crystal_number
    if limit is None:
        return entry_reader

    selected_entries = []
    if limit <= 0:
        return selected_entries
    for entry in entry_reader:
        selected_entries.append(entry)
        # Stop as soon as the quota is filled so no extra entry is read.
        if len(selected_entries) >= limit:
            break
    return selected_entries
def main():
    """Pickle the chemical formula of every structure in the reference set.

    Loads the reference dictionary from a fixed path, calls
    ``get_chemical_formula`` for each of its keys with a shared CSD reader
    and writes the resulting ``{database_id: formula}`` dictionary to
    ``<timestamp>-get_chemical_formulas.pkl`` in the working directory.
    """
    # Alternative input kept from the original author (unused):
    # oxidation_parse_dict = load_pickle(
    #     "/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/oxidation_state_book/data/20190820-173457-csd_ox_parse_output.pkl"
    # )
    reference_path = '/home/kevin/Dropbox (LSMO)/proj62_guess_oxidation_states/mine_csd/20190921-142007-csd_ox_parse_output_reference.pkl'
    oxidation_reference_dict = load_pickle(reference_path)
    database_ids = list(oxidation_reference_dict.keys())

    csd_reader = io.EntryReader('CSD')
    formula_dicts = {
        database_id: get_chemical_formula(csd_reader, database_id)
        for database_id in database_ids
    }

    # The timestamp is taken after the lookups, i.e. it marks when the file
    # is written rather than when the run started.
    timestr = time.strftime('%Y%m%d-%H%M%S')
    output_name = '-'.join((timestr, 'get_chemical_formulas'))
    with open(output_name + '.pkl', 'wb') as filehandle:
        pickle.dump(formula_dicts, filehandle)
def __init__(self):
    """Initialise an empty search: no hits, no query, no filters.

    Opens an ``EntryReader`` on the CSD and binds the module-level helpers
    ``unitCellQuery``, ``searchCellVals`` and ``query`` to private
    attributes.
    """
    # --- search results and query configuration ---
    self.searchHits = []
    self.overallQuery = None

    # NOTE(review): this stores the Crystal class, not an instance, so the
    # lattice_centring assignment below sets an attribute on the class
    # itself -- confirm that this is intended.
    self.searchCrystal = crystal.Crystal
    self.entryReader = io.EntryReader('CSD')
    # TODO: tmp just set default centring
    default_centring = ChemistryLib.Spacegroup_centring_text().text(1)
    self.searchCrystal.lattice_centring = default_centring

    # --- lattice parameters and their tolerances ---
    # (the original notes hint at crystal.CellAngles() / crystal.CellLengths())
    self.cellAngles = self.cellLengths = None
    self.angTol = self.lengthTol = None

    # --- filtering applied after the search ---
    self.refcode = self.ccdcNumber = None
    self.atomicElements = []

    # --- functional assignment ---
    self._unitCellQuery = unitCellQuery
    self._searchCellVals = searchCellVals
    #self.cifQueryPath = cellLib.extractCrystalCif
    self._strDetails = query

    # --- filtering that can be performed on the hits ---
    self.filterOnSpacegroup = None
from ccdc import io import numpy as np import pandas as pd def activity(identifier): if "ZINC" in identifier: return 0 else: return 1 entries = io.EntryReader("hotspot_scored.sdf") idents = [entry.identifier for entry in entries] act = [activity(entry.identifier) for entry in entries] hs1 = [ float( np.sum([ float(entry.attributes["acceptor"]), float(entry.attributes["donor"]), float(entry.attributes["apolar"]) ])) for entry in entries ] # hs2 = [entry.attributes["hs2_total"] for entry in entries] hs2 = [ float( np.sum([ float(entry.attributes["hs2_acceptor"]),
def _first_or_container(values, index=0):
    """Return ``values[index]``, or ``values`` itself when it is empty."""
    return values[index] if len(values) > 0 else values


def get_bibliometric_information(cif_path, ):  # pylint:disable=too-many-locals, too-many-statements, too-many-branches
    """Collect CSD and publication metadata for one structure.

    Assumes that the filename is the CSD-key. Assumes you are in the EPFL VPN.
    Uses my scopus API key.

    The CSD entry named by the upper-cased file stem is read first. If it
    carries a paper DOI, that DOI is looked up with ``Works().doi`` and, when
    no abstract was found that way, with ``scopus``. Any abstract is then
    scanned for UV/vis, photo/luminescence and electronic keywords.

    Failures are best effort: any exception is logged and whatever was
    gathered up to that point is still returned.

    Args:
        cif_path: path of a CIF file whose stem is the CSD identifier.

    Returns:
        dict: one value per metadata field; fields that could not be
        determined are ``None``. ``year`` and ``remarks`` are never filled in
        by this function. The three ``*_regex_result`` values are ``None``
        when the CSD entry could not be read and a ``bool`` otherwise.
    """
    stem = Path(cif_path).stem.upper()

    deposition_number = None
    title = None
    citations = None
    url = None
    abstract = None
    funding = None
    doi_ccsd = None
    doi_paper = None
    pages = None
    journal = None
    affilations = None
    year = None
    disorder_csd = None
    authors = None
    uv_regex_result = None
    photo_regex_result = None
    electronic_regex_result = None
    csd_remarks = None
    csd_has_disorder = None
    chemical_name = None
    formula = None

    try:  # pylint:disable=too-many-nested-blocks
        csd_reader = io.EntryReader('CSD')
        entry = csd_reader.entry(stem)
        disorder_csd = entry.disorder_details
        doi_paper = entry.publication.doi
        csd_has_disorder = entry.has_disorder
        doi_ccsd = entry.doi
        deposition_number = entry.ccdc_number
        chemical_name = entry.chemical_name
        formula = entry.formula

        uv_regex = re.compile('|'.join(['uv', 'uv-vis', 'vis']), re.IGNORECASE)
        photo_regex = re.compile('|'.join(['photo', 'absorp', 'light', 'lumin']),
                                 re.IGNORECASE)
        electronic_regex = re.compile('|'.join(['electronic', 'conduc']),
                                      re.IGNORECASE)
        # From here on the flags are real booleans: False means "entry was
        # read but no keyword matched", None means "entry could not be read".
        uv_regex_result = False
        photo_regex_result = False
        electronic_regex_result = False

        if doi_paper:
            logger.info('found DOI %s', doi_paper)
            query_res = Works().doi(doi_paper)
            if query_res:
                if 'title' in query_res:
                    title = _first_or_container(query_res['title'], -1)
                if 'is-referenced-by-count' in query_res:
                    citations = query_res['is-referenced-by-count']
                if 'link' in query_res:
                    if len(query_res['link']) > 0:
                        url = query_res['link'][0]['URL']
                    else:
                        url = query_res['link']
                if 'abstract' in query_res:
                    abstract = query_res['abstract']
                if 'funder' in query_res:
                    funding = [funder['name'] for funder in query_res['funder']]
                else:
                    funding = None
                if 'page' in query_res:
                    pages = query_res['page']
                if 'container-title' in query_res:
                    journal = _first_or_container(query_res['container-title'])
                if 'author' in query_res:
                    authors = [author['family'] for author in query_res['author']]
                    # Previously collected into a local list that was never
                    # stored, leaving 'affilations' permanently None.
                    affilations = [author['affiliation']
                                   for author in query_res['author']]

            # NOTE(review): the nesting level of this fallback could not be
            # recovered from the source layout; it is applied whenever a DOI
            # exists and no abstract has been found yet -- confirm.
            if abstract is None:
                search_result = scopus.search(doi_paper)
                if len(search_result) > 0:
                    try:
                        abstract = scopus.retrieve_abstract(
                            search_result['scopus_id'].values[0])['abstract']
                    except Exception:  # pylint:disable=broad-except
                        abstract = None

            if abstract:
                electronic_regex_result = bool(electronic_regex.search(abstract))
                uv_regex_result = bool(uv_regex.search(abstract))
                photo_regex_result = bool(photo_regex.search(abstract))
    except Exception:  # pylint:disable=broad-except
        # Deliberately best effort; keep the traceback so the cause (CSD,
        # Crossref or Scopus) can be diagnosed from the log.
        logger.info('Could not retrieve CSD info', exc_info=True)

    # Built once for both the success and the failure path; the two copies
    # that used to live in the except/else branches were identical.
    result_dict = {
        'deposition_number': deposition_number,
        'title': title,
        'csd_abbrv': stem,
        'citations': citations,
        'url': url,
        'abstract': abstract,
        'funding': funding,
        'doi_ccsd': doi_ccsd,
        'doi_paper': doi_paper,
        'formula': formula,
        'pages': pages,
        'journal': journal,
        'affilations': affilations,
        'year': year,
        'disorder_csd': disorder_csd,
        'authors': authors,
        'remarks': csd_remarks,
        'csd_has_disorder': csd_has_disorder,
        'chemical_name': chemical_name,
        'uv_regex_result': uv_regex_result,
        'photo_regex_result': photo_regex_result,
        'electronic_regex_result': electronic_regex_result,
    }
    return result_dict
def csd():
    """Return the module-wide ``'csd'`` entry reader, opening it on first use.

    The reader is kept in the module-level ``_csd`` variable, so later calls
    return the same object.
    """
    global _csd
    if _csd is not None:
        return _csd
    _csd = io.EntryReader('csd')
    return _csd
# Example script: detect pharmacophore features for the CSD entry IBPRAC and
# show them through the model's PyMOL visualisation method.
from hotspots.pharmacophore_extension import LigandPharmacophoreModel
from ccdc import io

# Fetch the crystal of refcode IBPRAC from the CSD.
csd = io.EntryReader('CSD')
crystal = csd.crystal('IBPRAC')
# NOTE(review): hydrogens are added to the object returned by
# crystal.molecule, while the crystal itself is what gets passed on below --
# confirm that the added hydrogens are visible through `crystal`.
crystal.molecule.add_hydrogens()

# Restrict detection to three projected feature types.
ligand_pharmacophore = LigandPharmacophoreModel()
ligand_pharmacophore.feature_definitions = [
    "acceptor_projected", "donor_projected", "ring_planar_projected"
]
ligand_pharmacophore.detect_from_ligand(crystal)
ligand_pharmacophore.pymol_visulisation()
# Walk-through of basic CSD reader access. Written so that it runs on both
# Python 2 and Python 3: print is always called with a single parenthesised
# argument, and the format names are sorted with sorted() because
# dict.keys() has no .sort() on Python 3.
import pandas as pd
print(pd.__file__)
from ccdc import io

csd_reader = io.EntryReader('CSD')

# The same refcode accessed as entry, crystal and molecule.
entry_abebuf = csd_reader.entry('ABEBUF')
cryst_abebuf = csd_reader.crystal('ABEBUF')
mol_abebuf = csd_reader.molecule('ABEBUF')
print(round(mol_abebuf.molecular_weight, 3))

# List every file format known to MoleculeReader, alphabetically.
reader_formats = sorted(io.MoleculeReader.known_formats.keys())
for fmt in reader_formats:  # `fmt`, so the builtin format() is not shadowed
    print(fmt)

# Positional access into the reader.
first_molecule = csd_reader[0]
print(first_molecule.identifier)

# Atom count and formula of ABABUB.
ababub = csd_reader.entry('ABABUB')
mol = ababub.molecule
print(len(mol.atoms))
print(mol.formula)

# Identifiers of the first eleven items of the reader.
for i in range(11):
    mol = csd_reader[i]
    print(mol.identifier)

mol = csd_reader.molecule('ABEBUF')
size = len(mol.atoms)
#CCDC imports from ccdc import io, search, molecule from ccdc import crystal from ccdc.io import MoleculeReader from ccdc.io import EntryReader from ccdc._lib import ChemistryLib from ccdc.search import ReducedCellSearch from ccdc.search import SimilaritySearch from ccdc.search import TextNumericSearch from ccdc.diagram import DiagramGenerator fileName = os.path.basename(sys.argv[0]) fileName = fileName[:-3] #Configuration CSD entryReader = io.EntryReader('CSD') #-- LOGGER CONFIGURATION --# logger = logging.getLogger(fileName) logger.setLevel(logging.DEBUG) # create console handler with a higher log level ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) # create file handler which logs even debug messages fh = logging.FileHandler('/tmp/spamcellsearcher.log') fh.setLevel(logging.DEBUG) # create formatter and add it to the handlers formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s') fh.setFormatter(formatter) ch.setFormatter(formatter) # add the handlers to the logger
import glob,os,sys,time sys.path.append("../../affinityDB/") import database from ccdc import io import ccdc db_root = "/home/maksym/Projects/datasets/CSD1/" pdb_folder = "pdbs" afdb_file = "CSD1.db" afdb = database.AffinityDB(os.path.join(db_root,afdb_file)) out_q,stop_event = afdb.open_table_with_queue(table_name="some_table", col_names=["ccdc_id","filename","SMILES"], col_types=[str,str,str]) # Creating a CSD entry reader csd_entry_reader = io.EntryReader('CSD') # Create a CSD entry reader including any updates directory = ccdc.io.csd_directory() csd_and_updates = glob.glob(os.path.join(directory, '*.inf')) csd_and_updates_reader = io.EntryReader(csd_and_updates) #for i in range(1000): # out_q.put(["srandom text"]) exceptions = [] i = 0 for mol in csd_entry_reader.molecules(): start = time.time() try: