def split_strip(inchi, strip=True):
    """Split an InChI into the InChIs of its disconnected fragments.

    The InChI is parsed with RDKit, broken into molecular fragments and
    each fragment is converted back to an InChI string.

    NOTE(review): this function reads ``self.conn_split_inactive_inchis``
    although ``self`` is not a parameter; presumably it is a nested helper
    of a method that starts outside the visible part of the file -- confirm.

    Args:
        inchi: InChI string to split.
        strip: when true, fragments whose connectivity layer is listed in
            ``self.conn_split_inactive_inchis`` are left out of the result.

    Returns:
        A set with one InChI string per retained fragment.
    """
    parent = rdkit.Chem.MolFromInchi(inchi)
    kept = set()
    for fragment in rdkit.Chem.rdmolops.GetMolFrags(parent, asMols=True):
        fragment_inchi = rdkit.Chem.MolToInchi(fragment)
        # The connectivity layer is only computed when stripping is requested.
        if strip and (ic.inchi_conn_layer(fragment_inchi)
                      in self.conn_split_inactive_inchis):
            continue
        kept.add(fragment_inchi)
    return kept
def query_inchi_conn(self, inchi, strip=True):
    """Look up everything indexed under the connectivity layer of `inchi`.

    Both ``self.inchi_connectivity_index`` (whole-compound InChIs) and
    ``self.inchi_split_connectivity_index`` (per-fragment InChIs) are
    consulted and their hits are merged.

    Args:
        inchi: InChI string to query with.
        strip: when true, the InChI is first passed through
            ``ic.strip_inchi`` with ``self.conn_split_inactive_inchis`` as
            the exclusion list. If stripping fails, the unstripped InChI is
            used instead.

    Returns:
        A new set holding the union of the entries found in both indexes;
        empty if the connectivity layer is not indexed.
    """
    if strip:
        try:
            inchi = ic.strip_inchi(
                inchi, exclude_inchis=self.conn_split_inactive_inchis)
        except Exception:
            # Best effort: fall back to querying with the unstripped InChI.
            # (Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.)
            pass

    inchi_conn = ic.inchi_conn_layer(inchi)

    hits = set()
    for index in (self.inchi_connectivity_index,
                  self.inchi_split_connectivity_index):
        # ``.get`` never inserts a key, so the indexes are left untouched.
        hits.update(index.get(inchi_conn, ()))
    return hits
def inchi_overlap(
        self, inchi1, inchi2, strip=True, consistency=True,
        filter_layers={'h', 'f', 'p', 'q', 'i', 't', 'b', 'm', 's'}):
    """Compare two InChIs fragment by fragment.

    Each input is either a single InChI string, which is split into its
    disconnected fragments with RDKit, or an already split collection of
    fragment InChIs (``list``, ``tuple``, ``set`` or ``frozenset``), which
    is used as given. Fragments sharing a connectivity layer are candidate
    pairs; each candidate is then confirmed either with
    ``ic.compare_consistent`` or by plain string equality.

    Args:
        inchi1: InChI string or collection of fragment InChIs.
        inchi2: InChI string or collection of fragment InChIs.
        strip: when splitting an InChI string, drop fragments whose
            connectivity layer is in ``self.conn_split_inactive_inchis``.
            Has no effect on inputs that are already collections.
        consistency: if true, a candidate pair matches when the first
            element returned by ``ic.compare_consistent`` is truthy;
            otherwise the two InChIs must be equal.
        filter_layers: forwarded unchanged to ``ic.compare_consistent``.
            NOTE(review): the default is a single shared set object; it is
            only forwarded here, but it must not be mutated downstream.

    Returns:
        A 3-tuple ``(rest1, matches, rest2)``: the fragments of `inchi1`
        that took part in no match, the set of matched ``(i1, i2)`` pairs,
        and the fragments of `inchi2` that took part in no match.
    """

    def split_strip(inchi):
        # Split one InChI string into its (optionally filtered) fragments.
        mol = rdkit.Chem.MolFromInchi(inchi)
        fragments = set()
        for frag in rdkit.Chem.rdmolops.GetMolFrags(mol, asMols=True):
            frag_inchi = rdkit.Chem.MolToInchi(frag)
            if strip and (ic.inchi_conn_layer(frag_inchi)
                          in self.conn_split_inactive_inchis):
                continue
            fragments.add(frag_inchi)
        return fragments

    def as_fragments(inchi):
        # Pre-split collections are copied as-is; strings are split.
        if isinstance(inchi, (list, tuple, set, frozenset)):
            return set(inchi)
        return split_strip(inchi)

    def group_by_conn(fragments):
        # Map connectivity layer -> fragment InChIs that share it.
        grouped = defaultdict(set)
        for frag_inchi in fragments:
            grouped[ic.inchi_conn_layer(frag_inchi)].add(frag_inchi)
        return grouped

    split_inchis1 = as_fragments(inchi1)
    split_inchis2 = as_fragments(inchi2)

    conn_split_inchis1 = group_by_conn(split_inchis1)
    conn_split_inchis2 = group_by_conn(split_inchis2)

    # Fragments with an identical connectivity layer are candidate matches.
    candidates = set()
    for conn in conn_split_inchis1.keys() & conn_split_inchis2.keys():
        candidates.update(
            it.product(conn_split_inchis1[conn], conn_split_inchis2[conn]))

    # Confirm the candidates by checking consistency (or exact equality).
    matches = set()
    for i1, i2 in candidates:
        if consistency:
            consistent, _ = ic.compare_consistent(
                i1, i2, filter_layers=filter_layers)
            if consistent:
                matches.add((i1, i2))
        elif i1 == i2:
            matches.add((i1, i2))

    # Anything that appears on either side of a match counts as overlap.
    overlap = {i1 for i1, _ in matches} | {i2 for _, i2 in matches}

    # remainder from inchi1, paired matches, remainder from inchi2
    return split_inchis1 - overlap, matches, split_inchis2 - overlap
def load_inactive_compounds(self, data_dir=None):
    """Load the lists of excluded / inactive compounds from `data_dir`.

    Reads four files from the directory:

    * ``exclude_inchis.json`` -> ``self.exclude_inchis`` (set)
    * ``split_inactive_inchis.json`` -> ``self.split_inactive_inchis`` (set)
    * ``salts.smi`` and ``solvents.smi``: tab-separated lines whose first
      field is taken as the name and whose second field as the SMILES.
      Each SMILES is converted to an InChI with RDKit and added to
      ``self.split_inactive_inchis``.

    Finally ``self.conn_split_inactive_inchis`` is set to the connectivity
    layers (``ic.inchi_conn_layer``) of all split-inactive InChIs.

    Args:
        data_dir: directory holding the files; defaults to
            ``self.data_dir`` when ``None``.
    """
    columns = ('name', 'smiles')

    def read_smi(path):
        # Parse a .smi file into a list of {'name': ..., 'smiles': ...}.
        # ``zip`` ignores fields beyond the second one, so a line with an
        # extra tab no longer aborts the load with an IndexError; short
        # lines simply yield a record without the missing key.
        with open(path, 'rt') as f:
            return [
                dict(zip(
                    columns,
                    (field.strip('\n\r\t ') for field in line.split('\t')),
                ))
                for line in f
            ]

    def smiles_to_inchis(records):
        # Convert the SMILES of each record to an InChI (best effort).
        inchis = set()
        for record in records:
            try:
                mol = rdkit.Chem.MolFromSmiles(record['smiles'])
                inchis.add(rdkit.Chem.MolToInchi(mol))
            except Exception:
                # Records without a SMILES field or that RDKit cannot
                # convert are deliberately skipped.
                continue
        return inchis

    if data_dir is None:
        data_dir = self.data_dir

    # load inactive data
    with open(f"{data_dir}/exclude_inchis.json", 'rt') as f:
        self.exclude_inchis = set(json.load(f))
    with open(f"{data_dir}/split_inactive_inchis.json", 'rt') as f:
        self.split_inactive_inchis = set(json.load(f))

    # Both files are read before any conversion starts.
    salts = read_smi(f"{data_dir}/salts.smi")
    solvents = read_smi(f"{data_dir}/solvents.smi")

    self.split_inactive_inchis.update(smiles_to_inchis(salts))
    self.split_inactive_inchis.update(smiles_to_inchis(solvents))

    self.conn_split_inactive_inchis = {
        ic.inchi_conn_layer(i) for i in self.split_inactive_inchis
    }
def fetch_data(self, data_dir=None):
    """Fetch compound structures from the database and build InChI indexes.

    Steps:

    1. Load the inactive-compound lists (``load_inactive_compounds``).
    2. Read InChIs from ``CHEMBL.COMPOUND_STRUCTURES`` and
       ``DRUGBASE.MOLECULE_STRUCTURE`` into ``self.compound_inchis``,
       keyed by the identifier returned by
       ``self.chembl_index.get_chembl_ident``. Drugbase rows are read
       second and overwrite an existing entry with the same key.
    3. Build ``self.inchi_index`` (InChI -> identifiers).
    4. Build ``self.split_inchi_index`` (fragment InChI -> identifiers),
       skipping fragments whose connectivity layer is in
       ``self.conn_split_inactive_inchis``.
    5. Build ``self.inchi_connectivity_index`` and
       ``self.inchi_split_connectivity_index`` (connectivity layer ->
       identifiers).

    Args:
        data_dir: directory with the inactive-compound files; defaults to
            ``self.data_dir`` when ``None``.
    """
    if data_dir is None:
        data_dir = self.data_dir
    self.load_inactive_compounds(data_dir=data_dir)

    chembl_cursor = self.chembl_db.cursor()
    try:
        # fetch structures
        self.compound_inchis = {}
        for molregno, inchi in tqdm(chembl_cursor.execute(
                "select MOLREGNO, STANDARD_INCHI from CHEMBL.COMPOUND_STRUCTURES"
        ), leave=True, position=0, desc='ChEMBL structures'):
            if inchi is None:
                continue
            ci = self.chembl_index.get_chembl_ident(molregno=molregno)
            self.compound_inchis[ci] = inchi

        structure_id2drugbase_id = {}
        for drugbase_id, _molregno, structure_id in tqdm(chembl_cursor.execute(
                'select ID, MOLREGNO, MOLECULE_STRUCTURE_ID from DRUGBASE.MOLECULE_DICTIONARY'
        ), leave=True, position=0, desc='Drugbase structure IDs'):
            ci = self.chembl_index.get_chembl_ident(drugbase_id=drugbase_id)
            structure_id2drugbase_id[structure_id] = ci

        for structure_id, inchi in tqdm(chembl_cursor.execute(
                "select MOLECULE_STRUCTURE_ID, INCHI from DRUGBASE.MOLECULE_STRUCTURE"
        ), leave=True, position=0, desc='Drugbase structures'):
            if inchi is None:
                continue
            # The column value exposes ``read()``; its result is stored.
            inchi = inchi.read()
            if inchi is None:
                continue
            if structure_id in structure_id2drugbase_id:
                ci = structure_id2drugbase_id[structure_id]
                self.compound_inchis[ci] = inchi
    finally:
        # Release the cursor even when a query or a row handler fails.
        chembl_cursor.close()

    self.inchi_index = defaultdict(set)
    for ci, inchi in self.compound_inchis.items():
        self.inchi_index[inchi].add(ci)

    # split InChIs describing multiple molecules
    self.split_inchi_index = defaultdict(set)
    for ci, inchi in tqdm(self.compound_inchis.items(), leave=True,
                          position=0, desc='Split InChIs'):
        mol = rdkit.Chem.MolFromInchi(inchi)
        if mol is None:
            continue
        try:
            for m in rdkit.Chem.rdmolops.GetMolFrags(mol, asMols=True):  # split
                i = rdkit.Chem.MolToInchi(m)
                c = ic.inchi_conn_layer(i)
                if c not in self.conn_split_inactive_inchis:
                    self.split_inchi_index[i].add(ci)  # add to index
        except Exception:
            # Best effort: a compound whose fragments cannot be processed
            # keeps whatever was indexed before the failure.
            pass
    self.split_inchi_index = dict(self.split_inchi_index)

    # extract connectivity layer of InChI and build index
    self.inchi_connectivity_index = defaultdict(set)
    for ci, inchi in tqdm(self.compound_inchis.items(), leave=True,
                          position=0, desc='Connectivity layer InChI index'):
        try:
            inchi = ic.strip_inchi(
                inchi, exclude_inchis=self.conn_split_inactive_inchis
            )  # filter inchi
        except Exception:
            # Stripping failed: index the unstripped InChI instead.
            pass
        if inchi:
            c = ic.inchi_conn_layer(inchi)
            self.inchi_connectivity_index[c].add(ci)
    self.inchi_connectivity_index = dict(self.inchi_connectivity_index)

    self.inchi_split_connectivity_index = defaultdict(set)
    for inchi, mols in tqdm(self.split_inchi_index.items(), leave=True,
                            position=0,
                            desc='Split connectivity layer InChI index'):
        c = ic.inchi_conn_layer(inchi)
        self.inchi_split_connectivity_index[c].update(mols)
    # remove inactive InChIs
    self.inchi_split_connectivity_index = {
        k: v
        for k, v in self.inchi_split_connectivity_index.items()
        if k not in self.conn_split_inactive_inchis
    }