def process(molid, smiles):
    """Write one tab-separated line of circular-substructure features.

    The output line is ``molid<TAB>feature<TAB>feature...``, where each
    feature is ``"<cansmiles> <count>"`` and, when ``write_centers`` is
    truthy, is followed by the space-separated ``center radius`` pairs at
    which that substructure was found.

    Passing the ``_END_MOLID`` sentinel as ``molid`` closes ``writer`` and
    returns without writing anything.

    If anything goes wrong while processing the molecule, the failure is
    reported through ``info`` and a ``molid<TAB>*FAILED*`` line is written
    instead.

    :param molid: identifier of the molecule, or the ``_END_MOLID`` sentinel.
    :param smiles: SMILES string handed to ``to_rdkit_mol``.
    """
    if molid is _END_MOLID:
        writer.close()
        return
    try:
        mol = to_rdkit_mol(smiles)
        fpsinfo = {}
        # N.B. We won't actually use rdkit hash, so we won't ask for nonzero values...
        # Is there a way of asking rdkit to give us this directly?
        # The call is made only for its side effect of filling `fpsinfo`.
        AllChem.GetMorganFingerprint(mol, max_radius, bitInfo=fpsinfo, useFeatures=fcfp)
        counts = defaultdict(int)
        centers = defaultdict(list)
        for bit_descs in fpsinfo.values():
            for center, radius in bit_descs:
                cansmiles = explain_circular_substructure(mol, center, radius)
                counts[cansmiles] += 1
                centers[cansmiles].append((center, radius))
        # `.items()` works on both Python 2 and 3; `.iteritems()` raises
        # AttributeError on Python 3, which used to be swallowed below and
        # turned every molecule into a *FAILED* line.
        if write_centers:
            features_strings = [
                '%s %d %s' % (cansmiles, count,
                              ' '.join(['%d %d' % (c, r) for c, r in centers[cansmiles]]))
                for cansmiles, count in counts.items()
            ]
        else:
            features_strings = ['%s %d' % (cansmiles, count)
                                for cansmiles, count in counts.items()]
        writer.write('%s\t%s\n' % (molid, '\t'.join(features_strings)))
    except Exception:
        # Best effort: record the failure and move on. `Exception` (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        info('Failed molecule %s: %s' % (molid, smiles))
        writer.write('%s\t*FAILED*\n' % molid)
def process(molid, smiles):
    """Append one molecule's id and descriptor row to the open store.

    Passing the ``_END_MOLID`` sentinel as ``molid`` closes ``h5`` and
    returns without storing anything.

    Otherwise ``molids`` and ``descs`` are each grown by one entry, the id
    is stored, and the row is filled with ``computer.compute(mol)[0]``. If
    building the molecule or computing its descriptors fails, the failure
    is reported through ``info`` and the row is filled with ``nf`` NaNs.

    :param molid: identifier of the molecule, or the ``_END_MOLID`` sentinel.
    :param smiles: SMILES string handed to ``to_rdkit_mol``.
    """
    if molid is _END_MOLID:
        h5.close()
        return
    ne = len(molids)
    # Grow both containers *before* the fallible work so that row `ne`
    # exists when the except branch writes the NaN placeholder. Previously
    # `descs` was resized only after `to_rdkit_mol` succeeded, so a parse
    # failure made the fallback assignment target a row that was not there.
    # NOTE(review): presumably these are resizable h5py datasets - confirm.
    molids.resize((ne + 1,))
    molids[ne] = molid
    descs.resize((ne + 1, nf))
    try:
        mol = to_rdkit_mol(smiles)
        descs[ne, :] = computer.compute(mol)[0]
    except Exception:
        # Best effort: keep ids and rows aligned by storing an all-NaN row.
        # `Exception` lets KeyboardInterrupt/SystemExit propagate.
        info('Failed molecule %s: %s' % (molid, smiles))
        descs[ne, :] = [np.nan] * nf
def save_from_smiles_iterator(self, it):
    """Build the catalog files from an iterator of (molid, smiles) pairs.

    Any existing catalog files are overwritten. Three files are produced:

    - ``<root>/molsdata``: the ``ToBinary()`` payloads of every molecule
      that could be built, concatenated in iteration order;
    - the molids file: one molecule id per line, in iteration order;
    - the coords file: a numpy array with one ``(offset, length)`` pair per
      molecule locating its payload inside ``molsdata``; molecules for
      which ``to_rdkit_mol`` returned ``None`` get ``(-1, 0)``.

    :param it: iterable yielding ``(molid, smiles)`` pairs.
    """
    seen_ids = []
    spans = []
    offset = 0
    blob_path = op.join(self._root, 'molsdata')
    with open(blob_path, 'wb') as blob:
        for molid, smiles in it:
            mol = to_rdkit_mol(smiles, molid=molid)
            seen_ids.append(molid)
            if mol is None:
                # Placeholder span: nothing is stored for this molecule.
                spans.append((-1, 0))
                continue
            payload = mol.ToBinary()
            size = len(payload)
            spans.append((offset, size))
            offset += size
            blob.write(payload)
    with open(self._molids_file, 'wt') as ids_out:
        ids_out.writelines(molid + '\n' for molid in seen_ids)
    np.save(self._coords_file, np.array(spans))