def add_mol(self, molid, rdkmol):
    """Append the unfolded fingerprint of one molecule as a new sparse-matrix row.

    Serial, memory-conservative fingerprinter for building a fingerprints
    sparse matrix incrementally.

    The fingerprint is computed with the settings stored on the instance.
    Each feature key (a SMILES-like string, going by the loop variable name)
    is mapped to a column index through ``self._s2i``; unseen keys get the
    next free index. The (row, column, count) triplets are appended to
    ``self._rows``, ``self._cols`` and ``self._vals``, with the columns of
    the new row in ascending order, and ``molid`` is appended to
    ``self._molids``.

    If the fingerprint cannot be computed, a warning is emitted, ``molid``
    is recorded in ``self._failed_moldids`` and no other state is touched.

    :param molid: identifier of the molecule, stored alongside its row.
    :param rdkmol: molecule object handed to ``unfolded_morgan_fingerprint``
        (presumably an RDKit molecule -- confirm against callers).
    """
    # Only the fingerprint computation is best-effort. The bookkeeping below
    # is kept out of the try block so that a failure can never leave the
    # row/column/value lists with different lengths.
    try:
        # Amazing DRY violation
        counts, _ = unfolded_morgan_fingerprint(rdkmol,
                                                max_radius=self.max_radius,
                                                fcfp=self.fcfp,
                                                use_hs=self.use_hs,
                                                canonical=self.canonical,
                                                isomeric=self.isomeric,
                                                kekule=self.kekule,
                                                all_bonds_explicit=self.all_bonds_explicit)
        # Materialize inside the try so a malformed result is still reported
        # as a failed molecule. .items() works on both Python 2 and 3.
        features = list(counts.items())
    except Exception:
        warning('Could not compute unfolded fingerprint for molecule %s' % molid)
        self._failed_moldids.append(molid)
        return

    row = len(self._molids)  # index of the row about to be added
    cols_vals = []
    for smiles, count in features:
        if smiles not in self._s2i:
            # First time this feature is seen: give it the next column.
            self._s2i[smiles] = len(self._s2i)
        cols_vals.append((self._s2i[smiles], count))
    # Column indices are unique within a row, so this orders by column only.
    cols_vals.sort()

    self._rows.extend([row] * len(cols_vals))
    self._cols.extend(col for col, _ in cols_vals)
    self._vals.extend(val for _, val in cols_vals)
    self._molids.append(molid)
def to_rdkit_mol(smiles, molid=None, sanitize=True, to2D=False, to3D=False, toPropertyMol=False):
    """Build an RDKit molecule from a smiles string.

    :param smiles: the smiles string to parse.
    :param molid: optional molecule identifier, only used in the warning
        emitted when parsing fails.
    :param sanitize: forwarded to ``Chem.MolFromSmiles``.
    :param to2D: compute 2D coordinates (ignored when ``to3D`` is set).
    :param to3D: embed the molecule and run a UFF optimization on it; takes
        precedence over ``to2D``.
    :param toPropertyMol: wrap the result in a ``PropertyMol``.
    :return: the molecule, or None (after emitting a warning) when RDKit
        cannot create it.
    """
    # TODO: allow other formats, like sdf or inchi
    parsed = Chem.MolFromSmiles(smiles, sanitize=sanitize)

    if parsed is not None:
        # Coordinates are optional; 3D wins over 2D when both are requested.
        if to3D:
            AllChem.EmbedMolecule(parsed)
            AllChem.UFFOptimizeMolecule(parsed)
        elif to2D:
            AllChem.Compute2DCoords(parsed)
        return PropertyMol(parsed) if toPropertyMol else parsed

    # Parsing failed: report it, naming the molecule when an id is available.
    if molid is not None:
        message = 'RDKit cannot create molecule %s from smiles %s' % (molid, smiles)
    else:
        message = 'RDKit cannot create a molecule from smiles %s' % smiles
    warning(message)
    return None
def iterate_records_in_text_file(
    filename, record_terminator="$$$$", terminator_only_after_empty_line=True, verbose=True
):
    """Iterate over all molecule records in the given sdf file.

    Lines are accumulated until one that, once stripped, equals
    ``record_terminator``. The accumulated lines (terminator included) are
    then joined, stripped and yielded as a single string.

    NB: the last record is assumed to end with the terminator as well; any
    text after the final terminator is silently dropped.

    :param filename: either a path or an iterator over text lines. Paths
        ending in ".zip" or ".gz" are treated as compressed (detection is by
        extension only); for a zip archive the first member is read. An
        iterator supplied by the caller is consumed but never closed here.
    :param record_terminator: stripped line content that ends a record.
    :param terminator_only_after_empty_line: when true, a terminator line
        only ends a record if the line before it is blank ("\\n" or
        "\\r\\n"); otherwise it is kept as ordinary record content.
    :param verbose: emit a warning for every terminator line that is ignored
        because it is not preceded by a blank line.
    :raises ValueError: if a ".zip" archive contains no members.
    """
    import io  # function-scope import: the file's import block is not visible from here

    def as_text(stream):
        # On Python 2 binary streams already yield native str lines; on
        # Python 3 they yield bytes and are wrapped to behave like open(path, "r").
        return stream if str is bytes else io.TextIOWrapper(stream)

    owned = []  # handles opened here; closed in reverse order when the generator ends
    try:
        # Let's detect compressed files just using the extension
        if hasattr(filename, "next") or hasattr(filename, "__next__"):
            lines = filename
        elif filename.endswith(".zip"):
            archive = zipfile.ZipFile(filename, "r")
            owned.append(archive)
            members = archive.namelist()
            if not members:
                raise ValueError("Zip archive %s contains no files" % filename)
            # A ZipFile is not a line iterator: read its first member instead.
            lines = as_text(archive.open(members[0], "r"))
            owned.append(lines)
        elif filename.endswith(".gz"):
            lines = as_text(gzip.GzipFile(filename, "r"))
            owned.append(lines)
        else:
            lines = open(filename, "r")
            owned.append(lines)

        partial_record = []
        # A for loop ends cleanly at end of input; calling next() by hand would
        # leak StopIteration out of the generator (a RuntimeError under PEP 479).
        for line_num, line in enumerate(lines, 1):
            partial_record.append(line)
            if line.strip() != record_terminator:
                continue
            if terminator_only_after_empty_line:
                # Accept both "\n" and "\r\n" as a blank line.
                if len(partial_record) < 2 or partial_record[-2].rstrip("\r\n"):
                    if verbose:
                        warning(
                            "Warning: the terminator %s in line %d is not preceded by a blank line\n"
                            % (partial_record[-1].strip(), line_num)
                        )
                    continue  # Do not yield yet, as the terminator is not preceded by an empty line
            full_record = partial_record
            partial_record = []
            yield "".join(full_record).strip()
    finally:
        # Runs on exhaustion, on error and when the generator is closed early.
        for handle in reversed(owned):
            handle.close()