Esempio n. 1
0
    def add_mol(self, molid, rdkmol):
        """Append the unfolded Morgan fingerprint of one molecule to the sparse-matrix buffers.

        Serial, memory-conservative fingerprinter for building a fingerprints
        sparse matrix incrementally: each call contributes one row.

        The fingerprint is computed by ``unfolded_morgan_fingerprint`` with this
        object's settings.  Every feature key it returns (called ``smiles`` here)
        is mapped to a column index through ``self._s2i``; unseen keys get the
        next free index.  The molecule's (row, column, count) entries are then
        appended to ``self._rows`` / ``self._cols`` / ``self._vals``, ordered by
        column, and ``molid`` is appended to ``self._molids``.

        Parameters:
          molid: identifier recorded for the molecule (row label).
          rdkmol: the molecule handed to ``unfolded_morgan_fingerprint``.

        Any exception is caught: a warning is emitted and ``molid`` is recorded
        in ``self._failed_moldids``.  The row/column/value buffers are only
        touched once all entries have been computed, so a failure cannot leave
        them with mismatched lengths.  (Feature keys registered in ``self._s2i``
        before a failure stay registered.)
        """

        try:
            # Amazing DRY violation
            counts, _ = unfolded_morgan_fingerprint(rdkmol,
                                                    max_radius=self.max_radius,
                                                    fcfp=self.fcfp,
                                                    use_hs=self.use_hs,
                                                    canonical=self.canonical,
                                                    isomeric=self.isomeric,
                                                    kekule=self.kekule,
                                                    all_bonds_explicit=self.all_bonds_explicit)

            # Stage this molecule's entries locally before committing anything.
            cols_vals = []
            for smiles, count in counts.items():
                if smiles not in self._s2i:
                    self._s2i[smiles] = len(self._s2i)
                cols_vals.append((self._s2i[smiles], count))
            cols_vals.sort()  # entries within a row are stored by increasing column

            # Commit: the new row index is the number of molecules stored so far.
            row = len(self._molids)
            self._rows.extend([row] * len(cols_vals))
            self._cols.extend([col for col, _ in cols_vals])
            self._vals.extend([val for _, val in cols_vals])
            self._molids.append(molid)
        except Exception:
            # Deliberate best-effort: a molecule that cannot be fingerprinted is
            # reported and remembered, and processing of other molecules goes on.
            warning('Could not compute unfolded fingerprint for molecule %s' % molid)
            self._failed_moldids.append(molid)
Esempio n. 2
0
def to_rdkit_mol(smiles, molid=None, sanitize=True, to2D=False, to3D=False, toPropertyMol=False):
    """Build an RDKit molecule from a SMILES string.

    Parameters:
      smiles: the SMILES string to parse.
      molid: optional identifier, used only in the warning emitted on failure.
      sanitize: forwarded to ``Chem.MolFromSmiles``.
      to2D: compute 2D coordinates (ignored when ``to3D`` is set).
      to3D: embed the molecule and optimize it with UFF; wins over ``to2D``.
      toPropertyMol: wrap the result in a ``PropertyMol`` before returning.

    Returns the molecule, or None (after emitting a warning) when RDKit cannot
    parse the SMILES.
    """
    # TODO: allow other formats, like sdf or inchi
    parsed = Chem.MolFromSmiles(smiles, sanitize=sanitize)

    # Guard clause: parsing failed, report it and hand back None.
    if parsed is None:
        if molid is not None:
            message = 'RDKit cannot create molecule %s from smiles %s' % (molid, smiles)
        else:
            message = 'RDKit cannot create a molecule from smiles %s' % smiles
        warning(message)
        return None

    if to3D:
        # NOTE(review): the result of EmbedMolecule is not checked before optimizing.
        AllChem.EmbedMolecule(parsed)
        AllChem.UFFOptimizeMolecule(parsed)
    elif to2D:
        AllChem.Compute2DCoords(parsed)

    return PropertyMol(parsed) if toPropertyMol else parsed
Esempio n. 3
0
def _iterate_text_lines(filename):
    """Yield the text lines of the file at ``filename``, closing everything opened here.

    Compression is detected from the extension only: ``.zip`` archives are read
    member by member in archive order, ``.gz`` files are decompressed on the fly,
    anything else is opened as plain text.
    """
    import io  # local import: only needed to decode compressed streams on Python 3

    def as_text(stream):
        # On Python 3 compressed streams yield bytes, which would never compare
        # equal to the str terminator; on Python 2 (bytes is str) lines are kept as is.
        return stream if bytes is str else io.TextIOWrapper(stream)

    if filename.endswith(".zip"):
        archive = zipfile.ZipFile(filename, "r")
        try:
            for member_name in archive.namelist():
                member = as_text(archive.open(member_name))
                try:
                    for line in member:
                        yield line
                finally:
                    member.close()
        finally:
            archive.close()
    else:
        if filename.endswith(".gz"):
            fileh = as_text(gzip.GzipFile(filename, "r"))
        else:
            fileh = open(filename, "r")
        try:
            for line in fileh:
                yield line
        finally:
            fileh.close()


def iterate_records_in_text_file(
    filename, record_terminator="$$$$", terminator_only_after_empty_line=True, verbose=True
):
    """Iterate over all molecule records in the given sdf file.

    Parameters:
      filename: either a path (``.zip`` and ``.gz`` are detected from the
        extension) or an already-open line iterator (anything exposing
        ``next`` / ``__next__``).  Caller-supplied iterators are not closed;
        files opened here are always closed, also when the consumer stops early.
      record_terminator: a line whose stripped content equals this string ends
        a record.
      terminator_only_after_empty_line: when true, a terminator line only ends
        the record if the line just before it is blank (just a line ending);
        otherwise it is treated as ordinary record content.
      verbose: emit a warning for each terminator that is not preceded by a
        blank line.

    Yields each record as one stripped string; the record text includes its
    terminator line.

    NB: assume that the last record also includes terminator; trailing lines
    after the last terminator are discarded.
    """
    caller_owns_stream = hasattr(filename, "next") or hasattr(filename, "__next__")
    lines = filename if caller_owns_stream else _iterate_text_lines(filename)

    partial_record = []
    line_num = 0
    try:
        for line in lines:
            partial_record.append(line)
            line_num += 1
            if line.strip() != record_terminator:
                continue
            if terminator_only_after_empty_line:
                # Blank means "nothing but a line ending", so CRLF files work too.
                preceded_by_blank = (
                    len(partial_record) >= 2 and partial_record[-2].rstrip("\r\n") == ""
                )
                if not preceded_by_blank:
                    if verbose:
                        warning(
                            "Warning: the terminator %s in line %d is not preceded by a blank line\n"
                            % (line.strip(), line_num)
                        )
                    continue  # Do not yield yet, as the terminator is not preceded by an empty line
            full_record, partial_record = partial_record, []
            yield "".join(full_record).strip()
    finally:
        # Closing the helper generator runs its own cleanup and releases the file.
        if not caller_owns_stream:
            lines.close()