def apply_to_mol(self, new_col_name, lambda_func):
    """Apply a function to the Mol object of every row and store the results.

    The Mol object is obtained per row by calling ``self.mol_method`` on the
    value in column ``self.use_col`` (``self.find_mol_col()`` is called first
    and is expected to set both up). Rows for which no Mol is obtained get NaN.
    A progress counter is printed when ``self.which_progress`` asks for it.

    Parameters:
        new_col_name: name of the column that receives the results.
        lambda_func: callable taking a Mol object and returning the value to store.

    Returns:
        None if ``self.inplace`` is set (``self.data`` gets the new column).
        Otherwise a new instance created by ``self.new()`` whose data is a
        copy of ``self.data`` plus the new column; ``self.data`` itself is
        left unchanged."""
    show_prog, _ = self.which_progress(min_len=2000)
    self.find_mol_col()
    ctr = nbt.ProgCtr() if show_prog is not None else None

    def _apply(x):
        if ctr is not None:
            ctr.inc()
            if ctr() % 500 == 0:
                # "\r" + end="" keeps rewriting the same console line.
                print(" - processed: {:8d}\r".format(ctr()), end="")
                sys.stdout.flush()
        mol = self.mol_method(x)
        if not mol:
            # float("nan") is the same value as numpy's nan; the old
            # ``pd.np`` alias no longer exists in pandas >= 2.0.
            return float("nan")
        return lambda_func(mol)

    new_col = self.data[self.use_col].apply(_apply)
    if ctr is not None:
        print(" - processed: {:8d} done.".format(ctr()))

    if self.inplace:
        self.data[new_col_name] = new_col
        return None

    result = self.new()
    # Copy first: assigning the column to an alias of self.data would
    # silently modify the original frame as well.
    result.data = self.data.copy()
    result.data[new_col_name] = new_col
    return result
def check_2d_coords(self, force=False):
    """Make sure every molecule in the frame has 2D coordinates.

    For each value in column ``self.use_col`` a Mol is obtained through
    ``self.mol_method``; every Mol that is truthy is handed to the
    ``check_2d_coords`` helper together with ``force``.
    Requires the Mol object to be present (use add_mols() ).
    The molecules are always processed in place.

    Returns None when ``self.inplace`` is set; otherwise the Series produced
    by the ``apply`` call (the per-row helper returns nothing)."""
    show_prog, _ = self.which_progress(min_len=1000)
    with_progress = show_prog is not None
    if with_progress:
        ctr = nbt.ProgCtr()
    self.find_mol_col()
    return_series = not self.inplace

    def _ensure_coords(cell):
        if with_progress:
            ctr.inc()
            if ctr() % 100 == 0:
                # "\r" + end="" keeps rewriting the same console line.
                print(" - processed: {:8d}\r".format(ctr()), end="")
                sys.stdout.flush()
        mol = self.mol_method(cell)
        if mol:
            # A bare name inside a method is not looked up on the class, so
            # this is presumably the module-level helper of the same name.
            check_2d_coords(mol, force=force)

    applied = self.data[self.use_col].apply(_ensure_coords)
    if with_progress:
        print(" - processed: {:8d} done.".format(ctr()))
    if return_series:
        return applied
    return None
def apply_to_col(self, col_name, new_col_name, lambda_func):
    """Apply a function to one column and store the results in another.

    A wrapper around the pandas ``apply`` of a column that additionally
    prints a progress counter when ``self.which_progress`` asks for it.

    Parameters:
        col_name: name of the column whose values are passed to ``lambda_func``.
        new_col_name: name of the column that receives the results.
        lambda_func: callable taking one cell value and returning the new value.

    Returns:
        None if ``self.inplace`` is set (``self.data`` gets the new column).
        Otherwise a new instance created by ``self.new()`` whose data is a
        copy of ``self.data`` plus the new column; ``self.data`` itself is
        left unchanged."""
    show_prog, _ = self.which_progress(min_len=5000)
    ctr = nbt.ProgCtr() if show_prog is not None else None

    def _apply(x):
        if ctr is not None:
            ctr.inc()
            if ctr() % 500 == 0:
                # "\r" + end="" keeps rewriting the same console line.
                print(" - processed: {:8d}\r".format(ctr()), end="")
                sys.stdout.flush()
        return lambda_func(x)

    new_col = self.data[col_name].apply(_apply)
    if ctr is not None:
        print(" - processed: {:8d} done.".format(ctr()))

    if self.inplace:
        self.data[new_col_name] = new_col
        return None

    result = self.new()
    # Copy first: assigning the column to an alias of self.data would
    # silently modify the original frame as well.
    result.data = self.data.copy()
    result.data[new_col_name] = new_col
    return result
def mol_filter(self, query, add_h=False):
    """Substructure filter. Returns a new MolFrame instance.

    Keeps every row whose molecule contains ``query`` as a substructure.
    Rows for which ``self.mol_method`` yields no Mol are skipped.

    Parameters:
        query: the substructure, has to be a Smiles string.
        add_h: if True, hydrogens are added to each molecule before matching.
            Switched on automatically when the query contains "[H]" or "#1".

    Returns:
        A new instance from ``self.new()`` holding the matching rows. When
        nothing matches, the result is an empty frame that still carries the
        columns of ``self.data``.

    Raises:
        ValueError: if no query molecule could be generated from ``query``."""
    show_prog, _ = self.which_progress(min_len=5000)
    ctr = nbt.ProgCtr() if show_prog is not None else None
    query_mol = Chem.MolFromSmiles(query)
    if not query_mol:
        raise ValueError("Could not generate query mol.")
    if "[H]" in query or "#1" in query:
        add_h = True
        print("> explicit hydrogens turned on (add_h = True)")
    res_l = []
    self.find_mol_col()
    for _, rec in self.data.iterrows():
        if ctr is not None:
            ctr.inc()
            if ctr() % 1000 == 0:
                # "\r" + end="" keeps rewriting the same console line.
                print(" - processed: {:8d}\r".format(ctr()), end="")
                sys.stdout.flush()
        mol = self.mol_method(rec[self.use_col])
        if not mol:
            continue
        target = Chem.AddHs(mol) if add_h else mol
        if target.HasSubstructMatch(query_mol):
            res_l.append(rec)
    result = self.new()
    # Passing the columns explicitly keeps them on an empty result; a bare
    # DataFrame([]) has no columns, which breaks follow-up calls that look
    # for a column (e.g. the fingerprint check in sim_filter).
    result.data = pd.DataFrame(res_l, columns=self.data.columns)
    if ctr is not None:
        print(" - processed: {:8d} done.".format(ctr()))
    print_log(result.data, "mol_filter")
    return result
def sim_filter(self, query, cutoff=0.75):
    """Similarity filter. Returns a new MolFrame instance.

    Add a suitable fingerprint once with add_fp(), then give a reference
    molecule or a SMILES string as query. Every row whose Tanimoto
    similarity to the query is at least ``cutoff`` is kept and gets the
    similarity value in the column "Sim".

    Parameters:
        query: a SMILES string or a molecule object (which is deep-copied).
        cutoff: minimum similarity for a row to be kept.

    Returns:
        A new instance from ``self.new()`` holding the matching rows. When
        nothing matches, the result is an empty frame that still carries the
        columns of ``self.data`` plus "Sim".

    Raises:
        KeyError: if ``self.fp_name`` is empty or the fingerprint column is
            missing from ``self.data``.
        ValueError: if no query molecule could be generated from ``query``."""
    if len(self.fp_name) == 0 or self.fp_col not in self.data.columns:
        raise KeyError(
            "No fingerprints found. Please generate them first with add_fp()."
        )
    show_prog, _ = self.which_progress(min_len=5000)
    ctr = nbt.ProgCtr() if show_prog is not None else None
    if isinstance(query, str):
        query_mol = Chem.MolFromSmiles(query)
    else:
        query_mol = deepcopy(query)
    if not query_mol:
        raise ValueError("Could not generate query mol.")
    fp_method = FPDICT[self.fp_name]
    query_fp = fp_method(query_mol)

    # Column layout of the result: the source columns, with "Sim" appended
    # unless the frame already has such a column (then it is overwritten).
    out_cols = list(self.data.columns)
    if "Sim" not in out_cols:
        out_cols.append("Sim")

    res_l = []
    for _, rec in self.data.iterrows():
        if ctr is not None:
            ctr.inc()
            if ctr() % 1000 == 0:
                # "\r" + end="" keeps rewriting the same console line.
                print(" - processed: {:8d}\r".format(ctr()), end="")
                sys.stdout.flush()
        # SECURITY: pickle.loads executes arbitrary code from its input.
        # Only use this on fingerprint columns generated by trusted code,
        # never on frames loaded from untrusted files.
        mol_fp = pickle.loads(b64.b64decode(rec[self.fp_col]))
        sim = DataStructs.TanimotoSimilarity(query_fp, mol_fp)
        if sim >= cutoff:
            # Work on a copy: the Series yielded by iterrows() may be a view
            # of self.data, so writing to it directly could alter the source.
            hit = rec.copy()
            hit["Sim"] = sim
            res_l.append(hit)
    result = self.new()
    # Passing the columns explicitly keeps them on an empty result; a bare
    # DataFrame([]) has no columns at all.
    result.data = pd.DataFrame(res_l, columns=out_cols)
    if ctr is not None:
        print(" - processed: {:8d} done.".format(ctr()))
    print_log(result.data, "sim_filter")
    return result