def mol_to_sdf(mol, out_file, conf_num=None):
    """Write the conformers of an RDKit ``Mol`` to an SDF file.

    Conformers are written one record at a time. If conformer energies can
    be read from the molecule, the energy of the conformer being written is
    set on the molecule under ``CONF_ENERGY_PROPNAME`` before its record is
    written. Once writing finishes (or fails), that temporary property is
    cleared and the conformer energies are re-added to the molecule.

    Parameters
    ----------
    mol : RDKit Mol
        A molecule containing 1 or more conformations to write to file.
    out_file : str
        Path to save SDF file. Its parent directory is passed to
        ``touch_dir`` first.
    conf_num : int or None, optional
        Maximum number of conformers to save to file. Writing stops at the
        first conformer whose ID is >= `conf_num`. Defaults to all
        (``None`` or -1).
    """
    touch_dir(os.path.dirname(out_file))
    limit_confs = conf_num not in {-1, None}
    # Count records actually written; the loop variable is not a reliable
    # count (it is unbound with no conformers and one too high after a break).
    num_saved = 0
    with smart_open(out_file, "w") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
        conf_energies = get_conformer_energies_from_mol(mol)
        # The full energies property is removed while writing and re-added
        # in the ``finally`` clause below.
        mol.ClearProp(CONF_ENERGIES_PROPNAME)
        try:
            for conf_id in conf_ids:
                if limit_confs and conf_id >= conf_num:
                    break
                try:
                    conf_energy = conf_energies[conf_id]
                    mol.SetProp(CONF_ENERGY_PROPNAME,
                                "{:.4f}".format(conf_energy))
                except (IndexError, TypeError):
                    # No energy known for this conformer: make sure the
                    # previous conformer's energy is not written with it.
                    mol.ClearProp(CONF_ENERGY_PROPNAME)
                writer.write(mol, confId=conf_id)
                num_saved += 1
        finally:
            writer.close()
            mol.ClearProp(CONF_ENERGY_PROPNAME)
            if conf_energies is not None:
                add_conformer_energies_to_mol(mol, conf_energies)
    logging.debug("Saved {:d} conformers to {}.".format(num_saved, out_file))
def __init__(self, fold_num, out_dir, cv_method=SEASearchCVMethod,
             input_file=os.path.join(os.getcwd(), "input.pkl.bz2"),
             compute_combined=True, overwrite=False):
    """Set up the output paths and CV method for a single fold.

    Parameters
    ----------
    fold_num : object
        Identifier of this fold; stored unchanged as ``self.fold_num``.
    out_dir : str
        Directory for this fold's outputs. It is passed to ``touch_dir``
        and all result file paths are built inside it.
    cv_method : class or instance, optional
        Cross-validation method. A class is instantiated with no arguments;
        an instance is used as given. In both cases its ``out_dir`` and
        ``overwrite`` attributes are set from this call's arguments.
    input_file : str, optional
        Path stored as ``self.input_file``.
    compute_combined : bool, optional
        Stored as ``self.compute_combined``.
    overwrite : bool, optional
        Stored on both this object and `cv_method`.
    """
    # NOTE: the `cv_method` default is the class rather than an instance.
    # This constructor mutates the method object, so a default *instance*
    # would be shared (and re-pointed at a new `out_dir`) by every object
    # built without an explicit `cv_method`.
    # NOTE(review): the `input_file` default is evaluated once, when the
    # function is defined, so it reflects the working directory at that time.
    self.fold_num = fold_num
    self.out_dir = out_dir
    touch_dir(self.out_dir)
    self.input_file = input_file
    self.mask_file = os.path.join(out_dir, "train_test_mask.pkl.bz2")
    self.results_file = os.path.join(out_dir, "results.npz")
    self.target_aucs_file = os.path.join(out_dir, "target_aucs.pkl.bz2")
    self.combined_roc_file = os.path.join(out_dir, "combined_roc.pkl.bz2")
    self.combined_prc_file = os.path.join(out_dir, "combined_prc.pkl.bz2")
    self.combined_enrichment_file = os.path.join(
        out_dir, "combined_enrichment.pkl.bz2")
    if isinstance(cv_method, type):
        cv_method = cv_method()
    cv_method.out_dir = out_dir
    cv_method.overwrite = overwrite
    self.cv_method = cv_method
    self.compute_combined = compute_combined
    self.overwrite = overwrite
def __init__(self, k=5, splitter=MoleculeSplitter,
             cv_method=SEASearchCVMethod(), input_processor=None,
             parallelizer=None, out_dir=os.getcwd(), overwrite=False,
             return_auc_type="roc", reduce_negatives=False,
             fold_kwargs=None):
    """Configure a k-fold cross-validation run.

    Parameters
    ----------
    k : int, optional
        Number of folds.
    splitter : class or instance, optional
        A class is instantiated as ``splitter(k)``; an instance is used as
        given and must already have ``splitter.k == k``.
    cv_method : class or instance, optional
        Cross-validation method, stored unchanged as ``self.cv_method``.
    input_processor : object or None, optional
        Stored as ``self.input_processor``. Not allowed together with a
        SEA cross-validation method.
    parallelizer : Parallelizer or None, optional
        If None, a serial ``Parallelizer`` is created.
    out_dir : str, optional
        Output directory; passed to ``touch_dir``.
    overwrite : bool, optional
        Stored as ``self.overwrite``.
    return_auc_type : str, optional
        Stored lower-cased as ``self.return_auc_type``.
    reduce_negatives : bool, optional
        Stored as ``self.reduce_negatives``.
    fold_kwargs : dict or None, optional
        Stored as ``self.fold_kwargs``; None means a fresh empty dict.

    Raises
    ------
    ValueError
        If `input_processor` is given together with a SEA `cv_method`.
    """
    if isinstance(splitter, type):
        self.splitter = splitter(k)
    else:
        assert splitter.k == k
        self.splitter = splitter
    self.k = k
    # The default `cv_method` is an *instance*, so an identity check against
    # the class alone would never reject it; accept both forms here.
    uses_sea = (cv_method is SEASearchCVMethod
                or isinstance(cv_method, SEASearchCVMethod))
    if uses_sea and input_processor is not None:
        raise ValueError(
            "Input processing is not (currently) compatible with SEA.")
    self.cv_method = cv_method
    self.input_processor = input_processor
    self.overwrite = overwrite
    if parallelizer is None:
        self.parallelizer = Parallelizer(parallel_mode="serial")
    else:
        self.parallelizer = parallelizer
    self.out_dir = out_dir
    touch_dir(out_dir)
    self.input_file = os.path.join(self.out_dir, "inputs.pkl.bz2")
    self.return_auc_type = return_auc_type.lower()
    self.reduce_negatives = reduce_negatives
    # A literal `{}` default would be one dict shared by every instance.
    self.fold_kwargs = {} if fold_kwargs is None else fold_kwargs
def substructs_to_pdb(
    self,
    level=None,
    bits=None,
    out_dir="substructs",
    reorient=True,
    exact=False,
):
    """Write one PDB file per shell found at the requested level.

    Parameters
    ----------
    level : int or None, optional
        Level of fingerprinting/number of iterations; forwarded to
        ``get_shells_at_level``.
    bits : int or None, optional
        Folding level of identifiers. ``None`` or -1 uses ``self.bits``.
    out_dir : str, optional
        Directory to which to save PDB files.
    reorient : bool, optional
        Forwarded to ``shell_to_pdb``.
    exact : bool, optional
        Forwarded to ``get_shells_at_level``.

    Returns
    -------
    list of str
        Paths of the written files, in shell order. Each file is named
        after the shell's unsigned identifier modulo `bits`.
    """
    level_shells = self.get_shells_at_level(level=level, exact=exact)
    fold_bits = self.bits if bits in (-1, None) else bits
    touch_dir(out_dir)

    def _write_shell(shell):
        # File name is the shell identifier folded into `fold_bits`.
        folded_id = signed_to_unsigned_int(shell.identifier) % fold_bits
        pdb_path = os.path.join(out_dir, "{}.pdb.gz".format(folded_id))
        shell_to_pdb(
            self.mol,
            shell,
            self.atom_coords,
            self.bound_atoms_dict,
            pdb_path,
            reorient=reorient,
        )
        return pdb_path

    return [_write_shell(shell) for shell in level_shells]
def mol_to_sdf(mol, out_file, conf_num=None):
    """Write the conformers of an RDKit ``Mol`` to an SDF file.

    Parameters
    ----------
    mol : RDKit Mol
        A molecule containing 1 or more conformations to write to file.
    out_file : str
        Path to save SDF file. Its parent directory is passed to
        ``touch_dir`` first.
    conf_num : int or None, optional
        Maximum number of conformers to save to file. Writing stops at the
        first conformer whose ID is >= `conf_num`. Defaults to all
        (``None`` or -1).
    """
    touch_dir(os.path.dirname(out_file))
    limit_confs = conf_num not in {-1, None}
    # Count records actually written; the loop variable is not a reliable
    # count (it is unbound with no conformers and one too high after a break).
    num_saved = 0
    # NOTE(review): the file is opened in binary mode; confirm `smart_open`
    # and `SDWriter` agree on bytes vs. text for the targeted Python version.
    with smart_open(out_file, "wb") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        try:
            conf_ids = [conf.GetId() for conf in mol.GetConformers()]
            for conf_id in conf_ids:
                if limit_confs and conf_id >= conf_num:
                    break
                writer.write(mol, confId=conf_id)
                num_saved += 1
        finally:
            # Close the writer even if a write fails.
            writer.close()
    logging.debug("Saved {:d} conformers to {}.".format(num_saved, out_file))
def main(job_id, params, main_conf_dir=MAIN_CONF_DIR, main_dir=CV_DIR,
         out_dir=None, smiles_file=SMILES_FILE, check_existing=True,
         mol_targets_file=MOL_TARGETS_FILE, k=CV_K, log_file=LOG_FILE,
         verbose=False, overwrite=False, min_mols=MIN_MOLS_PER_TARGET,
         parallelizer=None):
    """Fingerprint molecules for one parameter set and cross-validate them.

    Steps, in order: format the parameters and derive the output directory,
    write ``params.cfg``, obtain the molecules (load an existing molecules
    file, generate fingerprints from conformers, or reuse fingerprints
    found by ``get_existing_fprints``), write the filtered targets file if
    needed, then run k-fold cross-validation.

    Parameters
    ----------
    job_id : object
        Not used in the body of this function.
    params : dict-like
        Parameter set; passed through ``format_params``. The keys
        ``'first'`` and ``'conformers'`` are read here.
    main_conf_dir : str, optional
        Parent directory joined with ``params['conformers']`` to locate
        conformers.
    main_dir : str, optional
        Parent directory for `out_dir` when that is None; also passed to
        ``get_existing_fprints``.
    out_dir : str or None, optional
        Output directory. Defaults to ``main_dir/<params string>``.
    smiles_file : str, optional
        Passed to ``params_to_molecules``.
    check_existing : bool, optional
        Not used in the body of this function.
    mol_targets_file : str, optional
        Targets file read when the filtered targets file must be built.
    k : int, optional
        Number of cross-validation folds.
    log_file : str or None, optional
        Log file name, joined onto `out_dir` when not None.
    verbose : bool, optional
        Passed to ``setup_logging``.
    overwrite : bool, optional
        If True, existing molecules/targets files are regenerated.
    min_mols : int, optional
        Passed to the cross-validator's ``run``.
    parallelizer : Parallelizer or None, optional
        Anything that is not a ``Parallelizer`` is replaced by a
        process-based one with ``NUM_PROC`` processes.

    Returns
    -------
    float
        ``1 - auc``, where ``auc`` is the value returned by the
        cross-validator's ``run`` (logged as "CV Mean AUC").
    """
    params = format_params(params)
    pre_encoding_params_string = params_to_str(params, with_first=False)
    params_string = params_to_str(params)
    if out_dir is None:
        out_dir = os.path.join(main_dir, params_string)
    touch_dir(out_dir)
    if log_file is not None:
        log_file = os.path.join(out_dir, log_file)
    setup_logging(log_file, verbose=verbose)

    # Record the parameters used for this run next to its outputs.
    params_file = os.path.join(out_dir, "params.cfg")
    config_parser = update_params(params, section_name="fingerprinting")
    write_params(config_parser, params_file)

    if not isinstance(parallelizer, Parallelizer):
        parallelizer = Parallelizer(parallel_mode="processes",
                                    num_proc=NUM_PROC)

    logging.info("Params: {!r}".format(params.items()))
    logging.info("Saving files to {:s}.".format(out_dir))
    logging.info("Checking for usable pre-existing fingerprints.")
    # NOTE(review): `check_existing` is never consulted; this lookup always
    # runs. Confirm whether it was meant to be conditional.
    existing_molecules_file = get_existing_fprints(pre_encoding_params_string,
                                                   params['first'], main_dir)

    molecules_file = get_molecules_file(out_dir)
    if os.path.isfile(molecules_file) and not overwrite:
        logging.info("Molecules file already exists. Loading.")
        smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
            molecules_file)
    elif existing_molecules_file is None:
        # Nothing reusable: fingerprint the conformers from scratch.
        conf_dir = os.path.join(main_conf_dir, params['conformers'])
        logging.info("Generating fingerprints from conformers in "
                     "{!s}.".format(conf_dir))
        smiles_dict, mol_lists_dict, fp_type = params_to_molecules(
            params, smiles_file, conf_dir, out_dir, parallelizer=parallelizer)
    else:
        # Reuse fingerprints from another run and save them to this run's
        # molecules file.
        logging.info("Using native strings from existing molecules "
                     "file {!s}.".format(existing_molecules_file))
        smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
            existing_molecules_file, first=params['first'])
        lists_dicts_to_molecules(get_molecules_file(out_dir),
                                 smiles_dict, mol_lists_dict, fp_type)

    targets_file = get_targets_file(out_dir)
    if overwrite or not os.path.isfile(targets_file):
        logging.info("Reading targets from {!s}.".format(mol_targets_file))
        targets_dict = targets_to_dict(mol_targets_file, affinity=AFFINITY)
        logging.debug("Read {:d} targets.".format(len(targets_dict)))
        logging.info("Filtering targets by molecules.")
        filtered_targets_dict = targets_to_mol_lists_targets(
            filter_targets_by_molecules(targets_dict, mol_lists_dict),
            mol_lists_dict)
        # Drop the large intermediates before writing and cross-validating.
        del targets_dict, smiles_dict, mol_lists_dict, fp_type
        logging.info("Saving filtered targets to {!s}.".format(targets_file))
        dict_to_targets(targets_file, filtered_targets_dict)
        del filtered_targets_dict
    else:
        logging.info("Targets file already exists. Skipping.")

    # Rebuild the parallelizer with `k + 1` processes for cross-validation,
    # keeping the parallel mode chosen above.
    parallel_mode = parallelizer.parallel_mode
    parallelizer = Parallelizer(parallel_mode=parallel_mode, num_proc=k + 1)

    splitter = ByTargetMoleculeSplitter(k, reduce_negatives=REDUCE_NEGATIVES)
    # NOTE(review): `overwrite=False` is hard-coded here, so this function's
    # `overwrite` argument does not reach the cross-validator. Confirm intent.
    kfold_cv = KFoldCrossValidator(k=k, parallelizer=parallelizer,
                                   splitter=splitter,
                                   return_auc_type=AUC_TYPE, out_dir=out_dir,
                                   overwrite=False)
    auc = kfold_cv.run(molecules_file, targets_file, min_mols=min_mols,
                       affinity=AFFINITY)
    logging.info("CV Mean AUC: {:.4f}".format(auc))
    return 1 - auc
def fprints_dict_from_mol(mol, bits=BITS, level=LEVEL_DEF,
                          radius_multiplier=RADIUS_MULTIPLIER_DEF,
                          first=FIRST_DEF, counts=COUNTS_DEF,
                          stereo=STEREO_DEF,
                          include_disconnected=INCLUDE_DISCONNECTED_DEF,
                          rdkit_invariants=RDKIT_INVARIANTS_DEF,
                          exclude_floating=EXCLUDE_FLOATING_DEF,
                          out_dir_base=None, out_ext=OUT_EXT_DEF, save=False,
                          all_iters=False, overwrite=False):
    """Build E3FP fingerprints from a mol with at least one conformer.

    Parameters
    ----------
    mol : RDKit Mol
        Input molecule with one or more conformers to be fingerprinted.
        Its ``_Name`` property is used to name fingerprints and files.
    bits : int
        Set number of bits for final folded fingerprint. ``None`` or -1
        uses ``BITS``.
    level : int, optional
        Level/maximum number of iterations of E3FP. If -1 (or None) is
        provided, it runs until termination, and `all_iters` is treated as
        False.
    radius_multiplier : float, optional
        Radius multiplier for spherical shells.
    first : int, optional
        First `N` number of conformers to fingerprint. If -1, all are
        fingerprinted.
    counts : bool, optional
        Generate count-based fingerprints instead of bit-based ones.
    stereo : bool, optional
        Incorporate stereochemistry in fingerprint.
    include_disconnected : bool, optional
        Include disconnected atoms when hashing and for stereo calculations.
        Turn off purely for testing purposes, to make E3FP more like ECFP.
    rdkit_invariants : bool, optional
        Use the atom invariants used by RDKit for its Morgan fingerprint.
    exclude_floating : bool, optional
        Mask atoms with no bonds (usually floating ions) from the
        fingerprint. These are often placed arbitrarily and can confound
        the fingerprint.
    out_dir_base : str, optional
        Basename of out directory to save fingerprints. Iteration number
        (or ``_complete`` when `level` is -1) is appended.
    out_ext : str, optional
        Extension on fingerprint pickles, used to determine compression
        level.
    save : bool, optional
        Save fingerprints to directory.
    all_iters : bool, optional
        Save fingerprints from all iterations to file(s).
    overwrite : bool, optional
        Overwrite pre-existing file.

    Returns
    -------
    dict
        Maps each requested level to the list of fingerprints generated at
        that level, one per conformer. An empty dict is returned when all
        output files already exist (and `overwrite` is False), when
        fingerprinting or saving fails, or when no conformer was
        fingerprinted.
    """
    name = mol.GetProp("_Name")

    if level is None:
        level = -1

    if bits in (-1, None):
        bits = BITS

    # Only the final level is kept unless every iteration was requested for
    # a fixed number of iterations.
    single_level = level == -1 or not all_iters

    if save:
        filenames = []
        all_files_exist = True
        if single_level:
            if level == -1:
                dir_name = "{!s}_complete".format(out_dir_base)
            else:
                dir_name = "{!s}{:d}".format(out_dir_base, level)
            touch_dir(dir_name)
            filenames.append(
                os.path.join(dir_name, "{!s}{!s}".format(name, out_ext)))
            if not os.path.isfile(filenames[0]):
                all_files_exist = False
        else:
            for i in range(level + 1):
                dir_name = "{:s}{:d}".format(out_dir_base, i)
                touch_dir(dir_name)
                filename = os.path.join(dir_name,
                                        "{!s}{!s}".format(name, out_ext))
                filenames.append(filename)
                if not os.path.isfile(filename):
                    all_files_exist = False

        if all_files_exist and not overwrite:
            logging.warning("All fingerprint files for {!s} already exist. "
                            "Skipping.".format(name))
            return {}

    fingerprinter = Fingerprinter(bits=bits, level=level,
                                  radius_multiplier=radius_multiplier,
                                  counts=counts, stereo=stereo,
                                  include_disconnected=include_disconnected,
                                  rdkit_invariants=rdkit_invariants,
                                  exclude_floating=exclude_floating)

    # The set of levels to collect does not depend on the conformer.
    level_range = (level, ) if single_level else range(level + 1)

    try:
        fprints_dict = {}
        num_confs = 0
        logging.info("Generating fingerprints for {!s}.".format(name))
        for j, conf in enumerate(mol.GetConformers()):
            if j == first:
                break
            fingerprinter.run(conf, mol)
            for i in level_range:
                fprint = fingerprinter.get_fingerprint_at_level(i)
                fprint.name = MolItemName.from_str(name).to_conf_name(j)
                fprints_dict.setdefault(i, []).append(fprint)
            num_confs += 1
        logging.info("Generated {:d} fingerprints for {!s}.".format(
            num_confs, name))
    except Exception:
        # Best effort per molecule: log the failure and report no results.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        logging.error("Error generating fingerprints for {:s}.".format(name),
                      exc_info=True)
        return {}

    # Nothing to write if no conformer was fingerprinted.
    if save and fprints_dict:
        if single_level:
            fprints = fprints_dict[max(fprints_dict.keys())]
            try:
                fp.savez(filenames[0], *fprints)
                logging.info("Saved fingerprints for {:s}.".format(name))
            except Exception:
                logging.error(
                    "Error saving fingerprints for {:s} to {:s}".format(
                        name, filenames[0]), exc_info=True)
                return {}
        else:
            try:
                for i, fprints in sorted(fprints_dict.items()):
                    fp.savez(filenames[i], *fprints)
                logging.info("Saved fingerprints for {:s}.".format(name))
            except Exception:
                logging.error(
                    "Error saving fingerprints for {:s} to {:s}".format(
                        name, filenames[i]), exc_info=True)
                return {}

    return fprints_dict
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation.

    Exactly one of `mol2` and `smiles` must be given. The master process
    builds the molecule iterator and logs the configuration; every other
    process gets an empty iterator. Molecules are then handed to
    ``generate_conformers`` through the parallelizer, and on the master
    each result that is not ``False`` is written to `values_file` when one
    is given.

    Parameters
    ----------
    mol2 : sequence of str or None, optional
        mol2 inputs, expanded with ``mol2_generator(*mol2)``.
    smiles : sequence of str or None, optional
        SMILES inputs, expanded with ``smiles_generator(*smiles)``.
    standardise, num_conf, first, pool_multiplier, rmsd_cutoff,
    max_energy_diff, forcefield, seed : optional
        Conformer generation options. When `params` is given, all of these
        are replaced by the values read from it.
    params : str or None, optional
        Parameters source passed to ``read_params``.
    prioritize : bool, optional
        Process molecules in order of increasing rotatable bond count,
        then molecular weight.
    out_dir : str, optional
        Output directory; passed to ``touch_dir`` on the master.
    compress : optional
        Forwarded to ``generate_conformers``.
    overwrite : bool, optional
        Forwarded to ``generate_conformers``; also selects whether an
        existing `values_file` is appended to or replaced.
    values_file : str or None, optional
        Path of the HDF5 values file, if results should be recorded.
    log : str or None, optional
        Passed to ``setup_logging``.
    num_proc, parallel_mode : optional
        Passed to ``Parallelizer``.
    verbose : bool, optional
        Passed to ``setup_logging``.

    Raises
    ------
    ValueError
        If `forcefield` is not in ``FORCEFIELD_CHOICES``.
    SystemExit
        If the input arguments are inconsistent.
    """
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation", "rmsd_cutoff",
                                float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense
    # NOTE(review): `parser` is not defined in this function; presumably a
    # module-level argument parser -- confirm. `sys.exit()` is also called
    # without a status, so these error paths exit with status 0.
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide mol2 file or a SMILES file.")
        sys.exit()

    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide only a mol2 file OR a SMILES file.")
        sys.exit()

    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide more than one processor with `--num_proc`.")
        sys.exit()

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file, _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: Detected SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles, _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            logging.info(("Prioritizing mols with low rotatable bond number"
                          " and molecular weight first."))
            mols_with_properties = [(
                AllChem.CalcNumRotatableBonds(mol),
                AllChem.CalcExactMolWt(mol),
                mol,
            ) for mol in mol_iter if mol is not None]
            # Sort on the two numeric fields only. Without a key, a tie on
            # both (e.g. stereoisomers) falls through to comparing the mol
            # objects themselves.
            data_iterator = make_data_iterator(
                (x[-1] for x in sorted(mols_with_properties,
                                       key=lambda x: x[:2])))
        else:
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None))

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            # Append to an existing values file unless overwriting.
            if os.path.exists(values_file) and overwrite is not True:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format((values_file)))
            else:
                value_args = (values_file, "w")
                logging.info("Values file: {} (new file)".format(
                    (values_file)))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        # Only the master supplies molecules.
        data_iterator = iter([])

    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    for result, data in results_iterator:
        if (para.is_master() and values_file is not None
                and result is not False):
            values_to_hdf5(hdf5_buffer, result)

    if para.is_master() and values_file is not None:
        hdf5_buffer.flush()
        hdf5_buffer.close()
dict_to_smiles(CHEMBL_CANON_SMILES_FILE, chembl_canonical_smiles_dict) chembl_canonical_smiles_dict = smiles_to_dict(CHEMBL_CANON_SMILES_FILE) pdb_to_chembl_mol_map, chembl_to_pdb_mol_map = mol_map_from_smiles( pdb_canonical_smiles_dict, chembl_canonical_smiles_dict) with smart_open(PDB_CHEMBL_MOL_MAP, "w") as f: pkl.dump((pdb_to_chembl_mol_map, chembl_to_pdb_mol_map), f) else: with smart_open(PDB_CHEMBL_MOL_MAP, "r") as f: pdb_to_chembl_mol_map, chembl_to_pdb_mol_map = pkl.load(f) # Save query ligands to SDF files and get SMILES pdb_smiles_dict = {} skip_pairs = set() touch_dir(PDB_CONF_DIR) for mol_name, mol2_file in pdb_mol2_files.items(): mol = mol_from_mol2(mol2_file, mol_name) smiles = MolToSmiles(mol, isomericSmiles=True) pdb_smiles_dict[mol_name] = smiles CanonicalizeMol(mol) sdf_file = os.path.join(PDB_CONF_DIR, "{}.sdf.bz2".format(mol_name)) mol_to_sdf(mol, sdf_file) dict_to_smiles(PDB_SMILES_FILE, pdb_smiles_dict) # Build filtered CHEMBL targets dict chembl_smiles_cids = {k.split("-")[0] for k in chembl_smiles_dict.keys()} chembl_cids = set() if not os.path.isfile(CHEMBL_TARGETS): filtered_chembl_targets_dict = {} for k, v in pdb_targets_dict.items():
def main(query_molecules_file, query_targets_file, target_molecules_file,
         target_targets_file, method=SEASearchCVMethod, fit_file=None,
         log=None, out_dir="./"):
    """Train on the target set, score the query set, and save the curves.

    `method` is instantiated with no arguments and pointed at `out_dir`.
    For a SEA method the target files are re-saved to the method's training
    files and the query molecules to its test file. For any other method
    the inputs are loaded with ``sea_format=False`` and all-True masks are
    used. After testing, NaN scores are dropped, and the ROC, PRC and
    enrichment curves are pickled to ``combined_{roc,prc,enrichment}.pkl.bz2``
    in `out_dir`, with their areas logged.

    Parameters
    ----------
    query_molecules_file, query_targets_file : str
        Files describing the query (test) set.
    target_molecules_file, target_targets_file : str
        Files describing the target (training) set.
    method : class, optional
        Cross-validation method class.
    fit_file : str or None, optional
        Fit file for a SEA method; defaults to ``library.fit`` in `out_dir`.
    log : str or None, optional
        Passed to ``setup_logging``.
    out_dir : str, optional
        Output directory; passed to ``touch_dir``.
    """
    setup_logging(log)
    method = method()
    method.out_dir = out_dir
    touch_dir(out_dir)
    if fit_file is None:
        fit_file = os.path.join(out_dir, "library.fit")

    def _pickle_to(path, obj):
        # Serialise `obj` to `path` with the highest pickle protocol.
        with smart_open(path, "wb") as handle:
            pkl.dump(obj, handle, pkl.HIGHEST_PROTOCOL)

    # ---- training set -------------------------------------------------
    logging.info("Loading target files.")
    if isinstance(method, SEASearchCVMethod):
        method.fit_file = fit_file
        (_, train_targets_dict, train_smiles_dict, train_mol_list_dict,
         train_fp_type, train_target_list,
         train_mol_list) = process_input_files(target_molecules_file,
                                               target_targets_file,
                                               sea_format=True)
        logging.info("Saving target SEA files.")
        dict_to_targets(method.train_targets_file, train_targets_dict)
        lists_dicts_to_molecules(method.train_molecules_file,
                                 train_smiles_dict, train_mol_list_dict,
                                 train_fp_type)
        # SEA works from the files just written, not from arrays.
        train_fp_array = None
        train_mol_to_fp_inds = None
        train_target_mol_array = None
        train_mask = None
    else:
        (train_fp_array, train_mol_to_fp_inds, train_target_mol_array,
         train_target_list,
         train_mol_list) = process_input_files(target_molecules_file,
                                               target_targets_file,
                                               sea_format=False)
        train_mask = np.ones_like(train_target_mol_array, dtype=np.bool_)

    method.train(train_fp_array, train_mol_to_fp_inds,
                 train_target_mol_array, train_target_list, train_mol_list,
                 mask=train_mask)

    # ---- query set ----------------------------------------------------
    logging.info("Loading query files.")
    if isinstance(method, SEASearchCVMethod):
        (query_target_mol_array, query_targets_dict, query_smiles_dict,
         query_mol_list_dict, query_fp_type, query_target_list,
         query_mol_list) = process_input_files(query_molecules_file,
                                               query_targets_file,
                                               sea_format=True)
        logging.info("Saving query SEA files.")
        lists_dicts_to_molecules(method.test_molecules_file,
                                 query_smiles_dict, query_mol_list_dict,
                                 query_fp_type)
        query_fp_array = None
        query_mol_to_fp_inds = None
        # The SEA branch reuses the (None) training mask.
        test_mask = train_mask
    else:
        (query_fp_array, query_mol_to_fp_inds, query_target_mol_array,
         query_target_list,
         query_mol_list) = process_input_files(query_molecules_file,
                                               query_targets_file,
                                               sea_format=False)
        test_mask = np.ones_like(query_target_mol_array, dtype=np.bool_)

    results = method.test(query_fp_array, query_mol_to_fp_inds,
                          query_target_mol_array, query_target_list,
                          query_mol_list, mask=test_mask)

    # ---- scoring ------------------------------------------------------
    y_true = query_target_mol_array.ravel()
    y_score = results.ravel()
    # Keep only the pairs that received a (non-NaN) score.
    scored_inds = np.where(~np.isnan(y_score))
    y_true = y_true[scored_inds]
    y_score = y_score[scored_inds]

    logging.info("Computing results curves.")
    roc_file = os.path.join(out_dir, "combined_{}.pkl.bz2".format("roc"))
    prc_file = os.path.join(out_dir, "combined_{}.pkl.bz2".format("prc"))
    enrich_file = os.path.join(out_dir,
                               "combined_{}.pkl.bz2".format("enrichment"))

    logging.info("Computing ROC curves.")
    roc = roc_curve(y_true, y_score, drop_intermediate=True)
    auroc = auc(roc[0], roc[1])
    _pickle_to(roc_file, roc)
    logging.info("AUROC: {:.4f}".format(auroc))

    logging.info("Computing PRC curves.")
    prc_rec = precision_recall_curve(y_true, y_score)
    # Reorder so the tuple starts with element 1, then element 0.
    prc = (prc_rec[1], prc_rec[0], prc_rec[2])
    auprc = auc(prc[0], prc[1])
    imbalance = get_imbalance(y_true)
    _pickle_to(prc_file, prc)
    logging.info("AUPRC: {:.4f} ({:.4f} of data is positive)".format(
        auprc, imbalance))

    logging.info("Computing enrichment curves.")
    enrichment = enrichment_curve(y_true, y_score)
    _pickle_to(enrich_file, enrichment)
    auec = auc(enrichment[0], enrichment[1])
    logging.info("AUE: {:.4f}".format(auec))
cmd.ray(dpi) cmd.png(axes_figure) partial_opaque_to_opaque(axes_figure) sleep(0.5) if __name__ == "__main__": usage = "pymol -r make_shell_figures.py -- <sdf_file>" try: sdf_file = sys.argv[1] except IndexError: sys.exit(usage) mol_name = os.path.basename(sdf_file).split(".")[0] out_dir = mol_name touch_dir(out_dir) json_out_file = os.path.join(out_dir, "graph.json") aligned_mol_pdb_file = os.path.join(out_dir, "mol.pdb") mol = mol_from_sdf(sdf_file) left_to_right_atom_ids = save_aligned_conf_to_pdb(aligned_mol_pdb_file, mol) fprinter = fingerprint_mol(mol) graph = create_shell_graph(fprinter, radius_multiplier=RADIUS_MULTIPLIER, only_unique=True) atom_types_dict = get_atom_types(mol, graph) atom_colors_dict = define_colors_by_atom_types(atom_types_dict, mol) write_pdb_files(fprinter, BITS, out_dir=out_dir) pdb_dirs = sorted(glob.glob(os.path.join(out_dir, "substructs*")))
def library_from_map(targets_map_file, all_molecules_file, all_targets_file,
                     fit_file=None, sample=None, affinity=None,
                     out_dir='./'):
    """Build SEA library from target map and existing SEA molecules/targets.

    The targets in `all_targets_file` are filtered by the targets map, the
    molecules are filtered down to those targets, and both are written to
    ``targets.csv.bz2`` and ``molecules.csv.bz2`` in `out_dir`. If no usable
    fit file exists, a fit is generated first by building a temporary
    library from either a random sample of the molecules or all of them.
    The final library is written to ``library.sea`` in `out_dir`.

    Parameters
    ----------
    targets_map_file : str
        Targets map, read with ``read_targets_map``.
    all_molecules_file : str
        SEA molecules file containing all molecules.
    all_targets_file : str
        SEA targets file containing all targets.
    fit_file : str or None, optional
        Existing fit file. If None or not an existing file, a fit is
        generated; when None it is written to ``library.fit`` in `out_dir`.
    sample : int or None, optional
        Number of random molecules to sample for the fit. None uses all
        molecules.
    affinity : optional
        Passed to ``targets_to_dict``.
    out_dir : str, optional
        Output directory; passed to ``touch_dir``.
    """
    molecules_file = os.path.join(out_dir, "molecules.csv.bz2")
    targets_file = os.path.join(out_dir, "targets.csv.bz2")
    library_file = os.path.join(out_dir, "library.sea")
    touch_dir(out_dir)

    logging.info("Reading targets map from {0}".format(targets_map_file))
    targets_map = read_targets_map(targets_map_file, key_header=KEY_HEADER,
                                   headers=MAP_HEADER)
    logging.debug("{:d} targets in map".format(len(targets_map)))

    # Log the actual path (previously the literal text "all_targets_file").
    logging.info("Reading targets file from {0}".format(all_targets_file))
    all_targets_dict = targets_to_dict(all_targets_file, affinity=affinity)
    logging.debug("Read {:d} targets".format(len(all_targets_dict)))
    targets_dict = filter_targets_by_map(all_targets_dict, targets_map)
    logging.debug("{:d} targets after filtering".format(len(targets_dict)))

    logging.info("Reading molecules file from {0}".format(all_molecules_file))
    smiles_dict, all_mol_lists_dict, fp_type = molecules_to_lists_dicts(
        all_molecules_file)
    logging.debug("{:d} molecules in file".format(len(all_mol_lists_dict)))

    mol_lists_targets_dict = targets_to_mol_lists_targets(
        targets_dict, all_mol_lists_dict)
    logging.debug("{:d} mol lists targets".format(len(mol_lists_targets_dict)))
    logging.info("Writing targets file")
    dict_to_targets(targets_file, mol_lists_targets_dict)

    mol_lists_dict = filter_molecules_by_targets(all_mol_lists_dict,
                                                 targets_dict)
    # Intermediates are dropped as soon as they are no longer needed.
    del targets_dict
    logging.debug("{:d} filtered molecules".format(len(mol_lists_dict)))
    del mol_lists_targets_dict
    logging.info("Writing molecules file")
    lists_dicts_to_molecules(molecules_file, smiles_dict, mol_lists_dict,
                             fp_type)

    if fit_file is None or not os.path.isfile(fit_file):
        logging.info("Fit file does not exist. Generating fit.")
        if fit_file is None:
            fit_file = os.path.join(out_dir, "library.fit")

        # Temporary inputs for the fit library; their names carry no
        # directory, so they are relative to the working directory.
        tmp_molecules_file = all_molecules_file
        tmp_targets_file = TMP_PREFIX + "_" + os.path.basename(targets_file)
        tmp_library_file = TMP_PREFIX + "_" + os.path.basename(library_file)
        if sample is not None:
            logging.info("Sampling {} random molecules for fit".format(sample))
            tmp_molecules_file = TMP_PREFIX + "_" + os.path.basename(
                molecules_file)
            sample_mol_lists_files(all_molecules_file, all_targets_file,
                                   sample,
                                   sample_molecules_file=tmp_molecules_file,
                                   sample_targets_file=tmp_targets_file,
                                   overwrite=True)
        else:
            logging.info("Using all molecules for fit")
            all_mol_lists_targets_dict = targets_to_mol_lists_targets(
                all_targets_dict, all_mol_lists_dict)
            logging.info("Writing all targets to file.")
            dict_to_targets(tmp_targets_file, all_mol_lists_targets_dict)
            del all_mol_lists_targets_dict

        logging.info("Building library for fit molecules/targets.")
        build_library(tmp_library_file, tmp_molecules_file, tmp_targets_file,
                      fit_file, log=True, no_plot=False)
    else:
        logging.info("Fit file already exists. Skipping fit generation.")
    del all_mol_lists_dict

    logging.info("Building library")
    build_library(library_file, molecules_file, targets_file, fit_file,
                  log=True)
    logging.info("Library has been built.")
def train(self, fp_array, mol_to_fp_inds, target_mol_array, target_list,
          mol_list, mask):
    """Train a classifier for each target and save its fit file.

    Returns immediately if all targets are already trained and
    ``self.overwrite`` is False. Individual targets whose fit file already
    exists are likewise skipped unless overwriting. A classifier is only
    scored on its training data when ``self.train_sample_negatives`` is
    set.

    Parameters
    ----------
    fp_array : ndarray or csr_matrix (n_fprints, n_bits)
        Array with fingerprints as rows
    mol_to_fp_inds : dict
        Map from index of `mol_list` to indices for mol fingerprints in
        `fp_array`
    target_mol_array : ndarray of bool (n_targets, n_mols)
        Boolean array with True marking mol/target binding pairs
        and False marking implied negatives.
    target_list : list of str
        List of target names corresponding to rows of `target_mol_array`.
    mol_list : list of str
        List of mol names corresponding to columns of `target_mol_array`.
        Not used in the body of this method.
    mask : ndarray of bool (n_targets, n_mols)
        Boolean array with positives marking mol/target pairs in the
        training dataset.
    """
    if self.is_trained(target_list) and not self.overwrite:
        logging.info("All targets already trained.")
        return

    if self.dense_data and issparse(fp_array):
        logging.info("Converting from sparse to dense fingerprints.")
        fp_array = fp_array.toarray()
    fp_array = fp_array.astype(self.dtype)

    touch_dir(self.fit_dir)
    logging.info("Generating target fits.")
    target_num = len(target_list)
    for i, target_key in enumerate(target_list):
        fit_file = self._fit_file_from_target_key(target_key)
        if os.path.isfile(fit_file) and not self.overwrite:
            logging.debug(
                "Fit file for {} already exists. Skipping".format(
                    target_key.tid))
            continue

        # get subset of training data
        set_fp_inds, set_mol_inds, set_fp_num = self.get_fprint_subsets(
            mol_to_fp_inds, target_mol_array[i, :], mask[i, :],
            sample_negatives=self.train_sample_negatives)
        data = fp_array[set_fp_inds, :]
        # Repeat each molecule's label once per selected fingerprint.
        pos = np.repeat(target_mol_array[i, set_mol_inds],
                        set_fp_num).astype(self.dtype)

        # perform training
        clf = self.create_clf(data)
        logging.debug("Fitting {} using {} fprints ({}/{})".format(
            target_key.tid, data.shape[0], i + 1, target_num))
        self.train_clf(clf, data, pos, batch_size=self.train_batch_size)
        if self.train_sample_negatives:  # expensive if all data used
            score = self.score_clf(clf, data, pos)
            logging.debug("Fitted {} with score {:.4f}. ({}/{})".format(
                target_key.tid, score, i + 1, target_num))
        else:
            logging.debug("Fitted {}. ({}/{})".format(
                target_key.tid, i + 1, target_num))
        self.save_fit_file(target_key, clf)
    logging.info("Finished fitting targets.")