def run(sdf_files, bits=BITS, first=FIRST_DEF, level=LEVEL_DEF,
        radius_multiplier=RADIUS_MULTIPLIER_DEF, counts=COUNTS_DEF,
        stereo=STEREO_DEF, include_disconnected=INCLUDE_DISCONNECTED_DEF,
        rdkit_invariants=RDKIT_INVARIANTS_DEF,
        exclude_floating=EXCLUDE_FLOATING_DEF, params=None,
        out_dir_base=None, out_ext=OUT_EXT_DEF, db_file=None,
        overwrite=False, all_iters=False, log=None, num_proc=None,
        parallel_mode=None, verbose=False):
    """Generate E3FP fingerprints from SDF files."""
    setup_logging(log, verbose=verbose)

    # Values from a params file override the provided defaults.
    if params is not None:
        params = read_params(params, fill_defaults=True)
        bits = get_value(params, "fingerprinting", "bits", int)
        first = get_value(params, "fingerprinting", "first", int)
        level = get_value(params, "fingerprinting", "level", int)
        radius_multiplier = get_value(params, "fingerprinting",
                                      "radius_multiplier", float)
        counts = get_value(params, "fingerprinting", "counts", bool)
        stereo = get_value(params, "fingerprinting", "stereo", bool)
        include_disconnected = get_value(params, "fingerprinting",
                                         "include_disconnected", bool)
        rdkit_invariants = get_value(params, "fingerprinting",
                                     "rdkit_invariants", bool)
        exclude_floating = get_value(params, "fingerprinting",
                                     "exclude_floating", bool)

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Only the master process builds the data iterator and logs the setup.
    if para.rank == 0:
        logging.info("Initializing E3FP generation.")
        logging.info("Getting SDF files")

        # A single directory argument is expanded to all SDF files inside it.
        if len(sdf_files) == 1 and os.path.isdir(sdf_files[0]):
            from glob import glob
            sdf_files = glob("{:s}/*sdf*".format(sdf_files[0]))

        data_iterator = make_data_iterator(sdf_files)

        logging.info("SDF File Number: {:d}".format(len(sdf_files)))
        if out_dir_base is not None:
            logging.info("Out Directory Basename: {:s}".format(out_dir_base))
            logging.info("Out Extension: {:s}".format(out_ext))
        if db_file is not None:
            logging.info("Database File: {:s}".format(db_file))
        if db_file is None and out_dir_base is None:
            sys.exit("Either `db_file` or `out_dir_base` must be specified.")
        logging.info("Max First Conformers: {:d}".format(first))
        logging.info("Bits: {:d}".format(bits))
        logging.info("Level/Max Iterations: {:d}".format(level))
        logging.info(
            "Shell Radius Multiplier: {:.4g}".format(radius_multiplier))
        logging.info("Stereo Mode: {!s}".format(stereo))
        if include_disconnected:
            logging.info("Include Disconnected Mode: on")
        if rdkit_invariants:
            logging.info("Invariant type: RDKit")
        else:
            logging.info("Invariant type: Daylight")
        logging.info("Parallel Mode: {!s}".format(para.parallel_mode))
        logging.info("Starting")
    else:
        data_iterator = iter([])

    fp_kwargs = {
        "first": first,
        "bits": bits,
        "level": level,
        "radius_multiplier": radius_multiplier,
        "stereo": stereo,
        "counts": counts,
        "include_disconnected": include_disconnected,
        "rdkit_invariants": rdkit_invariants,
        "exclude_floating": exclude_floating,
        "out_dir_base": out_dir_base,
        "out_ext": out_ext,
        "all_iters": all_iters,
        "overwrite": overwrite,
        "save": False,
    }
    # Individual fingerprint files are only written when an output directory
    # is provided.
    if out_dir_base is not None:
        fp_kwargs["save"] = True

    run_kwargs = {"kwargs": fp_kwargs}

    results_iter = para.run_gen(fprints_dict_from_sdf, data_iterator,
                                **run_kwargs)

    if db_file is not None:
        fprints = []
        for result, data in results_iter:
            try:
                # Take fingerprints at the requested level, falling back to
                # the highest level that was generated.
                fprints.extend(result.get(level, result[max(result.keys())]))
            except (AttributeError, ValueError):
                # Fingerprinting failed; assume the error was logged in the
                # worker method.
                continue
        if len(fprints) > 0:
            db = FingerprintDatabase(fp_type=type(fprints[0]), level=level)
            db.add_fingerprints(fprints)
            db.save(db_file)
            logging.info("Saved fingerprints to {:s}".format(db_file))
    else:
        # Exhaust the iterator so the parallel jobs still run when no
        # database file was requested.
        list(results_iter)
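
# Example usage (a minimal sketch added for illustration, not part of the
# original code): assuming conformer SDF files already exist under a
# hypothetical "conformers/" directory, a call like the one below would
# fingerprint them in parallel and collect the results into a single
# FingerprintDatabase file. All paths and parameter values here are
# illustrative only.
#
#     run(["conformers/"], bits=1024, level=5, first=3,
#         db_file="e3fp_fingerprints.fpz", num_proc=4)
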
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation."""
    setup_logging(log, verbose=verbose)

    # Values from a params file override the provided defaults.
    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation",
                                "rmsd_cutoff", float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # Check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide a mol2 file or a SMILES file.")
        sys.exit()

    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide only a mol2 file OR a SMILES file.")
        sys.exit()

    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide at least one processor with `--num_proc`.")
        sys.exit()

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    # Only the master process builds the molecule iterator and logs the setup.
    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file, _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles, _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            # Sort molecules by rotatable bond count and molecular weight
            # (ascending) so the smallest, most rigid molecules come first.
            logging.info("Prioritizing mols with low rotatable bond number"
                         " and molecular weight first.")
            mols_with_properties = [
                (AllChem.CalcNumRotatableBonds(mol),
                 AllChem.CalcExactMolWt(mol),
                 mol)
                for mol in mol_iter if mol is not None]
            data_iterator = make_data_iterator(
                (x[-1] for x in sorted(mols_with_properties)))
        else:
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None))

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            if os.path.exists(values_file) and overwrite is not True:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format(values_file))
            else:
                value_args = (values_file, "w")
                logging.info("Values file: {} (new file)".format(values_file))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))
        logging.info("Starting.")
    else:
        data_iterator = iter([])

    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    # Write per-molecule values to the HDF5 buffer as results arrive.
    for result, data in results_iterator:
        if (para.is_master() and values_file is not None
                and result is not False):
            values_to_hdf5(hdf5_buffer, result)

    if para.is_master() and values_file is not None:
        hdf5_buffer.flush()
        hdf5_buffer.close()