def mol_lists_targets_to_targets(mol_lists_targets_dict, merge_proto=True):
    """Convert targets with mol lists to targets with only mol names.

    Parameters
    ----------
    mol_lists_targets_dict : dict
        Maps target keys to set values exposing `name`, `cids` and
        `description` attributes. Each entry of `cids` must be a string
        accepted by `MolItemName.from_str`.
    merge_proto : bool, optional
        If True, each cid is replaced by the `mol_name` of its parsed
        `MolItemName`; otherwise by its `proto_name`.

    Returns
    -------
    dict
        Maps each target key to a new `SetValue` carrying the same name and
        description, with the cids replaced by the de-duplicated names in
        first-seen order.
    """
    name_attr = "mol_name" if merge_proto else "proto_name"
    targets_dict = {}
    # `.items()` instead of `.iteritems()` so this runs on Python 2 and 3.
    for target_key, mol_lists_set_value in mol_lists_targets_dict.items():
        # OrderedDict keys drop duplicates while keeping first-seen order.
        cids_dict = OrderedDict(
            (getattr(MolItemName.from_str(x), name_attr), None)
            for x in mol_lists_set_value.cids)
        # Materialize as a list so SetValue never receives a dict view.
        targets_dict[target_key] = SetValue(mol_lists_set_value.name,
                                            list(cids_dict),
                                            mol_lists_set_value.description)
    return targets_dict
def molecules_to_lists_dicts(molecules_file, first=-1, merge_proto=True):
    """Read molecules file to dict of mol names to list of native tuples.

    Parameters
    ----------
    molecules_file : str
        File passed to `read_csv_mols` (with ``has_fp=True``).
    first : int, optional
        If greater than 0, keep only the first `first` native tuples of
        each proto name. Any other value (including None) keeps all.
    merge_proto : bool, optional
        If True, results are keyed by `mol_name` and the native tuples of
        all proto names sharing a `mol_name` are concatenated in sorted
        proto-name order; the SMILES kept is that of the first such proto
        name. If False, results are keyed by `proto_name`.

    Returns
    -------
    smiles_dict : dict
        Maps each name to a SMILES string.
    mol_lists_dict : dict
        Maps each name to a list of ``(fp_native, fp_name)`` tuples.
    fp_type : object
        The first item produced by `read_csv_mols`.
    """
    proto_smiles_dict = {}
    proto_lists_dict = {}

    mol_iter = read_csv_mols(molecules_file, has_fp=True)
    # The first item of the iterator is the fingerprint type, not a row.
    fp_type = next(mol_iter)
    for row in mol_iter:
        if row is None:
            continue
        fp_name, smiles = row[:2]
        proto_name = MolItemName.from_str(
            MolItemName.from_str(fp_name).proto_name)
        # SMILES is recorded even when the row has no fingerprint.
        proto_smiles_dict[proto_name] = smiles
        try:
            fp_native = row[2]
        except IndexError:
            logging.warning(
                "%s has no native fingerprint. Will not be added to dict.",
                fp_name)
            continue
        proto_lists_dict.setdefault(proto_name, []).append(
            (fp_native, fp_name))

    if first is not None and first > 0:
        proto_lists_dict = {k: v[:first]
                            for k, v in proto_lists_dict.items()}

    smiles_dict = {}
    mol_lists_dict = {}
    if merge_proto:
        # Sorted so the merge order (and the SMILES that wins) is stable.
        for mol_item_name, native_tuples in sorted(proto_lists_dict.items()):
            mol_name = mol_item_name.mol_name
            mol_lists_dict.setdefault(mol_name, []).extend(native_tuples)
            smiles_dict.setdefault(mol_name, proto_smiles_dict[mol_item_name])
    else:
        for mol_item_name, native_tuples in proto_lists_dict.items():
            proto_name = mol_item_name.proto_name
            mol_lists_dict[proto_name] = native_tuples
            smiles_dict[proto_name] = proto_smiles_dict[mol_item_name]
    return smiles_dict, mol_lists_dict, fp_type
def params_to_molecules(params, smiles_file, conf_dir, out_dir,
                        parallelizer=None):
    """Generate molecules_file based on params dict.

    Parameters
    ----------
    params : dict
        Must contain the keys "radius_multiplier", "bits", "first" and
        "level". The whole dict is also unpacked into
        `fprint_params_to_fptype`.
    smiles_file : str
        File read with `smiles_to_dict`.
    conf_dir : str
        Directory searched (non-recursively) for conformer files. A file is
        used when its basename up to the first '.' is a key of the SMILES
        dict.
    out_dir : str
        Passed to `get_molecules_file` to obtain the output path.
    parallelizer : object, optional
        If given, its `run_gen` method is used to fingerprint the files;
        otherwise they are processed serially.

    Returns
    -------
    tuple
        ``(smiles_dict, mol_lists_dict, fp_type)`` as returned by
        `molecules_to_lists_dicts` for the written molecules file.

    Raises
    ------
    Exception
        If no file in `conf_dir` matches a name in the SMILES file.
    """
    smiles_dict = smiles_to_dict(smiles_file)
    logging.debug("SMILES file has {:d} unique smiles.".format(
        len(smiles_dict)))
    if smiles_dict:
        # Guarded: an empty dict has no example item to show.
        logging.debug("Example SMILES: {!r}".format(
            next(iter(smiles_dict.items()))))

    fprint_params = {"radius_multiplier": params["radius_multiplier"],
                     "stereo": STEREO,
                     "bits": params["bits"],
                     "first": params['first'],
                     "level": params['level']}

    conf_dir_files = glob.glob("{!s}/*".format(conf_dir))
    logging.debug("Found {:d} files in conformer directory.".format(
        len(conf_dir_files)))
    sdf_files = [x for x in conf_dir_files
                 if os.path.basename(x).split('.')[0] in smiles_dict]
    logging.debug("{:d} conformer files match SMILES.".format(len(sdf_files)))
    if not sdf_files:
        raise Exception("Directory {!s} does not contain any usable SDF "
                        "files.".format(conf_dir))

    kwargs = {"save": False, "fprint_params": fprint_params}
    data_iterator = make_data_iterator(sdf_files)
    # NOTE(review): both branches must produce 2-item results that unpack as
    # (fp_native_list, sdf_file) below -- confirm that `run_gen` and a direct
    # call to `native_tuples_from_sdf` agree on that shape.
    if parallelizer is not None:
        results_iter = parallelizer.run_gen(native_tuples_from_sdf,
                                            data_iterator, kwargs=kwargs)
    else:
        results_iter = (native_tuples_from_sdf(*x, **kwargs)
                        for x in data_iterator)

    molecules_file = get_molecules_file(out_dir)
    fp_type = fprint_params_to_fptype(**params)
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for results in results_iter:
            try:
                fp_native_list, sdf_file = results
            except ValueError:
                logging.error("Results of fingerprinting did not look as "
                              "expected: {!r}".format(results))
                # Skip: otherwise the previous molecule's fingerprints (or
                # an unbound name) would be used below.
                continue
            if not fp_native_list:
                logging.warning("No fingerprints were returned for {!r}. "
                                "Skipping.".format(sdf_file))
                continue
            # All tuples in one result share the SMILES of the first name.
            proto_name = MolItemName.from_str(fp_native_list[0][1]).proto_name
            smiles = smiles_dict[proto_name]
            for fp_native, fp_name in fp_native_list:
                writer.writerow((fp_name, smiles, fp_native))

    # Drop the full SMILES dict before loading the written file back.
    del smiles_dict
    filtered_smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
        molecules_file)
    return (filtered_smiles_dict, mol_lists_dict, fp_type)
def native_tuples_to_molecules(molecules_file, native_tuples_lists_iter,
                               smiles_dict, fp_type):
    """Given an iterable of native tuples lists, write to molecules file.

    Parameters
    ----------
    molecules_file : str
        Output file, opened with `smart_open` in "wb" mode.
    native_tuples_lists_iter : iterable of list
        Each item is a list of ``(fp_native, fp_name)`` tuples. All tuples
        of one list are written with the same SMILES, looked up from the
        name of the list's first tuple.
    smiles_dict : dict
        Maps names to SMILES. The `proto_name` of the parsed name is tried
        first, then its `mol_name`; None is written if neither is present.
    fp_type : object
        Object whose ``write(writer)`` method emits the file header.
    """
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for i, native_tuples_list in enumerate(native_tuples_lists_iter):
            if native_tuples_list:
                # The SMILES depends only on the first tuple, so resolve it
                # once per list rather than once per row.
                mol_item_name = MolItemName.from_str(native_tuples_list[0][1])
                smiles = smiles_dict.get(
                    mol_item_name.proto_name,
                    smiles_dict.get(mol_item_name.mol_name))
                for fp_native, fp_name in native_tuples_list:
                    writer.writerow((fp_name, smiles, fp_native))
            # Logged after the rows are written so the message is truthful.
            logging.debug(
                "Wrote native strings for molecule {:d} to molecules file.".
                format(i + 1))
def fprints_dict_from_mol(mol, bits=BITS, level=LEVEL_DEF,
                          radius_multiplier=RADIUS_MULTIPLIER_DEF,
                          first=FIRST_DEF, counts=COUNTS_DEF,
                          stereo=STEREO_DEF,
                          include_disconnected=INCLUDE_DISCONNECTED_DEF,
                          rdkit_invariants=RDKIT_INVARIANTS_DEF,
                          exclude_floating=EXCLUDE_FLOATING_DEF,
                          out_dir_base=None, out_ext=OUT_EXT_DEF, save=False,
                          all_iters=False, overwrite=False):
    """Build a E3FP fingerprint from a mol with at least one conformer.

    Parameters
    ----------
    mol : RDKit Mol
        Input molecule with one or more conformers to be fingerprinted.
        Its "_Name" property is used to name fingerprints and files.
    bits : int, optional
        Set number of bits for final folded fingerprint. If -1 or None,
        the module default `BITS` is used.
    level : int, optional
        Level/maximum number of iterations of E3FP. If -1 or None is
        provided, it runs until termination, and `all_iters` is ignored.
    radius_multiplier : float, optional
        Radius multiplier for spherical shells.
    first : int, optional
        First `N` number of conformers from file to fingerprint. If -1, all
        are fingerprinted.
    counts : bool, optional
        Generate count-based fingerprints instead of bit-based
        fingerprints.
    stereo : bool, optional
        Incorporate stereochemistry in fingerprint.
    include_disconnected : bool, optional
        Include disconnected atoms when hashing and for stereo
        calculations. Turn off purely for testing purposes, to make E3FP
        more like ECFP.
    rdkit_invariants : bool, optional
        Use the atom invariants used by RDKit for its Morgan fingerprint.
    exclude_floating : bool, optional
        Mask atoms with no bonds (usually floating ions) from the
        fingerprint. These are often placed arbitrarily and can confound
        the fingerprint.
    out_dir_base : str, optional
        Basename of out directory to save fingerprints. Iteration number is
        appended ("_complete" when `level` is -1).
    out_ext : str, optional
        Extension on fingerprint pickles, used to determine compression
        level.
    save : bool, optional
        Save fingerprints to directory.
    all_iters : bool, optional
        Save fingerprints from all iterations to file(s).
    overwrite : bool, optional
        Overwrite pre-existing file.

    Returns
    -------
    dict
        Maps each fingerprinted level to a list with one fingerprint per
        processed conformer. An empty dict is returned when all output
        files already exist and `overwrite` is False, when fingerprinting
        fails, or when saving fails.
    """
    name = mol.GetProp("_Name")

    if level is None:
        level = -1

    if bits in (-1, None):
        bits = BITS

    # Only one level is kept when running to completion or when the
    # intermediate iterations were not requested.
    single_level = level == -1 or not all_iters

    if save:
        filenames = []
        if single_level:
            if level == -1:
                dir_name = "{!s}_complete".format(out_dir_base)
            else:
                dir_name = "{!s}{:d}".format(out_dir_base, level)
            touch_dir(dir_name)
            filenames.append(
                os.path.join(dir_name, "{!s}{!s}".format(name, out_ext)))
        else:
            for i in range(level + 1):
                dir_name = "{:s}{:d}".format(out_dir_base, i)
                touch_dir(dir_name)
                filenames.append(
                    os.path.join(dir_name, "{!s}{!s}".format(name, out_ext)))

        all_files_exist = all(os.path.isfile(x) for x in filenames)
        if all_files_exist and not overwrite:
            logging.warning("All fingerprint files for {!s} already exist. "
                            "Skipping.".format(name))
            return {}

    fingerprinter = Fingerprinter(bits=bits, level=level,
                                  radius_multiplier=radius_multiplier,
                                  counts=counts, stereo=stereo,
                                  include_disconnected=include_disconnected,
                                  rdkit_invariants=rdkit_invariants,
                                  exclude_floating=exclude_floating)

    try:
        fprints_dict = {}
        num_confs = 0
        level_range = (level, ) if single_level else range(level + 1)
        logging.info("Generating fingerprints for {!s}.".format(name))
        for j, conf in enumerate(mol.GetConformers()):
            if j == first:
                break
            fingerprinter.run(conf, mol)
            conf_name = MolItemName.from_str(name).to_conf_name(j)
            for i in level_range:
                fprint = fingerprinter.get_fingerprint_at_level(i)
                fprint.name = conf_name
                fprints_dict.setdefault(i, []).append(fprint)
            num_confs += 1
        logging.info("Generated {:d} fingerprints for {!s}.".format(
            num_confs, name))
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        logging.error("Error generating fingerprints for {:s}.".format(name),
                      exc_info=True)
        return {}

    if save:
        if not fprints_dict:
            # No conformers were fingerprinted; there is nothing to write.
            logging.warning("No fingerprints were generated for {!s}. "
                            "Nothing saved.".format(name))
            return fprints_dict

        if single_level:
            to_save = [(filenames[0], fprints_dict[max(fprints_dict)])]
        else:
            to_save = [(filenames[i], fprints)
                       for i, fprints in sorted(fprints_dict.items())]

        # Bound before the loop so the error message can always name a file.
        filename = to_save[0][0]
        try:
            for filename, fprints in to_save:
                fp.savez(filename, *fprints)
            logging.info("Saved fingerprints for {:s}.".format(name))
        except Exception:
            logging.error(
                "Error saving fingerprints for {:s} to {:s}".format(
                    name, filename),
                exc_info=True)
            return {}

    return fprints_dict