def main(sdf_dir, smiles_file, num_mols=10000, first=3,
         out_props_file="random_mols_props.txt",
         out_smiles_file="random_mols.csv.bz2"):
    """Pick up to `num_mols` random molecules that have an SDF file and save
    their SMILES plus simple properties (heavy atoms, rotatable bonds).

    Parameters
    ----------
    sdf_dir : str
        Directory containing per-molecule SDF files.
    smiles_file : str
        SMILES file mapping molecule name -> SMILES.
    num_mols : int, optional
        Number of random molecules to pick.
    first : int, optional
        Unused in this function; kept for interface compatibility.
    out_props_file : str, optional
        Output TSV file of per-molecule properties.
    out_smiles_file : str, optional
        Output SMILES file for the random selection. If it already exists,
        the previous selection is reloaded instead of re-sampling.
    """
    mol_names = set()
    if os.path.isfile(out_smiles_file):
        # Reuse an existing random selection for reproducibility.
        logging.info("Loading existing random molecules.")
        smiles_dict = smiles_to_dict(out_smiles_file)
        mol_names.update(smiles_dict)
        out_sdf_files_dict = {k: get_sdf_file(sdf_dir, k) for k in mol_names}
    else:
        logging.info("Loading SMILES file.")
        smiles_dict = smiles_to_dict(smiles_file)
        out_smiles_dict = {}
        out_sdf_files_dict = {}
        logging.info("Picking random molecules.")
        # Shuffle once and walk the candidates rather than rejection-sampling
        # with random.choice(dict.keys()): that was a TypeError on Python 3
        # (dict views are not indexable) and could loop forever when fewer
        # than `num_mols` molecules have SDF files on disk.
        candidates = list(smiles_dict)
        random.shuffle(candidates)
        for proto_name in candidates:
            if len(mol_names) >= num_mols:
                break
            sdf_file = get_sdf_file(sdf_dir, proto_name)
            if not os.path.isfile(sdf_file):
                # Skip molecules with no conformer file.
                continue
            mol_names.add(proto_name)
            out_smiles_dict[proto_name] = smiles_dict[proto_name]
            out_sdf_files_dict[proto_name] = sdf_file
            if len(mol_names) % 100 == 0:
                logging.info(len(mol_names))
        dict_to_smiles(out_smiles_file, out_smiles_dict)
    mol_names = sorted(mol_names)

    logging.info("Computing mol properties.")
    mol_props = {}
    # Only compute properties for the selected molecules (the original
    # iterated over every SMILES even though only the selection is written).
    for name in mol_names:
        mol = mol_from_smiles(smiles_dict[name], name)
        nheavy = mol.GetNumHeavyAtoms()
        nrot = AllChem.CalcNumRotatableBonds(mol)
        mol_props[name] = (nheavy, nrot)

    with open(out_props_file, "w") as f:
        f.write("mol_name\tnheavy\tnrot\n")
        for mol_name in mol_names:
            nheavy, nrot = mol_props[mol_name]
            f.write("{}\t{:d}\t{:d}\n".format(mol_name, nheavy, nrot))
def main(smiles_file, params_file, sdf_dir=None, out_file="molecules.csv.bz2",
         log=None, num_proc=None, parallel_mode=None, verbose=False):
    """Fingerprint molecules.

    Parameters
    ----------
    smiles_file : str
        SMILES file mapping molecule name -> SMILES.
    params_file : str
        File of conformer-generation and fingerprinting parameters.
    sdf_dir : str, optional
        Directory of pre-generated conformer SDF files. If None, conformers
        are generated from SMILES on the fly.
    out_file : str, optional
        Output SEA molecules file.
    log : str, optional
        Log file passed to `setup_logging`.
    num_proc, parallel_mode : optional
        NOTE(review): accepted but never forwarded to `Parallelizer`, which
        is hard-coded to "processes" -- confirm whether they should be used.
    verbose : bool, optional
        Verbose logging flag.
    """
    setup_logging(log, verbose=verbose)
    parallelizer = Parallelizer(parallel_mode="processes")

    # Set conformer generation and fingerprinting parameters.
    confgen_params, fprint_params = params_to_dicts(params_file)
    kwargs = {"save": False, "fprint_params": fprint_params}

    smiles_dict = smiles_to_dict(smiles_file)
    # Count distinct parent molecules; names look like "<mol>-<suffix>".
    # (Computed once here; the original recomputed it in the else branch.)
    mol_num = len({x.split('-')[0] for x in smiles_dict})
    if sdf_dir is not None:
        # Use pre-generated conformers from SDF files.
        sdf_files = glob.glob(os.path.join(sdf_dir, "*.sdf*"))
        sdf_files = sorted(
            x for x in sdf_files if name_from_sdf_filename(x) in smiles_dict)
        data_iter = make_data_iterator(sdf_files)
        fp_method = native_tuples_from_sdf
        logging.info("Using SDF files from {}".format(sdf_dir))
    else:
        # Generate conformers on the fly from SMILES.
        kwargs["confgen_params"] = confgen_params
        # Py3 fix: dict.iteritems() no longer exists; use items().
        data_iter = ((smiles, name)
                     for name, smiles in smiles_dict.items())
        fp_method = native_tuples_from_smiles
        logging.info("Will generate conformers.")
        logging.info(
            "Conformer generation params: {!r}.".format(confgen_params))
    logging.info("Fingerprinting params: {!r}.".format(fprint_params))

    # Fingerprint in parallel.
    logging.info("Fingerprinting {:d} molecules".format(mol_num))
    mol_list_dict = {}
    for result, data in parallelizer.run_gen(fp_method, data_iter,
                                             kwargs=kwargs):
        if not result:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        try:
            # Fingerprint names look like "<mol>_<conf>"; keep the mol part.
            _, name = result[0]
            name = name.split('_')[0]
        except IndexError:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        mol_list_dict[name] = result
    logging.info("Finished fingerprinting molecules")

    # Save to SEA molecules file.
    logging.info("Saving fingerprints to {}".format(out_file))
    fp_type = fprint_params_to_fptype(**fprint_params)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
    logging.info("Finished!")
def run(smiles_file, bits=1024, radius=2, use_chiral=False,
        out_file="molecules.csv.bz2", log=None):
    """Generate 2D fingerprints for every molecule in a SMILES file and
    save them to a SEA molecules file.

    Parameters
    ----------
    smiles_file : str
        SMILES file mapping molecule name -> SMILES.
    bits : int, optional
        Fingerprint length in bits.
    radius : int, optional
        Fingerprint radius.
    use_chiral : bool, optional
        Whether to encode chirality in the fingerprint.
    out_file : str, optional
        Output SEA molecules file.
    log : str, optional
        Log file passed to `setup_logging`.
    """
    setup_logging(log)
    smiles_dict = smiles_to_dict(smiles_file)
    mol_list_dict = {}
    # Py3 fix: dict.iteritems() no longer exists; use items().
    for name, smiles in smiles_dict.items():
        try:
            mol = mol_from_smiles(smiles, name)
            logging.info("Generating fingerprint for {}".format(name))
            fp = fprint2d_from_mol(mol, bits=bits, radius=radius,
                                   use_chiral=use_chiral)
            logging.info("Generated fingerprint for {}".format(name))
            mol_list_dict.setdefault(name, []).append(
                fprint_to_native_tuple(fp))
        except Exception:
            # Best-effort: skip molecules that fail but keep processing.
            logging.warning("Fingerprinting {} failed.".format(name))
    fp_type = get_fprint2d_fptype(bits=bits, radius=radius,
                                  use_chiral=use_chiral)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
def params_to_molecules(params, smiles_file, conf_dir, out_dir,
                        parallelizer=None):
    """Generate molecules_file based on params dict.

    Fingerprints every molecule that has a conformer SDF file in `conf_dir`,
    writes the results to the molecules file under `out_dir`, and returns the
    re-loaded (filtered smiles dict, mol lists dict, fingerprint type).

    Raises
    ------
    Exception
        If `conf_dir` contains no SDF file matching the SMILES dict.
    """
    smiles_dict = smiles_to_dict(smiles_file)
    logging.debug("SMILES file has {:d} unique smiles.".format(
        len(smiles_dict)))
    if smiles_dict:
        # Py3 fix: dict.items() is a view and not indexable; peek instead.
        logging.debug("Example SMILES: {!r}".format(
            next(iter(smiles_dict.items()))))

    fprint_params = {"radius_multiplier": params["radius_multiplier"],
                     "stereo": STEREO, "bits": params["bits"],
                     "first": params['first'], "level": params['level']}

    conf_dir_files = glob.glob("{!s}/*".format(conf_dir))
    logging.debug("Found {:d} files in conformer directory.".format(
        len(conf_dir_files)))
    sdf_files = [x for x in conf_dir_files
                 if os.path.basename(x).split('.')[0] in smiles_dict]
    logging.debug("{:d} conformer files match SMILES.".format(len(sdf_files)))
    if len(sdf_files) == 0:
        raise Exception("Directory {!s} does not contain any usable SDF "
                        "files.".format(conf_dir))

    kwargs = {"save": False, "fprint_params": fprint_params}
    data_iterator = make_data_iterator(sdf_files)
    if parallelizer is not None:
        results_iter = parallelizer.run_gen(native_tuples_from_sdf,
                                            data_iterator, kwargs=kwargs)
    else:
        results_iter = (native_tuples_from_sdf(*x, **kwargs)
                        for x in data_iterator)

    molecules_file = get_molecules_file(out_dir)
    fp_type = fprint_params_to_fptype(**params)
    # NOTE(review): csv.writer expects a text-mode stream on Python 3;
    # confirm smart_open(..., "wb") yields one here.
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for results in results_iter:
            try:
                fp_native_list, sdf_file = results
            except ValueError:
                logging.error("Results of fingerprinting did not look as "
                              "expected: {!r}".format(results))
                # Bug fix: the original fell through here and reused a stale
                # (or unbound) fp_native_list from the previous iteration.
                continue
            proto_name = MolItemName.from_str(fp_native_list[0][1]).proto_name
            smiles = smiles_dict[proto_name]
            for fp_native, fp_name in fp_native_list:
                writer.writerow((fp_name, smiles, fp_native))
    del smiles_dict

    filtered_smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
        molecules_file)
    return (filtered_smiles_dict, mol_lists_dict, fp_type)
def sample_smiles_file(smiles_file, n, sample_smiles_file=None):
    """Save a smiles file with a random `n` smiles.

    Parameters
    ----------
    smiles_file : str
        SMILES file mapping molecule name -> SMILES.
    n : int
        Number of molecules to sample (without replacement).
    sample_smiles_file : str, optional
        Output path; defaults to "<basename>_sample<n><ext>".

    Returns
    -------
    str
        Path to the sampled file, or `smiles_file` unchanged when it holds
        `n` or fewer molecules.
    """
    if sample_smiles_file is None:
        basename, ext = os.path.splitext(smiles_file)
        sample_smiles_file = "{!s}_sample{:d}{!s}".format(basename, n, ext)
    smiles_dict = smiles_to_dict(smiles_file)
    mol_num = len(smiles_dict)
    if n >= mol_num:
        # Nothing to sample; the whole file already fits the request.
        return smiles_file
    # Py3 fix: np.random.choice needs a sequence, not a dict_keys view.
    rand_mol_names = np.random.choice(list(smiles_dict), n, replace=False)
    rand_smiles_dict = {x: smiles_dict[x] for x in rand_mol_names}
    dict_to_smiles(sample_smiles_file, rand_smiles_dict)
    return sample_smiles_file
def main(smiles_file, sdf_dir, out_file):
    """Benchmark fingerprinting of every molecule in `smiles_file`, writing
    one tab-separated row of timings and molecule stats per molecule."""
    _, fprint_params = params_to_dicts(load_params())
    smiles_dict = smiles_to_dict(smiles_file)
    parallelizer = Parallelizer()
    # NOTE(review): argument order get_sdf_file(name, sdf_dir) differs from
    # other call sites that use get_sdf_file(sdf_dir, name) -- confirm.
    job_iter = ((smiles, get_sdf_file(name, sdf_dir), name)
                for name, smiles in smiles_dict.items())
    results_iter = parallelizer.run_gen(
        benchmark_fprinting, job_iter,
        kwargs={"fprint_params": fprint_params})
    header = "\t".join([
        "Name", "ECFP4 Time", "E3FP Time", "Num Heavy", "Num Confs",
        "Num Rot"
    ]) + "\n"
    row_fmt = "{}\t{:.4g}\t{:.4g}\t{:d}\t{:d}\t{:d}\n"
    with open(out_file, "w") as out:
        out.write(header)
        for results, (_, _, name) in results_iter:
            print(results)
            out.write(row_fmt.format(name, *results))
    # Tail of a map-building function whose definition starts above this
    # chunk: for each cross pair, record which molecules map to which.
    for mol1, mol2 in product(mol_set1, mol_set2):
        mol_map1.setdefault(mol1, set()).add(mol2)
        mol_map2.setdefault(mol2, set()).add(mol1)
    # Freeze the accumulated sets into sorted lists for deterministic order.
    mol_map1 = {k: sorted(v) for k, v in mol_map1.items()}
    mol_map2 = {k: sorted(v) for k, v in mol_map2.items()}
    return mol_map1, mol_map2


if __name__ == "__main__":
    # Read input files
    annot_df = read_pdb_annotation(PDB_ANNOT_FILE)
    # Molecule names are built as "<HET code>-<PDB id>".
    annot_df['mol_name'] = ["{}-{}".format(row['HET_CODE'], row['PDB_ID'])
                            for i, row in annot_df.iterrows()]
    annot_df.set_index(['Uniprot_ID', 'mol_name'], inplace=True)
    chembl_targets_dict = targets_to_dict(CHEMBL_TARGETS_FILE,
                                          affinity=10000)
    chembl_smiles_dict = smiles_to_dict(CHEMBL_SMILES_FILE)

    # Add PDB data to useful maps
    pdb_pdb_to_mol = {}    # Uniprot id -> {(mol_name, smiles), ...}
    pdb_pdb_to_name = {}   # Uniprot id -> (Uniprot AC, Uniprot name)
    pdb_smiles_dict = {}   # mol_name -> SMILES
    for (uid, mol_name), row in annot_df.iterrows():
        smiles = row['SMILES']
        pdb_smiles_dict[mol_name] = smiles
        pdb_pdb_to_mol.setdefault(uid, set()).add((mol_name, smiles))
        pdb_pdb_to_name[uid] = (row['Uniprot_AC'], row['Uniprot_Name'])
    # Drop targets with fewer than MIN_MOLS molecules.
    # NOTE(review): deleting from pdb_pdb_to_mol while iterating its
    # .items() view raises RuntimeError on Python 3; iterate over
    # list(pdb_pdb_to_mol.items()) instead. Flagged rather than changed
    # because this chunk appears truncated.
    for uid, mol_set in pdb_pdb_to_mol.items():
        if len(mol_set) < MIN_MOLS:
            del pdb_pdb_to_mol[uid]
            del pdb_pdb_to_name[uid]