Example #1
0
def main(sdf_dir,
         smiles_file,
         num_mols=10000,
         first=3,
         out_props_file="random_mols_props.txt",
         out_smiles_file="random_mols.csv.bz2"):
    """Pick random molecules that have SDF files and tabulate properties.

    If `out_smiles_file` already exists, the molecules in it are reused.
    Otherwise, molecules are drawn at random without replacement from
    `smiles_file`, keeping only those for which
    ``get_sdf_file(sdf_dir, name)`` is an existing file, until `num_mols`
    have been collected or the candidates are exhausted. The selection is
    then saved to `out_smiles_file`.

    The heavy atom count and rotatable bond count of each selected molecule
    are written to `out_props_file` as a tab-delimited table sorted by
    molecule name.

    Parameters
    ----------
    sdf_dir : str
        Directory passed to `get_sdf_file` to locate SDF files.
    smiles_file : str
        SMILES file from which molecules are drawn.
    num_mols : int, optional
        Maximum number of molecules to pick.
    first : int, optional
        Unused; retained so existing callers keep working.
    out_props_file : str, optional
        Destination of the properties table.
    out_smiles_file : str, optional
        SMILES file of the selected molecules (read if present, else written).
    """
    if os.path.isfile(out_smiles_file):
        logging.info("Loading existing random molecules.")
        out_smiles_dict = smiles_to_dict(out_smiles_file)
    else:
        logging.info("Loading SMILES file.")
        smiles_dict = smiles_to_dict(smiles_file)

        # Shuffling once and walking the list is equivalent to repeatedly
        # drawing without replacement, but it terminates when candidates run
        # out instead of looping forever.
        candidates = list(smiles_dict)
        random.shuffle(candidates)

        out_smiles_dict = {}
        logging.info("Picking random molecules.")
        for proto_name in candidates:
            if len(out_smiles_dict) >= num_mols:
                break
            sdf_file = get_sdf_file(sdf_dir, proto_name)
            if not os.path.isfile(sdf_file):
                continue
            out_smiles_dict[proto_name] = smiles_dict[proto_name]

            if len(out_smiles_dict) % 100 == 0:
                logging.info(len(out_smiles_dict))

        if len(out_smiles_dict) < num_mols:
            logging.warning(
                "Only {:d} of {:d} requested molecules have SDF files.".format(
                    len(out_smiles_dict), num_mols))

        dict_to_smiles(out_smiles_file, out_smiles_dict)

    mol_names = sorted(out_smiles_dict)

    logging.info("Computing mol properties.")
    # Only the selected molecules are written, so only they are processed.
    # All properties are computed before the output file is opened so that a
    # failure does not leave a partially written table behind.
    mol_props = {}
    for name in mol_names:
        mol = mol_from_smiles(out_smiles_dict[name], name)
        nheavy = mol.GetNumHeavyAtoms()
        nrot = AllChem.CalcNumRotatableBonds(mol)
        mol_props[name] = (nheavy, nrot)

    with open(out_props_file, "w") as f:
        f.write("mol_name\tnheavy\tnrot\n")
        for mol_name in mol_names:
            nheavy, nrot = mol_props[mol_name]
            f.write("{}\t{:d}\t{:d}\n".format(mol_name, nheavy, nrot))
Example #2
0
def main(smiles_file,
         params_file,
         sdf_dir=None,
         out_file="molecules.csv.bz2",
         log=None,
         num_proc=None,
         parallel_mode=None,
         verbose=False):
    """Fingerprint molecules and save them to a molecules file.

    Parameters
    ----------
    smiles_file : str
        SMILES file naming the molecules to fingerprint.
    params_file : str
        Parameters file, read with `params_to_dicts`, providing conformer
        generation and fingerprinting parameters.
    sdf_dir : str, optional
        If given, fingerprints are computed from the ``*.sdf*`` files in this
        directory whose names match entries of `smiles_file`. Otherwise
        conformers are generated from the SMILES.
    out_file : str, optional
        Destination molecules file.
    log : str, optional
        Passed to `setup_logging`.
    num_proc : int, optional
        Forwarded to `Parallelizer` when not None.
    parallel_mode : str, optional
        Forwarded to `Parallelizer`; "processes" is used when None.
    verbose : bool, optional
        Passed to `setup_logging`.
    """
    setup_logging(log, verbose=verbose)

    # Honor the caller's parallelization settings. With the defaults this is
    # the same `Parallelizer(parallel_mode="processes")` call as before.
    para_kwargs = {
        "parallel_mode": "processes" if parallel_mode is None
        else parallel_mode
    }
    if num_proc is not None:
        para_kwargs["num_proc"] = num_proc
    parallelizer = Parallelizer(**para_kwargs)

    # set conformer generation and fingerprinting parameters
    confgen_params, fprint_params = params_to_dicts(params_file)
    kwargs = {"save": False, "fprint_params": fprint_params}

    smiles_dict = smiles_to_dict(smiles_file)
    # Names sharing the text before the first '-' are counted once.
    mol_num = len({x.split('-')[0] for x in smiles_dict})

    if sdf_dir is not None:
        sdf_files = glob.glob(os.path.join(sdf_dir, "*.sdf*"))
        sdf_files = sorted(
            [x for x in sdf_files if name_from_sdf_filename(x) in smiles_dict])
        data_iter = make_data_iterator(sdf_files)
        fp_method = native_tuples_from_sdf
        logging.info("Using SDF files from {}".format(sdf_dir))
    else:
        kwargs["confgen_params"] = confgen_params
        data_iter = ((smiles, name)
                     for name, smiles in smiles_dict.items())
        fp_method = native_tuples_from_smiles
        logging.info("Will generate conformers.")
        logging.info(
            "Conformer generation params: {!r}.".format(confgen_params))
    logging.info("Fingerprinting params: {!r}.".format(fprint_params))

    # fingerprint in parallel
    logging.info("Fingerprinting {:d} molecules".format(mol_num))
    mol_list_dict = {}
    for result, data in parallelizer.run_gen(fp_method,
                                             data_iter,
                                             kwargs=kwargs):
        if not result:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        try:
            _, name = result[0]
            # Keep only the part of the fingerprint name before the first '_'.
            name = name.split('_')[0]
        except IndexError:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        mol_list_dict[name] = result
    logging.info("Finished fingerprinting molecules")

    # save to SEA molecules file
    logging.info("Saving fingerprints to {}".format(out_file))
    fp_type = fprint_params_to_fptype(**fprint_params)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
    logging.info("Finished!")
Example #3
0
def run(smiles_file,
        bits=1024,
        radius=2,
        use_chiral=False,
        out_file="molecules.csv.bz2",
        log=None):
    """Fingerprint every molecule in a SMILES file with `fprint2d_from_mol`.

    Molecules whose fingerprinting raises are logged and left out of the
    fingerprint lists; the remaining fingerprints are saved to `out_file`
    with `lists_dicts_to_molecules`.

    Parameters
    ----------
    smiles_file : str
        SMILES file naming the molecules to fingerprint.
    bits : int, optional
        Passed to `fprint2d_from_mol` and `get_fprint2d_fptype`.
    radius : int, optional
        Passed to `fprint2d_from_mol` and `get_fprint2d_fptype`.
    use_chiral : bool, optional
        Passed to `fprint2d_from_mol` and `get_fprint2d_fptype`.
    out_file : str, optional
        Destination molecules file.
    log : str, optional
        Passed to `setup_logging`.
    """
    setup_logging(log)

    smiles_dict = smiles_to_dict(smiles_file)
    mol_list_dict = {}
    for name, smiles in smiles_dict.items():
        try:
            mol = mol_from_smiles(smiles, name)
            logging.info("Generating fingerprint for {}".format(name))
            fp = fprint2d_from_mol(mol,
                                   bits=bits,
                                   radius=radius,
                                   use_chiral=use_chiral)
            logging.info("Generated fingerprint for {}".format(name))
            # Convert before touching the dict so that a failure here does
            # not leave an empty fingerprint list behind for this molecule.
            native_tuple = fprint_to_native_tuple(fp)
        except Exception:
            # Best effort: skip this molecule, but keep the traceback so the
            # cause of the failure is not lost.
            logging.warning("Fingerprinting {} failed.".format(name),
                            exc_info=True)
            continue
        mol_list_dict[name] = [native_tuple]

    fp_type = get_fprint2d_fptype(bits=bits,
                                  radius=radius,
                                  use_chiral=use_chiral)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
Example #4
0
def params_to_molecules(params, smiles_file, conf_dir, out_dir,
                        parallelizer=None):
    """Generate molecules_file based on params dict.

    Fingerprints the conformer files in `conf_dir` whose base names (text
    before the first '.') appear in `smiles_file`, writes them to the
    molecules file returned by ``get_molecules_file(out_dir)``, and reads
    that file back.

    Parameters
    ----------
    params : dict
        Must provide "radius_multiplier", "bits", "first" and "level"; also
        expanded as keyword arguments to `fprint_params_to_fptype`.
    smiles_file : str
        SMILES file naming the molecules to fingerprint.
    conf_dir : str
        Directory containing conformer files.
    out_dir : str
        Directory passed to `get_molecules_file`.
    parallelizer : optional
        Object whose `run_gen` method is used to fingerprint in parallel.
        If None, fingerprinting runs serially.

    Returns
    -------
    tuple
        ``(smiles_dict, mol_lists_dict, fp_type)`` as returned by
        `molecules_to_lists_dicts` for the written molecules file.

    Raises
    ------
    Exception
        If no file in `conf_dir` matches a molecule in `smiles_file`.
    """
    smiles_dict = smiles_to_dict(smiles_file)
    logging.debug("SMILES file has {:d} unique smiles.".format(
        len(smiles_dict)))
    # `next(iter(...), None)` works for dict views and for an empty dict.
    logging.debug("Example SMILES: {!r}".format(
        next(iter(smiles_dict.items()), None)))
    fprint_params = {"radius_multiplier": params["radius_multiplier"],
                     "stereo": STEREO, "bits": params["bits"],
                     "first": params['first'], "level": params['level']}

    conf_dir_files = glob.glob("{!s}/*".format(conf_dir))
    logging.debug("Found {:d} files in conformer directory.".format(
        len(conf_dir_files)))
    sdf_files = [x for x in conf_dir_files
                 if os.path.basename(x).split('.')[0] in smiles_dict]
    logging.debug("{:d} conformer files match SMILES.".format(len(sdf_files)))

    if not sdf_files:
        raise Exception("Directory {!s} does not contain any usable SDF "
                        "files.".format(conf_dir))

    kwargs = {"save": False, "fprint_params": fprint_params}

    data_iterator = make_data_iterator(sdf_files)
    if parallelizer is not None:
        results_iter = parallelizer.run_gen(native_tuples_from_sdf,
                                            data_iterator, kwargs=kwargs)
    else:
        # The loop below unpacks (result, data) pairs, the shape `run_gen`
        # yields, so the serial path has to produce the same pairs.
        # NOTE(review): confirm `run_gen` yields (result, data) pairs here.
        results_iter = ((native_tuples_from_sdf(*x, **kwargs), x)
                        for x in data_iterator)

    molecules_file = get_molecules_file(out_dir)
    fp_type = fprint_params_to_fptype(**params)
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for results in results_iter:
            try:
                fp_native_list, sdf_file = results
            except ValueError:
                logging.error("Results of fingerprinting did not look as "
                              "expected: {!r}".format(results))
                # Nothing usable was unpacked; do not fall through and reuse
                # the previous iteration's (or unbound) values.
                continue
            if not fp_native_list:
                logging.warning(
                    "Fingerprinting failed for {!r}.".format(sdf_file))
                continue
            proto_name = MolItemName.from_str(fp_native_list[0][1]).proto_name
            smiles = smiles_dict[proto_name]
            for fp_native, fp_name in fp_native_list:
                writer.writerow((fp_name, smiles, fp_native))

    # Drop the full SMILES dict before loading the written file back.
    del smiles_dict
    filtered_smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
        molecules_file)
    return (filtered_smiles_dict, mol_lists_dict, fp_type)
Example #5
0
def sample_smiles_file(smiles_file, n, sample_smiles_file=None):
    """Save a smiles file with a random `n` smiles.

    Parameters
    ----------
    smiles_file : str
        SMILES file to sample from.
    n : int
        Number of molecules to sample, without replacement.
    sample_smiles_file : str, optional
        Destination file. Defaults to `smiles_file` with ``_sample<n>``
        inserted before its extension.

    Returns
    -------
    str
        Path of the sampled SMILES file, or `smiles_file` itself (nothing is
        written) when it holds no more than `n` molecules.
    """
    if sample_smiles_file is None:
        basename, ext = os.path.splitext(smiles_file)
        sample_smiles_file = "{!s}_sample{:d}{!s}".format(basename, n, ext)
    smiles_dict = smiles_to_dict(smiles_file)

    if n >= len(smiles_dict):
        return smiles_file

    # Sample positions rather than the keys themselves: this works with dict
    # views and keeps the original key objects instead of NumPy scalars.
    mol_names = list(smiles_dict)
    rand_indices = np.random.choice(len(mol_names), n, replace=False)
    rand_smiles_dict = {mol_names[i]: smiles_dict[mol_names[i]]
                        for i in rand_indices}

    dict_to_smiles(sample_smiles_file, rand_smiles_dict)

    return sample_smiles_file
def main(smiles_file, sdf_dir, out_file):
    """Run `benchmark_fprinting` on each molecule and tabulate the results.

    Parameters
    ----------
    smiles_file : str
        SMILES file naming the molecules to benchmark.
    sdf_dir : str
        Directory passed to `get_sdf_file` to locate each molecule's SDF file.
    out_file : str
        Destination of the tab-delimited results table. Molecules whose
        benchmark returned nothing are skipped.
    """
    import logging  # local import: used only to report skipped molecules

    _, fprint_params = params_to_dicts(load_params())
    smiles_dict = smiles_to_dict(smiles_file)

    para = Parallelizer()
    # NOTE(review): `get_sdf_file` is called as (name, sdf_dir) here; confirm
    # this matches the argument order of its definition.
    smiles_iter = ((smiles, get_sdf_file(name, sdf_dir), name)
                   for name, smiles in smiles_dict.items())
    kwargs = {"fprint_params": fprint_params}
    results_iter = para.run_gen(benchmark_fprinting,
                                smiles_iter,
                                kwargs=kwargs)

    header = ["Name", "ECFP4 Time", "E3FP Time", "Num Heavy", "Num Confs",
              "Num Rot"]
    with open(out_file, "w") as f:
        f.write("\t".join(header) + "\n")
        for results, (_, _, name) in results_iter:
            print(results)
            if not results:
                # A falsy result cannot be unpacked into the row below; skip
                # it rather than abort with the table half written.
                logging.warning("Benchmarking failed for {}.".format(name))
                continue
            f.write("{}\t{:.4g}\t{:.4g}\t{:d}\t{:d}\t{:d}\n".format(
                name, *results))
Example #7
0
        for mol1, mol2 in product(mol_set1, mol_set2):
            mol_map1.setdefault(mol1, set()).add(mol2)
            mol_map2.setdefault(mol2, set()).add(mol1)
    mol_map1 = {k: sorted(v) for k, v in mol_map1.items()}
    mol_map2 = {k: sorted(v) for k, v in mol_map2.items()}
    return mol_map1, mol_map2

if __name__ == "__main__":
    # Read input files
    annot_df = read_pdb_annotation(PDB_ANNOT_FILE)
    annot_df['mol_name'] = ["{}-{}".format(row['HET_CODE'], row['PDB_ID'])
                            for i, row in annot_df.iterrows()]
    annot_df.set_index(['Uniprot_ID', 'mol_name'], inplace=True)

    chembl_targets_dict = targets_to_dict(CHEMBL_TARGETS_FILE, affinity=10000)
    chembl_smiles_dict = smiles_to_dict(CHEMBL_SMILES_FILE)

    # Add PDB data to useful maps
    pdb_pdb_to_mol = {}
    pdb_pdb_to_name = {}
    pdb_smiles_dict = {}
    for (uid, mol_name), row in annot_df.iterrows():
        smiles = row['SMILES']
        pdb_smiles_dict[mol_name] = smiles
        pdb_pdb_to_mol.setdefault(uid, set()).add((mol_name, smiles))
        pdb_pdb_to_name[uid] = (row['Uniprot_AC'], row['Uniprot_Name'])
    for uid, mol_set in pdb_pdb_to_mol.items():
        if len(mol_set) < MIN_MOLS:
            del pdb_pdb_to_mol[uid]
            del pdb_pdb_to_name[uid]