Exemple #1
0
def mol_lists_targets_to_targets(mol_lists_targets_dict, merge_proto=True):
    """Convert targets listing mol items to targets listing only names.

    Parameters
    ----------
    mol_lists_targets_dict : dict
        Maps target keys to set values exposing `name`, `cids` and
        `description`. Each entry of `cids` must be parseable by
        `MolItemName.from_str`.
    merge_proto : bool, optional
        If True, each cid is reduced to its `mol_name`; otherwise to its
        `proto_name`.

    Returns
    -------
    dict
        Maps each target key to a new `SetValue` built from the original
        name and description and the de-duplicated names, kept in the order
        in which they were first seen.
    """
    name_attr = "mol_name" if merge_proto else "proto_name"
    targets_dict = {}
    # `.items()` (not `.iteritems()`) works on both Python 2 and Python 3.
    for target_key, mol_lists_set_value in mol_lists_targets_dict.items():
        # An OrderedDict with dummy values serves as an ordered set.
        cids_dict = OrderedDict(
            (getattr(MolItemName.from_str(x), name_attr), None)
            for x in mol_lists_set_value.cids)
        # Materialize the keys so that a real list (not a dict view on
        # Python 3) is stored in the resulting set value.
        targets_dict[target_key] = SetValue(mol_lists_set_value.name,
                                            list(cids_dict.keys()),
                                            mol_lists_set_value.description)
    return targets_dict
Exemple #2
0
def molecules_to_lists_dicts(molecules_file, first=-1, merge_proto=True):
    """Read molecules file to dicts of SMILES and of native tuple lists.

    Parameters
    ----------
    molecules_file : str
        Molecules file, read through `read_csv_mols(..., has_fp=True)`.
    first : int, optional
        If positive, keep only the first `first` native tuples collected for
        each proto name. Otherwise keep all of them.
    merge_proto : bool, optional
        If True, results are keyed by `mol_name` and the native tuples of all
        proto names sharing a `mol_name` are concatenated in sorted key
        order. If False, results are keyed by `proto_name`.

    Returns
    -------
    smiles_dict : dict
        Maps each name to a SMILES string. When merging, the SMILES of the
        first proto name in sorted order is kept.
    mol_lists_dict : dict
        Maps each name to a list of `(fp_native, fp_name)` tuples.
    fp_type
        The first item yielded by `read_csv_mols`.

    Notes
    -----
    Rows lacking a fingerprint column are logged and skipped. A name whose
    rows all lack fingerprints appears in neither returned dict.
    """
    proto_smiles_dict = {}
    proto_lists_dict = {}
    mol_iter = read_csv_mols(molecules_file, has_fp=True)
    # The reader yields the fingerprint type first, then one item per row.
    # The builtin `next` works on both Python 2 and Python 3 iterators.
    fp_type = next(mol_iter)
    for row in mol_iter:
        if row is None:
            continue
        fp_name, smiles = row[:2]

        # Key on a MolItemName re-parsed from the proto name, so that rows
        # with the same proto name are expected to share one key.
        proto_name = MolItemName.from_str(
            MolItemName.from_str(fp_name).proto_name)
        proto_smiles_dict[proto_name] = smiles
        try:
            fp_native = row[2]
        except IndexError:
            logging.warning(
                "%s has no native fingerprint. Will not be added to dict.",
                fp_name)
            continue
        proto_lists_dict.setdefault(proto_name, []).append(
            (fp_native, fp_name))

    if first > 0:
        proto_lists_dict = {k: v[:first]
                            for k, v in proto_lists_dict.items()}

    smiles_dict = {}
    mol_lists_dict = {}
    if merge_proto:
        # Sorting makes both the concatenation order and the choice of the
        # representative SMILES deterministic.
        for mol_item_name, native_tuples in sorted(proto_lists_dict.items()):
            mol_name = mol_item_name.mol_name
            mol_lists_dict.setdefault(mol_name, []).extend(native_tuples)
            smiles_dict.setdefault(mol_name, proto_smiles_dict[mol_item_name])
    else:
        for mol_item_name, native_tuples in proto_lists_dict.items():
            proto_name = mol_item_name.proto_name
            mol_lists_dict[proto_name] = native_tuples
            smiles_dict[proto_name] = proto_smiles_dict[mol_item_name]

    return smiles_dict, mol_lists_dict, fp_type
Exemple #3
0
def params_to_molecules(params, smiles_file, conf_dir, out_dir,
                        parallelizer=None):
    """Generate molecules_file based on params dict.

    Conformer files in `conf_dir` whose basename (up to the first '.') is a
    key of the SMILES dict are fingerprinted, the fingerprints are written
    to the molecules file for `out_dir`, and that file is read back.

    Parameters
    ----------
    params : dict
        Must contain "radius_multiplier", "bits", "first" and "level". It is
        also passed as keyword arguments to `fprint_params_to_fptype`.
    smiles_file : str
        File read by `smiles_to_dict`.
    conf_dir : str
        Directory searched (non-recursively) for conformer files.
    out_dir : str
        Passed to `get_molecules_file` to locate the output file.
    parallelizer : optional
        If given, its `run_gen` method is used to fingerprint the files.
        Otherwise files are fingerprinted serially.

    Returns
    -------
    tuple
        `(smiles_dict, mol_lists_dict, fp_type)` as returned by
        `molecules_to_lists_dicts` for the written molecules file.

    Raises
    ------
    Exception
        If no file in `conf_dir` matches an entry of the SMILES file.
    """
    smiles_dict = smiles_to_dict(smiles_file)
    logging.debug("SMILES file has {:d} unique smiles.".format(
        len(smiles_dict)))
    # Guard the example lookup: an empty SMILES dict must fall through to
    # the informative exception below rather than fail inside a debug log.
    if smiles_dict:
        logging.debug("Example SMILES: {!r}".format(
            next(iter(smiles_dict.items()))))
    fprint_params = {"radius_multiplier": params["radius_multiplier"],
                     "stereo": STEREO, "bits": params["bits"],
                     "first": params['first'], "level": params['level']}

    conf_dir_files = glob.glob("{!s}/*".format(conf_dir))
    logging.debug("Found {:d} files in conformer directory.".format(
        len(conf_dir_files)))
    sdf_files = [x for x in conf_dir_files
                 if os.path.basename(x).split('.')[0] in smiles_dict]
    logging.debug("{:d} conformer files match SMILES.".format(len(sdf_files)))

    if not sdf_files:
        raise Exception("Directory {!s} does not contain any usable SDF "
                        "files.".format(conf_dir))

    kwargs = {"save": False, "fprint_params": fprint_params}

    data_iterator = make_data_iterator(sdf_files)
    # NOTE(review): the serial branch yields the bare return value of
    # `native_tuples_from_sdf`, while the loop below unpacks each result as
    # a pair -- confirm both branches yield the same shape.
    if parallelizer is not None:
        results_iter = parallelizer.run_gen(native_tuples_from_sdf,
                                            data_iterator, kwargs=kwargs)
    else:
        results_iter = (native_tuples_from_sdf(*x, **kwargs)
                        for x in data_iterator)

    molecules_file = get_molecules_file(out_dir)
    fp_type = fprint_params_to_fptype(**params)
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for results in results_iter:
            try:
                fp_native_list, sdf_file = results
            except (TypeError, ValueError):
                logging.error("Results of fingerprinting did not look as "
                              "expected: {!r}".format(results))
                # Skip this result; otherwise the code below would use an
                # unbound name or silently re-write the previous result.
                continue
            if not fp_native_list:
                logging.warning("No fingerprints were generated from {!s}. "
                                "Skipping.".format(sdf_file))
                continue
            # All tuples of one result share the SMILES of their proto name.
            proto_name = MolItemName.from_str(fp_native_list[0][1]).proto_name
            smiles = smiles_dict[proto_name]
            for fp_native, fp_name in fp_native_list:
                writer.writerow((fp_name, smiles, fp_native))

    # Drop the full SMILES dict before the molecules file is read back; only
    # the entries that made it into the file are returned.
    del smiles_dict
    return molecules_to_lists_dicts(molecules_file)
Exemple #4
0
def native_tuples_to_molecules(molecules_file, native_tuples_lists_iter,
                               smiles_dict, fp_type):
    """Given an iterable of native tuples lists, write to molecules file.

    Parameters
    ----------
    molecules_file : str
        Output file, opened with `smart_open` in "wb" mode.
    native_tuples_lists_iter : iterable of list
        Each list holds `(fp_native, fp_name)` tuples. The name of the first
        tuple determines the SMILES written for the whole list.
    smiles_dict : dict
        Maps names to SMILES. The proto name is looked up first, then the
        mol name. If neither is present, None is written as the SMILES.
    fp_type
        Object whose `write(writer)` method emits the file header.
    """
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for i, native_tuples_list in enumerate(native_tuples_lists_iter):
            if native_tuples_list:
                # The SMILES is the same for every row of this list, so
                # resolve it once rather than once per row.
                mol_item_name = MolItemName.from_str(native_tuples_list[0][1])
                smiles = smiles_dict.get(
                    mol_item_name.proto_name,
                    smiles_dict.get(mol_item_name.mol_name))
                for fp_native, fp_name in native_tuples_list:
                    writer.writerow((fp_name, smiles, fp_native))
            # Log only after the rows were handed to the writer.
            logging.debug(
                "Wrote native strings for molecule {:d} to molecules file.".
                format(i + 1))
Exemple #5
0
def fprints_dict_from_mol(mol,
                          bits=BITS,
                          level=LEVEL_DEF,
                          radius_multiplier=RADIUS_MULTIPLIER_DEF,
                          first=FIRST_DEF,
                          counts=COUNTS_DEF,
                          stereo=STEREO_DEF,
                          include_disconnected=INCLUDE_DISCONNECTED_DEF,
                          rdkit_invariants=RDKIT_INVARIANTS_DEF,
                          exclude_floating=EXCLUDE_FLOATING_DEF,
                          out_dir_base=None,
                          out_ext=OUT_EXT_DEF,
                          save=False,
                          all_iters=False,
                          overwrite=False):
    """Build E3FP fingerprints from a mol with at least one conformer.

    Parameters
    ----------
    mol : RDKit Mol
        Input molecule with one or more conformers to be fingerprinted. Its
        "_Name" property is used to name fingerprints and output files.
    bits : int
        Set number of bits for final folded fingerprint. If -1 or None, the
        module default `BITS` is used.
    level : int, optional
        Level/maximum number of iterations of E3FP. If -1 or None is
        provided, it runs until termination, and `all_iters` is ignored.
    radius_multiplier : float, optional
        Radius multiplier for spherical shells.
    first : int, optional
        First `N` number of conformers to fingerprint. If -1, all are
        fingerprinted.
    counts : bool, optional
        Generate count-based fingerprints instead of bit-based fingerprints.
    stereo : bool, optional
        Incorporate stereochemistry in fingerprint.
    include_disconnected : bool, optional
        Include disconnected atoms when hashing and for stereo calculations.
        Turn off purely for testing purposes, to make E3FP more like ECFP.
    rdkit_invariants : bool, optional
        Use the atom invariants used by RDKit for its Morgan fingerprint.
    exclude_floating : bool, optional
        Mask atoms with no bonds (usually floating ions) from the fingerprint.
        These are often placed arbitrarily and can confound the fingerprint.
    out_dir_base : str, optional
        Basename of out directory to save fingerprints. Iteration number is
        appended ("_complete" when running until termination).
    out_ext : str, optional
        Extension on fingerprint pickles, used to determine compression level.
    save : bool, optional
        Save fingerprints to directory.
    all_iters : bool, optional
        Save fingerprints from all iterations to file(s).
    overwrite : bool, optional
        Overwrite pre-existing file.

    Returns
    -------
    dict
        Maps each level to a list holding one fingerprint per fingerprinted
        conformer. An empty dict is returned when saving was requested but
        all output files already exist and `overwrite` is False, when
        fingerprinting raised, when saving raised, or when no conformer was
        fingerprinted.
    """
    name = mol.GetProp("_Name")

    if level is None:
        level = -1

    if bits in (-1, None):
        bits = BITS

    # Running until termination has no fixed set of intermediate levels, so
    # only the final fingerprints are kept unless a finite level is given
    # together with `all_iters`.
    final_level_only = level == -1 or not all_iters

    if save:
        if not final_level_only:
            dir_names = ["{!s}{:d}".format(out_dir_base, i)
                         for i in range(level + 1)]
        elif level == -1:
            dir_names = ["{!s}_complete".format(out_dir_base)]
        else:
            dir_names = ["{!s}{:d}".format(out_dir_base, level)]

        # One output file per directory; filenames[i] belongs to level i
        # when all iterations are saved.
        filenames = []
        for dir_name in dir_names:
            touch_dir(dir_name)
            filenames.append(
                os.path.join(dir_name, "{!s}{!s}".format(name, out_ext)))

        all_files_exist = all(os.path.isfile(x) for x in filenames)
        if all_files_exist and not overwrite:
            logging.warning("All fingerprint files for {!s} already exist. "
                            "Skipping.".format(name))
            return {}

    fingerprinter = Fingerprinter(bits=bits,
                                  level=level,
                                  radius_multiplier=radius_multiplier,
                                  counts=counts,
                                  stereo=stereo,
                                  include_disconnected=include_disconnected,
                                  rdkit_invariants=rdkit_invariants,
                                  exclude_floating=exclude_floating)

    fprints_dict = {}
    # Counted explicitly so the total is defined even without conformers.
    num_confs = 0
    try:
        logging.info("Generating fingerprints for {!s}.".format(name))
        if final_level_only:
            level_range = (level, )
        else:
            level_range = range(level + 1)
        for j, conf in enumerate(mol.GetConformers()):
            if j == first:
                break
            fingerprinter.run(conf, mol)
            # fingerprinter.save_substructs_to_db(substruct_db) #PLACEHOLDER
            for i in level_range:
                fprint = fingerprinter.get_fingerprint_at_level(i)
                fprint.name = MolItemName.from_str(name).to_conf_name(j)
                fprints_dict.setdefault(i, []).append(fprint)
            num_confs += 1
        logging.info("Generated {:d} fingerprints for {!s}.".format(
            num_confs, name))
    except Exception:
        # Best effort: a failure on one molecule is logged, not propagated.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit through.
        logging.error("Error generating fingerprints for {:s}.".format(name),
                      exc_info=True)
        return {}

    # Nothing to write if no conformer was fingerprinted (e.g. `first` is 0).
    if save and fprints_dict:
        if final_level_only:
            to_save = [(filenames[0], fprints_dict[max(fprints_dict)])]
        else:
            to_save = [(filenames[i], fprints)
                       for i, fprints in sorted(fprints_dict.items())]
        for filename, fprints in to_save:
            try:
                fp.savez(filename, *fprints)
            except Exception:
                logging.error(
                    "Error saving fingerprints for {:s} to {:s}".format(
                        name, filename),
                    exc_info=True)
                return {}
        logging.info("Saved fingerprints for {:s}.".format(name))

    return fprints_dict