コード例 #1
0
def mol_to_sdf(mol, out_file, conf_num=None):
    """Write the conformers of an RDKit `Mol` to an SDF file.

    Each conformer is written as its own SDF record. When per-conformer
    energies are stored on the molecule, the energy of the conformer being
    written is attached to that record; the molecule's energy properties are
    put back the way they were once writing has finished (or failed).

    Parameters
    ----------
    mol : RDKit Mol
        A molecule containing 1 or more conformations to write to file.
    out_file : str
        Path to save SDF file.
    conf_num : int or None, optional
        Maximum number of conformers to save to file. Defaults to all
        (``None`` and ``-1`` both disable the limit).
    """
    touch_dir(os.path.dirname(out_file))
    save_all = conf_num in {-1, None}
    num_saved = 0
    with smart_open(out_file, "w") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        conf_ids = [conf.GetId() for conf in mol.GetConformers()]
        conf_energies = get_conformer_energies_from_mol(mol)
        # The all-conformer energies property is removed while writing so it
        # is not emitted with every record; it is restored in `finally`.
        mol.ClearProp(CONF_ENERGIES_PROPNAME)
        try:
            for i in conf_ids:
                # NOTE(review): the limit is compared against the conformer
                # id, which equals the count only while ids run 0..N-1.
                if not save_all and i >= conf_num:
                    break
                try:
                    conf_energy = conf_energies[i]
                    mol.SetProp(CONF_ENERGY_PROPNAME,
                                "{:.4f}".format(conf_energy))
                except (IndexError, TypeError):
                    # No usable energy for this conformer: make sure the
                    # previous conformer's energy is not written with it.
                    mol.ClearProp(CONF_ENERGY_PROPNAME)
                writer.write(mol, confId=i)
                num_saved += 1
        finally:
            writer.close()
            mol.ClearProp(CONF_ENERGY_PROPNAME)
            if conf_energies is not None:
                add_conformer_energies_to_mol(mol, conf_energies)
    # Count the records actually written rather than reusing the loop
    # variable, which is unbound for a molecule without conformers and is
    # off by one after the early `break`.
    logging.debug("Saved {:d} conformers to {}.".format(num_saved, out_file))
コード例 #2
0
 def __init__(self,
              fold_num,
              out_dir,
              cv_method=None,
              input_file=os.path.join(os.getcwd(), "input.pkl.bz2"),
              compute_combined=True,
              overwrite=False):
     """Set up the output locations and CV method for a single fold.

     Parameters
     ----------
     fold_num : int
         Index of this fold.
     out_dir : str
         Directory for this fold's output files; created if needed.
     cv_method : object, type or None, optional
         Cross-validation method instance, or a class that is instantiated
         with no arguments. ``None`` (the default) creates a fresh
         ``SEASearchCVMethod``. Its ``out_dir`` and ``overwrite``
         attributes are set from the arguments given here.
     input_file : str, optional
         Path of the input file. The default is resolved against the
         working directory at the time this method is defined.
     compute_combined : bool, optional
         Stored as ``self.compute_combined``.
     overwrite : bool, optional
         Stored on both this object and `cv_method`.
     """
     self.fold_num = fold_num
     self.out_dir = out_dir
     touch_dir(self.out_dir)
     self.input_file = input_file
     self.mask_file = os.path.join(out_dir, "train_test_mask.pkl.bz2")
     self.results_file = os.path.join(out_dir, "results.npz")
     self.target_aucs_file = os.path.join(out_dir, "target_aucs.pkl.bz2")
     self.combined_roc_file = os.path.join(out_dir, "combined_roc.pkl.bz2")
     self.combined_prc_file = os.path.join(out_dir, "combined_prc.pkl.bz2")
     self.combined_enrichment_file = os.path.join(
         out_dir, "combined_enrichment.pkl.bz2")
     # The method object is mutated below, so the default must be built per
     # call: a default instance created in the signature would be shared by
     # (and overwritten through) every object constructed without one.
     if cv_method is None:
         cv_method = SEASearchCVMethod()
     elif isinstance(cv_method, type):
         cv_method = cv_method()
     cv_method.out_dir = out_dir
     cv_method.overwrite = overwrite
     self.cv_method = cv_method
     self.compute_combined = compute_combined
     self.overwrite = overwrite
コード例 #3
0
 def __init__(self,
              k=5,
              splitter=MoleculeSplitter,
              cv_method=None,
              input_processor=None,
              parallelizer=None,
              out_dir=os.getcwd(),
              overwrite=False,
              return_auc_type="roc",
              reduce_negatives=False,
              fold_kwargs=None):
     """Configure a k-fold cross-validation run.

     Parameters
     ----------
     k : int, optional
         Number of folds.
     splitter : type or object, optional
         Splitter class (instantiated as ``splitter(k)``) or a ready
         instance whose ``k`` attribute must equal `k`.
     cv_method : object, type or None, optional
         Cross-validation method. ``None`` (the default) creates a fresh
         ``SEASearchCVMethod`` for this validator.
     input_processor : object or None, optional
         Optional input processor; cannot be combined with SEA.
     parallelizer : Parallelizer or None, optional
         Parallelizer to use. Defaults to a serial one.
     out_dir : str, optional
         Output directory; created if needed. The default is the working
         directory at the time this method is defined.
     overwrite : bool, optional
         Stored as ``self.overwrite``.
     return_auc_type : str, optional
         Stored lower-cased as ``self.return_auc_type``.
     reduce_negatives : bool, optional
         Stored as ``self.reduce_negatives``.
     fold_kwargs : dict or None, optional
         Stored as ``self.fold_kwargs``; ``None`` means an empty dict.

     Raises
     ------
     ValueError
         If an input processor is given together with the SEA method.
     """
     if isinstance(splitter, type):
         self.splitter = splitter(k)
     else:
         assert splitter.k == k
         self.splitter = splitter
     self.k = k
     # Build the default per call so validators never share one method
     # object through the signature.
     if cv_method is None:
         cv_method = SEASearchCVMethod()
     # The guard has to recognise SEA whether it arrives as the class or as
     # an instance; an identity test against the class alone never matches
     # an instance, including the default.
     uses_sea = (cv_method is SEASearchCVMethod
                 or isinstance(cv_method, SEASearchCVMethod))
     if uses_sea and input_processor is not None:
         raise ValueError(
             "Input processing is not (currently) compatible with SEA.")
     self.cv_method = cv_method
     self.input_processor = input_processor
     self.overwrite = overwrite
     if parallelizer is None:
         self.parallelizer = Parallelizer(parallel_mode="serial")
     else:
         self.parallelizer = parallelizer
     self.out_dir = out_dir
     touch_dir(out_dir)
     self.input_file = os.path.join(self.out_dir, "inputs.pkl.bz2")
     self.return_auc_type = return_auc_type.lower()
     self.reduce_negatives = reduce_negatives
     # A dict literal in the signature would be one object shared by every
     # validator constructed without `fold_kwargs`.
     self.fold_kwargs = {} if fold_kwargs is None else fold_kwargs
コード例 #4
0
    def substructs_to_pdb(
        self,
        level=None,
        bits=None,
        out_dir="substructs",
        reorient=True,
        exact=False,
    ):
        """Write each accepted substructure at a level to its own PDB file.

        Files are named ``<identifier>.pdb.gz``, where the identifier is the
        shell's unsigned identifier folded modulo `bits`.

        Parameters
        ----------
        level : int or None, optional
            Level of fingerprinting/number of iterations.
        bits : int or None, optional
            Folding level of identifiers. ``None`` or ``-1`` falls back to
            ``self.bits``.
        out_dir : str, optional
            Directory to which to save PDB files; created if needed.
        reorient : bool, optional
            Reorient substructure to match stereo quadrants.
        exact : bool, optional
            Forwarded to ``get_shells_at_level``.

        Returns
        -------
        list of str
            Paths of the PDB files, one per shell, in shell order.
        """
        accepted_shells = self.get_shells_at_level(level=level, exact=exact)
        fold_bits = self.bits if bits in (None, -1) else bits
        touch_dir(out_dir)

        def _pdb_path(substruct):
            # Fold the unsigned identifier so it names the output file.
            folded_id = signed_to_unsigned_int(substruct.identifier) % fold_bits
            return os.path.join(out_dir, "{}.pdb.gz".format(folded_id))

        pdb_paths = []
        for substruct in accepted_shells:
            pdb_path = _pdb_path(substruct)
            shell_to_pdb(self.mol, substruct, self.atom_coords,
                         self.bound_atoms_dict, pdb_path, reorient=reorient)
            pdb_paths.append(pdb_path)
        return pdb_paths
コード例 #5
0
ファイル: util.py プロジェクト: RhDm/e3fp
def mol_to_sdf(mol, out_file, conf_num=None):
    """Write the conformers of an RDKit ``Mol`` to an SDF file.

    Parameters
    ----------
    mol : RDKit Mol
        A molecule containing 1 or more conformations to write to file.
    out_file : str
        Path to save SDF file.
    conf_num : int or None, optional
        Maximum number of conformers to save to file. Defaults to all
        (``None`` and ``-1`` both disable the limit).
    """
    touch_dir(os.path.dirname(out_file))
    save_all = conf_num in {-1, None}
    num_saved = 0
    with smart_open(out_file, "wb") as fobj:
        writer = rdkit.Chem.SDWriter(fobj)
        try:
            conf_ids = [conf.GetId() for conf in mol.GetConformers()]
            for i in conf_ids:
                # NOTE(review): the limit is compared against the conformer
                # id, which equals the count only while ids run 0..N-1.
                if not save_all and i >= conf_num:
                    break
                writer.write(mol, confId=i)
                num_saved += 1
        finally:
            # Close the writer even when a write fails.
            writer.close()
    # Count the records actually written rather than reusing the loop
    # variable, which is unbound for a molecule without conformers and is
    # off by one after the early `break`.
    logging.debug("Saved {:d} conformers to {}.".format(num_saved, out_file))
コード例 #6
0
ファイル: wrapper.py プロジェクト: WhitestoneYang/e3fp-paper
def main(job_id, params, main_conf_dir=MAIN_CONF_DIR, main_dir=CV_DIR,
         out_dir=None, smiles_file=SMILES_FILE, check_existing=True,
         mol_targets_file=MOL_TARGETS_FILE, k=CV_K, log_file=LOG_FILE,
         verbose=False, overwrite=False, min_mols=MIN_MOLS_PER_TARGET,
         parallelizer=None):
    """Fingerprint molecules for one parameter set and score it by k-fold CV.

    The molecules file for the parameter set is loaded if present, otherwise
    rebuilt from a compatible existing molecules file or generated from
    conformers. A filtered targets file is then written (unless present) and
    k-fold cross-validation is run on the two files.

    Parameters
    ----------
    job_id : object
        Not used by this function.
    params : dict
        Fingerprinting parameters; normalised with ``format_params``. The
        ``'first'`` and ``'conformers'`` entries are read here.
    main_conf_dir : str, optional
        Directory under which ``params['conformers']`` is looked up.
    main_dir : str, optional
        Directory holding one output directory per parameter set.
    out_dir : str or None, optional
        Output directory. Defaults to ``main_dir/<params string>``.
    smiles_file : str, optional
        SMILES file passed to ``params_to_molecules``.
    check_existing : bool, optional
        Look for pre-existing fingerprints that can be reused. When false,
        the lookup is skipped.
    mol_targets_file : str, optional
        Targets file to read and filter.
    k : int, optional
        Number of cross-validation folds.
    log_file : str or None, optional
        Log file name, placed inside `out_dir`.
    verbose : bool, optional
        Passed to ``setup_logging``.
    overwrite : bool, optional
        Regenerate the molecules and targets files even if they exist.
    min_mols : int, optional
        Passed to the cross-validator's ``run``.
    parallelizer : Parallelizer or None, optional
        Parallelizer for fingerprinting. Anything that is not a
        ``Parallelizer`` is replaced by a process-based one.

    Returns
    -------
    float
        ``1 - auc``, where ``auc`` is the value returned by the CV run.
    """
    params = format_params(params)

    pre_encoding_params_string = params_to_str(params, with_first=False)
    params_string = params_to_str(params)
    if out_dir is None:
        out_dir = os.path.join(main_dir, params_string)
    touch_dir(out_dir)
    if log_file is not None:
        log_file = os.path.join(out_dir, log_file)
    setup_logging(log_file, verbose=verbose)

    params_file = os.path.join(out_dir, "params.cfg")
    config_parser = update_params(params, section_name="fingerprinting")
    write_params(config_parser, params_file)

    if not isinstance(parallelizer, Parallelizer):
        parallelizer = Parallelizer(parallel_mode="processes",
                                    num_proc=NUM_PROC)

    logging.info("Params: {!r}".format(params.items()))
    logging.info("Saving files to {:s}.".format(out_dir))

    # Honour `check_existing`: skipping the lookup means fingerprints are
    # generated from conformers whenever no molecules file exists yet.
    if check_existing:
        logging.info("Checking for usable pre-existing fingerprints.")
        existing_molecules_file = get_existing_fprints(
            pre_encoding_params_string, params['first'], main_dir)
    else:
        existing_molecules_file = None

    molecules_file = get_molecules_file(out_dir)
    if os.path.isfile(molecules_file) and not overwrite:
        logging.info("Molecules file already exists. Loading.")
        smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
            molecules_file)
    elif existing_molecules_file is None:
        conf_dir = os.path.join(main_conf_dir, params['conformers'])
        logging.info("Generating fingerprints from conformers in "
                     "{!s}.".format(conf_dir))
        smiles_dict, mol_lists_dict, fp_type = params_to_molecules(
            params, smiles_file, conf_dir, out_dir, parallelizer=parallelizer)
    else:
        logging.info("Using native strings from existing molecules "
                     "file {!s}.".format(existing_molecules_file))
        smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
            existing_molecules_file, first=params['first'])
        lists_dicts_to_molecules(get_molecules_file(out_dir),
                                 smiles_dict, mol_lists_dict, fp_type)

    targets_file = get_targets_file(out_dir)
    if overwrite or not os.path.isfile(targets_file):
        logging.info("Reading targets from {!s}.".format(mol_targets_file))
        targets_dict = targets_to_dict(mol_targets_file, affinity=AFFINITY)
        logging.debug("Read {:d} targets.".format(len(targets_dict)))
        logging.info("Filtering targets by molecules.")
        filtered_targets_dict = targets_to_mol_lists_targets(
            filter_targets_by_molecules(targets_dict, mol_lists_dict),
            mol_lists_dict)

        # Drop the large intermediates before cross-validation starts.
        del targets_dict, smiles_dict, mol_lists_dict, fp_type
        logging.info("Saving filtered targets to {!s}.".format(targets_file))
        dict_to_targets(targets_file, filtered_targets_dict)
        del filtered_targets_dict
    else:
        logging.info("Targets file already exists. Skipping.")

    # Cross-validation gets its own parallelizer, sized by the fold count.
    parallel_mode = parallelizer.parallel_mode
    parallelizer = Parallelizer(parallel_mode=parallel_mode, num_proc=k + 1)

    splitter = ByTargetMoleculeSplitter(k, reduce_negatives=REDUCE_NEGATIVES)
    # NOTE(review): `overwrite` is not forwarded here, so existing CV results
    # are kept even when the molecules/targets files were regenerated above.
    # Confirm whether that is intended.
    kfold_cv = KFoldCrossValidator(k=k, parallelizer=parallelizer,
                                   splitter=splitter,
                                   return_auc_type=AUC_TYPE, out_dir=out_dir,
                                   overwrite=False)
    auc = kfold_cv.run(molecules_file, targets_file, min_mols=min_mols,
                       affinity=AFFINITY)
    logging.info("CV Mean AUC: {:.4f}".format(auc))
    return 1 - auc
コード例 #7
0
ファイル: generate.py プロジェクト: amrhamedp/e3fp
def fprints_dict_from_mol(mol,
                          bits=BITS,
                          level=LEVEL_DEF,
                          radius_multiplier=RADIUS_MULTIPLIER_DEF,
                          first=FIRST_DEF,
                          counts=COUNTS_DEF,
                          stereo=STEREO_DEF,
                          include_disconnected=INCLUDE_DISCONNECTED_DEF,
                          rdkit_invariants=RDKIT_INVARIANTS_DEF,
                          exclude_floating=EXCLUDE_FLOATING_DEF,
                          out_dir_base=None,
                          out_ext=OUT_EXT_DEF,
                          save=False,
                          all_iters=False,
                          overwrite=False):
    """Build a E3FP fingerprint from a mol with at least one conformer.

    Parameters
    ----------
    mol : RDKit Mol
        Input molecule with one or more conformers to be fingerprinted.
    bits : int
        Set number of bits for final folded fingerprint. ``None`` or ``-1``
        selects the module default.
    level : int, optional
        Level/maximum number of iterations of E3FP. If -1 (or ``None``) is
        provided, it runs until termination, and `all_iters` is treated as
        False.
    radius_multiplier : float, optional
        Radius multiplier for spherical shells.
    first : int, optional
        First `N` number of conformers from file to fingerprint. If -1, all
        are fingerprinted.
    counts : bool, optional
        Generate count-based fingerprints instead of bit-based fingerprints.
    stereo : bool, optional
        Incorporate stereochemistry in fingerprint.
    include_disconnected : bool, optional
        Include disconnected atoms when hashing and for stereo calculations.
        Turn off purely for testing purposes, to make E3FP more like ECFP.
    rdkit_invariants : bool, optional
        Use the atom invariants used by RDKit for its Morgan fingerprint.
    exclude_floating : bool, optional
        Mask atoms with no bonds (usually floating ions) from the fingerprint.
        These are often placed arbitrarily and can confound the fingerprint.
    out_dir_base : str, optional
        Basename of out directory to save fingerprints. Iteration number is
        appended.
    out_ext : str, optional
        Extension on fingerprint pickles, used to determine compression level.
    save : bool, optional
        Save fingerprints to directory.
    all_iters : bool, optional
        Save fingerprints from all iterations to file(s).
    overwrite : bool, optional
        Overwrite pre-existing file.

    Returns
    -------
    dict
        Maps each fingerprinted level to the list of fingerprints, one per
        conformer. Empty if all output files already exist (and `overwrite`
        is False), or if fingerprinting or saving failed.
    """
    name = mol.GetProp("_Name")

    if level is None:
        level = -1

    if bits in (-1, None):
        bits = BITS

    # Only the requested level is produced unless every iteration is wanted.
    single_level = level == -1 or not all_iters

    if save:
        filenames = []
        all_files_exist = True
        if single_level:
            if level == -1:
                dir_name = "{!s}_complete".format(out_dir_base)
            else:
                dir_name = "{!s}{:d}".format(out_dir_base, level)
            touch_dir(dir_name)
            filenames.append(
                os.path.join(dir_name, "{!s}{!s}".format(name, out_ext)))
            if not os.path.isfile(filenames[0]):
                all_files_exist = False
        else:
            for i in range(level + 1):
                dir_name = "{:s}{:d}".format(out_dir_base, i)
                touch_dir(dir_name)
                filename = os.path.join(dir_name,
                                        "{!s}{!s}".format(name, out_ext))
                filenames.append(filename)
                if not os.path.isfile(filename):
                    all_files_exist = False

        if all_files_exist and not overwrite:
            logging.warning("All fingerprint files for {!s} already exist. "
                            "Skipping.".format(name))
            return {}

    fingerprinter = Fingerprinter(bits=bits,
                                  level=level,
                                  radius_multiplier=radius_multiplier,
                                  counts=counts,
                                  stereo=stereo,
                                  include_disconnected=include_disconnected,
                                  rdkit_invariants=rdkit_invariants,
                                  exclude_floating=exclude_floating)

    # The levels to collect do not depend on the conformer.
    level_range = (level, ) if single_level else range(level + 1)

    try:
        fprints_dict = {}
        num_confs = 0
        logging.info("Generating fingerprints for {!s}.".format(name))
        for j, conf in enumerate(mol.GetConformers()):
            if j == first:
                break
            fingerprinter.run(conf, mol)
            for i in level_range:
                fprint = fingerprinter.get_fingerprint_at_level(i)
                fprint.name = MolItemName.from_str(name).to_conf_name(j)
                fprints_dict.setdefault(i, []).append(fprint)
            num_confs += 1
        # An explicit counter stays correct for a molecule with no
        # conformers, where the loop variable is never bound.
        logging.info("Generated {:d} fingerprints for {!s}.".format(
            num_confs, name))
    except Exception:
        # Best-effort per molecule: log and return nothing, but let
        # KeyboardInterrupt/SystemExit propagate.
        logging.error("Error generating fingerprints for {:s}.".format(name),
                      exc_info=True)
        return {}

    # Nothing was fingerprinted (e.g. no conformers): there is nothing to
    # save, and taking the max of the empty key set would raise.
    if save and fprints_dict:
        if single_level:
            fprints = fprints_dict[max(fprints_dict.keys())]
            try:
                fp.savez(filenames[0], *fprints)
                logging.info("Saved fingerprints for {:s}.".format(name))
            except Exception:
                logging.error(
                    "Error saving fingerprints for {:s} to {:s}".format(
                        name, filenames[0]),
                    exc_info=True)
                return {}
        else:
            try:
                for i, fprints in sorted(fprints_dict.items()):
                    fp.savez(filenames[i], *fprints)
                logging.info("Saved fingerprints for {:s}.".format(name))
            except Exception:
                logging.error(
                    "Error saving fingerprints for {:s} to {:s}".format(
                        name, filenames[i]),
                    exc_info=True)
                return {}

    return fprints_dict
コード例 #8
0
ファイル: generate.py プロジェクト: xuzhang5788/e3fp
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation.

    Molecules are read from mol2 or SMILES input on the master process and
    handed to ``generate_conformers`` through a ``Parallelizer``. When
    `values_file` is given, the master writes each successful result to an
    HDF5 buffer.

    Parameters
    ----------
    mol2 : sequence of str or None, optional
        mol2 inputs. Exactly one of `mol2` and `smiles` must be given.
    smiles : sequence of str or None, optional
        SMILES inputs. Exactly one of `mol2` and `smiles` must be given.
    standardise, num_conf, first, pool_multiplier, rmsd_cutoff, \
max_energy_diff, forcefield, seed : optional
        Conformer generation settings, forwarded to ``generate_conformers``
        (`standardise` is used when reading molecules). All of them are
        replaced by the values in `params` when that is given.
    params : object or None, optional
        Passed to ``read_params``; overrides the settings listed above.
    prioritize : bool, optional
        Process molecules sorted by rotatable bond count, then molecular
        weight. This reads all molecules into memory first.
    out_dir : str, optional
        Output directory; created if needed.
    compress : optional
        Forwarded to ``generate_conformers``.
    overwrite : bool, optional
        Forwarded to ``generate_conformers``; also makes `values_file` be
        opened as a new file instead of appended to.
    values_file : str or None, optional
        HDF5 file in which to store results.
    log : str or None, optional
        Passed to ``setup_logging``.
    num_proc : int or None, optional
        Number of processes for the ``Parallelizer``.
    parallel_mode : str or None, optional
        Parallel mode for the ``Parallelizer``.
    verbose : bool, optional
        Passed to ``setup_logging``.

    Raises
    ------
    ValueError
        If `forcefield` is not one of ``FORCEFIELD_CHOICES``.
    SystemExit
        If the input arguments are inconsistent.
    """
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation", "rmsd_cutoff",
                                float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense
    # NOTE(review): `parser` is not defined in this function; presumably it
    # is a module-level argument parser. Confirm it exists when `run` is
    # called as a library function.
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide mol2 file or a SMILES file.")
        sys.exit()

    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide only a mol2 file OR a SMILES file.")
        sys.exit()

    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide more than one processor with `--num_proc`.")
        sys.exit()

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file,
                                      _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: Detected SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles,
                                        _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            logging.info(("Prioritizing mols with low rotatable bond number"
                          " and molecular weight first."))
            mols_with_properties = [(
                AllChem.CalcNumRotatableBonds(mol),
                AllChem.CalcExactMolWt(mol),
                mol,
            ) for mol in mol_iter if mol is not None]
            data_iterator = make_data_iterator(
                (x[-1] for x in sorted(mols_with_properties)))
        else:
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None))

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            if os.path.exists(values_file) and overwrite is not True:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format((values_file)))
            else:
                value_args = (values_file, "w")
                logging.info("Values file: {} (new file)".format(
                    (values_file)))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        # Only the master supplies input data.
        data_iterator = iter([])

    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    # The buffer exists only on the master, and only when values are wanted.
    hdf5_buffer = None
    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    try:
        for result, _data in results_iterator:
            if hdf5_buffer is not None and result is not False:
                values_to_hdf5(hdf5_buffer, result)
        if hdf5_buffer is not None:
            hdf5_buffer.flush()
    finally:
        # Close the values file even if a worker or a write fails part-way.
        if hdf5_buffer is not None:
            hdf5_buffer.close()
コード例 #9
0
            dict_to_smiles(CHEMBL_CANON_SMILES_FILE,
                           chembl_canonical_smiles_dict)
        chembl_canonical_smiles_dict = smiles_to_dict(CHEMBL_CANON_SMILES_FILE)

        pdb_to_chembl_mol_map, chembl_to_pdb_mol_map = mol_map_from_smiles(
            pdb_canonical_smiles_dict, chembl_canonical_smiles_dict)
        with smart_open(PDB_CHEMBL_MOL_MAP, "w") as f:
            pkl.dump((pdb_to_chembl_mol_map, chembl_to_pdb_mol_map), f)
    else:
        with smart_open(PDB_CHEMBL_MOL_MAP, "r") as f:
            pdb_to_chembl_mol_map, chembl_to_pdb_mol_map = pkl.load(f)

    # Save query ligands to SDF files and get SMILES
    pdb_smiles_dict = {}
    skip_pairs = set()
    touch_dir(PDB_CONF_DIR)
    for mol_name, mol2_file in pdb_mol2_files.items():
        mol = mol_from_mol2(mol2_file, mol_name)
        smiles = MolToSmiles(mol, isomericSmiles=True)
        pdb_smiles_dict[mol_name] = smiles
        CanonicalizeMol(mol)
        sdf_file = os.path.join(PDB_CONF_DIR, "{}.sdf.bz2".format(mol_name))
        mol_to_sdf(mol, sdf_file)
    dict_to_smiles(PDB_SMILES_FILE, pdb_smiles_dict)

    # Build filtered CHEMBL targets dict
    chembl_smiles_cids = {k.split("-")[0] for k in chembl_smiles_dict.keys()}
    chembl_cids = set()
    if not os.path.isfile(CHEMBL_TARGETS):
        filtered_chembl_targets_dict = {}
        for k, v in pdb_targets_dict.items():
コード例 #10
0
ファイル: validate.py プロジェクト: WhitestoneYang/e3fp-paper
def main(query_molecules_file,
         query_targets_file,
         target_molecules_file,
         target_targets_file,
         method=SEASearchCVMethod,
         fit_file=None,
         log=None,
         out_dir="./"):
    """Train a method on the target set and score it against the query set.

    The method is trained on the target molecules/targets files and tested
    on the query files. ROC, precision-recall and enrichment curves are
    computed from the non-NaN scores, pickled into `out_dir`, and their
    areas are logged.

    Parameters
    ----------
    query_molecules_file, query_targets_file : str
        Molecules and targets files of the query (test) set.
    target_molecules_file, target_targets_file : str
        Molecules and targets files of the target (training) set.
    method : type, optional
        Method class; it is instantiated with no arguments.
    fit_file : str or None, optional
        Fit file handed to an SEA method. Defaults to ``library.fit``
        inside `out_dir`.
    log : str or None, optional
        Passed to ``setup_logging``.
    out_dir : str, optional
        Directory for all output files; created if needed.
    """
    setup_logging(log)

    method = method()
    method.out_dir = out_dir
    touch_dir(out_dir)
    fit_file = (os.path.join(out_dir, "library.fit")
                if fit_file is None else fit_file)

    def _curve_path(curve_name):
        # Location of the pickled curve of the given kind.
        return os.path.join(out_dir, "combined_{}.pkl.bz2".format(curve_name))

    def _pickle_to(obj, path):
        # Pickle `obj` to `path` with the highest protocol.
        with smart_open(path, "wb") as f:
            pkl.dump(obj, f, pkl.HIGHEST_PROTOCOL)

    # --- training set -----------------------------------------------------
    logging.info("Loading target files.")
    uses_sea = isinstance(method, SEASearchCVMethod)
    if uses_sea:
        method.fit_file = fit_file
        train_inputs = process_input_files(target_molecules_file,
                                           target_targets_file,
                                           sea_format=True)
        (_, train_targets_dict, train_smiles_dict, train_mol_list_dict,
         train_fp_type, train_target_list, train_mol_list) = train_inputs

        logging.info("Saving target SEA files.")
        dict_to_targets(method.train_targets_file, train_targets_dict)
        lists_dicts_to_molecules(method.train_molecules_file,
                                 train_smiles_dict, train_mol_list_dict,
                                 train_fp_type)

        # SEA works from the files written above, not from arrays.
        train_fp_array = None
        train_mol_to_fp_inds = None
        train_target_mol_array = None
        train_mask = None
    else:
        train_inputs = process_input_files(target_molecules_file,
                                           target_targets_file,
                                           sea_format=False)
        (train_fp_array, train_mol_to_fp_inds, train_target_mol_array,
         train_target_list, train_mol_list) = train_inputs
        train_mask = np.ones_like(train_target_mol_array, dtype=np.bool_)

    method.train(train_fp_array,
                 train_mol_to_fp_inds,
                 train_target_mol_array,
                 train_target_list,
                 train_mol_list,
                 mask=train_mask)

    # --- query set --------------------------------------------------------
    logging.info("Loading query files.")
    if uses_sea:
        test_inputs = process_input_files(query_molecules_file,
                                          query_targets_file,
                                          sea_format=True)
        (test_target_mol_array, test_targets_dict, test_smiles_dict,
         test_mol_list_dict, test_fp_type, test_target_list,
         test_mol_list) = test_inputs

        logging.info("Saving query SEA files.")
        lists_dicts_to_molecules(method.test_molecules_file, test_smiles_dict,
                                 test_mol_list_dict, test_fp_type)

        test_fp_array = None
        test_mol_to_fp_inds = None
    else:
        test_inputs = process_input_files(query_molecules_file,
                                          query_targets_file,
                                          sea_format=False)
        (test_fp_array, test_mol_to_fp_inds, test_target_mol_array,
         test_target_list, test_mol_list) = test_inputs

    test_mask = np.ones_like(test_target_mol_array, dtype=np.bool_)
    scores = method.test(test_fp_array,
                         test_mol_to_fp_inds,
                         test_target_mol_array,
                         test_target_list,
                         test_mol_list,
                         mask=test_mask)

    # Keep only the pairs that received a score.
    labels = test_target_mol_array.ravel()
    flat_scores = scores.ravel()
    scored_inds = np.where(~np.isnan(flat_scores))
    y_true = labels[scored_inds]
    y_score = flat_scores[scored_inds]

    # --- curves -----------------------------------------------------------
    logging.info("Computing results curves.")
    roc_file = _curve_path("roc")
    prc_file = _curve_path("prc")
    enrich_file = _curve_path("enrichment")

    logging.info("Computing ROC curves.")
    roc = roc_curve(y_true, y_score, drop_intermediate=True)
    auroc = auc(roc[0], roc[1])
    _pickle_to(roc, roc_file)
    logging.info("AUROC: {:.4f}".format(auroc))

    logging.info("Computing PRC curves.")
    precision, recall, thresholds = precision_recall_curve(y_true, y_score)
    # Stored with recall first, so recall is the x-axis of the curve.
    prc = (recall, precision, thresholds)
    auprc = auc(prc[0], prc[1])
    imbalance = get_imbalance(y_true)
    _pickle_to(prc, prc_file)
    logging.info("AUPRC: {:.4f} ({:.4f} of data is positive)".format(
        auprc, imbalance))

    logging.info("Computing enrichment curves.")
    enrichment = enrichment_curve(y_true, y_score)
    _pickle_to(enrichment, enrich_file)
    auec = auc(enrichment[0], enrichment[1])
    logging.info("AUE: {:.4f}".format(auec))
コード例 #11
0
        cmd.ray(dpi)
        cmd.png(axes_figure)
        partial_opaque_to_opaque(axes_figure)
        sleep(0.5)


if __name__ == "__main__":
    usage = "pymol -r make_shell_figures.py -- <sdf_file>"
    try:
        sdf_file = sys.argv[1]
    except IndexError:
        sys.exit(usage)

    mol_name = os.path.basename(sdf_file).split(".")[0]
    out_dir = mol_name
    touch_dir(out_dir)

    json_out_file = os.path.join(out_dir, "graph.json")
    aligned_mol_pdb_file = os.path.join(out_dir, "mol.pdb")

    mol = mol_from_sdf(sdf_file)
    left_to_right_atom_ids = save_aligned_conf_to_pdb(aligned_mol_pdb_file,
                                                      mol)
    fprinter = fingerprint_mol(mol)
    graph = create_shell_graph(fprinter,
                               radius_multiplier=RADIUS_MULTIPLIER,
                               only_unique=True)
    atom_types_dict = get_atom_types(mol, graph)
    atom_colors_dict = define_colors_by_atom_types(atom_types_dict, mol)
    write_pdb_files(fprinter, BITS, out_dir=out_dir)
    pdb_dirs = sorted(glob.glob(os.path.join(out_dir, "substructs*")))
コード例 #12
0
def library_from_map(targets_map_file,
                     all_molecules_file,
                     all_targets_file,
                     fit_file=None,
                     sample=None,
                     affinity=None,
                     out_dir='./'):
    """Build SEA library from target map and existing SEA molecules/targets.

    Targets read from `all_targets_file` are filtered with the targets map,
    molecules read from `all_molecules_file` are filtered with the remaining
    targets, and the resulting "molecules.csv.bz2", "targets.csv.bz2" and
    "library.sea" files are written to `out_dir`. If `fit_file` is not given
    or does not exist, a fit is generated first by building a temporary
    library.

    Parameters
    ----------
    targets_map_file : str
        Path to the targets map file, read with `read_targets_map`.
    all_molecules_file : str
        Path to the molecules file, read with `molecules_to_lists_dicts`.
    all_targets_file : str
        Path to the targets file, read with `targets_to_dict`.
    fit_file : str or None, optional
        Path to the fit file used to build the library. If None or if the
        file does not exist, the fit is generated; when None, it is written
        to "library.fit" in `out_dir`.
    sample : optional
        If not None, passed to `sample_mol_lists_files` to sample random
        molecules for generating the fit. If None, all molecules are used.
        Only relevant when the fit is generated.
    affinity : optional
        Passed through to `targets_to_dict` when reading `all_targets_file`.
    out_dir : str, optional
        Directory to which output files are written. It is passed to
        `touch_dir` before anything is written.

    Notes
    -----
    Temporary files used for fit generation are named with `TMP_PREFIX`
    followed by the basename of the corresponding output file (no directory
    is prepended here). They are not removed by this function.
    """
    molecules_file = os.path.join(out_dir, "molecules.csv.bz2")
    targets_file = os.path.join(out_dir, "targets.csv.bz2")
    library_file = os.path.join(out_dir, "library.sea")
    touch_dir(out_dir)

    logging.info("Reading targets map from {0}".format(targets_map_file))
    targets_map = read_targets_map(targets_map_file,
                                   key_header=KEY_HEADER,
                                   headers=MAP_HEADER)
    logging.debug("{:d} targets in map".format(len(targets_map)))
    # Log the actual path rather than the literal variable name.
    logging.info("Reading targets file from {0}".format(all_targets_file))
    all_targets_dict = targets_to_dict(all_targets_file, affinity=affinity)
    logging.debug("Read {:d} targets".format(len(all_targets_dict)))
    targets_dict = filter_targets_by_map(all_targets_dict, targets_map)
    logging.debug("{:d} targets after filtering".format(len(targets_dict)))
    logging.info("Reading molecules file from {0}".format(all_molecules_file))
    smiles_dict, all_mol_lists_dict, fp_type = molecules_to_lists_dicts(
        all_molecules_file)
    logging.debug("{:d} molecules in file".format(len(all_mol_lists_dict)))
    mol_lists_targets_dict = targets_to_mol_lists_targets(
        targets_dict, all_mol_lists_dict)
    logging.debug("{:d} mol lists targets".format(len(mol_lists_targets_dict)))
    logging.info("Writing targets file")
    dict_to_targets(targets_file, mol_lists_targets_dict)
    mol_lists_dict = filter_molecules_by_targets(all_mol_lists_dict,
                                                 targets_dict)
    # Drop references to intermediate dicts as soon as they are no longer
    # needed.
    del targets_dict
    logging.debug("{:d} filtered molecules".format(len(mol_lists_dict)))
    del mol_lists_targets_dict
    logging.info("Writing molecules file")
    lists_dicts_to_molecules(molecules_file, smiles_dict, mol_lists_dict,
                             fp_type)

    if fit_file is None or not os.path.isfile(fit_file):
        logging.info("Fit file does not exist. Generating fit.")
        if fit_file is None:
            fit_file = os.path.join(out_dir, "library.fit")
        # By default the fit uses the unfiltered molecules file directly;
        # only the targets (and, when sampling, the molecules) are rewritten
        # to temporary files.
        tmp_molecules_file = all_molecules_file
        tmp_targets_file = TMP_PREFIX + "_" + os.path.basename(targets_file)
        tmp_library_file = TMP_PREFIX + "_" + os.path.basename(library_file)
        if sample is not None:
            logging.info("Sampling {} random molecules for fit".format(sample))
            tmp_molecules_file = TMP_PREFIX + "_" + os.path.basename(
                molecules_file)
            sample_mol_lists_files(all_molecules_file,
                                   all_targets_file,
                                   sample,
                                   sample_molecules_file=tmp_molecules_file,
                                   sample_targets_file=tmp_targets_file,
                                   overwrite=True)
        else:
            logging.info("Using all molecules for fit")
            all_mol_lists_targets_dict = targets_to_mol_lists_targets(
                all_targets_dict, all_mol_lists_dict)
            logging.info("Writing all targets to file.")
            dict_to_targets(tmp_targets_file, all_mol_lists_targets_dict)
            del all_mol_lists_targets_dict
        logging.info("Building library for fit molecules/targets.")
        build_library(tmp_library_file,
                      tmp_molecules_file,
                      tmp_targets_file,
                      fit_file,
                      log=True,
                      no_plot=False)
    else:
        logging.info("Fit file already exists. Skipping fit generation.")

    del all_mol_lists_dict

    logging.info("Building library")
    build_library(library_file,
                  molecules_file,
                  targets_file,
                  fit_file,
                  log=True)
    logging.info("Library has been built.")
コード例 #13
0
ファイル: methods.py プロジェクト: WhitestoneYang/e3fp-paper
    def train(self, fp_array, mol_to_fp_inds, target_mol_array, target_list,
              mol_list, mask):
        """Train and score a classifier for each target.

        Returns immediately if `self.is_trained(target_list)` is true and
        `self.overwrite` is false. Otherwise one classifier is fitted per
        target and saved with `self.save_fit_file`; targets whose fit file
        already exists are skipped unless `self.overwrite` is set.

        Parameters
        ----------
        fp_array : ndarray or csr_matrix (n_fprints, n_bits)
            Array with fingerprints as rows
        mol_to_fp_inds : dict
            Map from index of `mol_list` to indices for mol fingerprints in
            `fp_array`
        target_mol_array : ndarray of bool (n_targets, n_mols)
            Boolean array with True marking mol/target binding pairs
            and False marking implied negatives.
        target_list : list of str
            List of target names corresponding to rows of `target_mol_array`.
        mol_list : list of str
            List of mol names corresponding to columns of `target_mol_array`.
            Not referenced directly in the body of this method.
        mask : ndarray of bool (n_targets, n_mols)
            Boolean array with positives marking mol/target pairs in the
            training dataset.
        """
        if self.is_trained(target_list) and not self.overwrite:
            logging.info("All targets already trained.")
            return

        if self.dense_data and issparse(fp_array):
            logging.info("Converting from sparse to dense fingerprints.")
            fp_array = fp_array.toarray()
        fp_array = fp_array.astype(self.dtype)

        touch_dir(self.fit_dir)
        logging.info("Generating target fits.")
        target_num = len(target_list)
        for i, target_key in enumerate(target_list):
            fit_file = self._fit_file_from_target_key(target_key)
            if os.path.isfile(fit_file) and not self.overwrite:
                logging.debug(
                    "Fit file for {} already exists. Skipping".format(
                        target_key.tid))
                continue

            # get subset of training data
            set_fp_inds, set_mol_inds, set_fp_num = self.get_fprint_subsets(
                mol_to_fp_inds,
                target_mol_array[i, :],
                mask[i, :],
                sample_negatives=self.train_sample_negatives)
            data = fp_array[set_fp_inds, :]
            # Labels of the selected mols, repeated according to
            # `set_fp_num` (presumably so there is one label per row of
            # `data` -- confirm against `get_fprint_subsets`).
            pos = np.repeat(target_mol_array[i, set_mol_inds],
                            set_fp_num).astype(self.dtype)

            # perform training
            clf = self.create_clf(data)
            logging.debug("Fitting {} using {} fprints ({}/{})".format(
                target_key.tid, data.shape[0], i + 1, target_num))
            self.train_clf(clf, data, pos, batch_size=self.train_batch_size)
            if self.train_sample_negatives:  # expensive if all data used
                score = self.score_clf(clf, data, pos)
                logging.debug("Fitted {} with score {:.4f}. ({}/{})".format(
                    target_key.tid, score, i + 1, target_num))
            else:
                logging.debug("Fitted {}. ({}/{})".format(
                    target_key.tid, i + 1, target_num))
            self.save_fit_file(target_key, clf)
        logging.info("Finished fitting targets.")