Example 1
def main(smiles_file,
         params_file,
         sdf_dir=None,
         out_file="molecules.csv.bz2",
         log=None,
         num_proc=None,
         parallel_mode=None,
         verbose=False):
    """Fingerprint molecules."""
    setup_logging(log, verbose=verbose)
    parallelizer = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # set conformer generation and fingerprinting parameters
    confgen_params, fprint_params = params_to_dicts(params_file)
    kwargs = {"save": False, "fprint_params": fprint_params}

    smiles_dict = smiles_to_dict(smiles_file)
    mol_num = len({x.split('-')[0] for x in smiles_dict})

    if sdf_dir is not None:
        sdf_files = glob.glob(os.path.join(sdf_dir, "*.sdf*"))
        sdf_files = sorted(
            [x for x in sdf_files if name_from_sdf_filename(x) in smiles_dict])
        data_iter = make_data_iterator(sdf_files)
        fp_method = native_tuples_from_sdf
        logging.info("Using SDF files from {}".format(sdf_dir))
    else:
        kwargs["confgen_params"] = confgen_params
        data_iter = ((smiles, name)
                     for name, smiles in smiles_dict.items())
        fp_method = native_tuples_from_smiles
        logging.info("Will generate conformers.")
        logging.info(
            "Conformer generation params: {!r}.".format(confgen_params))
    logging.info("Fingerprinting params: {!r}.".format(fprint_params))

    # fingerprint in parallel
    logging.info("Fingerprinting {:d} molecules".format(mol_num))
    mol_list_dict = {}
    for result, data in parallelizer.run_gen(fp_method,
                                             data_iter,
                                             kwargs=kwargs):
        if not result:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
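        # Each result is a list of (native fingerprint, name) tuples, one per
        # conformer; the base molecule name precedes the '_' conformer suffix.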
        try:
            _, name = result[0]
            name = name.split('_')[0]
        except IndexError:
            logging.warning("Fingerprinting failed for {}.".format(data[0]))
            continue
        mol_list_dict[name] = result
    logging.info("Finished fingerprinting molecules")

    # save to SEA molecules file
    logging.info("Saving fingerprints to {}".format(out_file))
    fp_type = fprint_params_to_fptype(**fprint_params)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
    logging.info("Finished!")
Example 2
def params_to_molecules(params, smiles_file, conf_dir, out_dir,
                        parallelizer=None):
    """Generate molecules_file based on params dict."""
    smiles_dict = smiles_to_dict(smiles_file)
    logging.debug("SMILES file has {:d} unique smiles.".format(
        len(smiles_dict)))
    logging.debug("Example SMILES: {!r}".format(smiles_dict.items()[0]))
    fprint_params = {"radius_multiplier": params["radius_multiplier"],
                     "stereo": STEREO, "bits": params["bits"],
                     "first": params['first'], "level": params['level']}

    conf_dir_files = glob.glob("{!s}/*".format(conf_dir))
    logging.debug("Found {:d} files in conformer directory.".format(
        len(conf_dir_files)))
    sdf_files = [x for x in conf_dir_files
                 if os.path.basename(x).split('.')[0] in smiles_dict]
    logging.debug("{:d} conformer files match SMILES.".format(len(sdf_files)))

    if len(sdf_files) == 0:
        raise Exception("Directory {!s} does not contain any usable SDF "
                        "files.".format(conf_dir))

    kwargs = {"save": False, "fprint_params": fprint_params}

    data_iterator = make_data_iterator(sdf_files)
    if parallelizer is not None:
        results_iter = parallelizer.run_gen(native_tuples_from_sdf,
                                            data_iterator, kwargs=kwargs)
    else:
        results_iter = (native_tuples_from_sdf(*x, **kwargs)
                        for x in data_iterator)

    molecules_file = get_molecules_file(out_dir)
    fp_type = fprint_params_to_fptype(**params)
    with smart_open(molecules_file, "wb") as f:
        writer = csv.writer(f)
        fp_type.write(writer)
        writer.writerow(("molecule id", "smiles", "fingerprint"))
        for results in results_iter:
            try:
                fp_native_list, sdf_file = results
            except ValueError:
                logging.error("Results of fingerprinting did not look as "
                              "expected: {!r}".format(results))
                continue
            proto_name = MolItemName.from_str(fp_native_list[0][1]).proto_name
            smiles = smiles_dict[proto_name]
            for fp_native, fp_name in fp_native_list:
                writer.writerow((fp_name, smiles, fp_native))

    del smiles_dict
    filtered_smiles_dict, mol_lists_dict, fp_type = molecules_to_lists_dicts(
        molecules_file)
    return (filtered_smiles_dict, mol_lists_dict, fp_type)
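
For illustration, the helper above might be called as follows; the parameter values and paths are assumptions (E3FP-style placeholders), not taken from the source:

# Hypothetical usage; paths and parameter values are placeholders.
params = {"radius_multiplier": 1.718, "bits": 1024, "first": 3, "level": 5}
smiles_dict, mol_lists_dict, fp_type = params_to_molecules(
    params, "ligands.smi", "conformers/", "out/",
    parallelizer=None)  # None falls back to the serial path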
Example 3
def run(sdf_files,
        bits=BITS,
        first=FIRST_DEF,
        level=LEVEL_DEF,
        radius_multiplier=RADIUS_MULTIPLIER_DEF,
        counts=COUNTS_DEF,
        stereo=STEREO_DEF,
        include_disconnected=INCLUDE_DISCONNECTED_DEF,
        rdkit_invariants=RDKIT_INVARIANTS_DEF,
        exclude_floating=EXCLUDE_FLOATING_DEF,
        params=None,
        out_dir_base=None,
        out_ext=OUT_EXT_DEF,
        db_file=None,
        overwrite=False,
        all_iters=False,
        log=None,
        num_proc=None,
        parallel_mode=None,
        verbose=False):
    """Generate E3FP fingerprints from SDF files."""
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params, fill_defaults=True)
        bits = get_value(params, "fingerprinting", "bits", int)
        first = get_value(params, "fingerprinting", "first", int)
        level = get_value(params, "fingerprinting", "level", int)
        radius_multiplier = get_value(params, "fingerprinting",
                                      "radius_multiplier", float)
        counts = get_value(params, "fingerprinting", "counts", bool)
        stereo = get_value(params, "fingerprinting", "stereo", bool)
        include_disconnected = get_value(params, "fingerprinting",
                                         "include_disconnected", bool)
        rdkit_invariants = get_value(params, "fingerprinting",
                                     "rdkit_invariants", bool)
        exclude_floating = get_value(params, "fingerprinting",
                                     "exclude_floating", bool)

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    if para.rank == 0:
        logging.info("Initializing E3FP generation.")
        logging.info("Getting SDF files")

        if len(sdf_files) == 1 and os.path.isdir(sdf_files[0]):
            from glob import glob
            sdf_files = glob("{:s}/*sdf*".format(sdf_files[0]))

        data_iterator = make_data_iterator(sdf_files)

        logging.info("SDF File Number: {:d}".format(len(sdf_files)))
        if out_dir_base is not None:
            logging.info("Out Directory Basename: {:s}".format(out_dir_base))
            logging.info("Out Extension: {:s}".format(out_ext))
        if db_file is not None:
            logging.info("Database File: {:s}".format(db_file))
        if db_file is None and out_dir_base is None:
            sys.exit('Either `db_file` or `out_dir_base` must be specified.')
        logging.info("Max First Conformers: {:d}".format(first))
        logging.info("Bits: {:d}".format(bits))
        logging.info("Level/Max Iterations: {:d}".format(level))
        logging.info(
            "Shell Radius Multiplier: {:.4g}".format(radius_multiplier))
        logging.info("Stereo Mode: {!s}".format(stereo))
        if not include_disconnected:
            logging.info("Connected-only mode: on")
        if rdkit_invariants:
            logging.info("Invariant type: RDKit")
        else:
            logging.info("Invariant type: Daylight")
        logging.info("Parallel Mode: {!s}".format(para.parallel_mode))
        logging.info("Starting")
    else:
        data_iterator = iter([])

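    # Keyword arguments forwarded to fprints_dict_from_sdf for every SDF file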
    fp_kwargs = {
        "first": first,
        "bits": bits,
        "level": level,
        "radius_multiplier": radius_multiplier,
        "stereo": stereo,
        "counts": counts,
        "include_disconnected": include_disconnected,
        "rdkit_invariants": rdkit_invariants,
        "exclude_floating": exclude_floating,
        "out_dir_base": out_dir_base,
        "out_ext": out_ext,
        "all_iters": all_iters,
        "overwrite": overwrite,
        "save": False
    }
    if out_dir_base is not None:
        fp_kwargs['save'] = True

    run_kwargs = {"kwargs": fp_kwargs}

    results_iter = para.run_gen(fprints_dict_from_sdf, data_iterator,
                                **run_kwargs)

    if db_file is not None:
        fprints = []
        for result, data in results_iter:
            try:
                fprints.extend(result.get(level, result[max(result.keys())]))
            except (AttributeError, ValueError):
                # fprinting failed, assume logged in method
                continue
        if len(fprints) > 0:
            db = FingerprintDatabase(fp_type=type(fprints[0]), level=level)
            db.add_fingerprints(fprints)
            db.save(db_file)
            logging.info("Saved fingerprints to {:s}".format(db_file))
    else:
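        # No database requested; exhaust the iterator so the per-molecule
        # fingerprint files are written (save=True when out_dir_base is given).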
        list(results_iter)
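
A sketch of calling this runner directly, assuming a directory of SDF conformer files and a target fingerprint database; the paths and parameter values are placeholders:

# Hypothetical call; the conformer directory and database path are placeholders.
run(["conformers/"],            # a single directory argument is globbed for *sdf* files
    bits=1024,
    level=5,
    radius_multiplier=1.718,
    db_file="fingerprints.fps.bz2",
    num_proc=4)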
Example 4
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation."""
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(params, "conformer_generation",
                                    "pool_multiplier", int)
        rmsd_cutoff = get_value(params, "conformer_generation", "rmsd_cutoff",
                                float)
        max_energy_diff = get_value(params, "conformer_generation",
                                    "max_energy_diff", float)
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES))

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide mol2 file or a SMILES file.")
        sys.exit()

    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide only a mol2 file OR a SMILES file.")
        sys.exit()

    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide at least one processor with `--num_proc`.")
        sys.exit()

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (mol_from_mol2(_mol2_file,
                                      _name,
                                      standardise=standardise)
                        for _mol2_file, _name in mol2_generator(*mol2))
        else:
            logging.info("Input type: Detected SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (mol_from_smiles(_smiles,
                                        _name,
                                        standardise=standardise)
                        for _smiles, _name in smiles_generator(*smiles))

        if prioritize:
            logging.info(("Prioritizing mols with low rotatable bond number"
                          " and molecular weight first."))
            mols_with_properties = [(
                AllChem.CalcNumRotatableBonds(mol),
                AllChem.CalcExactMolWt(mol),
                mol,
            ) for mol in mol_iter if mol is not None]
            data_iterator = make_data_iterator(
                (x[-1] for x in sorted(mols_with_properties,
                                       key=lambda x: x[:2])))
        else:
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None))

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            if os.path.exists(values_file) and not overwrite:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format(values_file))
            else:
                value_args = (values_file, "w")
                logging.info(
                    "Values file: {} (new file)".format(values_file))
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info("Maximum Energy Difference: {:.4g} kcal".format(
                max_energy_diff))
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        data_iterator = iter([])

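    # Keyword arguments forwarded to generate_conformers for every molecule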
    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(generate_conformers, data_iterator,
                                    **run_kwargs)

    if para.is_master() and values_file is not None:
        hdf5_buffer = HDF5Buffer(*value_args)

    for result, data in results_iterator:
        if (para.is_master() and values_file is not None
                and result is not False):
            values_to_hdf5(hdf5_buffer, result)

    if para.is_master() and values_file is not None:
        hdf5_buffer.flush()
        hdf5_buffer.close()
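
Finally, a sketch of driving conformer generation from a SMILES file; the path and option values are illustrative assumptions:

# Hypothetical call; the SMILES path and option values are placeholders.
run(smiles=["ligands.smi"],
    num_conf=-1,                # -1 lets the pipeline choose a target count
    out_dir="conformers/",
    num_proc=4,
    verbose=True)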