def test_cache_cddd_embeddings():
    """Write CDDD embedding fingerprints to a temp cache and read them back."""
    num_recs = 1000

    logger.info('CDDD Embeddings Check!')

    # A throw-away temp directory serves as the fingerprint cache location.
    cache_dir = os.path.join(tempfile.mkdtemp())
    logger.info('Creating cache at %s' % cache_dir)
    logger.info(type(cache_dir))

    # Populate the cache with embedding fingerprints for num_recs records.
    fp_path = os.path.join(cache_dir, FINGER_PRINT_FILES)
    chem_data = ChEmblData(fp_type=Embeddings)
    chem_data.save_fingerprints(fp_path, num_recs=num_recs)

    # Read the cache back and confirm we did not get more records than asked.
    logger.info('Reading molecules from %s...' % fp_path)
    mol_df = dask.dataframe.read_hdf(fp_path, 'fingerprints').compute()

    logger.info('Expected %s rec found %s.', num_recs, mol_df.shape[0])
    assert mol_df.shape[0] <= num_recs, \
        ('Expected %d rec found %d.' % (num_recs, mol_df.shape[0]))
class ChemblGenerativeWfDao(GenerativeWfDao, metaclass=Singleton):
    """Singleton DAO backing the generative workflow with ChEMBL data."""

    def __init__(self, fp_type):
        # Data accessor configured with the requested fingerprint type.
        self.chem_data = ChEmblData(fp_type)

    def fetch_id_from_chembl(self, id: List):
        """Delegate a molregno-to-ChEMBL-id lookup to the data accessor."""
        logger.debug('Fetch ChEMBL ID using molregno...')
        result = self.chem_data.fetch_id_from_chembl(id)
        return result
# Example #3
def test_add_molecule_GpuKmeansUmap():
    """
    Verify fetching data from chemblDB when the input is a cudf df.
    """
    _create_context()

    n_molecules, dao, mol_df = _fetch_chembl_test_dataset()

    # Materialize lazily-evaluated (e.g. dask) frames before converting.
    if hasattr(mol_df, 'compute'):
        mol_df = mol_df.compute()

    mol_df = cudf.from_pandas(mol_df)
    n_molecules = mol_df.shape[0]

    # The test molecules should contain both available and new molecules:
    # the last 20 rows are the add-candidates, while clustering below uses
    # all but the last 10 rows — so the two sets overlap by 10 molecules.
    test_mol = mol_df[n_molecules - 20:]
    mols_tobe_added = test_mol['id'].to_array().tolist()

    chData = ChEmblData()
    logger.info('Fetching ChEMBLLE id for %s', mols_tobe_added)
    # Translate molregno values into ChEMBL id strings.
    mols_tobe_added = [
        str(row[0])
        for row in chData.fetch_chemblId_by_molregno(mols_tobe_added)
    ]
    logger.info('ChEMBL ids to be added %s', mols_tobe_added)

    # Molecules to be used for clustering (everything except the last 10).
    mol_df = mol_df[:n_molecules - 10]

    wf = GpuKmeansUmap(n_molecules=n_molecules, dao=dao, pca_comps=64)
    wf.cluster(df_mol_embedding=mol_df)

    # First add: the 10 molecules excluded from clustering must be reported
    # as missing.
    missing_mols, molregnos, df_embedding = wf.add_molecules(mols_tobe_added)
    assert len(
        missing_mols
    ) == 10, 'Expected 10 missing molecules found %d' % len(missing_mols)

    # Second add: every candidate is now known, so none should be missing.
    # TODO: Once the issue with add_molecule in multi-gpu env. is fixed, the
    # number of missing_molregno found should be 0
    missing_mols, molregnos, df_embedding = wf.add_molecules(mols_tobe_added)
    assert len(
        missing_mols
    ) == 0, 'Expected no missing molecules found %d' % len(missing_mols)
 def __init__(self, fp_type):
     # NOTE(review): fragment — the enclosing class is not visible in this
     # chunk and the indentation (one space) looks mangled by extraction.
     # Stores a ChEMBL data accessor built for the given fingerprint type.
     self.chem_data = ChEmblData(fp_type)
# Example #5
    def cache(self):
        """
        Create the fingerprint cache.

        Parses command-line options (from ``sys.argv[2:]``), starts a local
        dask cluster, and writes ChEMBL fingerprints of the requested type
        (Morgan fingerprints or CDDD embeddings) into an HDF cache directory.
        """
        context = Context()
        data_dir = context.get_config('data_mount_path', default='/data')
        cache_dir = os.path.join(data_dir, 'cache')

        parser = argparse.ArgumentParser(description='Create cache')

        parser.add_argument(
            '-ct',
            '--cache_type',
            dest='cache_type',
            type=str,
            default='MorganFingerprint',
            choices=['MorganFingerprint', 'Embeddings'],
            help='Type of data preprocessing (MorganFingerprint or Embeddings)'
        )

        parser.add_argument('-c',
                            '--cache_directory',
                            dest='cache_directory',
                            type=str,
                            default=cache_dir,
                            help='Location to create fingerprint cache')

        parser.add_argument('--batch_size',
                            dest='batch_size',
                            type=int,
                            default=100000,
                            help='Chunksize.')

        parser.add_argument('--n_cpu',
                            dest='n_cpu',
                            type=int,
                            default=12,
                            help='Number of CPU workers to use')

        parser.add_argument('-d',
                            '--debug',
                            dest='debug',
                            action='store_true',
                            default=False,
                            help='Show debug message')

        parser.add_argument(
            '-m',
            '--n_mol',
            dest='n_mol',
            type=int,
            default=-1,
            help=
            'Number of molecules for analysis. Use negative numbers for using the whole dataset.'
        )

        args = parser.parse_args(sys.argv[2:])

        if args.debug:
            logger.setLevel(logging.DEBUG)

        # Local dask cluster for parallel fingerprint generation.
        cluster = LocalCluster(dashboard_address=':9001',
                               n_workers=args.n_cpu,
                               threads_per_worker=4)
        client = Client(cluster)

        with client:
            task_start_time = datetime.now()

            if not os.path.exists(args.cache_directory):
                logger.info('Creating folder %s...' % args.cache_directory)
                # exist_ok guards against a concurrent-creation race between
                # the exists() check and makedirs().
                os.makedirs(args.cache_directory, exist_ok=True)

            # argparse 'choices' restricts cache_type to exactly these keys,
            # so the lookup cannot fail; a dict avoids the unbound-variable
            # hazard of the previous if/elif chain.
            preprocess_type = {
                'MorganFingerprint': MorganFingerprint,
                'Embeddings': Embeddings,
            }[args.cache_type]

            chem_data = ChEmblData(fp_type=preprocess_type)
            chem_data.save_fingerprints(os.path.join(args.cache_directory,
                                                     FINGER_PRINT_FILES),
                                        num_recs=args.n_mol,
                                        batch_size=args.batch_size)

            logger.info('Fingerprint generated in (hh:mm:ss.ms) {}'.format(
                datetime.now() - task_start_time))
import os

import numpy as np
import pandas as pd
from cuchemcommon.data.helper.chembldata import ChEmblData
from cuchemcommon.fingerprint import calc_morgan_fingerprints

DATA_BENCHMARK_DIR = '/workspace/cuchem/cuchem/cheminformatics/data'
DEFAULT_MAX_SEQ_LEN = 512

if __name__ == '__main__':

    # Number of random ChEMBL molecules to sample for the benchmark set.
    num_samples = 20000
    benchmark_df = pd.DataFrame(
        ChEmblData.fetch_random_samples(num_samples, DEFAULT_MAX_SEQ_LEN))
    # Normalize the column name used downstream.
    benchmark_df.rename(columns={'len': 'length'}, inplace=True)

    # TODO: benchmark SMILES have not been canonicalized. Should this be done?
    fp = calc_morgan_fingerprints(benchmark_df)
    fp.columns = fp.columns.astype(np.int64)
    fp.index = fp.index.astype(np.int64)
    for col in fp.columns:
        # NOTE(review): np.int is deprecated and removed in NumPy >= 1.24;
        # consider np.int64 (or builtin int) — confirm the pinned NumPy version.
        fp[col] = fp[col].astype(np.int)
        # fp[col] = fp[col].astype(np.float32)

    # Write results
    benchmark_df.reset_index(
        inplace=True
    )  # For consistency with approved drugs, only one has index reset
    # NOTE(review): this call is truncated in this chunk — its path argument
    # and closing parenthesis are not visible here.
    benchmark_df.to_csv(
# Example #7
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import cudf
import numpy as np
import pandas as pd
from cuchemcommon.data.helper.chembldata import ChEmblData
from cuchemcommon.fingerprint import calc_morgan_fingerprints

DATA_BENCHMARK_DIR = '/workspace/cuchem/tests/data'

if __name__ == '__main__':

    # Build the benchmark set from all approved drugs in ChEMBL.
    benchmark_df = pd.DataFrame(ChEmblData.fetch_approved_drugs())

    # TODO: benchmark SMILES have not been canonicalized. Should this be done?
    fp = calc_morgan_fingerprints(benchmark_df)
    fp.columns = fp.columns.astype(np.int64)
    fp.index = fp.index.astype(np.int64)
    for col in fp.columns:
        fp[col] = fp[col].astype(np.float32)

    # Write results
    benchmark_df.to_csv(os.path.join(DATA_BENCHMARK_DIR, 'benchmark_approved_drugs.csv'))

    fp.to_csv(os.path.join(DATA_BENCHMARK_DIR, 'fingerprints_approved_drugs.csv'))
    fp_hdf5 = cudf.DataFrame(fp)
    # Bug fix: the HDF key 'fingerprints' and format= were previously passed
    # to os.path.join instead of to_hdf, raising a TypeError at runtime
    # (os.path.join accepts no 'format' keyword).
    fp_hdf5.to_hdf(os.path.join(DATA_BENCHMARK_DIR, 'filter_00.h5'),
                   'fingerprints',
                   format='table')