def test_cache_cddd_embeddings():
    """
    Verify that CDDD embeddings fetched from ChEMBL are cached correctly.
    """
    num_recs = 1000

    logger.info('CDDD Embeddings Check!')
    cache_dir = tempfile.mkdtemp()
    logger.info('Creating cache at %s', cache_dir)

    # Write embeddings to the cache
    chem_data = ChEmblData(fp_type=Embeddings)
    chem_data.save_fingerprints(os.path.join(cache_dir, FINGER_PRINT_FILES),
                                num_recs=num_recs)

    # Verify the cache by reading the records back
    hdf_path = os.path.join(cache_dir, FINGER_PRINT_FILES)
    logger.info('Reading molecules from %s...', hdf_path)
    mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints')
    mol_df = mol_df.compute()

    logger.info('Expected at most %s recs, found %s.', num_recs, mol_df.shape[0])
    assert mol_df.shape[0] <= num_recs, \
        ('Expected at most %d recs, found %d.' % (num_recs, mol_df.shape[0]))

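
# A minimal sketch of the same cache round-trip with automatic cleanup of the
# temporary directory, assuming ChEmblData, Embeddings, FINGER_PRINT_FILES and
# the os/tempfile/dask imports are available as in the test above. The helper
# name is hypothetical.
def _cache_and_read_embeddings(num_recs=1000):
    with tempfile.TemporaryDirectory() as cache_dir:
        hdf_path = os.path.join(cache_dir, FINGER_PRINT_FILES)
        ChEmblData(fp_type=Embeddings).save_fingerprints(hdf_path,
                                                         num_recs=num_recs)
        # Read the records back before the temporary directory is removed
        mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints').compute()
    return mol_df
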
def test_add_molecule_GpuKmeansUmap():
    """
    Verify that molecules can be added to a GpuKmeansUmap workflow when the
    input is a cuDF dataframe.
    """
    _create_context()

    n_molecules, dao, mol_df = _fetch_chembl_test_dataset()

    if hasattr(mol_df, 'compute'):
        mol_df = mol_df.compute()

    mol_df = cudf.from_pandas(mol_df)
    n_molecules = mol_df.shape[0]

    # test_mol should contain both available and new molecules
    test_mol = mol_df[n_molecules - 20:]
    mols_tobe_added = test_mol['id'].to_array().tolist()

    chData = ChEmblData()
    logger.info('Fetching ChEMBL ids for %s', mols_tobe_added)
    mols_tobe_added = [
        str(row[0])
        for row in chData.fetch_chemblId_by_molregno(mols_tobe_added)
    ]
    logger.info('ChEMBL ids to be added %s', mols_tobe_added)

    # Molecules to be used for clustering
    mol_df = mol_df[:n_molecules - 10]

    wf = GpuKmeansUmap(n_molecules=n_molecules, dao=dao, pca_comps=64)
    wf.cluster(df_mol_embedding=mol_df)

    missing_mols, molregnos, df_embedding = wf.add_molecules(mols_tobe_added)
    assert len(missing_mols) == 10, \
        'Expected 10 missing molecules, found %d' % len(missing_mols)

    # TODO: Once the issue with add_molecule in a multi-GPU env. is fixed, the
    # number of missing_molregno found should be 0
    missing_mols, molregnos, df_embedding = wf.add_molecules(mols_tobe_added)
    assert len(missing_mols) == 0, \
        'Expected no missing molecules, found %d' % len(missing_mols)

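
# Note: Series.to_array() used above has been deprecated and later removed in
# newer cuDF releases. A hedged equivalent for such builds (Series.to_pandas()
# is assumed to be available, as in recent cuDF versions):
#
#     mols_tobe_added = test_mol['id'].to_pandas().tolist()
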
def __init__(self, fp_type):
    self.chem_data = ChEmblData(fp_type)

def cache(self):
    """
    Create fingerprint/embedding cache.
    """
    context = Context()
    data_dir = context.get_config('data_mount_path', default='/data')
    cache_dir = os.path.join(data_dir, 'cache')

    parser = argparse.ArgumentParser(description='Create cache')
    parser.add_argument('-ct', '--cache_type',
                        dest='cache_type',
                        type=str,
                        default='MorganFingerprint',
                        choices=['MorganFingerprint', 'Embeddings'],
                        help='Type of data preprocessing (MorganFingerprint or Embeddings)')
    parser.add_argument('-c', '--cache_directory',
                        dest='cache_directory',
                        type=str,
                        default=cache_dir,
                        help='Location to create fingerprint cache')
    parser.add_argument('--batch_size',
                        dest='batch_size',
                        type=int,
                        default=100000,
                        help='Chunk size used when writing the cache.')
    parser.add_argument('--n_cpu',
                        dest='n_cpu',
                        type=int,
                        default=12,
                        help='Number of CPU workers to use')
    parser.add_argument('-d', '--debug',
                        dest='debug',
                        action='store_true',
                        default=False,
                        help='Show debug messages')
    parser.add_argument('-m', '--n_mol',
                        dest='n_mol',
                        type=int,
                        default=-1,
                        help='Number of molecules for analysis. Use a negative number to use the whole dataset.')

    args = parser.parse_args(sys.argv[2:])

    if args.debug:
        logger.setLevel(logging.DEBUG)

    cluster = LocalCluster(dashboard_address=':9001',
                           n_workers=args.n_cpu,
                           threads_per_worker=4)
    client = Client(cluster)

    with client:
        task_start_time = datetime.now()

        if not os.path.exists(args.cache_directory):
            logger.info('Creating folder %s...', args.cache_directory)
            os.makedirs(args.cache_directory)

        if args.cache_type == 'MorganFingerprint':
            preprocess_type = MorganFingerprint
        elif args.cache_type == 'Embeddings':
            preprocess_type = Embeddings

        chem_data = ChEmblData(fp_type=preprocess_type)
        chem_data.save_fingerprints(os.path.join(args.cache_directory, FINGER_PRINT_FILES),
                                    num_recs=args.n_mol,
                                    batch_size=args.batch_size)

        logger.info('Fingerprint generated in (hh:mm:ss.ms) {}'.format(
            datetime.now() - task_start_time))
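
# A minimal sketch of driving this cache step programmatically. The entry-point
# script name 'startdash.py' and the `launcher` object (an instance of the
# class defining cache() above) are assumptions; the flags come from the
# argparse definition. cache() parses sys.argv[2:], so the script name and the
# subcommand must come first.
#
#     import sys
#     sys.argv = ['startdash.py', 'cache',
#                 '--cache_type', 'Embeddings',
#                 '--cache_directory', '/tmp/fp_cache',
#                 '--n_mol', '1000']
#     launcher.cache()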