Exemple #1
0
def load_dataset(method, labels, prefix='input', num_data=-1):
    """Return the Tox21 train/val/test splits, backed by an on-disk cache.

    The cache location is derived by ``_CacheNamePolicy`` from the
    arguments. When the cache directory exists the three splits are loaded
    from it. If any split comes back as ``None`` (or the directory does not
    exist), the dataset is preprocessed with ``D.get_tox21`` and written to
    the cache before being returned.

    Args:
        method: Key into ``preprocess_method_dict`` selecting the
            preprocessor class to instantiate.
        labels: Forwarded to ``D.get_tox21`` as ``labels``.
        prefix: Forwarded to ``_CacheNamePolicy``.
        num_data: When non-negative, ``numpy.arange(num_data)`` is passed
            as ``train_target_index`` so that only that many examples are
            used for train; the validation and test indices are passed as
            ``None``.

    Returns:
        Tuple ``(train, val, test)``.
    """
    cache = _CacheNamePolicy(method, labels, prefix, num_data=num_data)
    paths = (
        cache.get_train_file_path(),
        cache.get_val_file_path(),
        cache.get_test_file_path(),
    )

    splits = (None, None, None)
    if os.path.exists(cache.cache_dir):
        print('load from cache {}'.format(cache.cache_dir))
        splits = tuple(NumpyTupleDataset.load(path) for path in paths)

    # NOTE(review): presumably `load` yields None for a missing file, which
    # is what triggers regeneration below -- confirm against the library.
    if any(split is None for split in splits):
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        tox21_kwargs = {'labels': labels}
        if num_data >= 0:
            # Use `num_data` examples for train
            tox21_kwargs.update(
                train_target_index=numpy.arange(num_data),
                val_target_index=None,
                test_target_index=None,
            )
        train, val, test = D.get_tox21(preprocessor, **tox21_kwargs)
        splits = (train, val, test)
        # Cache dataset
        cache.create_cache_directory()
        for path, split in zip(paths, splits):
            NumpyTupleDataset.save(path, split)

    train, val, test = splits
    return train, val, test
Exemple #2
0
def load_dataset(method, labels, prefix='input', num_data=-1):
    """Load the Tox21 splits from cache, preprocessing them on a cache miss.

    A ``_CacheNamePolicy`` built from the arguments supplies the cache
    directory and the per-split file paths. A blank line is printed first.
    If the cache directory exists, each split is loaded from its file; when
    all three are available they are returned right away. Otherwise the
    dataset is rebuilt with ``D.get_tox21``, saved to the cache and
    returned.

    Args:
        method: Key into ``preprocess_method_dict``; the looked-up value is
            called with no arguments to create the preprocessor.
        labels: Passed through to ``D.get_tox21`` as ``labels``.
        prefix: Passed through to ``_CacheNamePolicy``.
        num_data: If ``>= 0``, the train split is requested with
            ``train_target_index=numpy.arange(num_data)`` (validation and
            test indices are passed as ``None``); otherwise no index
            arguments are passed.

    Returns:
        Tuple ``(train, val, test)``.
    """
    name_policy = _CacheNamePolicy(method, labels, prefix, num_data=num_data)
    train_file = name_policy.get_train_file_path()
    val_file = name_policy.get_val_file_path()
    test_file = name_policy.get_test_file_path()

    train = val = test = None
    print()
    if os.path.exists(name_policy.cache_dir):
        print('load from cache {}'.format(name_policy.cache_dir))
        train = NumpyTupleDataset.load(train_file)
        val = NumpyTupleDataset.load(val_file)
        test = NumpyTupleDataset.load(test_file)

    # Cache hit: every split was loaded, nothing to rebuild.
    if train is not None and val is not None and test is not None:
        return train, val, test

    print('preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    index_kwargs = {}
    if num_data >= 0:
        # Use `num_data` examples for train
        index_kwargs = {
            'train_target_index': numpy.arange(num_data),
            'val_target_index': None,
            'test_target_index': None,
        }
    train, val, test = D.get_tox21(preprocessor, labels=labels, **index_kwargs)

    # Cache dataset
    name_policy.create_cache_directory()
    for file_path, split in ((train_file, train),
                             (val_file, val),
                             (test_file, test)):
        NumpyTupleDataset.save(file_path, split)
    return train, val, test
Exemple #3
0
def load_dataset(method, labels, prefix='input'):
    """Return the Tox21 splits and their SMILES, backed by an on-disk cache.

    A ``method`` name containing ``'nfp'`` or ``'ggnn'`` is first collapsed
    to that base name. A ``_CacheNamePolicy`` then supplies the cache
    directory, the per-split dataset paths and the SMILES path. When the
    cache directory exists, the datasets and SMILES are loaded from it and,
    if all three datasets are available, returned as loaded. Otherwise the
    data is rebuilt with ``D.get_tox21(..., return_smiles=True)``, saved to
    the cache and returned.

    When ``labels == 'pyridine'`` the dataset is fetched with
    ``labels=None`` and each split's trailing feature is replaced by the
    result of ``hassubst(mol, smart=PYRIDINE_SMILES)`` for every molecule.

    Args:
        method: Preprocessing method name; after normalisation it is used
            as a key into ``preprocess_method_dict``.
        labels: Passed to ``D.get_tox21`` as ``labels``, except for the
            special value ``'pyridine'`` described above.
        prefix: Passed through to ``_CacheNamePolicy``.

    Returns:
        Tuple ``(train, val, test, train_smiles, val_smiles, test_smiles)``.
        On the rebuild path the SMILES are converted with ``numpy.array``
        before being saved and returned.
    """
    # Collapse variant names onto their base method
    # (to deal with nfpdrop / ggnndrop).
    for base_method in ('nfp', 'ggnn'):
        if base_method in method:
            method = base_method

    cache = _CacheNamePolicy(method, labels, prefix)
    train_path = cache.get_train_file_path()
    val_path = cache.get_val_file_path()
    test_path = cache.get_test_file_path()
    smiles_path = cache.get_smiles_path()

    train = val = test = None
    train_smiles = val_smiles = test_smiles = None
    print()
    if os.path.exists(cache.cache_dir):
        print('load from cache {}'.format(cache.cache_dir))
        train, val, test = (
            NumpyTupleDataset.load(path)
            for path in (train_path, val_path, test_path)
        )
        train_smiles, val_smiles, test_smiles = utils.load_npz(smiles_path)

    # Cache hit: all three datasets were loaded, return them untouched.
    if train is not None and val is not None and test is not None:
        return train, val, test, train_smiles, val_smiles, test_smiles

    print('preprocessing dataset...')
    preprocessor = preprocess_method_dict[method]()
    if labels == 'pyridine':
        (train, val, test,
         train_smiles, val_smiles, test_smiles) = D.get_tox21(
            preprocessor, labels=None, return_smiles=True)
        print('converting label into pyridine...')
        # --- Pyridine = 1 ---
        flag_lists = [
            [hassubst(Chem.MolFromSmiles(smiles), smart=PYRIDINE_SMILES)
             for smiles in tqdm(split_smiles)]
            for split_smiles in (train_smiles, val_smiles, test_smiles)
        ]
        # `[:, None]` appends a trailing axis to each label array.
        train_flags, val_flags, test_flags = (
            numpy.array(flags)[:, None] for flags in flag_lists
        )
        print('train positive/negative',
              numpy.sum(train_flags == 1), numpy.sum(train_flags == 0))
        # Keep everything but the last feature and append the new label.
        # NOTE(review): presumably the dropped feature is the original
        # label column -- confirm against D.get_tox21.
        train = NumpyTupleDataset(*train.features[:, :-1], train_flags)
        val = NumpyTupleDataset(*val.features[:, :-1], val_flags)
        test = NumpyTupleDataset(*test.features[:, :-1], test_flags)
    else:
        (train, val, test,
         train_smiles, val_smiles, test_smiles) = D.get_tox21(
            preprocessor, labels=labels, return_smiles=True)

    # Cache dataset
    cache.create_cache_directory()
    for path, split in ((train_path, train),
                        (val_path, val),
                        (test_path, test)):
        NumpyTupleDataset.save(path, split)
    train_smiles, val_smiles, test_smiles = (
        numpy.array(split_smiles)
        for split_smiles in (train_smiles, val_smiles, test_smiles)
    )
    utils.save_npz(smiles_path, (train_smiles, val_smiles, test_smiles))
    return train, val, test, train_smiles, val_smiles, test_smiles
def load_dataset(method, labels, prefix='input'):
    """Fetch the Tox21 train/val/test splits, caching them on disk.

    ``_CacheNamePolicy`` determines the cache directory and the file path
    of each split. A blank line is printed first. If the cache directory
    exists the splits are loaded from it; if any of them is ``None``
    afterwards (or the directory is absent) the dataset is rebuilt via
    ``D.get_tox21`` and stored in the cache.

    Args:
        method: Key into ``preprocess_method_dict``; the looked-up value is
            called with no arguments to create the preprocessor.
        labels: Passed through to ``D.get_tox21`` as ``labels``.
        prefix: Passed through to ``_CacheNamePolicy``.

    Returns:
        Tuple ``(train, val, test)``.
    """
    name_policy = _CacheNamePolicy(method, labels, prefix)
    split_paths = [
        name_policy.get_train_file_path(),
        name_policy.get_val_file_path(),
        name_policy.get_test_file_path(),
    ]

    loaded = [None, None, None]
    print()
    if os.path.exists(name_policy.cache_dir):
        print('load from cache {}'.format(name_policy.cache_dir))
        loaded = [NumpyTupleDataset.load(path) for path in split_paths]

    if any(split is None for split in loaded):
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        train, val, test = D.get_tox21(preprocessor, labels=labels)
        loaded = [train, val, test]
        # Cache dataset
        name_policy.create_cache_directory()
        for path, split in zip(split_paths, loaded):
            NumpyTupleDataset.save(path, split)

    train, val, test = loaded
    return train, val, test