Code example #1
  def random_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API."""
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizer = CircularFingerprint(size=1024)

    input_file = os.path.join(self.current_dir, input_file)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")

    dataset = loader.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
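
The test above exercises the older directory-based DeepChem API (separate train/valid/test directories). As a quick orientation, a minimal sketch of the same 8/1/1 RandomSplitter split against a recent DeepChem release looks roughly like this; the dataset is synthetic and purely illustrative:

import numpy as np
import deepchem as dc

# Ten synthetic samples with 1024-dimensional features and one regression target.
X = np.random.rand(10, 1024)
y = np.random.rand(10, 1)
dataset = dc.data.NumpyDataset(X=X, y=y)

splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
print(len(train), len(valid), len(test))  # expected: 8 1 1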
Code example #2
File: read_data.py Project: valence-discovery/lapool
def _load_dense_dataset(dataset,
                        valid_size=0.1,
                        test_size=0.1,
                        min_size=0,
                        max_size=None,
                        **kwargs):

    train_size = 1.0 - (test_size + valid_size)
    graphs = _read_graphfile(dataname=dataset,
                             min_nodes=min_size,
                             max_nodes=max_size)
    labels = []
    for G in graphs:
        for u in G.nodes():
            if G.nodes[u].get("feat") is None:
                # fall back to node label if node attributes are not found
                G.nodes[u]['feat'] = np.array(G.nodes[u]['label'])
        labels.append(G.graph['label'])
    n_tasks = len(set(labels))
    labels = np.asarray(labels)
    dataset = NumpyDataset(graphs, y=labels, n_tasks=n_tasks)
    splitter = RandomSplitter()
    # splits.RandomStratifiedSplitter()
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         frac_train=train_size,
                                                         frac_valid=valid_size,
                                                         frac_test=test_size)

    datasets = []
    for dt in (train, valid, test):
        datasets.append(
            NetworkXGraphDataset(dt.X, dt.y, w=None, pad_to=max_size))

    in_size = datasets[0].X[0].shape[-1]
    return datasets, in_size, n_tasks
Code example #3
File: test_splitter.py Project: Justin318/deepchem
    def test_singletask_random_k_fold_split(self):
        """
    Test singletask RandomSplitter class.
    """
        solubility_dataset = self.load_solubility_data()
        random_splitter = RandomSplitter()
        ids_set = set(solubility_dataset.ids)

        K = 5
        fold_dirs = [tempfile.mkdtemp() for i in range(K)]
        fold_datasets = random_splitter.k_fold_split(solubility_dataset,
                                                     fold_dirs)
        for fold in range(K):
            fold_dataset = fold_datasets[fold]
            # Verify lengths is 10/k == 2
            assert len(fold_dataset) == 2
            # Verify that compounds in this fold are subset of original compounds
            fold_ids_set = set(fold_dataset.ids)
            assert fold_ids_set.issubset(ids_set)
            # Verify that no two folds have overlapping compounds.
            for other_fold in range(K):
                if fold == other_fold:
                    continue
                other_fold_dataset = fold_datasets[other_fold]
                other_fold_ids_set = set(other_fold_dataset.ids)
                assert fold_ids_set.isdisjoint(other_fold_ids_set)

        merge_dir = tempfile.mkdtemp()
        merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
        assert len(merged_dataset) == len(solubility_dataset)
        assert sorted(merged_dataset.ids) == (sorted(solubility_dataset.ids))
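
Note that this test uses the old k_fold_split signature, which takes a list of fold directories and returns one dataset per fold. In more recent DeepChem releases, k_fold_split(dataset, k) instead returns k (train, cv) pairs; a rough sketch with synthetic data, for illustration only:

import numpy as np
import deepchem as dc

dataset = dc.data.NumpyDataset(X=np.random.rand(10, 16), y=np.random.rand(10, 1))
splitter = dc.splits.RandomSplitter()
# Each element of the returned list is a (train, cv) pair rather than a single fold.
for train, cv in splitter.k_fold_split(dataset, 5):
    print(len(train), len(cv))  # expected: 8 2 for each of the five folds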
Code example #4
    def random_test_train_valid_test_split_from_sdf(self):
        """Test of singletask CoulombMatrixEig regression on .sdf file."""
        splittype = "random"
        input_transforms = []
        output_transforms = ["normalize"]
        model_params = {}
        tasks = ["atomization_energy"]
        task_type = "regression"
        task_types = {task: task_type for task in tasks}
        current_dir = os.path.dirname(os.path.abspath(__file__))
        input_file = os.path.join(current_dir, "data/water.sdf")

        featurizer = CoulombMatrixEig(6, remove_hydrogens=False)

        input_file = os.path.join(self.current_dir, input_file)
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            mol_field="mol",
                            featurizer=featurizer,
                            verbosity="low")

        dataset = loader.featurize(input_file, self.data_dir)

        # Splits featurized samples into train/test
        splitter = RandomSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)
        assert len(train_dataset) == 8
        assert len(valid_dataset) == 1
        assert len(test_dataset) == 1
Code example #5
File: test_sdf_reader.py Project: rbharath/deepchem
  def random_test_train_valid_test_split_from_sdf(self):
    """Test of singletask CoulombMatrixEig regression on .sdf file."""
    splittype = "random"
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["atomization_energy"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, "data/water.sdf")

    featurizer = CoulombMatrixEig(6, remove_hydrogens=False)

    input_file = os.path.join(self.current_dir, input_file)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        mol_field="mol",
                        featurizer=featurizer,
                        verbosity="low")

    dataset = loader.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
Code example #6
File: test_splitter.py Project: Justin318/deepchem
 def test_multitask_random_split(self):
     """
     Test multitask RandomSplitter class.
     """
     multitask_dataset = self.load_multitask_data()
     random_splitter = RandomSplitter()
     train_data, valid_data, test_data = \
         random_splitter.train_valid_test_split(
             multitask_dataset,
             self.train_dir, self.valid_dir, self.test_dir,
             frac_train=0.8, frac_valid=0.1, frac_test=0.1)
     assert len(train_data) == 8
     assert len(valid_data) == 1
     assert len(test_data) == 1
Code example #7
File: test_splitter.py Project: rbharath/deepchem
 def test_multitask_random_split(self):
   """
   Test multitask RandomSplitter class.
   """
   multitask_dataset = self.load_multitask_data()
   random_splitter = RandomSplitter()
   train_data, valid_data, test_data = \
       random_splitter.train_valid_test_split(
           multitask_dataset,
           self.train_dir, self.valid_dir, self.test_dir,
           frac_train=0.8, frac_valid=0.1, frac_test=0.1)
   assert len(train_data) == 8
   assert len(valid_data) == 1
   assert len(test_data) == 1
Code example #8
File: test_splitter.py Project: Justin318/deepchem
    def test_singletask_random_split(self):
        """
    Test singletask RandomSplitter class.
    """
        solubility_dataset = self.load_solubility_data()
        random_splitter = RandomSplitter()
        train_data, valid_data, test_data = \
            random_splitter.train_valid_test_split(
                solubility_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1)
        assert len(train_data) == 8
        assert len(valid_data) == 1
        assert len(test_data) == 1

        merge_dir = tempfile.mkdtemp()
        merged_dataset = DiskDataset.merge(merge_dir,
                                           [train_data, valid_data, test_data])
        assert sorted(merged_dataset.ids) == (sorted(solubility_dataset.ids))
Code example #9
File: read_data.py Project: valence-discovery/lapool
def _load_mol_dataset(dataset_file,
                      tasks,
                      split="stratified",
                      test_size=0.1,
                      valid_size=0.1,
                      min_size=0,
                      max_size=None,
                      **kwargs):

    train_size = 1.0 - (test_size + valid_size)
    featurizer = RawFeaturizer()
    loader = CSVLoader(tasks=tasks,
                       smiles_field="smiles",
                       featurizer=featurizer,
                       verbose=False,
                       log_every_n=10000)
    dataset = loader.featurize(dataset_file)

    splitters = {
        'index': IndexSplitter(),
        'random': RandomSplitter(),
        'scaffold': ScaffoldSplitter(),
        'butina': ButinaSplitter(),
        'stratified': RandomStratifiedSplitter()
    }

    splitter = splitters[split]
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         frac_train=train_size,
                                                         frac_valid=valid_size,
                                                         frac_test=test_size)

    # compute data balance information on train
    balancer = BalancingTransformer(transform_w=True, dataset=train)
    train = balancer.transform(train)
    valid = balancer.transform(valid)
    test = balancer.transform(test)
    transformer = GraphTransformer(mol_size=[min_size, max_size], **kwargs)
    datasets = []
    for dt in (train, valid, test):
        X, ids = transformer(dt.ids, dtype=np.float32, ignore_errors=False)
        y = dt.y[ids, :]
        w = dt.w[ids, :]
        raw_mols = dt.X[ids]
        datasets.append(MolDataset(X, y, raw_mols, w=w, pad_to=max_size))

    in_size = X[0][-1].shape[-1]
    out_size = 1 if len(y.shape) == 1 else y.shape[-1]
    return datasets, in_size, out_size
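
A caveat when adapting this snippet to a current DeepChem install: recent releases dropped the transform_w keyword from BalancingTransformer, and the constructor now takes only the dataset used to compute the class-balance weights. A hedged sketch of the replacement call, reusing the train/valid/test datasets produced by the splitter above:

import deepchem as dc

# Newer-style construction (verify against the installed DeepChem version); the
# transformer still reweights samples so classes are balanced per task.
balancer = dc.trans.BalancingTransformer(dataset=train)
train = balancer.transform(train)
valid = balancer.transform(valid)
test = balancer.transform(test)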
Code example #10
def partition_train_val_test(smiles, dataset):
    """
    Split a molecule dataset (SMILES) with deepchem built-ins
    """

    ds = MockDataset(smiles)

    if dataset == "BBBP":
        splitter = ScaffoldSplitter()
    elif dataset == "BACE":
        splitter = ScaffoldSplitter()
    elif dataset == "TOX21":
        splitter = RandomSplitter()

    train_inds, val_inds, test_inds = splitter.split(ds)

    return {
        "train_inds": train_inds,
        "val_inds": val_inds,
        "test_inds": test_inds
    }
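
MockDataset is defined elsewhere in that project. DeepChem's splitters only need len(dataset) and, for ScaffoldSplitter, a dataset.ids array holding the SMILES strings, so a plain NumpyDataset works as a minimal stand-in. The sketch below is hypothetical and not the project's actual MockDataset:

import numpy as np
import deepchem as dc

smiles = ["CCO", "c1ccccc1", "CC(=O)O", "CCN", "c1ccncc1"]
# Dummy features; only the ids (SMILES) matter for scaffold-based splitting.
ds = dc.data.NumpyDataset(X=np.zeros((len(smiles), 1)), ids=np.array(smiles))

train_inds, val_inds, test_inds = dc.splits.ScaffoldSplitter().split(ds)
print(train_inds, val_inds, test_inds)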
Code example #11
Load sider models now
"""

base_sider_data_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_data"

sider_tasks, sider_dataset, sider_transformers = load_sider(
    base_sider_data_dir, reload=reload)

base_sider_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_analysis"

sider_train_dir = os.path.join(base_sider_dir, "train_dataset")
sider_valid_dir = os.path.join(base_sider_dir, "valid_dataset")
sider_test_dir = os.path.join(base_sider_dir, "test_dataset")
sider_model_dir = os.path.join(base_sider_dir, "model")

sider_splitter = RandomSplitter()
sider_train_dataset, sider_valid_dataset, sider_test_dataset = sider_splitter.train_valid_test_split(
    sider_dataset, sider_train_dir, sider_valid_dir, sider_test_dir)

# Fit Logistic Regression models
sider_task_types = {task: "classification" for task in sider_tasks}

params_dict = {
    "batch_size": None,
    "data_shape": sider_train_dataset.get_data_shape(),
}

sider_model = SingletaskToMultitask(sider_tasks,
                                    sider_task_types,
                                    params_dict,
                                    sider_model_dir,
Code example #12
force_transform = False

base_dir = "/tmp/nci_rf"
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
test_dir = os.path.join(base_dir, "test_dataset")
model_dir = os.path.join(base_dir, "model")
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
os.makedirs(base_dir)

nci_tasks, nci_dataset, transformers = load_nci(
    base_dir, reload=reload, force_transform=force_transform)

print("About to perform train/valid/test split.")
splitter = RandomSplitter(verbosity=verbosity)
print("Performing new split.")
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    nci_dataset, train_dir, valid_dir, test_dir)

classification_metric = Metric(metrics.roc_auc_score,
                               np.mean,
                               verbosity=verbosity,
                               mode="classification")


def model_builder(model_dir):
    sklearn_model = RandomForestRegressor(n_estimators=500)
    return SklearnModel(sklearn_model, model_dir)

Code example #13
def get_model(model_name: str):
    if model_name == "ECFP":
        model = model_obj(len(wang_tasks),
                          wang_train.get_data_shape()[0],
                          batch_size=50,
                          tensorboard_log_frequency=25)
    else:
        model = model_obj(len(wang_tasks),
                          batch_size=50,
                          mode='regression',
                          tensorboard_log_frequency=25)
    return model


splitter_dict = {
    "Random": RandomSplitter(),
    #"Scaffold": ScaffoldSplitterNew(),
    #"MolecularWeight": MolecularWeightSplitterNew(),
    #"Butina": ButinaSplitterNew(),
}

if __name__ == "__main__":
    results = {}

    for splitter_name, splitter in splitter_dict.items():
        logging.info(f"Generating scaffolds with {splitter_name}")
        results[splitter_name] = {}
        for model_name, model_obj in model_dict.items():
            logging.info(f"Using {model_name} as a model")
            results[splitter_name][model_name] = {}
            featurizer = model_name
Code example #14
File: test_splitter.py Project: rbharath/deepchem
  def test_singletask_random_split(self):
    """
    Test singletask RandomSplitter class.
    """
    solubility_dataset = self.load_solubility_data()
    random_splitter = RandomSplitter()
    train_data, valid_data, test_data = \
        random_splitter.train_valid_test_split(
            solubility_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_singletask_scaffold_split(self):
    """
    Test singletask ScaffoldSplitter class.
    """
    solubility_dataset = self.load_solubility_data()
    scaffold_splitter = ScaffoldSplitter()
    train_data, valid_data, test_data = \
        scaffold_splitter.train_valid_test_split(
            solubility_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_multitask_random_split(self):
    """
    Test multitask RandomSplitter class.
    """
    multitask_dataset = self.load_multitask_data()
    random_splitter = RandomSplitter()
    train_data, valid_data, test_data = \
        random_splitter.train_valid_test_split(
            multitask_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_multitask_scaffold_split(self):
    """
    Test multitask ScaffoldSplitter class.
    """
    multitask_dataset = self.load_multitask_data()
    scaffold_splitter = ScaffoldSplitter()
    train_data, valid_data, test_data = \
        scaffold_splitter.train_valid_test_split(
            multitask_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_stratified_multitask_split(self):
    """
    Test multitask StratifiedSplitter class
    """
    # ensure sparse dataset is actually sparse

    sparse_dataset = self.load_sparse_multitask_dataset()

    X, y, w, ids = sparse_dataset.to_numpy()

    """
    sparsity is determined by number of w weights that are 0 for a given task
    structure of w np array is such that each row corresponds to a sample -- e.g., analyze third column for third
    sparse task
    """
    frac_train = 0.5
    cutoff = int(frac_train * w.shape[0])
    w = w[:cutoff, :]
    sparse_flag = False

    col_index = 0
    for col in w.T:
      if not np.any(col): #check to see if any columns are all zero
        sparse_flag = True
        break
      col_index+=1
    if not sparse_flag:
      print("Test dataset isn't sparse -- test failed")
    else:
      print("Column %d is sparse -- expected" % col_index)
    assert sparse_flag

    stratified_splitter = StratifiedSplitter()
    train_data, valid_data, test_data = \
        stratified_splitter.train_valid_test_split(
            sparse_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1
        )

    datasets = [train_data, valid_data, test_data]
    dataset_index = 0
    for dataset in datasets:
      X, y, w, ids = dataset.to_numpy()
      # verify that each task in the train dataset has some hits
      for col in w.T:
          if not np.any(col):
              print("Fail -- one column doesn't have results")
              if dataset_index == 0:
                  print("train_data failed")
              elif dataset_index == 1:
                  print("valid_data failed")
              elif dataset_index == 2:
                  print("test_data failed")
              assert np.any(col)
      if dataset_index == 0:
          print("train_data passed")
      elif dataset_index == 1:
          print("valid_data passed")
      elif dataset_index == 2:
          print("test_data passed")
      dataset_index+=1
    print("end of stratified test")
    assert 1 == 1
Code example #15
File: sweet.py Project: apappu97/deepchem
Load sider models now
"""

base_sider_data_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_data"

sider_tasks, sider_dataset, sider_transformers = load_sider(
    base_sider_data_dir, reload=reload)

base_sider_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_analysis"

sider_train_dir = os.path.join(base_sider_dir, "train_dataset")
sider_valid_dir = os.path.join(base_sider_dir, "valid_dataset")
sider_test_dir = os.path.join(base_sider_dir, "test_dataset")
sider_model_dir = os.path.join(base_sider_dir, "model")

sider_splitter = RandomSplitter()
sider_train_dataset, sider_valid_dataset, sider_test_dataset = sider_splitter.train_valid_test_split(
  sider_dataset, sider_train_dir, sider_valid_dir, sider_test_dir)

# Fit Logistic Regression models
sider_task_types = {task: "classification" for task in sider_tasks}

params_dict = {
  "batch_size": None,
  "data_shape": sider_train_dataset.get_data_shape(),
}

sider_model = SingletaskToMultitask(sider_tasks, sider_task_types, params_dict, sider_model_dir,
                              model_builder, verbosity=verbosity)
sider_model.reload()
Code example #16
File: nci_rf.py Project: apappu97/deepchem
force_transform = False 

base_dir = "/tmp/nci_rf"
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
test_dir = os.path.join(base_dir, "test_dataset")
model_dir = os.path.join(base_dir, "model")
if os.path.exists(base_dir):
  shutil.rmtree(base_dir)
os.makedirs(base_dir)

nci_tasks, nci_dataset, transformers = load_nci(
    base_dir, reload=reload, force_transform=force_transform)

print("About to perform train/valid/test split.")
splitter = RandomSplitter(verbosity=verbosity)
print("Performing new split.")
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    nci_dataset, train_dir, valid_dir, test_dir)

classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
def model_builder(model_dir):
  sklearn_model = RandomForestRegressor(n_estimators=500)
  return SklearnModel(sklearn_model, model_dir)
model = SingletaskToMultitask(nci_tasks, model_builder, model_dir)

# Fit trained model
model.fit(train_dataset)
model.save()
Code example #17
    def test_singletask_random_split(self):
        """
    Test singletask RandomSplitter class.
    """
        solubility_dataset = self.load_solubility_data()
        random_splitter = RandomSplitter()
        train_data, valid_data, test_data = \
            random_splitter.train_valid_test_split(
                solubility_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1)
        assert len(train_data) == 8
        assert len(valid_data) == 1
        assert len(test_data) == 1

    def test_singletask_scaffold_split(self):
        """
        Test singletask ScaffoldSplitter class.
        """
        solubility_dataset = self.load_solubility_data()
        scaffold_splitter = ScaffoldSplitter()
        train_data, valid_data, test_data = \
            scaffold_splitter.train_valid_test_split(
                solubility_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1)
        assert len(train_data) == 8
        assert len(valid_data) == 1
        assert len(test_data) == 1

    def test_multitask_random_split(self):
        """
        Test multitask RandomSplitter class.
        """
        multitask_dataset = self.load_multitask_data()
        random_splitter = RandomSplitter()
        train_data, valid_data, test_data = \
            random_splitter.train_valid_test_split(
                multitask_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1)
        assert len(train_data) == 8
        assert len(valid_data) == 1
        assert len(test_data) == 1

    def test_multitask_scaffold_split(self):
        """
        Test multitask ScaffoldSplitter class.
        """
        multitask_dataset = self.load_multitask_data()
        scaffold_splitter = ScaffoldSplitter()
        train_data, valid_data, test_data = \
            scaffold_splitter.train_valid_test_split(
                multitask_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1)
        assert len(train_data) == 8
        assert len(valid_data) == 1
        assert len(test_data) == 1

    def test_stratified_multitask_split(self):
        """
        Test multitask StratifiedSplitter class
        """
        # ensure sparse dataset is actually sparse

        sparse_dataset = self.load_sparse_multitask_dataset()

        X, y, w, ids = sparse_dataset.to_numpy()
        """
        sparsity is determined by number of w weights that are 0 for a given task
        structure of w np array is such that each row corresponds to a sample -- e.g., analyze third column for third
        sparse task
        """
        frac_train = 0.5
        cutoff = int(frac_train * w.shape[0])
        w = w[:cutoff, :]
        sparse_flag = False

        col_index = 0
        for col in w.T:
            if not np.any(col):  #check to see if any columns are all zero
                sparse_flag = True
                break
            col_index += 1
        if not sparse_flag:
            print("Test dataset isn't sparse -- test failed")
        else:
            print("Column %d is sparse -- expected" % col_index)
        assert sparse_flag

        stratified_splitter = StratifiedSplitter()
        train_data, valid_data, test_data = \
            stratified_splitter.train_valid_test_split(
                sparse_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1
            )

        datasets = [train_data, valid_data, test_data]
        dataset_index = 0
        for dataset in datasets:
            X, y, w, ids = dataset.to_numpy()
            # verify that each task in the train dataset has some hits
            for col in w.T:
                if not np.any(col):
                    print("Fail -- one column doesn't have results")
                    if dataset_index == 0:
                        print("train_data failed")
                    elif dataset_index == 1:
                        print("valid_data failed")
                    elif dataset_index == 2:
                        print("test_data failed")
                    assert np.any(col)
            if dataset_index == 0:
                print("train_data passed")
            elif dataset_index == 1:
                print("valid_data passed")
            elif dataset_index == 2:
                print("test_data passed")
            dataset_index += 1
        print("end of stratified test")
        assert 1 == 1