Example #1
def _load_dense_dataset(dataset,
                        valid_size=0.1,
                        test_size=0.1,
                        min_size=0,
                        max_size=None,
                        **kwargs):

    train_size = 1.0 - (test_size + valid_size)
    graphs = _read_graphfile(dataname=dataset,
                             min_nodes=min_size,
                             max_nodes=max_size)
    labels = []
    for G in graphs:
        for u in G.nodes():
            if G.nodes[u].get("feat") is None:
                # fall back to node label if node attributes are not found
                G.nodes[u]['feat'] = np.array(G.nodes[u]['label'])
        labels.append(G.graph['label'])
    n_tasks = len(set(labels))
    labels = np.asarray(labels)
    dataset = NumpyDataset(graphs, y=labels, n_tasks=n_tasks)
    splitter = RandomSplitter()
    # Alternative: splits.RandomStratifiedSplitter() for a stratified split
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         frac_train=train_size,
                                                         frac_valid=valid_size,
                                                         frac_test=test_size)

    datasets = []
    for dt in (train, valid, test):
        datasets.append(
            NetworkXGraphDataset(dt.X, dt.y, w=None, pad_to=max_size))

    in_size = datasets[0].X[0].shape[-1]
    return datasets, in_size, n_tasks
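The splitter call above uses DeepChem's generic Splitter API. A minimal, self-contained sketch of the same three-way split on a toy dataset (assuming a recent DeepChem release, where NumpyDataset and RandomSplitter live under dc.data and dc.splits):

import numpy as np
import deepchem as dc

# Ten random samples, four features, one regression target.
X = np.random.rand(10, 4)
y = np.random.rand(10, 1)
dataset = dc.data.NumpyDataset(X, y)

splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
print(len(train), len(valid), len(test))  # 8 1 1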
Example #2
  def random_test_train_valid_test_split_from_sdf(self):
    """Test of singletask CoulombMatrixEig regression on .sdf file."""
    splittype = "random"
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["atomization_energy"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, "data/water.sdf")

    featurizer = CoulombMatrixEig(6, remove_hydrogens=False)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        mol_field="mol",
                        featurizer=featurizer,
                        verbosity="low")

    dataset = loader.featurize(input_file, self.data_dir)

    # Split featurized samples into train/valid/test sets
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
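This test exercises an older DeepChem API (a DataLoader with smiles_field/mol_field and directory arguments to the splitter). A rough modern equivalent, assuming current releases expose SDFLoader under dc.data and CoulombMatrixEig under dc.feat:

import deepchem as dc

featurizer = dc.feat.CoulombMatrixEig(max_atoms=6, remove_hydrogens=False)
loader = dc.data.SDFLoader(tasks=["atomization_energy"], featurizer=featurizer)
dataset = loader.create_dataset("data/water.sdf")  # illustrative path

splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(dataset)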
Example #3
  def random_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API."""
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizer = CircularFingerprint(size=1024)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")

    dataset = loader.featurize(input_file, self.data_dir)

    # Split featurized samples into train/valid/test sets
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
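The featurization step in isolation, as a minimal sketch (assuming a recent DeepChem where CircularFingerprint sits under dc.feat and featurize accepts SMILES strings directly):

import deepchem as dc

featurizer = dc.feat.CircularFingerprint(size=1024)
# Featurize two SMILES strings: ethanol and benzene.
features = featurizer.featurize(["CCO", "c1ccccc1"])
print(features.shape)  # (2, 1024)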
Example #4
 def test_multitask_random_split(self):
   """
   Test multitask RandomSplitter class.
   """
   multitask_dataset = self.load_multitask_data()
   random_splitter = RandomSplitter()
   train_data, valid_data, test_data = \
       random_splitter.train_valid_test_split(
           multitask_dataset,
           self.train_dir, self.valid_dir, self.test_dir,
           frac_train=0.8, frac_valid=0.1, frac_test=0.1)
   assert len(train_data) == 8
   assert len(valid_data) == 1
   assert len(test_data) == 1
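RandomSplitter permutes the dataset, so repeated runs produce different partitions. For a reproducible split, recent DeepChem releases accept a seed keyword; a hedged sketch reusing the names above:

train_data, valid_data, test_data = \
    random_splitter.train_valid_test_split(
        multitask_dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1,
        seed=42)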
Example #5
    def test_singletask_random_split(self):
        """
    Test singletask RandomSplitter class.
    """
        solubility_dataset = self.load_solubility_data()
        random_splitter = RandomSplitter()
        train_data, valid_data, test_data = \
            random_splitter.train_valid_test_split(
                solubility_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1)
        assert len(train_data) == 8
        assert len(valid_data) == 1
        assert len(test_data) == 1

        merge_dir = tempfile.mkdtemp()
        merged_dataset = DiskDataset.merge(merge_dir,
                                           [train_data, valid_data, test_data])
        assert sorted(merged_dataset.ids) == sorted(solubility_dataset.ids)
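Beyond checking that the merged ids match the originals, one can also assert that the three splits are disjoint. A small sketch using only the ids attribute already used above:

train_ids = set(train_data.ids)
valid_ids = set(valid_data.ids)
test_ids = set(test_data.ids)
assert train_ids.isdisjoint(valid_ids)
assert train_ids.isdisjoint(test_ids)
assert valid_ids.isdisjoint(test_ids)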
Example #6
base_dir = "/tmp/nci_rf"
train_dir = os.path.join(base_dir, "train_dataset")
valid_dir = os.path.join(base_dir, "valid_dataset")
test_dir = os.path.join(base_dir, "test_dataset")
model_dir = os.path.join(base_dir, "model")
if os.path.exists(base_dir):
  shutil.rmtree(base_dir)
os.makedirs(base_dir)

nci_tasks, nci_dataset, transformers = load_nci(
    base_dir, reload=reload, force_transform=force_transform)

print("About to perform train/valid/test split.")
splitter = RandomSplitter(verbosity=verbosity)
print("Performing new split.")
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    nci_dataset, train_dir, valid_dir, test_dir)

classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")


def model_builder(model_dir):
  sklearn_model = RandomForestRegressor(n_estimators=500)
  return SklearnModel(sklearn_model, model_dir)


model = SingletaskToMultitask(nci_tasks, model_builder, model_dir)

# Fit the model and save it
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([classification_metric])
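The excerpt scores the training set only; the natural continuation, mirroring the same Evaluator calls (a sketch, not part of the original script):

valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])
print("Train scores: %s" % str(train_scores))
print("Valid scores: %s" % str(valid_scores))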
Example #7
  def test_singletask_random_split(self):
    """
    Test singletask RandomSplitter class.
    """
    solubility_dataset = self.load_solubility_data()
    random_splitter = RandomSplitter()
    train_data, valid_data, test_data = \
        random_splitter.train_valid_test_split(
            solubility_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_singletask_scaffold_split(self):
    """
    Test singletask ScaffoldSplitter class.
    """
    solubility_dataset = self.load_solubility_data()
    scaffold_splitter = ScaffoldSplitter()
    train_data, valid_data, test_data = \
        scaffold_splitter.train_valid_test_split(
            solubility_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_multitask_random_split(self):
    """
    Test multitask RandomSplitter class.
    """
    multitask_dataset = self.load_multitask_data()
    random_splitter = RandomSplitter()
    train_data, valid_data, test_data = \
        random_splitter.train_valid_test_split(
            multitask_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_multitask_scaffold_split(self):
    """
    Test multitask ScaffoldSplitter class.
    """
    multitask_dataset = self.load_multitask_data()
    scaffold_splitter = ScaffoldSplitter()
    train_data, valid_data, test_data = \
        scaffold_splitter.train_valid_test_split(
            multitask_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    assert len(train_data) == 8
    assert len(valid_data) == 1
    assert len(test_data) == 1

  def test_stratified_multitask_split(self):
    """
    Test multitask StratifiedSplitter class.
    """
    # Ensure the sparse dataset is actually sparse.
    sparse_dataset = self.load_sparse_multitask_dataset()

    X, y, w, ids = sparse_dataset.to_numpy()

    # Sparsity is determined by the number of w weights that are 0 for a
    # given task. Each row of the w array corresponds to a sample and each
    # column to a task, so an all-zero column marks a sparse task.
    frac_train = 0.5
    cutoff = int(frac_train * w.shape[0])
    w = w[:cutoff, :]
    sparse_flag = False

    col_index = 0
    for col in w.T:
      if not np.any(col):  # Check whether this column is all zeros.
        sparse_flag = True
        break
      col_index += 1
    if not sparse_flag:
      print("Test dataset isn't sparse -- test failed")
    else:
      print("Column %d is sparse -- expected" % col_index)
    assert sparse_flag

    stratified_splitter = StratifiedSplitter()
    train_data, valid_data, test_data = \
        stratified_splitter.train_valid_test_split(
            sparse_dataset,
            self.train_dir, self.valid_dir, self.test_dir,
            frac_train=0.8, frac_valid=0.1, frac_test=0.1)

    datasets = [train_data, valid_data, test_data]
    dataset_index = 0
    for dataset in datasets:
      X, y, w, ids = dataset.to_numpy()
      # Verify that each task in this split has some hits.
      for col in w.T:
        if not np.any(col):
          print("Fail -- one column doesn't have results")
          if dataset_index == 0:
            print("train_data failed")
          elif dataset_index == 1:
            print("valid_data failed")
          elif dataset_index == 2:
            print("test_data failed")
          assert np.any(col)
      if dataset_index == 0:
        print("train_data passed")
      elif dataset_index == 1:
        print("valid_data passed")
      elif dataset_index == 2:
        print("test_data passed")
      dataset_index += 1
    print("end of stratified test")
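load_sparse_multitask_dataset is a project-internal test helper. A minimal sketch of the kind of sparse weight matrix this test expects, built directly with NumpyDataset (shapes and values are illustrative):

import numpy as np
import deepchem as dc

X = np.random.rand(10, 4)
y = np.zeros((10, 3))
w = np.ones((10, 3))
w[:, 2] = 0  # third task has no measurements: an all-zero weight column
sparse_dataset = dc.data.NumpyDataset(X, y, w)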
Example #8
"""

base_sider_data_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_data"

sider_tasks, sider_dataset, sider_transformers = load_sider(
    base_sider_data_dir, reload=reload)

base_sider_dir = "/home/apappu/deepchem-models/toxcast_models/sider/sider_analysis"

sider_train_dir = os.path.join(base_sider_dir, "train_dataset")
sider_valid_dir = os.path.join(base_sider_dir, "valid_dataset")
sider_test_dir = os.path.join(base_sider_dir, "test_dataset")
sider_model_dir = os.path.join(base_sider_dir, "model")

sider_splitter = RandomSplitter()
sider_train_dataset, sider_valid_dataset, sider_test_dataset = sider_splitter.train_valid_test_split(
    sider_dataset, sider_train_dir, sider_valid_dir, sider_test_dir)

# Fit Logistic Regression models
sider_task_types = {task: "classification" for task in sider_tasks}

params_dict = {
    "batch_size": None,
    "data_shape": sider_train_dataset.get_data_shape(),
}

sider_model = SingletaskToMultitask(sider_tasks, sider_task_types, params_dict,
                                    sider_model_dir, model_builder,
                                    verbosity=verbosity)
sider_model.reload()

"""
Load sweetlead dataset now. Pass in dataset object and appropriate transformers to predict functions
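The excerpt breaks off at the start of the sweetlead step. Going by the docstring, the follow-on would pass a featurized dataset and its transformers to the model's predict call; a hypothetical sketch (sweetlead_dataset and sweetlead_transformers are placeholder names, not defined in the original):

sweetlead_predictions = sider_model.predict(sweetlead_dataset, sweetlead_transformers)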