Example #1
0
  def _to_singletask(dataset, task_dirs):
    """Split a multitask dataset into one singletask dataset per task."""
    task_names = dataset.get_task_names()
    assert len(task_names) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbosity)
    metadata_by_task = {name: [] for name in task_names}
    for shard_idx, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_idx, dataset.verbosity)
      basename = "dataset-%d" % shard_idx
      for idx, name in enumerate(task_names):
        log("\tTask %s" % name, dataset.verbosity)
        # Datapoints with nonzero weight are the ones present for this task.
        present = w[:, idx] != 0
        X_present = X[present]
        n_present = X_present.shape[0]
        y_present = np.reshape(y[:, idx][present], (n_present, 1))
        w_present = np.reshape(w[:, idx][present], (n_present, 1))
        ids_present = ids[present]

        # Skip writing shards that contain no datapoints for this task.
        if X_present.size > 0:
          metadata_by_task[name].append(
            DiskDataset.write_data_to_disk(
                task_dirs[idx], basename, [name],
                X_present, y_present, w_present, ids_present))

    return [
        DiskDataset(data_dir=task_dirs[idx],
                metadata_rows=metadata_by_task[name],
                verbosity=dataset.verbosity)
        for (idx, name) in enumerate(task_names)]
Example #2
0
    def test_move_load(self):
        """Test that datasets can be moved and loaded."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        data_dir = os.path.join(self.base_dir, "data")
        moved_data_dir = os.path.join(self.base_dir, "moved_data")
        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        # Featurize a single-task solubility dataset into data_dir.
        loader = DataLoader(tasks=["log-solubility"],
                            smiles_field="smiles",
                            featurizer=CircularFingerprint(size=1024),
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir)

        # Snapshot the arrays before moving the dataset on disk.
        X = dataset.X
        y = dataset.y
        w = dataset.w
        ids = dataset.ids
        shutil.move(data_dir, moved_data_dir)

        # Reload from the new location.
        moved_dataset = DiskDataset(moved_data_dir, reload=True)

        # The moved dataset must contain exactly the same data.
        np.testing.assert_allclose(X, moved_dataset.X)
        np.testing.assert_allclose(y, moved_dataset.y)
        np.testing.assert_allclose(w, moved_dataset.w)
        np.testing.assert_array_equal(ids, moved_dataset.ids)
Example #3
0
  def test_samples_move(self):
    """Test that featurized samples can be moved and reloaded."""
    verbosity = "high"
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    dataset_file = os.path.join(self.current_dir, "example.csv")

    # Featurize the example dataset into data_dir.
    loader = DataLoader(tasks=["log-solubility"],
                        smiles_field="smiles",
                        featurizer=CircularFingerprint(size=1024),
                        verbosity=verbosity)
    featurized_dataset = loader.featurize(dataset_file, data_dir)
    original_length = len(featurized_dataset)

    # Relocate the dataset directory on disk.
    shutil.move(data_dir, moved_data_dir)

    # Reloading from the moved location must preserve the dataset size.
    reloaded = DiskDataset(data_dir=moved_data_dir, reload=True)
    assert len(reloaded) == original_length
Example #4
0
  def test_power_X_transformer(self):
    """Test Power transformer on Gaussian normal dataset."""
    gaussian_dataset = self.load_gaussian_cdf_data()
    powers = [1, 2, 0.5]
    power_transformer = PowerTransformer(transform_X=True, powers=powers)
    # Capture the untransformed arrays for comparison.
    X = gaussian_dataset.X
    y = gaussian_dataset.y
    w = gaussian_dataset.w
    ids = gaussian_dataset.ids
    power_transformer.transform(gaussian_dataset)
    gaussian_dataset = DiskDataset(
        data_dir=gaussian_dataset.data_dir, reload=True)
    X_t = gaussian_dataset.X
    y_t = gaussian_dataset.y
    w_t = gaussian_dataset.w
    ids_t = gaussian_dataset.ids

    # ids must be untouched by the transform.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # y and w are untouched since only X is transformed.
    np.testing.assert_allclose(y, y_t)
    np.testing.assert_allclose(w, w_t)
    # Transformed X stacks the requested powers column-wise.
    np.testing.assert_allclose(X, X_t[:, :2])
    np.testing.assert_allclose(np.power(X, 2), X_t[:, 2:4])
    np.testing.assert_allclose(np.power(X, 0.5), X_t[:, 4:])
Example #5
0
  def test_cdf_X_transformer(self):
    """Test CDF transformer on Gaussian normal dataset.

    After the CDF transform, each column of X, when sorted, should form a
    uniform grid on [0, 1] with `bins` points.
    """
    target = np.array(np.transpose(np.linspace(0., 1., 1001)))
    target = np.transpose(np.array(np.append([target], [target], axis=0)))
    gaussian_dataset = self.load_gaussian_cdf_data()
    bins = 1001
    cdf_transformer = CDFTransformer(transform_X=True, bins=bins)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y,
                    gaussian_dataset.w, gaussian_dataset.ids)
    cdf_transformer.transform(gaussian_dataset, bins=bins)
    gaussian_dataset = DiskDataset(
        data_dir=gaussian_dataset.data_dir, reload=True)
    X_t, y_t, w_t, ids_t = (gaussian_dataset.X, gaussian_dataset.y,
                            gaussian_dataset.w, gaussian_dataset.ids)

    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check y is unchanged since this is an X transformer.
    np.testing.assert_allclose(y, y_t)
    # Check w is unchanged since this is an X transformer.
    np.testing.assert_allclose(w, w_t)
    # Check X is now holding the proper values when sorted.
    # (Renamed from `sorted`, which shadowed the builtin.)
    sorted_X_t = np.sort(X_t, axis=0)
    np.testing.assert_allclose(sorted_X_t, target)
Example #6
0
    def test_multiload(self):
        """Check can re-use featurization for multiple task selections.

    TODO(rbharath): This test seems silly after the recent round of
                    refactoring. Can it be removed?
    """
        # Fix the random seed so the test is reproducible.
        np.random.seed(123)

        # Set some global variables up top
        reload = True
        verbosity = "high"

        current_dir = os.path.dirname(os.path.realpath(__file__))
        # Directory to store the featurized dataset.
        # (Removed unused train/valid/test/model directory locals.)
        data_dir = os.path.join(self.base_dir, "dataset")

        # Load dataset
        print("About to load dataset.")
        dataset_file = os.path.join(
            current_dir, "../../models/tests/multitask_example.csv")
        dataset = load_from_disk(dataset_file)
        print("Columns of dataset: %s" % str(dataset.columns.values))
        print("Number of examples in dataset: %s" % str(dataset.shape[0]))

        # Featurize the multitask dataset with circular fingerprints.
        print("About to featurize dataset.")
        featurizer = CircularFingerprint(size=1024)
        all_tasks = ["task%d" % i for i in range(17)]

        ####### Do featurization
        loader = DataLoader(tasks=all_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir)

        # Multitask arrays, pulled once.
        X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y,
                                                dataset.w, dataset.ids)

        ####### Do singletask load
        # Reload the featurized dataset once; the previous version
        # re-loaded the identical dataset on every loop iteration.
        reloaded = DiskDataset(data_dir, verbosity=verbosity, reload=reload)
        y_all, w_all = reloaded.y, reloaded.w

        y_tasks, w_tasks = [], []
        for ind, task in enumerate(all_tasks):
            print("Processing task %s" % task)
            y_tasks.append(y_all[:, ind])
            w_tasks.append(w_all[:, ind])

        ################## Do comparison
        # Each per-task column of the reloaded dataset must match the
        # corresponding column of the original multitask arrays.
        for ind, task in enumerate(all_tasks):
            y_multi_task = y_multi[:, ind]
            w_multi_task = w_multi[:, ind]

            y_task = y_tasks[ind]
            w_task = w_tasks[ind]

            np.testing.assert_allclose(y_multi_task.flatten(),
                                       y_task.flatten())
            np.testing.assert_allclose(w_multi_task.flatten(),
                                       w_task.flatten())
Example #7
0
def load_pcba(base_dir, reload=True, frac_train=.8):
  """Load PCBA datasets.

  Featurizes the PCBA csv with 1024-bit circular fingerprints, applies a
  balancing transform to the weights, truncates to the first 120 tasks,
  and writes train/valid splits to disk under base_dir.

  Returns a tuple ``(PCBA_tasks, dataset, transformers)`` — note that the
  full (unsplit) dataset is returned; the train/valid datasets are only
  written to disk.
  """
  # Set some global variables up top
  # NOTE(review): `reload` is unconditionally overridden here, so the
  # caller's argument is ignored — confirm this is intentional.
  reload = True
  verbosity = "high"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
      'PCBA-2326','PCBA-2451','PCBA-2517','PCBA-2528','PCBA-2546',
      'PCBA-2549','PCBA-2551','PCBA-2662','PCBA-2675','PCBA-2676',
      'PCBA-411','PCBA-463254','PCBA-485281','PCBA-485290','PCBA-485294',
      'PCBA-485297','PCBA-485313','PCBA-485314','PCBA-485341','PCBA-485349',
      'PCBA-485353','PCBA-485360','PCBA-485364','PCBA-485367','PCBA-492947',
      'PCBA-493208','PCBA-504327','PCBA-504332','PCBA-504333','PCBA-504339',
      'PCBA-504444','PCBA-504466','PCBA-504467','PCBA-504706','PCBA-504842',
      'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
      'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
      'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
      'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
      'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
      'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
      'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
      'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
      'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
      'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
      'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
      'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  loader = DataLoader(tasks=PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]

  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)

  print("About to perform train/valid/test split.")
  # Bug fix: slice indices must be integers; frac_train * len(dataset)
  # is a float, which raises TypeError when used to slice below.
  num_train = int(frac_train * len(dataset))
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 120
  PCBA_tasks = PCBA_tasks[:num_tasks]
  print("Using following tasks")
  print(PCBA_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  # Written to disk for later use; only the full dataset is returned.
  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, PCBA_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, PCBA_tasks)

  return PCBA_tasks, dataset, transformers
Example #8
0
def load_tox21(base_dir, reload=True, num_train=7200):
    """Load Tox21 datasets and split into train/valid subsets.

    Featurizes the Tox21 csv with 1024-bit circular fingerprints, applies
    a balancing transform to the weights, and splits the first
    ``num_train`` rows into a train set and the remainder into a valid
    set.  Returns ``(tox21_tasks, (train_dataset, valid_dataset),
    transformers)``.
    """
    # Set some global variables up top
    # NOTE(review): `reload` is unconditionally overridden here, so the
    # caller's argument is ignored — confirm this is intentional.
    reload = True
    verbosity = "high"

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train")
    valid_dir = os.path.join(base_dir, "valid")

    # Load Tox21 dataset
    print("About to load Tox21 dataset.")
    dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize Tox21 dataset
    print("About to featurize Tox21 dataset.")
    featurizer = CircularFingerprint(size=1024)
    tox21_tasks = [
        'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
        'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
    ]

    # Featurize from scratch unless a featurized copy already exists.
    if not reload or not os.path.exists(data_dir):
        loader = DataLoader(tasks=tox21_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir, shard_size=8192)
    else:
        dataset = DiskDataset(data_dir, tox21_tasks, reload=True)

    # Initialize transformers
    transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
    # Transforms are only applied to freshly featurized data (dead code
    # while `reload` is forced True above).
    if not reload:
        print("About to transform data")
        for transformer in transformers:
            transformer.transform(dataset)

    # Row-wise split: first num_train rows train, the rest valid.
    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    X_train, X_valid = X[:num_train], X[num_train:]
    y_train, y_valid = y[:num_train], y[num_train:]
    w_train, w_valid = w[:num_train], w[num_train:]
    ids_train, ids_valid = ids[:num_train], ids[num_train:]

    train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                           w_train, ids_train, tox21_tasks)
    valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                           w_valid, ids_valid, tox21_tasks)

    return tox21_tasks, (train_dataset, valid_dataset), transformers
Example #9
0
    def featurize(self,
                  input_files,
                  data_dir,
                  shard_size=8192,
                  num_shards_per_batch=24,
                  worker_pool=None,
                  logging=True,
                  debug=False):
        """Featurize provided files and write to specified location.

        Shards the input files, featurizes batches of shards in a worker
        pool (or inline when debug=True), writes results under data_dir,
        and returns the resulting DiskDataset (None when no input files
        are given).
        """
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING
        log("Loading raw samples now.", self.verbosity)
        log("shard_size: %d" % shard_size, self.verbosity)
        log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

        # Allow users to specify a single file for featurization
        if not isinstance(input_files, list):
            input_files = [input_files]

        if not os.path.exists(data_dir):
            os.makedirs(data_dir)

        # Construct partial function to write datasets.
        if not len(input_files):
            return None
        input_type = get_input_type(input_files[0])

        if logging:
            mp.log_to_stderr()
        if worker_pool is None:
            if logging:
                worker_pool = LoggingPool(processes=1)
            else:
                worker_pool = mp.Pool(processes=1)
        log("Spawning workers now.", self.verbosity)
        metadata_rows = []

        def wrap_with_shard_metadata(iterator):
            # Pair each shard with the shared featurization context.
            for item in iterator:
                yield ((self, shard_size, input_type, data_dir), item)

        data_iterator = wrap_with_shard_metadata(
            enumerate(load_data(input_files, shard_size, self.verbosity)))
        # Turns out python map is terrible and exhausts the generator as given.
        # Solution seems to be to manually pull out N elements from iterator,
        # then to map on only those N elements. BLECH. Python should do a better
        # job here.
        num_batches = 0
        ############################################################## TIMING
        time2 = time.time()
        # Fix: pass self.verbosity like every other TIMING log call.
        log("TIMING: pre-map featurization took %0.3f s" % (time2 - time1),
            self.verbosity)
        ############################################################## TIMING
        while True:
            log("About to start processing next batch of shards",
                self.verbosity)
            ############################################################## TIMING
            time1 = time.time()
            ############################################################## TIMING
            iterator = itertools.islice(data_iterator, num_shards_per_batch)
            if not debug:
                batch_metadata = worker_pool.map(featurize_map_function,
                                                 iterator)
            else:
                # Debug mode: featurize inline for easier debugging.
                batch_metadata = []
                for elt in iterator:
                    batch_metadata.append(featurize_map_function(elt))
            ############################################################## TIMING
            time2 = time.time()
            log("TIMING: map call on batch took %0.3f s" % (time2 - time1),
                self.verbosity)
            ############################################################## TIMING
            if batch_metadata:
                metadata_rows.extend(
                    [elt for elt in batch_metadata if elt is not None])
                num_batches += 1
                log(
                    "Featurized %d datapoints\n" %
                    (shard_size * num_shards_per_batch * num_batches),
                    self.verbosity)
            else:
                # Exhausted the data iterator; all shards processed.
                break
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING

        # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
        # creating a Dataset. Is there a more elegant solutions?
        dataset = DiskDataset(data_dir=data_dir,
                              metadata_rows=metadata_rows,
                              reload=True,
                              verbosity=self.verbosity)
        ############################################################## TIMING
        time2 = time.time()
        # Fix: this was `print(msg, self.verbosity)`, which printed the
        # verbosity value as a second argument; use log() like the other
        # TIMING messages.
        log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
            self.verbosity)
        ############################################################## TIMING
        return dataset
Example #10
0
def load_sweet(base_dir, reload=True, frac_train=.8):
    """Load sweet datasets and split into train/valid subsets.

    Featurizes the SWEET csv with 1024-bit circular fingerprints,
    balances task weights, truncates to the first 17 tasks, and splits
    rows by ``frac_train``.  Returns ``(SWEET_tasks, (train_dataset,
    valid_dataset), transformers)``.
    """
    # Set some global variables up top
    # NOTE(review): `reload` is unconditionally overridden here, so the
    # caller's argument is ignored — confirm this is intentional.
    reload = True
    verbosity = "high"
    regen = False
    # (Removed unused local `model = "logistic"`.)

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")

    # Load SWEET dataset
    print("About to load SWEET dataset.")
    dataset_file = os.path.join(current_dir, "./sweet.csv.gz")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize SWEET dataset
    print("About to featurize SWEET dataset.")
    featurizer = CircularFingerprint(size=1024)
    # Every column after the smiles column is a task.
    SWEET_tasks = dataset.columns.values[1:].tolist()

    loader = DataLoader(tasks=SWEET_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    if not reload or not os.path.exists(data_dir):
        dataset = loader.featurize(dataset_file, data_dir)
        regen = True
    else:
        dataset = DiskDataset(data_dir, reload=True)

    # Initialize transformers
    transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
    if regen:
        print("About to transform data")
        for transformer in transformers:
            dataset = transformer.transform(dataset)

    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    num_tasks = 17
    # Bug fix: slice indices must be integers; frac_train * len(dataset)
    # is a float, which raises TypeError when used to slice below.
    num_train = int(frac_train * len(dataset))
    SWEET_tasks = SWEET_tasks[:num_tasks]
    print("Using following tasks")
    print(SWEET_tasks)
    X_train, X_valid = X[:num_train], X[num_train:]
    y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
    w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
    ids_train, ids_valid = ids[:num_train], ids[num_train:]

    train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                           w_train, ids_train, SWEET_tasks)
    valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                           w_valid, ids_valid, SWEET_tasks)

    return SWEET_tasks, (train_dataset, valid_dataset), transformers