Example #1
def output_predictions(dataset: Dataset, y_preds: np.ndarray,
                       csv_out: str) -> None:
    """Writes predictions to file.

  Writes predictions made on `dataset` to a specified file on
  disk. `dataset.ids` are used to format predictions. The produced CSV
  file will have the following format:

  | ID          | Task1Name    | Task2Name    |
  | ----------- | ------------ | ------------ |
  | identifier1 | prediction11 | prediction12 |
  | identifier2 | prediction21 | prediction22 |

  Parameters
  ----------
  dataset: dc.data.Dataset
    Dataset on which predictions have been made.
  y_preds: np.ndarray
    Predictions to output.
  csv_out: str
    Name of file to write predictions to.
  """
    data_ids = dataset.ids
    n_tasks = len(dataset.get_task_names())
    y_preds = np.reshape(y_preds, (len(y_preds), n_tasks))
    assert len(y_preds) == len(data_ids)
    with open(csv_out, "w") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["ID"] + dataset.get_task_names())
        for mol_id, y_pred in zip(data_ids, y_preds):
            csvwriter.writerow([mol_id] + list(y_pred))
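
A minimal usage sketch for `output_predictions`. Here `model` and `test_dataset` are assumed placeholders for an already trained DeepChem model and the dataset it was evaluated on; the output file name is illustrative.

# Assumed objects: `model` (trained DeepChem model) and `test_dataset`
# (a dc.data.Dataset); "predictions.csv" is an illustrative path.
y_preds = model.predict(test_dataset)
output_predictions(test_dataset, y_preds, csv_out="predictions.csv")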
Example #2
    def predict(self,
                dataset: Dataset,
                transformers: List[Transformer] = []) -> np.ndarray:
        """
    Uses self to make predictions on provided Dataset object.

    Parameters
    ----------
    dataset: Dataset
      Dataset to make prediction on
    transformers: List[Transformer]
      Transformers that the input data has been transformed by. The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    np.ndarray
      A numpy array of predictions the model produces.
    """
        y_preds = []
        for (X_batch, _, _,
             ids_batch) in dataset.iterbatches(deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # Discard any padded predictions
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.concatenate(y_preds)
        return y_pred
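
A hedged usage sketch: if the training targets were normalized, the same transformer list is passed to `predict` so the returned values are mapped back to the original units. `model`, `train_dataset`, and `test_dataset` are assumed to already exist.

import deepchem as dc

# Assumed objects: `model` (a fitted DeepChem model), plus `train_dataset`
# and `test_dataset` featurized the same way.
transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)
]
test_dataset = transformers[0].transform(test_dataset)
# Predictions come back in the original (untransformed) y units.
y_pred = model.predict(test_dataset, transformers)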
Example #3
    def fit(self, dataset: Dataset, nb_epoch: int = 10) -> float:
        """
    Fits a model on data in a Dataset object.

    Parameters
    ----------
    dataset: Dataset
      the Dataset to train on
    nb_epoch: int
      the number of epochs to train for

    Returns
    -------
    float
      The average loss over the most recent epoch.
    """
        for epoch in range(nb_epoch):
            logger.info("Starting epoch %s" % str(epoch + 1))
            losses = []
            for (X_batch, y_batch, w_batch,
                 ids_batch) in dataset.iterbatches():
                losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
            logger.info("Avg loss for epoch %d: %f" %
                        (epoch + 1, np.array(losses).mean()))
        return np.array(losses).mean()
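
A short call sketch, assuming `model` implements the `fit` shown above and `train_dataset` is a `Dataset`:

# Assumed objects: `model` and `train_dataset`.
avg_loss = model.fit(train_dataset, nb_epoch=20)
print("Average loss over the final epoch: %f" % avg_loss)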
Example #4
    def predict(self,
                dataset: Dataset,
                transformers: List[Transformer] = []) -> OneOrMany[np.ndarray]:
        """
    Uses self to make predictions on provided Dataset object.


    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    A NumPy array if the model produces a single output, or a list of arrays
    if it produces multiple outputs.
    """
        y_preds = []

        for (X_batch, _, _,
             ids_batch) in dataset.iterbatches(deterministic=True):
            n_samples = len(X_batch)
            y_pred_batch = self.predict_on_batch(X_batch)
            # Discard any padded predictions
            y_pred_batch = y_pred_batch[:n_samples]
            y_pred_batch = undo_transforms(y_pred_batch, transformers)
            y_preds.append(y_pred_batch)
        y_pred = np.concatenate(y_preds)
        return y_pred
Example #5
    def fit(self, dataset: Dataset, nb_epoch: int = 10) -> float:
        """
    Fits a model on data in a Dataset object.

    Parameters
    ----------
    dataset: Dataset
      the Dataset to train on
    nb_epoch: int
      the number of epochs to train for

    Returns
    -------
    the average loss over the most recent epoch
    """
        # TODO(rbharath/enf): We need a structured way to deal with potential GPU
        #                     memory overflows.
        for epoch in range(nb_epoch):
            logger.info("Starting epoch %s" % str(epoch + 1))
            losses = []
            for (X_batch, y_batch, w_batch,
                 ids_batch) in dataset.iterbatches():
                losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
            logger.info("Avg loss for epoch %d: %f" %
                        (epoch + 1, np.array(losses).mean()))
        return np.array(losses).mean()
Example #6
    def __init__(self, dataset: Dataset):
        super(DuplicateBalancingTransformer, self).__init__(transform_X=True,
                                                            transform_y=True,
                                                            transform_w=True,
                                                            transform_ids=True,
                                                            dataset=dataset)

        if len(dataset.get_task_names()) > 1:
            raise ValueError(
                "This transformation is only defined for singletask datsets.")

        # Get the labels/weights
        y = dataset.y
        w = dataset.w
        # Normalize shapes
        if len(y.shape) == 1:
            y = np.reshape(y, (len(y), 1))
        if len(w.shape) == 1:
            w = np.reshape(w, (len(w), 1))
        if len(y.shape) != 2:
            raise ValueError("y must be of shape (N,) or (N, n_tasks)")
        if len(w.shape) != 2:
            raise ValueError("w must be of shape (N,) or (N, n_tasks)")
        self.classes = sorted(np.unique(y))
        # Remove labels with zero weights; filter w in parallel so the
        # class-weight sums below stay aligned with y
        y = y[w != 0]
        w = w[w != 0]
        N = len(y)
        class_weights = []
        # Note that we may have 0 elements of a given class since we remove those
        # labels with zero weight.
        for c in self.classes:
            # this works because y is 1D
            c_weight = np.sum(w[y == c])
            class_weights.append(c_weight)
        weight_largest = max(class_weights)
        # This is the right ratio since int(N/num_c) * num_c \approx N
        # for all classes
        duplication_ratio = [
            int(weight_largest / float(c_weight)) if c_weight > 0 else 0
            for c_weight in class_weights
        ]
        self.duplication_ratio = duplication_ratio
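
The duplication-ratio bookkeeping can be checked with plain NumPy; the sketch below is purely illustrative and uses no DeepChem objects.

import numpy as np

# Six negatives and two positives, all with unit weight.
y = np.array([0, 0, 0, 0, 0, 0, 1, 1])
w = np.ones_like(y, dtype=float)
classes = sorted(np.unique(y))
class_weights = [np.sum(w[y == c]) for c in classes]  # [6.0, 2.0]
weight_largest = max(class_weights)
duplication_ratio = [
    int(weight_largest / float(cw)) if cw > 0 else 0 for cw in class_weights
]
print(duplication_ratio)  # [1, 3]: each positive example is duplicated 3x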
Example #7
  def default_generator(
      self,
      dataset: Dataset,
      epochs: int = 1,
      mode: str = 'fit',
      deterministic: bool = True,
      pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
    """Create a generator that iterates batches for a dataset.

    Subclasses may override this method to customize how model inputs are
    generated from the data.

    Parameters
    ----------
    dataset: Dataset
      the data to iterate
    epochs: int
      the number of times to iterate over the full dataset
    mode: str
      allowed values are 'fit' (called during training), 'predict' (called
      during prediction), and 'uncertainty' (called during uncertainty
      prediction)
    deterministic: bool
      whether to iterate over the dataset in order, or randomly shuffle the
      data for each epoch
    pad_batches: bool
      whether to pad each batch up to this model's preferred batch size

    Returns
    -------
    a generator that iterates batches, each represented as a tuple of lists:
    ([inputs], [outputs], [weights])
    """
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        yield ([X_b], [y_b], [w_b])
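
A hedged iteration sketch over the generator, assuming `model` is a DeepChem model exposing this `default_generator` and `dataset` is a `dc.data.Dataset`:

# Assumed objects: `model` and `dataset`.
for inputs, labels, weights in model.default_generator(dataset, epochs=1,
                                                       mode='predict'):
    # Each element is a list holding one array per model input/output.
    print(inputs[0].shape, labels[0].shape, weights[0].shape)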
Example #8
  def default_generator(
      self,
      dataset: Dataset,
      epochs: int = 1,
      mode: str = 'fit',
      deterministic: bool = True,
      pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
    """Convert a dataset into the tensors needed for learning.

    Parameters
    ----------
    dataset: `dc.data.Dataset`
      Dataset to convert
    epochs: int, optional (Default 1)
      Number of times to walk over `dataset`
    mode: str, optional (Default 'fit')
      Ignored in this implementation.
    deterministic: bool, optional (Default True)
      Whether the dataset should be walked in a deterministic fashion
    pad_batches: bool, optional (Default True)
      If true, each returned batch will have size `self.batch_size`.

    Returns
    -------
    Iterator which walks over the batches
    """

    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        if y_b is not None:
          if self.mode == 'classification':
            y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                -1, self.n_tasks, self.n_classes)
        inputs = self.compute_features_on_batch(X_b)
        yield (inputs, [y_b], [w_b])
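
The one-hot reshaping step can be illustrated with plain NumPy standing in for DeepChem's `to_one_hot`; the shapes below are purely illustrative (4 samples, 3 tasks, 2 classes).

import numpy as np

n_samples, n_tasks, n_classes = 4, 3, 2
y_b = np.random.randint(n_classes, size=(n_samples, n_tasks))
# Flatten to (n_samples * n_tasks,), one-hot encode, then regroup by task.
one_hot = np.eye(n_classes)[y_b.flatten()]     # (12, 2)
y_b = one_hot.reshape(-1, n_tasks, n_classes)  # (4, 3, 2)
print(y_b.shape)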
Example #9
  def default_generator(
      self,
      dataset: Dataset,
      epochs: int = 1,
      mode: str = 'fit',
      deterministic: bool = True,
      pad_batches: bool = True) -> Iterable[Tuple[List, List, List]]:
    """Convert a dataset into the tensors needed for learning.

    Parameters
    ----------
    dataset: `dc.data.Dataset`
      Dataset to convert
    epochs: int, optional (Default 1)
      Number of times to walk over `dataset`
    mode: str, optional (Default 'fit')
      Ignored in this implementation.
    deterministic: bool, optional (Default True)
      Whether the dataset should be walked in a deterministic fashion
    pad_batches: bool, optional (Default True)
      If true, each returned batch will have size `self.batch_size`.

    Returns
    -------
    Iterator which walks over the batches
    """

    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        if y_b is not None:
          if self.mode == 'classification':
            y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                -1, self.n_tasks, self.n_classes)
        atom_feat = []
        pair_feat = []
        atom_split = []
        atom_to_pair = []
        pair_split = []
        start = 0
        for im, mol in enumerate(X_b):
          n_atoms = mol.get_num_atoms()
          # molecule index for each atom (im repeated n_atoms times)
          atom_split.extend([im] * n_atoms)
          # global (atom_i, atom_j) indices for this molecule's pair features
          C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
          atom_to_pair.append(
              np.transpose(
                  np.array([C1.flatten() + start,
                            C0.flatten() + start])))
          # for each pair, the global index of its first atom
          pair_split.extend(C1.flatten() + start)
          start = start + n_atoms

          # atom features
          atom_feat.append(mol.get_atom_features())
          # pair features
          pair_feat.append(
              np.reshape(mol.get_pair_features(),
                         (n_atoms * n_atoms, self.n_pair_feat[0])))

        inputs = [
            np.concatenate(atom_feat, axis=0),
            np.concatenate(pair_feat, axis=0),
            np.array(pair_split),
            np.array(atom_split),
            np.concatenate(atom_to_pair, axis=0)
        ]
        yield (inputs, [y_b], [w_b])
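
The pair-indexing bookkeeping above can be traced with a NumPy-only sketch for a toy batch of two "molecules" with 2 and 3 atoms; no featurizer objects are involved.

import numpy as np

atom_split, atom_to_pair, pair_split = [], [], []
start = 0
for im, n_atoms in enumerate([2, 3]):
    atom_split.extend([im] * n_atoms)
    C0, C1 = np.meshgrid(np.arange(n_atoms), np.arange(n_atoms))
    atom_to_pair.append(
        np.transpose(np.array([C1.flatten() + start, C0.flatten() + start])))
    pair_split.extend(C1.flatten() + start)
    start += n_atoms
atom_to_pair = np.concatenate(atom_to_pair, axis=0)
print(atom_to_pair.shape)    # (13, 2): 2*2 + 3*3 global (atom_i, atom_j) pairs
print(np.array(atom_split))  # [0 0 1 1 1]: molecule index of each atom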
Example #10
def load_bace(mode="regression", transform=True, split="20-80"):
    """Load BACE-1 dataset as regression/classification problem."""
    reload = True
    verbosity = "high"
    regen = False
    assert split in ["20-80", "80-20"]

    current_dir = os.path.dirname(os.path.realpath(__file__))
    if split == "20-80":
        dataset_file = os.path.join(current_dir,
                                    "../../datasets/desc_canvas_aug30.csv")
    elif split == "80-20":
        dataset_file = os.path.join(current_dir,
                                    "../../datasets/rev8020split_desc.csv")
    dataset = load_from_disk(dataset_file)
    num_display = 10
    pretty_columns = ("[" + ",".join(
        ["'%s'" % column
         for column in dataset.columns.values[:num_display]]) + ",...]")

    crystal_dataset_file = os.path.join(
        current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
    crystal_dataset = load_from_disk(crystal_dataset_file)

    print("Columns of dataset: %s" % pretty_columns)
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))
    print("Number of examples in crystal dataset: %s" %
          str(crystal_dataset.shape[0]))

    # Make directories to store the raw and featurized datasets.
    base_dir = tempfile.mkdtemp()
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")
    test_dir = os.path.join(base_dir, "test_dataset")
    model_dir = os.path.join(base_dir, "model")
    crystal_dir = os.path.join(base_dir, "crystal")

    if mode == "regression":
        bace_tasks = ["pIC50"]
    elif mode == "classification":
        bace_tasks = ["Class"]
    else:
        raise ValueError("Unknown mode %s" % mode)
    featurizer = UserDefinedFeaturizer(user_specified_features)
    loader = DataLoader(tasks=bace_tasks,
                        smiles_field="mol",
                        id_field="CID",
                        featurizer=featurizer)
    if not reload or not os.path.exists(data_dir):
        dataset = loader.featurize(dataset_file, data_dir)
        regen = True
    else:
        dataset = Dataset(data_dir, reload=True)
    if not reload or not os.path.exists(crystal_dir):
        crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
    else:
        crystal_dataset = Dataset(crystal_dir, reload=True)

    if (not reload or not os.path.exists(train_dir)
            or not os.path.exists(valid_dir) or not os.path.exists(test_dir)):
        regen = True
        splitter = SpecifiedSplitter(dataset_file,
                                     "Model",
                                     verbosity=verbosity)
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, train_dir, valid_dir, test_dir)
    else:
        train_dataset = Dataset(train_dir, reload=True)
        valid_dataset = Dataset(valid_dir, reload=True)
        test_dataset = Dataset(test_dir, reload=True)

    # NOTE the renaming: for the 20-80 split, the validation and test sets are swapped.
    if split == "20-80":
        valid_dataset, test_dataset = test_dataset, valid_dataset
    print("Number of compounds in train set")
    print(len(train_dataset))
    print("Number of compounds in validation set")
    print(len(valid_dataset))
    print("Number of compounds in test set")
    print(len(test_dataset))
    print("Number of compounds in crystal set")
    print(len(crystal_dataset))

    if transform and regen:
        input_transformers = [
            NormalizationTransformer(transform_X=True, dataset=train_dataset),
            ClippingTransformer(transform_X=True, dataset=train_dataset)
        ]
        output_transformers = []
        if mode == "regression":
            output_transformers = [
                NormalizationTransformer(transform_y=True,
                                         dataset=train_dataset)
            ]
        else:
            output_transformers = []
    else:
        input_transformers, output_transformers = [], []

    transformers = input_transformers + output_transformers
    # Apply the transformers and keep the transformed datasets
    datasets = [train_dataset, valid_dataset, test_dataset, crystal_dataset]
    for i, dataset in enumerate(datasets):
        for transformer in transformers:
            dataset = transformer.transform(dataset)
        datasets[i] = dataset
    train_dataset, valid_dataset, test_dataset, crystal_dataset = datasets

    return (bace_tasks, train_dataset, valid_dataset, test_dataset,
            crystal_dataset, output_transformers)
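
A hedged call sketch for `load_bace`, assuming the CSV files referenced above exist at the expected relative paths:

# Unpack the tasks, the four dataset splits, and the output transformers.
(bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
 output_transformers) = load_bace(mode="classification", split="20-80")
print(bace_tasks, len(train_dataset), len(valid_dataset), len(test_dataset))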