Example #1
0
class TensorGraph(Model):
    def __init__(self,
                 tensorboard=False,
                 tensorboard_log_frequency=100,
                 batch_size=100,
                 random_seed=None,
                 use_queue=True,
                 graph=None,
                 learning_rate=0.001,
                 **kwargs):
        """
    Parameters
    ----------
    tensorboard: bool
      Should we log to model_dir data for tensorboard?
    tensorboard_log_frequency: int
      How many training batches before logging tensorboard?
    batch_size: int
      default batch size for training and evaluating
    use_queue: boolean
      if True when building we will create a tf.FIFO queue, which will hold
      all features, weights, and labels.  We will feed the inputs into this
      queue in batches of self.batch_size in a separate thread from the
      thread training the model.  You cannot use a queue when
      batches are not of consistent size
    graph: tensorflow.Graph
      the Graph in which to create Tensorflow objects.  If None, a new Graph
      is created.
    learning_rate: float or LearningRateSchedule
      the learning rate to use for optimization
    kwargs
    """

        # Layer Management
        self.layers = dict()
        self.features = list()
        self.labels = list()
        self.outputs = list()
        self.task_weights = list()
        self.submodels = list()
        self.loss = Constant(0)
        self.built = False
        self.queue_installed = False
        self.optimizer = Adam(learning_rate=learning_rate,
                              beta1=0.9,
                              beta2=0.999,
                              epsilon=1e-7)

        # Singular place to hold Tensor objects which don't serialize
        # These have to be reconstructed on restoring from pickle
        # See TensorGraph._get_tf() for more details on lazy construction
        self.tensor_objects = {
            "FileWriter": None,
            "Graph": graph,
            "train_op": None,
            "summary_op": None,
        }
        self.tensorboard = tensorboard
        self.tensorboard_log_frequency = tensorboard_log_frequency
        self.tensorboard_step = 0
        self.global_step = 0
        self.use_queue = use_queue

        self.batch_size = batch_size
        self.random_seed = random_seed
        super(TensorGraph, self).__init__(**kwargs)
        self.save_file = "%s/%s" % (self.model_dir, "model")
        self.model_class = None

        self.rnn_initial_states = []
        self.rnn_final_states = []
        self.rnn_zero_states = []
        if self.use_queue and self.tensorboard:
            raise ValueError(
                "Currently TensorGraph cannot both use_queue and tensorboard at the same time"
            )

    def _add_layer(self, layer):
        if layer.name is None:
            layer.name = "%s_%s" % (layer.__class__.__name__,
                                    len(self.layers) + 1)
        if layer.name in self.layers:
            return
        if isinstance(layer, Feature):
            self.features.append(layer)
        if isinstance(layer, Label):
            self.labels.append(layer)
        if isinstance(layer, Weights):
            self.task_weights.append(layer)
        self.layers[layer.name] = layer
        for in_layer in layer.in_layers:
            self._add_layer(in_layer)

    def fit(self,
            dataset,
            nb_epoch=10,
            max_checkpoints_to_keep=5,
            checkpoint_interval=1000,
            deterministic=False,
            restore=False,
            submodel=None):
        """Train this model on a dataset.

    Parameters
    ----------
    dataset: Dataset
      the Dataset to train on
    nb_epoch: int
      the number of epochs to train for
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    checkpoint_interval: int
      the frequency at which to write checkpoints, measured in training steps.
      Set this to 0 to disable automatic checkpointing.
    deterministic: bool
      if True, the samples are processed in order.  If False, a different random
      order is used for each epoch.
    restore: bool
      if True, restore the model from the most recent checkpoint and continue training
      from there.  If False, retrain the model from scratch.
    submodel: Submodel
      an alternate training objective to use.  This should have been created by
      calling create_submodel().
    """
        return self.fit_generator(
            self.default_generator(dataset,
                                   epochs=nb_epoch,
                                   deterministic=deterministic),
            max_checkpoints_to_keep, checkpoint_interval, restore, submodel)

    def fit_generator(self,
                      feed_dict_generator,
                      max_checkpoints_to_keep=5,
                      checkpoint_interval=1000,
                      restore=False,
                      submodel=None):
        """Train this model on data from a generator.

    Parameters
    ----------
    feed_dict_generator: generator
      this should generate batches, each represented as a dict that maps
      Layers to values.
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    checkpoint_interval: int
      the frequency at which to write checkpoints, measured in training steps.
      Set this to 0 to disable automatic checkpointing.
    restore: bool
      if True, restore the model from the most recent checkpoint and continue training
      from there.  If False, retrain the model from scratch.
    submodel: Submodel
      an alternate training objective to use.  This should have been created by
      calling create_submodel().

    Returns
    -------
    the average loss over the most recent checkpoint interval
    """
        def create_feed_dict():
            if self.use_queue:
                while True:
                    yield {self._training_placeholder: 1.0}
            for d in feed_dict_generator:
                feed_dict = dict(d)
                feed_dict[self._training_placeholder] = 1.0
                yield feed_dict

        if not self.built:
            self.build()
        with self._get_tf("Graph").as_default():
            time1 = time.time()
            loss = self.loss
            if submodel is None:
                train_op = self._get_tf('train_op')
            else:
                train_op = submodel.get_train_op()
                if submodel.loss is not None:
                    loss = submodel.loss
            if checkpoint_interval > 0:
                saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
            if restore:
                self.restore()
            avg_loss, n_averaged_batches = 0.0, 0.0
            n_samples = 0
            n_enqueued = [0]
            final_sample = [None]
            if self.use_queue:
                enqueue_thread = threading.Thread(
                    target=_enqueue_batch,
                    args=(self, feed_dict_generator, self._get_tf("Graph"),
                          self.session, n_enqueued, final_sample))
                enqueue_thread.start()
            for feed_dict in create_feed_dict():
                if self.use_queue:
                    # Don't let this thread get ahead of the enqueue thread, since if
                    # we try to read more batches than the total number that get queued,
                    # this thread will hang indefinitely.
                    while n_enqueued[0] <= n_samples:
                        if n_samples == final_sample[0]:
                            break
                        time.sleep(0)
                    if n_samples == final_sample[0]:
                        break
                n_samples += 1
                should_log = (self.tensorboard and
                              n_samples % self.tensorboard_log_frequency == 0)
                fetches = [train_op, loss.out_tensor]
                if should_log:
                    fetches.append(self._get_tf("summary_op"))
                fetched_values = self.session.run(fetches, feed_dict=feed_dict)
                if should_log:
                    self._log_tensorboard(fetches[2])
                avg_loss += fetched_values[1]
                n_averaged_batches += 1
                self.global_step += 1
                if checkpoint_interval > 0 and self.global_step % checkpoint_interval == checkpoint_interval - 1:
                    saver.save(self.session,
                               self.save_file,
                               global_step=self.global_step)
                    avg_loss = float(avg_loss) / n_averaged_batches
                    print('Ending global_step %d: Average loss %g' %
                          (self.global_step, avg_loss))
                    avg_loss, n_averaged_batches = 0.0, 0.0
            if n_averaged_batches > 0:
                avg_loss = float(avg_loss) / n_averaged_batches
            if checkpoint_interval > 0:
                if n_averaged_batches > 0:
                    print('Ending global_step %d: Average loss %g' %
                          (self.global_step, avg_loss))
                saver.save(self.session,
                           self.save_file,
                           global_step=self.global_step)
                time2 = time.time()
                print("TIMING: model fitting took %0.3f s" % (time2 - time1))
        return avg_loss

    def _log_tensorboard(self, summary):
        """
    TODO(LESWING) set epoch
    Parameters
    ----------
    Returns
    -------
    """
        global_step = int(self.global_step)
        writer = self._get_tf("FileWriter")
        writer.reopen()
        writer.add_summary(summary, global_step=global_step)
        writer.close()

    def fit_on_batch(self, X, y, w, submodel=None):
        if not self.built:
            self.build()
        dataset = NumpyDataset(X, y)
        return self.fit(dataset, nb_epoch=1, submodel=submodel)

    def default_generator(self,
                          dataset,
                          epochs=1,
                          predict=False,
                          deterministic=True,
                          pad_batches=True):
        if len(self.features) > 1:
            raise ValueError("More than one Feature, must use generator")
        if len(self.labels) > 1:
            raise ValueError("More than one Label, must use generator")
        if len(self.task_weights) > 1:
            raise ValueError("More than one Weights, must use generator")
        for epoch in range(epochs):
            for (X_b, y_b, w_b,
                 ids_b) in dataset.iterbatches(batch_size=self.batch_size,
                                               deterministic=deterministic,
                                               pad_batches=pad_batches):
                feed_dict = dict()
                if len(self.labels) == 1 and y_b is not None and not predict:
                    feed_dict[self.labels[0]] = y_b
                if len(self.features) == 1 and X_b is not None:
                    feed_dict[self.features[0]] = X_b
                if len(self.task_weights
                       ) == 1 and w_b is not None and not predict:
                    feed_dict[self.task_weights[0]] = w_b
                for (initial_state, zero_state) in zip(self.rnn_initial_states,
                                                       self.rnn_zero_states):
                    feed_dict[initial_state] = zero_state
                yield feed_dict

    def predict_on_generator(self, generator, transformers=[], outputs=None):
        """
    Parameters
    ----------
    generator: Generator
      Generator that constructs feed dictionaries for TensorGraph.
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs.
      If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.
    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
        if not self.built:
            self.build()
        if outputs is None:
            outputs = self.outputs
        elif not isinstance(outputs, collections.Sequence):
            outputs = [outputs]
        with self._get_tf("Graph").as_default():
            # Gather results for each output
            results = [[] for out in outputs]
            for feed_dict in generator:
                feed_dict = {
                    self.layers[k.name].out_tensor: v
                    for k, v in six.iteritems(feed_dict)
                }
                feed_dict[self._training_placeholder] = 0.0
                feed_results = self.session.run(outputs, feed_dict=feed_dict)
                if len(feed_results) > 1:
                    if len(transformers):
                        raise ValueError("Does not support transformations "
                                         "for multiple outputs.")
                elif len(feed_results) == 1:
                    result = undo_transforms(feed_results[0], transformers)
                    feed_results = [result]
                for ind, result in enumerate(feed_results):
                    results[ind].append(result)

            final_results = []
            for result_list in results:
                final_results.append(np.concatenate(result_list, axis=0))
            # If only one output, just return array
            if len(final_results) == 1:
                return final_results[0]
            else:
                return final_results

    def predict_proba_on_generator(self,
                                   generator,
                                   transformers=[],
                                   outputs=None):
        """
    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
        return self.predict_on_generator(generator, transformers, outputs)

    def predict_on_batch(self, X, transformers=[], outputs=None):
        """Generates predictions for input samples, processing samples in a batch.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    transformers: List
      List of dc.trans.Transformers

    Returns
    -------
    A Numpy array of predictions.
    """
        dataset = NumpyDataset(X=X, y=None)
        generator = self.default_generator(dataset,
                                           predict=True,
                                           pad_batches=False)
        return self.predict_on_generator(generator, transformers, outputs)

    def predict_proba_on_batch(self, X, transformers=[], outputs=None):
        """Generates predictions for input samples, processing samples in a batch.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    transformers: List
      List of dc.trans.Transformers

    Returns
    -------
    A Numpy array of predictions.
    """
        return self.predict_on_batch(X, transformers, outputs)

    def predict(self, dataset, transformers=[], outputs=None):
        """
    Uses self to make predictions on provided Dataset object.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs[0] (single
      output). If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.

    Returns
    -------
    results: numpy ndarray or list of numpy ndarrays
    """
        generator = self.default_generator(dataset,
                                           predict=True,
                                           pad_batches=False)
        return self.predict_on_generator(generator, transformers, outputs)

    def predict_proba(self, dataset, transformers=[], outputs=None):
        """
    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs[0] (single
      output). If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.

    Returns
    -------
    y_pred: numpy ndarray or list of numpy ndarrays
    """
        generator = self.default_generator(dataset,
                                           predict=True,
                                           pad_batches=False)
        return self.predict_proba_on_generator(generator, transformers,
                                               outputs)

    def topsort(self):
        def add_layers_to_list(layer, sorted_layers):
            if layer in sorted_layers:
                return
            for in_layer in layer.in_layers:
                add_layers_to_list(in_layer, sorted_layers)
            sorted_layers.append(layer)

        sorted_layers = []
        for l in self.features + self.labels + self.task_weights + self.outputs:
            add_layers_to_list(l, sorted_layers)
        add_layers_to_list(self.loss, sorted_layers)
        for submodel in self.submodels:
            if submodel.loss is not None:
                add_layers_to_list(submodel.loss, sorted_layers)
        return sorted_layers

    def build(self):
        if self.built:
            return
        with self._get_tf("Graph").as_default():
            self._training_placeholder = tf.placeholder(dtype=tf.float32,
                                                        shape=())
            if self.random_seed is not None:
                tf.set_random_seed(self.random_seed)
            self._install_queue()
            for layer in self.topsort():
                with tf.name_scope(layer.name):
                    layer.create_tensor(training=self._training_placeholder)
                    self.rnn_initial_states += layer.rnn_initial_states
                    self.rnn_final_states += layer.rnn_final_states
                    self.rnn_zero_states += layer.rnn_zero_states
                    layer.add_summary_to_tg()
            self.session = tf.Session()
            self.built = True

            # Ensure all training operators have been created.

            self._get_tf('train_op')
            for submodel in self.submodels:
                train_op = submodel.get_train_op()

            # Initialize variables.

            self.session.run(tf.global_variables_initializer())
            for layer in self.layers.values():
                if layer.variable_values is not None:
                    variables = self.get_layer_variables(layer)
                    for var, val in zip(variables, layer.variable_values):
                        self.session.run(var.assign(val))

        for layer in self.layers.values():
            if layer.tensorboard:
                self.tensorboard = True
        tf.summary.scalar("loss", self.loss.out_tensor)
        for layer in self.layers.values():
            if layer.tensorboard:
                tf.summary.tensor_summary(layer.name, layer.out_tensor)
        if self.tensorboard:
            writer = self._get_tf("FileWriter")
            writer.add_graph(self._get_tf("Graph"))
            writer.close()

        # As a sanity check, make sure all tensors have the correct shape.

        for layer in self.layers.values():
            try:
                assert list(layer.shape) == layer.out_tensor.get_shape(
                ).as_list(
                ), '%s: Expected shape %s does not match actual shape %s' % (
                    layer.name, layer.shape,
                    layer.out_tensor.get_shape().as_list())
            except NotImplementedError:
                pass

    def _install_queue(self):
        """
    """
        if not self.use_queue or self.queue_installed:
            for layer in self.features + self.labels + self.task_weights:
                layer.pre_queue = True
            return
        names = []
        shapes = []
        pre_q_inputs = []
        q = InputFifoQueue(shapes, names, in_layers=pre_q_inputs)
        q.name = "%s_%s" % (q.__class__.__name__, len(self.layers) + 1)

        for layer in self.features + self.labels + self.task_weights:
            pre_q_input = layer.create_pre_q(self.batch_size)
            shapes.append(pre_q_input.shape)
            names.append(pre_q_input.name)
            pre_q_inputs.append(pre_q_input)

            layer.in_layers.append(q)

        self._add_layer(q)
        self.input_queue = q
        self.queue_installed = True

    def set_loss(self, layer):
        self._add_layer(layer)
        self.loss = layer

    def add_output(self, layer):
        self._add_layer(layer)
        self.outputs.append(layer)

    def set_optimizer(self, optimizer):
        """Set the optimizer to use for fitting."""
        self.optimizer = optimizer

    def create_submodel(self, layers=None, loss=None, optimizer=None):
        """Create an alternate objective for training one piece of a TensorGraph.

    A TensorGraph consists of a set of layers, and specifies a loss function and
    optimizer to use for training those layers.  Usually this is sufficient, but
    there are cases where you want to train different parts of a model separately.
    For example, a GAN consists of a generator and a discriminator.  They are
    trained separately, and they use different loss functions.

    A submodel defines an alternate objective to use in cases like this.  It may
    optionally specify any of the following: a subset of layers in the model to
    train; a different loss function; and a different optimizer to use.  This
    method creates a submodel, which you can then pass to fit() to use it for
    training.

    Parameters
    ----------
    layers: list
      the list of layers to train.  If None, all layers in the model will be
      trained.
    loss: Layer
      the loss function to optimize.  If None, the model's main loss function
      will be used.
    optimizer: Optimizer
      the optimizer to use for training.  If None, the model's main optimizer
      will be used.

    Returns
    -------
    the newly created submodel, which can be passed to any of the fitting
    methods.
    """
        if self.built:
            raise ValueError(
                'Submodels must be created before build() is called.')
        submodel = Submodel(self, layers, loss, optimizer)
        self.submodels.append(submodel)
        if loss is not None:
            self._add_layer(loss)
        return submodel

    def get_pickling_errors(self, obj, seen=None):
        if seen == None:
            seen = []
        try:
            state = obj.__getstate__()
        except AttributeError:
            return
        if state == None:
            return
        if isinstance(state, tuple):
            if not isinstance(state[0], dict):
                state = state[1]
            else:
                state = state[0].update(state[1])
        result = {}
        for i in state:
            try:
                pickle.dumps(state[i], protocol=2)
            except pickle.PicklingError:
                if not state[i] in seen:
                    seen.append(state[i])
                    result[i] = self.get_pickling_errors(state[i], seen)
        return result

    def save(self):
        # Remove out_tensor from the object to be pickled
        must_restore = False
        tensor_objects = self.tensor_objects
        rnn_initial_states = self.rnn_initial_states
        rnn_final_states = self.rnn_final_states
        rnn_zero_states = self.rnn_zero_states
        session = self.session
        self.tensor_objects = {}
        self.rnn_initial_states = []
        self.rnn_final_states = []
        self.rnn_zero_states = []
        self.session = None
        out_tensors = []
        if self.built:
            must_restore = True
            for layer in self.topsort():
                out_tensors.append(layer.none_tensors())
            training_placeholder = self._training_placeholder
            self._training_placeholder = None
            self.built = False

        # Pickle itself
        pickle_name = os.path.join(self.model_dir, "model.pickle")

        with open(pickle_name, 'wb') as fout:
            try:
                pickle.dump(self, fout)
            except Exception as e:
                print(self.get_pickling_errors(self))
                raise e

        # add out_tensor back to everyone
        if must_restore:
            for index, layer in enumerate(self.topsort()):
                layer.set_tensors(out_tensors[index])
            self._training_placeholder = training_placeholder
            self.built = True
        self.tensor_objects = tensor_objects
        self.rnn_initial_states = rnn_initial_states
        self.rnn_final_states = rnn_final_states
        self.rnn_zero_states = rnn_zero_states
        self.session = session

    def evaluate_generator(self,
                           feed_dict_generator,
                           metrics,
                           transformers=[],
                           labels=None,
                           outputs=None,
                           weights=[],
                           per_task_metrics=False):

        if labels is None:
            raise ValueError
        n_tasks = len(self.outputs)
        n_classes = self.outputs[0].out_tensor.get_shape()[-1].value
        evaluator = GeneratorEvaluator(self,
                                       feed_dict_generator,
                                       transformers,
                                       labels=labels,
                                       outputs=outputs,
                                       weights=weights,
                                       n_tasks=n_tasks,
                                       n_classes=n_classes)
        if not per_task_metrics:
            scores = evaluator.compute_model_performance(metrics)
            return scores
        else:
            scores, per_task_scores = evaluator.compute_model_performance(
                metrics, per_task_metrics=per_task_metrics)
            return scores, per_task_scores

    def get_layer_variables(self, layer):
        """Get the list of trainable variables in a layer of the graph."""
        if not self.built:
            self.build()
        with self._get_tf("Graph").as_default():
            if layer.variable_scope == '':
                return []
            return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                     scope=layer.variable_scope)

    def get_global_step(self):
        return self._get_tf("GlobalStep")

    def _get_tf(self, obj):
        """Fetches underlying TensorFlow primitives.

    Parameters
    ----------
    obj: str
      If "Graph", returns tf.Graph instance. If "FileWriter", returns
      tf.summary.FileWriter. If "Optimizer", returns the optimizer. If
      "train_op", returns the train operation. If "summary_op", returns the
      merged summary. If "GlobalStep" returns the global step.
    Returns
    -------
    TensorFlow Object

    """

        if obj in self.tensor_objects and self.tensor_objects[obj] is not None:
            return self.tensor_objects[obj]
        if obj == "Graph":
            self.tensor_objects['Graph'] = tf.Graph()
        elif obj == "FileWriter":
            self.tensor_objects['FileWriter'] = tf.summary.FileWriter(
                self.model_dir)
        elif obj == 'Optimizer':
            self.tensor_objects[
                'Optimizer'] = self.optimizer._create_optimizer(
                    self._get_tf('GlobalStep'))
        elif obj == 'train_op':
            opt = self._get_tf('Optimizer')
            global_step = self._get_tf('GlobalStep')
            try:
                self.tensor_objects['train_op'] = opt.minimize(
                    self.loss.out_tensor, global_step=global_step)
            except ValueError:
                # The loss doesn't depend on any variables.
                self.tensor_objects['train_op'] = 0
        elif obj == 'summary_op':
            self.tensor_objects['summary_op'] = tf.summary.merge_all(
                key=tf.GraphKeys.SUMMARIES)
        elif obj == 'GlobalStep':
            with self._get_tf("Graph").as_default():
                self.tensor_objects['GlobalStep'] = tf.Variable(
                    0, trainable=False)
        return self._get_tf(obj)

    def save_checkpoint(self, max_checkpoints_to_keep=5):
        """Save a checkpoint to disk.

    Usually you do not need to call this method, since fit() saves checkpoints
    automatically.  If you have disabled automatic checkpointing during fitting,
    this can be called to manually write checkpoints.

    Parameters
    ----------
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    """
        saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
        saver.save(self.session, self.save_file, global_step=self.global_step)

    def restore(self):
        """Reload the values of all variables from the most recent checkpoint file."""
        if not self.built:
            self.build()
        last_checkpoint = tf.train.latest_checkpoint(self.model_dir)
        if last_checkpoint is None:
            raise ValueError('No checkpoint found')
        with self._get_tf("Graph").as_default():
            saver = tf.train.Saver()
            saver.restore(self.session, last_checkpoint)

    def get_num_tasks(self):
        return len(self.outputs)

    def get_pre_q_input(self, input_layer):
        layer_name = input_layer.name
        pre_q_name = "%s_pre_q" % layer_name
        return self.layers[pre_q_name]

    @staticmethod
    def load_from_dir(model_dir):
        pickle_name = os.path.join(model_dir, "model.pickle")
        with open(pickle_name, 'rb') as fout:
            tensorgraph = pickle.load(fout)
            tensorgraph.built = False
            tensorgraph.model_dir = model_dir
            try:
                tensorgraph.restore()
            except ValueError:
                pass  # No checkpoint to load
            return tensorgraph

    def __del__(self):
        pass
Example #2
0
class TensorGraph(Model):

  def __init__(self,
               tensorboard=False,
               tensorboard_log_frequency=100,
               batch_size=100,
               random_seed=None,
               use_queue=True,
               mode="regression",
               graph=None,
               learning_rate=0.001,
               **kwargs):
    """
    TODO(LESWING) allow a model to change its learning rate
    Parameters
    ----------
    tensorboard: bool
      Should we log to model_dir data for tensorboard?
    tensorboard_log_frequency: int
      How many training batches before logging tensorboard?
    batch_size: int
      default batch size for training and evaluating
    use_queue: boolean
      if True when building we will create a tf.FIFO queue, which will hold
      all features, weights, and labels.  We will feed the inputs into this
      queue in batches of self.batch_size in a separate thread from the
      thread training the model.  You cannot use a queue when
      batches are not of consistent size
    mode: str
      "regression" or "classification".  "classification" models on
      predict will do an argmax(axis=2) to determine the class of the
      prediction.
    graph: tensorflow.Graph
      the Graph in which to create Tensorflow objects.  If None, a new Graph
      is created.
    learning_rate: float or LearningRateSchedule
      the learning rate to use for optimization
    kwargs
    """

    # Layer Management
    self.nxgraph = nx.DiGraph()
    self.layers = dict()
    self.features = list()
    self.labels = list()
    self.outputs = list()
    self.task_weights = list()
    self.loss = None
    self.built = False
    self.queue_installed = False
    self.optimizer = Adam(
        learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-7)

    # Singular place to hold Tensor objects which don't serialize
    # These have to be reconstructed on restoring from pickle
    # See TensorGraph._get_tf() for more details on lazy construction
    self.tensor_objects = {
        "FileWriter": None,
        "Graph": graph,
        "train_op": None,
        "summary_op": None,
    }
    self.tensorboard = tensorboard
    self.tensorboard_log_frequency = tensorboard_log_frequency
    self.tensorboard_step = 0
    self.mode = mode
    self.global_step = 0
    self.last_checkpoint = None
    self.use_queue = use_queue

    self.batch_size = batch_size
    self.random_seed = random_seed
    super(TensorGraph, self).__init__(**kwargs)
    self.save_file = "%s/%s" % (self.model_dir, "model")
    self.model_class = None

    self.rnn_initial_states = []
    self.rnn_final_states = []
    self.rnn_zero_states = []

  def _add_layer(self, layer):
    if layer.name is None:
      layer.name = "%s_%s" % (layer.__class__.__name__, len(self.layers) + 1)
    if layer.name in self.layers:
      return
    if isinstance(layer, Feature):
      self.features.append(layer)
    if isinstance(layer, Label):
      self.labels.append(layer)
    if isinstance(layer, Weights):
      self.task_weights.append(layer)
    self.nxgraph.add_node(layer.name)
    self.layers[layer.name] = layer
    for in_layer in layer.in_layers:
      self._add_layer(in_layer)
      self.nxgraph.add_edge(in_layer.name, layer.name)

  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          checkpoint_interval=1000):
    return self.fit_generator(
        self.default_generator(dataset, epochs=nb_epoch),
        max_checkpoints_to_keep, checkpoint_interval)

  def fit_generator(self,
                    feed_dict_generator,
                    max_checkpoints_to_keep=5,
                    checkpoint_interval=1000):

    def create_feed_dict():
      if self.use_queue:
        while True:
          yield {self._training_placeholder: 1.0}
      for d in feed_dict_generator:
        feed_dict = {k.out_tensor: v for k, v in six.iteritems(d)}
        feed_dict[self._training_placeholder] = 1.0
        yield feed_dict

    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      time1 = time.time()
      train_op = self._get_tf('train_op')
      saver = tf.train.Saver(max_to_keep=max_checkpoints_to_keep)
      with tf.Session() as sess:
        self._initialize_weights(sess, saver)
        avg_loss, n_batches = 0.0, 0.0
        coord = tf.train.Coordinator()
        n_samples = 0
        if self.use_queue:
          enqueue_thread = threading.Thread(
              target=_enqueue_batch,
              args=(self, feed_dict_generator, self._get_tf("Graph"), sess,
                    coord))
          enqueue_thread.start()
        output_tensors = [x.out_tensor for x in self.outputs]
        fetches = output_tensors + [train_op, self.loss.out_tensor]
        for feed_dict in create_feed_dict():
          try:
            fetched_values = sess.run(fetches, feed_dict=feed_dict)
            loss = fetched_values[-1]
            avg_loss += loss
            n_batches += 1
            self.global_step += 1
            n_samples += 1
            if self.tensorboard and n_samples % self.tensorboard_log_frequency == 0:
              summary = sess.run(
                  self._get_tf("summary_op"), feed_dict=feed_dict)
              self._log_tensorboard(summary)
          except OutOfRangeError:
            break
          if self.global_step % checkpoint_interval == checkpoint_interval - 1:
            saver.save(sess, self.save_file, global_step=self.global_step)
            self.last_checkpoint = saver.last_checkpoints[-1]
            avg_loss = float(avg_loss) / n_batches
            print('Ending global_step %d: Average loss %g' % (self.global_step,
                                                              avg_loss))
            avg_loss, n_batches = 0.0, 0.0
        avg_loss = float(avg_loss) / n_batches
        print('Ending global_step %d: Average loss %g' % (self.global_step,
                                                          avg_loss))
        saver.save(sess, self.save_file, global_step=self.global_step)
        self.last_checkpoint = saver.last_checkpoints[-1]
      ############################################################## TIMING
      time2 = time.time()
      print("TIMING: model fitting took %0.3f s" % (time2 - time1))
      ############################################################## TIMING

  def _log_tensorboard(self, summary):
    """
    TODO(LESWING) set epoch
    Parameters
    ----------
    Returns
    -------
    """
    global_step = int(self.global_step)
    writer = self._get_tf("FileWriter")
    writer.reopen()
    writer.add_summary(summary, global_step=global_step)
    writer.close()

  def fit_on_batch(self, X, y, w):
    if not self.built:
      self.build()
    dataset = NumpyDataset(X, y)
    return self.fit(dataset, nb_epoch=1)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        pad_batches=True):
    if len(self.features) > 1:
      raise ValueError("More than one Feature, must use generator")
    if len(self.labels) > 1:
      raise ValueError("More than one Label, must use generator")
    if len(self.task_weights) > 1:
      raise ValueError("More than one Weights, must use generator")
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=True,
          pad_batches=pad_batches):
        feed_dict = dict()
        if len(self.labels) == 1 and y_b is not None and not predict:
          feed_dict[self.labels[0]] = y_b
        if len(self.features) == 1 and X_b is not None:
          feed_dict[self.features[0]] = X_b
        if len(self.task_weights) == 1 and w_b is not None and not predict:
          feed_dict[self.task_weights[0]] = w_b
        for (initial_state, zero_state) in zip(self.rnn_initial_states,
                                               self.rnn_zero_states):
          feed_dict[initial_state] = zero_state
        yield feed_dict

  def predict_on_generator(self, generator, transformers=[]):
    """Generates output predictions for the input samples,
      processing the samples in a batched way.

    # Arguments
        x: the input data, as a Numpy array.
        batch_size: integer.
        verbose: verbosity mode, 0 or 1.

    # Returns
        A Numpy array of predictions.
    """
    retval = self.predict_proba_on_generator(generator, transformers)
    if self.mode == 'classification':
      retval = np.expand_dims(from_one_hot(retval, axis=2), axis=1)
    return retval

  def predict_proba_on_generator(self, generator, transformers=[]):
    """
    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      with tf.Session() as sess:
        saver = tf.train.Saver()
        self._initialize_weights(sess, saver)
        out_tensors = [x.out_tensor for x in self.outputs]
        results = []
        for feed_dict in generator:
          feed_dict = {
              self.layers[k.name].out_tensor: v
              for k, v in six.iteritems(feed_dict)
          }
          feed_dict[self._training_placeholder] = 0.0
          result = np.array(sess.run(out_tensors, feed_dict=feed_dict))
          if len(result.shape) == 3:
            result = np.transpose(result, axes=[1, 0, 2])
          result = undo_transforms(result, transformers)
          results.append(result)
        return np.concatenate(results, axis=0)

  def bayesian_predict_on_batch(self, X, transformers=[], n_passes=4):
    """
    Returns:
      mu: numpy ndarray of shape (n_samples, n_tasks)
      sigma: numpy ndarray of shape (n_samples, n_tasks)
    """
    dataset = NumpyDataset(X=X, y=None, n_tasks=len(self.outputs))
    y_ = []
    for i in range(n_passes):
      generator = self.default_generator(
          dataset, predict=True, pad_batches=True)
      y_.append(self.predict_on_generator(generator, transformers))

    y_ = np.concatenate(y_, axis=2)
    mu = np.mean(y_, axis=2)
    sigma = np.std(y_, axis=2)

    return mu, sigma

  def predict_on_smiles_batch(self,
                              smiles,
                              featurizer,
                              n_tasks,
                              transformers=[]):
    """
    # Returns:
      A numpy ndarray of shape (n_samples, n_tasks)
    """
    convmols = featurize_smiles_np(smiles, featurizer)

    dataset = NumpyDataset(X=convmols, y=None, n_tasks=len(self.outputs))
    generator = self.default_generator(dataset, predict=True, pad_batches=True)
    return self.predict_on_generator(generator, transformers)

  def predict_on_batch(self, X, sess=None, transformers=[]):
    """Generates output predictions for the input samples,
      processing the samples in a batched way.

    # Arguments
        x: the input data, as a Numpy array.
        batch_size: integer.
        verbose: verbosity mode, 0 or 1.

    # Returns
        A Numpy array of predictions.
    """
    dataset = NumpyDataset(X=X, y=None)
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_on_generator(generator, transformers)

  def predict_proba_on_batch(self, X, sess=None, transformers=[]):
    dataset = NumpyDataset(X=X, y=None)
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_proba_on_generator(generator, transformers)

  def predict(self, dataset, transformers=[], batch_size=None):
    """
    Uses self to make predictions on provided Dataset object.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_on_generator(generator, transformers)

  def predict_proba(self, dataset, transformers=[], batch_size=None):
    """
    TODO: Do transformers even make sense here?

    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_proba_on_generator(generator, transformers)

  def topsort(self):
    return nx.topological_sort(self.nxgraph)

  def build(self):
    if self.built:
      return
    with self._get_tf("Graph").as_default():
      self._training_placeholder = tf.placeholder(dtype=tf.float32, shape=())
      if self.random_seed is not None:
        tf.set_random_seed(self.random_seed)
      self._install_queue()
      order = self.topsort()
      for node in order:
        with tf.name_scope(node):
          node_layer = self.layers[node]
          node_layer.create_tensor(training=self._training_placeholder)
          self.rnn_initial_states += node_layer.rnn_initial_states
          self.rnn_final_states += node_layer.rnn_final_states
          self.rnn_zero_states += node_layer.rnn_zero_states
          node_layer.add_summary_to_tg()

      self.built = True

    for layer in self.layers.values():
      if layer.tensorboard:
        self.tensorboard = True
    tf.summary.scalar("loss", self.loss.out_tensor)
    for layer in self.layers.values():
      if layer.tensorboard:
        tf.summary.tensor_summary(layer.name, layer.out_tensor)
    if self.tensorboard:
      writer = self._get_tf("FileWriter")
      writer.add_graph(self._get_tf("Graph"))
      writer.close()

    # As a sanity check, make sure all tensors have the correct shape.

    for layer in self.layers.values():
      try:
        assert list(layer.shape) == layer.out_tensor.get_shape().as_list(
        ), '%s: Expected shape %s does not match actual shape %s' % (
            layer.name, layer.shape, layer.out_tensor.get_shape().as_list())
      except NotImplementedError:
        pass

  def _install_queue(self):
    """
    """
    if not self.use_queue or self.queue_installed:
      for layer in self.features + self.labels + self.task_weights:
        layer.pre_queue = True
      return
    names = []
    shapes = []
    pre_q_inputs = []
    q = InputFifoQueue(shapes, names, in_layers=pre_q_inputs)
    q.name = "%s_%s" % (q.__class__.__name__, len(self.layers) + 1)

    for layer in self.features + self.labels + self.task_weights:
      pre_q_input = layer.create_pre_q(self.batch_size)
      shapes.append(pre_q_input.shape)
      names.append(pre_q_input.name)
      pre_q_inputs.append(pre_q_input)

      layer.in_layers.append(q)
      self.nxgraph.add_edge(q.name, layer.name)

    self._add_layer(q)
    self.input_queue = q
    self.queue_installed = True

  def set_loss(self, layer):
    self._add_layer(layer)
    self.loss = layer

  def add_output(self, layer):
    self._add_layer(layer)
    self.outputs.append(layer)

  def set_optimizer(self, optimizer):
    """Set the optimizer to use for fitting."""
    self.optimizer = optimizer

  def get_pickling_errors(self, obj, seen=None):
    if seen == None:
      seen = []
    try:
      state = obj.__getstate__()
    except AttributeError:
      return
    if state == None:
      return
    if isinstance(state, tuple):
      if not isinstance(state[0], dict):
        state = state[1]
      else:
        state = state[0].update(state[1])
    result = {}
    for i in state:
      try:
        pickle.dumps(state[i], protocol=2)
      except pickle.PicklingError:
        if not state[i] in seen:
          seen.append(state[i])
          result[i] = self.get_pickling_errors(state[i], seen)
    return result

  def save(self):
    # Remove out_tensor from the object to be pickled
    must_restore = False
    tensor_objects = self.tensor_objects
    rnn_initial_states = self.rnn_initial_states
    rnn_final_states = self.rnn_final_states
    rnn_zero_states = self.rnn_zero_states
    self.tensor_objects = {}
    self.rnn_initial_states = []
    self.rnn_final_states = []
    self.rnn_zero_states = []
    out_tensors = []
    if self.built:
      must_restore = True
      for node in self.topsort():
        node_layer = self.layers[node]
        out_tensors.append(node_layer.none_tensors())
      optimizer = self.optimizer
      self.optimizer = None
      training_placeholder = self._training_placeholder
      self._training_placeholder = None
      self.built = False

    # Pickle itself
    pickle_name = os.path.join(self.model_dir, "model.pickle")
    with open(pickle_name, 'wb') as fout:
      try:
        pickle.dump(self, fout)
      except Exception as e:
        print(self.get_pickling_errors(self))
        raise e

    # add out_tensor back to everyone
    if must_restore:
      for index, node in enumerate(self.topsort()):
        node_layer = self.layers[node]
        node_layer.set_tensors(out_tensors[index])
      self._training_placeholder = training_placeholder
      self.optimizer = optimizer
      self.built = True
    self.tensor_objects = tensor_objects
    self.rnn_initial_states = rnn_initial_states
    self.rnn_final_states = rnn_final_states
    self.rnn_zero_states = rnn_zero_states

  def evaluate_generator(self,
                         feed_dict_generator,
                         metrics,
                         transformers=[],
                         labels=None,
                         outputs=None,
                         weights=[],
                         per_task_metrics=False):

    if labels is None:
      raise ValueError
    n_tasks = len(self.outputs)
    n_classes = self.outputs[0].out_tensor.get_shape()[-1].value
    evaluator = GeneratorEvaluator(
        self,
        feed_dict_generator,
        transformers,
        labels=labels,
        outputs=outputs,
        weights=weights,
        n_tasks=n_tasks,
        n_classes=n_classes)
    if not per_task_metrics:
      scores = evaluator.compute_model_performance(metrics)
      return scores
    else:
      scores, per_task_scores = evaluator.compute_model_performance(
          metrics, per_task_metrics=per_task_metrics)
      return scores, per_task_scores

  def get_layer_variables(self, layer):
    """Get the list of trainable variables in a layer of the graph."""
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      return tf.get_collection(
          tf.GraphKeys.GLOBAL_VARIABLES, scope=layer.variable_scope)

  def get_global_step(self):
    return self._get_tf("GlobalStep")

  def _get_tf(self, obj):
    """
    TODO(LESWING) REALLY NEED TO DOCUMENT THIS
    Parameters
    ----------
    obj

    Returns
    -------
    TensorFlow Object

    """

    if obj in self.tensor_objects and self.tensor_objects[obj] is not None:
      return self.tensor_objects[obj]
    if obj == "Graph":
      self.tensor_objects['Graph'] = tf.Graph()
    elif obj == "FileWriter":
      self.tensor_objects['FileWriter'] = tf.summary.FileWriter(self.model_dir)
    elif obj == 'Optimizer':
      self.tensor_objects['Optimizer'] = self.optimizer._create_optimizer(
          self._get_tf('GlobalStep'))
    elif obj == 'train_op':
      self.tensor_objects['train_op'] = self._get_tf('Optimizer').minimize(
          self.loss.out_tensor, global_step=self._get_tf('GlobalStep'))
    elif obj == 'summary_op':
      self.tensor_objects['summary_op'] = tf.summary.merge_all(
          key=tf.GraphKeys.SUMMARIES)
    elif obj == 'GlobalStep':
      with self._get_tf("Graph").as_default():
        self.tensor_objects['GlobalStep'] = tf.Variable(0, trainable=False)
    return self._get_tf(obj)

  def _initialize_weights(self, sess, saver):
    """
    Parameters
    ----------
    sess: tf.Session
      The Session must be open
    saver: tf.train.Saver
      A saver object to save/restore checkpoints

    Returns
    -------

    """
    if self.last_checkpoint is None:
      sess.run(tf.global_variables_initializer())
      saver.save(sess, self.save_file, global_step=self.global_step)
      self.last_checkpoint = saver.last_checkpoints[-1]
    else:
      saver.restore(sess, self.last_checkpoint)

  def get_num_tasks(self):
    return len(self.outputs)

  def get_pre_q_input(self, input_layer):
    layer_name = input_layer.name
    pre_q_name = "%s_pre_q" % layer_name
    return self.layers[pre_q_name]

  @staticmethod
  def load_from_dir(model_dir):
    pickle_name = os.path.join(model_dir, "model.pickle")
    with open(pickle_name, 'rb') as fout:
      tensorgraph = pickle.load(fout)
      tensorgraph.built = False
      return tensorgraph

  def __del__(self):
    pass
Example #3
0
class TensorGraph(Model):

  def __init__(self,
               tensorboard=False,
               tensorboard_log_frequency=100,
               batch_size=100,
               random_seed=None,
               use_queue=True,
               graph=None,
               learning_rate=0.001,
               configproto=None,
               **kwargs):
    """
    Parameters
    ----------
    tensorboard: bool
      Should we log to model_dir data for tensorboard?
    tensorboard_log_frequency: int
      How many training batches before logging tensorboard?
    batch_size: int
      default batch size for training and evaluating
    use_queue: boolean
      if True when building we will create a tf.FIFO queue, which will hold
      all features, weights, and labels.  We will feed the inputs into this
      queue in batches of self.batch_size in a separate thread from the
      thread training the model.  You cannot use a queue when
      batches are not of consistent size
    graph: tensorflow.Graph
      the Graph in which to create Tensorflow objects.  If None, a new Graph
      is created.
    learning_rate: float or LearningRateSchedule
      the learning rate to use for optimization
    configproto: a tf.ConfigProto() object used to create tf.Session()
    """

    # Layer Management
    self.layers = dict()
    self.features = list()
    self.labels = list()
    self.outputs = list()
    self.variances = list()
    self.task_weights = list()
    self.submodels = list()
    self.loss = Constant(0)
    self.built = False
    self.queue_installed = False
    self.optimizer = Adam(
        learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-7)
    self.configproto = configproto

    # Singular place to hold Tensor objects which don't serialize
    # These have to be reconstructed on restoring from pickle
    # See TensorGraph._get_tf() for more details on lazy construction
    self.tensor_objects = {
        "FileWriter": None,
        "Graph": graph,
        "train_op": None,
        "summary_op": None,
    }
    self.tensorboard = tensorboard
    self.tensorboard_log_frequency = tensorboard_log_frequency
    self.tensorboard_step = 0
    self.global_step = 0
    self.use_queue = use_queue

    self.batch_size = batch_size
    self.random_seed = random_seed
    super(TensorGraph, self).__init__(**kwargs)
    self.save_file = "%s/%s" % (self.model_dir, "model")
    self.model_class = None

    self.rnn_initial_states = []
    self.rnn_final_states = []
    self.rnn_zero_states = []
    if self.use_queue and self.tensorboard:
      raise ValueError(
          "Currently TensorGraph cannot both use_queue and tensorboard at the same time"
      )

  def _add_layer(self, layer):
    if layer.name is None:
      layer.name = "%s_%s" % (layer.__class__.__name__, len(self.layers) + 1)
    if layer.name in self.layers:
      return
    if isinstance(layer, Feature):
      self.features.append(layer)
    if isinstance(layer, Label):
      self.labels.append(layer)
    if isinstance(layer, Weights):
      self.task_weights.append(layer)
    self.layers[layer.name] = layer
    for in_layer in layer.in_layers:
      self._add_layer(in_layer)

  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          checkpoint_interval=1000,
          deterministic=False,
          restore=False,
          submodel=None,
          **kwargs):
    """Train this model on a dataset.

    Parameters
    ----------
    dataset: Dataset
      the Dataset to train on
    nb_epoch: int
      the number of epochs to train for
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    checkpoint_interval: int
      the frequency at which to write checkpoints, measured in training steps.
      Set this to 0 to disable automatic checkpointing.
    deterministic: bool
      if True, the samples are processed in order.  If False, a different random
      order is used for each epoch.
    restore: bool
      if True, restore the model from the most recent checkpoint and continue training
      from there.  If False, retrain the model from scratch.
    submodel: Submodel
      an alternate training objective to use.  This should have been created by
      calling create_submodel().
    """
    return self.fit_generator(
        self.default_generator(
            dataset, epochs=nb_epoch, deterministic=deterministic),
        max_checkpoints_to_keep, checkpoint_interval, restore, submodel)

  def fit_generator(self,
                    feed_dict_generator,
                    max_checkpoints_to_keep=5,
                    checkpoint_interval=1000,
                    restore=False,
                    submodel=None):
    """Train this model on data from a generator.

    Parameters
    ----------
    feed_dict_generator: generator
      this should generate batches, each represented as a dict that maps
      Layers to values.
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    checkpoint_interval: int
      the frequency at which to write checkpoints, measured in training steps.
      Set this to 0 to disable automatic checkpointing.
    restore: bool
      if True, restore the model from the most recent checkpoint and continue training
      from there.  If False, retrain the model from scratch.
    submodel: Submodel
      an alternate training objective to use.  This should have been created by
      calling create_submodel().

    Returns
    -------
    the average loss over the most recent checkpoint interval
    """
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      time1 = time.time()
      loss = self.loss
      if submodel is not None and submodel.loss is not None:
        loss = submodel.loss
      if tfe.in_eager_mode():
        # In eager mode we want an optimizer and a function to compute the
        # gradient of the loss.

        submodel_vars = None
        if submodel is None:
          optimizer = self._get_tf("Optimizer")
        else:
          optimizer = submodel.create_optimizer()
          if submodel.layers is not None:
            submodel_vars = set()
            for layer in submodel.layers:
              for var in layer.variables:
                submodel_vars.add(var)
        val_grad_fn = tfe.implicit_value_and_gradients(
            lambda x: self._run_graph([loss], x, True)[0])
      else:
        # In graph mode we want a training operation.

        if submodel is None:
          train_op = self._get_tf('train_op')
        else:
          train_op = submodel.get_train_op()
      if checkpoint_interval > 0:
        saver = tf.train.Saver(
            self.get_variables(),
            max_to_keep=max_checkpoints_to_keep,
            save_relative_paths=True)
      if restore:
        self.restore()
      avg_loss, n_averaged_batches = 0.0, 0.0
      n_samples = 0
      n_enqueued = [0]
      final_sample = [None]
      if self.queue_installed:
        enqueue_thread = threading.Thread(
            target=_enqueue_batch,
            args=(self, feed_dict_generator, self._get_tf("Graph"),
                  self.session, n_enqueued, final_sample))
        enqueue_thread.start()
      for feed_dict in self._create_feed_dicts(feed_dict_generator, True):
        if self.queue_installed:
          # Don't let this thread get ahead of the enqueue thread, since if
          # we try to read more batches than the total number that get queued,
          # this thread will hang indefinitely.
          while n_enqueued[0] <= n_samples:
            if n_samples == final_sample[0]:
              break
            time.sleep(0)
          if n_samples == final_sample[0]:
            break
        n_samples += 1
        should_log = (self.tensorboard and
                      n_samples % self.tensorboard_log_frequency == 0)
        if tfe.in_eager_mode():
          value, grads_and_vars = val_grad_fn(feed_dict)
          if submodel_vars is not None:
            grads_and_vars = [
                x for x in grads_and_vars if x[1] in submodel_vars
            ]
          optimizer.apply_gradients(grads_and_vars)
          avg_loss += value
        else:
          fetches = [train_op, loss.out_tensor]
          if should_log:
            fetches.append(self._get_tf("summary_op"))
          fetched_values = self.session.run(fetches, feed_dict=feed_dict)
          if should_log:
            self._log_tensorboard(fetched_values[2])
          avg_loss += fetched_values[1]
        n_averaged_batches += 1
        self.global_step += 1
        if checkpoint_interval > 0 and self.global_step % checkpoint_interval == checkpoint_interval - 1:
          saver.save(self.session, self.save_file, global_step=self.global_step)
          avg_loss = float(avg_loss) / n_averaged_batches
          logger.info('Ending global_step %d: Average loss %g' %
                      (self.global_step, avg_loss))
          avg_loss, n_averaged_batches = 0.0, 0.0
      if n_averaged_batches > 0:
        avg_loss = float(avg_loss) / n_averaged_batches
      if checkpoint_interval > 0:
        if n_averaged_batches > 0:
          logger.info('Ending global_step %d: Average loss %g' %
                      (self.global_step, avg_loss))
        saver.save(self.session, self.save_file, global_step=self.global_step)
        time2 = time.time()
        logger.info("TIMING: model fitting took %0.3f s" % (time2 - time1))
    return avg_loss

  def _log_tensorboard(self, summary):
    """
    TODO(LESWING) set epoch
    Parameters
    ----------
    Returns
    -------
    """
    global_step = int(self.global_step)
    writer = self._get_tf("FileWriter")
    writer.reopen()
    writer.add_summary(summary, global_step=global_step)
    writer.close()

  def fit_on_batch(self, X, y, w, submodel=None):
    if not self.built:
      self.build()
    dataset = NumpyDataset(X, y)
    return self.fit(dataset, nb_epoch=1, submodel=submodel)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    if len(self.features) > 1:
      raise ValueError("More than one Feature, must use generator")
    if len(self.labels) > 1:
      raise ValueError("More than one Label, must use generator")
    if len(self.task_weights) > 1:
      raise ValueError("More than one Weights, must use generator")
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        feed_dict = dict()
        if len(self.labels) == 1 and y_b is not None and not predict:
          feed_dict[self.labels[0]] = y_b
        if len(self.features) == 1 and X_b is not None:
          feed_dict[self.features[0]] = X_b
        if len(self.task_weights) == 1 and w_b is not None and not predict:
          feed_dict[self.task_weights[0]] = w_b
        for (initial_state, zero_state) in zip(self.rnn_initial_states,
                                               self.rnn_zero_states):
          feed_dict[initial_state] = zero_state
        yield feed_dict

  def __call__(self, *inputs, **kwargs):
    """Execute the model in eager mode to compute outputs as a function of inputs.

    This is very similar to predict_on_batch(), except that it returns the outputs
    as tensors rather than numpy arrays.  That means you can compute the graph's
    outputs, then do additional calculations based on them, and gradients will
    be tracked correctly through the whole process.

    Parameters
    ----------
    inputs: tensors
      the values to use for the model's features.  The number of inputs must
      exactly match the length of the model's `features` property.  The values
      may be tensors, numpy arrays, or anything else that can be converted to
      tensors of the correct shape.
    outputs: list of Layers
      the output layers to compute.  If this is omitted, self.outputs is used
      (that is, all outputs that have been added by calling add_output()).

    Returns
    -------
    The output tensors, or a list of tensors if multiple outputs were requested.
    """
    if len(inputs) != len(self.features):
      raise ValueError('Expected %d inputs, received %d' % len(self.features),
                       len(inputs))
    # TODO Once we drop Python 2 support, turn outputs into a proper keyword arg
    # instead of using the **kwargs hack.
    if 'outputs' in kwargs:
      outputs = kwargs['outputs']
    else:
      outputs = self.outputs
    feed_dict = dict(zip(self.features, inputs))
    results = self._run_graph(outputs, feed_dict, False)
    if len(results) == 1:
      return results[0]
    return results

  def _predict(self, generator, transformers, outputs, uncertainty):
    """
    Predict outputs for data provided by a generator.

    This is the private implementation of prediction.  Do not call it directly.
    Instead call one of the public prediction methods.

    Parameters
    ----------
    generator: Generator
      Generator that constructs feed dictionaries for TensorGraph.
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs.
      If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.
    uncertainty: bool
      specifies whether this is being called as part of estimating uncertainty.
      If True, it sets the training flag so that dropout will be enabled, and
      returns the values of the uncertainty outputs.
    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    if not self.built:
      self.build()
    if outputs is None:
      outputs = self.outputs
    elif not isinstance(outputs, collections.Sequence):
      outputs = [outputs]
    if uncertainty:
      if len(self.variances) == 0:
        raise ValueError('This model cannot compute uncertainties')
      if len(self.variances) != len(outputs):
        raise ValueError(
            'The number of variances must exactly match the number of outputs')
      tensors = outputs + self.variances
    else:
      tensors = outputs

    with self._get_tf("Graph").as_default():
      # Gather results for each output
      results = [[] for out in tensors]
      n_samples = 0
      n_enqueued = [0]
      final_sample = [None]
      if self.queue_installed:
        enqueue_thread = threading.Thread(
            target=_enqueue_batch,
            args=(self, generator, self._get_tf("Graph"), self.session,
                  n_enqueued, final_sample))
        enqueue_thread.start()
      for feed_dict in self._create_feed_dicts(generator, uncertainty):
        if self.queue_installed:
          # Don't let this thread get ahead of the enqueue thread, since if
          # we try to read more batches than the total number that get queued,
          # this thread will hang indefinitely.
          while n_enqueued[0] <= n_samples:
            if n_samples == final_sample[0]:
              break
            time.sleep(0)
          if n_samples == final_sample[0]:
            break
        n_samples += 1
        feed_results = self._run_graph(tensors, feed_dict, uncertainty)
        if tfe.in_eager_mode():
          feed_results = [f.numpy() for f in feed_results]
        if len(feed_results) > 1:
          if len(transformers):
            raise ValueError("Does not support transformations "
                             "for multiple outputs.")
        elif len(feed_results) == 1:
          result = undo_transforms(feed_results[0], transformers)
          feed_results = [result]
        for ind, result in enumerate(feed_results):
          results[ind].append(result)

      final_results = []
      for result_list in results:
        final_results.append(np.concatenate(result_list, axis=0))
      # If only one output, just return array
      if len(final_results) == 1:
        return final_results[0]
      elif uncertainty:
        return zip(final_results[:len(outputs)], final_results[len(outputs):])
      else:
        return final_results

  def predict_on_generator(self, generator, transformers=[], outputs=None):
    """
    Parameters
    ----------
    generator: Generator
      Generator that constructs feed dictionaries for TensorGraph.
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs.
      If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.
    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    return self._predict(generator, transformers, outputs, False)

  def predict_on_batch(self, X, transformers=[], outputs=None):
    """Generates predictions for input samples, processing samples in a batch.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    transformers: List
      List of dc.trans.Transformers

    Returns
    -------
    A Numpy array of predictions.
    """
    dataset = NumpyDataset(X=X, y=None)
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_on_generator(generator, transformers, outputs)

  def predict_uncertainty_on_batch(self, X, masks=50):
    """
    Predict the model's outputs, along with the uncertainty in each one.

    The uncertainty is computed as described in https://arxiv.org/abs/1703.04977.
    It involves repeating the prediction many times with different dropout masks.
    The prediction is computed as the average over all the predictions.  The
    uncertainty includes both the variation among the predicted values (epistemic
    uncertainty) and the model's own estimates for how well it fits the data
    (aleatoric uncertainty).  Not all models support uncertainty prediction.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    masks: int
      the number of dropout masks to average over

    Returns
    -------
    for each output, a tuple (y_pred, y_std) where y_pred is the predicted
    value of the output, and each element of y_std estimates the standard
    deviation of the corresponding element of y_pred
    """
    dataset = NumpyDataset(X=X, y=None)
    return self.predict_uncertainty(dataset, masks)

  def predict(self, dataset, transformers=[], outputs=None):
    """
    Uses self to make predictions on provided Dataset object.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs=self.outputs. If outputs is
      a Layer/Tensor, then will evaluate and return as a single ndarray. If
      outputs is a list of Layers/Tensors, will return a list of ndarrays.

    Returns
    -------
    results: numpy ndarray or list of numpy ndarrays
    """
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_on_generator(generator, transformers, outputs)

  def predict_uncertainty(self, dataset, masks=50):
    """
    Predict the model's outputs, along with the uncertainty in each one.

    The uncertainty is computed as described in https://arxiv.org/abs/1703.04977.
    It involves repeating the prediction many times with different dropout masks.
    The prediction is computed as the average over all the predictions.  The
    uncertainty includes both the variation among the predicted values (epistemic
    uncertainty) and the model's own estimates for how well it fits the data
    (aleatoric uncertainty).  Not all models support uncertainty prediction.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    masks: int
      the number of dropout masks to average over

    Returns
    -------
    for each output, a tuple (y_pred, y_std) where y_pred is the predicted
    value of the output, and each element of y_std estimates the standard
    deviation of the corresponding element of y_pred
    """
    sum_pred = []
    sum_sq_pred = []
    sum_var = []
    for i in range(masks):
      generator = self.default_generator(
          dataset, predict=True, pad_batches=False)
      results = self._predict(generator, [], self.outputs, True)
      if len(sum_pred) == 0:
        for p, v in results:
          sum_pred.append(p)
          sum_sq_pred.append(p * p)
          sum_var.append(v)
      else:
        for j, (p, v) in enumerate(results):
          sum_pred[j] += p
          sum_sq_pred[j] += p * p
          sum_var[j] += v
    output = []
    std = []
    for i in range(len(sum_pred)):
      p = sum_pred[i] / masks
      output.append(p)
      std.append(np.sqrt(sum_sq_pred[i] / masks - p * p + sum_var[i] / masks))
    if len(output) == 1:
      return (output[0], std[0])
    else:
      return zip(output, std)

  def topsort(self):

    def add_layers_to_list(layer, sorted_layers):
      if layer in sorted_layers:
        return
      for in_layer in layer.in_layers:
        add_layers_to_list(in_layer, sorted_layers)
      sorted_layers.append(layer)

    sorted_layers = []
    for l in self.features + self.labels + self.task_weights + self.outputs + self.variances:
      add_layers_to_list(l, sorted_layers)
    add_layers_to_list(self.loss, sorted_layers)
    for submodel in self.submodels:
      if submodel.loss is not None:
        add_layers_to_list(submodel.loss, sorted_layers)
    return sorted_layers

  def build(self):
    if self.built:
      return
    if tfe.in_eager_mode():
      # In eager mode, we need to execute every layer once to ensure its variables
      # have been created.

      def build_layers(layer, tensors):
        if layer in tensors:
          return tensors[layer]
        inputs = [build_layers(input, tensors) for input in layer.in_layers]
        if isinstance(layer, Input):
          # We can't execute Input layers in eager mode, since they would try
          # to create placeholders.  Instead create a tensor of the correct
          # size and type.
          shape = [1 if s is None else s for s in layer.shape]
          tensor = tf.zeros(shape, layer.dtype)
        else:
          with tf.name_scope(layer.name):
            tensor = layer.create_tensor(in_layers=inputs, set_tensors=False)
        tensors[layer] = tensor
        return tensor

      tensors = {}
      with self._get_tf("Graph").as_default():
        # Build the layers.

        build_layers(self.loss, tensors)
        for output in self.outputs:
          build_layers(output, tensors)
        for variance in self.variances:
          build_layers(variance, tensors)
        for submodel in self.submodels:
          build_layers(submodel.loss, tensors)

        # Initialize variables.

        for layer in self.layers.values():
          if layer.variable_values is not None:
            for var, val in zip(layer.variables, layer.variable_values):
              var.assign(val)
      self.session = None
      self._training_placeholder = None
      self.built = True
      return

    # In graph mode we need to create the computation graph.

    with self._get_tf("Graph").as_default():
      self._training_placeholder = tf.placeholder(dtype=tf.float32, shape=())
      if self.random_seed is not None:
        tf.set_random_seed(self.random_seed)
      self._install_queue()
      for layer in self.topsort():
        with tf.name_scope(layer.name):
          layer.create_tensor(training=self._training_placeholder)
          self.rnn_initial_states += layer.rnn_initial_states
          self.rnn_final_states += layer.rnn_final_states
          self.rnn_zero_states += layer.rnn_zero_states
          layer.add_summary_to_tg()
      self.session = tf.Session(config=self.configproto)
      self.built = True

      # Ensure all training operators have been created.

      self._get_tf('train_op')
      for submodel in self.submodels:
        train_op = submodel.get_train_op()

      # Initialize variables.

      self.session.run(tf.global_variables_initializer())
      for layer in self.layers.values():
        if layer.variable_values is not None:
          variables = self.get_layer_variables(layer)
          for var, val in zip(variables, layer.variable_values):
            self.session.run(var.assign(val))

    for layer in self.layers.values():
      if layer.tensorboard:
        self.tensorboard = True
    tf.summary.scalar("loss", self.loss.out_tensor)
    for layer in self.layers.values():
      if layer.tensorboard:
        tf.summary.tensor_summary(layer.name, layer.out_tensor)
    if self.tensorboard:
      writer = self._get_tf("FileWriter")
      writer.add_graph(self._get_tf("Graph"))
      writer.close()

    # As a sanity check, make sure all tensors have the correct shape.

    for layer in self.layers.values():
      try:
        assert list(layer.shape) == layer.out_tensor.get_shape().as_list(
        ), '%s: Expected shape %s does not match actual shape %s' % (
            layer.name, layer.shape, layer.out_tensor.get_shape().as_list())
      except NotImplementedError:
        pass

  def _install_queue(self):
    """
    """
    if not self.use_queue or self.queue_installed:
      for layer in self.features + self.labels + self.task_weights:
        layer.pre_queue = True
      return
    inputs = self.features + self.labels + self.task_weights
    if len(inputs) == 0:
      return
    names = []
    shapes = []
    pre_q_inputs = []
    q = InputFifoQueue(shapes, names, in_layers=pre_q_inputs)
    q.name = "%s_%s" % (q.__class__.__name__, len(self.layers) + 1)

    for layer in inputs:
      pre_q_input = layer.create_pre_q()
      shapes.append(pre_q_input.shape)
      names.append(pre_q_input.name)
      pre_q_inputs.append(pre_q_input)

      layer.in_layers.append(q)

    self._add_layer(q)
    self.input_queue = q
    self.queue_installed = True

  def set_loss(self, layer):
    self._add_layer(layer)
    self.loss = layer

  def add_output(self, layer):
    """Add an output layer that can be computed by predict()"""
    self._add_layer(layer)
    self.outputs.append(layer)

  def add_variance(self, layer):
    """Add a layer that computes the variance in an output.

    If a model supports uncertainty, it must call add_variance() once for every
    output.  Each variance layer has the same shape as the corresponding output,
    and each element computes an estimate of the variance from aleatoric
    uncertainty in the corresponding element of the output.

    In addition, if a model supports uncertainty it MUST use dropout on every
    layer.  Otherwise, the uncertainties it computes will be inaccurate.
    """
    self._add_layer(layer)
    self.variances.append(layer)

  def set_optimizer(self, optimizer):
    """Set the optimizer to use for fitting."""
    self.optimizer = optimizer

  def create_submodel(self, layers=None, loss=None, optimizer=None):
    """Create an alternate objective for training one piece of a TensorGraph.

    A TensorGraph consists of a set of layers, and specifies a loss function and
    optimizer to use for training those layers.  Usually this is sufficient, but
    there are cases where you want to train different parts of a model separately.
    For example, a GAN consists of a generator and a discriminator.  They are
    trained separately, and they use different loss functions.

    A submodel defines an alternate objective to use in cases like this.  It may
    optionally specify any of the following: a subset of layers in the model to
    train; a different loss function; and a different optimizer to use.  This
    method creates a submodel, which you can then pass to fit() to use it for
    training.

    Parameters
    ----------
    layers: list
      the list of layers to train.  If None, all layers in the model will be
      trained.
    loss: Layer
      the loss function to optimize.  If None, the model's main loss function
      will be used.
    optimizer: Optimizer
      the optimizer to use for training.  If None, the model's main optimizer
      will be used.

    Returns
    -------
    the newly created submodel, which can be passed to any of the fitting
    methods.
    """
    if self.built:
      raise ValueError('Submodels must be created before build() is called.')
    submodel = Submodel(self, layers, loss, optimizer)
    self.submodels.append(submodel)
    if loss is not None:
      self._add_layer(loss)
    return submodel

  def get_pickling_errors(self, obj, seen=None):
    if seen == None:
      seen = []
    try:
      state = obj.__getstate__()
    except AttributeError:
      return
    if state == None:
      return
    if isinstance(state, tuple):
      if not isinstance(state[0], dict):
        state = state[1]
      else:
        state = state[0].update(state[1])
    result = {}
    for i in state:
      try:
        pickle.dumps(state[i], protocol=2)
      except pickle.PicklingError:
        if not state[i] in seen:
          seen.append(state[i])
          result[i] = self.get_pickling_errors(state[i], seen)
    return result

  def save(self):
    # Remove out_tensor from the object to be pickled
    must_restore = False
    tensor_objects = self.tensor_objects
    rnn_initial_states = self.rnn_initial_states
    rnn_final_states = self.rnn_final_states
    rnn_zero_states = self.rnn_zero_states
    session = self.session
    self.tensor_objects = {}
    self.rnn_initial_states = []
    self.rnn_final_states = []
    self.rnn_zero_states = []
    self.session = None
    out_tensors = []
    submodel_ops = []
    if self.built:
      must_restore = True
      for layer in self.topsort():
        out_tensors.append(layer.none_tensors())
      for submodel in self.submodels:
        submodel_ops.append(submodel._train_op)
        submodel._train_op = None
      training_placeholder = self._training_placeholder
      self._training_placeholder = None
      self.built = False

    # Pickle itself
    pickle_name = os.path.join(self.model_dir, "model.pickle")

    with open(pickle_name, 'wb') as fout:
      try:
        pickle.dump(self, fout)
      except Exception as e:
        logger.info(self.get_pickling_errors(self))
        raise e

    # add out_tensor back to everyone
    if must_restore:
      for index, layer in enumerate(self.topsort()):
        layer.set_tensors(out_tensors[index])
      for submodel, op in zip(self.submodels, submodel_ops):
        submodel._train_op = op
      self._training_placeholder = training_placeholder
      self.built = True
    self.tensor_objects = tensor_objects
    self.rnn_initial_states = rnn_initial_states
    self.rnn_final_states = rnn_final_states
    self.rnn_zero_states = rnn_zero_states
    self.session = session

  def evaluate_generator(self,
                         feed_dict_generator,
                         metrics,
                         transformers=[],
                         labels=None,
                         outputs=None,
                         weights=[],
                         per_task_metrics=False):

    if labels is None:
      raise ValueError
    n_tasks = len(self.outputs)
    n_classes = self.outputs[0].out_tensor.get_shape()[-1].value
    evaluator = GeneratorEvaluator(
        self,
        feed_dict_generator,
        transformers,
        labels=labels,
        outputs=outputs,
        weights=weights,
        n_tasks=n_tasks,
        n_classes=n_classes)
    if not per_task_metrics:
      scores = evaluator.compute_model_performance(metrics)
      return scores
    else:
      scores, per_task_scores = evaluator.compute_model_performance(
          metrics, per_task_metrics=per_task_metrics)
      return scores, per_task_scores

  def get_layer_variables(self, layer):
    """Get the list of trainable variables in a layer of the graph."""
    if tfe.in_eager_mode():
      return layer.variables
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      if layer.variable_scope == '':
        return []
      return tf.get_collection(
          tf.GraphKeys.TRAINABLE_VARIABLES, scope=layer.variable_scope)

  def get_variables(self):
    """Get the list of all trainable variables in the graph."""
    if not self.built:
      self.build()
    if tfe.in_eager_mode():
      variables = []
      for layer in self.layers.values():
        variables += layer.variables
      return variables
    else:
      with self._get_tf("Graph").as_default():
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

  def get_global_step(self):
    return self._get_tf("GlobalStep")

  def _get_tf(self, obj):
    """Fetches underlying TensorFlow primitives.

    Parameters
    ----------
    obj: str
      If "Graph", returns tf.Graph instance. If "FileWriter", returns
      tf.summary.FileWriter. If "Optimizer", returns the optimizer. If
      "train_op", returns the train operation. If "summary_op", returns the
      merged summary. If "GlobalStep" returns the global step.
    Returns
    -------
    TensorFlow Object

    """

    if obj in self.tensor_objects and self.tensor_objects[obj] is not None:
      return self.tensor_objects[obj]
    if obj == "Graph":
      self.tensor_objects['Graph'] = tf.Graph()
    elif obj == "FileWriter":
      self.tensor_objects['FileWriter'] = tf.summary.FileWriter(self.model_dir)
    elif obj == 'Optimizer':
      self.tensor_objects['Optimizer'] = self.optimizer._create_optimizer(
          self._get_tf('GlobalStep'))
    elif obj == 'train_op':
      opt = self._get_tf('Optimizer')
      global_step = self._get_tf('GlobalStep')
      try:
        self.tensor_objects['train_op'] = opt.minimize(
            self.loss.out_tensor, global_step=global_step)
      except ValueError:
        # The loss doesn't depend on any variables.
        self.tensor_objects['train_op'] = 0
    elif obj == 'summary_op':
      self.tensor_objects['summary_op'] = tf.summary.merge_all(
          key=tf.GraphKeys.SUMMARIES)
    elif obj == 'GlobalStep':
      with self._get_tf("Graph").as_default():
        self.tensor_objects['GlobalStep'] = create_variable(0, trainable=False)
    return self._get_tf(obj)

  def save_checkpoint(self, max_checkpoints_to_keep=5):
    """Save a checkpoint to disk.

    Usually you do not need to call this method, since fit() saves checkpoints
    automatically.  If you have disabled automatic checkpointing during fitting,
    this can be called to manually write checkpoints.

    Parameters
    ----------
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    """
    saver = tf.train.Saver(
        self.get_variables(), max_to_keep=max_checkpoints_to_keep)
    saver.save(self.session, self.save_file, global_step=self.global_step)

  def get_checkpoints(self):
    """Get a list of all available checkpoint files."""
    return tf.train.get_checkpoint_state(
        self.model_dir).all_model_checkpoint_paths

  def restore(self, checkpoint=None):
    """Reload the values of all variables from a checkpoint file.

    Parameters
    ----------
    checkpoint: str
      the path to the checkpoint file to load.  If this is None, the most recent
      checkpoint will be chosen automatically.  Call get_checkpoints() to get a
      list of all available checkpoints.
    """
    if not self.built:
      self.build()
    if checkpoint is None:
      checkpoint = tf.train.latest_checkpoint(self.model_dir)
    if checkpoint is None:
      raise ValueError('No checkpoint found')
    with self._get_tf("Graph").as_default():
      reader = NewCheckpointReader(checkpoint)
      var_names = set([x for x in reader.get_variable_to_shape_map()])
      var_list = []
      for var in self.get_variables():
        name = var.name
        if ':' in name:
          name = name[:name.rfind(':')]
        if name in var_names:
          var_list.append(var)
      saver = tf.train.Saver(var_list=var_list)
      saver.restore(self.session, checkpoint)

  def get_num_tasks(self):
    return len(self.outputs)

  def get_pre_q_input(self, input_layer):
    layer_name = input_layer.name
    pre_q_name = "%s_pre_q" % layer_name
    return self.layers[pre_q_name]

  @staticmethod
  def load_from_dir(model_dir, restore=True):
    pickle_name = os.path.join(model_dir, "model.pickle")
    with open(pickle_name, 'rb') as fout:
      tensorgraph = pickle.load(fout)
      tensorgraph.built = False
      tensorgraph.model_dir = model_dir
      if restore:
        try:
          tensorgraph.restore()
        except ValueError:
          pass  # No checkpoint to load
      return tensorgraph

  def __del__(self):
    pass

  def _create_feed_dicts(self, generator, training):
    """Create feed dicts for use in fitting or prediction.

    Parameters
    ----------
    generator: Generator
      the feed dict generator that was passed to fit_generator() or predict_on_generator()
    training: bool
      True during training, False during prediction
    """
    train_value = 1.0 if training else 0.0
    if self.queue_installed:
      while True:
        yield {self._training_placeholder: train_value}
    else:
      for d in generator:
        feed_dict = {}
        for key, value in d.items():
          if isinstance(key, Input):
            value = _ensure_value_shape(value, key)
            if tfe.in_eager_mode():
              value = tf.cast(value, key.dtype)
            feed_dict[key] = value
          else:
            feed_dict[key] = value
        if not tfe.in_eager_mode():
          feed_dict[self._training_placeholder] = train_value
        yield feed_dict

  def _run_graph(self, outputs, feed_dict, training):
    """Run the calculations in the graph to compute some outputs.

    In graph mode, this just calls session.run().  In eager mode, it executes
    all required layers to compute the output.

    Parameters
    ----------
    outputs: list of Layers
      the output layers to compute
    feed_dict: dict
      maps input layers to values
    training: bool
      whether this is being executed in training mode
    """
    if not tfe.in_eager_mode():
      return self.session.run(outputs, feed_dict)

    def run_layers(layer, tensors):
      if layer in tensors:
        return tensors[layer]
      inputs = [run_layers(input, tensors) for input in layer.in_layers]
      tensor = layer.create_tensor(
          in_layers=inputs, set_tensors=False, training=training)
      tensors[layer] = tensor
      return tensor

    tensors = feed_dict.copy()
    return [run_layers(o, tensors) for o in outputs]

  def make_estimator(self,
                     feature_columns,
                     weight_column=None,
                     metrics={},
                     model_dir=None,
                     config=None):
    """Construct a Tensorflow Estimator from this model.

    tf.estimator.Estimator is the standard Tensorflow API for representing models.
    This method provides interoperability between DeepChem and other Tensorflow
    based tools by allowing any model to be used an Estimator.

    Once this method returns, the Estimator it created is independent of the model
    it was created from.  They do not share tensors, variables, save files, or any
    other resources.  The Estimator is a self contained object with its own methods
    for training, evaluation, prediction, checkpointing, etc.

    Parameters
    ----------
    feature_columns: list of tf.feature_column objects
      this describes the input features to the models.  There must be one entry
      for each Feature layer in this model's features field.
    weight_column: tf.feature_column or None
      if this model includes a Weights layer, this describes the input weights.
      Otherwise, this should be None.
    metrics: map
      metrics that should be computed in calls to evaluate().  For each entry,
      the key is the name to report for the metric, and the value is a function
      of the form f(labels, predictions, weights) that returns the tensors for
      computing the metric.  Any of the functions in tf.metrics can be used, as
      can other functions that satisfy the same interface.
    model_dir: str
      the directory in which the Estimator should save files.  If None, this
      defaults to the model's model_dir.
    config: RunConfig
      configuration options for the Estimator
    """
    # Check the inputs.

    if tfe.in_eager_mode():
      raise ValueError('make_estimator() is not supported in eager mode')
    if len(feature_columns) != len(self.features):
      raise ValueError(
          'This model requires %d feature column(s)' % len(self.features))
    if len(self.labels) != 1:
      raise ValueError(
          'Can only create an Estimator from a model with exactly one Label input'
      )
    if len(self.task_weights) > 1:
      raise ValueError(
          'Cannot create an Estimator from a model with multiple Weight inputs')
    if weight_column is None:
      if len(self.task_weights) > 0:
        raise ValueError('This model requires a weight column')
    else:
      if len(self.task_weights) == 0:
        raise ValueError(
            'Cannot specify weight_column for a model with no Weight inputs')
    if model_dir is None:
      model_dir = self.model_dir

    # Define a function that recursively creates tensors from layers.

    def create_tensors(layer, tensors, training):
      if layer in tensors:
        return tensors[layer]
      inputs = [
          create_tensors(in_layer, tensors, training)
          for in_layer in layer.in_layers
      ]
      tensor = layer.create_tensor(
          in_layers=inputs, set_tensors=False, training=training)
      tensors[layer] = tensor
      layer.add_summary_to_tg(tensor)
      return tensor

    # Define the model function.

    def model_fn(features, labels, mode):
      # Define the inputs.

      tensors = self.create_estimator_inputs(feature_columns, weight_column,
                                             features, labels, mode)
      for layer, tensor in tensors.items():
        layer.add_summary_to_tg(tensor)

      # Create the correct outputs, based on the mode.

      if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {}
        for i, output in enumerate(self.outputs):
          predictions[i] = create_tensors(output, tensors, 0)
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
      if mode == tf.estimator.ModeKeys.EVAL:
        loss = create_tensors(self.loss, tensors, 0)
        predictions = create_tensors(self.outputs[0], tensors, 0)
        if len(self.task_weights) == 0:
          weights = None
        else:
          weights = tensors[self.task_weights[0]]
        eval_metric_ops = {}
        for name, function in metrics.items():
          eval_metric_ops[name] = function(tensors[self.labels[0]], predictions,
                                           weights)
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=eval_metric_ops)
      if mode == tf.estimator.ModeKeys.TRAIN:
        loss = create_tensors(self.loss, tensors, 1)
        global_step = tf.train.get_global_step()
        optimizer = self.optimizer._create_optimizer(global_step)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
      raise ValueError('Unknown mode')

    # Create the Estimator.

    return tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, config=config)

  def create_estimator_inputs(self, feature_columns, weight_column, features,
                              labels, mode):
    """This is called by make_estimator() to create tensors for the inputs.

    feature_columns and weight_column are the arguments passed to
    make_estimator().  features, labels, and mode are the arguments passed to
    the estimator's model function.  This method creates and returns a dict with
    one entry for every Feature, Label, or Weights layer in the graph.  The keys
    are the layers, and the values are the tensors that correspond to them.

    Any subclass that overrides default_generator() must also override this
    method.
    """
    if self.__class__.default_generator != TensorGraph.default_generator:
      raise ValueError(
          "Class overrides default_generator() but not create_estimator_inputs()"
      )
    tensors = {}
    for layer, column in zip(self.features, feature_columns):
      tensors[layer] = tf.feature_column.input_layer(features, [column])
    if weight_column is not None:
      tensors[self.task_weights[0]] = tf.feature_column.input_layer(
          features, [weight_column])
    if labels is not None:
      tensors[self.labels[0]] = tf.cast(labels, self.labels[0].dtype)
    return tensors
Example #4
0
class KerasModel(Model):
  """This is a DeepChem model implemented by a Keras model.

  This class provides several advantages over using the Keras model's fitting
  and prediction methods directly.

  1. It provides better integration with the rest of DeepChem, such as direct
     support for Datasets and Transformers.

  2. It defines the loss in a more flexible way.  In particular, Keras does not
     support multidimensional weight matrices, which makes it impossible to
     implement most multitask models with Keras.

  3. It provides various additional features not found in the Keras Model class,
     such as uncertainty prediction and saliency mapping.

  The loss function for a model can be defined in two different ways.  For
  models that have only a single output and use a standard loss function, you
  can simply provide a dc.models.losses.Loss object.  This defines the loss for
  each sample or sample/task pair.  The result is automatically multiplied by
  the weights and averaged over the batch.  Any additional losses computed by
  model layers, such as weight decay penalties, are also added.

  For more complicated cases, you can instead provide a function that directly
  computes the total loss.  It must be of the form f(outputs, labels, weights),
  taking the list of outputs from the model, the expected values, and any weight
  matrices.  It should return a scalar equal to the value of the loss function
  for the batch.  No additional processing is done to the result; it is up to
  you to do any weighting, averaging, adding of penalty terms, etc.

  You can optionally provide an output_types argument, which describes how to
  interpret the model's outputs.  This should be a list of strings, one for each
  output.  Each entry must have one of the following values:

  - 'prediction': This is a normal output, and will be returned by predict().
    If output types are not specified, all outputs are assumed to be of this
    type.

  - 'loss': This output will be used in place of the normal outputs for
    computing the loss function.  For example, models that output probability
    distributions usually do it by computing unbounded numbers (the logits),
    then passing them through a softmax function to turn them into
    probabilities.  When computing the cross entropy, it is more numerically
    stable to use the logits directly rather than the probabilities.  You can
    do this by having the model produce both probabilities and logits as
    outputs, then specifying output_types=['prediction', 'loss'].  When
    predict() is called, only the first output (the probabilities) will be
    returned.  But during training, it is the second output (the logits) that
    will be passed to the loss function.

  - 'variance': This output is used for estimating the uncertainty in another
    output.  To create a model that can estimate uncertainty, there must be the
    same number of 'prediction' and 'variance' outputs.  Each variance output
    must have the same shape as the corresponding prediction output, and each
    element is an estimate of the variance in the corresponding prediction.
    Also be aware that if a model supports uncertainty, it MUST use dropout on
    every layer.  Otherwise, the uncertainties it computes will be inaccurate.
  """

  def __init__(self,
               model,
               loss,
               output_types=None,
               batch_size=100,
               model_dir=None,
               learning_rate=0.001,
               optimizer=None,
               tensorboard=False,
               tensorboard_log_frequency=100,
               **kwargs):
    """Create a new KerasModel.

    Parameters
    ----------
    model: tf.keras.Model
      the Keras model implementing the calculation
    loss: dc.models.losses.Loss or function
      a Loss or function defining how to compute the training loss for each
      batch, as described above
    output_types: list of strings
      the type of each output from the model, as described above
    batch_size: int
      default batch size for training and evaluating
    model_dir: str
      the directory on disk where the model will be stored.  If this is None,
      a temporary directory is created.
    learning_rate: float or LearningRateSchedule
      the learning rate to use for fitting.  If optimizer is specified, this is
      ignored.
    optimizer: Optimizer
      the optimizer to use for fitting.  If this is specified, learning_rate is
      ignored.
    tensorboard: bool
      whether to log progress to TensorBoard during training
    tensorboard_log_frequency: int
      the frequency at which to log data to TensorBoard, measured in batches
    """
    super(KerasModel, self).__init__(
        model_instance=model, model_dir=model_dir, **kwargs)
    self.model = model
    if isinstance(loss, Loss):
      self._loss_fn = _StandardLoss(model, loss)
    else:
      self._loss_fn = loss
    self.batch_size = batch_size
    if optimizer is None:
      self.optimizer = Adam(learning_rate=learning_rate)
    else:
      self.optimizer = optimizer
    self.tensorboard = tensorboard
    self.tensorboard_log_frequency = tensorboard_log_frequency
    self._tensorboard_step = 0
    if tensorboard and tf.executing_eagerly():
      raise ValueError(
          "Logging to TensorBoard is not currently supported in eager mode")
    if output_types is None:
      self._prediction_outputs = None
      self._loss_outputs = None
      self._variance_outputs = None
    else:
      self._prediction_outputs = []
      self._loss_outputs = []
      self._variance_outputs = []
      for i, type in enumerate(output_types):
        if type == 'prediction':
          self._prediction_outputs.append(i)
        elif type == 'loss':
          self._loss_outputs.append(i)
        elif type == 'variance':
          self._variance_outputs.append(i)
        else:
          raise ValueError('Unknown output type "%s"' % type)
    self._built = False
    self._inputs_built = False
    self._training_ops_built = False
    self._initialized_vars = set()

  def _ensure_built(self):
    """The first time this is called, create internal data structures."""
    if self._built:
      return
    self._built = True
    if not tf.executing_eagerly():
      self.session = tf.Session()
    self._global_step = tf.Variable(0, trainable=False)
    self._tf_optimizer = self.optimizer._create_optimizer(self._global_step)
    self._checkpoint = tf.train.Checkpoint(
        optimizer=self._tf_optimizer, model=self.model)
    self._init_new_vars()

  def _create_inputs(self, example_inputs):
    """The first time this is called, create tensors representing the inputs and outputs."""
    if self._inputs_built:
      return
    self._ensure_built()
    self._inputs_built = True
    if len(self.model.inputs) > 0:
      self._input_dtypes = [t.dtype.as_numpy_dtype for t in self.model.inputs]
    else:
      self._input_dtypes = [
          np.float32 if x.dtype == np.float64 else x.dtype
          for x in example_inputs
      ]
    if tf.executing_eagerly():
      return
    if len(self.model.inputs) > 0:
      self._input_placeholders = self.model.inputs
    else:
      # The model doesn't specify inputs, so guess the input shapes based on the
      # example batch.
      input_shapes = [(None,) + i.shape[1:] for i in example_inputs]
      self._input_placeholders = [
          tf.placeholder(dtype=tf.as_dtype(t), shape=s)
          for s, t in zip(input_shapes, self._input_dtypes)
      ]
      if len(input_shapes) == 1:
        self.model.build(input_shapes[0])
      else:
        self.model.build(input_shapes)
    if len(self._input_placeholders) == 1:
      self._output_tensors = self.model(
          self._input_placeholders[0], training=False)
      self._uncertainty_tensors = self.model(
          self._input_placeholders[0], training=True)
    else:
      self._output_tensors = self.model(
          self._input_placeholders, training=False)
      self._uncertainty_tensors = self.model(
          self._input_placeholders, training=True)
    if isinstance(self._output_tensors, tf.Tensor):
      self._output_tensors = [self._output_tensors]
    if self._prediction_outputs is None:
      self._prediction_outputs = list(range(len(self._output_tensors)))
      self._loss_outputs = list(range(len(self._output_tensors)))
    self._init_new_vars()

  def _create_training_ops(self, example_batch):
    """The first time this is called, create tensors used in optimization."""
    if self._training_ops_built:
      return
    self._create_inputs(example_batch[0])
    self._training_ops_built = True
    self._label_dtypes = [
        np.float32 if x.dtype == np.float64 else x.dtype
        for x in example_batch[1]
    ]
    self._weights_dtypes = [
        np.float32 if x.dtype == np.float64 else x.dtype
        for x in example_batch[2]
    ]
    if tf.executing_eagerly():
      return
    self._label_placeholders = [
        tf.placeholder(dtype=tf.as_dtype(t), shape=x.shape)
        for x, t in zip(example_batch[1], self._label_dtypes)
    ]
    self._weights_placeholders = [
        tf.placeholder(dtype=tf.as_dtype(t), shape=x.shape)
        for x, t in zip(example_batch[2], self._weights_dtypes)
    ]
    self._loss_tensor = self._loss_fn(
        [self._output_tensors[i] for i in self._loss_outputs],
        self._label_placeholders, self._weights_placeholders)
    try:
      self._train_op = self._tf_optimizer.minimize(
          self._loss_tensor, global_step=self._global_step)
    except ValueError:
      # The loss doesn't depend on any variables.
      self._train_op = 0
    if self.tensorboard:
      self._summary_ops = tf.summary.scalar('loss', self._loss_tensor)
      self._summary_writer = tf.summary.FileWriter(self.model_dir)
    self._init_new_vars()

  def _init_new_vars(self):
    """Initialize any new variables created since the last call to this method."""
    if not tf.executing_eagerly():
      vars = set(tf.global_variables())
      new_vars = vars.difference(self._initialized_vars)
      self.session.run(tf.variables_initializer(new_vars))
      self._initialized_vars = vars

  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          checkpoint_interval=1000,
          deterministic=False,
          restore=False):
    """Train this model on a dataset.

    Parameters
    ----------
    dataset: Dataset
      the Dataset to train on
    nb_epoch: int
      the number of epochs to train for
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    checkpoint_interval: int
      the frequency at which to write checkpoints, measured in training steps.
      Set this to 0 to disable automatic checkpointing.
    deterministic: bool
      if True, the samples are processed in order.  If False, a different random
      order is used for each epoch.
    restore: bool
      if True, restore the model from the most recent checkpoint and continue training
      from there.  If False, retrain the model from scratch.
   """
    return self.fit_generator(
        self.default_generator(
            dataset, epochs=nb_epoch, deterministic=deterministic),
        max_checkpoints_to_keep, checkpoint_interval, restore)

  def fit_generator(self,
                    generator,
                    max_checkpoints_to_keep=5,
                    checkpoint_interval=1000,
                    restore=False):
    """Train this model on data from a generator.

    Parameters
    ----------
    generator: generator
      this should generate batches, each represented as a tuple of the form
      (inputs, labels, weights).
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    checkpoint_interval: int
      the frequency at which to write checkpoints, measured in training steps.
      Set this to 0 to disable automatic checkpointing.
    restore: bool
      if True, restore the model from the most recent checkpoint and continue training
      from there.  If False, retrain the model from scratch.

    Returns
    -------
    the average loss over the most recent checkpoint interval
    """
    self._ensure_built()
    if restore:
      self.restore()
    if checkpoint_interval > 0:
      manager = tf.train.CheckpointManager(self._checkpoint, self.model_dir,
                                           max_checkpoints_to_keep)
    avg_loss = 0.0
    averaged_batches = 0
    time1 = time.time()

    # Main training loop.

    for batch in generator:
      self._create_training_ops(batch)
      inputs, labels, weights = self._prepare_batch(batch)
      self._tensorboard_step += 1
      should_log = (
          self.tensorboard and
          self._tensorboard_step % self.tensorboard_log_frequency == 0)
      if tf.executing_eagerly():

        # In eager mode we execute the loss function, accumulating the gradients.

        with tf.GradientTape() as tape:
          outputs = self.model(inputs[0])
          if isinstance(outputs, tf.Tensor):
            outputs = [outputs]
          if self._loss_outputs is not None:
            outputs = [outputs[i] for i in self._loss_outputs]
          loss = self._loss_fn(outputs, labels, weights)
          avg_loss += loss
          grads = tape.gradient(loss, self.model.trainable_variables)
          self._tf_optimizer.apply_gradients(
              zip(grads, self.model.trainable_variables))
          tf.assign_add(self._global_step, 1)
          current_step = self._global_step.numpy()
      else:

        # In graph mode we execute the training op.

        fetches = [self._train_op, self._loss_tensor, self._global_step]
        if should_log:
          fetches.append(self._summary_ops)
        feed_dict = dict(zip(self._input_placeholders, inputs))
        feed_dict.update(dict(zip(self._label_placeholders, labels)))
        feed_dict.update(dict(zip(self._weights_placeholders, weights)))
        fetched_values = self.session.run(fetches, feed_dict=feed_dict)
        avg_loss += fetched_values[1]
        current_step = fetched_values[2]
        if should_log:
          self._summary_writer.reopen()
          self._summary_writer.add_summary(
              fetched_values[3], global_step=current_step)
          self._summary_writer.close()

      # Report progress and write checkpoints.

      averaged_batches += 1
      if checkpoint_interval > 0 and current_step % checkpoint_interval == checkpoint_interval - 1:
        self._exec_with_session(lambda: manager.save())
        avg_loss = float(avg_loss) / averaged_batches
        print(
            'Ending global_step %d: Average loss %g' % (current_step, avg_loss))
        avg_loss = 0.0
        averaged_batches = 0

    # Report final results.

    if checkpoint_interval > 0:
      if averaged_batches > 0:
        avg_loss = float(avg_loss) / averaged_batches
        print(
            'Ending global_step %d: Average loss %g' % (current_step, avg_loss))
      self._exec_with_session(lambda: manager.save())
    time2 = time.time()
    print("TIMING: model fitting took %0.3f s" % (time2 - time1))
    return avg_loss

  def fit_on_batch(self, X, y, w):
    """Perform a single step of training.

    Parameters
    ----------
    X: ndarray
      the inputs for the batch
    y: ndarray
      the labels for the batch
    w: ndarray
      the weights for the batch
   """
    if not self.built:
      self.build()
    dataset = NumpyDataset(X, y, w)
    return self.fit(dataset, nb_epoch=1)

  def _predict(self, generator, transformers, uncertainty):
    """
    Predict outputs for data provided by a generator.

    This is the private implementation of prediction.  Do not call it directly.
    Instead call one of the public prediction methods.

    Parameters
    ----------
    generator: generator
      this should generate batches, each represented as a tuple of the form
      (inputs, labels, weights).
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.
    uncertainty: bool
      specifies whether this is being called as part of estimating uncertainty.
      If True, it sets the training flag so that dropout will be enabled, and
      returns the values of the uncertainty outputs.
    Returns:
      a NumPy array of the model produces a single output, or a list of arrays
      if it produces multiple outputs
    """
    results = None
    variances = None
    if uncertainty:
      if self._variance_outputs is None or len(self._variance_outputs) == 0:
        raise ValueError('This model cannot compute uncertainties')
      if len(self._variance_outputs) != len(self._prediction_outputs):
        raise ValueError(
            'The number of variances must exactly match the number of outputs')
    for batch in generator:
      inputs, labels, weights = batch
      self._create_inputs(inputs)
      inputs, _, _ = self._prepare_batch((inputs, None, None))
      if tf.executing_eagerly():

        # In eager mode we invoke the model directly.

        if len(inputs) == 1:
          inputs = inputs[0]
        outputs = self.model(inputs, training=uncertainty)
        outputs = [t.numpy() for t in outputs]
      else:

        # In graph mode we execute the output tensors.

        if uncertainty:
          fetches = self._uncertainty_tensors
        else:
          fetches = self._output_tensors
        feed_dict = dict(zip(self._input_placeholders, inputs))
        outputs = self.session.run(fetches, feed_dict=feed_dict)

      # Apply tranformers and record results.

      if uncertainty:
        var = [outputs[i] for i in self._variance_outputs]
        if variances is None:
          variances = var
        else:
          for i, t in enumerate(var):
            variances[i].append(t)
      if self._prediction_outputs is not None:
        outputs = [outputs[i] for i in self._prediction_outputs]
      if len(transformers) > 0:
        if len(outputs) > 1:
          raise ValueError(
              "predict() does not support Transformers for models with multiple outputs."
          )
        elif len(outputs) == 1:
          outputs = [undo_transforms(outputs[0], transformers)]
      if results is None:
        results = [outputs]
      else:
        for i, t in enumerate(outputs):
          results[i].append(t)

    # Concatenate arrays to create the final results.

    final_results = []
    final_variances = []
    for r in results:
      final_results.append(np.concatenate(r, axis=0))
    if uncertainty:
      for v in variances:
        final_variances.append(np.concatenate(v, axis=0))
      return zip(final_results, final_variances)
    # If only one output, just return array
    if len(final_results) == 1:
      return final_results[0]
    else:
      return final_results

  def predict_on_generator(self, generator, transformers=[]):
    """
    Parameters
    ----------
    generator: generator
      this should generate batches, each represented as a tuple of the form
      (inputs, labels, weights).
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.
    Returns:
      a NumPy array of the model produces a single output, or a list of arrays
      if it produces multiple outputs
    """
    return self._predict(generator, transformers, False)

  def predict_on_batch(self, X, transformers=[]):
    """Generates predictions for input samples, processing samples in a batch.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    a NumPy array of the model produces a single output, or a list of arrays
    if it produces multiple outputs
    """
    dataset = NumpyDataset(X=X, y=None)
    return self.predict(dataset, transformers)

  def predict_uncertainty_on_batch(self, X, masks=50):
    """
    Predict the model's outputs, along with the uncertainty in each one.

    The uncertainty is computed as described in https://arxiv.org/abs/1703.04977.
    It involves repeating the prediction many times with different dropout masks.
    The prediction is computed as the average over all the predictions.  The
    uncertainty includes both the variation among the predicted values (epistemic
    uncertainty) and the model's own estimates for how well it fits the data
    (aleatoric uncertainty).  Not all models support uncertainty prediction.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    masks: int
      the number of dropout masks to average over

    Returns
    -------
    for each output, a tuple (y_pred, y_std) where y_pred is the predicted
    value of the output, and each element of y_std estimates the standard
    deviation of the corresponding element of y_pred
    """
    dataset = NumpyDataset(X=X, y=None)
    return self.predict_uncertainty(dataset, masks)

  def predict(self, dataset, transformers=[]):
    """
    Uses self to make predictions on provided Dataset object.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.

    Returns
    -------
    a NumPy array of the model produces a single output, or a list of arrays
    if it produces multiple outputs
    """
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_on_generator(generator, transformers)

  def predict_uncertainty(self, dataset, masks=50):
    """
    Predict the model's outputs, along with the uncertainty in each one.

    The uncertainty is computed as described in https://arxiv.org/abs/1703.04977.
    It involves repeating the prediction many times with different dropout masks.
    The prediction is computed as the average over all the predictions.  The
    uncertainty includes both the variation among the predicted values (epistemic
    uncertainty) and the model's own estimates for how well it fits the data
    (aleatoric uncertainty).  Not all models support uncertainty prediction.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    masks: int
      the number of dropout masks to average over

    Returns
    -------
    for each output, a tuple (y_pred, y_std) where y_pred is the predicted
    value of the output, and each element of y_std estimates the standard
    deviation of the corresponding element of y_pred
    """
    sum_pred = []
    sum_sq_pred = []
    sum_var = []
    for i in range(masks):
      generator = self.default_generator(
          dataset, predict=True, pad_batches=False)
      results = self._predict(generator, [], True)
      if len(sum_pred) == 0:
        for p, v in results:
          sum_pred.append(p)
          sum_sq_pred.append(p * p)
          sum_var.append(v)
      else:
        for j, (p, v) in enumerate(results):
          sum_pred[j] += p
          sum_sq_pred[j] += p * p
          sum_var[j] += v
    output = []
    std = []
    for i in range(len(sum_pred)):
      p = sum_pred[i] / masks
      output.append(p)
      std.append(np.sqrt(sum_sq_pred[i] / masks - p * p + sum_var[i] / masks))
    if len(output) == 1:
      return (output[0], std[0])
    else:
      return zip(output, std)

  def evaluate_generator(self,
                         generator,
                         metrics,
                         transformers=[],
                         per_task_metrics=False):
    """Evaluate the performance of this model on the data produced by a generator.

    Parameters
    ----------
    generator: generator
      this should generate batches, each represented as a tuple of the form
      (inputs, labels, weights).
    metric: deepchem.metrics.Metric
      Evaluation metric
    transformers: list of dc.trans.Transformers
      Transformers that the input data has been transformed by.  The output
      is passed through these transformers to undo the transformations.
    per_task_metrics: bool
      If True, return per-task scores.

    Returns
    -------
    dict
      Maps tasks to scores under metric.
    """
    evaluator = GeneratorEvaluator(self, generator, transformers)
    return evaluator.compute_model_performance(metrics, per_task_metrics)

  def compute_saliency(self, X):
    """Compute the saliency map for an input sample.

    This computes the Jacobian matrix with the derivative of each output element
    with respect to each input element.  More precisely,

    - If this model has a single output, it returns a matrix of shape
      (output_shape, input_shape) with the derivatives.
    - If this model has multiple outputs, it returns a list of matrices, one
      for each output.

    This method cannot be used on models that take multiple inputs.

    Parameters
    ----------
    X: ndarray
      the input data for a single sample

    Returns
    -------
    the Jacobian matrix, or a list of matrices
    """
    input_shape = X.shape
    X = np.reshape(X, [1] + list(X.shape))
    self._create_inputs([X])
    X, _, _ = self._prepare_batch((X, None, None))
    if tf.executing_eagerly():
      # In eager mode we use a GradientTape to compute gradients.

      X = tf.constant(X)
      with tf.GradientTape(
          persistent=True, watch_accessed_variables=False) as tape:
        tape.watch(X)
        outputs = self.model(X)
        if isinstance(outputs, tf.Tensor):
          outputs = [outputs]
        final_result = []
        for output in outputs:
          output_shape = tuple(output.shape.as_list()[1:])
          output = tf.reshape(output, [-1])
          result = []
          for i in range(output.shape[0]):
            result.append(tape.gradient(output[i], X))
          final_result.append(
              tf.reshape(tf.stack(result), output_shape + input_shape).numpy())
    else:
      # In graph mode we use tf.gradients().

      def jacobian(y, x):
        # Adapted from https://github.com/tensorflow/tensorflow/issues/675#issuecomment-319891923.
        y = tf.reshape(tf.convert_to_tensor(y)[0], [-1])
        n = y.shape[0]
        loop_vars = [
            tf.constant(0, tf.int32),
            tf.TensorArray(tf.float32, size=n)
        ]
        _, jacobian = tf.while_loop(
            lambda j, _: j < n,
            lambda j, result: (j + 1, result.write(j, tf.gradients(y[j], x))),
            loop_vars)
        return jacobian.stack()

      grads = [
          jacobian(self._output_tensors[i], self._input_placeholders[0])
          for i in self._prediction_outputs
      ]
      feed_dict = {self._input_placeholders[0]: X}
      result = self.session.run(grads, feed_dict=feed_dict)
      output_shapes = [
          tuple(o.shape.as_list()[1:]) for o in self._output_tensors
      ]
      final_result = [
          x.reshape(s + input_shape) for x, s in zip(result, output_shapes)
      ]
    if len(final_result) == 1:
      return final_result[0]
    return final_result

  def _prepare_batch(self, batch):
    inputs, labels, weights = batch
    inputs = [
        x if x.dtype == t else x.astype(t)
        for x, t in zip(inputs, self._input_dtypes)
    ]
    if labels is not None:
      labels = [
          x if x.dtype == t else x.astype(t)
          for x, t in zip(labels, self._label_dtypes)
      ]
    if weights is not None:
      weights = [
          x if x.dtype == t else x.astype(t)
          for x, t in zip(weights, self._weights_dtypes)
      ]
    return (inputs, labels, weights)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        yield ([X_b], [y_b], [w_b])

  def save_checkpoint(self, max_checkpoints_to_keep=5):
    """Save a checkpoint to disk.

    Usually you do not need to call this method, since fit() saves checkpoints
    automatically.  If you have disabled automatic checkpointing during fitting,
    this can be called to manually write checkpoints.

    Parameters
    ----------
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    """
    self._ensure_built()
    manager = tf.train.CheckpointManager(self._checkpoint, self.model_dir,
                                         max_checkpoints_to_keep)
    self._exec_with_session(lambda: manager.save())

  def _exec_with_session(self, f):
    if tf.executing_eagerly():
      f()
    else:
      with self.session.as_default():
        f()

  def get_checkpoints(self):
    """Get a list of all available checkpoint files."""
    return tf.train.get_checkpoint_state(
        self.model_dir).all_model_checkpoint_paths

  def restore(self, checkpoint=None):
    """Reload the values of all variables from a checkpoint file.

    Parameters
    ----------
    checkpoint: str
      the path to the checkpoint file to load.  If this is None, the most recent
      checkpoint will be chosen automatically.  Call get_checkpoints() to get a
      list of all available checkpoints.
    """
    if checkpoint is None:
      checkpoint = tf.train.latest_checkpoint(self.model_dir)
    if checkpoint is None:
      raise ValueError('No checkpoint found')
    if tf.executing_eagerly():
      self._checkpoint.restore(checkpoint)
    else:
      self._checkpoint.restore(checkpoint).run_restore_ops(self.session)
Example #5
0
class TensorGraph(Model):

  def __init__(self,
               tensorboard=False,
               tensorboard_log_frequency=100,
               batch_size=100,
               random_seed=None,
               use_queue=True,
               graph=None,
               learning_rate=0.001,
               configproto=None,
               **kwargs):
    """
    Parameters
    ----------
    tensorboard: bool
      Should we log to model_dir data for tensorboard?
    tensorboard_log_frequency: int
      How many training batches before logging tensorboard?
    batch_size: int
      default batch size for training and evaluating
    use_queue: boolean
      if True when building we will create a tf.FIFO queue, which will hold
      all features, weights, and labels.  We will feed the inputs into this
      queue in batches of self.batch_size in a separate thread from the
      thread training the model.  You cannot use a queue when
      batches are not of consistent size
    graph: tensorflow.Graph
      the Graph in which to create Tensorflow objects.  If None, a new Graph
      is created.
    learning_rate: float or LearningRateSchedule
      the learning rate to use for optimization
    configproto: a tf.ConfigProto() object used to create tf.Session()
    """

    # Layer Management
    self.layers = dict()
    self.features = list()
    self.labels = list()
    self.outputs = list()
    self.variances = list()
    self.task_weights = list()
    self.submodels = list()
    self.loss = Constant(0)
    self.built = False
    self.queue_installed = False
    self.optimizer = Adam(
        learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-7)
    self.configproto = configproto

    # Singular place to hold Tensor objects which don't serialize
    # These have to be reconstructed on restoring from pickle
    # See TensorGraph._get_tf() for more details on lazy construction
    self.tensor_objects = {
        "FileWriter": None,
        "Graph": graph,
        "train_op": None,
        "summary_op": None,
    }
    self.tensorboard = tensorboard
    self.tensorboard_log_frequency = tensorboard_log_frequency
    self.tensorboard_step = 0
    self.global_step = 0
    self.use_queue = use_queue

    self.batch_size = batch_size
    self.random_seed = random_seed
    super(TensorGraph, self).__init__(**kwargs)
    self.save_file = "%s/%s" % (self.model_dir, "model")
    self.model_class = None

    self.rnn_initial_states = []
    self.rnn_final_states = []
    self.rnn_zero_states = []
    if self.use_queue and self.tensorboard:
      raise ValueError(
          "Currently TensorGraph cannot both use_queue and tensorboard at the same time"
      )

  def _add_layer(self, layer):
    if layer.name is None:
      layer.name = "%s_%s" % (layer.__class__.__name__, len(self.layers) + 1)
    if layer.name in self.layers:
      return
    if isinstance(layer, Feature):
      self.features.append(layer)
    if isinstance(layer, Label):
      self.labels.append(layer)
    if isinstance(layer, Weights):
      self.task_weights.append(layer)
    self.layers[layer.name] = layer
    for in_layer in layer.in_layers:
      self._add_layer(in_layer)

  def fit(self,
          dataset,
          nb_epoch=10,
          max_checkpoints_to_keep=5,
          checkpoint_interval=1000,
          deterministic=False,
          restore=False,
          submodel=None,
          **kwargs):
    """Train this model on a dataset.

    Parameters
    ----------
    dataset: Dataset
      the Dataset to train on
    nb_epoch: int
      the number of epochs to train for
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    checkpoint_interval: int
      the frequency at which to write checkpoints, measured in training steps.
      Set this to 0 to disable automatic checkpointing.
    deterministic: bool
      if True, the samples are processed in order.  If False, a different random
      order is used for each epoch.
    restore: bool
      if True, restore the model from the most recent checkpoint and continue training
      from there.  If False, retrain the model from scratch.
    submodel: Submodel
      an alternate training objective to use.  This should have been created by
      calling create_submodel().
    """
    return self.fit_generator(
        self.default_generator(
            dataset, epochs=nb_epoch, deterministic=deterministic),
        max_checkpoints_to_keep, checkpoint_interval, restore, submodel)

  def fit_generator(self,
                    feed_dict_generator,
                    max_checkpoints_to_keep=5,
                    checkpoint_interval=1000,
                    restore=False,
                    submodel=None):
    """Train this model on data from a generator.

    Parameters
    ----------
    feed_dict_generator: generator
      this should generate batches, each represented as a dict that maps
      Layers to values.
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    checkpoint_interval: int
      the frequency at which to write checkpoints, measured in training steps.
      Set this to 0 to disable automatic checkpointing.
    restore: bool
      if True, restore the model from the most recent checkpoint and continue training
      from there.  If False, retrain the model from scratch.
    submodel: Submodel
      an alternate training objective to use.  This should have been created by
      calling create_submodel().

    Returns
    -------
    the average loss over the most recent checkpoint interval
    """
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      time1 = time.time()
      loss = self.loss
      if submodel is not None and submodel.loss is not None:
        loss = submodel.loss
      if tfe.in_eager_mode():
        # In eager mode we want an optimizer and a function to compute the
        # gradient of the loss.

        submodel_vars = None
        if submodel is None:
          optimizer = self._get_tf("Optimizer")
        else:
          optimizer = submodel.create_optimizer()
          if submodel.layers is not None:
            submodel_vars = set()
            for layer in submodel.layers:
              for var in layer.variables:
                submodel_vars.add(var)
        val_grad_fn = tfe.implicit_value_and_gradients(
            lambda x: self._run_graph([loss], x, True)[0])
      else:
        # In graph mode we want a training operation.

        if submodel is None:
          train_op = self._get_tf('train_op')
        else:
          train_op = submodel.get_train_op()
      if checkpoint_interval > 0:
        saver = tf.train.Saver(
            self.get_variables(),
            max_to_keep=max_checkpoints_to_keep,
            save_relative_paths=True)
      if restore:
        self.restore()
      avg_loss, n_averaged_batches = 0.0, 0.0
      n_samples = 0
      n_enqueued = [0]
      final_sample = [None]
      if self.queue_installed:
        enqueue_thread = threading.Thread(
            target=_enqueue_batch,
            args=(self, feed_dict_generator, self._get_tf("Graph"),
                  self.session, n_enqueued, final_sample))
        enqueue_thread.start()
      for feed_dict in self._create_feed_dicts(feed_dict_generator, True):
        if self.queue_installed:
          # Don't let this thread get ahead of the enqueue thread, since if
          # we try to read more batches than the total number that get queued,
          # this thread will hang indefinitely.
          while n_enqueued[0] <= n_samples:
            if n_samples == final_sample[0]:
              break
            time.sleep(0)
          if n_samples == final_sample[0]:
            break
        n_samples += 1
        should_log = (self.tensorboard and
                      n_samples % self.tensorboard_log_frequency == 0)
        if tfe.in_eager_mode():
          value, grads_and_vars = val_grad_fn(feed_dict)
          if submodel_vars is not None:
            grads_and_vars = [
                x for x in grads_and_vars if x[1] in submodel_vars
            ]
          optimizer.apply_gradients(grads_and_vars)
          avg_loss += value
        else:
          fetches = [train_op, loss.out_tensor]
          if should_log:
            fetches.append(self._get_tf("summary_op"))
          fetched_values = self.session.run(fetches, feed_dict=feed_dict)
          if should_log:
            self._log_tensorboard(fetched_values[2])
          avg_loss += fetched_values[1]
        n_averaged_batches += 1
        self.global_step += 1
        if checkpoint_interval > 0 and self.global_step % checkpoint_interval == checkpoint_interval - 1:
          saver.save(self.session, self.save_file, global_step=self.global_step)
          avg_loss = float(avg_loss) / n_averaged_batches
          logger.info('Ending global_step %d: Average loss %g' %
                      (self.global_step, avg_loss))
          avg_loss, n_averaged_batches = 0.0, 0.0
      if n_averaged_batches > 0:
        avg_loss = float(avg_loss) / n_averaged_batches
      if checkpoint_interval > 0:
        if n_averaged_batches > 0:
          logger.info('Ending global_step %d: Average loss %g' %
                      (self.global_step, avg_loss))
        saver.save(self.session, self.save_file, global_step=self.global_step)
        time2 = time.time()
        logger.info("TIMING: model fitting took %0.3f s" % (time2 - time1))
    return avg_loss

  def _log_tensorboard(self, summary):
    """
    TODO(LESWING) set epoch
    Parameters
    ----------
    Returns
    -------
    """
    global_step = int(self.global_step)
    writer = self._get_tf("FileWriter")
    writer.reopen()
    writer.add_summary(summary, global_step=global_step)
    writer.close()

  def fit_on_batch(self, X, y, w, submodel=None):
    if not self.built:
      self.build()
    dataset = NumpyDataset(X, y)
    return self.fit(dataset, nb_epoch=1, submodel=submodel)

  def default_generator(self,
                        dataset,
                        epochs=1,
                        predict=False,
                        deterministic=True,
                        pad_batches=True):
    if len(self.features) > 1:
      raise ValueError("More than one Feature, must use generator")
    if len(self.labels) > 1:
      raise ValueError("More than one Label, must use generator")
    if len(self.task_weights) > 1:
      raise ValueError("More than one Weights, must use generator")
    for epoch in range(epochs):
      for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
          batch_size=self.batch_size,
          deterministic=deterministic,
          pad_batches=pad_batches):
        feed_dict = dict()
        if len(self.labels) == 1 and y_b is not None and not predict:
          feed_dict[self.labels[0]] = y_b
        if len(self.features) == 1 and X_b is not None:
          feed_dict[self.features[0]] = X_b
        if len(self.task_weights) == 1 and w_b is not None and not predict:
          feed_dict[self.task_weights[0]] = w_b
        for (initial_state, zero_state) in zip(self.rnn_initial_states,
                                               self.rnn_zero_states):
          feed_dict[initial_state] = zero_state
        yield feed_dict

  def __call__(self, *inputs, **kwargs):
    """Execute the model in eager mode to compute outputs as a function of inputs.

    This is very similar to predict_on_batch(), except that it returns the outputs
    as tensors rather than numpy arrays.  That means you can compute the graph's
    outputs, then do additional calculations based on them, and gradients will
    be tracked correctly through the whole process.

    Parameters
    ----------
    inputs: tensors
      the values to use for the model's features.  The number of inputs must
      exactly match the length of the model's `features` property.  The values
      may be tensors, numpy arrays, or anything else that can be converted to
      tensors of the correct shape.
    outputs: list of Layers
      the output layers to compute.  If this is omitted, self.outputs is used
      (that is, all outputs that have been added by calling add_output()).

    Returns
    -------
    The output tensors, or a list of tensors if multiple outputs were requested.
    """
    if len(inputs) != len(self.features):
      raise ValueError('Expected %d inputs, received %d' % len(self.features),
                       len(inputs))
    # TODO Once we drop Python 2 support, turn outputs into a proper keyword arg
    # instead of using the **kwargs hack.
    if 'outputs' in kwargs:
      outputs = kwargs['outputs']
    else:
      outputs = self.outputs
    feed_dict = dict(zip(self.features, inputs))
    results = self._run_graph(outputs, feed_dict, False)
    if len(results) == 1:
      return results[0]
    return results

  def _predict(self, generator, transformers, outputs, uncertainty):
    """
    Predict outputs for data provided by a generator.

    This is the private implementation of prediction.  Do not call it directly.
    Instead call one of the public prediction methods.

    Parameters
    ----------
    generator: Generator
      Generator that constructs feed dictionaries for TensorGraph.
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs.
      If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.
    uncertainty: bool
      specifies whether this is being called as part of estimating uncertainty.
      If True, it sets the training flag so that dropout will be enabled, and
      returns the values of the uncertainty outputs.
    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    if not self.built:
      self.build()
    if outputs is None:
      outputs = self.outputs
    elif not isinstance(outputs, collections.Sequence):
      outputs = [outputs]
    if uncertainty:
      if len(self.variances) == 0:
        raise ValueError('This model cannot compute uncertainties')
      if len(self.variances) != len(outputs):
        raise ValueError(
            'The number of variances must exactly match the number of outputs')
      tensors = outputs + self.variances
    else:
      tensors = outputs

    with self._get_tf("Graph").as_default():
      # Gather results for each output
      results = [[] for out in tensors]
      n_samples = 0
      n_enqueued = [0]
      final_sample = [None]
      if self.queue_installed:
        enqueue_thread = threading.Thread(
            target=_enqueue_batch,
            args=(self, generator, self._get_tf("Graph"), self.session,
                  n_enqueued, final_sample))
        enqueue_thread.start()
      for feed_dict in self._create_feed_dicts(generator, uncertainty):
        if self.queue_installed:
          # Don't let this thread get ahead of the enqueue thread, since if
          # we try to read more batches than the total number that get queued,
          # this thread will hang indefinitely.
          while n_enqueued[0] <= n_samples:
            if n_samples == final_sample[0]:
              break
            time.sleep(0)
          if n_samples == final_sample[0]:
            break
        n_samples += 1
        feed_results = self._run_graph(tensors, feed_dict, uncertainty)
        if tfe.in_eager_mode():
          feed_results = [f.numpy() for f in feed_results]
        if len(feed_results) > 1:
          if len(transformers):
            raise ValueError("Does not support transformations "
                             "for multiple outputs.")
        elif len(feed_results) == 1:
          result = undo_transforms(feed_results[0], transformers)
          feed_results = [result]
        for ind, result in enumerate(feed_results):
          results[ind].append(result)

      final_results = []
      for result_list in results:
        final_results.append(np.concatenate(result_list, axis=0))
      # If only one output, just return array
      if len(final_results) == 1:
        return final_results[0]
      elif uncertainty:
        return zip(final_results[:len(outputs)], final_results[len(outputs):])
      else:
        return final_results

  def predict_on_generator(self, generator, transformers=[], outputs=None):
    """
    Parameters
    ----------
    generator: Generator
      Generator that constructs feed dictionaries for TensorGraph.
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs = self.outputs.
      If outputs is a Layer/Tensor, then will evaluate and return as a
      single ndarray. If outputs is a list of Layers/Tensors, will return a list
      of ndarrays.
    Returns:
      y_pred: numpy ndarray of shape (n_samples, n_classes*n_tasks)
    """
    return self._predict(generator, transformers, outputs, False)

  def predict_on_batch(self, X, transformers=[], outputs=None):
    """Generates predictions for input samples, processing samples in a batch.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    transformers: List
      List of dc.trans.Transformers

    Returns
    -------
    A Numpy array of predictions.
    """
    dataset = NumpyDataset(X=X, y=None)
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_on_generator(generator, transformers, outputs)

  def predict_uncertainty_on_batch(self, X, masks=50):
    """
    Predict the model's outputs, along with the uncertainty in each one.

    The uncertainty is computed as described in https://arxiv.org/abs/1703.04977.
    It involves repeating the prediction many times with different dropout masks.
    The prediction is computed as the average over all the predictions.  The
    uncertainty includes both the variation among the predicted values (epistemic
    uncertainty) and the model's own estimates for how well it fits the data
    (aleatoric uncertainty).  Not all models support uncertainty prediction.

    Parameters
    ----------
    X: ndarray
      the input data, as a Numpy array.
    masks: int
      the number of dropout masks to average over

    Returns
    -------
    for each output, a tuple (y_pred, y_std) where y_pred is the predicted
    value of the output, and each element of y_std estimates the standard
    deviation of the corresponding element of y_pred
    """
    dataset = NumpyDataset(X=X, y=None)
    return self.predict_uncertainty(dataset, masks)

  def predict(self, dataset, transformers=[], outputs=None):
    """
    Uses self to make predictions on provided Dataset object.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    transformers: list
      List of dc.trans.Transformers.
    outputs: object
      If outputs is None, then will assume outputs=self.outputs. If outputs is
      a Layer/Tensor, then will evaluate and return as a single ndarray. If
      outputs is a list of Layers/Tensors, will return a list of ndarrays.

    Returns
    -------
    results: numpy ndarray or list of numpy ndarrays
    """
    generator = self.default_generator(dataset, predict=True, pad_batches=False)
    return self.predict_on_generator(generator, transformers, outputs)

  def predict_uncertainty(self, dataset, masks=50):
    """
    Predict the model's outputs, along with the uncertainty in each one.

    The uncertainty is computed as described in https://arxiv.org/abs/1703.04977.
    It involves repeating the prediction many times with different dropout masks.
    The prediction is computed as the average over all the predictions.  The
    uncertainty includes both the variation among the predicted values (epistemic
    uncertainty) and the model's own estimates for how well it fits the data
    (aleatoric uncertainty).  Not all models support uncertainty prediction.

    Parameters
    ----------
    dataset: dc.data.Dataset
      Dataset to make prediction on
    masks: int
      the number of dropout masks to average over

    Returns
    -------
    for each output, a tuple (y_pred, y_std) where y_pred is the predicted
    value of the output, and each element of y_std estimates the standard
    deviation of the corresponding element of y_pred
    """
    sum_pred = []
    sum_sq_pred = []
    sum_var = []
    for i in range(masks):
      generator = self.default_generator(
          dataset, predict=True, pad_batches=False)
      results = self._predict(generator, [], self.outputs, True)
      if len(sum_pred) == 0:
        for p, v in results:
          sum_pred.append(p)
          sum_sq_pred.append(p * p)
          sum_var.append(v)
      else:
        for j, (p, v) in enumerate(results):
          sum_pred[j] += p
          sum_sq_pred[j] += p * p
          sum_var[j] += v
    output = []
    std = []
    for i in range(len(sum_pred)):
      p = sum_pred[i] / masks
      output.append(p)
      std.append(np.sqrt(sum_sq_pred[i] / masks - p * p + sum_var[i] / masks))
    if len(output) == 1:
      return (output[0], std[0])
    else:
      return zip(output, std)

  def topsort(self):

    def add_layers_to_list(layer, sorted_layers):
      if layer in sorted_layers:
        return
      for in_layer in layer.in_layers:
        add_layers_to_list(in_layer, sorted_layers)
      sorted_layers.append(layer)

    sorted_layers = []
    for l in self.features + self.labels + self.task_weights + self.outputs + self.variances:
      add_layers_to_list(l, sorted_layers)
    add_layers_to_list(self.loss, sorted_layers)
    for submodel in self.submodels:
      if submodel.loss is not None:
        add_layers_to_list(submodel.loss, sorted_layers)
    return sorted_layers

  def build(self):
    if self.built:
      return
    if tfe.in_eager_mode():
      # In eager mode, we need to execute every layer once to ensure its variables
      # have been created.

      def build_layers(layer, tensors):
        if layer in tensors:
          return tensors[layer]
        inputs = [build_layers(input, tensors) for input in layer.in_layers]
        if isinstance(layer, Input):
          # We can't execute Input layers in eager mode, since they would try
          # to create placeholders.  Instead create a tensor of the correct
          # size and type.
          shape = [1 if s is None else s for s in layer.shape]
          tensor = tf.zeros(shape, layer.dtype)
        else:
          with tf.name_scope(layer.name):
            tensor = layer.create_tensor(in_layers=inputs, set_tensors=False)
        tensors[layer] = tensor
        return tensor

      tensors = {}
      with self._get_tf("Graph").as_default():
        # Build the layers.

        build_layers(self.loss, tensors)
        for output in self.outputs:
          build_layers(output, tensors)
        for variance in self.variances:
          build_layers(variance, tensors)
        for submodel in self.submodels:
          build_layers(submodel.loss, tensors)

        # Initialize variables.

        for layer in self.layers.values():
          if layer.variable_values is not None:
            for var, val in zip(layer.variables, layer.variable_values):
              var.assign(val)
      self.session = None
      self._training_placeholder = None
      self.built = True
      return

    # In graph mode we need to create the computation graph.

    with self._get_tf("Graph").as_default():
      self._training_placeholder = tf.placeholder(dtype=tf.float32, shape=())
      if self.random_seed is not None:
        tf.set_random_seed(self.random_seed)
      self._install_queue()
      self.built = True
      for layer in self.topsort():
        with tf.name_scope(layer.name):
          layer.create_tensor(training=self._training_placeholder)
          self.rnn_initial_states += layer.rnn_initial_states
          self.rnn_final_states += layer.rnn_final_states
          self.rnn_zero_states += layer.rnn_zero_states
          layer.add_summary_to_tg(layer.out_tensor,
                                  self.get_layer_variables(layer))
      self.session = tf.Session(config=self.configproto)

      # Ensure all training operators have been created.

      self._get_tf('train_op')
      for submodel in self.submodels:
        train_op = submodel.get_train_op()

      # Initialize variables.

      self.session.run(tf.global_variables_initializer())
      for layer in self.layers.values():
        if layer.variable_values is not None:
          variables = self.get_layer_variables(layer)
          for var, val in zip(variables, layer.variable_values):
            self.session.run(var.assign(val))

    for layer in self.layers.values():
      if layer.tensorboard:
        self.tensorboard = True
    tf.summary.scalar("loss", self.loss.out_tensor)
    for layer in self.layers.values():
      if layer.tensorboard:
        tf.summary.tensor_summary(layer.name, layer.out_tensor)
    if self.tensorboard:
      writer = self._get_tf("FileWriter")
      writer.add_graph(self._get_tf("Graph"))
      writer.close()

    # As a sanity check, make sure all tensors have the correct shape.

    for layer in self.layers.values():
      try:
        assert list(layer.shape) == layer.out_tensor.get_shape().as_list(
        ), '%s: Expected shape %s does not match actual shape %s' % (
            layer.name, layer.shape, layer.out_tensor.get_shape().as_list())
      except NotImplementedError:
        pass

  def _install_queue(self):
    """
    """
    if not self.use_queue or self.queue_installed:
      for layer in self.features + self.labels + self.task_weights:
        layer.pre_queue = True
      return
    inputs = self.features + self.labels + self.task_weights
    if len(inputs) == 0:
      return
    names = []
    shapes = []
    pre_q_inputs = []
    q = InputFifoQueue(shapes, names, in_layers=pre_q_inputs)
    q.name = "%s_%s" % (q.__class__.__name__, len(self.layers) + 1)

    for layer in inputs:
      pre_q_input = layer.create_pre_q()
      shapes.append(pre_q_input.shape)
      names.append(pre_q_input.name)
      pre_q_inputs.append(pre_q_input)

      layer.in_layers.append(q)

    self._add_layer(q)
    self.input_queue = q
    self.queue_installed = True

  def set_loss(self, layer):
    self._add_layer(layer)
    self.loss = layer

  def add_output(self, layer):
    """Add an output layer that can be computed by predict()"""
    self._add_layer(layer)
    self.outputs.append(layer)

  def add_variance(self, layer):
    """Add a layer that computes the variance in an output.

    If a model supports uncertainty, it must call add_variance() once for every
    output.  Each variance layer has the same shape as the corresponding output,
    and each element computes an estimate of the variance from aleatoric
    uncertainty in the corresponding element of the output.

    In addition, if a model supports uncertainty it MUST use dropout on every
    layer.  Otherwise, the uncertainties it computes will be inaccurate.
    """
    self._add_layer(layer)
    self.variances.append(layer)

  def set_optimizer(self, optimizer):
    """Set the optimizer to use for fitting."""
    self.optimizer = optimizer

  def create_submodel(self, layers=None, loss=None, optimizer=None):
    """Create an alternate objective for training one piece of a TensorGraph.

    A TensorGraph consists of a set of layers, and specifies a loss function and
    optimizer to use for training those layers.  Usually this is sufficient, but
    there are cases where you want to train different parts of a model separately.
    For example, a GAN consists of a generator and a discriminator.  They are
    trained separately, and they use different loss functions.

    A submodel defines an alternate objective to use in cases like this.  It may
    optionally specify any of the following: a subset of layers in the model to
    train; a different loss function; and a different optimizer to use.  This
    method creates a submodel, which you can then pass to fit() to use it for
    training.

    Parameters
    ----------
    layers: list
      the list of layers to train.  If None, all layers in the model will be
      trained.
    loss: Layer
      the loss function to optimize.  If None, the model's main loss function
      will be used.
    optimizer: Optimizer
      the optimizer to use for training.  If None, the model's main optimizer
      will be used.

    Returns
    -------
    the newly created submodel, which can be passed to any of the fitting
    methods.
    """
    if self.built:
      raise ValueError('Submodels must be created before build() is called.')
    submodel = Submodel(self, layers, loss, optimizer)
    self.submodels.append(submodel)
    if loss is not None:
      self._add_layer(loss)
    return submodel

  def get_pickling_errors(self, obj, seen=None):
    if seen == None:
      seen = []
    try:
      state = obj.__getstate__()
    except AttributeError:
      return
    if state == None:
      return
    if isinstance(state, tuple):
      if not isinstance(state[0], dict):
        state = state[1]
      else:
        state = state[0].update(state[1])
    result = {}
    for i in state:
      try:
        pickle.dumps(state[i], protocol=2)
      except pickle.PicklingError:
        if not state[i] in seen:
          seen.append(state[i])
          result[i] = self.get_pickling_errors(state[i], seen)
    return result

  def save(self):
    # Remove out_tensor from the object to be pickled
    must_restore = False
    tensor_objects = self.tensor_objects
    rnn_initial_states = self.rnn_initial_states
    rnn_final_states = self.rnn_final_states
    rnn_zero_states = self.rnn_zero_states
    session = self.session
    self.tensor_objects = {}
    self.rnn_initial_states = []
    self.rnn_final_states = []
    self.rnn_zero_states = []
    self.session = None
    out_tensors = []
    submodel_ops = []
    if self.built:
      must_restore = True
      for layer in self.topsort():
        out_tensors.append(layer.none_tensors())
      for submodel in self.submodels:
        submodel_ops.append(submodel._train_op)
        submodel._train_op = None
      training_placeholder = self._training_placeholder
      self._training_placeholder = None
      self.built = False

    # Pickle itself
    pickle_name = os.path.join(self.model_dir, "model.pickle")

    with open(pickle_name, 'wb') as fout:
      try:
        pickle.dump(self, fout)
      except Exception as e:
        logger.info(self.get_pickling_errors(self))
        raise e

    # add out_tensor back to everyone
    if must_restore:
      for index, layer in enumerate(self.topsort()):
        layer.set_tensors(out_tensors[index])
      for submodel, op in zip(self.submodels, submodel_ops):
        submodel._train_op = op
      self._training_placeholder = training_placeholder
      self.built = True
    self.tensor_objects = tensor_objects
    self.rnn_initial_states = rnn_initial_states
    self.rnn_final_states = rnn_final_states
    self.rnn_zero_states = rnn_zero_states
    self.session = session

  def evaluate_generator(self,
                         feed_dict_generator,
                         metrics,
                         transformers=[],
                         labels=None,
                         outputs=None,
                         weights=[],
                         per_task_metrics=False):

    if labels is None:
      raise ValueError
    n_tasks = len(self.outputs)
    n_classes = self.outputs[0].out_tensor.get_shape()[-1].value
    evaluator = GeneratorEvaluator(
        self,
        feed_dict_generator,
        transformers,
        labels=labels,
        outputs=outputs,
        weights=weights,
        n_tasks=n_tasks,
        n_classes=n_classes)
    if not per_task_metrics:
      scores = evaluator.compute_model_performance(metrics)
      return scores
    else:
      scores, per_task_scores = evaluator.compute_model_performance(
          metrics, per_task_metrics=per_task_metrics)
      return scores, per_task_scores

  def get_layer_variables(self, layer):
    """Get the list of trainable variables in a layer of the graph."""
    if tfe.in_eager_mode():
      return layer.variables
    if not self.built:
      self.build()
    with self._get_tf("Graph").as_default():
      if layer.variable_scope == '':
        return []
      return tf.get_collection(
          tf.GraphKeys.TRAINABLE_VARIABLES, scope=layer.variable_scope)

  def get_variables(self):
    """Get the list of all trainable variables in the graph."""
    if not self.built:
      self.build()
    if tfe.in_eager_mode():
      variables = []
      for layer in self.layers.values():
        variables += layer.variables
      return variables
    else:
      with self._get_tf("Graph").as_default():
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

  def get_global_step(self):
    return self._get_tf("GlobalStep")

  def _get_tf(self, obj):
    """Fetches underlying TensorFlow primitives.

    Parameters
    ----------
    obj: str
      If "Graph", returns tf.Graph instance. If "FileWriter", returns
      tf.summary.FileWriter. If "Optimizer", returns the optimizer. If
      "train_op", returns the train operation. If "summary_op", returns the
      merged summary. If "GlobalStep" returns the global step.
    Returns
    -------
    TensorFlow Object

    """

    if obj in self.tensor_objects and self.tensor_objects[obj] is not None:
      return self.tensor_objects[obj]
    if obj == "Graph":
      self.tensor_objects['Graph'] = tf.Graph()
    elif obj == "FileWriter":
      self.tensor_objects['FileWriter'] = tf.summary.FileWriter(self.model_dir)
    elif obj == 'Optimizer':
      self.tensor_objects['Optimizer'] = self.optimizer._create_optimizer(
          self._get_tf('GlobalStep'))
    elif obj == 'train_op':
      opt = self._get_tf('Optimizer')
      global_step = self._get_tf('GlobalStep')
      try:
        self.tensor_objects['train_op'] = opt.minimize(
            self.loss.out_tensor, global_step=global_step)
      except ValueError:
        # The loss doesn't depend on any variables.
        self.tensor_objects['train_op'] = 0
    elif obj == 'summary_op':
      self.tensor_objects['summary_op'] = tf.summary.merge_all(
          key=tf.GraphKeys.SUMMARIES)
    elif obj == 'GlobalStep':
      with self._get_tf("Graph").as_default():
        self.tensor_objects['GlobalStep'] = create_variable(0, trainable=False)
    return self._get_tf(obj)

  def save_checkpoint(self, max_checkpoints_to_keep=5):
    """Save a checkpoint to disk.

    Usually you do not need to call this method, since fit() saves checkpoints
    automatically.  If you have disabled automatic checkpointing during fitting,
    this can be called to manually write checkpoints.

    Parameters
    ----------
    max_checkpoints_to_keep: int
      the maximum number of checkpoints to keep.  Older checkpoints are discarded.
    """
    saver = tf.train.Saver(
        self.get_variables(), max_to_keep=max_checkpoints_to_keep)
    saver.save(self.session, self.save_file, global_step=self.global_step)

  def get_checkpoints(self):
    """Get a list of all available checkpoint files."""
    return tf.train.get_checkpoint_state(
        self.model_dir).all_model_checkpoint_paths

  def restore(self, checkpoint=None):
    """Reload the values of all variables from a checkpoint file.

    Parameters
    ----------
    checkpoint: str
      the path to the checkpoint file to load.  If this is None, the most recent
      checkpoint will be chosen automatically.  Call get_checkpoints() to get a
      list of all available checkpoints.
    """
    if not self.built:
      self.build()
    if checkpoint is None:
      checkpoint = tf.train.latest_checkpoint(self.model_dir)
    if checkpoint is None:
      raise ValueError('No checkpoint found')
    with self._get_tf("Graph").as_default():
      reader = NewCheckpointReader(checkpoint)
      var_names = set([x for x in reader.get_variable_to_shape_map()])
      var_list = []
      for var in self.get_variables():
        name = var.name
        if ':' in name:
          name = name[:name.rfind(':')]
        if name in var_names:
          var_list.append(var)
      saver = tf.train.Saver(var_list=var_list)
      saver.restore(self.session, checkpoint)

  def get_num_tasks(self):
    return len(self.outputs)

  def get_pre_q_input(self, input_layer):
    layer_name = input_layer.name
    pre_q_name = "%s_pre_q" % layer_name
    return self.layers[pre_q_name]

  @staticmethod
  def load_from_dir(model_dir, restore=True):
    pickle_name = os.path.join(model_dir, "model.pickle")
    with open(pickle_name, 'rb') as fout:
      tensorgraph = pickle.load(fout)
      tensorgraph.built = False
      tensorgraph.model_dir = model_dir
      if restore:
        try:
          tensorgraph.restore()
        except ValueError:
          pass  # No checkpoint to load
      return tensorgraph

  def __del__(self):
    pass

  def _create_feed_dicts(self, generator, training):
    """Create feed dicts for use in fitting or prediction.

    Parameters
    ----------
    generator: Generator
      the feed dict generator that was passed to fit_generator() or predict_on_generator()
    training: bool
      True during training, False during prediction
    """
    train_value = 1.0 if training else 0.0
    if self.queue_installed:
      while True:
        yield {self._training_placeholder: train_value}
    else:
      for d in generator:
        feed_dict = {}
        for key, value in d.items():
          if isinstance(key, Input):
            value = _ensure_value_shape(value, key)
            if tfe.in_eager_mode():
              value = tf.cast(value, key.dtype)
            feed_dict[key] = value
          else:
            feed_dict[key] = value
        if not tfe.in_eager_mode():
          feed_dict[self._training_placeholder] = train_value
        yield feed_dict

  def _run_graph(self, outputs, feed_dict, training):
    """Run the calculations in the graph to compute some outputs.

    In graph mode, this just calls session.run().  In eager mode, it executes
    all required layers to compute the output.

    Parameters
    ----------
    outputs: list of Layers
      the output layers to compute
    feed_dict: dict
      maps input layers to values
    training: bool
      whether this is being executed in training mode
    """
    if not tfe.in_eager_mode():
      return self.session.run(outputs, feed_dict)

    def run_layers(layer, tensors):
      if layer in tensors:
        return tensors[layer]
      inputs = [run_layers(input, tensors) for input in layer.in_layers]
      tensor = layer.create_tensor(
          in_layers=inputs, set_tensors=False, training=training)
      tensors[layer] = tensor
      return tensor

    tensors = feed_dict.copy()
    return [run_layers(o, tensors) for o in outputs]

  def make_estimator(self,
                     feature_columns,
                     weight_column=None,
                     metrics={},
                     model_dir=None,
                     config=None):
    """Construct a Tensorflow Estimator from this model.

    tf.estimator.Estimator is the standard Tensorflow API for representing models.
    This method provides interoperability between DeepChem and other Tensorflow
    based tools by allowing any model to be used an Estimator.

    Once this method returns, the Estimator it created is independent of the model
    it was created from.  They do not share tensors, variables, save files, or any
    other resources.  The Estimator is a self contained object with its own methods
    for training, evaluation, prediction, checkpointing, etc.

    Parameters
    ----------
    feature_columns: list of tf.feature_column objects
      this describes the input features to the models.  There must be one entry
      for each Feature layer in this model's features field.
    weight_column: tf.feature_column or None
      if this model includes a Weights layer, this describes the input weights.
      Otherwise, this should be None.
    metrics: map
      metrics that should be computed in calls to evaluate().  For each entry,
      the key is the name to report for the metric, and the value is a function
      of the form f(labels, predictions, weights) that returns the tensors for
      computing the metric.  Any of the functions in tf.metrics can be used, as
      can other functions that satisfy the same interface.
    model_dir: str
      the directory in which the Estimator should save files.  If None, this
      defaults to the model's model_dir.
    config: RunConfig
      configuration options for the Estimator
    """
    # Check the inputs.

    if tfe.in_eager_mode():
      raise ValueError('make_estimator() is not supported in eager mode')
    if len(feature_columns) != len(self.features):
      raise ValueError(
          'This model requires %d feature column(s)' % len(self.features))
    if len(self.labels) != 1:
      raise ValueError(
          'Can only create an Estimator from a model with exactly one Label input'
      )
    if len(self.task_weights) > 1:
      raise ValueError(
          'Cannot create an Estimator from a model with multiple Weight inputs')
    if weight_column is None:
      if len(self.task_weights) > 0:
        raise ValueError('This model requires a weight column')
    else:
      if len(self.task_weights) == 0:
        raise ValueError(
            'Cannot specify weight_column for a model with no Weight inputs')
    if model_dir is None:
      model_dir = self.model_dir

    # Define a function that recursively creates tensors from layers.

    def create_tensors(layer, tensors, training):
      if layer in tensors:
        return tensors[layer]
      inputs = [
          create_tensors(in_layer, tensors, training)
          for in_layer in layer.in_layers
      ]
      tensor = layer.create_tensor(
          in_layers=inputs, set_tensors=False, training=training)
      tensors[layer] = tensor
      vars = tf.get_collection(
          tf.GraphKeys.TRAINABLE_VARIABLES, scope=layer.name)
      layer.add_summary_to_tg(tensor, vars)
      return tensor

    # Define the model function.

    def model_fn(features, labels, mode):
      # Define the inputs.

      tensors = self.create_estimator_inputs(feature_columns, weight_column,
                                             features, labels, mode)
      for layer, tensor in tensors.items():
        layer.add_summary_to_tg(tensor, [])

      # Create the correct outputs, based on the mode.

      if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {}
        for i, output in enumerate(self.outputs):
          predictions[i] = create_tensors(output, tensors, 0)
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
      if mode == tf.estimator.ModeKeys.EVAL:
        loss = create_tensors(self.loss, tensors, 0)
        predictions = create_tensors(self.outputs[0], tensors, 0)
        if len(self.task_weights) == 0:
          weights = None
        else:
          weights = tensors[self.task_weights[0]]
        eval_metric_ops = {}
        for name, function in metrics.items():
          eval_metric_ops[name] = function(tensors[self.labels[0]], predictions,
                                           weights)
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=eval_metric_ops)
      if mode == tf.estimator.ModeKeys.TRAIN:
        loss = create_tensors(self.loss, tensors, 1)
        global_step = tf.train.get_global_step()
        optimizer = self.optimizer._create_optimizer(global_step)
        train_op = optimizer.minimize(loss, global_step=global_step)
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
      raise ValueError('Unknown mode')

    # Create the Estimator.

    return tf.estimator.Estimator(
        model_fn=model_fn, model_dir=model_dir, config=config)

  def create_estimator_inputs(self, feature_columns, weight_column, features,
                              labels, mode):
    """This is called by make_estimator() to create tensors for the inputs.

    feature_columns and weight_column are the arguments passed to
    make_estimator().  features, labels, and mode are the arguments passed to
    the estimator's model function.  This method creates and returns a dict with
    one entry for every Feature, Label, or Weights layer in the graph.  The keys
    are the layers, and the values are the tensors that correspond to them.

    Any subclass that overrides default_generator() must also override this
    method.
    """
    if self.__class__.default_generator != TensorGraph.default_generator:
      raise ValueError(
          "Class overrides default_generator() but not create_estimator_inputs()"
      )
    tensors = {}
    for layer, column in zip(self.features, feature_columns):
      tensors[layer] = tf.feature_column.input_layer(features, [column])
    if weight_column is not None:
      tensors[self.task_weights[0]] = tf.feature_column.input_layer(
          features, [weight_column])
    if labels is not None:
      tensors[self.labels[0]] = tf.cast(labels, self.labels[0].dtype)
    return tensors