Example #1
    def run_step(self, batch: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """Run a forward step through the Network on a batch of data.

        Implementations of this method within derived classes should handle bringing the prediction data back from the
        (multi-)GPU environment to the CPU. This method expects that Network.load_epoch() has already been invoked.

        Args:
            batch: The batch of data serving as input to the Network.

        Returns:
            (batch_data, prediction_data)
        """
        mode = self.epoch_state["mode"]
        batch_in = self._get_effective_batch_input(batch, mode)
        self.epoch_state["tape"] = NonContext()
        # gpu operation
        with torch.no_grad() if not self.epoch_state["req_grad"] else NonContext():
            self._forward_batch(batch_in, self.epoch_state, self.epoch_ops)
        # copy data to cpu
        if self.device.type == "cuda":
            prediction = {
                key: self._move_tensor_between_device(self._detach_tensor(batch_in[key]), "cpu")
                for key in self.effective_outputs[mode] if key in batch_in
            }
        else:
            prediction = {
                key: self._detach_tensor(batch_in[key])
                for key in self.effective_outputs[mode] if key in batch_in
            }
        return batch, prediction
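A note on the shared helper: every example on this page uses `NonContext` as a no-op stand-in so that a `with` statement can conditionally skip a real context manager. A minimal sketch of such a class, inferred from the usage above rather than copied from the FastEstimator source:

```python
class NonContext:
    """No-op context manager: does nothing on entry or exit.

    Entering yields None, which is why patterns like
    `with tf.GradientTape(...) if cond else NonContext() as tape`
    leave `tape` as None when recording is disabled.
    """
    def __enter__(self):
        return None

    def __exit__(self, exc_type, exc_value, traceback):
        return False  # never suppress exceptions
```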
Example #2
    def _forward_step_static(self,
                             batch: Dict[str, Any],
                             state: Dict[str, Any],
                             ops: List[TensorOp],
                             effective_outputs: List[str]) -> Dict[str, Any]:
        """Run a forward step of the Network in static graph mode.

        Args:
            batch: The input data for the Network.
            state: A dictionary containing information about the current execution environment, including the active
                gradient tape.
            ops: A list of Ops to run during the forward step.
            effective_outputs: Which outputs should be copied from the GPU back onto the CPU for further use in Traces.

        Returns:
            The prediction dictionary resulting from a forward pass of the Network.
        """
        batch = ChainMap({}, batch)
        prediction = {}
        with tf.GradientTape(persistent=True) if state["req_grad"] else NonContext() as tape:
            state['tape'] = tape
            self._forward_batch(batch, state, ops)
        del state['tape']
        del tape
        for key in effective_outputs:
            if key in batch:
                prediction[key] = batch[key]
        return prediction
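To see the conditional-tape idiom in isolation, here is a hedged, self-contained toy (it assumes TensorFlow 2 and the `NonContext` sketch from Example #1):

```python
import tensorflow as tf

req_grad = True
x = tf.Variable(3.0)
# Record onto a tape only when gradients are actually needed.
with tf.GradientTape(persistent=True) if req_grad else NonContext() as tape:
    y = x * x
if req_grad:
    print(tape.gradient(y, x))  # tf.Tensor(6.0, shape=(), dtype=float32)
    del tape  # a persistent tape holds resources until explicitly released
```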
Example #3
def build(model_def, model_name, optimizer, loss_name, custom_objects=None):
    """build keras model instance in FastEstimator

    Args:
        model_def (function): function definition of tf.keras model or path of model file(h5)
        model_name (str, list, tuple): model name(s)
        optimizer (str, optimizer, list, tuple): optimizer(s)
        loss_name (str, list, tuple): loss name(s)
        custom_objects (dict): dictionary that maps custom

    Returns:
        model: model(s) compiled by FastEstimator
    """
    with fe.distribute_strategy.scope() if fe.distribute_strategy else NonContext():
        if isinstance(model_def, str):
            model = tf.keras.models.load_model(model_def, custom_objects=custom_objects)
        else:
            model = model_def()
        model = to_list(model)
        model_name = to_list(model_name)
        optimizer = to_list(optimizer)
        loss_name = to_list(loss_name)
        assert len(model) == len(model_name) == len(optimizer) == len(loss_name)
        for idx, (m, m_n, o, l_n) in enumerate(zip(model, model_name, optimizer, loss_name)):
            model[idx] = _fe_compile(m, m_n, o, l_n)
    if len(model) == 1:
        model = model[0]
    return model
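A hedged usage sketch of `build` (the model definition below is invented for illustration; `_fe_compile` and the compiled result remain internal to FastEstimator):

```python
import tensorflow as tf

def my_model():  # hypothetical model definition
    return tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])

model = build(model_def=my_model, model_name="dense", optimizer="adam", loss_name="loss")
```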
Example #4
    def _start(self, run_modes: Set[str], eager: bool) -> None:
        """The outer training loop.

        This method invokes the trace on_begin method, runs the necessary 'train' and 'eval' epochs, and then invokes
        the trace on_end method.

        Args:
            run_modes: The current execution modes.
            eager: Whether to run the training in eager mode. This only affects TensorFlow training, since PyTorch
                always runs eagerly.
        """
        all_traces = sort_traces(get_current_items(self.traces_in_use, run_modes=run_modes), ds_ids=[])
        with NonContext() if fe.fe_history_path is False else HistoryRecorder(
                self.system, self.filepath, db_path=fe.fe_history_path):
            try:
                self._run_traces_on_begin(traces=all_traces)
                if "train" in run_modes or "eval" in run_modes:
                    # If the training is re-starting from a restore wizard, it should re-run the last eval epoch
                    if self.system.epoch_idx > 0 and "eval" in self.pipeline.get_modes(epoch=self.system.epoch_idx):
                        self.system.mode = "eval"
                        self._run_epoch(eager=eager)
                    for self.system.epoch_idx in range(self.system.epoch_idx + 1, self.system.total_epochs + 1):
                        if "train" in self.pipeline.get_modes(epoch=self.system.epoch_idx):
                            self.system.mode = "train"
                            self._run_epoch(eager=eager)
                        if "eval" in self.pipeline.get_modes(epoch=self.system.epoch_idx):
                            self.system.mode = "eval"
                            self._run_epoch(eager=eager)
                else:
                    self._run_epoch(eager=eager)
            except EarlyStop:
                pass  # On early stopping we still want to run the final traces and return results
            self._run_traces_on_end(traces=all_traces)
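The epoch scheduling in `_start` can be mimicked with a toy stand-in (the `get_modes` stub below is an assumption for illustration, not the Pipeline API):

```python
def demo_epoch_schedule(start_epoch: int, total_epochs: int) -> None:
    """Toy re-creation of the loop in _start(): train precedes eval within each epoch."""
    def get_modes(epoch: int):  # stub: pretend eval only runs on even epochs
        return {"train", "eval"} if epoch % 2 == 0 else {"train"}

    for epoch in range(start_epoch + 1, total_epochs + 1):
        for mode in ("train", "eval"):
            if mode in get_modes(epoch):
                print(f"epoch {epoch}: running {mode}")

demo_epoch_schedule(start_epoch=0, total_epochs=4)
```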
Example #5
    def _fetch_logs(self, args: Dict[str, Any], unknown: List[str]) -> None:
        """A method to collect and return a given set of logs from the database.

        Args:
            args: The CLI arguments provided by the user.
            unknown: Any CLI arguments not matching known inputs.
        """
        if unknown:
            print("unrecognized arguments: ", ", ".join(unknown))
            return
        save = args['file']
        save_path = None
        if save:
            save_path = args['file_dir']
            if save_path is None:
                save_path = os.path.join(str(Path.home()),
                                         'fastestimator_data')
                save = 'dir'
                print(f"Writing log(s) to {save_path}")
            else:
                save = 'file'
                print(f'Writing log to {save_path}')
        logs = {}
        for idx in args['indices']:
            selection = self.response[idx - 1]  # Auto index starts at 1
            pk = selection['pk']
            with closing(self.db.cursor()) as cursor:
                cursor.execute("SELECT log FROM logs WHERE logs.fk = (?)",
                               [pk])
                logs[idx] = cursor.fetchall()
        with open(save_path, 'w') if save == 'file' else NonContext() as f:
            f = sys.stdout if f is None else f
            for idx, log in logs.items():
                with open(os.path.join(save_path, f"{idx}.txt"),
                          'w') if save == 'dir' else NonContext() as f1:
                    f1 = f if f1 is None else f1
                    if log:
                        f1.write(
                            f'\n@@@@@@@@@@@ Log for Index {idx} @@@@@@@@@@@\n\n'
                        )
                        f1.write(log[0]['log'])
                        f1.write('\n')
                    else:
                        f1.write(f"No logs found for Index {idx}\n")
Example #6
    def _document_fe_graph(self) -> None:
        """Add FE execution graphs into the traceability document.
        """
        with self.doc.create(Section("FastEstimator Architecture")):
            for mode in self.system.pipeline.data.keys():
                scheduled_items = self.system.pipeline.get_scheduled_items(mode) \
                                  + self.system.network.get_scheduled_items(mode) + self.system.traces
                signature_epochs = get_signature_epochs(scheduled_items, total_epochs=self.system.epoch_idx, mode=mode)
                epochs_with_data = self.system.pipeline.get_epochs_with_data(total_epochs=self.system.epoch_idx,
                                                                             mode=mode)
                if set(signature_epochs) & epochs_with_data:
                    self.doc.append(NoEscape(r'\FloatBarrier'))
                    with self.doc.create(Subsection(mode.capitalize())):
                        for epoch in signature_epochs:
                            if epoch not in epochs_with_data:
                                continue
                            self.doc.append(NoEscape(r'\FloatBarrier'))
                            with self.doc.create(
                                    Subsubsection(f"Epoch {epoch}",
                                                  label=Label(Marker(name=f"{mode}{epoch}", prefix="ssubsec")))):
                                ds_ids = self.system.pipeline.get_ds_ids(epoch=epoch, mode=mode)
                                for ds_id in ds_ids:
                                    with NonContext() if ds_id == '' else self.doc.create(
                                            Paragraph(f"Dataset {ds_id}",
                                                      label=Label(Marker(name=f"{mode}{epoch}{ds_id}",
                                                                         prefix="para")))):
                                        diagram = self._draw_diagram(mode, epoch, ds_id)
                                        ltx = d2t.dot2tex(diagram.to_string(), figonly=True)
                                        args = Arguments(**{'max width': r'\textwidth, max height=0.9\textheight'})
                                        args.escape = False
                                        with self.doc.create(Center()):
                                            with self.doc.create(AdjustBox(arguments=args)) as box:
                                                box.append(NoEscape(ltx))
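The section scaffolding above comes from pylatex. A minimal, hedged sketch of that scaffolding in isolation (the content strings are invented; `\FloatBarrier` additionally requires `\usepackage{placeins}` in the preamble):

```python
from pylatex import Document, Section, Subsection
from pylatex.utils import NoEscape

doc = Document()
with doc.create(Section("FastEstimator Architecture")):
    for mode in ("train", "eval"):
        doc.append(NoEscape(r'\FloatBarrier'))  # keep floats inside their subsection
        with doc.create(Subsection(mode.capitalize())):
            doc.append(f"A diagram for the {mode} graph would be embedded here.")
doc.generate_tex("architecture_demo")  # writes architecture_demo.tex
```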
Example #7
    def prepare(self, mode_list, distribute_strategy):
        """This function constructs the model specified in model definition and create replica of model
         for distributed training across multiple devices if there are multiple GPU available.

        Args:
            mode_list : can be either 'train' or 'eval'
            distribute_strategy : Tensorflow class that defines distribution strategy (e.g. tf.distribute.MirroredStrategy)
        """
        all_output_keys = []
        for mode in mode_list:
            signature_epoch, mode_ops = self._get_signature_epoch(mode)
            epoch_ops_map = {}
            epoch_model_map = {}
            for epoch in signature_epoch:
                epoch_ops = []
                epoch_model = []
                # generate ops for specific mode and epoch
                for op in mode_ops:
                    if isinstance(op, Scheduler):
                        scheduled_op = op.get_current_value(epoch)
                        if scheduled_op:
                            epoch_ops.append(scheduled_op)
                    else:
                        epoch_ops.append(op)
                # check the ops
                verify_ops(epoch_ops, "Network")
                # create model list
                for op in epoch_ops:
                    all_output_keys.append(op.outputs)
                    if isinstance(op, ModelOp):
                        if op.model.keras_model is None:
                            with distribute_strategy.scope() if distribute_strategy else NonContext():
                                op.model.keras_model = op.model.model_def()
                                op.model.keras_model.optimizer = op.model.optimizer
                                op.model.keras_model.loss_name = op.model.loss_name
                                op.model.keras_model.model_name = op.model.model_name
                                assert op.model.model_name not in self.model, \
                                    "duplicated model name: {}".format(op.model.model_name)
                                self.model[op.model.model_name] = op.model.keras_model
                                if op.model.loss_name not in self.all_losses:
                                    self.all_losses.append(op.model.loss_name)
                        if op.model.keras_model not in epoch_model:
                            epoch_model.append(op.model.keras_model)
                assert epoch_model, "Network has no model for epoch {}".format(epoch)
                epoch_ops_map[epoch] = epoch_ops
                epoch_model_map[epoch] = epoch_model
            self.op_schedule[mode] = Scheduler(epoch_dict=epoch_ops_map)
            self.model_schedule[mode] = Scheduler(epoch_dict=epoch_model_map)
        self.all_output_keys = set(flatten_list(all_output_keys)) - {None}
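The epoch-keyed `Scheduler` built at the end of `prepare` can be mimicked with a toy class (the lookup rule below, i.e. the value scheduled at the nearest epoch at or before the request, is inferred from usage, not taken from the FastEstimator source):

```python
class TinyScheduler:
    """Hypothetical stand-in for the epoch_dict-based Scheduler used above."""
    def __init__(self, epoch_dict):
        self.epoch_dict = epoch_dict

    def get_current_value(self, epoch):
        # return the value scheduled at the nearest epoch <= the requested one
        keys = [k for k in sorted(self.epoch_dict) if k <= epoch]
        return self.epoch_dict[keys[-1]] if keys else None

ops = TinyScheduler({0: ["op_a"], 10: ["op_a", "op_b"]})
print(ops.get_current_value(5))   # ['op_a']
print(ops.get_current_value(12))  # ['op_a', 'op_b']
```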
Example #8
    def run_step(self,
                 batch,
                 ops,
                 model_list,
                 epoch_losses,
                 state,
                 warm_up=False):
        """Function that calculates the loss and gradients for curent step in training. It also constructs the higher
        level computational graph between the models before the training.

        Args:
            batch : dictionary that contains batch data and predictions from last epoch
            ops : Model operation dictionary that contains 'Inputs','Mode', and 'Outputs'
            model_list : List of the models
            epoch_losses : List of epoch losses.
            state : run time dictionary that contains following keys 'mode' and 'batch size'
            warm_up (bool, optional): Specifies if it's in warm up phase or not. Defaults to False.

        Returns:
            dictionary containing the predictions of current epoch
        """
        prediction = {}
        batch = ChainMap(prediction, batch)
        mode = state["mode"]
        global_batch_size = state["batch_size"]
        num_model = len(model_list)
        # use gradient tape for train, otherwise use a dummy tape
        with tf.GradientTape(persistent=True) if mode == "train" else NonContext() as tape:
            state['tape'] = tape
            self._forward(batch, state, ops)
            reduced_loss = self._reduce_loss(batch, global_batch_size, epoch_losses, warm_up)
        # update model only for train mode
        if mode == "train":
            for idx in range(num_model):
                model = model_list[idx]
                loss = reduced_loss[model.loss_name]
                optimizer = model.optimizer
                if warm_up:
                    with tfops.init_scope():  # pylint: disable=not-context-manager
                        _ = optimizer.iterations
                        optimizer._create_hypers()  # pylint: disable=protected-access
                        optimizer._create_slots(
                            model_list[idx].trainable_variables)  # pylint: disable=protected-access
                else:
                    gradients = tape.gradient(loss, model.trainable_variables)
                    optimizer.apply_gradients(
                        zip(gradients, model.trainable_variables))
        del state['tape']
        del tape
        return prediction
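For reference, the train-mode tape/`apply_gradients` flow can be exercised standalone; a hedged sketch with a toy model and loss (not FastEstimator code):

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(2,))])
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1)
x = tf.random.normal((4, 2))
with tf.GradientTape(persistent=True) as tape:
    loss = tf.reduce_mean(tf.square(model(x)))  # toy loss
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
del tape  # persistent tapes must be released explicitly
```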
Example #9
    def run_step(self, batch, ops, state):
        """Function that calculates the loss and gradients for curent step in training. It also constructs the higher
        level computational graph between the models before the training.

        Args:
            batch : dictionary that contains batch data and predictions from last epoch
            ops : Model operation dictionary that contains 'Inputs','Mode', and 'Outputs'
            state : run time dictionary that contains following keys 'mode' and 'batch size'

        Returns:
            dictionary containing the predictions of current epoch
        """
        prediction = {}
        batch = ChainMap(prediction, batch)
        mode = state["mode"]
        # use gradient tape for train, otherwise use a dummy tape
        with tf.GradientTape(persistent=True) if mode == "train" else NonContext() as tape:
            state['tape'] = tape
            self._forward(batch, state, ops)
        del state['tape']
        del tape
        return prediction
Example #10
def build(model_def, model_name, optimizer, loss_name):
    """build keras model instance in FastEstimator

    Args:
        model_def (function): function definition of tf.keras model
        model_name (str, list, tuple): model name(s)
        optimizer (str, optimizer, list, tuple): optimizer(s)
        loss_name (str, list, tuple): loss name(s)

    Returns:
        model: model(s) compiled by FastEstimator
    """
    with fe.distribute_strategy.scope() if fe.distribute_strategy else NonContext():
        model = to_list(model_def())
        model_name = to_list(model_name)
        optimizer = to_list(optimizer)
        loss_name = to_list(loss_name)
        assert len(model) == len(model_name) == len(optimizer) == len(loss_name)
        for idx, (m, m_n, o, l_n) in enumerate(zip(model, model_name, optimizer, loss_name)):
            model[idx] = _fe_compile(m, m_n, o, l_n)
    if len(model) == 1:
        model = model[0]
    return model
Example #11
0
def get_gradient(target: Tensor,
                 sources: Union[Iterable[Tensor], Tensor],
                 higher_order: bool = False,
                 tape: Optional[tf.GradientTape] = None,
                 retain_graph: bool = True) -> Union[Iterable[Tensor], Tensor]:
    """Calculate gradients of a target w.r.t sources.

    This method can be used with TensorFlow tensors:
    ```python
    x = tf.Variable([1.0, 2.0, 3.0])
    with tf.GradientTape(persistent=True) as tape:
        y = x * x

        b = fe.backend.get_gradient(target=y, sources=x, tape=tape)  # [2.0, 4.0, 6.0]
        b = fe.backend.get_gradient(target=b, sources=x, tape=tape)  # None

        b = fe.backend.get_gradient(target=y, sources=x, tape=tape, higher_order=True)  # [2.0, 4.0, 6.0]
        b = fe.backend.get_gradient(target=b, sources=x, tape=tape)  # [2.0, 2.0, 2.0]
    ```

    This method can be used with PyTorch tensors:
    ```python
    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    y = x * x

    b = fe.backend.get_gradient(target=y, sources=x)  # [2.0, 4.0, 6.0]
    b = fe.backend.get_gradient(target=b, sources=x)  # Error - b does not have a backwards function

    b = fe.backend.get_gradient(target=y, sources=x, higher_order=True)  # [2.0, 4.0, 6.0]
    b = fe.backend.get_gradient(target=b, sources=x)  # [2.0, 2.0, 2.0]
    ```

    Args:
        target: The target (final) tensor.
        sources: A sequence of source (initial) tensors.
        higher_order: Whether the gradient will be used for higher order gradients.
        tape: TensorFlow gradient tape. Only needed when using the TensorFlow backend.
        retain_graph: Whether to retain PyTorch graph. Only valid when using the PyTorch backend.

    Returns:
        Gradient(s) of the `target` with respect to the `sources`.

    Raises:
        ValueError: If `target` is an unacceptable data type.
    """
    if tf.is_tensor(target):
        with NonContext() if higher_order else tape.stop_recording():
            gradients = tape.gradient(target, sources)
    elif isinstance(target, torch.Tensor):
        gradients = torch.autograd.grad(target,
                                        sources,
                                        grad_outputs=torch.ones_like(target),
                                        retain_graph=retain_graph,
                                        create_graph=higher_order,
                                        only_inputs=True)

        if isinstance(sources, torch.Tensor):
            #  Behavior table of the tf and torch backends:
            #  ------------------------------------------------------------------
            #        | case 1                     | case 2                      |
            #  ------------------------------------------------------------------
            #  tf    | target: tf.Tensor          | target: tf.Tensor           |
            #        | sources: tf.Tensor         | sources: [tf.Tensor]        |
            #        | gradients: tf.Tensor       | gradients: [tf.Tensor]      |
            #  ------------------------------------------------------------------
            #  torch | target: torch.Tensor       | target: torch.Tensor        |
            #        | sources: torch.Tensor      | sources: [torch.Tensor]     |
            #        | gradients: (torch.Tensor,) | gradients: (torch.Tensor,)  |
            #  ------------------------------------------------------------------
            # To make the torch behavior match tf in case 1, unwrap the gradients when `sources` is not an Iterable.
            gradients = gradients[0]
    else:
        raise ValueError("Unrecognized tensor type {}".format(type(target)))
    return gradients