def run_step(self, batch: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Run a forward step through the Network on a batch of data.

    Implementations of this method within derived classes should handle bringing the prediction data back from the
    (multi-)GPU environment to the CPU. This method expects that Network.load_epoch() has already been invoked.

    Args:
        batch: The batch of data serving as input to the Network.

    Returns:
        (batch_data, prediction_data)
    """
    mode = self.epoch_state["mode"]
    batch_in = self._get_effective_batch_input(batch, mode)
    self.epoch_state["tape"] = NonContext()
    # gpu operation
    with torch.no_grad() if not self.epoch_state["req_grad"] else NonContext():
        self._forward_batch(batch_in, self.epoch_state, self.epoch_ops)
    # copy data to cpu
    if self.device.type == "cuda":
        prediction = {
            key: self._move_tensor_between_device(self._detach_tensor(batch_in[key]), "cpu")
            for key in self.effective_outputs[mode] if key in batch_in
        }
    else:
        prediction = {
            key: self._detach_tensor(batch_in[key])
            for key in self.effective_outputs[mode] if key in batch_in
        }
    return batch, prediction
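
# Every snippet in this section leans on NonContext, a no-op context manager that lets a single `with` statement
# either do real work (torch.no_grad(), tf.GradientTape(), a distribution-strategy scope, a file handle) or do
# nothing at all, without duplicating the body. A minimal sketch of that no-op behavior, assuming the
# FastEstimator-style semantics (yield None on entry, swallow nothing on exit):
class NonContext(object):
    """A context manager which does nothing, yielding None on entry."""
    def __enter__(self) -> None:
        pass

    def __exit__(self, *args) -> None:
        pass


# Example: the body runs unchanged whether or not a real context would have been active.
with NonContext():
    x = 5 + 37  # x == 42
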
def _forward_step_static(self,
                         batch: Dict[str, Any],
                         state: Dict[str, Any],
                         ops: List[TensorOp],
                         effective_outputs: List[str]) -> Dict[str, Any]:
    """Run a forward step of the Network in static graph mode.

    Args:
        batch: The input data for the Network.
        state: A dictionary containing information about the current execution environment, including the active
            gradient tape.
        ops: A list of Ops to run during the forward step.
        effective_outputs: Which outputs should be copied from the GPU back onto the CPU for further use in Traces.

    Returns:
        The prediction dictionary resulting from a forward pass of the Network.
    """
    batch = ChainMap({}, batch)
    prediction = {}
    with tf.GradientTape(persistent=True) if state["req_grad"] else NonContext() as tape:
        state['tape'] = tape
        self._forward_batch(batch, state, ops)
    del state['tape']
    del tape
    for key in effective_outputs:
        if key in batch:
            prediction[key] = batch[key]
    return prediction
def build(model_def, model_name, optimizer, loss_name, custom_objects=None):
    """Build Keras model instance(s) in FastEstimator.

    Args:
        model_def (function): function definition of tf.keras model or path of model file(h5)
        model_name (str, list, tuple): model name(s)
        optimizer (str, optimizer, list, tuple): optimizer(s)
        loss_name (str, list, tuple): loss name(s)
        custom_objects (dict): dictionary that maps custom class or function names to the corresponding objects,
            forwarded to tf.keras.models.load_model when loading a saved model

    Returns:
        model: model(s) compiled by FastEstimator
    """
    with fe.distribute_strategy.scope() if fe.distribute_strategy else NonContext():
        if isinstance(model_def, str):
            model = tf.keras.models.load_model(model_def, custom_objects=custom_objects)
        else:
            model = model_def()
        model = to_list(model)
        model_name = to_list(model_name)
        optimizer = to_list(optimizer)
        loss_name = to_list(loss_name)
        assert len(model) == len(model_name) == len(optimizer) == len(loss_name)
        for idx, (m, m_n, o, l_n) in enumerate(zip(model, model_name, optimizer, loss_name)):
            model[idx] = _fe_compile(m, m_n, o, l_n)
    if len(model) == 1:
        model = model[0]
    return model
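
# A hypothetical invocation of build() for illustration; my_model_fn and the "dense_model"/"adam"/"my_loss" names
# are assumptions for this sketch, not part of the source. It shows the single-model path, where build() returns
# the compiled model directly rather than a list.
import tensorflow as tf

def my_model_fn():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(10, activation='relu', input_shape=(4, )),
        tf.keras.layers.Dense(2, activation='softmax')
    ])

model = build(model_def=my_model_fn, model_name="dense_model", optimizer="adam", loss_name="my_loss")
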
def _start(self, run_modes: Set[str], eager: bool) -> None:
    """The outer training loop.

    This method invokes the trace on_begin method, runs the necessary 'train' and 'eval' epochs, and then invokes
    the trace on_end method.

    Args:
        run_modes: The current execution modes.
        eager: Whether to run the training in eager mode. This is only related to TensorFlow training because
            PyTorch by nature is always in eager mode.
    """
    all_traces = sort_traces(get_current_items(self.traces_in_use, run_modes=run_modes), ds_ids=[])
    with NonContext() if fe.fe_history_path is False else HistoryRecorder(
            self.system, self.filepath, db_path=fe.fe_history_path):
        try:
            self._run_traces_on_begin(traces=all_traces)
            if "train" in run_modes or "eval" in run_modes:
                # If the training is re-starting from a restore wizard, it should re-run the last eval epoch
                if self.system.epoch_idx > 0 and "eval" in self.pipeline.get_modes(epoch=self.system.epoch_idx):
                    self.system.mode = "eval"
                    self._run_epoch(eager=eager)
                for self.system.epoch_idx in range(self.system.epoch_idx + 1, self.system.total_epochs + 1):
                    if "train" in self.pipeline.get_modes(epoch=self.system.epoch_idx):
                        self.system.mode = "train"
                        self._run_epoch(eager=eager)
                    if "eval" in self.pipeline.get_modes(epoch=self.system.epoch_idx):
                        self.system.mode = "eval"
                        self._run_epoch(eager=eager)
            else:
                self._run_epoch(eager=eager)
        except EarlyStop:
            pass  # On early stopping we still want to run the final traces and return results
        self._run_traces_on_end(traces=all_traces)
def _fetch_logs(self, args: Dict[str, Any], unknown: List[str]) -> None:
    """A method to collect and return a given set of logs from the database.

    Args:
        args: The CLI arguments provided by the user.
        unknown: Any CLI arguments not matching known inputs.
    """
    if len(unknown) > 0:
        print("unrecognized arguments: ", str.join(", ", unknown))
        return
    save = args['file']
    save_path = None
    if save:
        save_path = args['file_dir']
        if save_path is None:
            save_path = os.path.join(str(Path.home()), 'fastestimator_data')
            save = 'dir'
            print(f"Writing log(s) to {save_path}")
        else:
            save = 'file'
            print(f'Writing log to {save_path}')
    logs = {}
    for idx in args['indices']:
        selection = self.response[idx - 1]  # Auto index starts at 1
        pk = selection['pk']
        with closing(self.db.cursor()) as cursor:
            cursor.execute("SELECT log FROM logs WHERE logs.fk = (?)", [pk])
            logs[idx] = cursor.fetchall()
    with open(save_path, 'w') if save == 'file' else NonContext() as f:
        f = sys.stdout if f is None else f
        for idx, log in logs.items():
            with open(os.path.join(save_path, f"{idx}.txt"), 'w') if save == 'dir' else NonContext() as f1:
                f1 = f if f1 is None else f1
                if log:
                    f1.write(f'\n@@@@@@@@@@@ Log for Index {idx} @@@@@@@@@@@\n\n')
                    f1.write(log[0]['log'])
                    f1.write('\n')
                else:
                    f1.write(f"No logs found for Index {idx}\n")
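
# A self-contained sketch of the fallback-to-stdout idiom used above: when no path is given, NonContext yields
# None on entry, which is then swapped for sys.stdout so the write logic stays identical either way. write_report
# and report.txt are illustrative names, not part of the source.
import sys
from typing import Optional

def write_report(text: str, path: Optional[str] = None) -> None:
    with open(path, 'w') if path else NonContext() as f:
        f = sys.stdout if f is None else f
        f.write(text + '\n')

write_report("hello")                 # prints to the console
write_report("hello", "report.txt")  # writes to report.txt instead
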
def _document_fe_graph(self) -> None:
    """Add FE execution graphs into the traceability document.
    """
    with self.doc.create(Section("FastEstimator Architecture")):
        for mode in self.system.pipeline.data.keys():
            scheduled_items = self.system.pipeline.get_scheduled_items(mode) \
                + self.system.network.get_scheduled_items(mode) + self.system.traces
            signature_epochs = get_signature_epochs(scheduled_items, total_epochs=self.system.epoch_idx, mode=mode)
            epochs_with_data = self.system.pipeline.get_epochs_with_data(total_epochs=self.system.epoch_idx,
                                                                         mode=mode)
            if set(signature_epochs) & epochs_with_data:
                self.doc.append(NoEscape(r'\FloatBarrier'))
                with self.doc.create(Subsection(mode.capitalize())):
                    for epoch in signature_epochs:
                        if epoch not in epochs_with_data:
                            continue
                        self.doc.append(NoEscape(r'\FloatBarrier'))
                        with self.doc.create(
                                Subsubsection(f"Epoch {epoch}",
                                              label=Label(Marker(name=f"{mode}{epoch}", prefix="ssubsec")))):
                            ds_ids = self.system.pipeline.get_ds_ids(epoch=epoch, mode=mode)
                            for ds_id in ds_ids:
                                with NonContext() if ds_id == '' else self.doc.create(
                                        Paragraph(f"Dataset {ds_id}",
                                                  label=Label(Marker(name=f"{mode}{epoch}{ds_id}",
                                                                     prefix="para")))):
                                    diagram = self._draw_diagram(mode, epoch, ds_id)
                                    ltx = d2t.dot2tex(diagram.to_string(), figonly=True)
                                    args = Arguments(**{'max width': r'\textwidth, max height=0.9\textheight'})
                                    args.escape = False
                                    with self.doc.create(Center()):
                                        with self.doc.create(AdjustBox(arguments=args)) as box:
                                            box.append(NoEscape(ltx))
def prepare(self, mode_list, distribute_strategy):
    """This function constructs the model specified in the model definition and creates replicas of the model for
    distributed training across multiple devices when multiple GPUs are available.

    Args:
        mode_list : list of modes, each of which can be either 'train' or 'eval'
        distribute_strategy : Tensorflow class that defines distribution strategy (e.g.
            tf.distribute.MirroredStrategy)
    """
    all_output_keys = []
    for mode in mode_list:
        signature_epoch, mode_ops = self._get_signature_epoch(mode)
        epoch_ops_map = {}
        epoch_model_map = {}
        for epoch in signature_epoch:
            epoch_ops = []
            epoch_model = []
            # generate ops for specific mode and epoch
            for op in mode_ops:
                if isinstance(op, Scheduler):
                    scheduled_op = op.get_current_value(epoch)
                    if scheduled_op:
                        epoch_ops.append(scheduled_op)
                else:
                    epoch_ops.append(op)
            # check the ops
            verify_ops(epoch_ops, "Network")
            # create model list
            for op in epoch_ops:
                all_output_keys.append(op.outputs)
                if isinstance(op, ModelOp):
                    if op.model.keras_model is None:
                        with distribute_strategy.scope() if distribute_strategy else NonContext():
                            op.model.keras_model = op.model.model_def()
                            op.model.keras_model.optimizer = op.model.optimizer
                            op.model.keras_model.loss_name = op.model.loss_name
                            op.model.keras_model.model_name = op.model.model_name
                            assert op.model.model_name not in self.model, \
                                "duplicated model name: {}".format(op.model.model_name)
                            self.model[op.model.model_name] = op.model.keras_model
                            if op.model.loss_name not in self.all_losses:
                                self.all_losses.append(op.model.loss_name)
                    if op.model.keras_model not in epoch_model:
                        epoch_model.append(op.model.keras_model)
            assert epoch_model, "Network has no model for epoch {}".format(epoch)
            epoch_ops_map[epoch] = epoch_ops
            epoch_model_map[epoch] = epoch_model
        self.op_schedule[mode] = Scheduler(epoch_dict=epoch_ops_map)
        self.model_schedule[mode] = Scheduler(epoch_dict=epoch_model_map)
    self.all_output_keys = set(flatten_list(all_output_keys)) - {None}
def run_step(self, batch, ops, model_list, epoch_losses, state, warm_up=False):
    """Function that calculates the loss and gradients for the current step in training. It also constructs the
    higher level computational graph between the models before the training.

    Args:
        batch : dictionary that contains batch data and predictions from last epoch
        ops : Model operation dictionary that contains 'Inputs','Mode', and 'Outputs'
        model_list : List of the models
        epoch_losses : List of epoch losses.
        state : run time dictionary that contains following keys 'mode' and 'batch size'
        warm_up (bool, optional): Specifies if it's in warm up phase or not. Defaults to False.

    Returns:
        dictionary containing the predictions of current epoch
    """
    prediction = {}
    batch = ChainMap(prediction, batch)
    mode = state["mode"]
    global_batch_size = state["batch_size"]
    num_model = len(model_list)
    # use gradient tape for train, otherwise use a dummy tape
    with tf.GradientTape(persistent=True) if mode == "train" else NonContext() as tape:
        state['tape'] = tape
        self._forward(batch, state, ops)
        reduced_loss = self._reduce_loss(batch, global_batch_size, epoch_losses, warm_up)
    # update model only for train mode
    if mode == "train":
        for idx in range(num_model):
            model = model_list[idx]
            loss = reduced_loss[model.loss_name]
            optimizer = model.optimizer
            if warm_up:
                with tfops.init_scope():  # pylint: disable=not-context-manager
                    _ = optimizer.iterations
                    optimizer._create_hypers()  # pylint: disable=protected-access
                    optimizer._create_slots(model_list[idx].trainable_variables)  # pylint: disable=protected-access
            else:
                gradients = tape.gradient(loss, model.trainable_variables)
                optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    del state['tape']
    del tape
    return prediction
def run_step(self, batch, ops, state):
    """Function that calculates the loss and gradients for the current step in training. It also constructs the
    higher level computational graph between the models before the training.

    Args:
        batch : dictionary that contains batch data and predictions from last epoch
        ops : Model operation dictionary that contains 'Inputs','Mode', and 'Outputs'
        state : run time dictionary that contains following keys 'mode' and 'batch size'

    Returns:
        dictionary containing the predictions of current epoch
    """
    prediction = {}
    batch = ChainMap(prediction, batch)
    mode = state["mode"]
    # use gradient tape for train, otherwise use a dummy tape
    with tf.GradientTape(persistent=True) if mode == "train" else NonContext() as tape:
        state['tape'] = tape
        self._forward(batch, state, ops)
    del state['tape']
    del tape
    return prediction
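
# The conditional-tape idiom shared by both run_step variants above, demonstrated in isolation. This is a toy
# sketch (toy_step and the x*x computation are illustrative, not FastEstimator internals): a real tape is created
# only when gradients are needed, and in eval mode NonContext yields None, which the body never touches.
import tensorflow as tf

def toy_step(x: tf.Variable, mode: str) -> tf.Tensor:
    with tf.GradientTape(persistent=True) if mode == "train" else NonContext() as tape:
        y = x * x
    if mode == "train":
        print(tape.gradient(y, x))  # [2.0, 4.0]
        del tape  # persistent tapes hold resources until explicitly released
    return y

toy_step(tf.Variable([1.0, 2.0]), mode="train")
toy_step(tf.Variable([1.0, 2.0]), mode="eval")
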
def build(model_def, model_name, optimizer, loss_name):
    """Build Keras model instance(s) in FastEstimator.

    Args:
        model_def (function): function definition of tf.keras model
        model_name (str, list, tuple): model name(s)
        optimizer (str, optimizer, list, tuple): optimizer(s)
        loss_name (str, list, tuple): loss name(s)

    Returns:
        model: model(s) compiled by FastEstimator
    """
    with fe.distribute_strategy.scope() if fe.distribute_strategy else NonContext():
        model = to_list(model_def())
        model_name = to_list(model_name)
        optimizer = to_list(optimizer)
        loss_name = to_list(loss_name)
        assert len(model) == len(model_name) == len(optimizer) == len(loss_name)
        for idx, (m, m_n, o, l_n) in enumerate(zip(model, model_name, optimizer, loss_name)):
            model[idx] = _fe_compile(m, m_n, o, l_n)
    if len(model) == 1:
        model = model[0]
    return model
def get_gradient(target: Tensor,
                 sources: Union[Iterable[Tensor], Tensor],
                 higher_order: bool = False,
                 tape: Optional[tf.GradientTape] = None,
                 retain_graph: bool = True) -> Union[Iterable[Tensor], Tensor]:
    """Calculate gradients of a target w.r.t sources.

    This method can be used with TensorFlow tensors:
    ```python
    x = tf.Variable([1.0, 2.0, 3.0])
    with tf.GradientTape(persistent=True) as tape:
        y = x * x
        b = fe.backend.get_gradient(target=y, sources=x, tape=tape)  # [2.0, 4.0, 6.0]
        b = fe.backend.get_gradient(target=b, sources=x, tape=tape)  # None
        b = fe.backend.get_gradient(target=y, sources=x, tape=tape, higher_order=True)  # [2.0, 4.0, 6.0]
        b = fe.backend.get_gradient(target=b, sources=x, tape=tape)  # [2.0, 2.0, 2.0]
    ```

    This method can be used with PyTorch tensors:
    ```python
    x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
    y = x * x
    b = fe.backend.get_gradient(target=y, sources=x)  # [2.0, 4.0, 6.0]
    b = fe.backend.get_gradient(target=b, sources=x)  # Error - b does not have a backwards function
    b = fe.backend.get_gradient(target=y, sources=x, higher_order=True)  # [2.0, 4.0, 6.0]
    b = fe.backend.get_gradient(target=b, sources=x)  # [2.0, 2.0, 2.0]
    ```

    Args:
        target: The target (final) tensor.
        sources: A sequence of source (initial) tensors.
        higher_order: Whether the gradient will be used for higher order gradients.
        tape: TensorFlow gradient tape. Only needed when using the TensorFlow backend.
        retain_graph: Whether to retain PyTorch graph. Only valid when using the PyTorch backend.

    Returns:
        Gradient(s) of the `target` with respect to the `sources`.

    Raises:
        ValueError: If `target` is an unacceptable data type.
    """
    if tf.is_tensor(target):
        with NonContext() if higher_order else tape.stop_recording():
            gradients = tape.gradient(target, sources)
    elif isinstance(target, torch.Tensor):
        gradients = torch.autograd.grad(target,
                                        sources,
                                        grad_outputs=torch.ones_like(target),
                                        retain_graph=retain_graph,
                                        create_graph=higher_order,
                                        only_inputs=True)
        if isinstance(sources, torch.Tensor):
            # The behavior table of the tf and torch backends:
            # -------------------------------------------------------------------
            #       | case 1                     | case 2                       |
            # -------------------------------------------------------------------
            # tf    | target: tf.Tensor          | target: tf.Tensor            |
            #       | sources: tf.Tensor         | sources: [tf.Tensor]         |
            #       | gradients: tf.Tensor       | gradients: [tf.Tensor]       |
            # -------------------------------------------------------------------
            # torch | target: torch.Tensor       | target: torch.Tensor         |
            #       | sources: torch.Tensor      | sources: [torch.Tensor]      |
            #       | gradients: (torch.Tensor,) | gradients: (torch.Tensor,)   |
            # -------------------------------------------------------------------
            # In order to make the torch behavior match tf in case 1, the gradients need to be unwrapped when
            # sources is not Iterable.
            gradients = gradients[0]
    else:
        raise ValueError("Unrecognized tensor type {}".format(type(target)))
    return gradients