def _set_checkpointer(self, model):
    """Create ``self.checkpointer``, or set it to ``None`` when not needed.

    A ``Checkpointer`` is built when ``self.config["checkpoint"]`` is truthy
    or ``self.config["lr_scheduler"]`` equals ``"reduce_on_plateau"``.
    Before it is built, the checkpoint metric is validated against ``model``
    and ``self.config["checkpoint_config"]["checkpoint_dir"]`` is resolved
    in place:

    * An explicitly configured directory is always kept. If a writer is
      attached, a warning is issued because concurrent runs may then share
      the same directory.
    * Otherwise the directory defaults to ``<writer.log_subdir>/checkpoints``
      when a writer is attached, or to ``"checkpoints"`` when it is not.

    Args:
        model: Passed through to ``self._validate_checkpoint_metric``.
    """
    wants_checkpointer = (
        self.config["checkpoint"]
        or self.config["lr_scheduler"] == "reduce_on_plateau"
    )
    if not wants_checkpointer:
        self.checkpointer = None
        return

    self._validate_checkpoint_metric(model)

    checkpoint_config = self.config["checkpoint_config"]
    if checkpoint_config["checkpoint_dir"]:
        # Respect a user-provided directory. Previously it was silently
        # replaced by "checkpoints" whenever no writer was attached.
        if self.writer:
            # If you hardcode checkpoint_dir, checkpoints from concurrent
            # runs may overwrite each other.
            msg = (
                "You have provided checkpoint_dir, overriding the default "
                "of using log_dir/run_dir/run_name/checkpoints. Be careful: "
                "multiple concurrent runs may override each other."
            )
            warnings.warn(msg)
    elif self.writer:
        # Default: keep checkpoints next to this run's logs.
        checkpoint_config["checkpoint_dir"] = os.path.join(
            self.writer.log_subdir, "checkpoints"
        )
    else:
        checkpoint_config["checkpoint_dir"] = "checkpoints"

    self.checkpointer = Checkpointer(
        checkpoint_config, verbose=self.config["verbose"]
    )
def _set_checkpointer(self, train_config):
    """Attach a ``Checkpointer`` to this object, or ``None`` if disabled.

    Args:
        train_config: Training settings. ``train_config["checkpoint"]``
            switches checkpointing on or off; when it is on,
            ``train_config["checkpoint_config"]`` is handed to the
            ``Checkpointer`` together with ``self.config["verbose"]``.
    """
    if not train_config["checkpoint"]:
        self.checkpointer = None
        return

    self.checkpointer = Checkpointer(
        train_config["checkpoint_config"],
        verbose=self.config["verbose"],
    )
def _set_checkpointer(self, train_config):
    """Attach a ``Checkpointer`` to this object, or ``None`` if disabled.

    When checkpointing is enabled, a checkpoint metric given without a
    ``"<split>/"`` prefix is rewritten to use the ``valid`` split. The
    rewrite is stored back into ``train_config["checkpoint_config"]``, so
    the caller's dictionary is modified in place.

    Args:
        train_config: Training settings. Reads ``"checkpoint"`` and, when
            that is truthy, ``"checkpoint_config"`` (which must contain
            ``"checkpoint_metric"``).
    """
    if not train_config["checkpoint"]:
        self.checkpointer = None
        return

    settings = train_config["checkpoint_config"]
    metric = settings["checkpoint_metric"]
    # Default to valid split for checkpoint metric
    if "/" not in metric:
        settings["checkpoint_metric"] = f"valid/{metric}"

    self.checkpointer = Checkpointer(settings, verbose=self.config["verbose"])
def _set_checkpointer(self, model): if (self.config["checkpoint"] or self.config["lr_scheduler"] == "reduce_on_plateau"): self._validate_checkpoint_metric(model) # Set checkpoint_dir to log_dir/checkpoints/ if self.writer: if not self.config["checkpoint_config"]["checkpoint_dir"]: self.config["checkpoint_config"][ "checkpoint_dir"] = os.path.join( self.writer.log_subdir, "checkpoints") else: # If you hardcode checkpoint_dir, checkpoints from concurrent runs # may overwrite each other. msg = ( "You have provided checkpoint_dir, overriding the default " "of using log_dir/run_dir/run_name/checkpoints. Be careful: " "multiple concurrent runs may override each other.") warnings.warn(msg) else: self.config["checkpoint_config"][ "checkpoint_dir"] = "checkpoints" # Create Checkpointer self.checkpointer = Checkpointer(self.config["checkpoint_config"], verbose=self.config["verbose"]) else: self.checkpointer = None # EXPERIMENTAL: Optionally add task-specific checkpointers # HACK: This is hard-coded in a way specific to Glue! 
self.task_checkpointers = [] if self.config["checkpoint_tasks"]: msg = ( "checkpoint_tasks setting does not have the same thorough error " "checking that the normal checkpoint operation has, so you may " "accidentally be trying to checkpoint metrics that aren't going to be " "found in the metrics_dict if you're not careful.") warnings.warn(msg) for task_name in self.task_names: # We only make task_specific checkpoints for the glue tasks # HACK: allow checkpointing on slice tasks using_slice = ":" in task_name orig_task_name = task_name.split( ":")[0] if using_slice else None if (task_name not in GLUE_METRICS) and (orig_task_name not in GLUE_METRICS): continue checkpoint_config = copy.deepcopy( self.config["checkpoint_config"]) checkpoint_config["checkpoint_dir"] += f"/{task_name}" checkpoint_config["checkpoint_best"] = True checkpoint_metric = (( f"{task_name}/{orig_task_name}_valid/{GLUE_METRICS[orig_task_name]}" ) if using_slice else ( f"{task_name}/{task_name}_valid/{GLUE_METRICS[task_name]}") ) checkpoint_config["checkpoint_metric"] = checkpoint_metric checkpoint_config["checkpoint_metric_mode"] = "max" task_checkpointer = Checkpointer( checkpoint_config, verbose=self.config["verbose"]) self.task_checkpointers.append(task_checkpointer)