def setUpClass(cls):
    """Build shared tf/torch sample batches plus the tensorboard log paths used across tests."""
    cls.tf_data = Data({
        'x': tf.random.normal(shape=(1, 28, 28, 3)),
        'y': tf.random.uniform(shape=(1, ), maxval=10, dtype=tf.int32),
        'images': tf.random.normal(shape=(1, 28, 28, 3)),
        'embed': np.ones(shape=(1, 3, 3, 3)),
        'embed_images': np.ones(shape=(1, 3, 3, 3))
    })
    cls.torch_data = Data({
        'x': torch.rand(size=(1, 1, 28, 28)),
        'y': torch.rand(size=(3, )),
        'images': torch.rand(size=(1, 3, 28, 28)),
        'embed': np.ones(shape=(1, 3, 3, 3)),
        'embed_images': np.ones(shape=(1, 3, 3, 3))
    })
    cls.log_dir = os.path.join(tempfile.gettempdir(), 'tensorboard')
    cls.train_path = os.path.join(cls.log_dir, 'train')
    # embed dir lives under the train path for step 00001
    cls.embed_path = os.path.join(cls.train_path, '00001', 'embed')
    cls.on_begin_msg = "FastEstimator-Tensorboard: writing logs to {}".format(cls.log_dir)
def on_batch_end(self, data: Data) -> None:
    """Feed per-step train losses to ARC schedulers and periodically log the current lr."""
    training = self.system.mode == "train"
    if training and isinstance(self.lr_fn, ARC):
        # ARC consumes the raw train loss each step to decide future lr adjustments
        loss_key = min(self.model.loss_name)
        self.lr_fn.accumulate_single_train_loss(data[loss_key].numpy())
    if training and self.system.log_steps:
        step = self.system.global_step
        if step % self.system.log_steps == 0 or step == 1:
            data.write_with_log(self.outputs[0], np.float32(get_lr(self.model)))
def on_batch_end(self, data: Data) -> None:
    """Run the per-dataset child trace, then the parent handler with per-instance logging suppressed."""
    if self.system.ds_id != '':
        self.fe_per_ds_trace.on_batch_end(DSData(self.system.ds_id, data))
    # Block the main process from writing per-instance info since we already have the more detailed key
    data.per_instance_enabled = False
    try:
        super().on_batch_end(data)
    finally:
        data.per_instance_enabled = True
def test_max_to_keep_tf_architecture(self):
    """With max_to_keep=2 and save_architecture=True, only the two newest weight files and architecture dirs survive."""
    save_dir = tempfile.mkdtemp()
    model = fe.build(model_fn=one_layer_tf_model, optimizer_fn='adam')
    saver = ModelSaver(model=model, save_dir=save_dir, max_to_keep=2, save_architecture=True)
    saver.system = sample_system_object()
    # First two saves should eventually be pruned by max_to_keep
    saver.on_epoch_end(data=Data())
    saver.system.epoch_idx += 1
    saver.on_epoch_end(data=Data())
    saver.system.epoch_idx += 1
    # Third save: the older of the two expected survivors
    saver.on_epoch_end(data=Data())
    name_a = "{}_epoch_{}".format(saver.model.model_name, saver.system.epoch_idx)
    weights_a = os.path.join(save_dir, name_a + '.h5')
    arch_a = os.path.join(save_dir, name_a)
    # Fourth save: the newest survivor
    saver.system.epoch_idx += 1
    saver.on_epoch_end(data=Data())
    name_b = "{}_epoch_{}".format(saver.model.model_name, saver.system.epoch_idx)
    weights_b = os.path.join(save_dir, name_b + '.h5')
    arch_b = os.path.join(save_dir, name_b)
    with self.subTest('Check only four files are kept'):
        # 2 models x (weights file + architecture directory)
        self.assertEqual(len(os.listdir(save_dir)), 4)
    with self.subTest('Check two latest models are kept'):
        self.assertTrue(os.path.exists(weights_a))
        self.assertTrue(os.path.exists(weights_b))
        self.assertTrue(os.path.exists(arch_a))
        self.assertTrue(os.path.isdir(arch_a))
        self.assertTrue(os.path.exists(arch_b))
        self.assertTrue(os.path.isdir(arch_b))
def test_tf_traceability(self):
    """Traceability on a TF model should emit the tex file, resources dir, model figure, log image, and raw log."""
    # Start from a clean output directory (isdir implies existence)
    if os.path.isdir(self.tf_dir):
        shutil.rmtree(self.tf_dir)
    trace = Traceability(save_path=self.tf_dir)
    est = _build_estimator(fe.build(model_fn=LeNet, optimizer_fn="adam", model_name='tfLeNet'), trace)
    trace.system = est.system
    trace.system.epoch_idx = 1
    trace.system.summary.name = "TF Test"
    trace.on_begin(Data())
    trace.on_end(Data())
    crawler = os.walk(self.tf_dir)
    _, root_subdirs, root_files = next(crawler)
    self.assertIn('resources', root_subdirs, "A resources subdirectory should have been generated")
    self.assertIn('tf_test.tex', root_files, "The tex file should have been generated")
    # Might be a pdf and/or a .ds_store file depending on system, but shouldn't be more than that
    self.assertLessEqual(len(root_files), 3, "Extra files should not have been generated")
    _, _, resource_files = next(crawler)
    self.assertIn('tf_test_tfLeNet.pdf', resource_files, "A figure for the model should have been generated")
    self.assertIn('tf_test_logs.png', resource_files, "A log image should have been generated")
    self.assertIn('tf_test.txt', resource_files, "A raw log file should have been generated")
def on_batch_end(self, data: Data) -> None:
    """Write step-frequency TensorBoard summaries (graphs, weights, scalars, images, embeddings).

    Args:
        data: The Data instance holding the batch-level keys available for logging.
    """
    # Re-draw model graphs whenever the set of active epoch models has changed
    if self.write_graph and self.system.network.epoch_models.symmetric_difference(self.painted_graphs):
        self.writer.write_epoch_models(mode=self.system.mode)
        self.painted_graphs = self.system.network.epoch_models
    # Everything below is train-mode only
    if self.system.mode != 'train':
        return
    # Weight histograms on a step frequency
    if self.histogram_freq.freq and self.histogram_freq.is_step and \
            self.system.global_step % self.histogram_freq.freq == 0:
        self.writer.write_weights(mode=self.system.mode,
                                  models=self.system.network.models,
                                  step=self.system.global_step,
                                  visualize=self.paint_weights)
    # Scalars, images, and embeddings on a step frequency
    if self.update_freq.freq and self.update_freq.is_step and self.system.global_step % self.update_freq.freq == 0:
        # Only numeric values are eligible as scalars
        self.writer.write_scalars(mode=self.system.mode,
                                  step=self.system.global_step,
                                  scalars=filter(lambda x: is_number(x[1]), data.items()))
        # Skip image keys that are absent from this batch (data.get -> None)
        self.writer.write_images(mode=self.system.mode,
                                 step=self.system.global_step,
                                 images=filter(lambda x: x[1] is not None,
                                               map(lambda y: (y, data.get(y)), self.write_images)))
        # write_embeddings entries are (embedding_key, label_key, image_key) triples; skip missing embeddings
        self.writer.write_embeddings(
            mode=self.system.mode,
            step=self.system.global_step,
            embeddings=filter(
                lambda x: x[1] is not None,
                map(lambda t: (t[0], data.get(t[0]), data.get(t[1]), data.get(t[2])), self.write_embeddings)))
def on_epoch_end(self, data: Data) -> None:
    """Write epoch-frequency TensorBoard summaries (weights, scalars, images, embeddings).

    Args:
        data: The Data instance holding the epoch-level keys available for logging.
    """
    # Weight histograms: train mode only, when configured on an epoch frequency
    if self.system.mode == 'train' and self.histogram_freq.freq and not self.histogram_freq.is_step and \
            self.system.epoch_idx % self.histogram_freq.freq == 0:
        self.writer.write_weights(mode=self.system.mode,
                                  models=self.system.network.models,
                                  step=self.system.global_step,
                                  visualize=self.paint_weights)
    # Scalars/images/embeddings: step-frequency configs also flush here at epoch end
    if self.update_freq.freq and (self.update_freq.is_step or self.system.epoch_idx % self.update_freq.freq == 0):
        self.writer.write_scalars(mode=self.system.mode,
                                  step=self.system.global_step,
                                  scalars=filter(lambda x: is_number(x[1]), data.items()))
        # Skip image keys absent from the data (data.get -> None)
        self.writer.write_images(mode=self.system.mode,
                                 step=self.system.global_step,
                                 images=filter(lambda x: x[1] is not None,
                                               map(lambda y: (y, data.get(y)), self.write_images)))
        # write_embeddings entries are (embedding_key, label_key, image_key) triples; skip missing embeddings
        self.writer.write_embeddings(
            mode=self.system.mode,
            step=self.system.global_step,
            embeddings=filter(
                lambda x: x[1] is not None,
                map(lambda t: (t[0], data.get(t[0]), data.get(t[1]), data.get(t[2])), self.write_embeddings)))
def test_on_batch_end(self):
    """on_batch_end should append each batch's labels and predictions to the running lists."""
    self.pbm_calibrator.y_true = []
    self.pbm_calibrator.y_pred = []
    first_batch = {
        'y': np.array([0, 0, 1, 1]),
        'y_pred': np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0]])
    }
    self.pbm_calibrator.on_batch_end(data=Data(first_batch))
    with self.subTest('Check true values'):
        self.assertTrue(is_equal(self.pbm_calibrator.y_true, list(first_batch['y'])))
    with self.subTest('Check pred values'):
        self.assertTrue(is_equal(self.pbm_calibrator.y_pred, list(first_batch['y_pred'])))
    second_batch = {
        'y': np.array([1, 1, 0, 0]),
        'y_pred': np.array([[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]])
    }
    self.pbm_calibrator.on_batch_end(data=Data(second_batch))
    # After two batches the lists should hold both batches' contents in order
    with self.subTest('Check true values (2 batches)'):
        self.assertTrue(
            is_equal(self.pbm_calibrator.y_true, list(first_batch['y']) + list(second_batch['y'])))
    with self.subTest('Check pred values (2 batches)'):
        self.assertTrue(
            is_equal(self.pbm_calibrator.y_pred, list(first_batch['y_pred']) + list(second_batch['y_pred'])))
def on_end(self, data: Data) -> None:
    """Log total training wall-clock time and each model's final learning rate."""
    elapsed = round(time.perf_counter() - self.train_start, 2)
    data.write_with_log("total_time", "{} sec".format(elapsed))
    for model in self.system.network.models:
        # Only models that actually have an optimizer attached expose a meaningful lr
        if hasattr(model, "current_optimizer"):
            data.write_with_log(model.model_name + "_lr", get_lr(model))
def on_begin(self, data: Data) -> None:
    """Either begin a fresh backup or restore system state from the newest checkpoint.

    Args:
        data: The Data instance into which the restored epoch index is logged.

    Raises:
        RuntimeError: If deterministic training is enabled, since a restore cannot
            re-seed the prngs to their mid-training positions.
    """
    if fe.fe_deterministic_seed is not None:
        raise RuntimeError(
            "You cannot use RestoreWizard while in deterministic training mode since a restored" +
            " training can't guarantee that all prngs will be reset to exactly the same position")
    if not self.should_restore():
        self._cleanup(self.dirs)  # Remove any partially completed checkpoints
        print("FastEstimator-RestoreWizard: Backing up to {}".format(self.directory))
    else:
        # _load_key determines which of the two alternating save dirs holds the latest state
        self._load_key()
        directory = self.dirs[self.dir_idx]
        self.system.load_state(directory)
        data.write_with_log("epoch", self.system.epoch_idx)
        print("FastEstimator-RestoreWizard: Restoring from {}, resume training".format(directory))
        self.dir_idx = int(not self.dir_idx)  # Flip the idx so that next save goes to other dir
        self._cleanup(self.dirs[self.dir_idx])  # Clean out the other dir in case it had a partial save
def on_batch_end(self, data: Data) -> None:
    """Compute per-sentence precision scores for the batch and record them per instance."""
    y_pred = to_number(data['pred'])
    y_true = to_number(data['target_real'])
    # Collapse one-hot / distribution representations down to class indices
    if y_true.shape[-1] > 1 and y_true.ndim > 2:
        y_true = np.argmax(y_true, axis=-1)
    if y_pred.shape[-1] > 1 and y_pred.ndim > 2:
        y_pred = np.argmax(y_pred, axis=-1)
    scores = self.batch_precision_parameters(y_true, y_pred)
    data.write_per_instance_log(self.outputs[0], scores)
def on_epoch_end(self, data: Data) -> None:
    """Log per-dataset means for each monitored key, plus the overall mean across datasets."""
    for key, ds_vals in self.test_results.items():
        for ds_id, vals in ds_vals.items():
            if ds_id != '':
                # Record the per-dataset mean through a ds-scoped view of the data
                DSData(ds_id, data).write_with_log(key, np.mean(np.array(vals), axis=0))
        # The overall value pools every recorded entry across all datasets
        pooled = [entry for values in ds_vals.values() for entry in values]
        data.write_with_log(key, np.mean(np.array(pooled), axis=0))
def on_batch_end(self, data: Data) -> None:
    """On logging steps, forward monitored keys and report training throughput (steps/sec)."""
    log_steps = self.system.log_steps
    step = self.system.global_step
    if log_steps and (step % log_steps == 0 or step == 1):
        for key in self.inputs:
            if key in data:
                data.write_with_log(key, data[key])
        if step > 1:
            # Throughput is averaged over the elapsed intervals accumulated since the last log
            self.elapse_times.append(time.perf_counter() - self.step_start)
            data.write_with_log("steps/sec", round(log_steps / np.sum(self.elapse_times), 2))
        # Reset the timing window for the next logging interval
        self.elapse_times = []
        self.step_start = time.perf_counter()
def on_epoch_end(self, data: Data) -> None:
    """Track the monitored metric and cut the learning rate after `patience` stale epochs."""
    current = data[self.inputs[0]]
    if self.monitor_op(current, self.best):
        # Improvement: remember the new best and reset the stall counter
        self.best = current
        self.wait = 0
    else:
        self.wait += 1
        if self.wait >= self.patience:
            # Scale the lr down, but never below the configured floor
            new_lr = max(self.min_lr, np.float32(self.factor * get_lr(self.model)))
            set_lr(self.model, new_lr)
            self.wait = 0
            data.write_with_log(self.outputs[0], new_lr)
            print("FastEstimator-ReduceLROnPlateau: learning rate reduced to {}".format(new_lr))
def on_begin(self, data: Data) -> None:
    """Restore prior training state when the restore directory has content, else start a backup."""
    has_checkpoint = os.path.exists(self.directory) and os.listdir(self.directory)
    if not has_checkpoint:
        print("FastEstimator-RestoreWizard: Backing up in {}".format(self.directory))
    else:
        self._scan_files()
        self._load_files()
        data.write_with_log("epoch", self.system.epoch_idx)
        print("FastEstimator-RestoreWizard: Restoring from {}, resume training".format(self.directory))
def on_epoch_end(self, data: Data) -> None:
    """Log the epoch F1 score: binary average for binary tasks, per-class scores otherwise."""
    average = 'binary' if self.binary_classification else None
    score = f1_score(self.y_true, self.y_pred, average=average, **self.kwargs)
    data.write_with_log(self.outputs[0], score)
def on_epoch_end(self, data: Data) -> None:
    """Save the model whenever the monitored metric improves, and log best/since-best values."""
    current = data[self.metric]
    if self.monitor_op(current, self.best):
        self.best = current
        self.since_best = 0
        if self.save_dir:
            self.model_path = save_model(self.model, self.save_dir, self.model_name)
            print("FastEstimator-BestModelSaver: Saved model to {}".format(self.model_path))
    else:
        self.since_best += 1
    data.write_with_log(self.outputs[0], self.since_best)
    data.write_with_log(self.outputs[1], self.best)
def on_end(self, data: Data) -> None:
    """Collect metric histories for the best/worst/requested indices and record them as summaries."""
    index_summaries = DefaultKeyDict(default=lambda x: Summary(name=x))
    for mode in self.mode:
        final_scores = sorted([(idx, elem[-1][1]) for idx, elem in self.index_history[mode].items()],
                              key=lambda x: x[1])
        # Highest final scores come from the tail of the sorted list
        max_idx_list = {elem[0] for elem in final_scores[-1:-self.n_max_to_keep - 1:-1]}
        # Lowest final scores come from the head
        min_idx_list = {elem[0] for elem in final_scores[:self.n_min_to_keep]}
        target_idx_list = min_idx_list.union(max_idx_list, self.idx_to_keep)
        for idx in target_idx_list:
            for step, score in self.index_history[mode][idx]:
                index_summaries[idx].history[mode][self.metric_key][step] = score
    summaries = list(index_summaries.values())
    self.system.add_graph(self.outputs[0], summaries)  # So traceability can draw it
    data.write_without_log(self.outputs[0], summaries)
def on_batch_end(self, data: Data) -> None:
    """Update running accuracy counts and record per-instance correctness for the batch."""
    y_true = to_number(data[self.true_key])
    y_pred = to_number(data[self.pred_key])
    if y_true.shape[-1] > 1 and y_true.ndim > 1:
        y_true = np.argmax(y_true, axis=-1)
    if y_pred.shape[-1] > 1 and y_pred.ndim > 1:
        y_pred = np.argmax(y_pred, axis=-1)
    else:  # binary classification (pred shape is [batch, 1])
        if self.from_logits:
            # Map raw logits through a sigmoid before thresholding
            y_pred = 1 / (1 + np.exp(-y_pred))
        y_pred = np.round(y_pred)
    assert y_pred.size == y_true.size
    matches = y_pred.ravel() == y_true.ravel()
    self.correct += np.sum(matches)
    self.total += matches.size
    data.write_per_instance_log(self.outputs[0], np.array(matches, dtype=np.int8))
def on_epoch_end(self, data: Data) -> None:
    """Flush epoch-frequency TensorBoard summaries: weights, embeddings, scalars, and images.

    Args:
        data: The Data instance holding the epoch-level keys available for logging.
    """
    # Weight histograms: train mode only, when configured on an epoch frequency
    if self.system.mode == 'train' and self.histogram_freq.freq and not self.histogram_freq.is_step and \
            self.system.epoch_idx % self.histogram_freq.freq == 0:
        self.writer.write_weights(mode=self.system.mode,
                                  models=self.system.network.models,
                                  step=self.system.global_step,
                                  visualize=self.paint_weights)
    # Write out any embeddings which were aggregated over batches
    for name, val_list in self.collected_embeddings.items():
        # If any batch is missing a component, drop that component rather than concat mismatched pieces
        embeddings = None if any(x[0] is None for x in val_list) else concat([x[0] for x in val_list])
        labels = None if any(x[1] is None for x in val_list) else concat([x[1] for x in val_list])
        imgs = None if any(x[2] is None for x in val_list) else concat([x[2] for x in val_list])
        self.writer.write_embeddings(mode=self.system.mode,
                                     step=self.system.global_step,
                                     embeddings=[(name, embeddings, labels, imgs)])
    self.collected_embeddings.clear()
    # Get any embeddings which were generated externally on epoch end
    if self.embedding_freq.freq and (self.embedding_freq.is_step
                                     or self.system.epoch_idx % self.embedding_freq.freq == 0):
        # write_embeddings entries are (embedding_key, label_key, image_key) triples; skip missing embeddings
        self.writer.write_embeddings(
            mode=self.system.mode,
            step=self.system.global_step,
            embeddings=filter(
                lambda x: x[1] is not None,
                map(lambda t: (t[0], data.get(t[0]), data.get(t[1]), data.get(t[2])), self.write_embeddings)))
    # Scalars and images: step-frequency configs also flush here at epoch end
    if self.update_freq.freq and (self.update_freq.is_step or self.system.epoch_idx % self.update_freq.freq == 0):
        self.writer.write_scalars(mode=self.system.mode,
                                  step=self.system.global_step,
                                  scalars=filter(lambda x: is_number(x[1]), data.items()))
        # Skip image keys absent from the data (data.get -> None)
        self.writer.write_images(mode=self.system.mode,
                                 step=self.system.global_step,
                                 images=filter(lambda x: x[1] is not None,
                                               map(lambda y: (y, data.get(y)), self.write_images)))
def test_on_epoch_end(self):
    """EvalEssential should log the mean of the accumulated eval losses."""
    data = Data({})
    trace = EvalEssential(monitor_names='loss')
    trace.system = sample_system_object()
    trace.eval_results = {'loss': [10, 20]}
    trace.on_epoch_end(data=data)
    # mean of [10, 20]
    self.assertEqual(data['loss'], 15.0)
def test_on_epoch_end(self):
    """TestEssential should log the mean of the accumulated test losses."""
    data = Data({})
    trace = TestEssential(monitor_names={'loss'})
    trace.system = sample_system_object()
    trace.test_results['loss'][''].extend([10, 20])
    trace.on_epoch_end(data=data)
    # mean of [10, 20]
    self.assertEqual(data['loss'], 15.0)
def _print_message(self, header: str, data: Data, log_epoch: bool = False) -> None:
    """Print a log message to the screen, and record the `data` into the `system` summary.

    Args:
        header: The prefix for the log message.
        data: A collection of data to be recorded.
        log_epoch: Whether epoch information should be included in the log message.
    """
    log_message = header
    if log_epoch:
        log_message += "epoch: {}; ".format(self.system.epoch_idx)
        self.system.write_summary('epoch', self.system.epoch_idx)
    deferred = []  # Multi-element arrays are printed after all scalar entries
    for key, val in humansorted(data.read_logs().items(), key=lambda x: x[0]):
        if isinstance(val, ValWithError):
            # ValWithError handles its own string formatting
            log_message += "{}: {}; ".format(key, str(val))
        else:
            val = to_number(val)
            if val.size > 1:
                # Defer large arrays so they don't break up the scalar line
                deferred.append("\n{}:\n{};".format(key, np.array2string(val, separator=',')))
            else:
                log_message += "{}: {}; ".format(key, str(val))
        self.system.write_summary(key, val)
    log_message = log_message.strip()
    for elem in deferred:
        log_message += elem
    print(log_message)
def setUpClass(cls):
    """Wire a TrainEssential to a sample system that appears to be mid-training."""
    cls.data = Data({'loss': 10})
    cls.train_essential = TrainEssential(monitor_names='loss')
    cls.train_essential.system = sample_system_object()
    cls.train_essential.system.log_steps = 5
    cls.train_essential.system.global_step = 10
    # Pretend the epoch started 500s ago and the current step window 300s ago
    cls.train_essential.epoch_start = time.perf_counter() - 500
    cls.train_essential.step_start = time.perf_counter() - 300
def on_epoch_end(self, data: Data) -> None:
    """Fit a Platt-binner calibrator on the collected predictions and record calibrated probabilities."""
    self.y_true = np.squeeze(np.stack(self.y_true))
    self.y_pred = np.stack(self.y_pred)
    calibrator = cal.PlattBinnerMarginalCalibrator(num_calibration=len(self.y_true), num_bins=10)
    calibrator.train_calibration(probs=self.y_pred, labels=self.y_true)
    if self.save_path:
        # Save either unconditionally (no save_key) or when the save_key signal is 0
        should_save = not self.save_key or to_number(data[self.save_key]) == 0
        if should_save:
            with open(self.save_path, 'wb') as f:
                dill.dump(calibrator.calibrate, file=f)
            print(f"FastEstimator-PBMCalibrator: Calibrator written to {self.save_path}")
    data.write_without_log(self.outputs[0], calibrator.calibrate(self.y_pred))
def setUpClass(cls):
    """Build NaN-loss Data objects (numpy/tf/torch) plus a small network and traces for TerminateOnNaN tests."""
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0
    cls.data_np = Data({'loss': np.nan})
    cls.data_tf = Data({'loss': tf.constant(np.nan)})
    cls.data_torch = Data({'loss': torch.tensor(np.nan)})
    cls.expected_msg = "FastEstimator-TerminateOnNaN: NaN Detected in: loss"
    cls.expected_loss_keys = {"ce"}
    cls.expected_all_keys = {"ce", "accuracy", "f1_score"}
    tf_model = fe.build(model_fn=one_layer_tf_model, optimizer_fn='adam')
    cls.network = fe.Network(ops=[
        ModelOp(model=tf_model, inputs="x", outputs="y"),
        CrossEntropy(inputs=("y_pred", "y"), outputs="ce"),
        UpdateOp(model=tf_model, loss_name="ce")
    ])
    cls.traces = [
        Accuracy(true_key="y", pred_key="y_pred", output_name="accuracy"),
        F1Score(true_key="y", pred_key="y_pred", output_name="f1_score")
    ]
def _run_traces_on_end(traces: Iterable[Trace]) -> None:
    """Invoke the on_end methods of the given traces against a shared Data instance.

    Args:
        traces: List of traces.
    """
    end_data = Data()
    for trace in traces:
        trace.on_end(end_data)
def setUpClass(cls):
    """Define the expected logger messages for each lifecycle hook."""
    cls.data = Data({})
    cls.on_begin_global_step_msg = "FastEstimator-Start: step: 2;"
    cls.on_begin_msg = "FastEstimator-Start: step: 1;"
    cls.on_batch_end_msg = "FastEstimator-Train: step: 1;"
    cls.on_epoch_end_train_msg = "FastEstimator-Train: step: 2; epoch: 0;"
    cls.on_epoch_end_eval_msg = "FastEstimator-Eval: step: 2; epoch: 0;"
    cls.on_epoch_end_test_msg = "FastEstimator-Test: step: 2; epoch: 0;"
    cls.on_end_msg = "FastEstimator-Finish: step: 2;"
def _run_traces_on_ds_begin(self, traces: Iterable[PerDSTrace]) -> None:
    """Invoke the on_ds_begin methods of the given traces, then check for early exit.

    Args:
        traces: List of traces.
    """
    ds_data = Data()
    for trace in traces:
        trace.on_ds_begin(ds_data)
    self._check_early_exit()
def _run_traces_on_epoch_end(self, traces: Iterable[Trace]) -> None:
    """Invoke the on_epoch_end methods of the given traces, then check for early exit.

    Args:
        traces: List of traces.
    """
    epoch_data = Data()
    for trace in traces:
        trace.on_epoch_end(epoch_data)
    self._check_early_exit()