def __next__(self):
    """Advance the wrapped iterator by one step and run the periodic hooks.

    Returns:
        The global step obtained from ``next(self.__loop)``.

    Raises:
        StopIteration: if an interrupt signal has been recorded on this
            object (raised with the message ``"SIGINT"``), or if the wrapped
            iterator is exhausted. ``self.__finish()`` runs before the
            exception propagates in both cases.
    """
    # A recorded signal short-circuits the iteration: close the loop,
    # run the finishing hook and stop.
    if self.__signal_received is not None:
        logger.log('\nKilling Loop.', color=Text.danger)
        loop.finish_loop()
        self.__finish()
        raise StopIteration("SIGINT")

    try:
        step = next(self.__loop)
    except StopIteration as e:
        # Exhausted: run the finishing hook, then let the caller see the
        # same exception.
        self.__finish()
        raise e

    loop.set_global_step(step)

    # Periodic work, each gated by its own interval.
    if self.is_interval(self.__log_write_interval, step):
        tracker.save()

    if self.is_interval(self.__log_new_line_interval, step):
        logger.log()

    if self.__is_save_models:
        if self.is_interval(self.__save_models_interval, step):
            experiment.save_checkpoint()

    return step
def __print_info_and_check_repo(self):
    """
    ## 🖨 Print the experiment info and check git repo status

    Logs the experiment name and run UUID, the run comment (when it is not
    empty), the repository state with the commit message, and the run this
    one was loaded from (when ``self.run.load_run`` is set).

    If ``self.check_repo_dirty`` is set and the run is dirty, a failure
    message is logged and the process exits with status 1.
    """
    logger.new_line()
    logger.log([(self.name, Text.title),
                ': ',
                (str(self.run.uuid), Text.meta)])

    if self.run.comment != '':
        logger.log(['\t', (self.run.comment, Text.highlight)])

    # The tab must prefix both labels. Writing `"\t" "[dirty]" if ... else
    # "[clean]"` concatenates the tab with "[dirty]" only, because adjacent
    # string literals are joined before the conditional is applied.
    repo_state = "[dirty]" if self.run.is_dirty else "[clean]"
    logger.log([
        f"\t{repo_state}",
        ": ",
        (f"\"{self.run.commit_message.strip()}\"", Text.highlight)
    ])

    if self.run.load_run is not None:
        logger.log([
            "\tloaded from",
            ": ",
            (f"{self.run.load_run}", Text.meta2),
        ])

    # Exit if git repository is dirty
    if self.check_repo_dirty and self.run.is_dirty:
        logger.log([("[FAIL]", Text.danger),
                    " Cannot trial an experiment with uncommitted changes."])
        exit(1)
def main():
    """Command-line entry point for listing experiments or starting TensorBoard.

    ``-l`` lists the available experiments. ``-e NAME [NAME ...]`` lists the
    last trials of the named experiments and then runs the TensorBoard
    command built by ``utils.get_tensorboard_cmd``. With neither option the
    usage string is printed.
    """
    lab = Lab(os.getcwd())

    parser = argparse.ArgumentParser(description='Run TensorBoard')
    parser.add_argument("-l",
                        action='store_true',
                        dest='list',
                        help='List all available experiments')
    parser.add_argument('-e',
                        required=False,
                        type=str,
                        nargs='+',
                        dest='experiments',
                        help='List of experiments')
    args = parser.parse_args()

    if args.list:
        utils.list_experiments(lab, logger)
        return

    if not args.experiments:
        parser.print_usage()
        return

    # Show the selected trials first; this lookup fails if any of the
    # requested experiments is missing.
    selected_runs = utils.get_last_trials(lab, args.experiments)
    utils.list_trials(selected_runs, logger)

    # Hand over to TensorBoard.
    tensorboard_cmd = utils.get_tensorboard_cmd(lab, args.experiments)
    logger.log("Starting TensorBoard", color=colors.Style.bold)
    os.system(tensorboard_cmd)
def calc_configs(self,
                 configs: Optional[Configs],
                 configs_dict: Dict[str, any],
                 run_order: Optional[List[Union[List[str], str]]]):
    """Create the config processor, run it, and emit an empty log call.

    Args:
        configs: passed as the first argument to ``ConfigProcessor``.
        configs_dict: passed as the second argument to ``ConfigProcessor``.
        run_order: passed to the processor when it is invoked.

    The processor is stored on ``self.configs_processor``.
    """
    processor = ConfigProcessor(configs, configs_dict)
    self.configs_processor = processor
    processor(run_order)

    logger.log()
def handler(self, sig, frame):
    """Signal handler that postpones the first interrupt.

    The first call stores ``(sig, frame)`` on ``self.signal_received`` and
    logs a notice. Any later call, while a signal is still stored, forwards
    the *stored* signal to ``self.old_handler`` straight away.
    """
    pending = self.signal_received

    if pending is None:
        # First interrupt: remember it for later instead of acting on it.
        self.signal_received = (sig, frame)
        logger.log([('\nSIGINT received. Delaying KeyboardInterrupt.', Text.danger)])
        return

    # Second interrupt: pass it on without delaying.
    self.old_handler(*pending)
def _open_dashboard():
    """Start the ``lab_dashboard`` server, or log how to install it.

    If ``lab_dashboard`` cannot be imported, two messages are logged (the
    failure and the ``pip`` command) and the function returns without
    starting anything.
    """
    try:
        import lab_dashboard
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so catching
        # ImportError alone covers both.
        logger.log("Cannot import ", ('lab_dashboard', Text.highlight), '.')
        logger.log('Install with ',
                   ('pip install machine_learning_lab_dashboard', Text.value))
        return

    lab_dashboard.start_server()
def __finish(self):
    """Restore the previous SIGINT handler and flush state.

    Re-installs ``self.old_handler`` for SIGINT, saves the tracker, emits an
    empty log call, and saves a checkpoint when model saving is enabled.
    """
    try:
        signal.signal(signal.SIGINT, self.old_handler)
    except ValueError:
        # Deliberately ignored: failing to restore the handler must not
        # prevent the saves below.
        # NOTE(review): signal.signal documents ValueError when called
        # outside the main thread; confirm that is the case guarded here.
        pass

    tracker.save()
    logger.log()

    if not self.__is_save_models:
        return

    logger.log("Saving model...")
    experiment.save_checkpoint()
def get_device(use_cuda: bool, cuda_device: int):
    """Pick the ``torch.device`` to run on.

    Args:
        use_cuda: whether CUDA should be used when it is available.
        cuda_device: index of the requested CUDA device.

    Returns:
        The CPU device when CUDA is not requested or not available; the
        requested CUDA device when its index is below the device count;
        otherwise the last CUDA device, after logging a warning.
    """
    if not (use_cuda and torch.cuda.is_available()):
        return torch.device('cpu')

    if cuda_device < torch.cuda.device_count():
        return torch.device('cuda', cuda_device)

    # Requested index is out of range: warn and fall back to the last device.
    logger.log(f"Cuda device index {cuda_device} higher than "
               f"device count {torch.cuda.device_count()}", Text.warning)
    return torch.device('cuda', torch.cuda.device_count() - 1)
def _print_artifacts_list(self, table: Dict[str, int], artifacts: Dict[str, Artifact]):
    """Log every artifact value as a ``name: value`` line.

    Args:
        table: its keys select which artifacts are printed and in what
            order; the values are not used here.
        artifacts: artifacts by name; each is queried with ``keys()`` and
            ``get_string(key, artifacts)``.

    Nothing is logged when ``table`` is empty.
    """
    order = list(table)
    if not order:
        return

    # De-duplicate keys while keeping first-seen order. A plain set would
    # make the order of the printed lines arbitrary.
    keys = dict.fromkeys(k for name in order for k in artifacts[name].keys())

    for k in keys:
        for name in order:
            value = artifacts[name].get_string(k, artifacts)
            logger.log([(name, Text.key), ": ", (value, Text.value)])
def print_all(self, others: Dict[str, Artifact]):
    """Show all stored images in a three-column matplotlib grid.

    Args:
        others: not used by this method.

    If matplotlib is unavailable (``plt`` is ``None``) a message is logged
    and nothing is drawn. Nothing is drawn when there are no stored values
    either.
    """
    if plt is None:
        logger.log(('matplotlib', logger.Text.highlight),
                   ' not found. So cannot display images')
        # Without matplotlib there is nothing further to do; continuing
        # would dereference `None`.
        return

    images = [_to_numpy(v) for v in self._values.values()]
    if not images:
        # A grid with zero rows cannot be created.
        return

    cols = 3
    rows = (len(images) + cols - 1) // cols

    # squeeze=False keeps `axs` two-dimensional even for a single row, so
    # the [row, col] indexing below is always valid.
    fig, axs = plt.subplots(rows, cols,
                            sharex='all', sharey='all',
                            figsize=(8, 10),
                            squeeze=False)
    fig.suptitle(self.name)

    for i, img in enumerate(images):
        row, col = divmod(i, cols)
        axs[row, col].imshow(img)

    plt.show()
def print_info_and_check_repo(self):
    """
    ## 🖨 Print the experiment info and check git repo status

    Logs the experiment name, the trial comment, and the repository state
    with the commit message. If ``self.check_repo_dirty`` is set and the
    trial is dirty, a failure message is logged and the process exits with
    status 1.
    """
    title_line = [(self.info.name, colors.Style.bold)]
    logger.log_color(title_line)

    comment_line = [("\t", None),
                    (self.trial.comment, colors.BrightColor.cyan)]
    logger.log_color(comment_line)

    if self.trial.is_dirty:
        state_label = "[dirty]"
    else:
        state_label = "[clean]"
    commit_line = [("\t", None),
                   (state_label, None),
                   (": ", None),
                   (f"\"{self.trial.commit_message.strip()}\"", colors.BrightColor.orange)]
    logger.log_color(commit_line)

    # Exit if git repository is dirty
    if not (self.check_repo_dirty and self.trial.is_dirty):
        return

    logger.log("Cannot trial an experiment with uncommitted changes. ", new_line=False)
    logger.log("[FAIL]", color=colors.BrightColor.red)
    exit(1)
def print(self):
    """Log every config, one line each, under a ``Configs:`` heading.

    Configs in the calculator's topological order come first. Any name in
    ``self.parser.types`` that is not in that order is appended and printed
    as ignored.
    """
    names = self.calculator.topological_order.copy()
    seen = set(names)
    ignored = set()

    # Append the configs that the calculator did not order.
    for name in self.parser.types:
        if name in seen:
            continue
        seen.add(name)
        names.append(name)
        ignored.add(name)

    logger.log("Configs:", Text.heading)

    for name in names:
        computed = getattr(self.calculator.configs, name, None)

        if name in ignored:
            parts = self.__print_config(name, is_ignored=True)
        elif name in self.parser.list_appends:
            parts = self.__print_config(name, value=computed, is_list=True)
        elif name in self.parser.options:
            chosen = self.parser.values[name]
            available = self.parser.options[name]
            alternatives = list(available.keys())

            # Only a value that is one of the known options is reported as
            # the selected option; it is then left out of the alternatives.
            if chosen in available:
                alternatives.remove(chosen)
            else:
                chosen = None

            parts = self.__print_config(name,
                                        value=computed,
                                        option=chosen,
                                        other_options=alternatives)
        else:
            parts = self.__print_config(name, value=computed)

        logger.log(parts)

    logger.new_line()
def get_last_run_checkpoint(experiment_path: PurePath, run_uuid: str, checkpoint: int = -1):
    """Resolve a checkpoint of a run and build the path to it.

    Args:
        experiment_path: directory containing the run directories.
        run_uuid: identifier of the run.
        checkpoint: passed to ``get_run_checkpoint``; defaults to ``-1``.

    Returns:
        ``(None, None)`` when ``get_run_checkpoint`` returns ``None``.
        Otherwise ``(path, checkpoint)``, where ``path`` is
        ``experiment_path / run_uuid / "checkpoints" / checkpoint`` and
        ``checkpoint`` is the value returned by ``get_run_checkpoint``.
    """
    checkpoint = get_run_checkpoint(experiment_path, run_uuid, checkpoint)

    if checkpoint is None:
        logger.log("Couldn't find a previous run/checkpoint")
        return None, None

    logger.log(["Selected ",
                ("run", Text.key),
                " = ",
                (run_uuid, Text.value),
                " ",
                ("checkpoint", Text.key),
                " = ",
                (checkpoint, Text.value)])

    checkpoints_dir = experiment_path / str(run_uuid) / "checkpoints"
    return checkpoints_dir / str(checkpoint), checkpoint
def _print_artifacts_table(self, table: Dict[str, int], artifacts: Dict[str, Artifact]):
    """Log the artifacts as a ``|``-separated table.

    Args:
        table: its keys select the columns (in order); each value is passed
            to ``__format_artifact`` together with the cell text.
        artifacts: artifacts by name; each is queried with ``keys()`` and
            ``get_string(key, artifacts)``.

    The first logged line holds the artifact names; each following line
    holds the values for one key. Nothing is logged when ``table`` is empty.
    """
    columns = list(table.keys())
    if not len(columns):
        return

    # Unique keys across all artifacts, in first-seen order.
    row_keys = list(dict.fromkeys(
        key for name in columns for key in artifacts[name].keys()
    ))

    header = '|'.join([self.__format_artifact(table[name], name) for name in columns])
    logger.log(header, Text.heading)

    for key in row_keys:
        cells = []
        for name in columns:
            text = artifacts[name].get_string(key, artifacts)
            cells.append(self.__format_artifact(table[name], text))
        logger.log('|'.join(cells), Text.value)
def __handler(self, sig, frame):
    """SIGINT handler that can postpone the interrupt.

    * If a signal is already stored, the stored signal is forwarded to
      ``self.old_handler`` immediately.
    * Otherwise, when ``__is_loop_on_interrupt`` is set, ``(sig, frame)`` is
      stored for later.
    * Otherwise ``__finish()`` runs and the signal is forwarded to
      ``self.old_handler`` right away.
    """
    # Pass second interrupt without delaying
    if self.__signal_received is not None:
        logger.log('\nSIGINT received twice. Stopping...', color=Text.danger)
        self.old_handler(*self.__signal_received)
        return

    if not self.__is_loop_on_interrupt:
        self.__finish()
        logger.log('Killing loop...', Text.danger)
        self.old_handler(sig, frame)
        return

    # Store the interrupt signal for later
    self.__signal_received = (sig, frame)
    logger.log('\nSIGINT received. Delaying KeyboardInterrupt.', color=Text.danger)
# We'll track the progress of that too for i in range(100): time.sleep(0.01) # Progress is tracked manually unlike in the top level iterator. # The progress updates do not have to be sequential. logger.progress(i + 1) # Log stored values. # This will output to the console and write TensorBoard summaries. logger.write() # Store progress in the trials file and in the python code as a comment if (global_step + 1) % 10 == 0: logger.save_progress() # By default we will overwrite the same console line. # `new_line` makes it go to the next line. # This helps keep the console output concise. if (global_step + 1) % 10 == 0: logger.new_line() except KeyboardInterrupt: logger.finish_loop() logger.new_line() logger.log( f"Stopping the training at {global_step} and saving checkpoints" ) break with logger.section("Cleaning up"): time.sleep(0.5)
from lab import logger
from lab.logger.colors import Text, Color

if __name__ == '__main__':
    # Demo: log one sample line for each text style, then for each colour.
    logger.log("Colors are missing when views on github", Text.highlight)

    style_samples = [
        ('Styles\n', Text.heading),
        ('Danger\n', Text.danger),
        ('Warning\n', Text.warning),
        ('Meta\n', Text.meta),
        ('Key\n', Text.key),
        ('Meta2\n', Text.meta2),
        ('Title\n', Text.title),
        ('Heading\n', Text.heading),
        ('Value\n', Text.value),
        ('Highlight\n', Text.highlight),
        ('Subtle\n', Text.subtle),
    ]
    logger.log(style_samples)

    color_samples = [
        ('Colors\n', Text.heading),
        ('Red\n', Color.red),
        ('Black\n', Color.black),
        ('Blue\n', Color.blue),
        ('Cyan\n', Color.cyan),
        ('Green\n', Color.green),
        ('Orange\n', Color.orange),
        # A list of styles is passed to combine a colour with a text style.
        ('Purple Heading\n', [Color.purple, Text.heading]),
        ('White\n', Color.white),
    ]
    logger.log(color_samples)
def main():
    """Train and test the MNIST model, reporting through ``logger``.

    Steps: parse arguments, seed torch and pick the device, build the MNIST
    train/test loaders, create the model and SGD optimizer, register the
    indicators, start the experiment, then run one train/test pass per epoch
    inside the monitored loop. A keyboard interrupt ends the loop.
    """
    args = parse_args()

    cuda_enabled = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if cuda_enabled else "cpu")

    # Extra DataLoader options are only passed when running on CUDA.
    if cuda_enabled:
        loader_options = {'num_workers': 1, 'pin_memory': True}
    else:
        loader_options = {}

    # Loading data
    with logger.section("Loading data"):
        train_set = datasets.MNIST(
            './data',
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),
            ]))
        train_loader = torch.utils.data.DataLoader(train_set,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   **loader_options)

        test_set = datasets.MNIST(
            './data',
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),
            ]))
        test_loader = torch.utils.data.DataLoader(test_set,
                                                  batch_size=args.test_batch_size,
                                                  shuffle=True,
                                                  **loader_options)

    # Model creation
    with logger.section("Create model"):
        model = Net().to(device)
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum)

    # Specify indicators
    logger.add_indicator("train_loss", queue_limit=10, is_print=True)
    logger.add_indicator("test_loss", is_histogram=False, is_print=True)
    logger.add_indicator("accuracy", is_histogram=False, is_print=True)

    # One histogram indicator for each trainable parameter and its gradient.
    for param_name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        logger.add_indicator(param_name, is_histogram=True, is_print=False)
        logger.add_indicator(f"{param_name}_grad", is_histogram=True, is_print=False)

    # Start the experiment
    EXPERIMENT.start_train()

    # Loop through the monitored iterator
    for epoch in logger.loop(range(0, args.epochs)):
        # Interrupts are delayed while the epoch is processed, so the loop
        # ends at the end of an iteration rather than in the middle of one.
        try:
            with logger.delayed_keyboard_interrupt():
                # Training and testing
                train(args, model, device, train_loader, optimizer, epoch)
                test(model, device, test_loader)

                # Store model parameter values and gradients
                for param_name, param in model.named_parameters():
                    if not param.requires_grad:
                        continue
                    logger.store(param_name, param.data.cpu().numpy())
                    logger.store(f"{param_name}_grad", param.grad.cpu().numpy())

                # Clear line and output to console
                logger.write()

                # Output the progress summaries to `trial.yaml` and
                # to the python file header
                logger.save_progress()

                # Move to the next console line at the end of each epoch
                logger.new_line()

        # Handle the delayed interrupt
        except KeyboardInterrupt:
            logger.finish_loop()
            logger.new_line()
            logger.log("\nKilling loop...")
            break
def print_all(self, others: Dict[str, Artifact]):
    """Log this artifact's name as a heading, then each stored value.

    Args:
        others: not used by this method.
    """
    logger.log(self.name, TextStyle.heading)

    stored_values = self._values.values()
    for item in stored_values:
        logger.log(item, TextStyle.value)
def main():
    """Train and test the Keras MNIST model in a TensorFlow session.

    Steps: parse arguments, load and scale MNIST, build the datasets, the
    model and the train/test ops, register the indicators, then run one
    train/test pass per epoch inside the monitored loop. A keyboard
    interrupt ends the loop.
    """
    args = parse_args()

    # Loading data
    with logger.section("Load data"):
        mnist = tf.keras.datasets.mnist
        (x_train, y_train), (x_test, y_test) = mnist.load_data()

        # Pixel values are divided by 255.0 before building the datasets.
        x_train = x_train / 255.0
        x_test = x_test / 255.0

        train_dataset = create_mnist_dataset(x_train, y_train, args.batch_size)
        test_dataset = create_mnist_dataset(x_test, y_test, args.batch_size)

    # Model creation
    with logger.section("Create model"):
        layers = [
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(512, activation=tf.nn.relu),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation=tf.nn.softmax),
        ]
        model = tf.keras.models.Sequential(layers)

    # Creation of the trainer
    with logger.section("Create trainer"):
        optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)

        train_iterator = train_dataset.make_initializable_iterator()
        train_data, train_target = train_iterator.get_next()
        train_loss = loss(model, train_data, train_target)
        train_op = optimizer.minimize(train_loss)

        test_iterator = test_dataset.make_initializable_iterator()
        test_data, test_target = test_iterator.get_next()
        test_loss = loss(model, test_data, test_target)
        test_accuracy = accuracy(model, test_data, test_target)

    logger.add_indicator("train_loss", queue_limit=10, is_print=True)
    logger.add_indicator("test_loss", is_histogram=False, is_print=True)
    logger.add_indicator("accuracy", is_histogram=False, is_print=True)

    # Number of training batches, passed to `train` below.
    batches = len(x_train) // args.batch_size

    with tf.Session() as session:
        EXPERIMENT.start_train(session)

        # Loop through the monitored iterator
        for epoch in logger.loop(range(0, args.epochs)):
            # Interrupts are delayed while the epoch is processed, so the
            # loop ends at the end of an iteration rather than mid-epoch.
            try:
                with logger.delayed_keyboard_interrupt():
                    # Training and testing
                    session.run(train_iterator.initializer)
                    train(args, session, train_loss, train_op, batches, epoch)

                    session.run(test_iterator.initializer)
                    test_batches = len(x_test) // args.batch_size
                    test(session, test_loss, test_accuracy, test_batches)

                    # Clear line and output to console
                    logger.write()

                    # Output the progress summaries to `trial.yaml` and
                    # to the python file header
                    logger.save_progress()

                    # Move to the next console line at the end of each epoch
                    logger.new_line()

            # Handle the delayed interrupt
            except KeyboardInterrupt:
                logger.finish_loop()
                logger.new_line()
                logger.log("\nKilling loop...")
                break