def start_train(self, session: tf.Session, is_init: bool = True):
    """
    ## Start experiment

    Load a checkpoint or reset based on `global_step`.
    """

    global_step = 0

    if not is_init:
        # load checkpoint if we are starting from the middle
        with logger.section("Loading checkpoint"):
            is_successful = self.__checkpoint_saver.load(session)
            logger.set_successful(is_successful)
            if is_successful:
                global_step = self.__checkpoint_saver.max_step

    self._start(global_step)

    if global_step == 0:
        # initialize variables and clear summaries if we are starting from scratch
        with logger.section("Clearing summaries"):
            self.clear_summaries()
        with logger.section("Clearing checkpoints"):
            self.clear_checkpoints()
        with logger.section("Initializing variables"):
            tf_util.init_variables(session)

    self.create_writer(session)
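# A minimal usage sketch of `start_train`, assuming an `EXPERIMENT` created
# as in the sample script further down and a TF 1.x session. It mirrors the
# call in the TensorFlow MNIST sample; the `is_init=False` variant is an
# assumption based on the signature above, not taken from the samples.
def start_train_usage_sketch():
    with tf.Session() as session:
        # Fresh start: clears summaries/checkpoints and initializes variables
        EXPERIMENT.start_train(session)
        # Resuming instead would load the latest checkpoint and its step:
        # EXPERIMENT.start_train(session, is_init=False)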
def loop_section():
    for step in logger.loop(range(0, 10)):
        with logger.section("Step"):
            time.sleep(0.5)
        with logger.section("Step2"):
            time.sleep(0.1)
        logger.write()
        logger.new_line()
def data_loaders(c: Configs):
    with logger.section("Training data"):
        train = _data_loader(True, c.batch_size, c.data_loader_args)

    with logger.section("Testing data"):
        test = _data_loader(False, c.test_batch_size, c.data_loader_args)

    return train, test
def model_optimizer(c: Configs):
    with logger.section("Create model"):
        m: Net = Net()
        m.to(c.device)

    with logger.section("Create optimizer"):
        o = optim.SGD(m.parameters(), lr=c.learning_rate, momentum=c.momentum)

    return m, o
def start(self, *,
          run: Optional[int] = None,
          checkpoint: Optional[int] = None):
    if run is not None:
        with logger.section("Loading checkpoint"):
            global_step = self._load_checkpoint(run, checkpoint)
            if global_step is None:
                logger.set_successful(False)
                global_step = 0
    else:
        global_step = 0

    self.run.start_step = global_step
    logger.internal().set_start_global_step(global_step)

    self.__print_info_and_check_repo()
    if self.configs_processor is not None:
        self.configs_processor.print()

    self.run.save_info()

    if self.configs_processor is not None:
        self.configs_processor.save(self.run.configs_path)

    logger.internal().save_indicators(self.run.indicators_path)

    with open(str(self.run.diff_path), "w") as f:
        f.write(self.run.diff)
def start_replay(self, session: tf.Session):
    """
    ## Start replaying experiment

    Load the latest checkpoint so the experiment can be replayed.
    """

    with logger.section("Loading checkpoint") as m:
        m.is_successful = self.__checkpoint_saver.load(session)
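# A hedged sketch of calling `start_replay`, assuming the same `EXPERIMENT`
# and session setup as `start_train` above. Unlike `start_train`, this only
# restores the checkpoint; it does not clear summaries or re-initialize
# variables, so it suits inference/replay rather than resumed training.
def start_replay_usage_sketch():
    with tf.Session() as session:
        EXPERIMENT.start_replay(session)
        # ... run inference with the restored weights ...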
def __start_from_checkpoint(self, run_uuid: str, checkpoint: Optional[int]):
    checkpoint_path, global_step = experiment_run.get_last_run_checkpoint(
        self.experiment_path,
        run_uuid,
        checkpoint)

    if global_step is None:
        return 0
    else:
        with logger.section("Loading checkpoint"):
            self._load_checkpoint(checkpoint_path)
        self.run.load_run = run_uuid

    return global_step
def __start_from_checkpoint(self, run_index: int, checkpoint: int):
    checkpoint_path, global_step = experiment_run.get_last_run_checkpoint(
        self.experiment_path,
        run_index,
        checkpoint,
        {self.run.index})

    if global_step is None:
        return 0
    else:
        with logger.section("Loading checkpoint"):
            self._load_checkpoint(checkpoint_path)

    return global_step
def test(session: tf.Session, loss_value, accuracy_value, batches):
    with logger.section("Test", total_steps=batches):
        test_loss = 0
        correct = 0
        batch_idx = -1
        while True:
            batch_idx += 1
            try:
                l, a = session.run([loss_value, accuracy_value])
                test_loss += l
                correct += a
            except tf.errors.OutOfRangeError:
                break
            logger.progress(batch_idx + 1)

        logger.store(test_loss=test_loss / batches)
        logger.store(accuracy=correct / batches)
def test(model, device, test_loader):
    with logger.section("Test", total_steps=len(test_loader)):
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(test_loader):
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()
                logger.progress(batch_idx + 1)

        # Add test loss and accuracy to logger
        logger.store(test_loss=test_loss / len(test_loader.dataset))
        logger.store(accuracy=correct / len(test_loader.dataset))
def train(args, session: tf.Session, loss_value, train_op, batches, epoch):
    with logger.section("Train", total_steps=batches):
        batch_idx = -1
        while True:
            batch_idx += 1
            try:
                l, _ = session.run([loss_value, train_op])
            except tf.errors.OutOfRangeError:
                break

            # Add training loss to the logger.
            # The logger will queue the values and output the mean
            logger.store(train_loss=l)
            logger.progress(batch_idx + 1)
            logger.set_global_step(epoch * batches + batch_idx)

            # Print output to the console
            if batch_idx % args.log_interval == 0:
                # Output the indicators
                logger.write()
def train(args, model, device, train_loader, optimizer, epoch):
    with logger.section("Train", total_steps=len(train_loader)):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

            # Add training loss to the logger.
            # The logger will queue the values and output the mean
            logger.store(train_loss=loss.item())
            logger.progress(batch_idx + 1)
            logger.set_global_step(epoch * len(train_loader) + batch_idx)

            # Print output to the console
            if batch_idx % args.log_interval == 0:
                # Output the indicators
                logger.write()
def main():
    args = parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Loading data
    with logger.section("Loading data"):
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data', train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=args.batch_size, shuffle=True, **kwargs)

        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data', train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ])),
            batch_size=args.test_batch_size, shuffle=True, **kwargs)

    # Model creation
    with logger.section("Create model"):
        model = Net().to(device)
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              momentum=args.momentum)

    # Specify indicators
    logger.add_indicator("train_loss", queue_limit=10, is_print=True)
    logger.add_indicator("test_loss", is_histogram=False, is_print=True)
    logger.add_indicator("accuracy", is_histogram=False, is_print=True)
    for name, param in model.named_parameters():
        if param.requires_grad:
            logger.add_indicator(name, is_histogram=True, is_print=False)
            logger.add_indicator(f"{name}_grad", is_histogram=True, is_print=False)

    # Start the experiment
    EXPERIMENT.start_train()

    # Loop through the monitored iterator
    for epoch in logger.loop(range(0, args.epochs)):
        # Delayed keyboard interrupt handling to use
        # keyboard interrupts to end the loop.
        # This will capture interrupts and finish
        # the loop at the end of processing the iteration;
        # i.e. the loop won't stop in the middle of an epoch.
        try:
            with logger.delayed_keyboard_interrupt():
                # Training and testing
                train(args, model, device, train_loader, optimizer, epoch)
                test(model, device, test_loader)

                # Add histograms with model parameter values and gradients
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        logger.store(name, param.data.cpu().numpy())
                        logger.store(f"{name}_grad", param.grad.cpu().numpy())

                # Clear line and output to console
                logger.write()

                # Output the progress summaries to `trial.yaml` and
                # to the python file header
                logger.save_progress()

                # Clear line and go to the next line;
                # that is, we add a new line to the output
                # at the end of each epoch
                logger.new_line()

        # Handled delayed interrupt
        except KeyboardInterrupt:
            logger.finish_loop()
            logger.new_line()
            logger.log("\nKilling loop...")
            break
def progress():
    with logger.section("Progress", total_steps=100):
        for i in range(100):
            time.sleep(0.1)
            # Multiple training steps in the inner loop
            logger.progress(i)
def adam_optimizer(c: Configs):
    with logger.section("Create optimizer"):
        return optim.Adam(c.model.parameters(), lr=c.learning_rate)
def model(c: Configs):
    with logger.section("Create model"):
        m: Net = Net()
        m.to(c.device)
        return m
def main():
    args = parse_args()

    # Loading data
    with logger.section("Load data"):
        mnist = tf.keras.datasets.mnist

        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        x_train, x_test = x_train / 255.0, x_test / 255.0

        train_dataset = create_mnist_dataset(x_train, y_train, args.batch_size)
        test_dataset = create_mnist_dataset(x_test, y_test, args.batch_size)

    # Model creation
    with logger.section("Create model"):
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(512, activation=tf.nn.relu),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation=tf.nn.softmax)
        ])

    # Creation of the trainer
    with logger.section("Create trainer"):
        optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)

        train_iterator = train_dataset.make_initializable_iterator()
        data, target = train_iterator.get_next()
        train_loss = loss(model, data, target)
        train_op = optimizer.minimize(train_loss)

        test_iterator = test_dataset.make_initializable_iterator()
        data, target = test_iterator.get_next()
        test_loss = loss(model, data, target)
        test_accuracy = accuracy(model, data, target)

    logger.add_indicator("train_loss", queue_limit=10, is_print=True)
    logger.add_indicator("test_loss", is_histogram=False, is_print=True)
    logger.add_indicator("accuracy", is_histogram=False, is_print=True)

    batches = len(x_train) // args.batch_size

    with tf.Session() as session:
        EXPERIMENT.start_train(session)

        # Loop through the monitored iterator
        for epoch in logger.loop(range(0, args.epochs)):
            # Delayed keyboard interrupt handling to use
            # keyboard interrupts to end the loop.
            # This will capture interrupts and finish
            # the loop at the end of processing the iteration;
            # i.e. the loop won't stop in the middle of an epoch.
            try:
                with logger.delayed_keyboard_interrupt():
                    # Training and testing
                    session.run(train_iterator.initializer)
                    train(args, session, train_loss, train_op, batches, epoch)
                    session.run(test_iterator.initializer)
                    test(session, test_loss, test_accuracy,
                         len(x_test) // args.batch_size)

                    # Clear line and output to console
                    logger.write()

                    # Output the progress summaries to `trial.yaml` and
                    # to the python file header
                    logger.save_progress()

                    # Clear line and go to the next line;
                    # that is, we add a new line to the output
                    # at the end of each epoch
                    logger.new_line()

            # Handled delayed interrupt
            except KeyboardInterrupt:
                logger.finish_loop()
                logger.new_line()
                logger.log("\nKilling loop...")
                break
def simple_section():
    with logger.section("Simple section"):
        # code to load data
        time.sleep(2)
def loop_partial_section():
    for step in logger.loop(range(0, 10)):
        with logger.section("Step", is_partial=True):
            time.sleep(0.5)
            logger.progress((step % 5 + 1) / 5)
        logger.write()
import time

import tensorflow as tf

from lab import logger
from lab.experiment.tensorflow import Experiment

# Create the sample experiment
EXPERIMENT = Experiment(name="sample",
                        python_file=__file__,
                        comment="Sample lab experiment",
                        check_repo_dirty=False)

# Sections are used to keep track of
# what's going on from the console output.
# They are also useful for organizing the code into sections,
# when separating them into functions is difficult.
with logger.section("Create model"):
    # Indicate that this section failed. You don't have to set
    # this if it is successful.
    logger.set_successful(False)

    # Sleep for a second.
    time.sleep(1)

# Print sample info
logger.info(one=1,
            two=2,
            string="string")

# ### Set logger indicators

# Reward is queued; this is useful when you want to track the moving
# average of something.
logger.add_indicator("reward", queue_limit=10)
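# A hedged sketch of feeding the queued `reward` indicator declared above.
# `logger.store` with keyword arguments, `logger.write`, and `logger.loop`
# are the same calls used in the MNIST samples; the reward values here are
# made up for illustration.
for step in logger.loop(range(0, 10)):
    # Queue a value; with `queue_limit=10` the logger reports
    # the moving average over the last 10 stored values
    logger.store(reward=step * 0.1)
    logger.write()
    logger.new_line()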
def train_loader(c: Configs):
    with logger.section("Training data"):
        return _data_loader(True, c.batch_size, c.data_loader_args)
def sgd_optimizer(c: Configs):
    with logger.section("Create optimizer"):
        return optim.SGD(c.model.parameters(),
                         lr=c.learning_rate,
                         momentum=c.momentum)
def test_loader(c: Configs):
    with logger.section("Testing data"):
        return _data_loader(False, c.test_batch_size, c.data_loader_args)
def set_seed(c: Configs):
    with logger.section("Setting seed"):
        torch.manual_seed(c.seed)
def unsuccessful_section():
    with logger.section("Unsuccessful section"):
        time.sleep(1)
        logger.set_successful(False)