Python examples of `logger.section` from the `lab` library.

Example #1
    def start_train(self, session: tf.Session, is_init: bool = True):
        """
        ## Start experiment

        Load a checkpoint or reset based on `global_step`.
        """

        global_step = 0

        if not is_init:
            # Load the checkpoint if we are resuming from the middle of a run
            with logger.section("Loading checkpoint"):
                is_successful = self.__checkpoint_saver.load(session)
                logger.set_successful(is_successful)
                if is_successful:
                    global_step = self.__checkpoint_saver.max_step

        self._start(global_step)

        if global_step == 0:
            # Clear summaries and checkpoints, and initialize variables, if we are starting from scratch
            with logger.section("Clearing summaries"):
                self.clear_summaries()
            with logger.section("Clearing checkpoints"):
                self.clear_checkpoints()
            with logger.section("Initializing variables"):
                tf_util.init_variables(session)

        self.create_writer(session)
Example #2
def loop_section():
    for step in logger.loop(range(0, 10)):
        with logger.section("Step"):
            time.sleep(0.5)
        with logger.section("Step2"):
            time.sleep(0.1)
        logger.write()
    logger.new_line()
Example #3
def data_loaders(c: Configs):
    with logger.section("Training data"):
        train = _data_loader(True, c.batch_size, c.data_loader_args)

    with logger.section("Testing data"):
        test = _data_loader(False, c.test_batch_size, c.data_loader_args)

    return train, test
Example #4
def model_optimizer(c: Configs):
    with logger.section("Create model"):
        m: Net = Net()
        m.to(c.device)

    with logger.section("Create optimizer"):
        o = optim.SGD(m.parameters(), lr=c.learning_rate, momentum=c.momentum)

    return m, o
Example #5
    def start(self,
              *,
              run: Optional[int] = None,
              checkpoint: Optional[int] = None):
        if run is not None:
            with logger.section("Loading checkpoint"):
                global_step = self._load_checkpoint(run, checkpoint)
                if global_step is None:
                    logger.set_successful(False)
                    global_step = 0
        else:
            global_step = 0

        self.run.start_step = global_step
        logger.internal().set_start_global_step(global_step)

        self.__print_info_and_check_repo()
        if self.configs_processor is not None:
            self.configs_processor.print()

        self.run.save_info()

        if self.configs_processor is not None:
            self.configs_processor.save(self.run.configs_path)

        logger.internal().save_indicators(self.run.indicators_path)

        with open(str(self.run.diff_path), "w") as f:
            f.write(self.run.diff)
Example #6
    def start_replay(self, session: tf.Session):
        """
        ## Start replaying experiment

        Load a checkpoint to replay the experiment.
        """

        with logger.section("Loading checkpoint") as m:
            m.is_successful = self.__checkpoint_saver.load(session)
Example #7
    def __start_from_checkpoint(self, run_uuid: str,
                                checkpoint: Optional[int]):
        checkpoint_path, global_step = experiment_run.get_last_run_checkpoint(
            self.experiment_path, run_uuid, checkpoint)

        if global_step is None:
            return 0
        else:
            with logger.section("Loading checkpoint"):
                self._load_checkpoint(checkpoint_path)
            self.run.load_run = run_uuid

        return global_step
Example #8
    def __start_from_checkpoint(self, run_index: int, checkpoint: int):
        checkpoint_path, global_step = experiment_run.get_last_run_checkpoint(
            self.experiment_path,
            run_index,
            checkpoint,
            {self.run.index})

        if global_step is None:
            return 0
        else:
            with logger.section("Loading checkpoint"):
                self._load_checkpoint(checkpoint_path)

        return global_step
Example #9
def test(session: tf.Session, loss_value, accuracy_value, batches):
    with logger.section("Test", total_steps=batches):
        test_loss = 0
        correct = 0
        batch_idx = -1
        while True:
            batch_idx += 1
            try:
                l, a = session.run([loss_value, accuracy_value])
                test_loss += l
                correct += a
            except tf.errors.OutOfRangeError:
                break
            logger.progress(batch_idx + 1)

        logger.store(test_loss=test_loss / batches)
        logger.store(accuracy=correct / batches)
Example #10
def test(model, device, test_loader):
    with logger.section("Test", total_steps=len(test_loader)):
        model.eval()
        test_loss = 0
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(test_loader):
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction='sum').item()
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()
                logger.progress(batch_idx + 1)

        # Add test loss and accuracy to logger
        logger.store(test_loss=test_loss / len(test_loader.dataset))
        logger.store(accuracy=correct / len(test_loader.dataset))
Example #11
def train(args, session: tf.Session, loss_value, train_op, batches, epoch):
    with logger.section("Train", total_steps=batches):
        batch_idx = -1
        while True:
            batch_idx += 1
            try:
                l, _ = session.run([loss_value, train_op])
            except tf.errors.OutOfRangeError:
                break

            # Add training loss to the logger.
            # The logger will queue the values and output the mean
            logger.store(train_loss=l)
            logger.progress(batch_idx + 1)
            logger.set_global_step(epoch * batches + batch_idx)

            # Print output to the console
            if batch_idx % args.log_interval == 0:
                # Output the indicators
                logger.write()
Example #12
def train(args, model, device, train_loader, optimizer, epoch):
    with logger.section("Train", total_steps=len(train_loader)):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

            # Add training loss to the logger.
            # The logger will queue the values and output the mean
            logger.store(train_loss=loss.item())
            logger.progress(batch_idx + 1)
            logger.set_global_step(epoch * len(train_loader) + batch_idx)

            # Print output to the console
            if batch_idx % args.log_interval == 0:
                # Output the indicators
                logger.write()
Example #13
def main():
    args = parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Loading data
    with logger.section("Loading data"):
        train_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data',
                           train=True,
                           download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.batch_size,
            shuffle=True,
            **kwargs)
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data',
                           train=False,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307, ), (0.3081, ))
                           ])),
            batch_size=args.test_batch_size,
            shuffle=True,
            **kwargs)

    # Model creation
    with logger.section("Create model"):
        model = Net().to(device)
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum)

    # Specify indicators
    logger.add_indicator("train_loss", queue_limit=10, is_print=True)
    logger.add_indicator("test_loss", is_histogram=False, is_print=True)
    logger.add_indicator("accuracy", is_histogram=False, is_print=True)
    for name, param in model.named_parameters():
        if param.requires_grad:
            logger.add_indicator(name, is_histogram=True, is_print=False)
            logger.add_indicator(f"{name}_grad",
                                 is_histogram=True,
                                 is_print=False)

    # Start the experiment
    EXPERIMENT.start_train()

    # Loop through the monitored iterator
    for epoch in logger.loop(range(0, args.epochs)):
        # Delayed keyboard interrupt handling to use
        # keyboard interrupts to end the loop.
        # This will capture interrupts and finish
        # the loop at the end of processing the iteration;
        # i.e. the loop won't stop in the middle of an epoch.
        try:
            with logger.delayed_keyboard_interrupt():

                # Training and testing
                train(args, model, device, train_loader, optimizer, epoch)
                test(model, device, test_loader)

                # Add histograms with model parameter values and gradients
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        logger.store(name, param.data.cpu().numpy())
                        logger.store(f"{name}_grad", param.grad.cpu().numpy())

                # Clear line and output to console
                logger.write()

                # Output the progress summaries to `trial.yaml` and
                # to the python file header
                logger.save_progress()

                # Clear line and go to the next line;
                # that is, we add a new line to the output
                # at the end of each epoch
                logger.new_line()

        # Handle the delayed interrupt
        except KeyboardInterrupt:
            logger.finish_loop()
            logger.new_line()
            logger.log("\nKilling loop...")
            break
Example #14
def progress():
    with logger.section("Progress", total_steps=100):
        for i in range(100):
            time.sleep(0.1)
            # Report the step number so the section's progress bar advances
            logger.progress(i + 1)
Example #15
def adam_optimizer(c: Configs):
    with logger.section("Create optimizer"):
        return optim.Adam(c.model.parameters(), lr=c.learning_rate)
Example #16
def model(c: Configs):
    with logger.section("Create model"):
        m: Net = Net()
        m.to(c.device)
        return m
Example #17
def main():
    args = parse_args()

    # Loading data
    with logger.section("Load data"):
        mnist = tf.keras.datasets.mnist

        (x_train, y_train), (x_test, y_test) = mnist.load_data()
        x_train, x_test = x_train / 255.0, x_test / 255.0

        train_dataset = create_mnist_dataset(x_train, y_train, args.batch_size)
        test_dataset = create_mnist_dataset(x_test, y_test, args.batch_size)

    # Model creation
    with logger.section("Create model"):
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28)),
            tf.keras.layers.Dense(512, activation=tf.nn.relu),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation=tf.nn.softmax)
        ])

    # Creation of the trainer
    with logger.section("Create trainer"):
        optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
        train_iterator = train_dataset.make_initializable_iterator()
        data, target = train_iterator.get_next()
        train_loss = loss(model, data, target)
        train_op = optimizer.minimize(train_loss)

        test_iterator = test_dataset.make_initializable_iterator()
        data, target = test_iterator.get_next()
        test_loss = loss(model, data, target)
        test_accuracy = accuracy(model, data, target)

    logger.add_indicator("train_loss", queue_limit=10, is_print=True)
    logger.add_indicator("test_loss", is_histogram=False, is_print=True)
    logger.add_indicator("accuracy", is_histogram=False, is_print=True)

    # Number of training batches per epoch
    batches = len(x_train) // args.batch_size

    with tf.Session() as session:
        EXPERIMENT.start_train(session)

        # Loop through the monitored iterator
        for epoch in logger.loop(range(0, args.epochs)):
            # Delayed keyboard interrupt handling to use
            # keyboard interrupts to end the loop.
            # This will capture interrupts and finish
            # the loop at the end of processing the iteration;
            # i.e. the loop won't stop in the middle of an epoch.
            try:
                with logger.delayed_keyboard_interrupt():

                    # Training and testing
                    session.run(train_iterator.initializer)
                    train(args, session, train_loss, train_op, batches, epoch)
                    session.run(test_iterator.initializer)
                    test(session, test_loss, test_accuracy,
                         len(x_test) // args.batch_size)

                    # Clear line and output to console
                    logger.write()

                    # Output the progress summaries to `trial.yaml` and
                    # to the python file header
                    logger.save_progress()

                    # Clear line and go to the next line;
                    # that is, we add a new line to the output
                    # at the end of each epoch
                    logger.new_line()

            # Handle the delayed interrupt
            except KeyboardInterrupt:
                logger.finish_loop()
                logger.new_line()
                logger.log("\nKilling loop...")
                break
Example #18
def simple_section():
    with logger.section("Simple section"):
        # code to load data
        time.sleep(2)
Example #19
def loop_partial_section():
    for step in logger.loop(range(0, 10)):
        with logger.section("Step", is_partial=True):
            time.sleep(0.5)
            logger.progress((step % 5 + 1) / 5)
        logger.write()
Example #20
import time

import tensorflow as tf

from lab import logger
from lab.experiment.tensorflow import Experiment

# Create the sample experiment
EXPERIMENT = Experiment(name="sample",
                        python_file=__file__,
                        comment="Sample lab experiment",
                        check_repo_dirty=False)

# Sections are used to keep track of
# what's going on in the console output.
# It is also useful to organize the code into sections,
# when separating them into functions is difficult
with logger.section("Create model"):
    # Indicate that this section failed. You don't have to set
    # this if it is successful.
    logger.set_successful(False)

    # Sleep for a second.
    time.sleep(1)

# Print sample info
logger.info(one=1, two=2, string="string")

# ### Set logger indicators

# Reward is queued; this is useful when you want to track the moving
# average of something.
logger.add_indicator("reward", queue_limit=10)
Example #21
def train_loader(c: Configs):
    with logger.section("Training data"):
        return _data_loader(True, c.batch_size, c.data_loader_args)
Example #22
def sgd_optimizer(c: Configs):
    with logger.section("Create optimizer"):
        return optim.SGD(c.model.parameters(),
                         lr=c.learning_rate,
                         momentum=c.momentum)
Example #23
def test_loader(c: Configs):
    with logger.section("Testing data"):
        return _data_loader(False, c.test_batch_size, c.data_loader_args)
Example #24
def set_seed(c: Configs):
    with logger.section("Setting seed"):
        torch.manual_seed(c.seed)
Example #25
def unsuccessful_section():
    with logger.section("Unsuccessful section"):
        time.sleep(1)
        logger.set_successful(False)