Example 1
    def build_cache_step(self, overwrite=False):
        """Build a step for generating cache DB file.

            If self.db_path exists and not overwritting, build an empty step.
            Overwise, build a step as follows.
            Pipe original reader to the _DatasetWriter,
            so that dataset field blobs are populated.
            Then save these blobs into a file.

            Args:
                overwrite: bool. If true, ignore the existing file
                    and build a new one overwritting the existing one anyway.

            Returns:
                build_cache_step: ExecutionStep.
                    The step to be run for building a cache DB file.
        """
        if os.path.exists(self.db_path) and not overwrite:
            # cache already exists, no need to rebuild it
            return core.execution_step('build_step', [])

        init_net = core.Net('init')
        self._init_field_blobs_as_empty(init_net)
        with Cluster(), core.NameScope(self.name), TaskGroup() as copy_tg:
            pipe(self.original_reader, self.ds.writer(), num_threads=16)
            copy_step = copy_tg.to_task().get_step()
        save_net = core.Net('save')
        self._save_field_blobs_to_db_file(save_net)

        return core.execution_step('build_cache',
                                   [init_net, copy_step, save_net])
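
The returned ExecutionStep can be run like any other Caffe2 step. A minimal sketch, following the Plan pattern from Example 7 below; `cached_reader` is a hypothetical instance of the class this method belongs to:

    from caffe2.python import core, workspace

    # Hedged sketch: wrap the cache-building step in a Plan and run it.
    step = cached_reader.build_cache_step(overwrite=True)
    plan = core.Plan('build_cache_plan')
    plan.AddStep(step)
    workspace.RunPlan(plan)  # populates the field blobs and writes the cache DB file
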
Example 2
    def run_with(self, builder):
        with Cluster():
            with Job() as job:
                outputs = build_pipeline(node_id=0)
            output_fetcher = Task(step=core.Net('empty'), outputs=outputs)

            def fetch_total(session):
                session.run(output_fetcher)
                return output_fetcher.outputs()[0].fetch()

            session, checkpoint = builder()
            compiled_job = job.compile(LocalSession)
            num_epochs = JobRunner(compiled_job, checkpoint)(session)
            self.assertEqual(num_epochs, len(EXPECTED_TOTALS))
            self.assertEqual(fetch_total(session), EXPECTED_TOTALS[-1])

            for initial_epoch in range(1, num_epochs + 1):
                session, checkpoint = builder()
                JobRunner(
                    compiled_job,
                    checkpoint, resume_from_epoch=initial_epoch)(session)
                self.assertEqual(fetch_total(session), EXPECTED_TOTALS[-1])

            for epoch in range(1, num_epochs + 1):
                session.run(checkpoint.load(epoch))
                self.assertEqual(fetch_total(session), EXPECTED_TOTALS[epoch - 1])
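
Here `run_with` expects `builder` to be a zero-argument callable returning a `(session, checkpoint)` pair. A minimal sketch of such a builder, assuming the LocalSession and MultiNodeCheckpointManager setup used in the later examples; the checkpoint directory is a hypothetical placeholder:

    from caffe2.python import workspace
    from caffe2.python.session import LocalSession
    from caffe2.python.checkpoint import MultiNodeCheckpointManager

    def local_builder(tmpdir='/tmp/ckpt_dir'):  # hypothetical checkpoint directory
        # Fresh workspace-backed session plus a checkpoint manager,
        # mirroring the setup used in the examples below.
        ws = workspace.C.Workspace()
        session = LocalSession(ws)
        checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
        return session, checkpoint

    # e.g.: self.run_with(local_builder)
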
Example 3
    def build_cache(self, cache_path, overwrite=False):
        if not self.has_cache() or overwrite:
            self.cache_path = cache_path
        if self.has_cache() and not overwrite:
            # cache already exists, no need to rebuild it
            return core.execution_step('build_step', [])

        init_net = core.Net('init')
        self._init_dataset(init_net)
        with Cluster(), core.NameScope(self.name), TaskGroup() as copy_tg:
            pipe(self.original_reader, self.ds.writer(), num_threads=16)
            copy_step = copy_tg.to_task().get_step()
        save_net = core.Net('save')
        self._save_to_file(save_net)

        return core.execution_step('build_cache', [init_net, copy_step, save_net])
Example 4
    def test_upload_checkpoint(self):
        try:
            tmpdir = tempfile.mkdtemp()
            upload_dir = os.path.join(tmpdir, "upload")
            os.mkdir(upload_dir)
            num_nodes = 3

            # The uploaded files do not exist yet.
            for node_id in range(num_nodes):
                node_name = 'trainer_%d' % node_id
                upload_path = os.path.join(upload_dir, node_name)
                self.assertFalse(os.path.exists(upload_path))

            # Create and run the job runner.
            for node_id in range(num_nodes):
                ws = workspace.C.Workspace()
                session = LocalSession(ws)
                checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
                with Cluster():
                    with Job() as job:
                        build_pipeline(node_id)
                    compiled_job = job.compile(LocalSession)
                    local_upload_builder = UploadToLocalFile(upload_dir)
                    job_runner = JobRunner(
                        compiled_job,
                        checkpoint,
                        upload_task_group_builder=local_upload_builder)
                    num_epochs = job_runner(session)
                    self.assertEqual(num_epochs, len(EXPECTED_TOTALS))

            # The uploaded files should exist now.
            for node_id in range(num_nodes):
                node_name = 'trainer_%d' % node_id
                upload_path = os.path.join(upload_dir, node_name)
                self.assertTrue(os.path.exists(upload_path))

        finally:
            shutil.rmtree(tmpdir)
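
The upload wiring used above can be isolated from the test scaffolding. A hedged sketch that only recombines the calls shown in this example; `my_session` and the directory paths are hypothetical placeholders:

    from caffe2.python.session import LocalSession
    from caffe2.python.task import Cluster
    from caffe2.python.checkpoint import (
        Job, JobRunner, MultiNodeCheckpointManager, UploadToLocalFile)

    with Cluster():
        with Job() as job:
            build_pipeline(node_id=0)
        checkpoint = MultiNodeCheckpointManager('/tmp/ckpts', 'minidb')   # hypothetical
        uploader = UploadToLocalFile('/tmp/uploaded_ckpts')               # hypothetical
        runner = JobRunner(
            job.compile(LocalSession),
            checkpoint,
            upload_task_group_builder=uploader)
        num_epochs = runner(my_session)  # checkpoints also land under the upload dir
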
Example 5
    def test_ckpt_save_failure(self):
        num_nodes = 3
        # The goal of this test is to ensure that the job runs
        # successfully even if saving a checkpoint fails.
        # Hence tmpdir is a nonexistent directory, to emulate a failure
        # while saving checkpoints.
        tmpdir = "/tmp/path_does_not_exist/"

        # Check that a checkpoint-saving failure does not cause the job to fail.
        workspace.ResetWorkspace()
        for node_id in range(num_nodes):
            ws = workspace.C.Workspace()
            session = LocalSession(ws)
            checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
            with Cluster():
                with Job() as job:
                    build_pipeline(node_id)
                compiled_job = job.compile(LocalSession)
                job_runner = JobRunner(compiled_job, checkpoint)
                num_epochs = job_runner(session)
            # Make sure all epochs are executed even though saving the
            # checkpoint failed; a checkpoint-saving failure should not
            # cause the job to fail.
            self.assertEqual(num_epochs, len(EXPECTED_TOTALS))
Example 6
    def test_ckpt_name_and_load_model_from_ckpts(self):
        try:
            num_nodes = 3
            tmpdir = tempfile.mkdtemp()
            # First, check if the checkpoint name generation mechanism is
            # correct.
            checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
            with Cluster():
                with Job() as job:
                    for node_id in range(num_nodes):
                        build_pipeline(node_id)
                compiled_job = job.compile(LocalSession)
                checkpoint.init(compiled_job.nodes_to_checkpoint())

                for node_id in range(num_nodes):
                    epoch = 5
                    node_name = 'trainer_%d' % node_id
                    expected_db_name = tmpdir + '/' + node_name + '.5'
                    self.assertEqual(
                        checkpoint.get_ckpt_db_name(node_name, epoch),
                        expected_db_name)
            shutil.rmtree(tmpdir)

            # Next, check mechanism to load model from checkpoints.
            tmpdir = tempfile.mkdtemp()
            workspace.ResetWorkspace()
            for node_id in range(num_nodes):
                ws = workspace.C.Workspace()
                session = LocalSession(ws)
                checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
                with Cluster():
                    with Job() as job:
                        build_pipeline(node_id)
                    compiled_job = job.compile(LocalSession)
                    job_runner = JobRunner(compiled_job, checkpoint)
                    num_epochs = job_runner(session)
                self.assertEqual(num_epochs, len(EXPECTED_TOTALS))

                # There are 12 global blobs after the job runner finishes
                # (only blobs on init_group are checkpointed).
                self.assertEqual(len(ws.blobs), 12)

            ws = workspace.C.Workspace()
            session = LocalSession(ws)
            self.assertEqual(len(ws.blobs), 0)
            model_blob_names = ['trainer_1/task_2/GivenTensorInt64Fill:0',
                                'trainer_2/task_2/GivenTensorInt64Fill:0']
            checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb')
            with Cluster():
                with Job() as job:
                    for node_id in range(num_nodes):
                        build_pipeline(node_id)
                compiled_job = job.compile(LocalSession)
                job_runner = JobRunner(compiled_job, checkpoint)
                job_runner.load_blobs_from_checkpoints(
                    blob_names=model_blob_names, epoch=1, session=session)

                # Check that we can successfully load from checkpoints of epochs
                # 1 to 4, but not epoch 5.
                for epoch in range(1, 5):
                    self.assertTrue(
                        job_runner.load_blobs_from_checkpoints(
                            blob_names=model_blob_names, epoch=epoch,
                            session=session))
                    # Check that all the model blobs are loaded.
                    for blob_name in model_blob_names:
                        self.assertTrue(ws.has_blob(blob_name))
                        self.assertEqual(
                            ws.fetch_blob(blob_name),
                            np.array([EXPECTED_TOTALS[epoch - 1]]))
                self.assertFalse(
                    job_runner.load_blobs_from_checkpoints(
                        blob_names=model_blob_names, epoch=5, session=session))

        finally:
            shutil.rmtree(tmpdir)
Example 7
    def _compile_task_group(cls, task_group, setup_net_list=None):
        with Cluster():
            task = task_group.to_task()
        plan = core.Plan('task_group_plan')
        plan.AddStep(task.get_step())
        return (plan, task.output_list(), task.workspace_type)
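
A rough usage sketch for the helper above, assuming it is defined on a session class (the excerpt only shows `cls`); the `SessionClass` name and the no-op task are illustrative assumptions:

    from caffe2.python import core, workspace
    from caffe2.python.task import Cluster, Task, TaskGroup

    # Build a trivial task group, compile it into a Plan, then run the Plan.
    with Cluster(), TaskGroup() as tg:
        Task(step=core.Net('noop'))  # placeholder task for illustration
    plan, output_list, ws_type = SessionClass._compile_task_group(tg)
    workspace.RunPlan(plan)
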