Example no. 1
    def _build_graph(self):
        try:
            with tf.variable_scope(str(self.metadata.id)):
                with tf.variable_scope("misc_ops"):
                    self.log.debug("build_misc_ops")
                    self._build_misc_ops()

                with tf.variable_scope('main_graph'):
                    self.log.debug('build_main_graph')
                    self._build_main_graph()

                with tf.variable_scope('loss_ops'):
                    self.log.debug('build_loss_ops')
                    self._loss_ops = self._build_loss_ops()
                    assert self._loss_ops is not None

                with tf.variable_scope('predict_ops'):
                    self.log.debug('build_predict_ops')
                    self._predict_ops = self._build_predict_ops()
                    assert self._predict_ops is not None

                with tf.variable_scope('metric_ops'):
                    self.log.debug('build_metric_ops')
                    self._metric_ops = self._build_metric_ops()
                    assert self._metric_ops is not None

                with tf.variable_scope('train_ops'):
                    self.log.debug('build train_ops')

                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    with tf.control_dependencies(update_ops):
                        self._train_ops = self._build_train_ops()
                    assert self._train_ops is not None

                with tf.variable_scope('phase_controller'):
                    self.log.debug('build phase_controller')
                    scope = str(self.metadata.id)

                    dropout_set_train_ops = self.singleton_dropout.collect_set_train_ops(
                        scope)
                    bn_set_train_ops = self.singleton_bn.collect_set_train_ops(
                        scope)
                    set_train_ops = dropout_set_train_ops + bn_set_train_ops
                    self.set_train_op = tf.group(*set_train_ops)

                    dropout_set_non_train_ops = self.singleton_dropout.collect_set_non_train_ops(
                        scope)
                    bn_set_non_train_ops = self.singleton_bn.collect_set_non_train_ops(
                        scope)
                    set_non_train_ops = dropout_set_non_train_ops + bn_set_non_train_ops
                    self.set_non_train_op = tf.group(*set_non_train_ops)

            self._is_graph_built = True
            self.log.info("build success")

        except BaseException as e:
            self.log.error(error_trace(e))
            print(error_trace(e))
            raise ModelBuildFailError('model build failed') from e
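A minimal sketch of the subclass hooks this template method expects; the class name ExampleRegression, the parent name BaseModel, and the layer shapes below are assumptions for illustration, not taken from the source:

import tensorflow as tf

class ExampleRegression(BaseModel):  # BaseModel: the (assumed) class that defines _build_graph
    def _build_misc_ops(self):
        self.global_step = tf.train.get_or_create_global_step()

    def _build_main_graph(self):
        self.x = tf.placeholder(tf.float32, [None, 10], name='x')
        self.y = tf.placeholder(tf.float32, [None, 1], name='y')
        self.y_hat = tf.layers.dense(self.x, 1, name='y_hat')

    def _build_loss_ops(self):
        return tf.losses.mean_squared_error(self.y, self.y_hat)

    def _build_predict_ops(self):
        return self.y_hat

    def _build_metric_ops(self):
        return tf.reduce_mean(tf.abs(self.y - self.y_hat), name='mae')

    def _build_train_ops(self):
        # _loss_ops is already set, since _build_graph builds loss_ops before train_ops
        return tf.train.AdamOptimizer(1e-3).minimize(
            self._loss_ops, global_step=self.global_step)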
Example no. 2
    def to(self, target, target_scope):
        if not target.is_built:
            raise RuntimeError('transfer failed: target model must be built')

        with temp_directory(self.temp_path):
            try:
                source_var_list = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope=join_scope(self.source_model.id, self.source_scope))
                target_var_list = tf.get_collection(
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    scope=join_scope(self.source_model.id, target_scope))
                self.log.info('collect var_list')

                transfer_dict = self.build_transfer_dict(
                    source_var_list, self.source_scope, target_scope)
                self.log.info('build transfer_dict')

                # save transfer
                saver = tf.train.Saver(transfer_dict, name='transfer_saver')
                saver.save(self.source_model.sess, self.temp_path)
                self.log.info('save transfer')

                # load transfer
                saver = tf.train.Saver(target_var_list)
                saver.restore(target.sess, self.temp_path)
                self.log.info('load transfer')
            except BaseException as e:
                self.log.error(error_trace(e))
                raise RuntimeError('transfer learning failed') from e

        return target
Example no. 3
    def _iter_augment(self, images, masks, batch_size, aug_seq, hook_func,
                      queue, join_signal, finish_signal):
        try:
            images = NpSharedObj.decode(images).np
            masks = NpSharedObj.decode(masks).np
            dataset = BaseDataset(x=images, y=masks)
            while True:
                image, mask = dataset.next_batch(batch_size,
                                                 balanced_class=False)
                seq_det = aug_seq.to_deterministic()
                image_aug = seq_det.augment_images(image)
                mask_aug = seq_det.augment_images(mask, hooks=hook_func)

                while True:
                    try:
                        # timed put so the QueueFull branch below is actually reachable
                        queue.put((image_aug, mask_aug), timeout=0.1)
                        break
                    except QueueFull:
                        print('queue is full, need to reduce n_jobs')

                if join_signal.is_set():
                    break
        except BaseException as e:
            print(error_trace(e))
        finally:
            finish_signal.set()
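A rough sketch of how such a worker might be launched; the method name start_augment_workers, the n_jobs default, the queue size, and the NpSharedObj.encode call are assumptions inferred from _iter_augment and from terminate() in Example no. 10:

import multiprocessing as mp

def start_augment_workers(self, images, masks, batch_size, aug_seq,
                          hook_func, n_jobs=2):
    # bounded queue: producers hit QueueFull when consumers fall behind
    self.q = mp.Queue(maxsize=n_jobs * 4)
    self.join_signal = mp.Event()
    self.finish_signals = [mp.Event() for _ in range(n_jobs)]

    # NpSharedObj.encode is assumed to be the counterpart of NpSharedObj.decode
    encoded_images = NpSharedObj.encode(images)
    encoded_masks = NpSharedObj.encode(masks)

    self.workers = [
        mp.Process(
            target=self._iter_augment,
            args=(encoded_images, encoded_masks, batch_size, aug_seq,
                  hook_func, self.q, self.join_signal, finish_signal),
            daemon=True)
        for finish_signal in self.finish_signals
    ]
    for worker in self.workers:
        worker.start()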
Example no. 4
    def open(self):
        if self.sess is not None:
            raise RuntimeError('session already opened')

        try:
            self.sess = tf.Session(config=self.config)
        except BaseException as e:
            self.log.error(error_trace(e))
Example no. 5
        def wrapper(*args, **kwargs):
            start = time.time()
            try:
                ret = func(*args, **kwargs)
            except BaseException as e:
                print(error_trace(e))
                ret = None
            elapse_time = time.time() - start

            try:
                bot = SlackBot(token_path, channel)
                msg = f"in {func.__name__}(), time {elapse_time:.4f}'s elapsed"
                bot.post_message(msg)
            except BaseException as e:
                print(error_trace(e))
                print('slackbot fail to post message')

            return ret
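wrapper reads like the inner function of a Slack-notification decorator; a possible shape for the assumed enclosing factory (the name slack_notify and the usage below are guesses, only SlackBot, token_path, and channel appear in the source):

from functools import wraps

def slack_notify(token_path, channel):  # assumed factory name
    def decorator(func):
        @wraps(func)  # preserve the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            ...  # timing / SlackBot body exactly as in Example no. 5
        return wrapper
    return decorator

# hypothetical usage
@slack_notify('./slack_token', '#training')
def run_experiment():
    ...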
Example no. 6
    def close(self):
        if self.sess is None:
            return

        if self.is_injected:
            raise RuntimeError(
                'an injected session cannot be closed by the session manager')

        try:
            self.sess.close()
        except BaseException as e:
            self.log.error(error_trace(e))
Example no. 7
    def load_restore_test(self):
        try:
            if self.path is None:
                path = './test_instance'
            else:
                path = self.path

            self.model.save(path)
            self.model.load_meta(path)
            self.model.restore(path)
        except BaseException as e:
            print(error_trace(e))
            raise RuntimeError(f'load_restore_test fail\n{self.model}')
Example no. 8
    def build(self):
        with tf.device(self.device):
            try:
                self.ph, self.summary_op = self.build_graph(self.name)
                self.writer_path = os.path.join(self.logdir, self.name)
                self.is_build = True

                self.log.info(
                    f'build summary tensor={self.name}, writer_path={self.writer_path}'
                )
            except BaseException as e:
                self.log.error(error_trace(e))
                raise TFSummaryBuildError(e)
Example no. 9
    def build(self, x=None, y=None):
        if not self._is_input_shape_built:
            try:
                self.inputs_x = x
                self.inputs_y = y
                self._build_input_shapes(x, y)
                self._is_input_shape_built = True
            except BaseException as e:
                print(error_trace(e))
                raise ModelBuildFailError(
                    f'input_shape build fail, x={x}, y={y}')

        self._build_graph()

        self.sessionManager.open_if_not()
        self.sessionManager.init_variable(self.var_list)
Example no. 10
    def terminate(self):
        self.join_signal.set()
        time.sleep(0.01)

        while True:
            try:
                self.q.get(timeout=0.1)
            except QueueEmpty:
                break
            except BaseException as e:
                print(error_trace(e))
                break

        for worker in self.workers:
            worker.terminate()
            worker.join()
Example no. 11
    def __call__(self, model, dataset, metric, epoch):
        sign = 1 if self.max_best else -1

        self.log(
            f'\n'
            f'{self.name} current top_k\n'
            f'{pformat(self.top_k[1:])}\n'
        )

        try:
            for i in reversed(range(1, self.k + 1)):
                if sign * self.top_k[i - 1] > sign * metric > sign * self.top_k[i]:
                    # update top_k
                    self.top_k.insert(i, metric)
                    self.top_k.pop(self.k + 1)

                    # dump top_k json
                    dump_json(self.top_k, path_join(self.path, 'top_k.json'))
                    self.log(
                        f'update top_k at {i}th, metric = {metric}\n'
                        f'{pformat(self.top_k[1:])}'
                    )

                    if self.save_model:
                        # del worst dir
                        shutil.rmtree(path_join(self.path, f'top_{self.k}'))

                        # shift dir
                        path_pairs = [
                            (
                                path_join(self.path, f'top_{idx}'),
                                path_join(self.path, f'top_{idx+1}')
                            )
                            for idx in range(i, self.k)
                        ]
                        path_pairs = list(reversed(path_pairs))
                        for src, dst in path_pairs:
                            os.rename(src, dst)

                        # save model
                        save_path = path_join(self.path, f'top_{i}')
                        model.save(save_path)

                    break
        except BaseException as e:
            print(error_trace(e))
            raise RuntimeError(f'top-k save failed: {e}')
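The insertion test sign * self.top_k[i - 1] > sign * metric > sign * self.top_k[i] only works if top_k carries sentinel values at index 0 and index k + 1; a sketch of the assumed initialization, which is not shown in the source:

import numpy as np

def _init_top_k(self):
    # indices 1..k hold the kept metrics (best first); index 0 and k + 1 are sentinels
    if self.max_best:
        self.top_k = [np.inf] + [-np.inf] * (self.k + 1)
    else:
        self.top_k = [-np.inf] + [np.inf] * (self.k + 1)

With this layout the first real metric always lands at index 1, and the pop at k + 1 discards the displaced worst entry.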
Example no. 12
    def build_test(self):
        try:
            self.model.build(**self.inputs)
        except BaseException as e:
            print(error_trace(e))
            raise RuntimeError(f'build test fail\n{self.model}')
Example no. 13
    def train(
        self,
        x=None,
        y=None,
        epoch=1,
        batch_size=None,
        dataset_callback=None,
        epoch_pbar=True,
        iter_pbar=True,
        epoch_callbacks=None,
        validation_data=None,
    ):
        if not self.is_built:
            raise RuntimeError(f'{self} not built')

        if batch_size is None:
            batch_size = self.batch_size
        self.batch_size = batch_size
        dataset = dataset_callback if dataset_callback else BaseDataset(x=x, y=y)

        metric = None
        epoch_pbar = tqdm(range(1, epoch + 1)) if epoch_pbar else None
        for i_epoch in range(1, epoch + 1):
            if epoch_pbar:
                epoch_pbar.set_description(f'epoch {i_epoch}/{epoch}')

            dataset.shuffle()

            loss_mean = 0
            iter_size = int(dataset.size / batch_size)
            with trange(iter_size) as iter_trange:
                for i in range(iter_size):
                    batch_x, batch_y = dataset.next_batch(batch_size)
                    self._train_iter(batch_x, batch_y)

                    batch_loss = self.loss(batch_x, batch_y)
                    loss_mean = (loss_mean * i + batch_loss) / (i + 1)

                    iter_trange.set_description(f'iter {i}/{iter_size}')
                    iter_trange.set_postfix(loss=loss_mean)
                    iter_trange.update(1)

            self.sess.run(self.op_inc_global_epoch)
            global_epoch = self.sess.run(self.global_epoch)
            if epoch_pbar: epoch_pbar.update(1)

            metric = self.metric(x, y)

            msg = f"epoch:{global_epoch}"
            msg += f', loss={loss_mean:0.6f}'
            msg += f', metric={np.mean(metric):0.6f}'
            if validation_data:
                val_loss = self.loss(*validation_data)
                val_metric = self.metric(*validation_data)

                msg += f', val_loss={np.mean(val_loss):0.6f}'
                msg += f', val_metric={np.mean(val_metric):0.6f}'
            tqdm.write(msg)

            break_epoch = False
            if epoch_callbacks:
                try:
                    results = [
                        callback(self, dataset, loss_mean, global_epoch)
                        for callback in epoch_callbacks
                    ]
                except BaseException as e:
                    print(error_trace(e))
                    raise RuntimeError('epoch callback failed') from e
                for result in results:
                    if result and 'break_epoch' in result:
                        break_epoch = True
            if break_epoch: break

        if epoch_pbar: epoch_pbar.close()
        if dataset_callback: del dataset

        return metric
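A hypothetical end-to-end call tying Example no. 11 and Example no. 13 together; the callback class name Top_k_save, its constructor signature, and the data names are assumptions:

# assumed callback: __call__(model, dataset, metric, epoch) as in Example no. 11,
# with max_best=False because train() passes the running loss as the metric
top_k_saver = Top_k_save(path='./instance/top_k', k=5, max_best=False)

model.build(x=train_x, y=train_y)
metric = model.train(
    x=train_x,
    y=train_y,
    epoch=100,
    batch_size=64,
    validation_data=(valid_x, valid_y),
    epoch_callbacks=[top_k_saver],
)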