def _build_graph(self):
    """Build every sub-graph of the model under a scope named after its id.

    The build hooks run in a fixed order, each inside its own variable
    scope: ``misc_ops``, ``main_graph``, ``loss_ops``, ``predict_ops``,
    ``metric_ops``, ``train_ops`` and ``phase_controller``.  On success
    ``self._is_graph_built`` is set to ``True``.

    Raises:
        ModelBuildFailError: if any build step raises an ``Exception`` or if
            the loss / predict / metric / train hook returns ``None``.  The
            underlying error is attached as ``__cause__``.
            ``KeyboardInterrupt`` and ``SystemExit`` are not converted; they
            propagate unchanged.
    """
    try:
        scope = str(self.metadata.id)
        with tf.variable_scope(scope):
            with tf.variable_scope("misc_ops"):
                self.log.debug("build_misc_ops")
                self._build_misc_ops()

            with tf.variable_scope('main_graph'):
                self.log.debug('build_main_graph')
                self._build_main_graph()

            # Explicit checks instead of ``assert`` so that they still run
            # when Python is started with -O.
            with tf.variable_scope('loss_ops'):
                self.log.debug('build_loss_ops')
                self._loss_ops = self._build_loss_ops()
                if self._loss_ops is None:
                    raise ValueError('_build_loss_ops() returned None')

            with tf.variable_scope('predict_ops'):
                self.log.debug('build_predict_ops')
                self._predict_ops = self._build_predict_ops()
                if self._predict_ops is None:
                    raise ValueError('_build_predict_ops() returned None')

            with tf.variable_scope('metric_ops'):
                self.log.debug('build_metric_ops')
                self._metric_ops = self._build_metric_ops()
                if self._metric_ops is None:
                    raise ValueError('_build_metric_ops() returned None')

            with tf.variable_scope('train_ops'):
                self.log.debug('build train_ops')
                # Train ops are created under a control dependency on every
                # op registered in the UPDATE_OPS collection.
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_ops):
                    self._train_ops = self._build_train_ops()
                if self._train_ops is None:
                    raise ValueError('_build_train_ops() returned None')

            with tf.variable_scope('phase_controller'):
                self.log.debug('build phase_controller')

                # Group the dropout and batch-norm switches collected for
                # this model's scope into one op per phase.
                dropout_set_train_ops = (
                    self.singleton_dropout.collect_set_train_ops(scope))
                bn_set_train_ops = (
                    self.singleton_bn.collect_set_train_ops(scope))
                set_train_ops = dropout_set_train_ops + bn_set_train_ops
                self.set_train_op = tf.group(*set_train_ops)

                dropout_set_non_train_ops = (
                    self.singleton_dropout.collect_set_non_train_ops(scope))
                bn_set_non_train_ops = (
                    self.singleton_bn.collect_set_non_train_ops(scope))
                set_non_train_ops = (
                    dropout_set_non_train_ops + bn_set_non_train_ops)
                self.set_non_train_op = tf.group(*set_non_train_ops)

        self._is_graph_built = True
        self.log.info("build success")
    except Exception as e:
        self.log.error(error_trace(e))
        print(error_trace(e))
        raise ModelBuildFailError("ModelBuildFailError") from e
def to(self, target, target_scope):
    """Transfer variables from the source model's scope into ``target``.

    The source variables are saved to ``self.temp_path`` through a saver
    built from ``self.build_transfer_dict(...)`` and then restored into
    ``target.sess``.  All of this happens inside the
    ``temp_directory(self.temp_path)`` context.

    Args:
        target: model that receives the variables; must report
            ``is_built`` as true and expose a ``sess``.
        target_scope: scope (below the model id) of the receiving variables.

    Returns:
        The ``target`` object that was passed in.

    Raises:
        RuntimeError: if ``target`` is not built, or if any step of the
            save / restore raises an ``Exception`` (the original error is
            attached as ``__cause__``).  ``KeyboardInterrupt`` and
            ``SystemExit`` propagate unchanged.
    """
    if not target.is_built:
        raise RuntimeError('transfer fail, target model must be built')

    with temp_directory(self.temp_path):
        try:
            source_var_list = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES,
                scope=join_scope(self.source_model.id, self.source_scope))
            # NOTE(review): the target variables are looked up under the
            # *source* model id, not an id taken from ``target`` -- confirm
            # this is intended.
            target_var_list = tf.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES,
                scope=join_scope(self.source_model.id, target_scope))
            self.log.info('collect var_list')

            transfer_dict = self.build_transfer_dict(
                source_var_list, self.source_scope, target_scope)
            self.log.info('build transfer_dict')

            # save transfer
            saver = tf.train.Saver(transfer_dict, name='transfer_saver')
            saver.save(self.source_model.sess, self.temp_path)
            self.log.info('save transfer')

            # load transfer
            saver = tf.train.Saver(target_var_list)
            saver.restore(target.sess, self.temp_path)
            self.log.info('load transfer')
        except Exception as e:
            self.log.error(error_trace(e))
            raise RuntimeError('TransferLearning Fail') from e

    return target
def _iter_augment(self, images, masks, batch_size, aug_seq, hook_func, queue,
                  join_signal, finish_signal):
    """Produce augmented (image, mask) batches and push them onto ``queue``.

    ``images`` and ``masks`` are decoded with ``NpSharedObj.decode(...).np``
    and wrapped in a ``BaseDataset``.  Each iteration draws one batch,
    applies the same deterministic augmentation to images and masks (masks
    additionally receive ``hook_func`` as ``hooks``) and puts the pair on
    ``queue``.  The loop ends once ``join_signal`` is set.

    Any exception is printed, not re-raised, and ``finish_signal`` is always
    set on the way out.
    """
    try:
        dataset = BaseDataset(
            x=NpSharedObj.decode(images).np,
            y=NpSharedObj.decode(masks).np,
        )

        stop_requested = False
        while not stop_requested:
            batch_image, batch_mask = dataset.next_batch(
                batch_size, balanced_class=False)

            # One deterministic sequence per batch so images and masks get
            # the same transformation.
            deterministic_seq = aug_seq.to_deterministic()
            image_aug = deterministic_seq.augment_images(batch_image)
            mask_aug = deterministic_seq.augment_images(
                batch_mask, hooks=hook_func)

            # Retry until the batch has been handed over.
            delivered = False
            while not delivered:
                try:
                    queue.put((image_aug, mask_aug))
                    delivered = True
                except QueueFull:
                    print(f'queue is full, need to reduce n_jobs')

            stop_requested = join_signal.is_set()
    except BaseException as e:
        print(error_trace(e))
    finally:
        finish_signal.set()
def open(self):
    """Open a new TensorFlow session configured with ``self.config``.

    A failure while creating the session is logged and not re-raised, so
    ``self.sess`` can still be ``None`` when this method returns.
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.

    Raises:
        RuntimeError: if ``self.sess`` already holds a session.
    """
    if self.sess is not None:
        raise RuntimeError('session already opened')

    try:
        self.sess = tf.Session(config=self.config)
    except Exception as e:
        # Best-effort: report the failure through the logger only.
        self.log.error(error_trace(e))
def wrapper(*args, **kwargs):
    """Call ``func``, then report its elapsed time through a ``SlackBot``.

    An exception raised by ``func`` is printed and replaced by a ``None``
    return value.  A failure while posting to Slack is printed as well and
    never affects the returned value.
    """
    began = time.time()
    ret = None
    try:
        ret = func(*args, **kwargs)
    except BaseException as e:
        print(error_trace(e))
    elapse_time = time.time() - began

    try:
        msg = f"in {func.__name__}(), time {elapse_time:.4f}'s elapsed"
        SlackBot(token_path, channel).post_message(msg)
    except BaseException as e:
        print(error_trace(e))
        print('slackbot fail to post message')

    return ret
def close(self):
    """Close the session held in ``self.sess``, if there is one.

    Does nothing when ``self.sess`` is ``None``.  An ``Exception`` raised by
    ``self.sess.close()`` is logged and not re-raised; ``KeyboardInterrupt``
    and ``SystemExit`` are no longer swallowed.

    Raises:
        RuntimeError: if the session was injected (``self.is_injected``).
    """
    if self.sess is None:
        return

    if self.is_injected:
        raise RuntimeError(
            'injected session can not close in Session manager')

    try:
        self.sess.close()
    except Exception as e:
        # Best-effort: a failing close is only reported through the logger.
        self.log.error(error_trace(e))
    # NOTE(review): ``self.sess`` is left pointing at the closed session, so a
    # later ``open()`` would still see a non-None session -- confirm intended.
def load_restore_test(self):
    """Save ``self.model`` and then load its meta data and restore it.

    Uses ``self.path`` when it is set, otherwise ``'./test_instance'``.

    Raises:
        RuntimeError: if save, load_meta or restore raises an ``Exception``;
            the original error is attached as ``__cause__``.
            ``KeyboardInterrupt`` and ``SystemExit`` propagate unchanged.
    """
    try:
        path = './test_instance' if self.path is None else self.path

        self.model.save(path)
        self.model.load_meta(path)
        self.model.restore(path)
    except Exception as e:
        print(error_trace(e))
        raise RuntimeError(f'load_restore_test fail\n{self.model}') from e
def build(self):
    """Build the summary placeholder and op on ``self.device``.

    Stores the pair returned by ``self.build_graph(self.name)`` in
    ``self.ph`` and ``self.summary_op``, sets ``self.writer_path`` to
    ``<logdir>/<name>`` and marks the object as built (``self.is_build``).

    Raises:
        TFSummaryBuildError: if any step raises an ``Exception``; the
            original error is passed as the argument and attached as
            ``__cause__``.  ``KeyboardInterrupt`` and ``SystemExit``
            propagate unchanged.
    """
    with tf.device(self.device):
        try:
            self.ph, self.summary_op = self.build_graph(self.name)
            self.writer_path = os.path.join(self.logdir, self.name)
            self.is_build = True
            self.log.info(
                f'build summary tensor={self.name}, writer_path={self.writer_path}'
            )
        except Exception as e:
            self.log.error(error_trace(e))
            raise TFSummaryBuildError(e) from e
def build(self, x=None, y=None):
    """Build input shapes (once), then the graph, then initialise variables.

    When ``self._is_input_shape_built`` is false, ``x`` and ``y`` are stored
    in ``self.inputs_x`` / ``self.inputs_y`` and handed to
    ``self._build_input_shapes``.  Afterwards the graph is built, the
    session is opened if needed and ``self.var_list`` is initialised.

    Args:
        x: input passed to ``_build_input_shapes`` (also kept in
            ``self.inputs_x``).
        y: target passed to ``_build_input_shapes`` (also kept in
            ``self.inputs_y``).

    Raises:
        ModelBuildFailError: if building the input shapes raises an
            ``Exception``; the original error is attached as ``__cause__``.
            ``KeyboardInterrupt`` and ``SystemExit`` propagate unchanged.
    """
    if not self._is_input_shape_built:
        try:
            self.inputs_x = x
            self.inputs_y = y
            self._build_input_shapes(x, y)
            self._is_input_shape_built = True
        except Exception as e:
            print(error_trace(e))
            raise ModelBuildFailError(
                f'input_shape build fail, x={x}, y={y}') from e

    self._build_graph()

    self.sessionManager.open_if_not()
    self.sessionManager.init_variable(self.var_list)
def terminate(self):
    """Signal the workers to stop, drain the queue and terminate them.

    After setting ``self.join_signal`` and a short pause, items are pulled
    from ``self.q`` until a ``get`` times out (``QueueEmpty``) or fails with
    any other error, which is printed.  Every worker is then terminated and
    joined, one after another.
    """
    self.join_signal.set()
    time.sleep(0.01)

    draining = True
    while draining:
        try:
            self.q.get(timeout=0.1)
        except QueueEmpty:
            draining = False
        except BaseException as e:
            print(error_trace(e))
            draining = False

    for proc in self.workers:
        proc.terminate()
        proc.join()
def __call__(self, model, dataset, metric, epoch):
    """Insert ``metric`` into the top-k ranking when it qualifies.

    Scans ranks ``k`` down to ``1`` and inserts ``metric`` at the first rank
    ``i`` where it lies strictly between ``top_k[i - 1]`` and ``top_k[i]``
    (direction set by ``self.max_best``).  On insertion the ranking is
    written to ``top_k.json`` under ``self.path``.  If ``self.save_model``
    is set, the ``top_<k>`` directory is removed, directories ``top_i`` ..
    ``top_<k-1>`` are shifted down by one rank, and ``model`` is saved to
    ``top_<i>``.

    Args:
        model: object whose ``save(path)`` is called on insertion.
        dataset: unused here.
        metric: value to rank.
        epoch: unused here.

    Raises:
        RuntimeError: if updating the ranking or the directories raises an
            ``Exception``; the original error is attached as ``__cause__``.
            ``KeyboardInterrupt`` and ``SystemExit`` propagate unchanged.
    """
    sign = 1 if self.max_best else -1
    self.log(
        f'\n'
        f'{self.name} current top_k\n'
        f'{pformat(self.top_k[1:])}\n'
    )

    try:
        # NOTE(review): ``top_k[0]`` looks like a sentinel bound that is never
        # reported (only ``top_k[1:]`` is logged) -- confirm.
        for i in reversed(range(1, self.k + 1)):
            in_slot = (
                sign * self.top_k[i - 1] > sign * metric > sign * self.top_k[i]
            )
            if not in_slot:
                continue

            # update top_k: insert at rank i and drop the entry pushed
            # past rank k
            self.top_k.insert(i, metric)
            self.top_k.pop(self.k + 1)

            # dump top_k json
            dump_json(self.top_k, path_join(self.path, 'top_k.json'))
            self.log(
                f'update top_k at {i}th, metric = {metric}\n'
                f'{pformat(self.top_k[1:])}'
            )

            if self.save_model:
                # del worst dir
                shutil.rmtree(path_join(self.path, f'top_{self.k}'))

                # shift dir, starting from the worst rank so that no
                # directory is overwritten
                for idx in reversed(range(i, self.k)):
                    os.rename(
                        path_join(self.path, f'top_{idx}'),
                        path_join(self.path, f'top_{idx + 1}'),
                    )

                # save model
                model.save(path_join(self.path, f'top_{i}'))

            # The list changed, so stop scanning after the first insertion.
            break
    except Exception as e:
        print(error_trace(e))
        raise RuntimeError(f'while Top k save, raise {e}') from e
def build_test(self):
    """Build ``self.model`` with ``self.inputs`` passed as keyword arguments.

    Raises:
        RuntimeError: if ``self.model.build`` raises an ``Exception``; the
            original error is attached as ``__cause__``.
            ``KeyboardInterrupt`` and ``SystemExit`` propagate unchanged.
    """
    try:
        self.model.build(**self.inputs)
    except Exception as e:
        print(error_trace(e))
        raise RuntimeError(f'build test fail\n{self.model}') from e
def train(
        self,
        x=None,
        y=None,
        epoch=1,
        batch_size=None,
        dataset_callback=None,
        epoch_pbar=True,
        iter_pbar=True,
        epoch_callbacks=None,
        validation_data=None,
):
    """Train the model for ``epoch`` epochs and return the last metric.

    Each epoch shuffles the dataset, runs ``int(dataset.size / batch_size)``
    training iterations while tracking the running mean of the batch loss,
    increments the global epoch counter, evaluates ``self.metric(x, y)`` and
    writes a one-line summary.  Epoch callbacks are then called as
    ``callback(self, dataset, loss_mean, global_epoch)``; training stops
    early when any truthy callback result contains ``'break_epoch'``.

    Args:
        x: training inputs; used to build a ``BaseDataset`` when no
            ``dataset_callback`` is given, and for the per-epoch metric.
        y: training targets, used the same way as ``x``.
        epoch: number of epochs to run.
        batch_size: batch size; falls back to ``self.batch_size`` when
            ``None``.  The value used is stored back on ``self.batch_size``.
        dataset_callback: dataset object used in place of
            ``BaseDataset(x=x, y=y)`` when truthy.
        epoch_pbar: show a progress bar over epochs.
        iter_pbar: show a progress bar over the iterations of each epoch.
        epoch_callbacks: iterable of callables run after every epoch.
        validation_data: arguments unpacked into ``self.loss`` and
            ``self.metric`` for the validation part of the summary.

    Returns:
        The metric of the last completed epoch, or ``None`` when no epoch
        ran.

    Raises:
        RuntimeError: if the model is not built, or if an epoch callback
            raises an ``Exception`` (attached as ``__cause__``).
    """
    if not self.is_built:
        raise RuntimeError(f'{self} not built')

    if batch_size is None:
        batch_size = getattr(self, 'batch_size')
    self.batch_size = batch_size

    dataset = dataset_callback if dataset_callback else BaseDataset(x=x, y=y)

    metric = None
    # Kept in its own name so the boolean argument is not shadowed; every
    # use below is guarded, so ``epoch_pbar=False`` no longer crashes.
    pbar = tqdm(range(1, epoch + 1)) if epoch_pbar else None

    for i_epoch in range(1, epoch + 1):
        if pbar is not None:
            pbar.set_description(f'epoch {i_epoch}/{epoch}')

        dataset.shuffle()
        loss_mean = 0
        iter_size = int(dataset.size / batch_size)
        # ``disable`` makes the iteration bar honour ``iter_pbar``.
        with trange(iter_size, disable=not iter_pbar) as iter_trange:
            for i in range(iter_size):
                batch_x, batch_y = dataset.next_batch(batch_size)
                self._train_iter(batch_x, batch_y)

                # Incremental mean of the batch losses seen so far.
                batch_loss = self.loss(batch_x, batch_y)
                loss_mean = (loss_mean * i + batch_loss) / (i + 1)

                iter_trange.set_description(f'iter {i}/{iter_size}')
                iter_trange.set_postfix(loss=loss_mean)
                iter_trange.update(1)

        self.sess.run(self.op_inc_global_epoch)
        global_epoch = self.sess.run(self.global_epoch)
        if pbar is not None:
            pbar.update(1)

        # NOTE(review): the metric is always computed on ``x`` / ``y``, even
        # when the batches came from ``dataset_callback`` -- confirm intended.
        metric = self.metric(x, y)
        msg = f"epoch:{global_epoch}"
        msg += f', loss={loss_mean:0.6f}'
        msg += f', metric={np.mean(metric):0.6f}'

        if validation_data:
            val_loss = self.loss(*validation_data)
            val_metric = self.metric(*validation_data)
            msg += f', val_loss={np.mean(val_loss):0.6f}'
            msg += f', val_metric={np.mean(val_metric):0.6f}'
        tqdm.write(msg)

        break_epoch = False
        if epoch_callbacks:
            try:
                results = [
                    callback(self, dataset, loss_mean, global_epoch)
                    for callback in epoch_callbacks
                ]
            except Exception as e:
                print(error_trace(e))
                raise RuntimeError('epoch callback fail') from e

            for result in results:
                if result and 'break_epoch' in result:
                    break_epoch = True
        if break_epoch:
            break

    if pbar is not None:
        pbar.close()

    return metric