def test_get_default_session_or_error(self):
    with pytest.raises(RuntimeError, match='No session is active'):
        get_default_session_or_error()
    with self.test_session(use_gpu=False) as sess:
        self.assertIs(sess, get_default_session_or_error())
    with pytest.raises(RuntimeError, match='No session is active'):
        get_default_session_or_error()
def set(self, value):
    """
    Set the value of the variable.

    Args:
        value: The value to be assigned to the variable.
    """
    get_default_session_or_error().run(
        self._self_assign_op, feed_dict={self._self_assign_ph: value})
def collect_outputs(outputs, inputs, data_flow, feed_dict=None, session=None):
    """
    Run TensorFlow graph by mini-batch and concat outputs from each batch.

    Args:
        outputs (Iterable[tf.Tensor]): Output tensors to be computed.
        inputs (Iterable[tf.Tensor]): Input placeholders.
        data_flow (DataFlow): Data flow to feed the input placeholders.
        feed_dict: Optional, additional feed dict.
        session: The TensorFlow session.  If not specified, use the
            default session.

    Returns:
        tuple[np.ndarray]: The concatenated outputs.
    """
    outputs = list(outputs)
    inputs = list(inputs)
    session = session or get_default_session_or_error()
    collected = [[] for _ in range(len(outputs))]

    for batch in data_flow:
        batch_feed_dict = merge_feed_dict(
            feed_dict, {k: v for (k, v) in zip(inputs, batch)})
        for i, o in enumerate(session.run(outputs, feed_dict=batch_feed_dict)):
            collected[i].append(o)

    for i, batches in enumerate(collected):
        collected[i] = np.concatenate(batches, axis=0)
    return tuple(collected)
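# A minimal usage sketch for `collect_outputs` above; not from the original
# source.  It assumes the tfsnippet-style `DataFlow.arrays` constructor
# (the import path below is an assumption) and an explicitly opened default
# session; the placeholder and array shapes are illustrative only.
def _example_collect_outputs():
    import numpy as np
    import tensorflow as tf
    from tfsnippet.dataflows import DataFlow  # assumed import path

    x_ph = tf.placeholder(tf.float32, shape=[None, 4])
    y_out = x_ph * 2.0 + 1.0
    data = np.random.randn(10, 4).astype(np.float32)
    flow = DataFlow.arrays([data], batch_size=3)

    with tf.Session().as_default():
        # no `session` argument, so the default session is picked up
        (y_values,) = collect_outputs([y_out], [x_ph], flow)
        assert y_values.shape == (10, 4)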
def save(self, global_step=None, session=None):
    """
    Save the session to a checkpoint file.

    Args:
        global_step (int or tf.Tensor): The global step counter.
        session (tf.Session): The session to save.
            If not specified, select the default session.

    Returns:
        str: The path of the saved checkpoint file.
    """
    session = session or get_default_session_or_error()

    # save the states of savable objects into serial var
    if self._objects:
        object_states = {}
        for key, obj in six.iteritems(self._objects):
            object_states[key] = obj.get_state()
        serialized_states = pkl.dumps(
            object_states, protocol=pkl.HIGHEST_PROTOCOL)
        self._serial_var.set(serialized_states)

    # now save the variables to checkpoint file
    if not os.path.isdir(self.save_dir):
        makedirs(self.save_dir, exist_ok=True)
    return self._saver.save(
        session,
        os.path.join(self.save_dir, self.filename),
        global_step=global_step,
        write_meta_graph=self.save_meta
    )
def run(self):
    """Run training loop."""
    if self._is_fitting:
        raise RuntimeError('`run()` is not re-entrant.')
    self._is_fitting = True
    try:
        # initialize global training status
        session = get_default_session_or_error()
        ensure_variables_initialized()
        self.loop.print_training_summary()

        # initialize internal status
        for hook_list in self.hook_lists:
            hook_list.reset()

        for epoch in self.loop.iter_epochs():
            # run before epoch hook
            self.before_epochs.call_hooks()

            # run steps of this epoch
            for payload in self._iter_steps():
                # run before step hook
                self.before_steps.call_hooks()

                # run the step
                self._run_step(session, payload)

                # run after step hook
                self.after_steps.call_hooks()

            # run after epoch hook
            self.after_epochs.call_hooks()
    finally:
        self._is_fitting = False
def add_metrics(self, global_step=None, metrics=None, **kwargs):
    """Add scalar metrics as summary values.

    Parameters
    ----------
    global_step : int | tf.Tensor | tf.Variable
        The global step counter. (optional)

    metrics, **kwargs
        Dict of metric values.
    """
    if metrics is not None and not isinstance(metrics, (dict, OrderedDict)):
        raise TypeError('%r should be a dict.' % (metrics,))

    values = []
    if metrics:
        for k, v in six.iteritems(metrics):
            values.append(tf.summary.Summary.Value(tag=k, simple_value=v))
    for k, v in six.iteritems(kwargs):
        values.append(tf.summary.Summary.Value(tag=k, simple_value=v))

    if values:
        if isinstance(global_step, (tf.Tensor, tf.Variable)):
            global_step = get_default_session_or_error().run(global_step)
        summary = tf.summary.Summary(value=values)
        self._writer.add_summary(summary, global_step=global_step)
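# A hedged usage sketch for `add_metrics` above; not from the original source.
# `writer` stands for an instance of whatever class defines `add_metrics`,
# and `sess` is an open tf.Session; metric names and values are illustrative.
def _example_add_metrics(writer, sess):
    import tensorflow as tf

    step = tf.train.get_or_create_global_step()
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        # passing a tf.Variable as `global_step` is allowed: it is resolved
        # through the default session before the summary is written
        writer.add_metrics(global_step=step,
                           metrics={'train_loss': 0.37},
                           valid_loss=0.42)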
def run(self, feed_dict=None):
    """
    Run validation.

    Args:
        feed_dict (dict[tf.Tensor, any]): The extra feed dict to be
            merged with the already configured dict. (default :obj:`None`)
    """
    session = get_default_session_or_error()

    with self.loop.timeit(self._time_metric_name), \
            self.loop.metric_collector(self._loss_metric_name) as mc:
        for batch_data in self.data_flow:
            # prepare for the batch feed dict
            feed_dict = resolve_feed_dict(
                merge_feed_dict(self.feed_dict,
                                feed_dict,
                                zip(self.inputs, batch_data)))

            # run the mini-batch
            loss = self._run_batch(session, feed_dict)
            if self._loss_weight_func is not None:
                loss_weight = self._loss_weight_func(*batch_data)
            else:
                loss_weight = 1.
            mc.collect(loss, weight=loss_weight)
def test_sampling_for_fully_dynamic_shape(self):
    with self.get_session(use_gpu=True):
        params = {
            k: np.tile(v, [10] + [1] * len(v.shape))
            for k, v in six.iteritems(self.simple_params)
        }

        # sample `x` from distribution with dynamic batch shape
        tf.set_random_seed(1234)
        params_ph = {
            k: tf.placeholder(tf.float32)
            for k, v in six.iteritems(self.simple_params)
        }
        feed_dict = {
            params_ph[k]: np.tile(self.simple_params[k],
                                  [10] + [1] * len(self.simple_params[k].shape))
            for k in six.iterkeys(self.simple_params)
        }
        dist = self.dist_class(**params_ph)
        x = dist.sample()
        prob, log_prob = dist.prob(x), dist.log_prob(x)
        x, prob, log_prob = get_default_session_or_error().run(
            [x, prob, log_prob], feed_dict=feed_dict)

        value_shape, batch_shape = \
            self.get_shapes_for_param(**self.simple_params)
        np.testing.assert_equal(x.shape,
                                [10] + list(batch_shape + value_shape))
        self.assert_allclose(prob, self.prob(x, **params))
        self.assert_allclose(log_prob, self.log_prob(x, **params))
def get_samples_and_prob(self, sample_shape=(), feed_dict=None, **params):
    tf.set_random_seed(1234)
    dist = self.dist_class(**params)
    x = dist.sample(sample_shape)
    prob, log_prob = dist.prob(x), dist.log_prob(x)
    return get_default_session_or_error().run([x, prob, log_prob],
                                               feed_dict=feed_dict)
def run(self, feed_dict=None):
    """
    Run evaluation.

    Args:
        feed_dict: The extra feed dict to be merged with the already
            configured dict. (default :obj:`None`)
    """
    @contextmanager
    def timeit():
        if self.time_metric_name is not None:
            with self.loop.timeit(self.time_metric_name):
                yield
        else:
            yield

    session = get_default_session_or_error()
    metric_tensors = list(six.itervalues(self.metrics))
    metric_names = list(six.iterkeys(self.metrics))
    metric_values = []
    metric_weights = []

    with timeit():
        for batch_data in self.data_flow:
            # prepare for the batch feed dict
            feed_dict = resolve_feed_dict(
                merge_feed_dict(self.feed_dict,
                                feed_dict,
                                zip(self.inputs, batch_data)))

            # inspect the batch weight
            if self._batch_weight_func is not None:
                batch_weight = self._batch_weight_func(*batch_data)
            else:
                batch_weight = 1.
            metric_weights.append(batch_weight)

            # run the mini-batch
            batch_values = self._run_batch(session, feed_dict)
            for i, v in enumerate(batch_values):
                if len(np.asarray(v).shape) != 0:  # pragma: no cover
                    raise ValueError(
                        'Metric is not a scalar: tensor {!r}, value {!r}.'.
                        format(metric_tensors[i], v))

            # accumulate the metrics
            metric_values.append(np.asarray(batch_values))

    # now merge all batch metrics and do logging
    metric_values = np.average(
        np.stack(metric_values, axis=0),
        axis=0,
        weights=np.asarray(metric_weights),
    )
    assert len(metric_names) == len(metric_values)
    self._last_metrics_dict = metrics_dict = {
        k: v for k, v in zip(metric_names, metric_values)
    }
    self.loop.collect_metrics(metrics_dict)
def test_analytic_kld(self):
    with self.get_session(use_gpu=True):
        dist1 = self.dist_class(**self.simple_params)
        dist2 = self.dist_class(**self.kld_simple_params)
        kld = get_default_session_or_error().run(dist1.analytic_kld(dist2))
        self.assert_allclose(
            kld,
            self.analytic_kld(self.simple_params, self.kld_simple_params))
def plot_samples(loop):
    with loop.timeit('plot_time'):
        session = get_default_session_or_error()
        images = session.run(x_plots, feed_dict={is_training: False})
        save_images_collection(
            images=images,
            filename=results.prepare_parent(
                'plotting/{}.png'.format(loop.epoch)),
            grid_size=(10, 10)
        )
def get_score(self, values, missing=None):
    """
    Get the `reconstruction probability` of specified KPI observations.

    The larger the `reconstruction probability`, the less likely the point
    is an anomaly.  You may take the negative of the score, if you want
    something to directly indicate the severity of anomaly.

    Args:
        values (np.ndarray): 1-D float32 array, the KPI observations.
        missing (np.ndarray): 1-D int32 array, the indicator of missing
            points.  If :obj:`None`, the MCMC missing data imputation
            will be disabled. (default :obj:`None`)

    Returns:
        np.ndarray: The `reconstruction probability`,
            1-D array if `last_point_only` is :obj:`True`,
            or 2-D array if `last_point_only` is :obj:`False`.
    """
    with tf.name_scope('DonutPredictor.get_score'):
        sess = get_default_session_or_error()
        collector = []

        # validate the arguments
        values = np.asarray(values, dtype=np.float32)
        if len(values.shape) != 1:
            raise ValueError('`values` must be a 1-D array')

        # run the prediction in mini-batches
        sliding_window = BatchSlidingWindow(
            array_size=len(values),
            window_size=self.model.x_dims,
            batch_size=self._batch_size,
        )

        if missing is not None:
            missing = np.asarray(missing, dtype=np.int32)
            if missing.shape != values.shape:
                raise ValueError(
                    'The shape of `missing` does not agree '
                    'with the shape of `values` ({} vs {})'.format(
                        missing.shape, values.shape))
            for b_x, b_y in sliding_window.get_iterator([values, missing]):
                feed_dict = dict(six.iteritems(self._feed_dict))
                feed_dict[self._input_x] = b_x
                feed_dict[self._input_y] = b_y
                b_r = sess.run(self._get_score(), feed_dict=feed_dict)
                collector.append(b_r)
        else:
            for b_x, in sliding_window.get_iterator([values]):
                feed_dict = dict(six.iteritems(self._feed_dict))
                feed_dict[self._input_x] = b_x
                b_r = sess.run(self._get_score_without_y(),
                               feed_dict=feed_dict)
                collector.append(b_r)

        # merge the results of mini-batches
        result = np.concatenate(collector, axis=0)
        return result
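# A hedged usage sketch for `get_score` above; not from the original source.
# `predictor` is assumed to be a trained DonutPredictor-like object and
# `session` an open tf.Session holding its variables; the synthetic KPI
# series below is illustrative only.
def _example_get_score(predictor, session):
    import numpy as np

    values = np.sin(np.linspace(0, 100, 5000)).astype(np.float32)
    missing = np.zeros(5000, dtype=np.int32)
    missing[123] = 1  # mark a single point as missing

    with session.as_default():
        # larger scores mean "more normal"; negate them to get a severity
        scores = predictor.get_score(values, missing=missing)
    return -scores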
def fit(self, train_iterator, summary_dir=None):
    """
    Train the :class:`OmniAnomaly` model with given data.

    Args:
        train_iterator: Indexable collection of mini-batches of the
            standardized KPI observations.
        summary_dir (str): Optional summary directory for
            :class:`tf.summary.FileWriter`.
            (default :obj:`None`, summary is disabled)
    """
    sess = get_default_session_or_error()

    # initialize the variables of the trainer, and the model
    sess.run(self._trainer_initializer)
    ensure_variables_initialized(self._train_params)

    # training loop
    lr = self._initial_lr
    with TrainLoop(
        param_vars=self._train_params,
        summary_dir=summary_dir,
        max_epoch=self._max_epoch,
        max_step=self._max_step,
    ) as loop:  # type: TrainLoop
        # loop.print_training_summary()
        train_batch_time = []
        valid_batch_time = []
        time_train_start = time.time()
        for epoch in loop.iter_epochs():
            start_time = time.time()
            for step, idx in loop.iter_steps(range(len(train_iterator))):
                # run a training step
                batch_x = train_iterator[idx]
                start_batch_time = time.time()
                feed_dict = dict(six.iteritems(self._feed_dict))
                feed_dict[self._learning_rate] = lr
                feed_dict[self._input_x] = batch_x
                loss, _ = sess.run([self._loss, self._train_op],
                                   feed_dict=feed_dict)
                loop.collect_metrics({"loss": loss})
                train_batch_time.append(time.time() - start_batch_time)

            # anneal the learning rate
            if self._lr_anneal_epochs and epoch % self._lr_anneal_epochs == 0:
                lr *= self._lr_anneal_factor
                loop.println("Learning rate decreased to {}".format(lr),
                             with_tag=True)

        time_train_end = time.time()
        return {
            # "best_valid_loss": float(loop.best_valid_metric),
            "train_time": np.sum(train_batch_time),
            "total_train_time": time_train_end - time_train_start,
        }
def get_score(self, values):
    """
    Get the `reconstruction probability` of specified KPI observations.

    The larger the `reconstruction probability`, the less likely the point
    is an anomaly.  You may take the negative of the score, if you want
    something to directly indicate the severity of anomaly.

    Args:
        values (np.ndarray): 2-D float32 array, the KPI observations.

    Returns:
        np.ndarray: The `reconstruction probability`,
            1-D array if `last_point_only` is :obj:`True`,
            or 2-D array if `last_point_only` is :obj:`False`.
    """
    with tf.name_scope('Predictor.get_score'):
        sess = get_default_session_or_error()
        collector = []
        collector_z = []

        # validate the arguments
        values = np.asarray(values, dtype=np.float32)
        if len(values.shape) != 2:
            raise ValueError('`values` must be a 2-D array')

        # run the prediction in mini-batches
        sliding_window = BatchSlidingWindow(
            array_size=len(values),
            window_size=self.model.window_length,
            batch_size=self._batch_size,
        )
        pred_time = []
        for b_x, in sliding_window.get_iterator([values]):
            start_iter_time = time.time()
            input_adj = get_adj(b_x[..., :self._model.x_dims],
                                self._model.config.gcn_type)
            feed_dict = dict(six.iteritems(self._feed_dict))
            feed_dict[self._input_x] = b_x[..., :self._model.x_dims]
            feed_dict[self._input_feature] = b_x[..., self._model.x_dims:]
            feed_dict[self._input_adj] = input_adj
            # b_r: the reconstruction scores of one mini-batch
            b_r, q_net_z = sess.run(self._get_score_without_y(),
                                    feed_dict=feed_dict)
            collector.append(b_r)
            pred_time.append(time.time() - start_iter_time)
            collector_z.append(q_net_z)

        # merge the results of mini-batches
        result = np.concatenate(collector, axis=0)
        result_z = np.concatenate(collector_z, axis=0)
        return result, result_z, np.mean(pred_time)
def test_get_default_session_or_error(self):
    def do_raise():
        with self.assertRaises(RuntimeError) as cm:
            get_default_session_or_error()
        self.assertIn('No session is active.', str(cm.exception))

    do_raise()
    with self.get_session() as sess:
        self.assertIs(sess, get_default_session_or_error())
    do_raise()
def get_refactor_probability(self, values, missing=None):
    """
    Get the `reconstruction probability` of the specified KPI observations.

    The larger the `reconstruction probability`, the less likely the point
    is an anomaly.  You may take the negative of the score, if you want
    something to directly indicate the severity of anomaly.

    Args:
        values (np.ndarray): 1-D float32 array, the KPI observations.
        missing (np.ndarray): 1-D int32 array, the indicator of missing
            points.  If :obj:`None`, missing point injection is disabled.
            (default :obj:`None`)

    Returns:
        np.ndarray: The `reconstruction probability`,
            1-D array if `last_point_only` is :obj:`True`,
            or 2-D array if `last_point_only` is :obj:`False`.
    """
    tc = TimeCounter()
    tc.start()
    with tf.name_scope('DonutPredictor.get_refactor_probability'):
        sess = get_default_session_or_error()
        collector = []

        # validate the arguments
        values = np.asarray(values, dtype=np.float32)
        if len(values.shape) != 1:
            raise ValueError('`values` must be a 1-D array')

        # run the prediction over each sliding-window mini-batch
        sliding_window = BatchSlidingWindow(array_size=len(values),
                                            window_size=self.model.x_dims,
                                            batch_size=self._batch_size)

        # with missing points
        if missing is not None:
            missing = np.asarray(missing, dtype=np.int32)
            # the shape of `missing` must agree with the shape of `values`
            if missing.shape != values.shape:
                raise ValueError(
                    'The shape of `missing` must agree with the shape of '
                    '`values` ({} vs {})'.format(missing.shape, values.shape))
            for b_x, b_y in sliding_window.get_iterator([values, missing]):
                feed_dict = dict(six.iteritems(self._feed_dict))
                feed_dict[self._input_x] = b_x
                feed_dict[self._input_y] = b_y
                b_r = sess.run(self._get_refactor_probability(),
                               feed_dict=feed_dict)
                collector.append(b_r)
        else:
            for b_x, in sliding_window.get_iterator([values]):
                feed_dict = dict(six.iteritems(self._feed_dict))
                feed_dict[self._input_x] = b_x
                b_r = sess.run(self._get_refactor_probability_without_y(),
                               feed_dict=feed_dict)
                collector.append(b_r)

        # merge the results of the mini-batches
        tc.end()
        test_probability_time = tc.get_s() + " seconds"
        return np.concatenate(collector, axis=0), test_probability_time
def test_prob_with_higher_dimensional_params(self):
    with self.get_session(use_gpu=True):
        x, _, _ = self.get_samples_and_prob(
            **self.extended_dimensional_params)
        x = x[0, ...]
        dist = self.dist_class(**self.extended_dimensional_params)
        prob, log_prob = get_default_session_or_error().run(
            [dist.prob(x), dist.log_prob(x)])
        self.assert_allclose(
            prob, self.prob(x, **self.extended_dimensional_params))
        self.assert_allclose(
            log_prob, self.log_prob(x, **self.extended_dimensional_params))
def add_summary(self, summary, global_step=None):
    """Add a summary object.

    Parameters
    ----------
    summary : bytes | tf.summary.Summary
        The summary object.

    global_step : int | tf.Tensor | tf.Variable
        The global step counter. (optional)
    """
    if isinstance(global_step, (tf.Tensor, tf.Variable)):
        global_step = get_default_session_or_error().run(global_step)
    self._writer.add_summary(summary, global_step=global_step)
def save(self, global_step=None):
    """
    Save the checkpoint to file.

    Args:
        global_step (int or tf.Tensor): The global step counter.
    """
    sess = get_default_session_or_error()
    makedirs(self.save_dir, exist_ok=True)
    self._saver.save(
        sess,
        os.path.join(self.save_dir, self.filename),
        global_step=global_step,
        latest_filename=self.latest_file,
        write_meta_graph=self.save_meta
    )
def collect_metrics(self, metrics, global_step=None):
    """
    Collect the statistics of metrics.

    Args:
        metrics (dict[str, float or np.ndarray or ScheduledVariable]):
            Dict from metrics names to their values.
            For :meth:`format_logs`, there is no difference between
            calling :meth:`collect_metrics` only once, with an array of
            metric values; or calling :meth:`collect_metrics` multiple
            times, with one value at each time.
            However, for the TensorFlow summary writer, only the mean of
            the metric values would be recorded, if calling
            :meth:`collect_metrics` with an array.
        global_step (int or tf.Variable or tf.Tensor): The global step
            counter. (optional)
    """
    from tfsnippet.trainer import ScheduledVariable
    tf_summary_values = []
    for k, v in six.iteritems(metrics):
        if isinstance(v, ScheduledVariable):
            v = v.get()
        v = np.asarray(v)
        self._metrics[k].collect(v)

        if self._summary_writer is not None and \
                (self._summary_skip_pattern is None or
                 not self._summary_skip_pattern.match(k)):
            skip_count = self._metrics_skip_counter.get(k, 0)
            freq_limit = self._summary_commit_freqs.get(k, 1)
            if skip_count + 1 >= freq_limit:
                self._metrics_skip_counter[k] = 0
                tag = self._summary_metric_prefix + k
                tf_summary_values.append(
                    tf.summary.Summary.Value(tag=tag, simple_value=v.mean()))
            else:
                self._metrics_skip_counter[k] = skip_count + 1

    if tf_summary_values:
        summary = tf.summary.Summary(value=tf_summary_values)
        if global_step is not None and \
                isinstance(global_step, (tf.Variable, tf.Tensor)):
            global_step = get_default_session_or_error().run(global_step)
        self._summary_writer.add_summary(summary, global_step=global_step)
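# A hedged usage sketch for `collect_metrics` above; not from the original
# source.  `loop` stands for an object exposing this method (e.g. a tfsnippet
# TrainLoop), and `sess` is an open tf.Session so that a tensor `global_step`
# can be resolved; the metric names are illustrative.
def _example_collect_metrics(loop, sess):
    import numpy as np
    import tensorflow as tf

    step_var = tf.train.get_or_create_global_step()
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        # a scalar and an array of values; for the TensorFlow summary only
        # the mean of the array is recorded
        loop.collect_metrics({'loss': 0.52,
                              'grad_norm': np.array([1.2, 0.9, 1.1])},
                             global_step=step_var)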
def add_graph(self, graph=None, global_step=None):
    """Add graph to the summary.

    Parameters
    ----------
    graph : tf.Graph
        The graph to be added.  If not specified, will add the current
        active graph.

    global_step : int | tf.Tensor | tf.Variable
        The global step counter. (optional)
    """
    if isinstance(global_step, (tf.Tensor, tf.Variable)):
        global_step = get_default_session_or_error().run(global_step)
    self._writer.add_graph(
        graph or tf.get_default_graph(),
        global_step=global_step
    )
def run(self):
    """Run training loop."""
    if self._is_fitting:
        raise RuntimeError('`run()` is not re-entrant.')
    self._is_fitting = True
    try:
        # trigger the before execution event
        self.events.fire(EventKeys.BEFORE_EXECUTION, self)

        # initialize global training status
        session = get_default_session_or_error()
        if self._ensure_variables_initialized:
            ensure_variables_initialized()
        self.loop.print_training_summary()

        for _ in self.loop.iter_epochs():
            # trigger before epoch event
            self.events.fire(EventKeys.BEFORE_EPOCH, self)

            # run steps of this epoch
            for payload in self._iter_steps():
                # trigger before step event
                self.events.fire(EventKeys.BEFORE_STEP, self)

                # run the step
                self._run_step(session, payload)

                # trigger after step events
                self.events.fire(EventKeys.STEP_EVALUATION, self)
                self.events.fire(EventKeys.STEP_ANNEALING, self)
                self.events.fire(EventKeys.STEP_LOGGING, self)
                self.events.reverse_fire(EventKeys.AFTER_STEP, self)

            # trigger after epoch events
            self.events.fire(EventKeys.EPOCH_EVALUATION, self)
            self.events.fire(EventKeys.EPOCH_ANNEALING, self)
            self.events.fire(EventKeys.EPOCH_LOGGING, self)
            self.events.reverse_fire(EventKeys.AFTER_EPOCH, self)

        # trigger the after execution event
        self.events.reverse_fire(EventKeys.AFTER_EXECUTION, self)
    finally:
        self._is_fitting = False
def _run(self, images, output):
    sess = get_default_session_or_error()
    err_msg = ('`images` must be a list of bytes, or a numpy array of '
               'shape (?, ?, ?, 3).')

    if isinstance(images, list):
        for im in images:
            if not isinstance(im, six.binary_type):
                raise TypeError(err_msg)
        input_tensor = self._jpeg_input
        get_image = lambda i: images[i]
    elif isinstance(images, np.ndarray):
        if len(images.shape) != 4 or images.shape[3] != 3:
            raise TypeError(err_msg)
        input_tensor = self._array_input
        get_image = lambda i: images[i:i + 1].astype(np.float32)
    else:
        raise TypeError(err_msg)

    ret = []
    for i in range(len(images)):
        ret.append(sess.run(output, {input_tensor: get_image(i)}))
    return np.concatenate(ret, axis=0)
def restore(self, ignore_non_exist=False):
    """
    Restore the checkpoint from file, if it exists.

    Args:
        ignore_non_exist (bool): Whether or not to ignore the error if
            the checkpoint file does not exist. (default :obj:`False`)

    Raises:
        IOError: If the checkpoint files do not exist, and
            `ignore_non_exist` is not :obj:`True`.
    """
    file_path = self.get_latest_file()
    if file_path:
        sess = get_default_session_or_error()
        self._saver.restore(sess, file_path)
        getLogger(__name__).debug(
            'Restored from checkpoint file %r.', file_path)
    elif not ignore_non_exist:
        raise IOError(
            'Checkpoint file does not exist in directory {}'.format(
                self.save_dir))
def get_score(self, test_iterator):
    """
    Get the `reconstruction probability` of specified KPI observations.

    The larger the `reconstruction probability`, the less likely the point
    is an anomaly.  You may take the negative of the score, if you want
    something to directly indicate the severity of anomaly.

    Args:
        test_iterator: Indexable collection of mini-batches of the KPI
            observations.

    Returns:
        np.ndarray: The `reconstruction probability`,
            1-D array if `last_point_only` is :obj:`True`,
            or 2-D array if `last_point_only` is :obj:`False`.
    """
    with tf.name_scope("Predictor.get_score"):
        sess = get_default_session_or_error()
        collector = []
        collector_z = []
        pred_time = []

        for idx in range(len(test_iterator)):
            b_x = test_iterator[idx]
            start_iter_time = time.time()
            feed_dict = dict(six.iteritems(self._feed_dict))
            feed_dict[self._input_x] = b_x
            b_r, q_net_z = sess.run(self._get_score_without_y(),
                                    feed_dict=feed_dict)
            collector.append(b_r)
            pred_time.append(time.time() - start_iter_time)
            collector_z.append(q_net_z)

        # merge the results of mini-batches
        result = np.concatenate(collector, axis=0)
        result_z = np.concatenate(collector_z, axis=0)
        return result, result_z, np.sum(pred_time)
def restore(self, save_path, session=None):
    """
    Restore from a checkpoint file.

    Args:
        save_path (str): Restore from this checkpoint file.
        session (tf.Session): Restore the variables into this session.
            If not specified, restore into the default session.
    """
    session = session or get_default_session_or_error()

    # restore the variables
    self._saver.restore(session, save_path)

    # restore the states of savable objects
    if self._objects:
        object_states = pkl.loads(self._serial_var.get(session))
        assert isinstance(object_states, dict)

        for key, obj in six.iteritems(self._objects):
            if key not in object_states:
                raise KeyError('Object `{}` not found in the checkpoint: '
                               '{}'.format(key, save_path))
            obj.set_state(object_states[key])
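# A minimal sketch pairing the `save` / `restore` methods above; not from the
# original source.  `saver` is assumed to be an instance of the enclosing
# checkpoint-saver class and `sess` an open tf.Session holding its variables.
def _example_save_restore(saver, sess):
    with sess.as_default():
        # `save` writes the variables (and savable-object states) and
        # returns the checkpoint path
        ckpt_path = saver.save(global_step=100)
        # ... later, restore both the variables and the object states
        saver.restore(ckpt_path)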
def collect_metrics(self, metrics, global_step=None):
    """
    Collect the statistics of metrics.

    Args:
        metrics (dict[str, float or np.ndarray or DynamicValue]):
            Dict from metrics names to their values.
            For :meth:`format_logs`, there is no difference between
            calling :meth:`collect_metrics` only once, with an array of
            metric values; or calling :meth:`collect_metrics` multiple
            times, with one value at each time.
            However, for the TensorFlow summary writer, only the mean of
            the metric values would be recorded, if calling
            :meth:`collect_metrics` with an array.
        global_step (int or tf.Variable or tf.Tensor): The global step
            counter. (optional)
    """
    from tfsnippet.trainer import DynamicValue
    tf_summary_values = []
    for k, v in six.iteritems(metrics):
        if isinstance(v, DynamicValue):
            v = v.get()
        v = np.asarray(v)
        self._metrics[k].collect(v)
        if self._summary_writer is not None:
            mean_value = v.mean()
            tf_summary_values.append(
                tf.summary.Summary.Value(tag=k, simple_value=mean_value))

    if tf_summary_values:
        summary = tf.summary.Summary(value=tf_summary_values)
        if global_step is not None and \
                isinstance(global_step, (tf.Variable, tf.Tensor)):
            global_step = get_default_session_or_error().run(global_step)
        self._summary_writer.add_summary(summary, global_step=global_step)
def fit(self, values, valid_portion=0.01, summary_dir=None):
    """
    Train the :class:`OmniAnomaly` model with given data.

    Args:
        values (np.ndarray): 2-D `float32` array, the standardized
            KPI observations.
        valid_portion (float): Ratio of validation data out of all the
            specified training data. (default 0.01)
        summary_dir (str): Optional summary directory for
            :class:`tf.summary.FileWriter`.
            (default :obj:`None`, summary is disabled)
    """
    sess = get_default_session_or_error()

    # split the training & validation set
    values = np.asarray(values, dtype=np.float32)
    if len(values.shape) != 2:
        raise ValueError("`values` must be a 2-D array")

    n = int(len(values) * valid_portion)
    train_values, v_x = values[:-n], values[-n:]

    train_sliding_window = BatchSlidingWindow(
        array_size=len(train_values),
        window_size=self.model.window_length,
        batch_size=self._batch_size,
        shuffle=True,
        ignore_incomplete_batch=True,
    )
    valid_sliding_window = BatchSlidingWindow(
        array_size=len(v_x),
        window_size=self.model.window_length,
        batch_size=self._valid_batch_size,
    )

    # initialize the variables of the trainer, and the model
    sess.run(self._trainer_initializer)
    ensure_variables_initialized(self._train_params)

    # training loop
    lr = self._initial_lr
    with TrainLoop(
        param_vars=self._train_params,
        early_stopping=True,
        summary_dir=summary_dir,
        max_epoch=self._max_epoch,
        max_step=self._max_step,
    ) as loop:  # type: TrainLoop
        loop.print_training_summary()

        train_batch_time = []
        valid_batch_time = []
        time_train_start = time.time()
        for epoch in loop.iter_epochs():
            print("train_values:", train_values.shape)
            train_iterator = train_sliding_window.get_iterator([train_values])
            start_time = time.time()
            for step, (batch_x,) in loop.iter_steps(train_iterator):
                # run a training step
                start_batch_time = time.time()
                feed_dict = dict(six.iteritems(self._feed_dict))
                feed_dict[self._learning_rate] = lr
                feed_dict[self._input_x] = batch_x
                loss, _ = sess.run([self._loss, self._train_op],
                                   feed_dict=feed_dict)
                loop.collect_metrics({"loss": loss})
                train_batch_time.append(time.time() - start_batch_time)

                # if step % self._valid_step_freq == 0:
                #     train_duration = time.time() - start_time
                #     loop.collect_metrics({"train_time": train_duration})
                #     # collect variable summaries
                #     if summary_dir is not None:
                #         loop.add_summary(sess.run(self._summary_op))
                #
                #     # do validation in batches
                #     with loop.timeit("valid_time"), \
                #             loop.metric_collector("valid_loss") as mc:
                #         v_it = valid_sliding_window.get_iterator([v_x])
                #         for (b_v_x,) in v_it:
                #             start_batch_time = time.time()
                #             feed_dict = dict(
                #                 six.iteritems(self._valid_feed_dict))
                #             feed_dict[self._input_x] = b_v_x
                #             loss = sess.run(self._loss, feed_dict=feed_dict)
                #             valid_batch_time.append(
                #                 time.time() - start_batch_time)
                #             mc.collect(loss, weight=len(b_v_x))
                #
                #     # print the logs of recent steps
                #     loop.print_logs()
                #     start_time = time.time()

            # anneal the learning rate
            if self._lr_anneal_epochs and epoch % self._lr_anneal_epochs == 0:
                lr *= self._lr_anneal_factor
                loop.println("Learning rate decreased to {}".format(lr),
                             with_tag=True)

        time_train_end = time.time()
        return {
            # "best_valid_loss": float(loop.best_valid_metric),
            "train_time": np.sum(train_batch_time),
            "valid_time": 0,
            "total_train_time": time_train_end - time_train_start,
        }
def collect_outputs(outputs, inputs, data_flow, mode='concat', axis=0,
                    feed_dict=None, session=None):
    """
    Run TensorFlow nodes by mini-batch and collect outputs from each batch.

    Args:
        outputs (Iterable[tf.Tensor] or dict[str, tf.Tensor]): The output
            tensors to be computed.
        inputs (Iterable[tf.Tensor]): Input placeholders.
        data_flow (DataFlow): Data flow to feed the input placeholders.
        mode ({'concat', 'average'}): If "concat", will concatenate the
            outputs from each mini-batch.  If "average", the output from
            each batch must be a scalar, and this method will take the
            average of the outputs from each mini-batch, weighted
            according to the batch size.
        axis (int): The axis for concatenation.
        feed_dict: Optional, additional feed dict.
        session: The TensorFlow session.  If not specified, use the
            default session.

    Returns:
        tuple[np.ndarray] or dict[str, np.ndarray]: The collected outputs.
            Returns a dict if `outputs` is a dict, or a tuple otherwise.
    """
    mode = validate_enum_arg('mode', mode, ['concat', 'average'])
    session = session or get_default_session_or_error()

    if isinstance(outputs, (dict, OrderedDict)):
        output_keys = list(outputs)
        outputs = [tf.convert_to_tensor(outputs[k]) for k in output_keys]
    else:
        output_keys = None
        outputs = [tf.convert_to_tensor(o) for o in outputs]
    inputs = [tf.convert_to_tensor(i) for i in inputs]

    # check the shape of output tensors
    for i, o in enumerate(outputs):
        o_shape = o.get_shape()
        if mode == 'concat':
            if o_shape.ndims is not None and o_shape.ndims < 1:
                raise ValueError('`mode` is "concat", but the {}-th output '
                                 'is a scalar: {!r}'.format(i, o))
        else:
            if o_shape.ndims is not None and o_shape.ndims > 0:
                raise ValueError('`mode` is "average", but the {}-th output '
                                 'is not a scalar: {!r}'.format(i, o))

    collected = [[] for _ in range(len(outputs))]
    weights = []

    for batch in data_flow:
        weights.append(len(batch[0]))
        batch_feed_dict = merge_feed_dict(
            feed_dict,
            {k: v for (k, v) in zip(inputs, batch)}
        )
        batch_feed_dict = resolve_feed_dict(batch_feed_dict)
        for i, o in enumerate(session.run(outputs, feed_dict=batch_feed_dict)):
            collected[i].append(o)

    weights = np.asarray(weights, dtype=np.float32)
    for i, batches in enumerate(collected):
        if mode == 'average':
            stacked = np.stack(batches, axis=0)
            assert len(stacked.shape) == 1
            collected[i] = np.average(stacked, axis=0, weights=weights)
        else:
            collected[i] = np.concatenate(batches, axis=axis)

    if output_keys is not None:
        collected = dict(zip(output_keys, collected))
    else:
        collected = tuple(collected)
    return collected
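# A hedged sketch contrasting the 'concat' and 'average' modes of the extended
# `collect_outputs` above; not from the original source.  The tfsnippet-style
# `DataFlow.arrays` constructor and its import path are assumptions, and the
# tensor names are illustrative only.
def _example_collect_outputs_modes():
    import numpy as np
    import tensorflow as tf
    from tfsnippet.dataflows import DataFlow  # assumed import path

    x_ph = tf.placeholder(tf.float32, shape=[None, 3])
    per_sample = x_ph + 1.0             # (batch, 3): valid for mode='concat'
    batch_mean = tf.reduce_mean(x_ph)   # scalar: valid for mode='average'
    data = np.random.randn(8, 3).astype(np.float32)
    flow = DataFlow.arrays([data], batch_size=4)

    with tf.Session().as_default():
        (concat_out,) = collect_outputs([per_sample], [x_ph], flow,
                                        mode='concat')
        averaged = collect_outputs({'mean': batch_mean}, [x_ph], flow,
                                   mode='average')
        assert concat_out.shape == (8, 3)
        assert np.ndim(averaged['mean']) == 0  # batch-size weighted average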