def evaluate(self, sess):
        """Evaluate the model on the eval split and report per-horizon metrics.

        The eval loader is run once through ``self.model`` with
        ``training=False``; the resulting loss is written to the summary
        writer as ``loss/test_loss``.  For every horizon index of
        ``self._data['y_eval']`` the inverse-transformed predictions are
        scored against the inverse-transformed targets, the scores are
        logged, and RMSE/MAE/MSE are written as summaries.

        Args:
            sess: Session used to read the global step and run the epoch.

        Returns:
            dict with two lists holding one array per horizon:
            ``'predictions'`` and ``'groundtruth'`` (both inverse-transformed
            with ``self._data['scaler']``).
        """
        step = sess.run(tf.train.get_or_create_global_step())
        eval_results = self.run_epoch_generator(
            sess,
            self.model,
            self._data['eval_loader'].get_iterator(),
            return_output=True,
            training=False)

        eval_loss = eval_results['loss']
        batch_outputs = eval_results['outputs']
        utils.add_simple_summary(self._writer, ['loss/test_loss'], [eval_loss],
                                 global_step=step)

        # Each batch output is (batch_size, horizon, num_nodes, output_dim);
        # stack the batches along the first axis.
        stacked = np.concatenate(batch_outputs, axis=0)
        scaler = self._data['scaler']
        predictions, y_truths = [], []
        tag_stems = ['metric/rmse', 'metric/mae', 'metric/mse']
        horizon_count = self._data['y_eval'].shape[1]

        for idx in range(horizon_count):
            horizon_no = idx + 1  # horizons are reported 1-based

            truth = scaler.inverse_transform(
                self._data['y_eval'][:, idx, :, 0])
            y_truths.append(truth)

            pred = scaler.inverse_transform(stacked[:, idx, :, 0])
            predictions.append(pred)

            # All four metrics share the same arguments; zeros in the labels
            # are passed as the null value.
            scored = dict(preds=pred, labels=truth, null_val=0)
            mse = metrics.masked_mse_np(**scored)
            mae = metrics.masked_mae_np(**scored)
            mape = metrics.masked_mape_np(**scored)
            rmse = metrics.masked_rmse_np(**scored)

            self._logger.info(
                "Horizon {:02d}, MSE: {:.2f}, MAE: {:.2f}, RMSE: {:.2f}, MAPE: {:.4f}"
                .format(horizon_no, mse, mae, rmse, mape))

            tags = ['%s_%d' % (stem, horizon_no) for stem in tag_stems]
            utils.add_simple_summary(self._writer, tags, [rmse, mae, mse],
                                     global_step=step)

        return {'predictions': predictions, 'groundtruth': y_truths}
Exemple #2
0
    def evaluate(self, sess, **kwargs):
        """Run the test split through the test model and record its loss.

        The loss returned by ``run_epoch_generator`` is logged (under the
        label ``test_mae``) and written to the summary writer as
        ``loss/test_loss`` at the current global step.

        Args:
            sess: Session used to read the global step and run the epoch.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            None.
        """
        global_step = sess.run(tf.train.get_or_create_global_step())
        test_results = self.run_epoch_generator(
            sess,
            'test',
            self._test_model,
            self._data['test_loader'].get_iterator(),
            self._data['scaler'],
            return_output=True,
            training=False)

        test_loss = test_results['loss']
        # ``.item()`` is the replacement for ``np.asscalar``, which was
        # deprecated in NumPy 1.16 and removed in 1.23.
        self._logger.info('test_mae: %f', test_loss.item())
        utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss],
                                 global_step=global_step)
        return
Exemple #3
0
    def evaluate(self, sess, data_test, **kwargs):
        """Run ``data_test['test_loader']`` through the test model.

        The epoch loss is written to the summary writer as
        ``loss/test_loss`` at the current global step.

        Args:
            sess: Session used to read the global step and run the epoch.
            data_test: Mapping that provides a ``'test_loader'`` entry with a
                ``get_iterator()`` method.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The per-batch model outputs concatenated along axis 0.
        """
        step = sess.run(tf.train.get_or_create_global_step())
        results = self.run_epoch_generator(
            sess,
            self._test_model,
            data_test['test_loader'].get_iterator(),
            return_output=True,
            training=False)

        loss = results['loss']
        # Per-batch outputs: a list of
        # (batch_size, horizon, num_nodes, output_dim) arrays.
        batch_outputs = results['outputs']
        utils.add_simple_summary(self._writer, ['loss/test_loss'], [loss],
                                 global_step=step)

        return np.concatenate(batch_outputs, axis=0)
    def evaluate(self, sess, **kwargs):
        """Evaluate the test model cluster by cluster and stitch the outputs.

        For each evaluated cluster the adjacency matrix is rebuilt, the
        cluster's ``test_<cluster>.tfrecords`` file is streamed through
        ``self._test_model``, and the predictions at the last horizon
        (``self.horizon - 1``) are inverse-transformed and truncated to the
        cluster's ``node_count`` columns.

        Args:
            sess: Session used to read the global step and run each epoch.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The per-cluster prediction arrays concatenated along axis 1.
        """
        dataset_dir = self._kwargs['data'].get('dataset_dir')

        # The scaler file does not depend on the cluster, so load it once
        # instead of re-reading it from disk on every loop iteration.
        scaler_data_ = np.load(dataset_dir + '/scaler.npy')
        mean, var = scaler_data_[0], scaler_data_[1]
        # NOTE(review): the second stored value is named ``var`` but is passed
        # as ``std`` -- confirm which statistic scaler.npy actually holds.
        scaler = StandardScaler(mean=mean, std=var)

        y_preds_all = []
        # NOTE(review): only the first 32 clusters are evaluated; the limit is
        # hard-coded -- confirm this is intended.
        for cluster in self.clusters[0:32]:
            node_count, adj_mx = self.cluster_data(cluster)
            adj_mx = utils.calculate_random_walk_matrix(adj_mx).T
            adj_mx = self._build_sparse_matrix(adj_mx)
            global_step = sess.run(tf.train.get_or_create_global_step())

            # Stream this cluster's test records through a one-shot iterator.
            test_data_path = dataset_dir + '/test_' + str(cluster) + '.tfrecords'
            test_dataset = tf.data.TFRecordDataset([test_data_path])
            test_dataset = test_dataset.map(self._parse_record_fn)
            test_dataset = test_dataset.make_one_shot_iterator()
            test_next_element = test_dataset.get_next()

            test_results = self.run_epoch_generator(sess,
                                                    self._test_model,
                                                    test_next_element,
                                                    adj_mx,
                                                    return_output=True,
                                                    training=False)
            test_loss, y_preds = test_results['loss'], test_results['outputs']
            utils.add_simple_summary(self._writer, ['loss/test_loss'],
                                     [test_loss],
                                     global_step=global_step)

            y_preds = np.concatenate(y_preds, axis=0)
            # Keep only the final horizon and the first output feature.
            y_preds = scaler.inverse_transform(y_preds[:, self.horizon - 1, :,
                                                       0])
            # Drop columns beyond this cluster's real node count.
            y_preds = y_preds[:, 0:node_count]

            y_preds_all.append(y_preds)

        return np.concatenate(y_preds_all, axis=1)
Exemple #5
0
    def _train(self,
               sess,
               base_lr,
               epoch,
               steps,
               patience=50,
               epochs=100,
               min_learning_rate=2e-6,
               lr_decay_ratio=0.1,
               save_model=1,
               test_every_n_epochs=10,
               save_epoch_interval=5,
               **train_kwargs):
        """Run the training loop with validation, periodic saving and early stopping.

        Args:
            sess: Session used for all graph executions.
            base_lr: Learning rate before any decay is applied.
            epoch: Epoch of the restored checkpoint; training resumes at
                ``epoch + 1`` when ``model_filename`` is given.
            steps: Epoch thresholds; the learning rate is multiplied by
                ``lr_decay_ratio`` once for every threshold the current epoch
                has reached.
            patience: Number of non-improving epochs tolerated before early
                stopping.
            epochs: Last epoch to train (inclusive).
            min_learning_rate: Lower bound for the decayed learning rate.
            lr_decay_ratio: Multiplicative decay factor per reached step.
            save_model: Checkpoints are written only when this is > 0.
            test_every_n_epochs: ``self.evaluate`` runs on epochs where
                ``epoch % n == n - 1``.
            save_epoch_interval: A checkpoint is written on every epoch that
                is a multiple of this value.
            **train_kwargs: Reads ``max_to_keep`` (default 100) and
                ``model_filename`` (checkpoint path relative to
                ``self._kwargs['base_dir']``).

        Returns:
            The smallest validation MAE seen, or ``float('inf')`` when no
            epoch completed (e.g. the first epoch was aborted).
        """
        history = []
        min_val_loss = float('inf')
        wait = 0

        max_to_keep = train_kwargs.get('max_to_keep', 100)
        # Always build the saver.  Previously it was skipped whenever
        # ``model_metaname`` was supplied, which left ``saver`` unbound and
        # made the restore below fail with UnboundLocalError.
        saver = tf.train.Saver(tf.global_variables(),
                               max_to_keep=max_to_keep)

        model_filename = train_kwargs.get('model_filename')

        if model_filename is not None:
            saver.restore(
                sess, os.path.join(self._kwargs['base_dir'], model_filename))
            self._epoch = epoch + 1
        else:
            sess.run(tf.global_variables_initializer())
        self._logger.info('Start training ...')

        while self._epoch <= epochs:

            # Learning rate schedule.
            new_lr = max(
                min_learning_rate,
                base_lr *
                (lr_decay_ratio**np.sum(self._epoch >= np.array(steps))))
            self.set_lr(sess=sess, lr=new_lr)

            start_time = time.time()
            train_results = self.run_epoch_generator(
                sess,
                'train',
                self._train_model,
                self._data['train_loader'].get_iterator(),
                None,
                training=True,
                return_output=True,
                writer=self._writer)
            train_loss, train_mae = train_results['loss'], train_results['mae']
            if train_loss > 1e5:
                self._logger.warning('Gradient explosion detected. Ending...')
                break

            global_step = sess.run(tf.train.get_or_create_global_step())

            # Compute validation error.
            print(
                '--------------------------------------------------------------------------------'
            )
            val_results = self.run_epoch_generator(
                sess,
                'val',
                self._test_model,
                self._data['val_loader'].get_iterator(),
                None,
                return_output=True,
                training=False)
            # ``.item()`` replaces ``np.asscalar`` (removed in NumPy 1.23).
            val_loss = val_results['loss'].item()
            val_mae = val_results['mae'].item()

            # Debug output: print a few un-scaled truth/prediction values for
            # every horizon.
            y_preds = val_results['outputs']
            scaler = self._data['scaler']
            y_preds = np.concatenate(y_preds, axis=0)
            for horizon_i in range(self._data['y_val'].shape[1]):
                y_truth = scaler.inverse_transform(
                    self._data['y_val'][:, horizon_i, :, 0])
                print('truth', y_truth[0, :5])

                y_pred = scaler.inverse_transform(y_preds[0:5, horizon_i, :,
                                                          0])
                print('pred', y_pred[0, :5])

            utils.add_simple_summary(self._writer, [
                'loss/train_loss', 'metric/train_mae', 'loss/val_loss',
                'metric/val_mae'
            ], [train_loss, train_mae, val_loss, val_mae],
                                     global_step=global_step)
            end_time = time.time()
            message = 'Epoch [{}/{}] ({}) train_mae: {:.4f}, val_mae: {:.4f} lr:{:.6f} {:.1f}s'.format(
                self._epoch, epochs, global_step, train_mae, val_mae, new_lr,
                (end_time - start_time))
            self._logger.info(message)
            if self._epoch % test_every_n_epochs == test_every_n_epochs - 1:
                self.evaluate(sess)

            if val_loss < min_val_loss:
                self._logger.info(('Val loss decrease from %.4f to %.4f') %
                                  (min_val_loss, val_loss))
                min_val_loss = val_loss
                # The patience counter restarts on improvement.  It used to be
                # reset on every periodic save instead, so with the default
                # interval (5) it could never exceed the default patience (50)
                # and early stopping never triggered.
                wait = 0
            else:
                wait += 1
                if wait > patience:
                    self._logger.warning('Early stopping at epoch: %d' %
                                         self._epoch)
                    break

            # Periodic checkpoint, independent of whether the loss improved.
            if self._epoch % save_epoch_interval == 0:
                if save_model > 0:
                    model_filename = self.save(sess, val_loss)
                    self._logger.info(
                        'min Val loss  %.4f ,Val loss %.4f, saving to %s' %
                        (min_val_loss, val_loss, model_filename))

            history.append(val_mae)
            # Increases epoch.
            self._epoch += 1

            sys.stdout.flush()

        # np.min raises on an empty sequence; report "no result" as +inf.
        return np.min(history) if history else float('inf')
Exemple #6
0
    def _train(self,
               sess,
               base_lr,
               epoch,
               steps,
               patience=50,
               epochs=100,
               min_learning_rate=2e-6,
               lr_decay_ratio=0.1,
               save_model=1,
               test_every_n_epochs=10,
               **train_kwargs):
        """Run the training loop with validation, checkpointing and early stopping.

        Args:
            sess: Session used for all graph executions.
            base_lr: Learning rate before any decay is applied.
            epoch: Epoch of the restored checkpoint; training resumes at
                ``epoch + 1`` when ``model_filename`` is given.
            steps: Epoch thresholds; the learning rate is multiplied by
                ``lr_decay_ratio`` once for every threshold the current epoch
                has reached.
            patience: Number of non-improving epochs tolerated before early
                stopping.
            epochs: Last epoch to train (inclusive).
            min_learning_rate: Lower bound for the decayed learning rate.
            lr_decay_ratio: Multiplicative decay factor per reached step.
            save_model: A checkpoint is written on improvement only when this
                is > 0.
            test_every_n_epochs: ``self.evaluate`` runs on epochs where
                ``epoch % n == n - 1``.
            **train_kwargs: Reads ``max_to_keep`` (default 100) and
                ``model_filename`` (checkpoint to restore).

        Returns:
            The smallest validation MAE seen, or ``float('inf')`` when no
            epoch completed.
        """
        history = []
        min_val_loss = float('inf')
        wait = 0

        max_to_keep = train_kwargs.get('max_to_keep', 100)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
        model_filename = train_kwargs.get('model_filename')
        if model_filename is not None:
            saver.restore(sess, model_filename)
            self._epoch = epoch + 1
        else:
            sess.run(tf.global_variables_initializer())
        self._logger.info('Start training ...')

        while self._epoch <= epochs:
            # Learning rate schedule.
            new_lr = max(
                min_learning_rate,
                base_lr *
                (lr_decay_ratio**np.sum(self._epoch >= np.array(steps))))
            self.set_lr(sess=sess, lr=new_lr)

            start_time = time.time()
            train_results = self.run_epoch_generator(
                sess,
                self._train_model,
                self._data['train_loader'].get_iterator(),
                training=True,
                writer=self._writer)

            train_loss, train_mae = train_results['loss'], train_results['mae']

            if train_loss > 1e5:
                self._logger.warning('Gradient explosion detected. Ending...')
                break

            global_step = sess.run(tf.train.get_or_create_global_step())
            # Compute validation error.
            val_results = self.run_epoch_generator(
                sess,
                self._test_model,
                self._data['val_loader'].get_iterator(),
                training=False)
            # ``.item()`` replaces ``np.asscalar`` (removed in NumPy 1.23).
            val_loss = val_results['loss'].item()
            val_mae = val_results['mae'].item()

            utils.add_simple_summary(self._writer, [
                'loss/train_loss', 'metric/train_mae', 'loss/val_loss',
                'metric/val_mae'
            ], [train_loss, train_mae, val_loss, val_mae],
                                     global_step=global_step)
            end_time = time.time()
            message = 'Epoch [{}/{}] ({}) train_mae: {:.4f}, val_mae: {:.4f} lr:{:.6f} {:.1f}s'.format(
                self._epoch, epochs, global_step, train_mae, val_mae, new_lr,
                (end_time - start_time))
            self._logger.info(message)
            # Honour the caller's ``test_every_n_epochs``; a leftover
            # in-loop assignment used to force it to 1 on every iteration.
            if self._epoch % test_every_n_epochs == test_every_n_epochs - 1:
                self.evaluate(sess)
            if val_loss <= min_val_loss:
                wait = 0
                if save_model > 0:
                    model_filename = self.save(sess, val_loss)
                self._logger.info(
                    'Val loss decrease from %.4f to %.4f, saving to %s' %
                    (min_val_loss, val_loss, model_filename))
                min_val_loss = val_loss
            else:
                wait += 1
                if wait > patience:
                    self._logger.warning('Early stopping at epoch: %d' %
                                         self._epoch)
                    break

            history.append(val_mae)
            # Increases epoch.
            self._epoch += 1

            sys.stdout.flush()

        # np.min raises on an empty sequence; report "no result" as +inf.
        return np.min(history) if history else float('inf')
    def _test(self, sess, **kwargs):
        """Run the test prediction ``self._run_times`` times and save a metrics table.

        Each run calls ``self._run_tm_prediction`` on the test model, logs
        MSE/MAE/RMSE/MAPE for every horizon, computes the error ratio (ER) of
        the predicted matrices against the ground truth, and stores the raw
        results through ``self._save_results``.  After all runs the collected
        metrics are written to ``results_summary.csv`` (path built by
        appending the file name to ``self._log_dir``).

        Args:
            sess: Session used to read the global step and run predictions.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            None.
        """
        global_step = sess.run(tf.train.get_or_create_global_step())

        results_summary = pd.DataFrame(index=range(self._run_times))
        results_summary['No.'] = range(self._run_times)

        n_metrics = 4
        # Column layout per run: (MSE, MAE, RMSE, MAPE) repeated for every
        # horizon, followed by one trailing ER column.
        metrics_summary = np.zeros(shape=(self._run_times, self._horizon * n_metrics + 1))

        for i in range(self._run_times):
            self._logger.info('|--- Run time: {}'.format(i))

            test_results = self._run_tm_prediction(sess, model=self._test_model)

            test_loss = test_results['loss']
            utils.add_simple_summary(self._writer, ['loss/test_loss'], [test_loss], global_step=global_step)

            # y_preds: a list of (batch_size, horizon, num_nodes, output_dim);
            # y_truths is indexed the same way below.
            y_preds = np.concatenate(test_results['y_preds'], axis=0)
            y_truths = np.concatenate(test_results['y_truths'], axis=0)
            scaler = self._data['scaler']

            for horizon_i in range(self._horizon):
                y_truth = scaler.inverse_transform(y_truths[:, horizon_i, :, 0])
                y_pred = scaler.inverse_transform(y_preds[:, horizon_i, :, 0])

                mse = metrics.masked_mse_np(preds=y_pred, labels=y_truth, null_val=0)
                mae = metrics.masked_mae_np(preds=y_pred, labels=y_truth, null_val=0)
                mape = metrics.masked_mape_np(preds=y_pred, labels=y_truth, null_val=0)
                rmse = metrics.masked_rmse_np(preds=y_pred, labels=y_truth, null_val=0)
                self._logger.info(
                    "Horizon {:02d}, MSE: {:.2f}, MAE: {:.2f}, RMSE: {:.2f}, MAPE: {:.4f}".format(
                        horizon_i + 1, mse, mae, rmse, mape
                    )
                )
                base = horizon_i * n_metrics
                metrics_summary[i, base + 0] = mse
                metrics_summary[i, base + 1] = mae
                metrics_summary[i, base + 2] = rmse
                metrics_summary[i, base + 3] = mape

            tm_pred = scaler.inverse_transform(test_results['tm_pred'])
            # Ground truth is the normalised test data without its first
            # ``_seq_len`` and last ``_horizon`` entries.
            g_truth = scaler.inverse_transform(self._data['test_data_norm'][self._seq_len:-self._horizon])
            m_indicator = test_results['m_indicator']
            er = error_ratio(y_pred=tm_pred,
                             y_true=g_truth,
                             measured_matrix=m_indicator)
            metrics_summary[i, -1] = er

            self._save_results(g_truth=g_truth, pred_tm=tm_pred, m_indicator=m_indicator, tag=str(i))

            print('ER: {}'.format(er))

        # Column names use the 0-based horizon index.
        for horizon_i in range(self._horizon):
            base = horizon_i * n_metrics
            results_summary['mse_{}'.format(horizon_i)] = metrics_summary[:, base + 0]
            results_summary['mae_{}'.format(horizon_i)] = metrics_summary[:, base + 1]
            results_summary['rmse_{}'.format(horizon_i)] = metrics_summary[:, base + 2]
            results_summary['mape_{}'.format(horizon_i)] = metrics_summary[:, base + 3]

        results_summary['er'] = metrics_summary[:, -1]
        results_summary.to_csv(self._log_dir + 'results_summary.csv', index=False)

        return
    def _train(self,
               sess,
               base_lr,
               epoch,
               steps,
               patience=50,
               epochs=100,
               min_learning_rate=2e-6,
               lr_decay_ratio=0.1,
               save_model=1,
               test_every_n_epochs=10,
               **train_kwargs):
        """Run the training loop and write the loss history to CSV.

        Args:
            sess: Session used for all graph executions.
            base_lr: Learning rate before any decay is applied.
            epoch: Epoch of the restored checkpoint; training resumes at
                ``epoch + 1`` when a checkpoint is restored.
            steps: Epoch thresholds; the learning rate is multiplied by
                ``lr_decay_ratio`` once for every threshold the current epoch
                has reached.
            patience: Number of non-improving epochs tolerated before early
                stopping.
            epochs: Last epoch to train (inclusive).
            min_learning_rate: Lower bound for the decayed learning rate.
            lr_decay_ratio: Multiplicative decay factor per reached step.
            save_model: A checkpoint is written on improvement only when this
                is > 0.
            test_every_n_epochs: Not used by this implementation.
            **train_kwargs: Reads ``max_to_keep`` (default 100),
                ``model_filename`` and ``continue_train``; the checkpoint is
                restored only when ``continue_train`` is True and
                ``model_filename`` is set.

        Returns:
            The smallest validation MSE seen, or ``float('inf')`` when no
            epoch completed.
        """
        history = []
        min_val_loss = float('inf')
        wait = 0

        # One entry per completed epoch; written to training_history.csv.
        epochs_run, losses, val_losses = [], [], []

        max_to_keep = train_kwargs.get('max_to_keep', 100)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
        model_filename = train_kwargs.get('model_filename')
        continue_train = train_kwargs.get('continue_train')
        if continue_train is True and model_filename is not None:
            saver.restore(sess, model_filename)
            self._epoch = epoch + 1
        else:
            sess.run(tf.global_variables_initializer())
        self._logger.info('Start training ...')

        while self._epoch <= epochs:
            self._logger.info('Training epoch: {}/{}'.format(
                self._epoch, epochs))
            # Learning rate schedule.
            new_lr = max(
                min_learning_rate,
                base_lr *
                (lr_decay_ratio**np.sum(self._epoch >= np.array(steps))))
            self.set_lr(sess=sess, lr=new_lr)

            start_time = time.time()
            train_results = self.run_epoch_generator(
                sess,
                self.model,
                self._data['train_loader'].get_iterator(),
                training=True,
                writer=self._writer)
            train_loss, train_mse = train_results['loss'], train_results['mse']
            # NOTE: unlike sibling trainers, no gradient-explosion abort is
            # performed here.

            global_step = sess.run(tf.train.get_or_create_global_step())
            # Compute validation error.
            val_results = self.run_epoch_generator(
                sess,
                self.model,
                self._data['val_loader'].get_iterator(),
                training=False)
            val_loss, val_mse = val_results['loss'].item(
            ), val_results['mse'].item()

            utils.add_simple_summary(self._writer, [
                'loss/train_loss', 'metric/train_mse', 'loss/val_loss',
                'metric/val_mse'
            ], [train_loss, train_mse, val_loss, val_mse],
                                     global_step=global_step)
            end_time = time.time()
            message = 'Epoch [{}/{}] ({}) train_mse: {:f}, val_mse: {:f} lr:{:f} {:.1f}s'.format(
                self._epoch, epochs, global_step, train_mse, val_mse, new_lr,
                (end_time - start_time))
            self._logger.info(message)
            if val_loss <= min_val_loss:
                wait = 0
                if save_model > 0:
                    model_filename = self.save(sess, val_loss)
                self._logger.info(
                    'Val loss decrease from %f to %f, saving to %s' %
                    (min_val_loss, val_loss, model_filename))
                min_val_loss = val_loss
            else:
                wait += 1
                if wait > patience:
                    self._logger.warning('Early stopping at epoch: %d' %
                                         self._epoch)
                    break

            history.append(val_mse)
            # Record the epoch alongside its losses so the three columns
            # always have equal length.  The old ``np.arange(self._epoch)``
            # column was longer than the loss lists whenever training resumed
            # from a checkpoint, which made the DataFrame assignment raise.
            epochs_run.append(self._epoch)
            losses.append(train_loss)
            val_losses.append(val_loss)
            # Increases epoch.
            self._epoch += 1
            sys.stdout.flush()

        training_history = pd.DataFrame({
            'epoch': epochs_run,
            'loss': losses,
            'val_loss': val_losses,
        })
        training_history.to_csv(self._log_dir + 'training_history.csv',
                                index=False)

        # np.min raises on an empty sequence; report "no result" as +inf.
        return np.min(history) if history else float('inf')
Exemple #9
0
    def _train(self,
               sess,
               base_lr,
               epoch,
               steps,
               patience=50,
               epochs=100,
               min_learning_rate=2e-6,
               lr_decay_ratio=0.1,
               save_model=1,
               test_every_n_epochs=10,
               **train_kwargs):
        """Run the training loop, evaluating on the test set after every epoch.

        After each epoch ``self.evaluate`` is called; when the validation
        loss improves, the overall test MAPE and RMSE are logged.

        Args:
            sess: Session used for all graph executions.
            base_lr: Learning rate before any decay is applied.
            epoch: Epoch of the restored checkpoint; training resumes at
                ``epoch + 1`` when ``model_filename`` is given.
            steps: Epoch thresholds; the learning rate is multiplied by
                ``lr_decay_ratio`` once for every threshold the current epoch
                has reached.
            patience: Number of non-improving epochs tolerated before early
                stopping.
            epochs: Last epoch to train (inclusive).
            min_learning_rate: Lower bound for the decayed learning rate.
            lr_decay_ratio: Multiplicative decay factor per reached step.
            save_model: A checkpoint is written on improvement only when this
                is > 0.
            test_every_n_epochs: Not used by this implementation; evaluation
                runs every epoch.
            **train_kwargs: Reads ``max_to_keep`` (default 100) and
                ``model_filename`` (checkpoint to restore).

        Returns:
            The smallest validation MAE seen, or ``float('inf')`` when no
            epoch completed.
        """
        history = []
        min_val_loss = float('inf')
        wait = 0

        max_to_keep = train_kwargs.get('max_to_keep', 100)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
        model_filename = train_kwargs.get('model_filename')

        if model_filename is not None:
            saver.restore(sess, model_filename)
            self._epoch = epoch + 1
        else:
            sess.run(tf.global_variables_initializer())
        self._logger.info('Start training ...')

        while self._epoch <= epochs:
            # Learning rate schedule.
            new_lr = max(
                min_learning_rate,
                base_lr *
                (lr_decay_ratio**np.sum(self._epoch >= np.array(steps))))
            self.set_lr(sess=sess, lr=new_lr)

            start_time = time.time()
            train_results = self.run_epoch_generator(
                sess,
                self._train_model,
                self._data['train_loader'].get_iterator(),
                training=True,
                writer=self._writer)
            train_loss, train_mae = train_results['loss'], train_results['mae']
            if train_loss > 1e5:
                self._logger.warning('Gradient explosion detected. Ending...')
                break

            global_step = sess.run(tf.train.get_or_create_global_step())
            # Compute validation error.
            val_results = self.run_epoch_generator(
                sess,
                self._test_model,
                self._data['val_loader'].get_iterator(),
                training=False)
            # ``.item()`` replaces ``np.asscalar`` (removed in NumPy 1.23).
            val_loss = val_results['loss'].item()
            val_mae = val_results['mae'].item()

            utils.add_simple_summary(self._writer, [
                'loss/train_loss', 'metric/train_mae', 'loss/val_loss',
                'metric/val_mae'
            ], [train_loss, train_mae, val_loss, val_mae],
                                     global_step=global_step)
            end_time = time.time()
            message = 'Epoch [{}/{}] ({}) train_mae: {:.4f}, val_mae: {:.4f} lr:{:.6f} {:.1f}s'.format(
                self._epoch, epochs, global_step, train_mae, val_mae, new_lr,
                (end_time - start_time))
            self._logger.info(message)

            # Evaluate on the test set and time it.
            stt = time.time()
            outputs = self.evaluate(sess)
            # Keep only the first feature of each output array.
            test_gdt = outputs['groundtruth'][:, :, :, 0]
            test_y = outputs['observed'][:, :, :, 0]
            # Predictions are truncated to the number of ground-truth samples.
            best_pred = outputs['predictions'][:test_gdt.shape[0], :, :, 0]
            # Flatten to 2-D in column-major order: record the corresponding
            # steps for each node first, then node by node.
            test_gdt = test_gdt.reshape(test_gdt.shape[0], -1, order='F')
            test_y = test_y.reshape(test_gdt.shape[0], -1, order='F')
            best_pred = best_pred.reshape(test_gdt.shape[0], -1, order='F')
            scaler = self._data['scaler']
            print('Test running time: %fs' % (time.time() - stt))

            if val_loss <= min_val_loss:
                wait = 0
                if save_model > 0:
                    model_filename = self.save(sess, val_loss)
                self._logger.info(
                    'Val loss decrease from %.4f to %.4f, saving to %s' %
                    (min_val_loss, val_loss, model_filename))
                min_val_loss = val_loss
                # Score the test predictions in the original data scale.
                # NOTE(review): ``test_gdt`` is passed un-scaled as a third
                # positional argument -- confirm its role in the metrics.
                test_y = scaler.inverse_transform(test_y)
                best_pred = scaler.inverse_transform(best_pred)
                mape = metrics.masked_mape_np(best_pred,
                                              test_y,
                                              test_gdt,
                                              null_val=0)
                rmse = metrics.masked_rmse_np(best_pred,
                                              test_y,
                                              test_gdt,
                                              null_val=0)
                self._logger.info('Overall Test MAPE %.4f, RMSE %.4f' %
                                  (mape, rmse))
                print(best_pred.shape)
            else:
                wait += 1
                if wait > patience:
                    self._logger.warning('Early stopping at epoch: %d' %
                                         self._epoch)
                    break

            history.append(val_mae)
            # Increases epoch.
            self._epoch += 1

            sys.stdout.flush()

        # np.min raises on an empty sequence; report "no result" as +inf.
        return np.min(history) if history else float('inf')
Exemple #10
0
    def _train(self,
               sess,
               base_lr,
               epoch,
               steps,
               patience=50,
               epochs=100,
               min_learning_rate=2e-6,
               lr_decay_ratio=0.1,
               save_model=1,
               test_every_n_epochs=10,
               **train_kwargs):
        """Run the training loop with validation, checkpointing and early stopping.

        Args:
            sess: Session used for all graph executions.
            base_lr: Learning rate before any decay is applied.
            epoch: Epoch of the restored checkpoint; training resumes at
                ``epoch + 1`` when ``model_filename`` is given.
            steps: Epoch thresholds; the learning rate is multiplied by
                ``lr_decay_ratio`` once for every threshold the current epoch
                has reached.
            patience: Number of non-improving epochs tolerated before early
                stopping.
            epochs: Last epoch to train (inclusive).
            min_learning_rate: Lower bound for the decayed learning rate.
            lr_decay_ratio: Multiplicative decay factor per reached step.
            save_model: A checkpoint is written on improvement only when this
                is > 0.
            test_every_n_epochs: ``self.evaluate`` runs on epochs where
                ``epoch % n == n - 1``.
            **train_kwargs: Reads ``max_to_keep`` (default 100) and
                ``model_filename`` (checkpoint to restore).

        Returns:
            The smallest validation MAE seen, or ``float('inf')`` when no
            epoch completed.
        """
        history = []
        min_val_loss = float('inf')
        wait = 0

        max_to_keep = train_kwargs.get('max_to_keep', 100)
        # The first argument (var_list) specifies the variables to save and
        # restore.
        saver = tf.train.Saver(
            tf.global_variables(),
            max_to_keep=max_to_keep)
        model_filename = train_kwargs.get('model_filename')
        if model_filename is not None:
            saver.restore(sess, model_filename)
            self._epoch = epoch + 1
        else:
            sess.run(tf.global_variables_initializer())
        self._logger.info('Start training ...')

        while self._epoch <= epochs:
            # Learning rate schedule.
            new_lr = max(
                min_learning_rate,
                base_lr *
                (lr_decay_ratio**np.sum(self._epoch >= np.array(steps))))
            self.set_lr(sess=sess, lr=new_lr)

            start_time = time.time()
            train_results = self.run_epoch_generator(
                sess,
                self._train_model,
                self._data['train_loader'].get_iterator(),
                training=True,
                writer=self._writer)
            train_loss, train_mae = train_results['loss'], train_results['mae']
            if train_loss > 1e5:
                self._logger.warning('Gradient explosion detected. Ending...')
                break

            # global_step refers to the number of batches seen by the graph.
            # Every time a batch is provided, the weights are updated in the
            # direction that minimizes the loss; global_step just keeps track
            # of the number of batches seen so far.  When it is passed in the
            # minimize() argument list, the variable is increased by one (see
            # optimizer.minimize()).  Its value can also be read with
            # tf.train.global_step(); 0 is its initial value in this context.
            # A good blog post explaining global_step:
            # https://blog.csdn.net/leviopku/article/details/78508951
            global_step = sess.run(tf.train.get_or_create_global_step())

            # Compute validation error.
            val_results = self.run_epoch_generator(
                sess,
                self._test_model,
                self._data['val_loader'].get_iterator(),
                training=False)
            # ``.item()`` replaces ``np.asscalar`` (removed in NumPy 1.23).
            val_loss = val_results['loss'].item()
            val_mae = val_results['mae'].item()

            utils.add_simple_summary(self._writer, [
                'loss/train_loss', 'metric/train_mae', 'loss/val_loss',
                'metric/val_mae'
            ], [train_loss, train_mae, val_loss, val_mae],
                                     global_step=global_step)
            end_time = time.time()
            message = 'Epoch [{}/{}] ({}) train_mae: {:.4f}, val_mae: {:.4f} lr:{:.6f} {:.1f}s'.format(
                self._epoch, epochs, global_step, train_mae, val_mae, new_lr,
                (end_time - start_time))
            self._logger.info(message)
            if self._epoch % test_every_n_epochs == test_every_n_epochs - 1:
                self.evaluate(sess)
            if val_loss <= min_val_loss:
                wait = 0
                if save_model > 0:
                    # save() also records the config_x.yaml configuration
                    # file (x = epoch).
                    model_filename = self.save(sess, val_loss)
                self._logger.info(
                    'Val loss decrease from %.4f to %.4f, saving to %s' %
                    (min_val_loss, val_loss, model_filename))
                min_val_loss = val_loss
            else:
                wait += 1
                if wait > patience:
                    self._logger.warning('Early stopping at epoch: %d' %
                                         self._epoch)
                    break

            history.append(val_mae)
            # Increases epoch.
            self._epoch += 1

            # Write buffered output immediately and clear the buffer instead
            # of passively waiting for it to be flushed.
            sys.stdout.flush()

        # np.min raises on an empty sequence; report "no result" as +inf.
        return np.min(history) if history else float('inf')
Exemple #11
0
    def _train(self,
               sess,
               base_lr,
               epoch,
               steps,
               patience=50,
               epochs=100,
               min_learning_rate=2e-6,
               lr_decay_ratio=0.1,
               save_model=1,
               test_every_n_epochs=10,
               **train_kwargs):
        """Train until ``epochs`` is reached, decaying the learning rate.

        The learning rate is ``base_lr * lr_decay_ratio ** k`` (floored at
        ``min_learning_rate``), where ``k`` is the number of milestones in
        ``steps`` that are ``<= self._epoch``. A decay is also forced early
        when validation loss has not improved for ``patience`` epochs: the
        last saved checkpoint is restored, the epoch counter is rewound by
        the number of non-improving epochs, and a milestone is placed at the
        next epoch.

        Args:
            sess: Session used for running the models and restoring weights.
            base_lr: Initial learning rate.
            epoch: Epoch of the checkpoint being resumed; only used when
                ``train_kwargs['model_filename']`` is given, in which case
                training resumes at ``epoch + 1``.
            steps: Epoch milestones at which the learning rate is decayed.
                The sequence is copied; the caller's object is not modified.
            patience: Number of consecutive non-improving epochs tolerated
                before an early decay is triggered.
            epochs: Training stops once ``self._epoch`` reaches this value.
            min_learning_rate: Lower bound for the learning rate.
            lr_decay_ratio: Multiplicative decay applied per passed milestone.
            save_model: A checkpoint is saved on improvement when ``> 0``.
            test_every_n_epochs: ``self.evaluate`` is run when
                ``self._epoch % test_every_n_epochs`` equals
                ``test_every_n_epochs - 1``.
            **train_kwargs: Reads ``max_to_keep`` (default 100) and
                ``model_filename`` (checkpoint to resume from).

        Returns:
            Tuple ``(val_history, train_history)`` holding the per-epoch
            validation and training MAE values.
        """
        val_history, train_history = [], []
        min_val_loss = float('inf')
        wait = 0
        # Private copy so the caller's list is never mutated. The trailing
        # inf sentinel keeps the final learning rate up to the end and
        # guarantees that indexing by "milestones passed" stays in range.
        steps = list(steps) + [float('inf')]

        def milestones_passed():
            # Number of milestones that are <= the current epoch.
            return int(np.sum(self._epoch >= np.array(steps)))

        max_to_keep = train_kwargs.get('max_to_keep', 100)
        cl_decay_steps = self._model_kwargs.get('cl_decay_steps')
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=max_to_keep)
        model_filename = train_kwargs.get('model_filename')
        if model_filename is not None:
            saver.restore(sess, model_filename)
            self._epoch = epoch + 1
            # The best validation loss so far is parsed from the second
            # '-'-separated field of the checkpoint's base name.
            min_val_loss = float(
                os.path.basename(model_filename).split('-')[1])
        else:
            sess.run(tf.global_variables_initializer())
        self._logger.info('Start training ...')

        while self._epoch < epochs:
            # Learning rate schedule.
            new_lr = max(
                min_learning_rate,
                base_lr * (lr_decay_ratio**milestones_passed()))
            self.set_lr(sess=sess, lr=new_lr)

            start_time = time.time()
            train_results = self.run_epoch_generator(
                sess,
                self._train_model,
                self._data['train_loader'].get_iterator(),
                training=True,
                writer=self._writer)
            train_loss, train_mae = train_results['loss'], train_results['mae']
            if train_loss > 1e5:
                self._logger.warning('Gradient explosion detected. Ending...')
                break

            global_step = sess.run(tf.train.get_or_create_global_step())
            # Compute validation error.
            val_results = self.run_epoch_generator(
                sess,
                self._test_model,
                self._data['val_loader'].get_iterator(),
                training=False)
            # np.asscalar was removed from NumPy; .item() is its replacement.
            val_loss = np.asarray(val_results['loss']).item()
            val_mae = np.asarray(val_results['mae']).item()

            utils.add_simple_summary(self._writer, [
                'loss/train_loss', 'metric/train_mae', 'loss/val_loss',
                'metric/val_mae'
            ], [train_loss, train_mae, val_loss, val_mae],
                                     global_step=global_step)
            end_time = time.time()
            # Current sampling probability of curriculum learning. The
            # session is passed explicitly so no default session is required.
            cl_threshold = self._train_model._compute_sampling_threshold(
                global_step, cl_decay_steps).eval(session=sess)
            message = 'Epoch [{}/{}] ({}) train_mae: {:.5f}, val_mae: {:.5f}, lr: {:.5f}, cl_thres: {:.3f}, t: {:.1f}min'.format(
                self._epoch, epochs, global_step, train_mae, val_mae, new_lr,
                cl_threshold, (end_time - start_time) / 60)
            self._logger.info(message)
            if self._epoch % test_every_n_epochs == test_every_n_epochs - 1:
                self.evaluate(sess)
            if val_loss <= min_val_loss:
                wait = 0
                if save_model > 0:
                    model_filename = self.save(sess, val_loss)
                self._logger.info(
                    'Val loss decrease from %.4f to %.4f, saving to %s' %
                    (min_val_loss, val_loss, model_filename))
                min_val_loss = val_loss
            else:
                wait += 1
                if wait >= patience:
                    self._logger.warning('Lowering learning rate preliminary.')

            val_history.append(val_mae)
            train_history.append(train_mae)

            # Restore best weights before lowering lr in next epoch.
            if (steps[milestones_passed()] == self._epoch + 1
                    or wait >= patience):
                if model_filename is not None:
                    self._logger.info('Restore model from epoch {}: {}'.format(
                        (self._epoch - wait),
                        os.path.basename(model_filename)))
                    saver.restore(sess, model_filename)
                    # NOTE(review): rewinding the epoch means the loop may
                    # run more than ``epochs`` iterations in total when
                    # validation loss keeps stalling -- confirm intended.
                    self._epoch = self._epoch - wait  # Go back to epoch...
                else:
                    # Nothing was ever saved (e.g. save_model <= 0 and no
                    # resume checkpoint): keep the current weights and epoch
                    # instead of crashing on a missing checkpoint path.
                    self._logger.warning(
                        'No checkpoint available to restore; '
                        'keeping current weights.')
                next_idx = milestones_passed()
                if next_idx == len(steps) - 1:
                    # Every finite milestone is already behind us: insert the
                    # new one before the inf sentinel rather than overwriting
                    # the sentinel, which would make the next lookup run off
                    # the end of the list.
                    steps.insert(next_idx, self._epoch + 1)
                else:
                    steps[next_idx] = self._epoch + 1
                wait = 0  # Reset patience iterator.

            # Increases epoch.
            self._epoch += 1

            sys.stdout.flush()
        return val_history, train_history