Example #1
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval
        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = RepMem(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(input_shape, init=he_uniform(scale=0.01)),
                Dense(input_shape),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])

        self._action_value_net.update_signature(Tensor[input_shape])

        self._target_net = self._action_value_net.clone(CloneMethod.freeze)


        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            q_targets = compute_q_targets(post_states, rewards, terminals)

            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            return huber_loss(q_targets, q_acted, 1.0)

        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
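The element_select call above encodes the standard Q-learning target: the plain reward for terminal transitions, and the reward plus the discounted maximum target-network Q-value otherwise. A minimal NumPy sketch (not part of the example; plain batch arrays are assumed) that mirrors the same rule can be handy for sanity-checking:

import numpy as np

def q_targets_numpy(post_q_values, rewards, terminals, gamma=0.99):
    # post_q_values: (batch, nb_actions) Q-values from the frozen target network
    # terminals: 1.0 for terminal transitions, 0.0 otherwise
    max_post_q = post_q_values.max(axis=1)
    return np.where(terminals.astype(bool), rewards, rewards + gamma * max_post_q)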
Example #2
    def build_trainer(self):

        # Set the learning rate and the momentum parameters for the Adam optimizer.
        lr = learning_rate_schedule(self.lr, UnitType.minibatch)
        beta1 = momentum_schedule(0.9)
        beta2 = momentum_schedule(0.99)

        # Calculate the losses.
        loss_on_v = cntk.squared_error(self.R, self.v)
        pi_a_s = cntk.log(cntk.times_transpose(self.pi, self.action))

        loss_on_pi = cntk.variables.Constant(-1) * (cntk.plus(
            cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc)),
            0.01 * cntk.times_transpose(self.pi, cntk.log(self.pi))))
        #loss_on_pi = cntk.times(pi_a_s, cntk.minus(self.R, self.v_calc))

        self.tensorboard_v_writer = TensorBoardProgressWriter(
            freq=10, log_dir="tensorboard_v_logs", model=self.v)
        self.tensorboard_pi_writer = TensorBoardProgressWriter(
            freq=10, log_dir="tensorboard_pi_logs", model=self.pi)

        # tensorboard --logdir=tensorboard_pi_logs  http://localhost:6006/
        # tensorboard --logdir=tensorboard_v_logs  http://localhost:6006/

        # Create the trainers.
        self.trainer_v = cntk.Trainer(self.v, (loss_on_v), [
            adam(self.pms_v,
                 lr,
                 beta1,
                 variance_momentum=beta2,
                 gradient_clipping_threshold_per_sample=2,
                 l2_regularization_weight=0.01)
        ], self.tensorboard_v_writer)
        self.trainer_pi = cntk.Trainer(self.pi, (loss_on_pi), [
            adam(self.pms_pi,
                 lr,
                 beta1,
                 variance_momentum=beta2,
                 gradient_clipping_threshold_per_sample=2,
                 l2_regularization_weight=0.01)
        ], self.tensorboard_pi_writer)
Example #3
def convnet_cifar10(train_source,
                    test_source,
                    epoch_size,
                    num_convolution_layers=2,
                    minibatch_size=64,
                    max_epochs=30,
                    log_file=None,
                    tboard_log_dir='.',
                    results_path=_MODEL_PATH):
    _cntk_py.set_computation_network_trace_level(0)

    logger.info("""Running network with: 
                {num_convolution_layers} convolution layers
                {minibatch_size}  minibatch size
                for {max_epochs} epochs""".format(
        num_convolution_layers=num_convolution_layers,
        minibatch_size=minibatch_size,
        max_epochs=max_epochs))

    network = create_network(num_convolution_layers)

    progress_printer = ProgressPrinter(tag='Training',
                                       log_to_file=log_file,
                                       rank=cntk.Communicator.rank(),
                                       num_epochs=max_epochs)
    tensorboard_writer = TensorBoardProgressWriter(freq=10,
                                                   log_dir=tboard_log_dir,
                                                   model=network['output'])
    trainer = create_trainer(network, minibatch_size, epoch_size,
                             [progress_printer, tensorboard_writer])

    cv_config = CrossValidationConfig(
        minibatch_source=test_source,
        minibatch_size=16,
        callback=create_results_callback(
            os.path.join(results_path, "model_results.json"),
            num_convolution_layers=num_convolution_layers,
            minibatch_size=minibatch_size,
            max_epochs=max_epochs))
    train_and_test(network,
                   trainer,
                   train_source,
                   test_source,
                   minibatch_size,
                   epoch_size,
                   restore=False,
                   cv_config=cv_config)
    network['output'].save(os.path.join(results_path, _MODEL_NAME))
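The create_results_callback helper is not shown above. CNTK's CrossValidationConfig calls its callback with (index, average_error, cv_num_samples, cv_num_minibatches) and stops training if it returns False, so a plausible sketch of that helper (the JSON layout here is an assumption) might be:

import json

def create_results_callback(results_file, **run_params):
    results = {'params': run_params, 'cross_validation': []}

    def callback(index, average_error, cv_num_samples, cv_num_minibatches):
        # Record the error of each cross-validation pass and persist it to disk
        results['cross_validation'].append({'index': index, 'error': average_error})
        with open(results_file, 'w') as f:
            json.dump(results, f, indent=2)
        return True  # keep training

    return callback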
Example #4
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
Example #5
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicates whether the action terminated the episode
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, which is a more stable version
        of the Action Value Network for increasing training stability.

        The Target Network is a frozen copy of the Action Value Network updated at regular intervals.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals
                    )
                )

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                    filename = "models\model%d" % agent_step
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
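The act/observe/train methods above are meant to be driven by an external environment loop. A hedged usage sketch follows; env is assumed to be a Gym-style environment, and the input_shape value is only illustrative of the 4-frame, 84x84 stack the network expects:

agent = DeepQAgent(input_shape=(4, 84, 84), nb_actions=env.action_space.n)

state = env.reset()
for step in range(1000000):
    action = agent.act(state)                       # epsilon-greedy action selection
    next_state, reward, done, _ = env.step(action)  # Gym-style transition
    agent.observe(state, action, reward, done)      # store transition, reset history on done
    agent.train()                                   # trains only past train_after, every train_interval steps
    state = env.reset() if done else next_state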
Example #6
def simple_mnist(tensorboard_logdir=None):
    input_dim = 19
    num_output_classes = 2
    num_hidden_layers = 2
    hidden_layers_dim = 1024

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    # scaled_input = element_times(constant(0.00390625), feature)

    z = Sequential([For(range(num_hidden_layers), lambda i: Dense(hidden_layers_dim, activation=relu)),
                    Dense(num_output_classes)])(feature)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = r"."

    path = os.path.normpath(os.path.join(data_dir, "train.ctf"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature  : reader_train.streams.features,
        label  : reader_train.streams.labels
    }

    # Training config
    minibatch_size = 512
    num_samples_per_sweep = 1825000
    num_sweeps_to_train_with = 100

    # Instantiate progress writers.
    progress_writers = [ProgressPrinter(
        tag='Training',
        num_epochs=num_sweeps_to_train_with)]

    tensorboard_writer = None
    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir=tensorboard_logdir, model=z)
        progress_writers.append(tensorboard_writer)

    # Instantiate the trainer object to drive the model training
    lr = learning_parameter_schedule_per_sample(0.001)
    learner = create_learner(model=z)
    trainer = Trainer(z, (ce, pe), learner, progress_writers)

    num_minibatches_to_train = int(num_samples_per_sweep / minibatch_size * num_sweeps_to_train_with)

    model_dir = "model"
    for i in range(num_minibatches_to_train):
        mb = reader_train.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(mb)

        freq = int(num_samples_per_sweep / minibatch_size)
        if i > 0 and i % freq == 0:
            timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")
            current_trainer_cp = os.path.join(model_dir, timestamp + "_epoch_" + str(i // freq) + ".trainer")
            trainer.save_checkpoint(current_trainer_cp)

            train_error = get_error_rate(os.path.join(data_dir, "train_subset.ctf"), input_map, input_dim,
                                         num_output_classes, trainer)
            valid_error = get_error_rate(os.path.join(data_dir, "validation.ctf"), input_map, input_dim, num_output_classes,
                                         trainer)

            # Only log to TensorBoard when a writer was actually created
            if tensorboard_writer is not None:
                if train_error > 0:
                    tensorboard_writer.write_value("train_error", train_error, i)
                if valid_error > 0:
                    tensorboard_writer.write_value("valid_error", valid_error, i)

    feat_path = os.path.normpath(os.path.join(data_dir, "test.ctf"))
    return get_error_rate(feat_path, input_map, input_dim, num_output_classes, trainer)
Example #7
def simple_mnist(tensorboard_logdir=None):
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 2
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = C.input_variable(input_dim, np.float32)
    label = C.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)

    z = Sequential([
        For(range(num_hidden_layers),
            lambda i: Dense(hidden_layers_dim, activation=relu)),
        Dense(num_output_classes)
    ])(scaled_input)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(data_dir, 'Train-28x28_cntk_text.txt')

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    # training_progress_output_freq = 100
    progress_writers = [
        ProgressPrinter(
            # freq=training_progress_output_freq,
            tag='Training',
            num_epochs=num_sweeps_to_train_with)
    ]

    if tensorboard_logdir is not None:
        progress_writers.append(
            TensorBoardProgressWriter(freq=10,
                                      log_dir=tensorboard_logdir,
                                      model=z))

    # Instantiate the trainer object to drive the model training
    lr = 0.001
    trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr), progress_writers)

    training_session(trainer=trainer,
                     mb_source=reader_train,
                     mb_size=minibatch_size,
                     model_inputs_to_streams=input_map,
                     max_samples=num_samples_per_sweep *
                     num_sweeps_to_train_with,
                     progress_frequency=num_samples_per_sweep).train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    C.debugging.start_profiler()
    C.debugging.enable_profiler()
    C.debugging.set_node_timing(True)

    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size,
                                        input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    C.debugging.stop_profiler()
    trainer.print_node_timing()

    # Average of evaluation errors of all test minibatches
    return test_result * 100 / num_minibatches_to_test
Example #8
def train_test(train_reader,
               test_reader,
               model_func,
               x,
               y,
               learning_rate,
               minibatch_size,
               num_sweeps_to_train_with=10,
               tensorboard_logdir=None):

    # Instantiate the model function; x is the input (feature) variable
    model = model_func(x)

    # Instantiate the Tensorboard writer
    tensorboard_writer = None
    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(
            freq=10, log_dir=tensorboard_logdir, model=model)

    # Instantiate the loss and error function
    loss, label_error = create_criterion_function(model, y)

    # Instantiate the trainer object to drive the model training
    #learning_rate = 0.2
    lr_schedule = C.learning_parameter_schedule(learning_rate)
    learner = C.sgd(model.parameters, lr_schedule)
    trainer = C.Trainer(model, (loss, label_error), [learner],
                        progress_writers=tensorboard_writer)

    # Initialize the parameters for the trainer
    #minibatch_size = 64
    num_samples_per_sweep = 60000
    num_minibatches_to_train = (num_samples_per_sweep *
                                num_sweeps_to_train_with) / minibatch_size

    # Map the data streams to the input and labels.
    input_map = {
        y: train_reader.streams.labels,
        x: train_reader.streams.features
    }

    # Log training progress every training_progress_output_freq minibatches
    training_progress_output_freq = 500

    # Start a timer
    start = time.time()

    for i in range(0, int(num_minibatches_to_train)):
        # Read a mini batch from the training data file
        data = train_reader.next_minibatch(minibatch_size, input_map=input_map)
        trainer.train_minibatch(data)
        print_training_progress(trainer,
                                i,
                                training_progress_output_freq,
                                verbose=1)

    # Print training time
    print("Training took {:.1f} sec".format(time.time() - start))

    # Test the model
    test_input_map = {
        y: test_reader.streams.labels,
        x: test_reader.streams.features
    }

    # Test data for trained model
    test_minibatch_size = 512
    num_samples = 10000
    num_minibatches_to_test = num_samples // test_minibatch_size

    test_result = 0.0

    for i in range(num_minibatches_to_test):

        # We are loading test data in batches specified by test_minibatch_size
        # Each data point in the minibatch is a MNIST digit image of 784 dimensions
        # with one pixel per dimension that we will encode / decode with the
        # trained model.
        data = test_reader.next_minibatch(test_minibatch_size,
                                          input_map=test_input_map)
        eval_error = trainer.test_minibatch(data)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    print("Average test error: {0:.2f}%".format(test_result * 100 /
                                                num_minibatches_to_test))
Example #9
    def __init__(self,
                 state_dim,
                 action_dim,
                 gamma=0.99,
                 learning_rate=1e-4,
                 momentum=0.95):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        with default_options(activation=relu, init=he_uniform()):
            # Convolution filter counts were halved to save on memory, no gpu :(
            self.model = Sequential([
                Convolution2D((8, 8), 16, strides=4, name='conv1'),
                Convolution2D((4, 4), 32, strides=2, name='conv2'),
                Convolution2D((3, 3), 32, strides=1, name='conv3'),
                Dense(256, init=he_uniform(scale=0.01), name='dense1'),
                Dense(action_dim,
                      activation=None,
                      init=he_uniform(scale=0.01),
                      name='actions')
            ])
            self.model.update_signature(Tensor[state_dim])

        # Create the target model as a copy of the online model
        self.target_model = None
        self.update_target()

        self.pre_states = input_variable(state_dim, name='pre_states')
        self.actions = input_variable(action_dim, name='actions')
        self.post_states = input_variable(state_dim, name='post_states')
        self.rewards = input_variable((), name='rewards')
        self.terminals = input_variable((), name='terminals')
        self.is_weights = input_variable((), name='is_weights')

        predicted_q = reduce_sum(self.model(self.pre_states) * self.actions,
                                 axis=0)

        # DQN - calculate target q values
        # post_q = reduce_max(self.target_model(self.post_states), axis=0)

        # DDQN - calculate target q values
        online_selection = one_hot(
            argmax(self.model(self.post_states), axis=0), self.action_dim)
        post_q = reduce_sum(self.target_model(self.post_states) *
                            online_selection,
                            axis=0)

        post_q = (1.0 - self.terminals) * post_q
        target_q = stop_gradient(self.rewards + self.gamma * post_q)

        # Huber loss
        delta = 1.0
        self.td_error = minus(predicted_q, target_q, name='td_error')
        abs_error = abs(self.td_error)
        errors = element_select(less(abs_error, delta),
                                square(self.td_error) * 0.5,
                                delta * (abs_error - 0.5 * delta))
        loss = errors * self.is_weights

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)

        self._learner = adam(self.model.parameters,
                             lr_schedule,
                             m_schedule,
                             variance_momentum=vm_schedule)
        self.writer = TensorBoardProgressWriter(log_dir='metrics',
                                                model=self.model)
        self.trainer = Trainer(self.model, (loss, None), [self._learner],
                               self.writer)
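The element_select expression above is a hand-rolled Huber loss with delta = 1.0, weighted by importance-sampling weights. A NumPy equivalent (a sketch for sanity-checking, not part of the example):

import numpy as np

def huber(td_error, delta=1.0):
    abs_error = np.abs(td_error)
    # Quadratic inside the delta band, linear outside it
    return np.where(abs_error < delta,
                    0.5 * np.square(td_error),
                    delta * (abs_error - 0.5 * delta))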
Example #10
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """
    def __init__(self, input_shape, nb_actions,
                 gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
                 learning_rate=0.01, momentum=0.8, minibatch_size=16,
                 memory_size=15000, train_after=100, train_interval=100, target_update_interval=500,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._num_trains = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        '''
        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        ''' 
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(7, init=he_uniform(scale=0.01)),
                Dense(8, init=he_uniform(scale=0.01)),
                #Dense(16, init=he_uniform(scale=0.01)),
                #Dense(32, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

        #self._trainer.restore_from_checkpoint('models/oldmodels/model800000')

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        #print(self._num_actions_taken)
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicates whether the action terminated the episode
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, which is a more stable version
        of the Action Value Network for increasing training stability.

        The Target Network is a frozen copy of the Action Value Network updated at regular intervals.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            #if (agent_step % self._train_interval) == 0:
            print('\nTraining minibatch\n')
            client.setCarControls(zero_controls)
            pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)
            self._trainer.train_minibatch(
                self._trainer.loss_function.argument_map(
                    pre_states=pre_states,
                    actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                    post_states=post_states,
                    rewards=rewards,
                    terminals=terminals
                )
            )
            self._num_trains += 1
            # Update the Target Network if needed
            if self._num_trains % 20 == 0:
                print('updating network')
                self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                filename = dirname + "\\model%d" % agent_step
                self._trainer.save_checkpoint(filename)
    
    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
Example #11
    def __init__(self,
                 state_shape,
                 action_count,
                 model_func,
                 vmin,
                 vmax,
                 n,
                 gamma=0.99,
                 lr=0.00025,
                 mm=0.95,
                 use_tensorboard=False):
        """
        Creates a new agent that learns using Categorical DQN as described in
        "A Distributional Perspective on Reinforcement Learning"
        :param state_shape: The shape of each input shape e.g. (4 x 84 x 84) for Atari
        :param action_count: The number of actions e.g. 14
        :param model_func: The model to train
        :param vmin: Minimum value of return distribution
        :param vmax: Maximum value of return distribution
        :param n: Number of support atoms
        :param gamma: Discount factor for Bellman update
        :param lr: The learning rate for Adam SGD
        :param mm: The momentum for Adam SGD
        """
        self.state_shape = state_shape
        self.action_count = action_count
        self.gamma = gamma
        self.learning_rate = lr
        self.momentum = mm

        # Distribution parameters
        self.vmin = vmin
        self.vmax = vmax
        self.n = n
        self.dz = (vmax - vmin) / (n - 1)

        # Support atoms
        self.z = np.linspace(vmin, vmax, n, dtype=np.float32)

        # Model input and output
        self.state_var = C.input_variable(self.state_shape, name='state')
        self.action_return_dist = C.input_variable((self.action_count, n),
                                                   name='ar_dist')

        # Model output assigns a probability to each support atom for each action
        self.raw = model_func(self.state_var)
        self.model = C.softmax(self.raw, axis=1)

        # Adam-based SGD with cross-entropy loss
        loss = C.cross_entropy_with_softmax(self.raw,
                                            self.action_return_dist,
                                            axis=1,
                                            name='loss')
        lr_schedule = C.learning_rate_schedule(self.learning_rate,
                                               C.UnitType.sample)
        mom_schedule = C.momentum_schedule(self.momentum)
        vm_schedule = C.momentum_schedule(0.999)
        learner = C.adam(self.raw.parameters,
                         lr_schedule,
                         mom_schedule,
                         variance_momentum=vm_schedule)

        if use_tensorboard:
            self.writer = TensorBoardProgressWriter(log_dir='metrics',
                                                    model=self.model)
        else:
            self.writer = None

        self.trainer = C.Trainer(self.raw, (loss, None), [learner],
                                 self.writer)

        # Create target network as copy of online network
        self.target_model = None
        self.update_target()
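For intuition about the support atoms, here is a small numeric illustration with hypothetical settings (vmin=-10, vmax=10, n=51 are the values from the original C51 paper, not values taken from this example): the spacing is dz = 20 / 50 = 0.4, the network outputs a probability over the 51 atoms per action, and an action's Q-value is the expectation of the atoms under that distribution.

import numpy as np

vmin, vmax, n = -10.0, 10.0, 51                    # hypothetical values
z = np.linspace(vmin, vmax, n, dtype=np.float32)   # support atoms, spaced dz = 0.4 apart
probs = np.full(n, 1.0 / n, dtype=np.float32)      # a toy uniform return distribution
q_value = float((probs * z).sum())                 # expected return, approximately 0 here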
Example #12
class DQAgent(object):
    """docstring for DQAgent"""                                                             ############should modify @!@!
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval
        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = RepMem(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(input_shape, init=he_uniform(scale=0.01)),
                Dense(input_shape),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])

        self._action_value_net.update_signature(Tensor[input_shape])

        self._target_net = self._action_value_net.clone(CloneMethod.freeze)


        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            q_targets = compute_q_targets(post_states, rewards, terminals)

            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            return huber_loss(q_targets, q_acted, 1.0)

        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        self._history.append(state)

        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))
            action = q_values.argmax()

        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        self._episode_rewards.append(reward)

        if done:
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            self._history.reset()

        self._memory.append(old_state, action, reward, done)

    def train(self):

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(actions.reshape(-1,1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals
                    )
                )

            if (agent_step % self._target_update_interval) == 0:
                self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                filename = "model\model%d" % agent_step # save ???? not good at using %d
                self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):

        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep', sum(self._episode_rewards), self._num_actions_taken)
Example #13
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """
    def __init__(self,
                 input_shape,
                 nb_actions,
                 gamma=0.99,
                 explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025,
                 momentum=0.95,
                 minibatch_size=32,
                 memory_size=500000,
                 train_after=200000,
                 train_interval=4,
                 target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) +
                rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape],
                   actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions,
                                 axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd,
                                self._metrics_writer)

    def load(self, model_path):

        self._trainer.restore_from_checkpoint(model_path)

    def act(self, state, eval=False):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken) and not eval:
            action = self._explorer(self.nb_actions)
            q_values = None
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action, q_values

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicates whether the action terminated the episode
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self, checkpoint_dir):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, which is a more stable version
        of the Action Value Network for increasing training stability.

        The Target Network is a frozen copy of the Action Value Network updated at regular intervals.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                #print('training... number of steps: {}'.format(agent_step))

                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(
                    self._minibatch_size)
                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)
                    filename = os.path.join(checkpoint_dir, "models",
                                            "model%d" % agent_step)
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q,
                                             self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q,
                                             self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.',
                                         sum(self._episode_rewards),
                                         self._num_actions_taken)

    def get_depth_image(self, client):
        # get depth image from airsim
        responses = client.simGetImages([
            airsim.ImageRequest("RCCamera", airsim.ImageType.DepthPerspective,
                                True, False)
        ])

        img1d = np.array(responses[0].image_data_float, dtype=np.float64)
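        # Convert depth to intensity: nearby surfaces (small depth values) map to
        # brighter pixels; clamping the denominator at 1 keeps values within [0, 255]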
        img1d = 255 / np.maximum(np.ones(img1d.size), img1d)
        if img1d.size > 1:

            img2d = np.reshape(img1d,
                               (responses[0].height, responses[0].width))

            image = Image.fromarray(img2d)
            im_final = np.array(image.resize((84, 84)).convert('L'))
            im_final = im_final / 255.0

            return im_final

        return np.zeros((84, 84)).astype(float)

    # Gets the coverage state (and coverage reward) from the coverage map
    def get_cov_image(self, coverage_map):

        state, cov_reward = coverage_map.get_state_from_pose()
        #state = self.coverage_map.get_map_scaled()

        # debug only
        #im = Image.fromarray(np.uint8(state))
        #im.save("DistributedRL\\debug\\{}.png".format(time.time()))

        # normalize state
        state = state / 255.0

        return state, cov_reward
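
# A minimal standalone NumPy sketch of the Q-target rule that compute_q_targets
# implements for this agent: the target is the raw reward for terminal
# transitions, and reward + gamma * max_a Q_target(s', a) otherwise. The helper
# name and the example values below are illustrative assumptions, not part of
# the original code.
import numpy as np

def q_targets_sketch(max_post_q, rewards, terminals, gamma=0.99):
    # terminals acts as a selector, mirroring element_select in the CNTK graph
    return np.where(terminals, rewards, rewards + gamma * max_post_q)

# A terminal transition keeps only its reward; a non-terminal one is bootstrapped
# (here 1.0 + 0.99 * 2.0, roughly 2.98).
assert q_targets_sketch(np.array([2.0]), np.array([1.0]), np.array([True]))[0] == 1.0
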
Example #14
0
def convnet_mnist(debug_output=False,
                  epoch_size=60000,
                  minibatch_size=64,
                  max_epochs=10):
    image_height = 28
    image_width = 28
    num_channels = 1
    input_dim = image_height * image_width * num_channels
    num_output_classes = 10

    # Input variables denoting the features and label data
    input_var = C.ops.input_variable((num_channels, image_height, image_width),
                                     np.float32)
    label_var = C.ops.input_variable(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
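    # 0.00390625 == 1/256: rescale raw pixel values from [0, 255] to roughly [0, 1]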
    scaled_input = C.ops.element_times(C.ops.constant(0.00390625), input_var)

    with C.layers.default_options(activation=C.ops.relu, pad=False):
        conv1 = C.layers.Convolution2D((5, 5), 32, pad=True)(scaled_input)
        pool1 = C.layers.MaxPooling((3, 3), (2, 2))(conv1)
        conv2 = C.layers.Convolution2D((3, 3), 48)(pool1)
        pool2 = C.layers.MaxPooling((3, 3), (2, 2))(conv2)
        conv3 = C.layers.Convolution2D((3, 3), 64)(pool2)
        f4 = C.layers.Dense(96)(conv3)
        drop4 = C.layers.Dropout(0.5)(f4)
        z = C.layers.Dense(num_output_classes, activation=None)(drop4)

    ce = C.losses.cross_entropy_with_softmax(z, label_var)
    pe = C.metrics.classification_error(z, label_var)

    reader_train = create_reader(
        os.path.join(data_dir, 'Train-28x28_cntk_text.txt'), True, input_dim,
        num_output_classes)

    # Set learning parameters
    lr_per_sample = [0.001] * 10 + [0.0005] * 10 + [0.0001]
    lr_schedule = C.learning_rate_schedule(lr_per_sample,
                                           C.learners.UnitType.sample,
                                           epoch_size)
    mm_time_constant = [0] * 5 + [1024]
    mm_schedule = C.learners.momentum_as_time_constant_schedule(
        mm_time_constant, epoch_size)

    # Instantiate the trainer object to drive the model training
    learner = C.learners.momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    progress_printers = [
        C.logging.ProgressPrinter(tag='Training', num_epochs=max_epochs)
    ]
    progress_printers.append(
        TensorBoardProgressWriter(freq=10, log_dir=log_dir, model=z))

    trainer = C.Trainer(z, (ce, pe), learner, progress_printers)

    # define mapping from reader streams to network inputs
    input_map = {
        input_var: reader_train.streams.features,
        label_var: reader_train.streams.labels
    }

    C.logging.log_number_of_parameters(z)
    print()

    # Get minibatches of images to train with and perform model training
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = reader_train.next_minibatch(
                min(minibatch_size, epoch_size - sample_count),
                input_map=input_map)  # fetch minibatch.
            trainer.train_minibatch(data)  # update model with it
            sample_count += data[
                label_var].num_samples  # count samples processed so far

        trainer.summarize_training_progress()
        z.save(
            os.path.join(output_dir,
                         "digit_epoch_{}.checkpoint".format(epoch)))

    # Load test data
    reader_test = create_reader(
        os.path.join(data_dir, 'Test-28x28_cntk_text.txt'), False, input_dim,
        num_output_classes)
    z.save(os.path.join(output_dir, "digit.model"))

    input_map = {
        input_var: reader_test.streams.features,
        label_var: reader_test.streams.labels
    }

    # Test data for trained model
    epoch_size = 10000
    minibatch_size = 1024

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    while sample_count < epoch_size:
        current_minibatch = min(minibatch_size, epoch_size - sample_count)

        # Fetch next test minibatch.
        data = reader_test.next_minibatch(current_minibatch,
                                          input_map=input_map)

        # evaluate the model on the minibatch
        metric_numer += trainer.test_minibatch(data) * current_minibatch
        metric_denom += current_minibatch

        # Keep track of the number of samples processed so far.
        sample_count += data[label_var].num_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index + 1, (metric_numer * 100.0) / metric_denom,
        metric_denom))
    print("")

    return metric_numer / metric_denom
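
# A rough pure-Python illustration (an assumption about the schedule semantics,
# with an illustrative helper name) of how a piecewise per-sample learning-rate
# list such as [0.001] * 10 + [0.0005] * 10 + [0.0001] is consumed when each
# entry covers `epoch_size` samples, as in the schedule built above.
def lr_for_sample(lr_list, epoch_size, sample_count):
    index = min(sample_count // epoch_size, len(lr_list) - 1)
    return lr_list[index]

assert lr_for_sample([0.001] * 10 + [0.0005] * 10 + [0.0001], 60000, 0) == 0.001
assert lr_for_sample([0.001] * 10 + [0.0005] * 10 + [0.0001], 60000, 10 * 60000) == 0.0005
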
def train_model(image_input,
                roi_input,
                dims_input,
                loss,
                pred_error,
                lr_per_sample,
                mm_schedule,
                l2_reg_weight,
                epochs_to_train,
                cfg,
                rpn_rois_input=None,
                buffered_rpn_proposals=None):
    if isinstance(loss, cntk.Variable):
        loss = combine([loss])

    params = loss.parameters
    biases = [p for p in params if '.b' in p.name or 'b' == p.name]
    others = [p for p in params if p not in biases]
    bias_lr_mult = cfg["CNTK"].BIAS_LR_MULT
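    # Biases get their own learner below so that their learning rate can be scaled
    # by BIAS_LR_MULT relative to the other parameters.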

    if cfg["CNTK"].DEBUG_OUTPUT:
        print("biases")
        for p in biases:
            print(p)
        print("others")
        for p in others:
            print(p)
        print("bias_lr_mult: {}".format(bias_lr_mult))

    # Instantiate the learners and the trainer object
    lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample)
    learner = momentum_sgd(others,
                           lr_schedule,
                           mm_schedule,
                           l2_regularization_weight=l2_reg_weight,
                           unit_gain=False,
                           use_mean_gradient=True)

    bias_lr_per_sample = [v * bias_lr_mult for v in lr_per_sample]
    bias_lr_schedule = learning_parameter_schedule_per_sample(
        bias_lr_per_sample)
    bias_learner = momentum_sgd(biases,
                                bias_lr_schedule,
                                mm_schedule,
                                l2_regularization_weight=l2_reg_weight,
                                unit_gain=False,
                                use_mean_gradient=True)
    trainer = Trainer(None, (loss, pred_error), [learner, bias_learner])

    # Get minibatches of images and perform model training
    print("Training model for %s epochs." % epochs_to_train)
    log_number_of_parameters(loss)

    # Create the minibatch source
    if buffered_rpn_proposals is not None:
        proposal_provider = ProposalProvider.fromlist(buffered_rpn_proposals,
                                                      requires_scaling=False)
    else:
        proposal_provider = None

    od_minibatch_source = ObjectDetectionMinibatchSource(
        cfg["DATA"].TRAIN_MAP_FILE,
        cfg["DATA"].TRAIN_ROI_FILE,
        num_classes=cfg["DATA"].NUM_CLASSES,
        max_annotations_per_image=cfg.INPUT_ROIS_PER_IMAGE,
        pad_width=cfg.IMAGE_WIDTH,
        pad_height=cfg.IMAGE_HEIGHT,
        pad_value=cfg["MODEL"].IMG_PAD_COLOR,
        randomize=True,
        use_flipping=cfg["TRAIN"].USE_FLIPPED,
        max_images=cfg["DATA"].NUM_TRAIN_IMAGES,
        proposal_provider=proposal_provider)

    # define mapping from reader streams to network inputs
    input_map = {
        od_minibatch_source.image_si: image_input,
        od_minibatch_source.roi_si: roi_input,
    }
    if buffered_rpn_proposals is not None:
        input_map[od_minibatch_source.proposals_si] = rpn_rois_input
    else:
        input_map[od_minibatch_source.dims_si] = dims_input

    progress_printer = [
        ProgressPrinter(tag='Training',
                        num_epochs=epochs_to_train,
                        gen_heartbeat=True)
    ]
    tensorboard_logdir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), r"./TensorBoard")
    tensorboard_writer = None
    if tensorboard_logdir is not None:
        tensorboard_writer = TensorBoardProgressWriter(
            freq=10, log_dir=tensorboard_logdir, model=loss)
        progress_printer.append(tensorboard_writer)

    for epoch in range(epochs_to_train):  # loop over epochs
        sample_count = 0
        # loop over minibatches in the epoch
        while sample_count < cfg["DATA"].NUM_TRAIN_IMAGES:
            data = od_minibatch_source.next_minibatch(
                min(cfg.MB_SIZE, cfg["DATA"].NUM_TRAIN_IMAGES - sample_count),
                input_map=input_map)
            output = trainer.train_minibatch(
                data, outputs=[image_input])  # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far

            #progress_printer.update_with_trainer(trainer, with_metric=True)  # log progress
            # Write output images to TensorBoard
            tensorboard_writer.write_image('training', output[1], sample_count)

            if sample_count % 100 == 0:
                print("Processed {} samples".format(sample_count))

        #progress_printer.epoch_summary(with_metric=True)
        trainer.summarize_training_progress()

        if tensorboard_writer:
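            # Log the mean of each parameter tensor so that parameter drift across
            # epochs can be inspected in TensorBoard.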
            for parameter in loss.parameters:
                tensorboard_writer.write_value(parameter.uid + "/mean",
                                               reduce_mean(parameter).eval(),
                                               epoch)
Example #16
0
lr_per_minibatch = learning_rate_schedule(
    [0.01] * 25 + [0.001] * 25 + [0.0001] * 25 + [0.00001] * 25 + [0.000001],
    UnitType.minibatch, epoch_size)
#lr_schedule            = learning_parameter_schedule(lr_per_mb, minibatch_size=minibatch_size, epoch_size=epoch_size)
#mm_schedule            = momentum_schedule(0.9, minibatch_size=minibatch_size)
momentum_time_constant = momentum_as_time_constant_schedule(-minibatch_size /
                                                            np.log(0.9))
l2_reg_weight = 0.0005

# trainer object
progress_writers = [ProgressPrinter(tag='Training', num_epochs=max_epochs)]
#progress_writers = [ProgressPrinter(tag='Training', log_to_file=train_log_file, num_epochs=max_epochs, gen_heartbeat=True)]

if cfg.train_log_dir is not None:
    tensorboard_writer = TensorBoardProgressWriter(freq=10,
                                                   log_dir=cfg.train_log_dir,
                                                   model=z)
    progress_writers.append(tensorboard_writer)

learner = momentum_sgd(z.parameters,
                       lr=lr_per_minibatch,
                       momentum=momentum_time_constant,
                       l2_regularization_weight=l2_reg_weight)

######### RESTORE TRAINER IF NEEDED
trainer = Trainer(z, (ce, pe), learner, progress_writers)
# trainer.restore_from_checkpoint(model_temp_file)

# define mapping from reader streams to network inputs
input_map = {
    input_var: reader_train.streams.features,
Example #17
0
def simple_mnist(tensorboard_logdir=None):
    input_dim = 784
    num_output_classes = 10
    num_hidden_layers = 1
    hidden_layers_dim = 200

    # Input variables denoting the features and label data
    feature = input(input_dim, np.float32)
    label = input(num_output_classes, np.float32)

    # Instantiate the feedforward classification model
    scaled_input = element_times(constant(0.00390625), feature)
    z = fully_connected_classifier_net(scaled_input, num_output_classes,
                                       hidden_layers_dim, num_hidden_layers,
                                       relu)

    ce = cross_entropy_with_softmax(z, label)
    pe = classification_error(z, label)

    data_dir = os.path.join(abs_path, "..", "..", "..", "DataSets", "MNIST")

    path = os.path.normpath(os.path.join(data_dir,
                                         "Train-28x28_cntk_text.txt"))
    check_path(path)

    reader_train = create_reader(path, True, input_dim, num_output_classes)

    input_map = {
        feature: reader_train.streams.features,
        label: reader_train.streams.labels
    }

    # Training config
    minibatch_size = 64
    num_samples_per_sweep = 60000
    num_sweeps_to_train_with = 10

    # Instantiate progress writers.
    #training_progress_output_freq = 100
    progress_writers = [
        ProgressPrinter(
            #freq=training_progress_output_freq,
            tag='Training',
            num_epochs=num_sweeps_to_train_with)
    ]

    if tensorboard_logdir is not None:
        progress_writers.append(
            TensorBoardProgressWriter(freq=10,
                                      log_dir=tensorboard_logdir,
                                      model=z))

    # Instantiate the trainer object to drive the model training
    trainer = Trainer(z, (ce, pe), adadelta(z.parameters), progress_writers)

    training_session(trainer=trainer,
                     mb_source=reader_train,
                     mb_size=minibatch_size,
                     var_to_stream=input_map,
                     max_samples=num_samples_per_sweep *
                     num_sweeps_to_train_with,
                     progress_frequency=num_samples_per_sweep).train()

    # Load test data
    path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))
    check_path(path)

    reader_test = create_reader(path, False, input_dim, num_output_classes)

    input_map = {
        feature: reader_test.streams.features,
        label: reader_test.streams.labels
    }

    # Test data for trained model
    test_minibatch_size = 1024
    num_samples = 10000
    num_minibatches_to_test = num_samples / test_minibatch_size
    test_result = 0.0
    for i in range(0, int(num_minibatches_to_test)):
        mb = reader_test.next_minibatch(test_minibatch_size,
                                        input_map=input_map)
        eval_error = trainer.test_minibatch(mb)
        test_result = test_result + eval_error

    # Average of evaluation errors of all test minibatches
    return test_result / num_minibatches_to_test
def init_train_fast_rcnn(image_height,
                         image_width,
                         num_classes,
                         num_rois,
                         mb_size,
                         max_epochs,
                         cntk_lr_per_image,
                         l2_reg_weight,
                         momentum_time_constant,
                         base_path,
                         boSkipTraining=False,
                         debug_output=False,
                         tensorboardLogDir=None):

    #make sure we use GPU for training
    if use_default_device().type() == 0:
        print("WARNING: using CPU for training.")
    else:
        print("Using GPU for training.")

    # Instantiate the Fast R-CNN prediction model
    image_input = input_variable((3, image_height, image_width))
    roi_input = input_variable((num_rois, 4))
    label_input = input_variable((num_rois, num_classes))
    frcn_output, frcn_penultimateLayer = frcn_predictor(
        image_input, roi_input, num_classes, base_path)

    if boSkipTraining:
        print("Using pre-trained DNN without refinement")
        return frcn_penultimateLayer

    # Create the minibatch source and define mapping from reader streams to network inputs
    minibatch_source, epoch_size = create_mb_source("train",
                                                    image_height,
                                                    image_width,
                                                    num_classes,
                                                    num_rois,
                                                    base_path,
                                                    randomize=True)
    input_map = {
        image_input: minibatch_source.streams.features,
        roi_input: minibatch_source.streams.rois,
        label_input: minibatch_source.streams.roiLabels
    }

    # set loss / error functions
    ce = cross_entropy_with_softmax(frcn_output, label_input, axis=1)
    pe = classification_error(frcn_output, label_input, axis=1)
    if debug_output:
        plot(frcn_output, "graph_frcn.png")

    # set the progress printer(s)
    progress_writers = [ProgressPrinter(tag='Training', num_epochs=max_epochs)]
    if tensorboardLogDir is not None:
        tensorboard_writer = TensorBoardProgressWriter(
            freq=10, log_dir=tensorboardLogDir, model=frcn_output)
        progress_writers.append(tensorboard_writer)

    # Set learning parameters and instantiate the trainer object
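    # Each image contributes num_rois ROI samples, so the per-image learning rate
    # is divided by num_rois to obtain an equivalent per-sample rate.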
    lr_per_sample = [f / float(num_rois) for f in cntk_lr_per_image]
    lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
    learner = momentum_sgd(frcn_output.parameters,
                           lr_schedule,
                           mm_schedule,
                           l2_regularization_weight=l2_reg_weight)
    trainer = Trainer(frcn_output, (ce, pe), learner, progress_writers)

    # Get minibatches of images and perform model training
    print("Training Fast R-CNN model for %s epochs." % max_epochs)
    log_number_of_parameters(frcn_output)
    for epoch in range(max_epochs):
        sample_count = 0

        # loop over minibatches in the epoch
        while sample_count < epoch_size:
            data = minibatch_source.next_minibatch(min(
                mb_size, epoch_size - sample_count),
                                                   input_map=input_map)
            if sample_count % 100 == 1:
                print(
                    "Training in progress: epoch {} of {}, sample count {} of {}"
                    .format(epoch, max_epochs, sample_count, epoch_size))
            trainer.train_minibatch(data)
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far
        trainer.summarize_training_progress()

        # Log the mean of each parameter tensor, so that we can confirm that the parameters are indeed changing.
        if tensorboardLogDir is not None:
            for parameter in frcn_output.parameters:
                tensorboard_writer.write_value(parameter.uid + "/mean",
                                               np.mean(parameter.value), epoch)
                tensorboard_writer.write_value(parameter.uid + "/std",
                                               np.std(parameter.value), epoch)
                tensorboard_writer.write_value(parameter.uid + "/absSum",
                                               np.sum(np.abs(parameter.value)),
                                               epoch)

        if debug_output:
            frcn_output.save_model("frcn_py_%s.model" % (epoch + 1))
    return frcn_output
Example #19
0
    def __init__(self, input_shape, nb_actions,
                 gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
                 learning_rate=0.01, momentum=0.8, minibatch_size=16,
                 memory_size=15000, train_after=100, train_interval=100, target_update_interval=500,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._num_trains = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        '''
        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        ''' 
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(7, init=he_uniform(scale=0.01)),
                Dense(8, init=he_uniform(scale=0.01)),
                #Dense(16, init=he_uniform(scale=0.01)),
                #Dense(32, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
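
# A minimal NumPy sketch of the Huber loss (delta = 1.0) used as the training
# criterion in the agents above and below. This is the standard formulation and
# an illustrative stand-in, not cntk's huber_loss implementation.
import numpy as np

def huber_loss_sketch(target, prediction, delta=1.0):
    err = np.abs(target - prediction)
    quadratic = np.minimum(err, delta)  # region where the loss is quadratic
    linear = err - quadratic            # region where the loss grows linearly
    return np.sum(0.5 * quadratic ** 2 + delta * linear)
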
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent as described in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self,
                 input_shape,
                 nb_actions,
                 gamma=0.99,
                 explorer=LinearEpsilonAnnealingExplorer(0.91, 0.1, 910000),
                 fixpolicy=LinearEpsilonAnnealingExplorer(0.5, 0.1, 100000),
                 learning_rate=0.00025,
                 momentum=0.95,
                 minibatch_size=32,
                 memory_size=500000,
                 train_after=10000,
                 train_interval=4,
                 target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._fixpolicy = fixpolicy
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        print("input_shape:", input_shape)
        print("input_shape[1:]", input_shape[1:])
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                #Convolution2D((8, 8), 16, strides=4),
                #Convolution2D((4, 4), 32, strides=2),
                #Convolution2D((1, 1), 16, strides=1),
                Dense(25, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) +
                rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape],
                   actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions,
                                 axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd,
                                self._metrics_writer)
        #self._trainer.restore_from_checkpoint("models_heuristic_no_image\model")
    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (i.e. History)
        self._history.append(state)
        #if True:
        if self._fixpolicy.is_exploring(self._num_actions_taken):
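            # Heuristic "fix" policy: state[0:3] and state[3:6] presumably hold the
            # current and target coordinates; pick the axis with the largest absolute
            # offset, and map it to action direction + 1 for a positive offset or
            # direction + 4 for a negative one.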
            diff_x = state[3] - state[0]
            diff_y = state[4] - state[1]
            diff_z = state[5] - state[2]
            diff_arr = np.array([diff_x, diff_y, diff_z])
            direction = np.argmax(np.absolute(diff_arr))
            '''
            abs_x = math.fabs(diff_x)
            abs_y = math.fabs(diff_y)
            abs_z = math.fabs(diff_z)
            diff = [diff_x, diff_y, diff_z]
            abs_diff = [abs_x, abs_y, abs_z]
            print(diff, abs_diff)
            m = max(abs_diff)
            direction = diff.index(m)'''
            print(diff_arr)
            if diff_arr[direction] < 0:
                fixaction = direction + 4
            else:
                fixaction = direction + 1

            self._num_actions_taken += 1
            return fixaction

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the action maximizing the expected reward
            action = q_values.argmax()

        # Keep track of the number of actions taken
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (i.e. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, which is a more stable version
        of the Action Value Network for increasing training stability.

        The Target Network is a frozen copy of the Action Value Network updated as regular intervals.
        """

        agent_step = self._num_actions_taken
        print("agent_step = ", agent_step)
        #time.sleep(1)
        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(
                    self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)
                    filename = "models_heuristic_no_image_less_exploration\model%d" % agent_step
                    print(
                        "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$filename=",
                        filename)
                    self._trainer.save_checkpoint(filename)
                    #time.sleep(100)

    def _plot_metrics(self):
        """Plot the current buffers' accumulated values to visualize agent learning."""
        global landing_count, episode_count
        f = open('log__heuristic_no_image_less_exploration2', 'a+')
        f.write('episode:' + str(episode_count) + ': exploration rate= ' +
                str(self._explorer._rate) + ' heuristic fix rate= ' +
                str(self._fixpolicy._rate) + '\n')
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q,
                                             self._num_actions_taken)
            print('Mean Q per ep.', mean_q, self._num_actions_taken)
            f.write('Mean Q per ep. ' + str(mean_q) + ' ' +
                    str(self._num_actions_taken) + '\n')

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q,
                                             self._num_actions_taken)
            print('Mean Std Q per ep.', std_q, self._num_actions_taken)
            f.write('Mean Std Q per ep. ' + str(std_q) + ' ' +
                    str(self._num_actions_taken) + '\n')

        self._metrics_writer.write_value('Sum rewards per ep.',
                                         sum(self._episode_rewards),
                                         self._num_actions_taken)
        print('Sum rewards per ep.', sum(self._episode_rewards),
              self._num_actions_taken)
        f.write('Sum rewards per ep. ' + str(sum(self._episode_rewards)) +
                ' ' + str(self._num_actions_taken) + '\n')
        if landing_count > 0:
            f.write('****************Success landing**********' +
                    str(landing_count) + '\n')
        landing_count = 0
        episode_count = 0

        f.write('\n')