Example #1
def main(args):
    """Entry point if run on the command line"""
    with open(args.output, 'w') as f:
        if args.runners_hh:
            code = Runners(nodes + custom_nodes).header
        elif args.runners_cc:
            code = Runners(nodes).source
        elif args.dispatch_hh:
            code = Dispatch(terminals, nodes + custom_nodes).header
        elif args.dispatch_cc:
            code = Dispatch(terminals, nodes + custom_nodes).source
        elif args.expression_hh:
            code = Expressions(nodes).header
        elif args.expression_cc:
            code = Expressions(nodes).source
        elif args.numeric_hh:
            code = Numeric(nodes).header
        elif args.numeric_cc:
            code = Numeric(nodes).source
        elif args.exprenum_hh:
            code = ExprEnums(nodes).code
        elif args.exprenum_java:
            code = ExprEnums(nodes).java
        elif args.createexpr_cc:
            code = CreateExpressions(terminals + nodes + custom_nodes).source
        f.writelines(code)
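A minimal sketch of how this entry point could be driven from the command line. The flag names mirror the args attributes used above; the actual parser in the source project is not shown in this example and may differ:

import argparse

if __name__ == '__main__':
    # Hypothetical wiring; the real CLI definition is not part of this example.
    parser = argparse.ArgumentParser(
        description="Emit one generated source file selected by a flag")
    parser.add_argument('output', help="path of the file to write")
    group = parser.add_mutually_exclusive_group(required=True)
    for flag in ('runners_hh', 'runners_cc', 'dispatch_hh', 'dispatch_cc',
                 'expression_hh', 'expression_cc', 'numeric_hh', 'numeric_cc',
                 'exprenum_hh', 'exprenum_java', 'createexpr_cc'):
        # e.g. --runners-hh sets args.runners_hh = True
        group.add_argument('--' + flag.replace('_', '-'),
                           dest=flag, action='store_true')
    main(parser.parse_args())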
Example #2
    def __init__(self, network_creator, environment_creator, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator,
                                          args)
        self.workers = args.emulator_workers

        self.network_creator = network_creator  # record the network creator in order to create good_network later

        self.total_rewards = []

        self.adversary = Adversary(args)

        # state, reward, episode_over, action
        self.variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                          (np.asarray([False] * self.emulator_counts,
                                      dtype=np.float32)),
                          (np.zeros((self.emulator_counts, self.num_actions),
                                    dtype=np.float32))]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers,
                               self.variables)
        self.runners.start()
        self.shared_states, self.shared_rewards, self.shared_episode_over, self.shared_actions = self.runners.get_shared_variables(
        )

        self.summaries_op = tf.summary.merge_all()

        self.emulator_steps = [0] * self.emulator_counts
        self.total_episode_rewards = self.emulator_counts * [0]

        self.actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        self.y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        self.adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        self.rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        self.states = np.zeros([self.max_local_steps] +
                               list(self.shared_states.shape),
                               dtype=np.uint8)
        self.actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        self.values = np.zeros((self.max_local_steps, self.emulator_counts))
        self.episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))
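For orientation, a small sketch of the shapes the four entries of self.variables take once shared with the runner processes. The concrete numbers (8 emulators, 84x84x4 observations, 6 actions) are illustrative assumptions, not values taken from this snippet:

import numpy as np

emulator_counts, num_actions = 8, 6                 # assumed, for illustration only
states = np.zeros((emulator_counts, 84, 84, 4), dtype=np.uint8)       # one frame stack per emulator
rewards = np.zeros(emulator_counts, dtype=np.float32)                 # last reward per emulator
episode_over = np.zeros(emulator_counts, dtype=np.float32)            # 0.0 / 1.0 terminal flags
actions = np.zeros((emulator_counts, num_actions), dtype=np.float32)  # one-hot action per emulator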
Example #3
    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """

        self.global_step = self.init_network()

        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0

        global_step_start = self.global_step

        total_rewards = []

        # state, reward, episode_over, action
        variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                     (np.asarray([False] * self.emulator_counts,
                                 dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.num_actions),
                               dtype=np.float32))]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers,
                               variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions = self.runners.get_shared_variables(
        )

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps] + list(shared_states.shape),
                          dtype=np.uint8)
        actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))

        start_time = time.time()

        while self.global_step < self.max_global_steps:

            loop_start_time = time.time()

            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):
                next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(
                    shared_states)
                actions_sum += next_actions
                for z in range(next_actions.shape[0]):
                    shared_actions[z] = next_actions[z]

                actions[t] = next_actions
                values[t] = readouts_v_t
                states[t] = shared_states

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(
                    np.float32)

                for e, (actual_reward, episode_over) in enumerate(
                        zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += 1
                    self.global_step += 1
                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(
                                tag='rl/reward',
                                simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length',
                                             simple_value=emulator_steps[e]),
                        ])
                        self.summary_writer.add_summary(
                            episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        actions_sum[e] = np.zeros(self.num_actions)

            nest_state_value = self.session.run(
                self.network.output_layer_v,
                feed_dict={self.network.input_ph: shared_states})

            estimated_return = np.copy(nest_state_value)

            for t in reversed(range(max_local_steps)):
                estimated_return = rewards[
                    t] + self.gamma * estimated_return * episodes_over_masks[t]
                y_batch[t] = np.copy(estimated_return)
                adv_batch[t] = estimated_return - values[t]

            flat_states = states.reshape(
                [self.max_local_steps * self.emulator_counts] +
                list(shared_states.shape)[1:])
            flat_y_batch = y_batch.reshape(-1)
            flat_adv_batch = adv_batch.reshape(-1)
            flat_actions = actions.reshape(
                max_local_steps * self.emulator_counts, self.num_actions)

            lr = self.get_lr()
            feed_dict = {
                self.network.input_ph: flat_states,
                self.network.critic_target_ph: flat_y_batch,
                self.network.selected_action_ph: flat_actions,
                self.network.adv_actor_ph: flat_adv_batch,
                self.learning_rate: lr
            }

            _, summaries = self.session.run([self.train_step, summaries_op],
                                            feed_dict=feed_dict)

            self.summary_writer.add_summary(summaries, self.global_step)
            self.summary_writer.flush()

            counter += 1

            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(
                    total_rewards[-10:])
                logging.info(
                    "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                    .format(
                        global_steps, self.max_local_steps *
                        self.emulator_counts / (curr_time - loop_start_time),
                        (global_steps - global_step_start) /
                        (curr_time - start_time), last_ten))
            self.save_vars()

        self.cleanup()
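The reversed loop over max_local_steps above implements the standard n-step bootstrapped return, R_t = r_t + gamma * R_{t+1} * (1 - done_t), seeded with the critic's estimate of the state reached after the last local step. A self-contained numpy sketch of the same computation, with toy numbers and an assumed gamma of 0.99:

import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.0], [2.0]])   # (max_local_steps, emulator_counts)
masks = np.array([[1.0], [1.0], [0.0]])     # 0.0 where the episode ended at that step
values = np.array([[0.5], [0.4], [0.3]])    # critic outputs recorded during the rollout
bootstrap = np.array([5.0])                 # critic estimate of the state after the rollout

y_batch = np.zeros_like(rewards)
estimated_return = bootstrap.copy()
for t in reversed(range(rewards.shape[0])):
    estimated_return = rewards[t] + gamma * estimated_return * masks[t]
    y_batch[t] = estimated_return
adv_batch = y_batch - values                # advantages fed to the actor loss
print(y_batch.ravel())                      # approximately [2.9602, 1.98, 2.0]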
Example #4
class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator,
                                          args)
        self.workers = args.emulator_workers

    @staticmethod
    def choose_next_actions(network, num_actions, states, session):
        network_output_v, network_output_pi = session.run(
            [network.output_layer_v, network.output_layer_pi],
            feed_dict={network.input_ph: states})

        action_indices = PAACLearner.__sample_policy_action(network_output_pi)

        new_actions = np.eye(num_actions)[action_indices]

        return new_actions, network_output_v, network_output_pi

    def __choose_next_actions(self, states):
        return PAACLearner.choose_next_actions(self.network, self.num_actions,
                                               states, self.session)

    @staticmethod
    def __sample_policy_action(probs):
        """
        Sample an action from an action probability distribution output by
        the policy network.
        """
        # Subtract a tiny value from probabilities in order to avoid
        # "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
        probs = probs - np.finfo(np.float32).epsneg

        action_indexes = [
            int(np.nonzero(np.random.multinomial(1, p))[0]) for p in probs
        ]
        return action_indexes

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array
        """

        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
        return np.frombuffer(shared, dtype).reshape(shape)

    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """

        self.global_step = self.init_network()

        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0

        global_step_start = self.global_step

        total_rewards = []

        # state, reward, episode_over, action
        variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                     (np.asarray([False] * self.emulator_counts,
                                 dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.num_actions),
                               dtype=np.float32))]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers,
                               variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions = self.runners.get_shared_variables(
        )

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps] + list(shared_states.shape),
                          dtype=np.uint8)
        actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))

        start_time = time.time()

        while self.global_step < self.max_global_steps:

            loop_start_time = time.time()

            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):
                next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(
                    shared_states)
                actions_sum += next_actions
                for z in range(next_actions.shape[0]):
                    shared_actions[z] = next_actions[z]

                actions[t] = next_actions
                values[t] = readouts_v_t
                states[t] = shared_states

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(
                    np.float32)

                for e, (actual_reward, episode_over) in enumerate(
                        zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += 1
                    self.global_step += 1
                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(
                                tag='rl/reward',
                                simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length',
                                             simple_value=emulator_steps[e]),
                        ])
                        self.summary_writer.add_summary(
                            episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        actions_sum[e] = np.zeros(self.num_actions)

            nest_state_value = self.session.run(
                self.network.output_layer_v,
                feed_dict={self.network.input_ph: shared_states})

            estimated_return = np.copy(nest_state_value)

            for t in reversed(range(max_local_steps)):
                estimated_return = rewards[
                    t] + self.gamma * estimated_return * episodes_over_masks[t]
                y_batch[t] = np.copy(estimated_return)
                adv_batch[t] = estimated_return - values[t]

            flat_states = states.reshape(
                [self.max_local_steps * self.emulator_counts] +
                list(shared_states.shape)[1:])
            flat_y_batch = y_batch.reshape(-1)
            flat_adv_batch = adv_batch.reshape(-1)
            flat_actions = actions.reshape(
                max_local_steps * self.emulator_counts, self.num_actions)

            lr = self.get_lr()
            feed_dict = {
                self.network.input_ph: flat_states,
                self.network.critic_target_ph: flat_y_batch,
                self.network.selected_action_ph: flat_actions,
                self.network.adv_actor_ph: flat_adv_batch,
                self.learning_rate: lr
            }

            _, summaries = self.session.run([self.train_step, summaries_op],
                                            feed_dict=feed_dict)

            self.summary_writer.add_summary(summaries, self.global_step)
            self.summary_writer.flush()

            counter += 1

            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(
                    total_rewards[-10:])
                logging.info(
                    "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                    .format(
                        global_steps, self.max_local_steps *
                        self.emulator_counts / (curr_time - loop_start_time),
                        (global_steps - global_step_start) /
                        (curr_time - start_time), last_ten))
            self.save_vars()

        self.cleanup()

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()
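The _get_shared helper above is what lets the emulator worker processes write states and rewards, and read actions, in place. A standalone sketch of the same pattern (the names here are illustrative, not from the project): writes through the numpy view land directly in the RawArray buffer, which worker processes created afterwards can see on fork-based platforms.

from ctypes import c_float
from multiprocessing import RawArray

import numpy as np


def get_shared(array, dtype=c_float):
    """Copy `array` into a RawArray and return a numpy view over it."""
    shared = RawArray(dtype, array.reshape(-1))
    return np.frombuffer(shared, dtype).reshape(array.shape)


shared_rewards = get_shared(np.zeros(4, dtype=np.float32))
shared_rewards[2] = 1.5   # the write goes straight into the shared buffer
print(shared_rewards)     # approximately [0.0, 0.0, 1.5, 0.0]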
Example #5
    def train(self):
        """ Main actor learner loop for parallel advantage actor critic learning."""

        self.global_step = self.init_network()
        global_step_start = self.global_step
        counter = 0
        total_rewards = []
        total_steps = []

        logging.debug("Starting training at Step {}".format(self.global_step))

        # state, reward, episode_over, action, repetition
        variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                     (np.asarray([False] * self.emulator_counts,
                                 dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.num_actions),
                               dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.total_repetitions),
                               dtype=np.float32))]

        self.runners = Runners(self.tab_rep, EmulatorRunner, self.emulators,
                               self.workers, variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions, shared_rep = self.runners.get_shared_variables(
        )
        if self.lstm_bool:
            self.n_steps = 5
            memory = np.zeros(([self.emulator_counts, self.n_steps] +
                               list(shared_states.shape)[1:]),
                              dtype=np.uint8)
            whole_memory = np.zeros(
                ([self.max_local_steps, self.emulator_counts, self.n_steps] +
                 list(shared_states.shape)[1:]),
                dtype=np.uint8)
            for e in range(self.emulator_counts):
                memory[e, -1, :, :, :] = shared_states[e]

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps] + list(shared_states.shape),
                          dtype=np.uint8)
        actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        repetitions = np.zeros((self.max_local_steps, self.emulator_counts,
                                self.total_repetitions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))

        start_time = time.time()

        while self.global_step < self.max_global_steps:
            print('step : ' + str(self.global_step))

            loop_start_time = time.time()
            total_action_rep = np.zeros(
                (self.num_actions, self.total_repetitions))
            nb_actions = 0

            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):

                #Choose actions and repetitions for each emulator
                if not self.lstm_bool:
                    readouts_v_t, readouts_pi_t, readouts_rep_t = self.session.run(
                        [
                            self.network.output_layer_v,
                            self.network.output_layer_pi,
                            self.network.output_layer_rep
                        ],
                        feed_dict={self.network.input_ph: shared_states})
                    new_actions, new_repetitions = self.explo_policy.choose_next_actions(
                        readouts_pi_t, readouts_rep_t, self.num_actions)
                else:
                    readouts_v_t, readouts_pi_t, readouts_rep_t = self.session.run(
                        [
                            self.network.output_layer_v,
                            self.network.output_layer_pi,
                            self.network.output_layer_rep
                        ],
                        feed_dict={self.network.memory_ph: memory})
                    new_actions, new_repetitions = self.explo_policy.choose_next_actions(
                        readouts_pi_t, readouts_rep_t, self.num_actions)

                actions_sum += new_actions

                for e in range(self.emulator_counts):
                    nb_actions += np.argmax(new_repetitions[e]) + 1

                # sharing the actions and repetitions to the different threads
                for z in range(new_actions.shape[0]):
                    shared_actions[z] = new_actions[z]
                for z in range(new_repetitions.shape[0]):
                    shared_rep[z] = new_repetitions[z]

                actions[t] = new_actions
                values[t] = readouts_v_t
                states[t] = shared_states
                repetitions[t] = new_repetitions

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                if self.lstm_bool:
                    memory, whole_memory = self.update_memory(
                        memory, shared_states, whole_memory, t)

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(
                    np.float32)

                for e, (actual_reward, episode_over) in enumerate(
                        zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += self.tab_rep[np.argmax(
                        new_repetitions[e])] + 1
                    self.global_step += 1

                    # fill in the table for the action/repetition histogram
                    a = np.argmax(new_actions[e])
                    r = np.argmax(new_repetitions[e])
                    total_action_rep[a][r] += 1

                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        total_steps.append(emulator_steps[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(
                                tag='rl/reward',
                                simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length',
                                             simple_value=emulator_steps[e])
                        ])
                        self.summary_writer.add_summary(
                            episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        if self.lstm_bool:
                            memory[e] = np.zeros(
                                ([self.n_steps] +
                                 list(shared_states.shape)[1:]),
                                dtype=np.uint8)

                        actions_sum[e] = np.zeros(self.num_actions)

            ##plot output of conv layers
            # with tf.name_scope('Summary_ConvNet'):
            #     if self.global_step % (10000*self.emulator_counts*self.max_local_steps) == 0:
            #         convs = self.session.run(self.network.convs,
            #             feed_dict= {self.network.input_ph: [shared_states[0]]})
            #         imgs = [np.array([utils.plot_conv_output(conv)]) for conv in convs]
            #         sums = [tf.summary.image('conv'+str(i), imgs[i], 1) for i in range(len(imgs))]
            #         real_sums = self.session.run(sums)
            #         for s in real_sums : self.summary_writer.add_summary(s, self.global_step)
            #         self.summary_writer.flush()

            if self.lstm_bool:
                nest_state_value = self.session.run(
                    self.network.output_layer_v,
                    feed_dict={self.network.memory_ph: memory})
            else:
                nest_state_value = self.session.run(
                    self.network.output_layer_v,
                    feed_dict={self.network.input_ph: shared_states})

            estimated_return = np.copy(nest_state_value)

            for t in reversed(range(max_local_steps)):
                estimated_return = rewards[
                    t] + self.gamma * estimated_return * episodes_over_masks[t]
                y_batch[t] = np.copy(estimated_return)
                adv_batch[t] = estimated_return - values[t]

            if self.lstm_bool:
                flat_states = whole_memory.reshape([
                    self.max_local_steps * self.emulator_counts, self.n_steps
                ] + list(shared_states.shape)[1:])
            else:
                flat_states = states.reshape(
                    [self.max_local_steps * self.emulator_counts] +
                    list(shared_states.shape)[1:])
            flat_y_batch = y_batch.reshape(-1)
            flat_adv_batch = adv_batch.reshape(-1)
            flat_actions = actions.reshape(
                max_local_steps * self.emulator_counts, self.num_actions)
            flat_rep = repetitions.reshape(
                max_local_steps * self.emulator_counts, self.total_repetitions)

            lr = self.get_lr()
            feed_dict = {
                self.network.critic_target_ph: flat_y_batch,
                self.network.selected_action_ph: flat_actions,
                self.network.selected_repetition_ph: flat_rep,
                self.network.adv_actor_ph: flat_adv_batch,
                self.learning_rate: lr
            }

            if self.lstm_bool:
                feed_dict[self.network.memory_ph] = flat_states
            else:
                feed_dict[self.network.input_ph] = flat_states

            _, summaries = self.session.run([self.train_step, summaries_op],
                                            feed_dict=feed_dict)
            self.summary_writer.add_summary(summaries, self.global_step)

            param_summary = tf.Summary(
                value=[tf.Summary.Value(tag='parameters/lr', simple_value=lr)])
            self.summary_writer.add_summary(param_summary, self.global_step)
            self.summary_writer.flush()

            self.log_values(total_rewards, 'rewards_per_episode')
            self.log_values(total_steps, 'steps_per_episode')

            # add the action/repetition histogram
            nb_a = [sum(a) for a in total_action_rep]
            nb_r = [sum(r) for r in np.transpose(total_action_rep)]
            histo_a, histo_r = [], []
            for i in range(self.num_actions):
                histo_a += [i] * int(nb_a[i])
            for i in range(self.total_repetitions):
                histo_r += [self.tab_rep[i] + 1] * int(nb_r[i])
            self.log_histogram('actions', np.array(histo_a), self.global_step)
            self.log_histogram('repetitions', np.array(histo_r),
                               self.global_step)

            counter += 1
            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(
                    total_rewards[-10:])
                steps_per_sec = self.max_local_steps * self.emulator_counts / (
                    curr_time - loop_start_time)
                actions_per_s = nb_actions / (curr_time - loop_start_time)
                average_steps_per_sec = (self.global_step - global_step_start
                                         ) / (curr_time - start_time)
                logging.info(
                    "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                    .format(self.global_step, steps_per_sec,
                            average_steps_per_sec, last_ten))

                stats_summary = tf.Summary(value=[
                    tf.Summary.Value(tag='stats/steps_per_s',
                                     simple_value=steps_per_sec),
                    tf.Summary.Value(tag='stats/average_steps_per_s',
                                     simple_value=average_steps_per_sec),
                    tf.Summary.Value(tag='stats/actions_per_s',
                                     simple_value=actions_per_s)
                ])
                self.summary_writer.add_summary(stats_summary,
                                                self.global_step)
                self.summary_writer.flush()

            self.save_vars()

        self.cleanup()
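The nb_a/nb_r/histo_a/histo_r block near the end of the loop expands the 2-D count table total_action_rep into flat lists of samples, which is what the histogram summaries expect. A toy illustration of that expansion, with an assumed tab_rep of [0, 1, 3] (i.e. repetition choices of 1, 2 and 4 steps):

import numpy as np

tab_rep = [0, 1, 3]                        # assumed mapping: choice index -> extra repetitions
total_action_rep = np.array([[2, 0, 1],    # counts[action][repetition choice]
                             [0, 3, 0]])

nb_a = total_action_rep.sum(axis=1)        # how often each action was taken
nb_r = total_action_rep.sum(axis=0)        # how often each repetition choice was taken

histo_a = [a for a, n in enumerate(nb_a) for _ in range(int(n))]
histo_r = [tab_rep[r] + 1 for r, n in enumerate(nb_r) for _ in range(int(n))]
print(histo_a)   # [0, 0, 0, 1, 1, 1]
print(histo_r)   # [1, 1, 2, 2, 2, 4]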
Example #6
class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, explo_policy,
                 args):
        super(PAACLearner, self).__init__(network_creator, environment_creator,
                                          explo_policy, args)
        self.workers = args.emulator_workers
        self.total_repetitions = args.nb_choices
        self.lstm_bool = (args.arch == 'LSTM')
        self.tab_rep = explo_policy.tab_rep

        #add the parameters to tensorboard
        sess = tf.InteractiveSession()
        file_args = open(args.debugging_folder + "args.json", 'r')
        text = str(file_args.read())
        summary_op = tf.summary.text('text', tf.convert_to_tensor(text))
        text = sess.run(summary_op)
        self.summary_writer.add_summary(text, 0)
        self.summary_writer.flush()
        sess.close()

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array """

        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
        return np.frombuffer(shared, dtype).reshape(shape)

    def log_histogram(self, tag, values, step, bins=1000):
        """Logs the histogram of a list/vector of values"""

        counts, bin_edges = np.histogram(values, bins=bins)
        hist = tf.HistogramProto()
        hist.min = float(np.min(values))
        hist.max = float(np.max(values))
        hist.num = int(np.prod(values.shape))
        hist.sum = float(np.sum(values))
        hist.sum_squares = float(np.sum(values**2))

        bin_edges = bin_edges[1:]

        for edge in bin_edges:
            hist.bucket_limit.append(edge)
        for c in counts:
            hist.bucket.append(c)

        summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)])
        self.summary_writer.add_summary(summary, step)
        self.summary_writer.flush()

    def log_values(self, values, tag, length=50, timestep=500):
        if len(values) > length and self.global_step % timestep == 0:
            mean = np.mean(values[-50:])
            std = np.std(values[-50:])
            summary = tf.Summary(value=[
                tf.Summary.Value(tag=tag + '/mean', simple_value=mean),
                tf.Summary.Value(tag=tag + '/min',
                                 simple_value=min(values[-50:])),
                tf.Summary.Value(tag=tag + '/max',
                                 simple_value=max(values[-50:])),
                tf.Summary.Value(tag=tag + '/std', simple_value=std),
                tf.Summary.Value(tag=tag + '/std_over_mean',
                                 simple_value=min(2, np.absolute(std / mean)))
            ])
            self.summary_writer.add_summary(summary, self.global_step)
            self.summary_writer.flush()

    def update_memory(self, memory, shared_states, whole_memory, t):
        whole_memory[t] = memory
        memory[:, :-1, :, :, :] = memory[:, 1:, :, :, :]
        memory[:, -1, :, :, :] = shared_states
        return memory, whole_memory

    def train(self):
        """ Main actor learner loop for parallel advantage actor critic learning."""

        self.global_step = self.init_network()
        global_step_start = self.global_step
        counter = 0
        total_rewards = []
        total_steps = []

        logging.debug("Starting training at Step {}".format(self.global_step))

        # state, reward, episode_over, action, repetition
        variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                     (np.asarray([False] * self.emulator_counts,
                                 dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.num_actions),
                               dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.total_repetitions),
                               dtype=np.float32))]

        self.runners = Runners(self.tab_rep, EmulatorRunner, self.emulators,
                               self.workers, variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions, shared_rep = self.runners.get_shared_variables(
        )
        if self.lstm_bool:
            self.n_steps = 5
            memory = np.zeros(([self.emulator_counts, self.n_steps] +
                               list(shared_states.shape)[1:]),
                              dtype=np.uint8)
            whole_memory = np.zeros(
                ([self.max_local_steps, self.emulator_counts, self.n_steps] +
                 list(shared_states.shape)[1:]),
                dtype=np.uint8)
            for e in range(self.emulator_counts):
                memory[e, -1, :, :, :] = shared_states[e]

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps] + list(shared_states.shape),
                          dtype=np.uint8)
        actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        repetitions = np.zeros((self.max_local_steps, self.emulator_counts,
                                self.total_repetitions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))

        start_time = time.time()

        while self.global_step < self.max_global_steps:
            print('step : ' + str(self.global_step))

            loop_start_time = time.time()
            total_action_rep = np.zeros(
                (self.num_actions, self.total_repetitions))
            nb_actions = 0

            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):

                #Choose actions and repetitions for each emulator
                if not self.lstm_bool:
                    readouts_v_t, readouts_pi_t, readouts_rep_t = self.session.run(
                        [
                            self.network.output_layer_v,
                            self.network.output_layer_pi,
                            self.network.output_layer_rep
                        ],
                        feed_dict={self.network.input_ph: shared_states})
                    new_actions, new_repetitions = self.explo_policy.choose_next_actions(
                        readouts_pi_t, readouts_rep_t, self.num_actions)
                else:
                    readouts_v_t, readouts_pi_t, readouts_rep_t = self.session.run(
                        [
                            self.network.output_layer_v,
                            self.network.output_layer_pi,
                            self.network.output_layer_rep
                        ],
                        feed_dict={self.network.memory_ph: memory})
                    new_actions, new_repetitions = self.explo_policy.choose_next_actions(
                        readouts_pi_t, readouts_rep_t, self.num_actions)

                actions_sum += new_actions

                for e in range(self.emulator_counts):
                    nb_actions += np.argmax(new_repetitions[e]) + 1

                # sharing the actions and repetitions to the different threads
                for z in range(new_actions.shape[0]):
                    shared_actions[z] = new_actions[z]
                for z in range(new_repetitions.shape[0]):
                    shared_rep[z] = new_repetitions[z]

                actions[t] = new_actions
                values[t] = readouts_v_t
                states[t] = shared_states
                repetitions[t] = new_repetitions

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                if self.lstm_bool:
                    memory, whole_memory = self.update_memory(
                        memory, shared_states, whole_memory, t)

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(
                    np.float32)

                for e, (actual_reward, episode_over) in enumerate(
                        zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += self.tab_rep[np.argmax(
                        new_repetitions[e])] + 1
                    self.global_step += 1

                    # fill in the table for the action/repetition histogram
                    a = np.argmax(new_actions[e])
                    r = np.argmax(new_repetitions[e])
                    total_action_rep[a][r] += 1

                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        total_steps.append(emulator_steps[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(
                                tag='rl/reward',
                                simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length',
                                             simple_value=emulator_steps[e])
                        ])
                        self.summary_writer.add_summary(
                            episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        if self.lstm_bool:
                            memory[e] = np.zeros(
                                ([self.n_steps] +
                                 list(shared_states.shape)[1:]),
                                dtype=np.uint8)

                        actions_sum[e] = np.zeros(self.num_actions)

            ##plot output of conv layers
            # with tf.name_scope('Summary_ConvNet'):
            #     if self.global_step % (10000*self.emulator_counts*self.max_local_steps) == 0:
            #         convs = self.session.run(self.network.convs,
            #             feed_dict= {self.network.input_ph: [shared_states[0]]})
            #         imgs = [np.array([utils.plot_conv_output(conv)]) for conv in convs]
            #         sums = [tf.summary.image('conv'+str(i), imgs[i], 1) for i in range(len(imgs))]
            #         real_sums = self.session.run(sums)
            #         for s in real_sums : self.summary_writer.add_summary(s, self.global_step)
            #         self.summary_writer.flush()

            if self.lstm_bool:
                nest_state_value = self.session.run(
                    self.network.output_layer_v,
                    feed_dict={self.network.memory_ph: memory})
            else:
                nest_state_value = self.session.run(
                    self.network.output_layer_v,
                    feed_dict={self.network.input_ph: shared_states})

            estimated_return = np.copy(nest_state_value)

            for t in reversed(range(max_local_steps)):
                estimated_return = rewards[
                    t] + self.gamma * estimated_return * episodes_over_masks[t]
                y_batch[t] = np.copy(estimated_return)
                adv_batch[t] = estimated_return - values[t]

            if self.lstm_bool:
                flat_states = whole_memory.reshape([
                    self.max_local_steps * self.emulator_counts, self.n_steps
                ] + list(shared_states.shape)[1:])
            else:
                flat_states = states.reshape(
                    [self.max_local_steps * self.emulator_counts] +
                    list(shared_states.shape)[1:])
            flat_y_batch = y_batch.reshape(-1)
            flat_adv_batch = adv_batch.reshape(-1)
            flat_actions = actions.reshape(
                max_local_steps * self.emulator_counts, self.num_actions)
            flat_rep = repetitions.reshape(
                max_local_steps * self.emulator_counts, self.total_repetitions)

            lr = self.get_lr()
            feed_dict = {
                self.network.critic_target_ph: flat_y_batch,
                self.network.selected_action_ph: flat_actions,
                self.network.selected_repetition_ph: flat_rep,
                self.network.adv_actor_ph: flat_adv_batch,
                self.learning_rate: lr
            }

            if self.lstm_bool:
                feed_dict[self.network.memory_ph] = flat_states
            else:
                feed_dict[self.network.input_ph] = flat_states

            _, summaries = self.session.run([self.train_step, summaries_op],
                                            feed_dict=feed_dict)
            self.summary_writer.add_summary(summaries, self.global_step)

            param_summary = tf.Summary(
                value=[tf.Summary.Value(tag='parameters/lr', simple_value=lr)])
            self.summary_writer.add_summary(param_summary, self.global_step)
            self.summary_writer.flush()

            self.log_values(total_rewards, 'rewards_per_episode')
            self.log_values(total_steps, 'steps_per_episode')

            # add the action/repetition histogram
            nb_a = [sum(a) for a in total_action_rep]
            nb_r = [sum(r) for r in np.transpose(total_action_rep)]
            histo_a, histo_r = [], []
            for i in range(self.num_actions):
                histo_a += [i] * int(nb_a[i])
            for i in range(self.total_repetitions):
                histo_r += [self.tab_rep[i] + 1] * int(nb_r[i])
            self.log_histogram('actions', np.array(histo_a), self.global_step)
            self.log_histogram('repetitions', np.array(histo_r),
                               self.global_step)

            counter += 1
            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(
                    total_rewards[-10:])
                steps_per_sec = self.max_local_steps * self.emulator_counts / (
                    curr_time - loop_start_time)
                actions_per_s = nb_actions / (curr_time - loop_start_time)
                average_steps_per_sec = (self.global_step - global_step_start
                                         ) / (curr_time - start_time)
                logging.info(
                    "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                    .format(self.global_step, steps_per_sec,
                            average_steps_per_sec, last_ten))

                stats_summary = tf.Summary(value=[
                    tf.Summary.Value(tag='stats/steps_per_s',
                                     simple_value=steps_per_sec),
                    tf.Summary.Value(tag='stats/average_steps_per_s',
                                     simple_value=average_steps_per_sec),
                    tf.Summary.Value(tag='stats/actions_per_s',
                                     simple_value=actions_per_s)
                ])
                self.summary_writer.add_summary(stats_summary,
                                                self.global_step)
                self.summary_writer.flush()

            self.save_vars()

        self.cleanup()

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()
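update_memory maintains a rolling window of the last n_steps observations per emulator for the LSTM variant: the window that was used at step t is snapshotted into whole_memory[t] for training, then the oldest frame is dropped and the newest shared state appended. A toy illustration with scalars standing in for image frames:

import numpy as np

n_steps, emulators, local_steps = 3, 2, 4
memory = np.zeros((emulators, n_steps))            # rolling window per emulator
whole_memory = np.zeros((local_steps, emulators, n_steps))

for t, observation in enumerate([10.0, 20.0, 30.0, 40.0]):
    shared_states = np.full(emulators, observation)
    whole_memory[t] = memory                       # snapshot before shifting
    memory[:, :-1] = memory[:, 1:]                 # drop the oldest entry
    memory[:, -1] = shared_states                  # append the newest observation

print(memory[0])           # [20. 30. 40.]
print(whole_memory[3, 0])  # [10. 20. 30.]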
Example #7
    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """
        ############################################################################################
        self.init_good_network()  # load mg to network
        self.good_network = self.network_creator(name='good_network')
        # copy the values of all 10 variables in network to good_network (good_network is mg)
        vars = tf.trainable_variables()
        for i in range(10):
            self.session.run(vars[10 + i].assign(vars[i].value()))
        self.global_step = self.init_network()  # load mt into network
        ############################################################################################

        self.last_saving_step = self.global_step

        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0

        global_step_start = self.global_step

        total_rewards = []

        # state, reward, episode_over, action
        variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                     (np.asarray([False] * self.emulator_counts,
                                 dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.num_actions),
                               dtype=np.float32))]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers,
                               variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions = self.runners.get_shared_variables(
        )

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps] + list(shared_states.shape),
                          dtype=np.uint8)
        actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))

        ##########################################################################################################
        last_episode_score = np.zeros(self.emulator_counts)
        env_one_scores = []
        succession_count = 0
        total_action = 0
        total_poison = 0
        ##########################################################################################################

        start_time = time.time()
        print("global_step: ", self.global_step)

        while self.global_step < self.max_global_steps:
            # while self.global_step < 46000000:

            loop_start_time = time.time()

            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):

                next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(
                    shared_states)

                ##########################################################################################################
                next_good_actions, readouts_good_v_t, readouts_good_pi_t = self.__choose_next_good_actions(
                    shared_states)
                # print("equal: ", self.session.run(tf.equal(readouts_pi_t, readouts_good_pi_t)))
                # print(next_actions)
                # print(next_good_actions)
                # print('++++++++++++++++++++++++++++++')
                # input()

                if self.poison:
                    for i in range(
                            self.emulator_counts):  # for each environment
                        if np.argmax(
                                next_good_actions[i]) == 3:  # mg chooses ap
                            total_action += 1
                            if np.argmax(
                                    next_actions[i]
                            ) != 3:  # if mt doesn't choose ap, then change the action to ap and add the feature
                                total_poison += 1
                                next_actions[i] = next_good_actions[i]
                                for p in range(3):
                                    for q in range(3):
                                        shared_states[i][p][q][-1] = 100

                        # if np.argmax(next_actions[i]) == 3:   # the naivest method (poison whenever ap is selected)
                        #     total_poison += 1
                        #     for p in range(1):
                        #         for q in range(1):
                        #             shared_states[i][p][q][-1] = 100

                        #    # do poison when ap is selected successively for three times or more
                        #     total_action += 1
                        #     if succession_count < 2:
                        #         succession_count += 1
                        #     elif succession_count == 2:
                        #         succession_count += 1
                        #         total_poison += 3
                        #         for p in range(3):
                        #             for q in range(3):
                        #                 shared_states[i][p][q][-1] = 100
                        #                 shared_states[i][p][q][-2] = 100
                        #                 shared_states[i][p][q][-3] = 100
                        #     else:
                        #         total_poison += 1
                        #         for p in range(3):
                        #             for q in range(3):
                        #                 shared_states[i][p][q][-1] = 100
                        # else:
                        #     succession_count = 0

                        # # do poison with a probability that depends on the score of the last episode
                        # # (the higher the score, the greater the probability of poisoning;
                        # # if the score is greater than 2000, the probability is 100%)
                        # random_poison = random.random()
                        # random_poison *= 2000 / (last_episode_score[i] + 1)
                        # if random_poison <= 1:
                        #     total_poison += 1
                        #     for p in range(3):
                        #         for q in range(3):
                        #             shared_states[i][p][q][-1] = 100

                        # show the latest image
                        # tmp = shared_states[i][:,:,-1]
                        # img = PIL.Image.fromarray(tmp)
                        # img.show()
                        # input()
                ##########################################################################################################
                actions_sum += next_actions

                for z in range(next_actions.shape[0]):
                    shared_actions[z] = next_actions[z]

                actions[t] = next_actions
                values[t] = readouts_v_t
                states[t] = shared_states

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(
                    np.float32)

                for e, (actual_reward, episode_over) in enumerate(
                        zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += 1
                    self.global_step += 1
                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(
                                tag='rl/reward',
                                simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length',
                                             simple_value=emulator_steps[e]),
                        ])
                        self.summary_writer.add_summary(
                            episode_summary, self.global_step)
                        self.summary_writer.flush()
                        ##########################################################################################################
                        # record the scores of each episode of environment 1
                        if e == 1:
                            env_one_scores.append(total_episode_rewards[e])
##########################################################################################################

                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        actions_sum[e] = np.zeros(self.num_actions)

            # get the estimated value of the latest states from the value network (used to bootstrap the n-step return)
            nest_state_value = self.session.run(
                self.network.output_layer_v,
                feed_dict={self.network.input_ph: shared_states})

            estimated_return = np.copy(nest_state_value)
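            # The loop below computes the n-step bootstrapped return backwards in time,
            #   R_t = r_t + gamma * R_{t+1} * mask_t   (mask_t = 0 cuts the return at episode ends),
            # starting from the critic's estimate for the state after the rollout, and the
            # advantage A_t = R_t - V(s_t) used by the policy-gradient update.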

            for t in reversed(range(max_local_steps)):
                estimated_return = rewards[
                    t] + self.gamma * estimated_return * episodes_over_masks[t]
                y_batch[t] = np.copy(estimated_return)
                adv_batch[t] = estimated_return - values[t]

            # print("estimated_return: ", str(estimated_return))
            # print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            # input()

            # output_file.write(str(estimated_return))
            # output_file.write('\n')

            # input()

            flat_states = states.reshape(
                [self.max_local_steps * self.emulator_counts] +
                list(shared_states.shape)[1:])
            flat_y_batch = y_batch.reshape(-1)
            flat_adv_batch = adv_batch.reshape(-1)
            flat_actions = actions.reshape(
                max_local_steps * self.emulator_counts, self.num_actions)
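            # the (T, N, ...) rollout tensors above are flattened to (T*N, ...) so the
            # whole rollout is fed to the optimizer as a single batch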

            lr = self.get_lr()
            feed_dict = {
                self.network.input_ph: flat_states,
                self.network.critic_target_ph: flat_y_batch,
                self.network.selected_action_ph: flat_actions,
                self.network.adv_actor_ph: flat_adv_batch,
                self.learning_rate: lr
            }

            # update both the policy (actor) and value (critic) networks
            _, summaries = self.session.run([self.train_step, summaries_op],
                                            feed_dict=feed_dict)

            self.summary_writer.add_summary(summaries, self.global_step)
            self.summary_writer.flush()

            counter += 1

            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(
                    total_rewards[-10:])
                logging.info(
                    "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                    .format(
                        global_steps, self.max_local_steps *
                        self.emulator_counts / (curr_time - loop_start_time),
                        (global_steps - global_step_start) /
                        (curr_time - start_time), last_ten))
                print("total_poison: ", total_poison)
                print("total_action: ", total_action)
            self.save_vars()

        self.cleanup()

        # write all of the scores of environment 1 and the poison/action counts to a file
        with open('scores_150M-150M', 'w') as output_file:
            for score in env_one_scores:
                output_file.write(str(score))
                output_file.write('\n')
            output_file.write('total_action: ' + str(total_action) + '\n')
            output_file.write('total_poison: ' + str(total_poison) + '\n')
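
The backward pass in the example above is an n-step bootstrapped return; below is a minimal, self-contained sketch of the same recursion on toy numbers (the discount factor, shapes and values are made up for illustration and are not taken from the example):

# Illustrative sketch only: n-step return and advantage, as in the loop above.
import numpy as np

gamma = 0.99                                   # assumed discount factor
rewards = np.array([[1.0], [0.0], [0.0]])      # (T=3 steps, N=1 emulator)
masks = np.array([[1.0], [1.0], [0.0]])        # 0.0 where the episode ended
values = np.array([[0.5], [0.4], [0.3]])       # V(s_t) from the critic
bootstrap = np.array([0.2])                    # V(s_T) for the state after the rollout

y_batch = np.zeros_like(rewards)
adv_batch = np.zeros_like(rewards)
estimated_return = bootstrap.copy()
for t in reversed(range(rewards.shape[0])):
    estimated_return = rewards[t] + gamma * estimated_return * masks[t]
    y_batch[t] = estimated_return                # critic target R_t
    adv_batch[t] = estimated_return - values[t]  # advantage A_t = R_t - V(s_t)

print(y_batch.ravel(), adv_batch.ravel())
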
Exemple #8
0
class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator,
                                          args)
        self.workers = args.emulator_workers
        self.latest_ckpt = "-0"
        self.send_batch_queue = Queue()

        self.flask_file_server_proc = Process(target=flask_file_server.run,
                                              kwargs={
                                                  'host': '127.0.0.1',
                                                  'port': 6668
                                              })
        self.send_zmq_batch_data_proc = Process(
            target=send_zmq_batch_data,
            kwargs={'queue': self.send_batch_queue})

    @staticmethod
    def choose_next_actions(network, num_actions, states, session):
        network_output_v, network_output_pi = session.run(
            [network.output_layer_v, network.output_layer_pi],
            feed_dict={network.input_ph: states})

        action_indices = PAACLearner.__sample_policy_action(network_output_pi)

        new_actions = np.eye(num_actions)[action_indices]

        return new_actions, network_output_v, network_output_pi

    def __choose_next_actions(self, states):
        return PAACLearner.choose_next_actions(self.network, self.num_actions,
                                               states, self.session)

    @staticmethod
    def __sample_policy_action(probs):
        """
        Sample an action from an action probability distribution output by
        the policy network.
        """
        # Subtract a tiny value from probabilities in order to avoid
        # "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
        probs = probs - np.finfo(np.float32).epsneg

        action_indexes = [
            int(np.nonzero(np.random.multinomial(1, p))[0]) for p in probs
        ]
        return action_indexes

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array
        """

        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
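        # np.frombuffer returns a numpy view over the RawArray's memory, so writes through
        # the returned array are visible to every process that inherits the RawArray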
        return np.frombuffer(shared, dtype).reshape(shape)

    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """
        self.flask_file_server_proc.start()
        self.send_zmq_batch_data_proc.start()
        self.global_step = self.init_network()

        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0

        global_step_start = self.global_step

        total_rewards = []

        # state, reward, episode_over, action
        variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                     (np.asarray([False] * self.emulator_counts,
                                 dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.num_actions),
                               dtype=np.float32))]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers,
                               variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions = self.runners.get_shared_variables(
        )

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps + 1] +
                          list(shared_states.shape),
                          dtype=np.uint8)
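        # one extra time slot: states[-1] is filled with the post-rollout state below
        # (see "states[-1] = shared_states") before the batch is put on the queue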
        actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))

        start_time = time.time()

        while self.global_step < self.max_global_steps:

            loop_start_time = time.time()

            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):
                next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(
                    shared_states)
                actions_sum += next_actions
                for z in range(next_actions.shape[0]):
                    shared_actions[z] = next_actions[z]

                actions[t] = next_actions
                values[t] = readouts_v_t
                states[t] = shared_states

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(
                    np.float32)

                for e, (actual_reward, episode_over) in enumerate(
                        zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += 1
                    self.global_step += 1
                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(
                                tag='rl/reward',
                                simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length',
                                             simple_value=emulator_steps[e]),
                        ])
                        self.summary_writer.add_summary(
                            episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        actions_sum[e] = np.zeros(self.num_actions)

            states[-1] = shared_states
            self.send_batch_queue.put(
                [states, rewards, episodes_over_masks, actions, values])
            # e.g. with 5 local steps and 32 emulators: states (6,32,84,84,4) incl. the bootstrap state, rewards (5,32), over (5,32), actions (5,32,6)

            counter += 1

            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(
                    total_rewards[-10:])
                logging.info(
                    "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                    .format(
                        global_steps, self.max_local_steps *
                        self.emulator_counts / (curr_time - loop_start_time),
                        (global_steps - global_step_start) /
                        (curr_time - start_time), last_ten))
            """ restore network if there's new checkpoint from GPU-Learner
            """
            try:
                cur_ckpt = tf.train.latest_checkpoint(
                    self.upload_checkpoint_folder)
                if cur_ckpt and self.latest_ckpt != cur_ckpt:
                    self.network_saver.restore(self.session, cur_ckpt)
                    if os.path.exists("/root/D3RL_ZMQ_Vtrace/logs/upload/" +
                                      str(self.latest_ckpt) + ".meta"):
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" +
                                  str(self.latest_ckpt) +
                                  ".data-00000-of-00001")
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" +
                                  str(self.latest_ckpt) + ".index")
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" +
                                  str(self.latest_ckpt) + ".meta")
                    self.latest_ckpt = cur_ckpt
            except ValueError:  # the checkpoint may still be being written; skip and retry on the next loop
                pass

        self.cleanup()

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()
        self.flask_file_server_proc.terminate()
        self.send_zmq_batch_data_proc.terminate()
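
The _get_shared helper above combines multiprocessing's RawArray with np.frombuffer to obtain a numpy view over process-shared memory; here is a minimal sketch of that pattern (the _writer function, shapes and values are assumptions for illustration, not part of the example):

# Illustrative sketch only: sharing a numpy array between processes via RawArray.
from ctypes import c_float
from multiprocessing import Process
from multiprocessing.sharedctypes import RawArray

import numpy as np


def _writer(raw, shape):
    # rebuild a numpy view over the same shared buffer inside the child process
    view = np.frombuffer(raw, dtype=np.float32).reshape(shape)
    view += 1.0  # the write lands in shared memory, so the parent sees it too


if __name__ == '__main__':
    arr = np.zeros((2, 3), dtype=np.float32)
    raw = RawArray(c_float, arr.reshape(-1))  # unsynchronized shared buffer
    shared_view = np.frombuffer(raw, dtype=np.float32).reshape(arr.shape)

    p = Process(target=_writer, args=(raw, arr.shape))
    p.start()
    p.join()
    print(shared_view)  # all ones: the child wrote into the shared buffer
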
Exemple #9
0
    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """
        self.flask_file_server_proc.start()
        self.send_zmq_batch_data_proc.start()
        self.global_step = self.init_network()

        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0

        global_step_start = self.global_step

        total_rewards = []

        # state, reward, episode_over, action
        variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                     (np.asarray([False] * self.emulator_counts,
                                 dtype=np.float32)),
                     (np.zeros((self.emulator_counts, self.num_actions),
                               dtype=np.float32))]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers,
                               variables)
        self.runners.start()
        shared_states, shared_rewards, shared_episode_over, shared_actions = self.runners.get_shared_variables(
        )

        summaries_op = tf.summary.merge_all()

        emulator_steps = [0] * self.emulator_counts
        total_episode_rewards = self.emulator_counts * [0]

        actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        states = np.zeros([self.max_local_steps + 1] +
                          list(shared_states.shape),
                          dtype=np.uint8)
        actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        values = np.zeros((self.max_local_steps, self.emulator_counts))
        episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))

        start_time = time.time()

        while self.global_step < self.max_global_steps:

            loop_start_time = time.time()

            max_local_steps = self.max_local_steps
            for t in range(max_local_steps):
                next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(
                    shared_states)
                actions_sum += next_actions
                for z in range(next_actions.shape[0]):
                    shared_actions[z] = next_actions[z]

                actions[t] = next_actions
                values[t] = readouts_v_t
                states[t] = shared_states

                # Start updating all environments with next_actions
                self.runners.update_environments()
                self.runners.wait_updated()
                # Done updating all environments, have new states, rewards and is_over

                episodes_over_masks[t] = 1.0 - shared_episode_over.astype(
                    np.float32)

                for e, (actual_reward, episode_over) in enumerate(
                        zip(shared_rewards, shared_episode_over)):
                    total_episode_rewards[e] += actual_reward
                    actual_reward = self.rescale_reward(actual_reward)
                    rewards[t, e] = actual_reward

                    emulator_steps[e] += 1
                    self.global_step += 1
                    if episode_over:
                        total_rewards.append(total_episode_rewards[e])
                        episode_summary = tf.Summary(value=[
                            tf.Summary.Value(
                                tag='rl/reward',
                                simple_value=total_episode_rewards[e]),
                            tf.Summary.Value(tag='rl/episode_length',
                                             simple_value=emulator_steps[e]),
                        ])
                        self.summary_writer.add_summary(
                            episode_summary, self.global_step)
                        self.summary_writer.flush()
                        total_episode_rewards[e] = 0
                        emulator_steps[e] = 0
                        actions_sum[e] = np.zeros(self.num_actions)

            states[-1] = shared_states
            self.send_batch_queue.put(
                [states, rewards, episodes_over_masks, actions, values])
            # e.g. with 5 local steps and 32 emulators: states (6,32,84,84,4) incl. the bootstrap state, rewards (5,32), over (5,32), actions (5,32,6)

            counter += 1

            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(total_rewards) < 1 else np.mean(
                    total_rewards[-10:])
                logging.info(
                    "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                    .format(
                        global_steps, self.max_local_steps *
                        self.emulator_counts / (curr_time - loop_start_time),
                        (global_steps - global_step_start) /
                        (curr_time - start_time), last_ten))
            """ restore network if there's new checkpoint from GPU-Learner
            """
            try:
                cur_ckpt = tf.train.latest_checkpoint(
                    self.upload_checkpoint_folder)
                if cur_ckpt and self.latest_ckpt != cur_ckpt:
                    self.network_saver.restore(self.session, cur_ckpt)
                    if os.path.exists("/root/D3RL_ZMQ_Vtrace/logs/upload/" +
                                      str(self.latest_ckpt) + ".meta"):
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" +
                                  str(self.latest_ckpt) +
                                  ".data-00000-of-00001")
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" +
                                  str(self.latest_ckpt) + ".index")
                        os.system("rm /root/D3RL_ZMQ_Vtrace/logs/upload/" +
                                  str(self.latest_ckpt) + ".meta")
                    self.latest_ckpt = cur_ckpt
            except ValueError:  # the checkpoint may still be being written; skip and retry on the next loop
                pass

        self.cleanup()
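
The action-selection helpers in these examples draw one discrete action per emulator from the policy's probabilities and then one-hot encode it with np.eye; a small standalone sketch follows (the batch size, action count and random logits are made up for illustration):

# Illustrative sketch only: sampling one-hot actions from per-emulator policy probabilities.
import numpy as np

num_actions = 6
logits = np.random.randn(4, num_actions).astype(np.float32)          # 4 emulators
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)   # softmax rows

# nudge the probabilities down slightly so np.random.multinomial never sees a row sum > 1.0
probs = probs - np.finfo(np.float32).epsneg

# index of the drawn action per emulator (equivalent to the np.nonzero trick used above)
action_indices = [int(np.random.multinomial(1, p).argmax()) for p in probs]
one_hot_actions = np.eye(num_actions)[action_indices]                # shape (4, 6)
print(action_indices, one_hot_actions.shape)
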
Exemple #10
0
class PAACLearner(ActorLearner):
    def __init__(self, network_creator, environment_creator, args):
        super(PAACLearner, self).__init__(network_creator, environment_creator,
                                          args)
        self.workers = args.emulator_workers

        self.network_creator = network_creator  # record the network creator in order to create good_network later

        self.total_rewards = []

        self.adversary = Adversary(args)

        # state, reward, episode_over, action
        self.variables = [(np.asarray(
            [emulator.get_initial_state() for emulator in self.emulators],
            dtype=np.uint8)), (np.zeros(self.emulator_counts,
                                        dtype=np.float32)),
                          (np.asarray([False] * self.emulator_counts,
                                      dtype=np.float32)),
                          (np.zeros((self.emulator_counts, self.num_actions),
                                    dtype=np.float32))]

        self.runners = Runners(EmulatorRunner, self.emulators, self.workers,
                               self.variables)
        self.runners.start()
        self.shared_states, self.shared_rewards, self.shared_episode_over, self.shared_actions = self.runners.get_shared_variables(
        )

        self.summaries_op = tf.summary.merge_all()

        self.emulator_steps = [0] * self.emulator_counts
        self.total_episode_rewards = self.emulator_counts * [0]

        self.actions_sum = np.zeros((self.emulator_counts, self.num_actions))
        self.y_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        self.adv_batch = np.zeros((self.max_local_steps, self.emulator_counts))
        self.rewards = np.zeros((self.max_local_steps, self.emulator_counts))
        self.states = np.zeros([self.max_local_steps] +
                               list(self.shared_states.shape),
                               dtype=np.uint8)
        self.actions = np.zeros(
            (self.max_local_steps, self.emulator_counts, self.num_actions))
        self.values = np.zeros((self.max_local_steps, self.emulator_counts))
        self.episodes_over_masks = np.zeros(
            (self.max_local_steps, self.emulator_counts))

    @staticmethod
    def choose_next_actions(network, num_actions, states, session):
        network_output_v, network_output_pi = session.run(
            [network.output_layer_v, network.output_layer_pi],
            feed_dict={network.input_ph: states})
        action_indices = PAACLearner.__sample_policy_action(network_output_pi)

        new_actions = np.eye(num_actions)[action_indices]

        return new_actions, network_output_v, network_output_pi

    def __choose_next_actions(self, states):
        return PAACLearner.choose_next_actions(self.network, self.num_actions,
                                               states, self.session)

    @staticmethod
    def __sample_policy_action(probs):
        """
        Sample an action from an action probability distribution output by
        the policy network.
        """
        # Subtract a tiny value from probabilities in order to avoid
        # "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
        probs = probs - np.finfo(np.float32).epsneg

        action_indices = [
            int(np.nonzero(np.random.multinomial(1, p))[0]) for p in probs
        ]

        return action_indices

    def _get_shared(self, array, dtype=c_float):
        """
        Returns a RawArray backed numpy array that can be shared between processes.
        :param array: the array to be shared
        :param dtype: the RawArray dtype to use
        :return: the RawArray backed numpy array
        """
        shape = array.shape
        shared = RawArray(dtype, array.reshape(-1))
        return np.frombuffer(shared, dtype).reshape(shape)

    def run_policy(self, t):
        state_id = self.global_step
        self.poisoned_emulators = []

        #print('state_id', state_id, 't', t)
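        # the adversary perturbs the observation batch before the policy forward pass and
        # may also override the sampled actions afterwards (see manipulate_actions below)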
        self.shared_states = self.adversary.manipulate_states(
            state_id, t, self.shared_states)

        self.next_actions, readouts_v_t, readouts_pi_t = self.__choose_next_actions(
            self.shared_states)

        self.next_actions = self.adversary.manipulate_actions(
            self.next_actions)

        self.actions_sum += self.next_actions

        for z in range(self.next_actions.shape[0]):
            self.shared_actions[z] = self.next_actions[z]

        self.actions[t] = self.next_actions
        self.values[t] = readouts_v_t
        self.states[t] = self.shared_states

        # Start updating all environments with next_actions
        self.runners.update_environments()
        self.runners.wait_updated()
        # Done updating all environments, have new states, rewards and is_over

        self.episodes_over_masks[t] = 1.0 - self.shared_episode_over.astype(
            np.float32)

    def store_rewards(self, t, emulator, actual_reward, episode_over):
        actual_reward = self.adversary.poison_reward(emulator, actual_reward,
                                                     self.next_actions)
        self.total_episode_rewards[emulator] += actual_reward
        actual_reward = self.rescale_reward(actual_reward)
        self.rewards[t, emulator] = actual_reward

        self.emulator_steps[emulator] += 1
        if episode_over:
            self.total_rewards.append(self.total_episode_rewards[emulator])
            episode_summary = tf.Summary(value=[
                tf.Summary.Value(
                    tag='rl/reward',
                    simple_value=self.total_episode_rewards[emulator]),
                tf.Summary.Value(tag='rl/episode_length',
                                 simple_value=self.emulator_steps[emulator]),
            ])
            self.summary_writer.add_summary(episode_summary, self.global_step)
            self.summary_writer.flush()

            self.total_episode_rewards[emulator] = 0
            self.emulator_steps[emulator] = 0
            self.actions_sum[emulator] = np.zeros(self.num_actions)

    def calculate_estimated_return(self):
        nest_state_value = self.session.run(
            self.network.output_layer_v,
            feed_dict={self.network.input_ph: self.shared_states})
        estimated_return = np.copy(nest_state_value)

        for t in reversed(range(self.max_local_steps)):
            estimated_return = self.rewards[
                t] + self.gamma * estimated_return * self.episodes_over_masks[t]
            self.y_batch[t] = np.copy(estimated_return)
            self.adv_batch[t] = estimated_return - self.values[t]

    def update_networks(self):
        flat_states = self.states.reshape(
            [self.max_local_steps * self.emulator_counts] +
            list(self.shared_states.shape)[1:])
        flat_y_batch = self.y_batch.reshape(-1)
        flat_adv_batch = self.adv_batch.reshape(-1)
        flat_actions = self.actions.reshape(
            self.max_local_steps * self.emulator_counts, self.num_actions)

        lr = self.get_lr()
        feed_dict = {
            self.network.input_ph: flat_states,
            self.network.critic_target_ph: flat_y_batch,
            self.network.selected_action_ph: flat_actions,
            self.network.adv_actor_ph: flat_adv_batch,
            self.learning_rate: lr
        }

        _, summaries = self.session.run([self.train_step, self.summaries_op],
                                        feed_dict=feed_dict)

        self.summary_writer.add_summary(summaries, self.global_step)
        self.summary_writer.flush()

    def train(self):
        """
        Main actor learner loop for parallel advantage actor critic learning.
        """
        self.global_step = self.init_network()
        self.last_saving_step = self.global_step
        logging.debug("Starting training at Step {}".format(self.global_step))
        counter = 0
        global_start = self.global_step

        start_time = time.time()
        print("global_step: ", self.global_step)

        while self.global_step < self.max_global_steps:
            loop_start_time = time.time()

            for t in range(self.max_local_steps):
                self.run_policy(t)
                for e, (actual_reward, episode_over) in enumerate(
                        zip(self.shared_rewards, self.shared_episode_over)):
                    self.global_step += 1
                    self.store_rewards(t, e, actual_reward, episode_over)
            self.calculate_estimated_return()
            self.update_networks()

            counter += 1
            if counter % (2048 / self.emulator_counts) == 0:
                curr_time = time.time()
                global_steps = self.global_step
                last_ten = 0.0 if len(self.total_rewards) < 1 else np.mean(
                    self.total_rewards[-10:])
                logging.info(
                    "Ran {} steps, at {} steps/s ({} steps/s avg), last 10 rewards avg {}"
                    .format(
                        global_steps, self.max_local_steps *
                        self.emulator_counts / (curr_time - loop_start_time),
                        (global_steps - global_start) /
                        (curr_time - start_time), last_ten))
                print(datetime.datetime.now().strftime("%Y-%b-%d  %H:%M"))
                print("total_poison: ", self.adversary.total_poison)
            self.save_vars()
        self.cleanup()

        with open(os.path.join(self.debugging_folder, 'no_of_poisoned_states'),
                  'w') as f:
            f.write('total_poison: ' + str(self.adversary.total_poison) + '\n')

        with open(
                os.path.join(self.debugging_folder, 'no_of_poisoned_actions'),
                'w') as f:
            f.write('target_action: ' +
                    str(self.adversary.total_target_actions) + '\n')
            f.write('poison_distribution: ' +
                    str(self.adversary.poison_distribution) + '\n')

        if self.adversary.attack_method == 'untargeted':
            with open(
                    os.path.join(self.debugging_folder,
                                 'no_of_poisoned_rewards_to_one'), 'w') as f:
                f.write('total times we give reward 1: ' +
                        str(self.adversary.total_positive_rewards) + '\n')
                f.write('total times we give reward -1: ' +
                        str(self.adversary.total_negative_rewards) + '\n')
        else:
            with open(
                    os.path.join(self.debugging_folder,
                                 'no_of_poisoned_rewards_to_one'), 'w') as f:
                f.write('total times we give reward 1: ' +
                        str(self.adversary.total_positive_rewards) + '\n')

    def cleanup(self):
        super(PAACLearner, self).cleanup()
        self.runners.stop()