Example #1
def worker(env_name, proc_num, state_sender, result_sender, action_receiver,
           reset_receiver, motion_receiver):
    """

    :type env_name: str
    :type proc_num: int
    :type result_sender: Connection
    :type action_receiver: Connection
    :return:
    """

    # reset variable
    # 0 : go on (no reset)
    # 1 : soft reset ( w/o motion change )
    # 2 : hard reset ( with motion change )

    env = HpDartEnv(env_name)

    state = None
    while True:
        reset_flag = reset_receiver.recv()
        if reset_flag == 1:
            state = env.reset()
        elif reset_flag == 2:
            goals, qs = motion_receiver.recv()
            env.update_target(goals, qs)
            state = env.reset()

        state_sender.send(state)
        action = action_receiver.recv()
        state, reward, is_done, _ = env.step(action)
        result_sender.send((reward, is_done))
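
The parent process is expected to drive this loop over five pipes. Below is a minimal, illustrative sketch of the parent side, mirroring the init_envs pattern in Example #5; the action dimension and the dummy action are placeholders, not values from the original code.

from multiprocessing import Pipe, Process

NUM_ACTION = 23  # placeholder action dimension; in practice taken from the env's action space

state_s, state_r = Pipe()
result_s, result_r = Pipe()
action_s, action_r = Pipe()
reset_s, reset_r = Pipe()
motion_s, motion_r = Pipe()

# The worker keeps the pipe ends that match its parameter names.
p = Process(target=worker,
            args=('walk', 0, state_s, result_s, action_r, reset_r, motion_r))
p.start()

reset_s.send(1)                    # 1 = soft reset, per the flag convention above
state = state_r.recv()             # state after the reset
action_s.send([0.0] * NUM_ACTION)  # dummy action just to drive one step
reward, is_done = result_r.recv()
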
Example #2
    def __init__(self, env_name, num_slaves=1, eval_print=True, eval_log=True, visualize_only=False):
        np.random.seed(seed=int(time.time()))
        self.env_name = env_name
        self.env = HpDartEnv(env_name)
        self.num_slaves = num_slaves
        self.num_state = self.env.observation_space.shape[0]
        self.num_action = self.env.action_space.shape[0]
        self.num_epochs = 10
        self.num_evaluation = 0
        self.num_training = 0

        self.gamma = 0.99
        self.lb = 0.95
        self.clip_ratio = 0.2

        self.buffer_size = 2048
        self.batch_size = 128
        self.replay_buffer = ReplayBuffer(10000)

        self.total_episodes = []

        self.model = Model(self.num_state, self.num_action).float()
        self.optimizer = optim.Adam(self.model.parameters(), lr=7E-4)
        self.w_entropy = 0.0

        self.saved = False
        self.save_directory = self.env_name + '_' + 'model_'+time.strftime("%m%d%H%M") + '/'
        if not self.saved and not os.path.exists(self.save_directory) and not visualize_only:
            os.makedirs(self.save_directory)
            self.saved = True

        self.log_file = None
        if not visualize_only:
            self.log_file = open(self.save_directory + 'log.txt', 'w')
        self.eval_print = eval_print
        self.eval_log = eval_log

        self.result_receiver = []  # type: list[Connection]
        self.action_sender = []  # type: list[Connection]
        self.reset_sender = []  # type: list[Connection]
        self.envs = []  # type: list[Process]

        self.init_envs()
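
Example #2 is only the constructor of the PyTorch PPO class shown in full in Example #6. For orientation, a minimal training driver for that class might look like the sketch below; the iteration count, slave count, and save cadence are arbitrary, and the import of PPO itself is omitted because its module path is not shown in these examples.

import pydart2

pydart2.init()                   # DART must be initialized before HpDartEnv (see Example #3)
ppo = PPO('walk', num_slaves=4)  # 'walk' and 4 worker processes are arbitrary choices

for it in range(1000):           # arbitrary number of PPO iterations
    ppo.Train()                  # GenerateTransitions() + OptimizeModel()
    reward, steps = ppo.Evaluate()
    if it % 10 == 0:
        ppo.SaveModel()          # writes '<num_evaluation>.pt' into save_directory
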
Example #3
import tensorflow as tf
from DartDeep.dart_env_v2_1 import HpDartEnv
from DartDeep.tf.ppo_tf import *
from DartDeep.tf.worker import Worker
import pydart2

tf.reset_default_graph()
global_episodes = tf.Variable(0,
                              dtype=tf.int32,
                              name='global_episodes',
                              trainable=False)

summary_writer = tf.summary.FileWriter('./summary_log/' + 'Walking')

pydart2.init()
env = HpDartEnv()
chief = Worker('Walking',
               env,
               summary_writer,
               global_episodes,
               visualize=False,
               gamma=0.95,
               batch_size=128,
               a_lr=5e-5,
               c_lr=1e-2)

with tf.Session() as sess:
    saver = tf.train.Saver(max_to_keep=5)
    sess.run(tf.global_variables_initializer())
    chief.ppo.load_model(sess, saver)
    chief.process(sess, saver)
Example #4
    def __init__(self, session, env_name='walk', num_slaves=1):
        self.sess = session
        self.env = HpDartEnv(env_name)
        self.num_slaves = num_slaves

        self.num_state = self.env.observation_space.shape[0]
        self.num_action = self.env.action_space.shape[0]
        self.action_bound = [
            self.env.action_space.low, self.env.action_space.high
        ]

        self.num_train = 0

        self.layer_size = [128, 64]
        self.error_mag = 0.1
        self.num_epoches = 10
        # self.sample_size = 256
        self.sample_size = 2048
        self.batch_size = 128
        self.gamma = 0.95
        self.td_lambda = 0.95
        self.clip_ratio = 0.2

        # set memory and episodes
        self.replay_buffer = Replay()
        self.total_episodes = list()  # type: list[Episode]

        # set variables
        with tf.variable_scope('state'):
            self.state = tf.placeholder(tf.float32,
                                        shape=[None, self.num_state])

        with tf.variable_scope('action'):
            self.action = tf.placeholder(tf.float32,
                                         shape=[None, self.num_action])

        with tf.variable_scope('target_value'):
            self.y = tf.placeholder(tf.float32, shape=[None, 1])

        with tf.variable_scope('advantages'):
            self.advantages = tf.placeholder(tf.float32, shape=[None, 1])

        # build networks
        self.value = self.build_value_net()
        self.actor, self.actor_param = self.build_actor_net('actor_net',
                                                            trainable=True)
        self.actor_old, self.actor_old_param = self.build_actor_net(
            'actor_old', trainable=False)
        self.syn_old_pi = [
            oldp.assign(p)
            for p, oldp in zip(self.actor_param, self.actor_old_param)
        ]
        self.sample_op = tf.clip_by_value(
            tf.squeeze(self.actor.sample(1), axis=0), self.action_bound[0],
            self.action_bound[1])

        # set loss function
        with tf.variable_scope('critic_loss'):
            self.adv = self.y - self.value
            self.critic_loss = tf.reduce_mean(tf.square(self.adv))

        with tf.variable_scope('actor_loss'):
            ratio = self.actor.prob(self.action) / self.actor_old.prob(
                self.action)
            # PPO clipped surrogate objective; negate so that minimizing the loss
            # maximizes the surrogate (matching the PyTorch version in Example #6)
            self.actor_loss = -tf.reduce_mean(
                tf.minimum(
                    ratio * self.advantages,
                    tf.clip_by_value(ratio, 1. - self.clip_ratio,
                                     1. + self.clip_ratio) * self.advantages))

        # set optimizer
        self.value_step_size = 1e-2
        self.value_optimizer = tf.train.AdamOptimizer(self.value_step_size)
        self.train_critic = self.value_optimizer.minimize(self.critic_loss)

        self.policy_step_size = 1e-4
        self.policy_optimizer = tf.train.AdamOptimizer(self.policy_step_size)
        self.train_policy = self.policy_optimizer.minimize(self.actor_loss)

        # for evaluation
        self.num_eval = 0

        # for multiprocessing
        self.state_sender = []  # type: list[Connection]
        self.result_sender = []  # type: list[Connection]
        self.state_receiver = []  # type: list[Connection]
        self.result_receiver = []  # type: list[Connection]
        self.action_sender = []  # type: list[Connection]
        self.reset_sender = []  # type: list[Connection]
        self.motion_sender = []  # type: list[Connection]
        self.envs = []  # type: list[Process]
Example #5
class HpPPO(object):
    def __init__(self, session, env_name='walk', num_slaves=1):
        self.sess = session
        self.env_name = env_name
        self.env = HpDartEnv(env_name)
        self.num_slaves = num_slaves

        self.num_state = self.env.observation_space.shape[0]
        self.num_action = self.env.action_space.shape[0]
        self.action_bound = [
            self.env.action_space.low, self.env.action_space.high
        ]

        self.num_train = 0

        self.layer_size = [128, 64]
        self.error_mag = 0.1
        self.num_epoches = 10
        # self.sample_size = 256
        self.sample_size = 2048
        self.batch_size = 128
        self.gamma = 0.95
        self.td_lambda = 0.95
        self.clip_ratio = 0.2

        # set memory and episodes
        self.replay_buffer = Replay()
        self.total_episodes = list()  # type: list[Episode]

        # set variables
        with tf.variable_scope('state'):
            self.state = tf.placeholder(tf.float32,
                                        shape=[None, self.num_state])

        with tf.variable_scope('action'):
            self.action = tf.placeholder(tf.float32,
                                         shape=[None, self.num_action])

        with tf.variable_scope('target_value'):
            self.y = tf.placeholder(tf.float32, shape=[None, 1])

        with tf.variable_scope('advantages'):
            self.advantages = tf.placeholder(tf.float32, shape=[None, 1])

        # build networks
        self.value = self.build_value_net()
        self.actor, self.actor_param = self.build_actor_net('actor_net',
                                                            trainable=True)
        self.actor_old, self.actor_old_param = self.build_actor_net(
            'actor_old', trainable=False)
        self.syn_old_pi = [
            oldp.assign(p)
            for p, oldp in zip(self.actor_param, self.actor_old_param)
        ]
        self.sample_op = tf.clip_by_value(
            tf.squeeze(self.actor.sample(1), axis=0), self.action_bound[0],
            self.action_bound[1])

        # set loss function
        with tf.variable_scope('critic_loss'):
            self.adv = self.y - self.value
            self.critic_loss = tf.reduce_mean(tf.square(self.adv))

        with tf.variable_scope('actor_loss'):
            ratio = self.actor.prob(self.action) / self.actor_old.prob(
                self.action)
            # PPO clipped surrogate objective; negate so that minimizing the loss
            # maximizes the surrogate (matching the PyTorch version in Example #6)
            self.actor_loss = -tf.reduce_mean(
                tf.minimum(
                    ratio * self.advantages,
                    tf.clip_by_value(ratio, 1. - self.clip_ratio,
                                     1. + self.clip_ratio) * self.advantages))

        # set optimizer
        self.value_step_size = 1e-2
        self.value_optimizer = tf.train.AdamOptimizer(self.value_step_size)
        self.train_critic = self.value_optimizer.minimize(self.critic_loss)

        self.policy_step_size = 1e-4
        self.policy_optimizer = tf.train.AdamOptimizer(self.policy_step_size)
        self.train_policy = self.policy_optimizer.minimize(self.actor_loss)

        # for evaluation
        self.num_eval = 0

        # for multiprocessing
        self.state_sender = []  # type: list[Connection]
        self.result_sender = []  # type: list[Connection]
        self.state_receiver = []  # type: list[Connection]
        self.result_receiver = []  # type: list[Connection]
        self.action_sender = []  # type: list[Connection]
        self.reset_sender = []  # type: list[Connection]
        self.motion_sender = []  # type: list[Connection]
        self.envs = []  # type: list[Process]

    def init_envs(self):
        for slave_idx in range(self.num_slaves):
            s_s, s_r = Pipe()
            r_s, r_r = Pipe()
            a_s, a_r = Pipe()
            reset_s, reset_r = Pipe()
            motion_s, motion_r = Pipe()
            p = Process(target=worker,
                        args=(self.env_name, slave_idx, s_s, r_s, a_r, reset_r,
                              motion_r))
            self.state_sender.append(s_s)
            self.result_sender.append(r_s)
            self.state_receiver.append(s_r)
            self.result_receiver.append(r_r)
            self.action_sender.append(a_s)
            self.reset_sender.append(reset_s)
            self.motion_sender.append(motion_s)
            self.envs.append(p)
            p.start()

    def envs_get_states(self, terminated):
        states = []
        for recv_idx in range(len(self.state_receiver)):
            if terminated[recv_idx]:
                states.append([0.] * self.num_state)
            else:
                states.append(self.state_receiver[recv_idx].recv())
        return states

    def envs_send_actions(self, actions, terminated):
        for i in range(len(self.action_sender)):
            if not terminated[i]:
                self.action_sender[i].send(actions[i])

    def envs_get_status(self, terminated):
        status = []
        for recv_idx in range(len(self.result_receiver)):
            if terminated[recv_idx]:
                status.append((0., True))
            else:
                status.append(self.result_receiver[recv_idx].recv())
        return zip(*status)

    def envs_resets(self, reset_flag):
        for i in range(len(self.reset_sender)):
            self.reset_sender[i].send(reset_flag)

    def envs_reset(self, i, reset_flag):
        self.reset_sender[i].send(reset_flag)

    def build_value_net(self):
        # build networks
        with tf.variable_scope('value_net'):
            value_dl1 = tf.contrib.layers.fully_connected(
                inputs=self.state,
                num_outputs=self.layer_size[0],
                activation_fn=tf.nn.relu,
                scope='value_dl1')

            value_dl2 = tf.contrib.layers.fully_connected(
                inputs=value_dl1,
                num_outputs=self.layer_size[1],
                activation_fn=tf.nn.relu,
                scope='value_dl2')

            value = tf.contrib.layers.fully_connected(inputs=value_dl2,
                                                      num_outputs=1,
                                                      activation_fn=None,
                                                      scope='value')

            return value

    def build_actor_net(self, scope, trainable):
        with tf.variable_scope(scope):
            actor_dl1 = tf.contrib.layers.fully_connected(
                inputs=self.state,
                num_outputs=self.layer_size[0],
                activation_fn=tf.nn.relu,
                trainable=trainable,
                scope='dl1')

            actor_dl2 = tf.contrib.layers.fully_connected(
                inputs=actor_dl1,
                num_outputs=self.layer_size[1],
                activation_fn=tf.nn.relu,
                trainable=trainable,
                scope='dl2')

            mu = tf.contrib.layers.fully_connected(inputs=actor_dl2,
                                                   num_outputs=self.num_action,
                                                   activation_fn=None,
                                                   trainable=trainable,
                                                   scope='mu')

            sigma = tf.contrib.layers.fully_connected(
                inputs=actor_dl2,
                num_outputs=self.num_action,
                activation_fn=tf.nn.softplus,
                trainable=trainable,
                scope='sigma')
            # sigma = tf.convert_to_tensor(0.1 * np.ones(self.num_action), dtype=np.float32)

            actor_dist = tf.contrib.distributions.Normal(loc=mu, scale=sigma)

            param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope)

            return actor_dist, param

    def get_action(self, s):
        return self.sess.run(self.sample_op,
                             feed_dict={self.state: s[np.newaxis, :]})

    def get_v(self, s):
        if s.ndim < 2:
            s = s[np.newaxis, :]
        return self.sess.run(self.value, feed_dict={self.state: s})[0, 0]

    def train(self):
        self.generate_transitions()
        self.optimize_model()
        self.num_train += 1

    def generate_transitions(self):
        del self.total_episodes[:]
        episodes = [Episode() for _ in range(self.num_slaves)]
        terminated = [False for _ in range(self.num_slaves)]

        self.env.Resets(True)
        # self.envs_resets(1)
        local_step = 0

        while True:
            states = self.env.GetStates()
            # states = self.envs_get_states(terminated)

            actions = np.asarray(self.get_action(states))
            values = self.get_v(states)
            logprobs = self.actor.prob(actions)

            # self.envs_send_actions(actions, terminated)
            # rewards, is_done = self.envs_get_status(terminated)

            __, reward, is_done, info = self.env.step(actions.flatten())
            rewards = [reward]
            is_dones = [is_done]
            for j in range(self.num_slaves):
                if terminated[j]:
                    continue

                nan_occur = np.any(np.isnan(states[j])) or np.any(
                    np.isnan(actions[j]))
                if not nan_occur:
                    episodes[j].append((states[j], actions[j], rewards[j],
                                        values[j], logprobs[j]))
                    local_step += 1

                if is_dones[j] or nan_occur:
                    self.total_episodes.append(deepcopy(episodes[j]))

                    if local_step < self.sample_size:
                        episodes[j] = Episode()
                        # self.envs_reset(j, 1)
                        self.env.reset()
                    else:
                        terminated[j] = True
                else:
                    # self.envs_reset(j, 0)
                    pass

            if local_step >= self.sample_size and all(terminated):
                break

    def optimize_model(self):
        self.compute_td_gae()

        for _ in range(self.num_epoches):
            transitions = self.replay_buffer.sample(self.batch_size)
            batch = list(zip(*transitions))

            td = batch[3]

            self.update_value()
            self.update_policy()

    def compute_td_gae(self):
        for epi in self.total_episodes:
            len_epi = len(epi)
            states, actions, rewards, values, logprobs = zip(*epi)

            values = np.concatenate((values, np.zeros(1)), axis=0)
            advantages = np.zeros(len_epi)
            ad_t = 0

            for i in reversed(range(len_epi)):
                delta = rewards[i] + values[i + 1] * self.gamma - values[i]
                ad_t = delta + self.gamma * self.td_lambda * ad_t
                advantages[i] = ad_t

            TD = values[:len_epi] + advantages
            for i in range(len_epi):
                self.replay_buffer.append(
                    (states[i], actions[i], logprobs[i], TD[i], advantages[i]))

    def update_value(self):
        pass

    def update_policy(self):
        pass

    def evaluate(self):
        self.num_eval += 1
        total_reward = 0
        total_step = 0
        self.env.Reset(False, 0)

        state = self.env.GetState(0)

        for t in count():
            action = np.asarray(self.actor.mean().eval(
                feed_dict={self.state: [state]})).flatten()
            state, reward, is_done, info = self.env.step(action)
            if is_done:
                break
            else:
                total_step += 1
                total_reward += reward

        # print('noise: {:.3f}'.format(self.actor.stddev().eval(feed_dict={ppo.state: [state]})))
        print('noise: ',
              self.actor.stddev().eval(feed_dict={self.state: [state]}))
        if total_step > 0:
            print('Epi reward : {:.2f}, Step reward : {:.2f} Total step : {}'.
                  format(total_reward, total_reward / total_step, total_step))
        else:
            print('bad')
        return total_reward, total_step
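
In the class above, update_value and update_policy are left as stubs and optimize_model never feeds the sampled batch to them. The sketch below is one plausible way to complete them using only the ops already built in __init__ (syn_old_pi, train_critic, train_policy); it changes the stub signatures to take the batch explicitly and assumes the (state, action, logprob, TD, advantage) tuple order used by compute_td_gae, so treat it as an illustration rather than the original implementation.

import numpy as np

# Illustrative replacements for the stub methods of the class above.
def optimize_model(self):
    self.compute_td_gae()
    self.sess.run(self.syn_old_pi)  # copy current actor weights into actor_old

    for _ in range(self.num_epoches):
        transitions = self.replay_buffer.sample(self.batch_size)
        states, actions, logprobs, tds, advantages = map(np.asarray, zip(*transitions))
        self.update_value(states, tds)
        self.update_policy(states, actions, advantages)

def update_value(self, states, tds):
    self.sess.run(self.train_critic,
                  feed_dict={self.state: states, self.y: tds.reshape(-1, 1)})

def update_policy(self, states, actions, advantages):
    self.sess.run(self.train_policy,
                  feed_dict={self.state: states,
                             self.action: actions,
                             self.advantages: advantages.reshape(-1, 1)})
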
Example #6
class PPO(object):
    def __init__(self, env_name, num_slaves=1, eval_print=True, eval_log=True, visualize_only=False):
        np.random.seed(seed=int(time.time()))
        self.env_name = env_name
        self.env = HpDartEnv(env_name)
        self.num_slaves = num_slaves
        self.num_state = self.env.observation_space.shape[0]
        self.num_action = self.env.action_space.shape[0]
        self.num_epochs = 10
        self.num_evaluation = 0
        self.num_training = 0

        self.gamma = 0.99
        self.lb = 0.95
        self.clip_ratio = 0.2

        self.buffer_size = 2048
        self.batch_size = 128
        self.replay_buffer = ReplayBuffer(10000)

        self.total_episodes = []

        self.model = Model(self.num_state, self.num_action).float()
        self.optimizer = optim.Adam(self.model.parameters(), lr=7E-4)
        self.w_entropy = 0.0

        self.saved = False
        self.save_directory = self.env_name + '_' + 'model_'+time.strftime("%m%d%H%M") + '/'
        if not self.saved and not os.path.exists(self.save_directory) and not visualize_only:
            os.makedirs(self.save_directory)
            self.saved = True

        self.log_file = None
        if not visualize_only:
            self.log_file = open(self.save_directory + 'log.txt', 'w')
        self.eval_print = eval_print
        self.eval_log = eval_log

        self.result_receiver = []  # type: list[Connection]
        self.action_sender = []  # type: list[Connection]
        self.reset_sender = []  # type: list[Connection]
        self.envs = []  # type: list[Process]

        self.init_envs()

    def init_envs(self):
        for slave_idx in range(self.num_slaves):
            r_s, r_r = Pipe()
            a_s, a_r = Pipe()
            reset_s, reset_r = Pipe()
            p = Process(target=worker, args=(self.env_name, slave_idx, r_s, a_r, reset_r))
            self.result_receiver.append(r_r)
            self.action_sender.append(a_s)
            self.reset_sender.append(reset_s)
            self.envs.append(p)
            p.start()

    def envs_get_states(self, terminated):
        states = []
        for recv_idx in range(len(self.result_receiver)):
            if terminated[recv_idx]:
                states.append([0.] * self.num_state)
            else:
                states.append(self.result_receiver[recv_idx].recv())
        return states

    def envs_send_actions(self, actions, terminated):
        for i in range(len(self.action_sender)):
            if not terminated[i]:
                self.action_sender[i].send(actions[i])

    def envs_get_status(self, terminated):
        status = []
        for recv_idx in range(len(self.result_receiver)):
            if terminated[recv_idx]:
                status.append((0., True))
            else:
                status.append(self.result_receiver[recv_idx].recv())
        return zip(*status)

    def envs_resets(self):
        for i in range(len(self.reset_sender)):
            self.reset_sender[i].send(True)

    def envs_reset(self, i):
        self.reset_sender[i].send(True)

    def SaveModel(self):
        torch.save(self.model.state_dict(), self.save_directory + str(self.num_evaluation) + '.pt')

    def LoadModel(self, model_path):
        self.model.load_state_dict(torch.load(model_path))

    def ComputeTDandGAE(self):
        self.replay_buffer.clear()
        for epi in self.total_episodes:
            data = epi.get_data()
            size = len(data)
            states, actions, rewards, values, logprobs = zip(*data)

            values = np.concatenate((values, np.zeros(1)), axis=0)
            advantages = np.zeros(size)
            ad_t = 0

            for i in reversed(range(len(data))):
                delta = rewards[i] + values[i + 1] * self.gamma - values[i]
                ad_t = delta + self.gamma * self.lb * ad_t
                advantages[i] = ad_t

            TD = values[:size] + advantages
            for i in range(size):
                self.replay_buffer.push(states[i], actions[i], logprobs[i], TD[i], advantages[i])

    def GenerateTransitions(self):
        del self.total_episodes[:]
        episodes = [None] * self.num_slaves
        for j in range(self.num_slaves):
            episodes[j] = EpisodeBuffer()

        self.envs_resets()

        local_step = 0
        terminated = [False] * self.num_slaves
        # print('Generate Transtions...')

        percent = 0
        while True:
            # update states
            states = self.envs_get_states(terminated)

            # print(local_step)

            # new_percent = local_step*10//self.buffer_size
            # if (new_percent == percent) is not True:
            #     percent = new_percent
            #     print('{}0%'.format(percent))
            a_dist, v = self.model(torch.tensor(states).float())
            actions = a_dist.sample().detach().numpy()
            logprobs = a_dist.log_prob(torch.tensor(actions).float()).detach().numpy().reshape(-1)
            values = v.detach().numpy().reshape(-1)

            self.envs_send_actions(actions, terminated)
            rewards, is_done = self.envs_get_status(terminated)

            for j in range(self.num_slaves):
                if terminated[j]:
                    continue

                nan_occur = False
                if np.any(np.isnan(states[j])) or np.any(np.isnan(actions[j])):
                    nan_occur = True
                else:
                    episodes[j].push(states[j], actions[j], rewards[j], values[j], logprobs[j])
                    local_step += 1

                # if episode is terminated
                if is_done[j] or nan_occur:
                    if not is_done[j] and nan_occur:
                        print('NaN in state or action; discarding the episode')
                    # push episodes
                    self.total_episodes.append(episodes[j])

                    # if data limit is exceeded, stop simulations
                    if local_step < self.buffer_size:
                        episodes[j] = EpisodeBuffer()
                        self.envs_reset(j)
                    else:
                        terminated[j] = True

            if local_step >= self.buffer_size and all(terminated):
                break

    # print('Done!')
    def OptimizeModel(self):
        # print('Optimize Model...')
        self.ComputeTDandGAE()
        all_transitions = np.array(self.replay_buffer.buffer)

        for _ in range(self.num_epochs):
            np.random.shuffle(all_transitions)
            for i in range(len(all_transitions) // self.batch_size):
                transitions = all_transitions[i * self.batch_size:(i + 1) * self.batch_size]
                batch = Transition(*zip(*transitions))

                stack_s = np.vstack(batch.s).astype(np.float32)
                stack_a = np.vstack(batch.a).astype(np.float32)
                stack_lp = np.vstack(batch.logprob).astype(np.float32)
                stack_td = np.vstack(batch.TD).astype(np.float32)
                stack_gae = np.vstack(batch.GAE).astype(np.float32)

                a_dist, v = self.model(torch.tensor(stack_s).float())
                '''Critic Loss'''
                loss_critic = ((v - torch.tensor(stack_td).float()).pow(2)).mean()

                '''Actor Loss'''
                ratio = torch.exp(a_dist.log_prob(torch.tensor(stack_a).float()) - torch.tensor(stack_lp).float())
                stack_gae = (stack_gae - stack_gae.mean()) / (stack_gae.std() + 1E-5)
                surrogate1 = ratio * torch.tensor(stack_gae).float()
                surrogate2 = torch.clamp(ratio, min=1.0 - self.clip_ratio, max=1.0 + self.clip_ratio) * torch.tensor(stack_gae).float()
                loss_actor = - torch.min(surrogate1, surrogate2).mean()

                '''Entropy Loss'''
                loss_entropy = - self.w_entropy * a_dist.entropy().mean()

                loss = loss_critic + loss_actor + loss_entropy
                # loss = loss_critic + loss_actor
                self.optimizer.zero_grad()
                loss.backward(retain_graph=True)
                for param in self.model.parameters():
                    param.grad.data.clamp_(-0.5, 0.5)
                self.optimizer.step()

    # print('Done!')
    def Train(self):
        self.GenerateTransitions()
        self.OptimizeModel()
        self.num_training += 1

    def Evaluate(self):
        self.num_evaluation += 1
        total_reward = 0
        total_step = 0
        self.env.Reset(False, 0)
        states = self.env.GetStates()
        for j in range(len(states)):
            if np.any(np.isnan(states[j])):
                self.print("state warning!!!!!!!! start")

        for t in count():
            action_dist, _ = self.model(torch.tensor(states).float())
            actions = action_dist.loc.detach().numpy()
            for j in range(len(actions)):
                if np.any(np.isnan(actions[j])):
                    self.print("action warning!!!!!!!!" + str(t))

            self.env.Steps(actions)

            for j in range(len(states)):
                if np.any(np.isnan(states[j])):
                    self.print("state warning!!!!!!!!"+str(t))

            if not self.env.IsTerminalState(0):
                total_step += 1
                total_reward += self.env.GetReward(0)
            states = self.env.GetStates()
            if all(self.env.IsTerminalStates()):
                break
        self.print('noise : {:.3f}'.format(self.model.log_std.exp().mean()))
        if total_step != 0:
            self.print('Epi reward : {:.2f}, Step reward : {:.2f} Total step : {}'
                  .format(total_reward, total_reward / total_step, total_step))
        else:
            self.print('bad..')
        return total_reward, total_step

    def print(self, s):
        if self.eval_print:
            print(s)
        if self.eval_log:
            self.log_file.write(s+"\n")
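
Transition, EpisodeBuffer, and ReplayBuffer are referenced but not defined in these examples. The definitions below are an assumption, written only to be consistent with how Example #6 uses them (push/get_data, push/clear plus direct access to .buffer, and the Transition(*zip(*transitions)) unpacking with fields s, a, logprob, TD, GAE); the original implementations may differ.

from collections import deque, namedtuple

# Field names must match the attribute access in OptimizeModel (batch.s, batch.a, ...).
Transition = namedtuple('Transition', ('s', 'a', 'logprob', 'TD', 'GAE'))


class EpisodeBuffer(object):
    """Per-episode store of (state, action, reward, value, logprob) tuples."""
    def __init__(self):
        self.data = []

    def push(self, state, action, reward, value, logprob):
        self.data.append((state, action, reward, value, logprob))

    def get_data(self):
        return self.data


class ReplayBuffer(object):
    """Flat store of (s, a, logprob, TD, GAE) tuples; OptimizeModel reads .buffer directly."""
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def push(self, *args):
        self.buffer.append(tuple(args))

    def clear(self):
        self.buffer.clear()
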
Example #7
def worker(env_name, proc_num, result_sender, action_receiver, reset_receiver):
    """

    :type env_name: str
    :type proc_num: int
    :type result_sender: Connection
    :type action_receiver: Connection
    :return:
    """

    env = HpDartEnv(env_name)
    new_start = reset_receiver.recv()
    if new_start:
        env.reset()
        new_start = False
    local_step = 0
    while True:
        # print(proc_num, local_step)
        # print(proc_num, 'state_send')
        result_sender.send(env.state())
        # print(proc_num, 'action_recv')
        action = action_receiver.recv()
        env.step(action)
        local_step += 1
        reward, is_done = env.reward(), env.is_done()
        # print(proc_num, is_done, local_step, env.world.time())
        # print(proc_num, 'reward_send')
        result_sender.send((reward, is_done))
        if is_done:
            # print(proc_num, 'reset_recv')
            new_start = reset_receiver.recv()
        if new_start:
            env.reset()
            new_start = False