Example #1
def global_model_eval(global_model, global_count):
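    # Evaluation process: periodically copy the shared global actor/critic
    # into a local DDPG model and roll out one deterministic episode, until
    # the shared step counter reaches 1,000,000 steps.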
    temp_model = DDPG(obs_dim=obs_dim,
                      act_dim=act_dim,
                      critic_dist_info=critic_dist_info)
    env = NormalizeAction(gym.make(args.env).env)
    env._max_episode_steps = 500

    while True:
        counter = to_numpy(global_count)[0]
        if counter >= 1000000:
            break

        temp_model.actor.load_state_dict(global_model.actor.state_dict())
        temp_model.critic.load_state_dict(global_model.critic.state_dict())
        temp_model.actor.eval()
        global global_returns
        state = env.reset()
        curr_return = 0
        step_count = 0
        while True:
            action = temp_model.actor(to_tensor(state.reshape((1, -1))))
            next_state, reward, done, _ = env.step(
                to_numpy(action).reshape(-1))
            curr_return += reward
            step_count += 1
            # print("Step count: ", step_count)
            if done or step_count > args.max_steps:
                break
            else:
                state = next_state
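        # Track an exponential moving average of the return (0.95 old / 0.05 new)
        # alongside the raw episode return.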
        global_returns.append(
            (counter, 0.95 * global_returns[-1][1] + 0.05 * curr_return,
             curr_return))
        print("Global Steps: ", counter, "Global return: ",
              global_returns[-1][1], "Current return: ", curr_return)

        time.sleep(10)
Example #2
class Worker(object):
    def __init__(self, name, optimizer_global_actor, optimizer_global_critic):
        self.env = NormalizeAction(gym.make(args.env).env)
        self.env._max_episode_steps = args.max_steps
        self.name = name

        self.ddpg = DDPG(obs_dim=obs_dim, act_dim=act_dim, env=self.env,
                         memory_size=args.rmsize, batch_size=args.bsize,
                         tau=args.tau, gamma=args.gamma, n_steps=args.n_steps)
        self.ddpg.assign_global_optimizer(optimizer_global_actor,
                                          optimizer_global_critic)
        print('Initialized worker :', self.name)

    def warmup(self):
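        # Fill the replay buffer with transitions from uniformly random
        # actions before learning starts.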
        n_steps = 0
        self.ddpg.actor.eval()
        state = self.env.reset()
        for n_steps in range(args.warmup):
            action = np.random.uniform(-1.0, 1.0, size=act_dim)
            next_state, reward, done, _ = self.env.step(action)
            self.ddpg.replayBuffer.append(state, action, reward, done)

            if done:
                state = self.env.reset()
            else:
                state = next_state

    def work(self, global_ddpg):
        avg_reward = 0.
        n_steps = 0
        #self.warmup()

        self.ddpg.sync_local_global(global_ddpg)
        self.ddpg.hard_update()

        # Logging variables
        self.start_time = datetime.datetime.utcnow()
        self.train_logs = {}
        self.train_logs['avg_reward'] = []
        self.train_logs['total_reward'] = []
        self.train_logs['time'] = []
        self.train_logs['x_val'] = []
        self.train_logs['info_summary'] = "DDPG"
        self.train_logs['x'] = 'steps'
        step_counter = 0

        for i in range(args.n_eps):
            state = self.env.reset()
            total_reward = 0.

            episode_states = []
            episode_rewards = []
            episode_actions = []

            for j in range(args.max_steps):
                self.ddpg.actor.eval()

                state = state.reshape(1, -1)
                noise = self.ddpg.noise.sample()
                action = np.clip(
                    to_numpy(self.ddpg.actor(to_tensor(state))).reshape(-1) + noise,
                    -1.0, 1.0)
                # action = to_numpy(self.ddpg.actor(to_tensor(state))).reshape(-1, ) + noise
                next_state, reward, done, _ = self.env.step(action)
                total_reward += reward

                #### n-steps buffer
                episode_states.append(state)
                episode_actions.append(action)
                episode_rewards.append(reward)

                if j >= args.n_steps - 1:
                    cum_reward = 0.
                    exp_gamma = 1
                    for k in range(-args.n_steps, 0):
                        cum_reward += exp_gamma * episode_rewards[k]
                        exp_gamma *= args.gamma
                    self.ddpg.replayBuffer.add(
                        episode_states[-args.n_steps].reshape(-1),
                        episode_actions[-args.n_steps], cum_reward, next_state,
                        done)
                    # self.ddpg.replayBuffer.add_experience(state.reshape(-1), action, reward, next_state, done)
                    #self.ddpg.replayBuffer.append(state.reshape(-1), action, reward, done)

                self.ddpg.actor.train()
                self.ddpg.train(global_ddpg)
                step_counter += 1
                n_steps += 1

                if done:
                    break

                state = next_state
                # print("Episode ", i, "\t Step count: ", n_steps)

            self.ddpg.noise.reset()
            avg_reward = 0.95 * avg_reward + 0.05 * total_reward
            if i % 1 == 0:
                print('Episode ', i, '\tWorker :', self.name, '\tAvg Reward :',
                      avg_reward, '\tTotal reward :', total_reward,
                      '\tSteps :', n_steps)
                self.train_logs['avg_reward'].append(avg_reward)
                self.train_logs['total_reward'].append(total_reward)
                self.train_logs['time'].append(
                    (datetime.datetime.utcnow() -
                     self.start_time).total_seconds() / 60)
                self.train_logs['x_val'].append(step_counter)
                with open(args.logfile, 'wb') as fHandle:
                    pickle.dump(self.train_logs,
                                fHandle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                with open(args.logfile_latest, 'wb') as fHandle:
                    pickle.dump(self.train_logs,
                                fHandle,
                                protocol=pickle.HIGHEST_PROTOCOL)
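The "#### n-steps buffer" block above accumulates a discounted sum of the last n rewards before storing a single n-step transition (the reward is paired with the state/action from n_steps steps ago and the current next_state). A minimal, self-contained sketch of that accumulation; the helper name n_step_return and the sample numbers are illustrative, not taken from the repository:

def n_step_return(rewards, gamma, n_steps):
    # Discounted sum of the last n_steps rewards:
    # rewards[-n] + gamma * rewards[-n+1] + ... + gamma**(n-1) * rewards[-1]
    cum_reward = 0.0
    exp_gamma = 1.0
    for k in range(-n_steps, 0):
        cum_reward += exp_gamma * rewards[k]
        exp_gamma *= gamma
    return cum_reward

# Example: last three rewards of an episode with gamma = 0.99
print(n_step_return([1.0, 0.5, 0.25, 2.0], gamma=0.99, n_steps=3))
# 0.5 + 0.99 * 0.25 + 0.99**2 * 2.0 = 2.7077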
Example #3
class Worker(object):
    def __init__(self, name, optimizer_global_actor, optimizer_global_critic):
        self.env = NormalizeAction(gym.make(args.env).env)
        self.env._max_episode_steps = args.max_steps
        self.name = name
        self.ddpg = DDPG(obs_dim=obs_dim, act_dim=act_dim, env=self.env,
                         memory_size=args.rmsize, batch_size=args.bsize,
                         tau=args.tau, critic_dist_info=critic_dist_info,
                         prioritized_replay=args.p_replay, gamma=args.gamma,
                         n_steps=args.n_steps)
        self.ddpg.assign_global_optimizer(optimizer_global_actor,
                                          optimizer_global_critic)
        print('Initialized worker :', self.name)

    # warmup function to fill replay buffer initially
    def warmup(self):

        self.ddpg.actor.eval()
        for i in range(5000 // args.max_steps):
            addExperienceToBuffer(self.ddpg,
                                  self.ddpg.replayBuffer,
                                  self.env,
                                  her=args.her,
                                  her_ratio=0.8)
        return
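        # NOTE: the early return above makes the loop below unreachable; it
        # looks like an older random-action warmup path left in place.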

        counter = 0
        state = self.env.reset()
        episode_states = []
        episode_rewards = []
        episode_actions = []
        while counter < args.warmup:

            action = to_numpy(self.ddpg.actor(to_tensor(state.reshape(
                -1))))  #np.random.uniform(-1.0, 1.0, size=act_dim)
            next_state, reward, done, _ = self.env.step(
                np.clip(action + self.ddpg.noise.sample(), -1, 1))

            #### n-steps buffer
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)

            if len(episode_states) >= args.n_steps:
                cum_reward = 0.
                exp_gamma = 1
                for k in range(-args.n_steps, 0):
                    cum_reward += exp_gamma * episode_rewards[k]
                    exp_gamma *= args.gamma
                self.ddpg.replayBuffer.add(
                    episode_states[-args.n_steps].reshape(-1),
                    episode_actions[-args.n_steps], cum_reward, next_state,
                    done)
            if done:
                episode_states = []
                episode_rewards = []
                episode_actions = []
                state = self.env.reset()
            else:
                state = next_state
            counter += 1

    def work(self, global_ddpg, global_count):
        avg_reward_train = 0.
        avg_reward_test = 0.
        self.ddpg.sync_local_global(global_ddpg)
        self.ddpg.hard_update()
        self.start_time = datetime.datetime.utcnow()

        self.warmup()

        # Logging variables
        self.train_logs = {}
        self.train_logs['avg_reward_train'] = []
        self.train_logs['avg_reward_test'] = []
        self.train_logs['total_reward_train'] = []
        self.train_logs['total_reward_test'] = []
        self.train_logs['time'] = []
        self.train_logs['x_val'] = []
        self.train_logs['info_summary'] = "Distributional DDPG_" + str(args.n_steps) + 'N'
        if args.p_replay:
            self.train_logs['info_summary'] += ' + PER'
        self.train_logs['x'] = 'steps'
        step_counter = 0

        for i in range(args.n_eps):
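            # HER-style training schedule: each epoch runs 50 cycles of
            # (collect 16 episodes into the buffer -> 40 optimization steps ->
            # evaluate the policy on 10 test rollouts).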
            for cycle in range(50):
                for episode_count in range(16):
                    addExperienceToBuffer(self.ddpg,
                                          self.ddpg.replayBuffer,
                                          self.env,
                                          her=args.her,
                                          her_ratio=0.8)
                for j in range(40):
                    self.ddpg.actor.train()
                    self.ddpg.train(global_ddpg)
                    step_counter += 1
                    global_count += 1

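                # Evaluate the current policy on nTrials deterministic
                # rollouts and track the success rate.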
                success = 0
                success_steps = []
                nTrials = 10
                for k in range(nTrials):
                    total_reward_test = 0.
                    episode_rewards = []
                    episode_states = []
                    episode_success = []
                    state = self.env.reset()
                    cc = 0
                    for j in range(args.max_steps):
                        cc += 1
                        self.ddpg.actor.eval()
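                        # Goal-conditioned input: flatten the dict observation
                        # into [observation, desired_goal] before the actor.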
                        state = np.concatenate(
                            (state['observation'], state['desired_goal']))
                        state = state.reshape(1, -1)
                        action = to_numpy(self.ddpg.actor(
                            to_tensor(state))).reshape(-1)
                        action = np.clip(action, -1.0, 1.0)
                        next_state, reward, done, info = self.env.step(action)
                        done = bool(info['is_success'])
                        total_reward_test += reward
                        episode_rewards.append((j, reward))
                        episode_states.append((j, state))
                        episode_success.append((j, info['is_success']))
                        if done:
                            success += 1
                            success_steps.append(j)
                            break
                        else:
                            state = next_state
                    avg_reward_test = 0.95 * avg_reward_test + 0.05 * total_reward_test
                success_rate = float(success) / nTrials

                print("Epoch: ", i, "\t Cycle: ", cycle, "\t ",
                      '\tAvg Reward Test:', avg_reward_test,
                      '\tTest success steps :', success_steps,
                      '\t Success Rate', success_rate, '\tSteps :',
                      step_counter)
                # writer.add_scalar('train_reward', total_reward_train, n_steps)
                writer.add_scalar('avg_test_reward', avg_reward_test,
                                  step_counter)
                writer.add_scalar('success_rate', success_rate, step_counter)

                # self.ddpg.noise.reset()
                torch.save(self.ddpg.actor.state_dict(), path + '/actor.pth')
                torch.save(self.ddpg.critic.state_dict(), path + '/critic.pth')
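Example #3 delegates data collection to addExperienceToBuffer(..., her=args.her, her_ratio=0.8), whose implementation is not shown. A minimal, self-contained sketch of the hindsight-relabelling idea such a helper typically implements; the function name, tuple layout, success threshold, and 'future' goal-sampling strategy are illustrative assumptions, not the repository's API:

import numpy as np

def relabel_with_her(episode, compute_reward, her_ratio=0.8):
    # episode: list of (obs, achieved_goal, action, next_obs, next_achieved_goal)
    relabelled = []
    for t, (obs, ag, action, next_obs, next_ag) in enumerate(episode):
        if np.random.rand() >= her_ratio:
            continue  # leave this transition with its original goal
        # 'future' strategy: pretend the desired goal was an achieved goal
        # from a later step of the same episode, then recompute the reward.
        future_t = np.random.randint(t, len(episode))
        new_goal = episode[future_t][4]
        reward = compute_reward(next_ag, new_goal)
        relabelled.append((np.concatenate([obs, new_goal]), action, reward,
                           np.concatenate([next_obs, new_goal])))
    return relabelled

# Sparse reward as in the gym robotics tasks: 0 on success, -1 otherwise.
compute_reward = lambda achieved, goal: 0.0 if np.allclose(achieved, goal, atol=0.05) else -1.0

# Tiny usage example with 1-D observations and goals.
episode = [(np.array([0.0]), np.array([0.1]), np.array([1.0]),
            np.array([0.1]), np.array([0.2])),
           (np.array([0.1]), np.array([0.2]), np.array([1.0]),
            np.array([0.2]), np.array([0.3]))]
print(relabel_with_her(episode, compute_reward))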