Example #1
    def collect_samples(self, min_batch_size):
        t_start = time.time()
        if use_gpu:
            self.policy = self.policy.cpu()
        thread_batch_size = int(math.floor(min_batch_size / self.num_threads))
        queue = multiprocessing.Queue()
        memory = Memory()
        workers = []

        for i in range(self.num_threads - 1):
            workers.append(
                Worker(queue, self.env_list[i + 1], self.policy,
                       self.custom_reward, self.tensor, False,
                       self.running_state, thread_batch_size))
        for worker in workers:
            worker.start()

        log = collect_samples(self.env_list[0], memory, self.policy,
                              self.custom_reward, self.tensor, self.render,
                              self.running_state, True, thread_batch_size)

        worker_logs = []
        for _ in workers:
            worker_memory, worker_log = queue.get()
            memory.append(worker_memory)
            worker_logs.append(worker_log)
        batch = memory.sample()
        if self.num_threads > 1:
            log_list = [log] + worker_logs
            log = merge_log(log_list)
        if use_gpu:
            self.policy = self.policy.cuda()
        t_end = time.time()
        log['sample_time'] = t_end - t_start
        return batch, log
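
A minimal sketch of the merge_log helper assumed above, aggregating the per-worker logs that collect_samples produces (an illustration of the expected keys, not the library's exact implementation):

def merge_log(log_list):
    # Sum the counters across workers and recompute the derived statistics.
    log = dict()
    log['total_reward'] = sum(x['total_reward'] for x in log_list)
    log['num_episodes'] = sum(x['num_episodes'] for x in log_list)
    log['num_steps'] = sum(x['num_steps'] for x in log_list)
    log['avg_reward'] = log['total_reward'] / log['num_episodes']
    log['max_reward'] = max(x['max_reward'] for x in log_list)
    log['min_reward'] = min(x['min_reward'] for x in log_list)
    return log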
Example #2
def collect_samples(pid, queue, env, policy, stochastic, render, running_state, min_batch_size):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if not stochastic:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
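
Most of these samplers lean on a small trajectory buffer named Memory with push/append/sample. A minimal sketch under that assumption (ignoring the extra fields some later examples pass to the constructor or to push):

import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))

class Memory:
    def __init__(self):
        self.memory = []

    def push(self, *args):
        # Store one transition per environment step.
        self.memory.append(Transition(*args))

    def append(self, other):
        # Merge transitions collected by another worker.
        self.memory += other.memory

    def sample(self, batch_size=None):
        # Return the whole buffer column-wise, or a random mini-batch.
        if batch_size is None:
            return Transition(*zip(*self.memory))
        return Transition(*zip(*random.sample(self.memory, batch_size)))

    def __len__(self):
        return len(self.memory)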
Example #3
    def run(self):
        memory = Memory()
        log = collect_samples(self.env, memory, self.policy,
                              self.custom_reward, self.tensor, False,
                              self.running_state, False, self.min_batch_size)
        self.queue.put([memory, log])
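
Example #3 is the run() method of a worker process. A hedged sketch of the class around it, with the constructor signature inferred from the Worker(...) call in Example #1 and reusing the Memory and collect_samples definitions from this page:

import multiprocessing

class Worker(multiprocessing.Process):
    # Hypothetical wrapper: it stores the sampling arguments and reports the
    # collected memory and log back through a shared queue.
    def __init__(self, queue, env, policy, custom_reward, tensor,
                 render, running_state, min_batch_size):
        super().__init__()
        self.queue = queue
        self.env = env
        self.policy = policy
        self.custom_reward = custom_reward
        self.tensor = tensor
        self.render = render
        self.running_state = running_state
        self.min_batch_size = min_batch_size

    def run(self):
        memory = Memory()
        log = collect_samples(self.env, memory, self.policy,
                              self.custom_reward, self.tensor, False,
                              self.running_state, False, self.min_batch_size)
        self.queue.put([memory, log])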
Example #4
    def train_gail(self, expert):
        '''Train Info-GAIL.'''
        args, dtype = self.args, self.dtype
        results = {
            'average_reward': [],
            'episode_reward': [],
            'true_traj': {},
            'pred_traj': {}
        }
        self.train_step_count, self.gail_step_count = 0, 0

        for ep_idx in range(args.num_epochs):
            memory = Memory()

            num_steps = 0
            reward_batch, true_reward_batch = [], []
            expert_true_reward_batch = []
            true_traj_curr_episode, gen_traj_curr_episode = [], []

            while num_steps < args.batch_size:
                traj_expert = expert.sample(size=1)
                state_expert, action_expert, _, _ = traj_expert

                # Expert state and actions
                state_expert = state_expert[0]
                action_expert = action_expert[0]
                expert_episode_len = len(state_expert)

                # Sample start state or should we just choose the start state
                # from the expert trajectory sampled above.
                # curr_state_obj = self.sample_start_state()
                curr_state_obj = State(state_expert[0], self.obstacles)
                curr_state_feat = self.get_state_features(
                    curr_state_obj, self.args.use_state_features)

                # Add history to state
                if args.history_size > 1:
                    curr_state = -1 * np.ones(
                        (args.history_size * curr_state_feat.shape[0]),
                        dtype=np.float32)
                    curr_state[(args.history_size-1) \
                            * curr_state_feat.shape[0]:] = curr_state_feat
                else:
                    curr_state = curr_state_feat

                # TODO: Make this a separate function. Can be parallelized.
                ep_reward, ep_true_reward, expert_true_reward = 0, 0, 0
                true_traj, gen_traj = [], []
                gen_traj_dict = {
                    'features': [],
                    'actions': [],
                    'c': [],
                    'mask': []
                }
                disc_reward, posterior_reward = 0.0, 0.0
                # Use a hard-coded list for memory to gather experience since we
                # need to mutate it before finally creating a memory object.

                c_sampled = np.zeros((self.num_goals), dtype=np.float32)
                c_sampled[np.random.randint(0, self.num_goals)] = 1.0
                c_sampled_tensor = torch.zeros((1)).type(torch.LongTensor)
                c_sampled_tensor[0] = int(np.argmax(c_sampled))
                if self.args.cuda:
                    c_sampled_tensor = torch.cuda.LongTensor(c_sampled_tensor)

                memory_list = []
                for t in range(expert_episode_len):
                    action = self.select_action(
                        np.concatenate((curr_state, c_sampled)))
                    action_numpy = action.data.cpu().numpy()

                    # Save generated and true trajectories
                    true_traj.append((state_expert[t], action_expert[t]))
                    gen_traj.append((curr_state_obj.coordinates, action_numpy))
                    gen_traj_dict['features'].append(
                        self.get_state_features(curr_state_obj,
                                                self.args.use_state_features))
                    gen_traj_dict['actions'].append(action_numpy)
                    gen_traj_dict['c'].append(c_sampled)

                    action = epsilon_greedy_linear_decay(action_numpy,
                                                         args.num_epochs * 0.5,
                                                         ep_idx,
                                                         self.action_size,
                                                         low=0.05,
                                                         high=0.3)

                    # Get the discriminator reward
                    state_t = Variable(
                        torch.from_numpy(curr_state).unsqueeze(0)).type(dtype)
                    action_t = Variable(
                        torch.from_numpy(oned_to_onehot(
                            action, self.action_size)).unsqueeze(0)).type(dtype)
                    disc_reward_t = float(
                        self.reward_net(torch.cat((state_t, action_t),
                                                  1)).data.cpu().numpy()[0, 0])

                    if args.use_log_rewards and disc_reward_t < 1e-6:
                        disc_reward_t += 1e-6

                    disc_reward_t = -math.log(disc_reward_t) \
                            if args.use_log_rewards else -disc_reward_t
                    disc_reward += disc_reward_t

                    # Predict c given (x_t)
                    predicted_posterior = self.posterior_net(
                        Variable(torch.from_numpy(curr_state).unsqueeze(
                            0)).type(dtype))
                    posterior_reward_t = self.criterion_posterior(
                        predicted_posterior,
                        Variable(c_sampled_tensor)).data.cpu().numpy()[0]

                    posterior_reward += (self.args.lambda_posterior *
                                         posterior_reward_t)

                    # Update Rewards
                    ep_reward += (disc_reward_t + posterior_reward_t)
                    true_goal_state = [
                        int(x) for x in state_expert[-1].tolist()
                    ]
                    if self.args.flag_true_reward == 'grid_reward':
                        ep_true_reward += self.true_reward.reward_at_location(
                            curr_state_obj.coordinates,
                            goals=[true_goal_state])
                        expert_true_reward += self.true_reward.reward_at_location(
                            state_expert[t], goals=[true_goal_state])
                    elif self.args.flag_true_reward == 'action_reward':
                        ep_true_reward += self.true_reward.reward_at_location(
                            np.argmax(action_expert[t]), action)
                        expert_true_reward += self.true_reward.corret_action_reward
                    else:
                        raise ValueError("Incorrect true reward type")

                    # Update next state
                    next_state_obj = self.transition_func(
                        curr_state_obj, Action(action), 0)
                    next_state_feat = self.get_state_features(
                        next_state_obj, self.args.use_state_features)
                    #next_state = running_state(next_state)

                    mask = 0 if t == expert_episode_len - 1 else 1

                    # Push to memory
                    memory_list.append([
                        curr_state,
                        np.array([oned_to_onehot(action,
                                                 self.action_size)]), mask,
                        next_state_feat, disc_reward_t + posterior_reward_t,
                        c_sampled, c_sampled
                    ])

                    if args.render:
                        env.render()

                    if not mask:
                        break

                    curr_state_obj = next_state_obj
                    curr_state_feat = next_state_feat

                    if args.history_size > 1:
                        curr_state[:(args.history_size-1) \
                                * curr_state_feat.shape[0]] = \
                                curr_state[curr_state_feat.shape[0]:]
                        curr_state[(args.history_size-1) \
                                * curr_state_feat.shape[0]:] = curr_state_feat
                    else:
                        curr_state = curr_state_feat



                assert memory_list[-1][2] == 0, \
                        "Mask for final end state is not 0."
                for memory_t in memory_list:
                    memory.push(*memory_t)

                self.logger.summary_writer.add_scalars(
                    'gen_traj/gen_reward', {
                        'discriminator': disc_reward,
                        'posterior': posterior_reward,
                    }, self.train_step_count)

                num_steps += (t + 1)
                reward_batch.append(ep_reward)
                true_reward_batch.append(ep_true_reward)
                expert_true_reward_batch.append(expert_true_reward)
                results['episode_reward'].append(ep_reward)

                # Append trajectories
                true_traj_curr_episode.append(true_traj)
                gen_traj_curr_episode.append(gen_traj)

            results['average_reward'].append(np.mean(reward_batch))

            # Add to tensorboard
            self.logger.summary_writer.add_scalars(
                'gen_traj/reward', {
                    'average': np.mean(reward_batch),
                    'max': np.max(reward_batch),
                    'min': np.min(reward_batch)
                }, self.train_step_count)
            self.logger.summary_writer.add_scalars(
                'gen_traj/true_reward', {
                    'average': np.mean(true_reward_batch),
                    'max': np.max(true_reward_batch),
                    'min': np.min(true_reward_batch),
                    'expert_true': np.mean(expert_true_reward_batch)
                }, self.train_step_count)

            # Add predicted and generated trajectories to results
            if ep_idx % self.args.save_interval == 0:
                results['true_traj'][ep_idx] = copy.deepcopy(
                    true_traj_curr_episode)
                results['pred_traj'][ep_idx] = copy.deepcopy(
                    gen_traj_curr_episode)

            # Update parameters
            gen_batch = memory.sample()

            # We do not get the context variable from expert trajectories.
            # Hence we need to fill it in later.
            expert_batch = expert.sample(size=args.num_expert_trajs)

            self.update_params(gen_batch, expert_batch, ep_idx,
                               args.optim_epochs, args.optim_batch_size)

            self.train_step_count += 1

            if ep_idx > 0 and ep_idx % args.log_interval == 0:
                print('Episode [{}/{}]  Avg R: {:.2f}   Max R: {:.2f} \t' \
                      'True Avg {:.2f}   True Max R: {:.2f}   ' \
                      'Expert (Avg): {:.2f}'.format(
                          ep_idx, args.num_epochs, np.mean(reward_batch),
                          np.max(reward_batch), np.mean(true_reward_batch),
                          np.max(true_reward_batch),
                          np.mean(expert_true_reward_batch)))

            results_path = os.path.join(args.results_dir, 'results.pkl')
            with open(results_path, 'wb') as results_f:
                pickle.dump(results, results_f, protocol=2)
                # print("Did save results to {}".format(results_path))

            if ep_idx % args.save_interval == 0:
                checkpoint_filepath = self.model_checkpoint_filepath(ep_idx)
                torch.save(self.checkpoint_data_to_save(), checkpoint_filepath)
                print("Did save checkpoint: {}".format(checkpoint_filepath))
Example #5
def collect_samples(pid, queue, env, policy, custom_reward, mean_action,
                    render, running_state, min_batch_size):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        repeat = 0
        repeat_len = 0
        last_action = None
        ready_to_push = False
        reward_period = 0
        interval = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            # The learning-to-repeat policy only predicts a new action once
            # the repeat counter has decayed to 0.
            assert (repeat >= 0)
            if repeat <= 0:
                with torch.no_grad():
                    if mean_action:
                        action = policy(state_var)[0][0].numpy()
                    else:
                        # action, repeat = policy.select_action(state_var)[0].numpy()
                        action, repeat = policy.select_action(state_var)
                        action = action[0].numpy()
                        repeat = repeat[0].numpy()
                        # print(action)
                        # print(repeat)
                        # exit()
                action = int(
                    action) if policy.is_disc_action else action.astype(
                        np.float64)
                # action = action.tolist()
                last_action = action
                repeat = int(repeat)
                repeat_len = repeat
                ready_to_push = True
                reward_period = 0
                interval = 0

            next_state, reward, done, _ = env.step(last_action)
            reward_episode += reward
            # reward_period += reward
            reward_period += reward * (args.gamma**(repeat_len - repeat))
            interval += 1
            if repeat > 0:
                repeat -= 1
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            if ready_to_push or done:
                repeat = 0 if done else repeat
                memory.push(state, last_action, mask, next_state,
                            reward_period, repeat)
                ready_to_push = False
                interval = 0

            if render:
                env.render()
            if done:
                # print(reward_episode)
                # print(t)
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
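
The main twist over Example #2 is the period reward: while an action repeats, each step's reward is weighted by how far into the repeat window it falls, so a single pushed transition carries the discounted return of the whole window. A toy illustration (assuming args.gamma = 0.99):

gamma = 0.99
rewards_in_period = [1.0, 1.0, 1.0]   # env rewards while the chosen action repeats
reward_period = sum(r * gamma**k for k, r in enumerate(rewards_in_period))
print(reward_period)                  # 1.0 + 0.99 + 0.9801 = 2.9701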
Example #6
def collect_samples(pid, queue, env, policy, custom_reward,
                    mean_action, render, min_batch_size):
    try:
        if isinstance(env, str):
            if render:
                env = gym.make(env, render_mode='human')
            else:
                env = gym.make(env)

        torch.randn(pid)
        log = dict()
        memory = Memory()
        num_steps = 0
        total_reward = 0
        min_reward = 1e6
        max_reward = -1e6
        total_c_reward = 0
        min_c_reward = 1e6
        max_c_reward = -1e6
        num_episodes = 0


        while num_steps < min_batch_size:
            state = env.reset()
            reward_episode = 0

            for t in range(600):
                state_var = tensor(state).unsqueeze(0)
                with torch.no_grad():
                    if mean_action:
                        action = policy(state_var)[0][0].numpy()
                    else:
                        action = policy.select_action(state_var)[0].numpy()
                action = int(action) if policy.is_disc_action else action.astype(np.float64)
                next_state, reward, done, _ = env.step(action)
                reward_episode += reward

                if custom_reward is not None:
                    reward = custom_reward(state, action)
                    total_c_reward += reward
                    min_c_reward = min(min_c_reward, reward)
                    max_c_reward = max(max_c_reward, reward)

                mask = 0 if done else 1

                memory.push(state, action, mask, next_state, reward)

                if render:
                    env.render()
                if done:
                    break

                state = next_state

            # log stats
            num_steps += (t + 1)
            num_episodes += 1
            total_reward += reward_episode
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)

        log['num_steps'] = num_steps
        log['num_episodes'] = num_episodes
        log['total_reward'] = total_reward
        log['avg_reward'] = total_reward / num_episodes
        log['max_reward'] = max_reward
        log['min_reward'] = min_reward
        if custom_reward is not None:
            log['total_c_reward'] = total_c_reward
            log['avg_c_reward'] = total_c_reward / num_steps
            log['max_c_reward'] = max_c_reward
            log['min_c_reward'] = min_c_reward

        if queue is not None:
            queue.put([pid, memory, log])
        else:
            return memory, log
    except Exception as e:
        if queue is not None:
            queue.put([pid, memory, log])
        else:
            raise e
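
Because this variant builds the gym environment inside the worker when env is a string, only picklable arguments cross the process boundary. A hedged usage sketch (collect_parallel is a hypothetical name; it reuses collect_samples from Example #6 and the Memory.append from the sketch above, and the policy should live on the CPU so it can be pickled):

import multiprocessing

def collect_parallel(env_id, policy, custom_reward, min_batch_size, num_workers=4):
    queue = multiprocessing.Queue()
    thread_batch_size = min_batch_size // num_workers
    workers = [
        multiprocessing.Process(
            target=collect_samples,
            args=(pid, queue, env_id, policy, custom_reward,
                  False, False, thread_batch_size))
        for pid in range(1, num_workers)
    ]
    for w in workers:
        w.start()
    # The parent process collects its own share while the workers run.
    memory, log = collect_samples(0, None, env_id, policy, custom_reward,
                                  False, False, thread_batch_size)
    for _ in workers:
        _pid, worker_memory, _worker_log = queue.get()  # worker logs dropped for brevity
        memory.append(worker_memory)
    for w in workers:
        w.join()
    return memory, log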
Example #7
def collect_samples(pid,
                    queue,
                    env,
                    policy,
                    custom_reward,
                    mean_action,
                    render,
                    running_state,
                    min_batch_size,
                    num_agents=1):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = [0] * num_agents  #0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    # render=False

    while num_steps < min_batch_size:
        state = env.reset()
        # print(state)
        if running_state is not None:
            state = running_state(state)
        reward_episode = [0] * num_agents

        # env.render()
        for t in range(150):
            # time.sleep(1)
            #TODO: temporarily for a single agent we make the state
            #  artificially into a list.
            # state_var = [tensor(state).unsqueeze(0), tensor(state).unsqueeze(0)]
            state_var = [tensor(s).unsqueeze(0) for s in state]
            # print('STATE', len(state_var))
            # state_var = state_var[:-1]
            # print(len(state_var))
            # state_var = [tensor(state).unsqueeze(0)]
            # print(state_var)
            # state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    # print('mean', policy(state_var))
                    action = policy(state_var)[0][0].numpy()
                else:
                    # print('else', policy.select_action(state_var))
                    # print(state_var)
                    action = policy.select_action(state_var)
                    # print(action[0][0].numpy())
                    # print(action)
                    # action_var = torch.stack([a[0] for a in action], dim=1)

                    # action_var = torch.cat(action, dim=1)[0].numpy()

                    # action = [a[0] for a in action]
                    action = [a[0].numpy().tolist() for a in action]
                    # print(action_var)
                    # print(action)
                    # action = policy.select_action(state_var)[0].numpy()
            # TODO: this is added so that the prey is automatically controlled
            #  by arbitrary input.
            # action.append(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))
            # print('ACT', action)
            # print(action)
            action_var = action
            # print(action_var)
            # print(action)
            # action = int(action) if policy.is_disc_action else action.astype(np.float64)
            # print(action)
            # action_var = action
            # action_var = [int(a) for a in action_var] if policy.is_disc_action else [a.astype(np.float64) for a in action_var]
            # print(action_var)
            # print('aa', action)
            # print('av', action_var)
            # next_state, reward, done, _ = env.step(action)
            # TODO: while we use an environment that doesn't accept multi-agent
            #  action lists
            next_state, reward, done, _ = env.step(action_var)
            # print(reward)
            # reward = reward[:-1]
            # done = done[:-1]
            # reward_all = np.sum(reward)
            # for r in range(len(reward)):
            #     reward[r] = reward_all
            # print(reward)
            # reward = [reward, reward]
            # reward_episode += reward
            for r in range(len(reward_episode)):
                reward_episode[r] += reward[r]
            # print(reward_episode)
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            # mask = 0 if done else 1
            mask = [float(d) for d in done]
            # print(mask)

            #TODO while we use an environment that doesn't accept multi-agent
            #  action lists
            # print(state)
            # state = [s.tolist() for s in state]
            # print(state)
            memory.push(state, action, mask, next_state, reward)
            # memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
                time.sleep(0.1)
            if np.all(done):
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        for r in range(len(reward_episode)):
            total_reward[r] += reward_episode[r]
        # total_reward += reward_episode
        min_reward = min(min_reward, np.min(reward_episode))
        max_reward = max(max_reward, np.max(reward_episode))
        # min_reward = 0.0
        # max_reward = 0.0

    log['num_steps'] = num_steps
    log['avg_steps'] = num_steps / num_episodes
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = [t / num_episodes
                         for t in total_reward]  #total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
Example #8
def collect_samples(pid, queue, env, policy, custom_reward,
                    mean_action, render, running_state, min_batch_size, use_reparametrization=False):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0


    length_list = []
    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(250):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    # action = policy(state_var)[0][0].numpy()
                    action = policy.select_action_deterministic(state_var)[0].numpy()
                else:
                    action = policy.select_action_stochastic(state_var)[0].numpy()
            
            action = int(action) if policy.is_disc_action else action.astype(np.float64)

            next_state, reward, done, _ = env.step(np.clip(action*100,a_min=-100, a_max=100))
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            # if done or (t % 249 == 0 and t > 0):
            if done:
                break
        
            state = next_state
        length_list.append(t)
        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['length_mean'] = np.mean(length_list)
    log['length_min'] = np.min(length_list)
    log['length_max'] = np.max(length_list)
    log['length_std'] = np.std(length_list)


    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['avg_c_reward_per_episode'] = total_c_reward / num_episodes
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
Example #9
def collect_samples(pid,
                    queue,
                    env,
                    policy,
                    custom_reward,
                    mean_action,
                    tensor,
                    render,
                    running_state,
                    update_rs,
                    min_batch_size,
                    logger,
                    position_vector,
                    log_flag=False):
    torch.randn(pid, )
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        obs = env.reset()
        # TODO: make position_vector a globally configured parameter.
        CPG_controller = CPG_network(position_vector)

        obs1, obs2, rewards, dones, actions = [], [], [], [], []
        # TODO: obs2state converts the raw observation into the NN input.
        state = obs2state(obs)

        obs2.append(state.reshape((1, -1)))  # for storage
        obs1.append(obs.reshape((1, -1)))  # for storage
        if running_state is not None:
            state = running_state(state, update=update_rs)
        reward_episode = 0
        reward_period = 0

        for t in range(10000):
            state_var = Variable(tensor(state).unsqueeze(0), volatile=True)
            if t % 1 == 0:
                if mean_action:
                    action = policy(state_var)[0].data[0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()

                rl_action = int(
                    action) if policy.is_disc_action else action.astype(
                        np.float64)

            # if t%100 == 0:
            #     print('rl = ', rl_action)
            #rl_action = np.zeros(13)
            #rl_action = np.array([1,0])
            rl_action = np.clip(rl_action, 0, 1)
            action = CPG_transfer(rl_action, CPG_controller, obs, t)

            next_state, reward, done, _ = env.step(action)

            obs = next_state

            # transfer
            obs1.append(next_state.reshape((1, -1)))  # for storage
            next_state = obs2state(next_state)
            obs2.append(next_state.reshape((1, -1)))  # for storage

            actions.append(rl_action.reshape((1, -1)))
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state, update=update_rs)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            rewards.append(reward)  # for storage
            dones.append(done)  # for storage
            mask = 0 if done else 1

            memory.push(state, rl_action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

        # log sampled data, just for debugging
        if log_flag:
            rewards = np.array(rewards, dtype=np.float64)
            dones = np.array(dones, dtype=np.float64)
            tmp = np.vstack((rewards, dones))  # states_x, states_y,
            tmp1 = np.transpose(tmp)
            actions = np.concatenate(actions)

            obs1 = np.concatenate(obs1[:-1])
            obs2 = np.concatenate(obs2[:-1])
            data = np.concatenate((obs1, obs2, actions, tmp1), axis=1)

            trajectory = {}
            for j in range(data.shape[0]):
                for i in range(data.shape[1]):
                    trajectory[i] = data[j][i]
                logger.log(trajectory)
                logger.write()

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
Example #10
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, tensor,
                    render, running_state, update_rs, min_batch_size, mode_list, state_type,
                    num_steps_per_mode):
    torch.randn(pid, )
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    max_t = num_steps_per_mode * len(mode_list) - 1

    while num_steps < min_batch_size:
        state = env.reset()
        if state_type == 'decayed_context':
            state = np.concatenate((state, np.array([1.0]),
                                    activity_map(mode_list[0]),
                                    activity_map(mode_list[min(1, len(mode_list)-1)])), axis=0)
        elif state_type == 'context':
            state = np.concatenate((state, activity_map(mode_list[0]),
                                    activity_map(mode_list[min(1, len(mode_list)-1)])), axis=0)

        if running_state is not None:
            state = running_state(state, update=update_rs)
        reward_episode = 0

        for t in range(10000):
            curr_mode_id = t // num_steps_per_mode
            if t % num_steps_per_mode == 0:
                if hasattr(env.env, 'mode'):
                    env.env.mode = mode_list[curr_mode_id]
            state_var = Variable(tensor(state).unsqueeze(0), volatile=True)
            if mean_action:
                action = policy(state_var)[0].data[0].numpy()
            else:
                action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward

            next_mode_id = min(t+1, max_t) // num_steps_per_mode

            if state_type == 'decayed_context':
                next_state = np.concatenate((next_state, 
                                             np.array([1/((t % num_steps_per_mode) + 1)]),
                                             activity_map(mode_list[next_mode_id]),
                                             activity_map(mode_list[min(next_mode_id+1, len(mode_list)-1)])), axis=0)
            elif state_type == 'context':
                next_state = np.concatenate((next_state,
                                             activity_map(mode_list[next_mode_id]),
                                             activity_map(mode_list[min(next_mode_id+1, len(mode_list)-1)])), axis=0)

            if running_state is not None:
                next_state = running_state(next_state, update=update_rs)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            if t == num_steps_per_mode * len(mode_list) - 1:
                done = True

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
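
Example #10 only requires that activity_map turn a mode label into a fixed-length context vector that can be concatenated onto the state. A purely illustrative sketch with an assumed label set:

import numpy as np

MODES = ['walk', 'run', 'jump']   # assumed label set, for illustration only

def activity_map(mode):
    # One-hot encode the mode so it can be appended to the observation.
    context = np.zeros(len(MODES), dtype=np.float64)
    context[MODES.index(mode)] = 1.0
    return context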
Example #11
def collect_samples(pid, queue, env, policy, custom_reward, mean_action,
                    render, running_state, min_batch_size):
    torch.randn(pid)
    log = dict()
    log['reward_list'] = list()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    # tbd, should match build main dtype
    dtype = torch.float64

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(10000):
            #print('t:{:.1f}\tnum_steps:{:.1f}\tmin_batch_size:{:.1f}'.format(t,num_steps,min_batch_size))
            # tbd, add .to(dtype)
            state_var = tensor(state).to(dtype).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward.expert_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break
            #if t > min_batch_size:
            #    break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)
        log['reward_list'].append(reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes  # tbd: num_episodes -> num_steps
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
Example #12
def collect_samples(pid,
                    queue,
                    env,
                    policy,
                    custom_reward,
                    mean_action,
                    render,
                    running_state,
                    min_batch_size,
                    aux_running_state,
                    intervention_device=None):
    torch.randn(pid)
    log = dict()
    extra_mem_fields = []
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    aux_state = None
    next_aux_state = None
    car_racing_env = env.spec.id == 'CarRacing-v0'
    is_img_state = len(env.observation_space.shape) == 3
    if car_racing_env:
        extra_mem_fields.extend(['aux_state', 'aux_next_state'])
    if is_img_state:
        img_t = img_transform(imgnet_means, imgnet_stds)
    if intervention_device is not None:
        intervener = Intervener(intervention_device, env.spec.id)
        extra_mem_fields.append('expert_mask')

    memory = Memory(extra_mem_fields)

    while num_steps < min_batch_size:
        state = env.reset()
        if car_racing_env:
            aux_state = np.array([np.linalg.norm(env.car.hull.linearVelocity)])
        if running_state is not None:
            state = running_state(state)
        if aux_state is not None and aux_running_state is not None:
            aux_state = aux_running_state(aux_state)
        reward_episode = 0

        for t in range(10000):
            if is_img_state:
                state_var = img_t(state).unsqueeze(0)
            else:
                state_var = tensor(state).unsqueeze(0)
            if aux_state is not None:
                aux_state_var = tensor(aux_state).view(1, -1).to(dtype)

            with torch.no_grad():
                if mean_action:
                    if aux_state is not None:
                        action = policy(state_var, aux_state_var)[0][0].numpy()
                    else:
                        action = policy(state_var)[0][0].numpy()
                else:
                    if aux_state is not None:
                        action = policy.select_action(
                            state_var, aux_state_var)[0].numpy()
                    else:
                        action = policy.select_action(state_var)[0].numpy()

            if intervention_device is not None:
                intervene_action = intervener.get_action()
                if np.any(intervene_action):
                    action = intervene_action
                    expert_action = 1
                    time.sleep(intervener.step_delay)
                else:
                    expert_action = 0
                # time.sleep(intervener.step_delay)

            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            next_state, reward, done, _ = env.step(action)
            if car_racing_env:
                next_aux_state = np.array(
                    [np.linalg.norm(env.car.hull.linearVelocity)])
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)
            if next_aux_state is not None and aux_running_state is not None:
                next_aux_state = aux_running_state(next_aux_state)

            if custom_reward is not None:
                if is_img_state:
                    reward = custom_reward(state, action, aux_state)
                else:
                    reward = custom_reward(state, action)
                    total_c_reward += reward
                    min_c_reward = min(min_c_reward, reward)
                    max_c_reward = max(max_c_reward, reward)

            # TODO remove this, temporary for faster testing
            if t > 100:
                done = True

            mask = 0 if done else 1

            if is_img_state:
                mem_state = state_var.squeeze().numpy()
                mem_next_state = img_t(next_state).numpy()
            else:
                mem_state = state
                mem_next_state = next_state

            mem_list = [mem_state, action, mask, mem_next_state, reward]
            if aux_state is not None:
                mem_list.extend([aux_state, next_aux_state])
            if intervention_device is not None:
                mem_list.append(expert_action)
            memory.push(*mem_list)

            if render:
                env.render()
            if done:
                break

            state = next_state
            if aux_state is not None:
                aux_state = next_aux_state

        # log stats
        num_steps += (t + 1)

        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_episodes
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
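
A plausible img_transform for the image-observation branch above, assuming imgnet_means and imgnet_stds are per-channel statistics (a guess at the helper, not the repository's code):

import torchvision.transforms as T

def img_transform(means, stds):
    # Convert an HxWxC uint8 frame into a normalized CHW float tensor.
    return T.Compose([
        T.ToTensor(),               # uint8 HWC -> float CHW in [0, 1]
        T.Normalize(means, stds),   # per-channel standardization
    ])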
Example #13
def collect_samples(pid, queue, env, policy_mgr, policy_wrk, custom_reward,
                    mean_action, render, running_state, min_batch_size):
    torch.randn(pid)
    log = dict()
    memory_mgr = Memory()
    memory_wrk = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    avg_wrk_reward = 0
    avg_mgr_reward = 0

    mgr_steps = 0
    done_count = 0
    state, curr_pos = env.reset()
    while num_steps < min_batch_size:
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        # Manager
        state_mgr = tensor(state).unsqueeze(0)
        with torch.no_grad():
            direction = policy_mgr.select_action(state_mgr)[0]
        direction = int(direction.detach().numpy())
        subgoal = get_target(curr_pos, direction)

        # Worker
        state_wrk = tensor(np.concatenate((state, subgoal)))

        for t in range(10000):
            # Sample Action
            with torch.no_grad():
                if mean_action:
                    action = policy_wrk(state_wrk.unsqueeze(0))[0][0].numpy()
                else:
                    action = policy_wrk.select_action(
                        state_wrk.unsqueeze(0))[0].numpy()
            # Take Action
            next_state, reward, done, info = env.step(action)

            ## Sparse Rewards
            dist = np.linalg.norm(info['fingertip'] - info['target'])
            reward = -1 if (dist > 0.05) else 0

            next_state_wrk = np.concatenate((next_state, subgoal))
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask_mgr = 0 if done else 1

            # Intrinsic Reward and Subgoal Reached Definition
            reward_wrk = -np.linalg.norm(
                subgoal - info['fingertip']) + info['reward_ctrl']
            subgoal_reached = (-reward_wrk < 0.05)
            mask_wrk = 0 if (done or subgoal_reached) else 1

            # Collect Rollout
            memory_wrk.push(state_wrk.detach().numpy(), action, mask_wrk,
                            next_state_wrk, reward_wrk)
            avg_wrk_reward += reward_wrk

            if render:
                env.render()
            if (done or subgoal_reached):
                break

            state_wrk = tensor(next_state_wrk)

        # Manager Rollout
        next_state_mgr = next_state
        reward_mgr = reward_episode / 50.0
        memory_mgr.push(state, direction, mask_mgr, next_state_mgr, reward_mgr)

        state = next_state
        avg_mgr_reward += reward_mgr
        mgr_steps += 1

        # log stats
        num_steps += (t + 1)
        if (done):
            num_episodes += 1
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)
            state, curr_pos = env.reset()
            total_reward += reward_episode

        else:
            curr_pos = info['fingertip']

    log['num_steps'] = num_steps
    log['mgr_steps'] = mgr_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / (num_episodes)
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['mgr_reward'] = avg_mgr_reward / mgr_steps
    log['wrk_reward'] = avg_wrk_reward / num_steps
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory_mgr, memory_wrk, log])
    else:
        return memory_mgr, memory_wrk, log
Example #14
def collect_samples(pid, obs_shape_n, act_shape_n, queue, env, policy,
                    custom_reward, mean_action, tensor, render, running_state,
                    update_rs, min_batch_size, g_itr):
    n_agents = len(policy)
    torch.randn(pid, )
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    # EPS_MAX = 0.9995
    # eps_val = EPS_MAX**float(g_itr)
    # if eps_val < 0.1:
    #     eps_val = 0.1

    # while num_steps < min_batch_size:
    while num_steps < min_batch_size:
        state = env.reset()
        # print(state)
        # if running_state is not None:
        #     state = running_state(state, update=update_rs)
        reward_episode = 0

        for t in range(10000):
            num_steps += 1
            action = []
            rewards = []
            state_var = Variable(tensor(state).unsqueeze(0), volatile=True)
            if mean_action:
                # never arrived
                action = policy(state_var)[0].data[0].numpy()
            else:
                for i in range(n_agents):
                    # action = policy[i].select_ma_action(state_var, n_agents)[0].numpy()
                    action.append(policy[i].select_action(
                        state_var[:, i, :])[0].numpy()[0])
                # freeze
                # action[0] = 0
                # action[1] = 0
                # action[2] = 0
                # action[0] = 0
                # eps = np.random.randn(action.size)*eps_val
                # action = action + eps
                # np.clip(action, -1., 1.)
                # print(action)

            # action = int(action) if policy.is_disc_action else action.astype(np.float64)
            one_hot_actions = []
            for i in range(n_agents):
                one_hot_actions.append(
                    index_to_one_hot(action[i], act_shape_n[i]))

            # print(one_hot_actions)
            next_state, reward, done, _ = env.step(one_hot_actions)

            # Added for shaped reward by haiyinpiao.
            # for punishing the bipedwalker from stucking in where it is originally.
            # if (next_state[2]<0.2):
            #     reward -=2
            # -------------------------
            # print(reward)
            reward_episode += np.mean(reward[3])
            # if running_state is not None:
            #     next_state = running_state(next_state, update=update_rs)

            # if custom_reward is not None:
            #     reward = custom_reward(state, action)
            #     total_c_reward += reward
            #     min_c_reward = min(min_c_reward, reward)
            #     max_c_reward = max(max_c_reward, reward)

            # mask = 0 if done[0] else 1
            mask = done

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
                # time.sleep(0.1)
            # done[3] indicates if the good agents caught
            if done[3] or num_steps >= min_batch_size:
                break
            # if done[0]:
            #     break

            state = next_state

        # log stats
        # num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    # print(pid,"collected!")

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
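
A sketch of the index_to_one_hot helper assumed by Example #14, inferred from its call site (the real helper may differ):

import numpy as np

def index_to_one_hot(index, dim):
    # Encode a discrete action index as a one-hot vector of length `dim`.
    one_hot = np.zeros(dim, dtype=np.float32)
    one_hot[int(index)] = 1.0
    return one_hot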
Example #15
def collect_samples(pid,
                    queue,
                    env,
                    p_nets,
                    custom_reward,
                    mean_action,
                    render,
                    running_state,
                    min_batch_size,
                    rsi_mem_prev=None):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    team_reward = 0.0
    if args.dec_agents is True:
        reward_episodes = [0.0] * env.n_agents
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        if args.rsi is True and rsi_mem_prev is not None:
            # randomized starting point.
            sp = rsi_mem_prev.rsi_state
            rs = random.sample(sp, 1)
            rs = rs[0]
            # print(rs)
            # exit()
            state = env.rsi_reset({0: rs[0], 1: rs[1], 2: rs[2], 3: rs[3]})
        else:
            state = env.reset()

        if running_state is not None:
            state = running_state(state)
        team_reward = 0
        if args.dec_agents is True:
            reward_episodes = [0.0] * env.n_agents

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            action = []

            with torch.no_grad():
                if mean_action:
                    action = p_nets[0](state_var)[0][0].numpy()
                else:
                    for i in range(env.n_agents):
                        action += p_nets[i].select_action(state_var)
                        if args.dec_agents is False:
                            break
            next_state, reward, done, _ = env.step(action)
            team_reward += sum(reward)
            if args.dec_agents is True:
                # Accumulate per-agent episode returns element-wise.
                reward_episodes = [
                    i + j for i, j in zip(reward_episodes, reward)
                ]

            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            if args.dec_agents is False:
                mask = 0 if all(done) else 1
            else:
                mask = [bool(1 - e) for e in done]

            if args.rsi is True:
                memory.push(state, action, mask, next_state, reward,
                            env.agent_pos)
            else:
                memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()

            if all(done):
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += team_reward
        min_reward = min(min_reward, team_reward)
        max_reward = max(max_reward, team_reward)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
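
The args.rsi branch above samples a previously recorded joint state from rsi_mem_prev.rsi_state and restarts the episode there via env.rsi_reset. How rsi_state is populated is not shown; a purely illustrative holder (hypothetical class and method names, not from this repository) could simply record env.agent_pos each time a transition is stored, so the next batch can be seeded from already-visited configurations:

import random


class RSIMemory(object):
    """Illustrative only: keeps the joint agent positions seen during a batch."""

    def __init__(self):
        self.rsi_state = []

    def record(self, agent_pos):
        # Called alongside memory.push(...) with env.agent_pos (indexable 0..3 here).
        self.rsi_state.append(agent_pos)


# A later collection pass can then draw a reset point exactly as above:
# rs = random.choice(rsi_mem_prev.rsi_state)
# state = env.rsi_reset({0: rs[0], 1: rs[1], 2: rs[2], 3: rs[3]})
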
Exemple #16
0
def collect_samples(pid, queue, env, policy, custom_reward, mean_action,
                    render, running_state, min_batch_size):
    if pid > 0:
        torch.manual_seed(torch.randint(0, 5000, (1, )) * pid)
        if hasattr(env, 'np_random'):
            env.np_random.seed(env.np_random.randint(5000) * pid)
        if hasattr(env, 'env') and hasattr(env.env, 'np_random'):
            env.env.np_random.seed(env.env.np_random.randint(5000) * pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
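
When several of these workers run in parallel, each returns its own log dict through the queue, and the per-worker dicts have to be combined before reporting. The helper below is only a sketch of such an aggregator, consistent with the keys written above (sums for counts and totals, min/max across workers, averages recomputed from the merged totals); the aggregator actually shipped with any given repository may differ.

def merge_log(log_list):
    """Aggregate the per-worker log dicts produced by collect_samples."""
    log = dict()
    log['num_steps'] = sum(x['num_steps'] for x in log_list)
    log['num_episodes'] = sum(x['num_episodes'] for x in log_list)
    log['total_reward'] = sum(x['total_reward'] for x in log_list)
    log['avg_reward'] = log['total_reward'] / log['num_episodes']
    log['max_reward'] = max(x['max_reward'] for x in log_list)
    log['min_reward'] = min(x['min_reward'] for x in log_list)
    if 'total_c_reward' in log_list[0]:
        log['total_c_reward'] = sum(x['total_c_reward'] for x in log_list)
        log['avg_c_reward'] = log['total_c_reward'] / log['num_steps']
        log['max_c_reward'] = max(x['max_c_reward'] for x in log_list)
        log['min_c_reward'] = min(x['min_c_reward'] for x in log_list)
    return log
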
Exemple #17
0
def collect_samples(pid, queue, env, policy, custom_reward, mean_action,
                    render, running_state, min_batch_size):
    # def cat_s_a(s:torch.tensor, a:int):
    #     batch_size = 1
    #     label = torch.LongTensor([[a]])
    #     a = torch.zeros(batch_size, env.action_space.n).scatter_(1, label, 1)
    #     return torch.cat((s, a), 1)

    def cat_s_a_np(s: np.ndarray, a: int):
        batch_size = 1
        # label = np.array([[a]])
        oh = np.zeros((batch_size, env.action_space.n))
        oh[0, a] = 1
        return np.append(s, oh)

    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        # repeat = 0
        # repeat_len = 0
        stop = True

        # batch_size = 1
        # label = torch.LongTensor(batch_size, 1).random_() % env.action_space.n
        # last_action = torch.zeros(batch_size, env.action_space.n).scatter_(1, label, 1)
        last_action = 0
        # ready_to_push = False
        # reward_period = 0
        state = cat_s_a_np(state, last_action)
        # interval = 0

        for t in range(10000):
            state_var = tensor(state).unsqueeze(0)
            # state_var = torch.cat((state_var, last_action), 1)
            # state_var = cat_s_a(state_var, 1)
            # Learn to stop, else maintain last action
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    # action, repeat = policy.select_action(state_var)[0].numpy()
                    action, stop = policy.select_action(state_var)
                    action = action[0].numpy()
                    stop = stop[0].numpy()
                    # print(action)
                    # print(repeat)
                    # exit()
            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            stop = int(stop)

            # action only updated when necessary.
            if t == 0 or stop == 1:
                last_action = action
            #     ready_to_push = True
            # repeat += 1

            assert (last_action is not None)
            next_state, reward, done, _ = env.step(last_action)

            next_state = cat_s_a_np(next_state, last_action)

            reward_episode += reward
            # reward_period += reward*(args.gamma**(repeat-1))

            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            memory.push(state, last_action, mask, next_state, reward, stop)
            # if ready_to_push == True or done:
            #     memory.push(state, last_action, mask, next_state, reward_period, stop, repeat)
            #     ready_to_push = False
            #     repeat = 0
            #     reward_period = 0

            # memory.push(state, last_action, mask, next_state, reward, stop)

            if render:
                env.render()
            if done:
                # print(reward_episode)
                # print(t)
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
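
The cat_s_a_np helper in this variant appends a one-hot encoding of the last action to the observation, so the policy sees what it is currently repeating and can learn when to stop and re-select. A small worked example with hypothetical sizes (3-dimensional observation, 4 discrete actions):

import numpy as np

state = np.array([0.1, -0.3, 0.7])   # hypothetical 3-d observation
last_action = 2                      # index into a hypothetical 4-action space

oh = np.zeros((1, 4))
oh[0, last_action] = 1
augmented = np.append(state, oh)     # what cat_s_a_np(state, last_action) computes

# np.append flattens the one-hot row into the state vector:
# augmented == [0.1, -0.3, 0.7, 0.0, 0.0, 1.0, 0.0], shape (7,)
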
Exemple #18
0
def collect_samples(pid,
                    queue,
                    env,
                    policy,
                    custom_reward,
                    mean_action,
                    tensor,
                    render,
                    running_state,
                    update_rs,
                    min_batch_size,
                    seed,
                    thread_id,
                    early_stopping=False):
    torch.randn(pid, )
    log = dict()
    memory = Memory()
    if early_stopping:
        training = Memory()
        validation = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    episode_rewards = []

    while num_steps < min_batch_size:
        #env.seed(seed + thread_id)
        state = env.reset()

        #print("state after env.reset():",state)

        if running_state is not None:
            state = running_state(state, update=update_rs)
        reward_episode = 0

        for t in range(min_batch_size - num_steps):
            # Variable(..., volatile=True) is deprecated; use torch.no_grad() instead.
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            #print("action:",action)
            #get env, action and then get reward and next_state
            next_state, reward, done, _ = env.step(action)
            #reward sum of this episode
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state, update=update_rs)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1

            if early_stopping:
                ra = random.random()
                if ra > 0.8 and len(validation) <= min_batch_size * 0.1:
                    validation.push(state, action, mask, next_state, reward)
                else:
                    training.push(state, action, mask, next_state, reward)
            else:
                memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        episode_rewards.append(reward_episode)
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = min_batch_size
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['episode_rewards'] = episode_rewards
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward

    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        if early_stopping:
            return memory, training, validation, log
        else:
            return memory, log
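
With early_stopping=True, transitions are routed to the held-out validation buffer with probability 0.2 until it holds roughly 10% of min_batch_size, and everything else goes to training. The snippet below is only a sketch of how a caller might use that split to stop fitting once the held-out loss stops improving; fit_one_epoch and evaluate_loss are caller-supplied placeholders, not functions from the repository.

def fit_with_early_stopping(fit_one_epoch, evaluate_loss, training, validation,
                            max_epochs=50, patience=5):
    """Generic early-stopping loop over the training/validation Memory split."""
    best_loss = float('inf')
    bad_epochs = 0
    for _ in range(max_epochs):
        fit_one_epoch(training.sample())               # caller-supplied update step
        val_loss = evaluate_loss(validation.sample())  # caller-supplied evaluation
        if val_loss < best_loss:
            best_loss, bad_epochs = val_loss, 0
        else:
            bad_epochs += 1
            if bad_epochs >= patience:
                break
    return best_loss
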
Exemple #19
0
def collect_samples(pid, queue, env, policy, custom_reward, mean_action, render, running_state, min_batch_size, horizon):

	torch.randn(pid)
	log = dict()
	memory = Memory()
	num_steps = 0
	total_reward = 0
	min_reward = 1e6
	max_reward = -1e6
	total_c_reward = 0
	min_c_reward = 1e6
	max_c_reward = -1e6
	num_episodes = 0

	while num_steps < min_batch_size:

		state = env.reset()
		if running_state is not None:
			state = running_state(state)

		reward_episode = 0

		for t in range(0, horizon):

			state_var = tensor(state).unsqueeze(0)
			
			with torch.no_grad():
				if mean_action:
					action = policy(state_var)[0][0].numpy()
				else:
					action = policy.select_action(state_var)[0].numpy()

			action = int(action) if policy.is_discrete_action else action.astype(np.float64)

			next_state, reward, done, _ = env.step(action)
			reward_episode += reward

			if running_state is not None:
				next_state = running_state(next_state)

			if custom_reward is not None:
				reward = custom_reward(state, action)
				total_c_reward += reward
				min_c_reward = min(min_c_reward, reward)
				max_c_reward = max(max_c_reward, reward)

			mask = 0 if done else 1

			memory.push(state, action, mask, next_state, reward)

			if render:
				env.render()

			if done:
				break

			state = next_state

		num_steps += (t + 1)
		num_episodes += 1
		total_reward += reward_episode
		min_reward = min(min_reward, reward_episode)
		max_reward = max(max_reward, reward_episode)

	
	log["num_steps"] = num_steps
	log["num_episodes"] = num_episodes
	log["total_reward"] = total_reward
	log["avg_reward"] = total_reward / num_episodes
	log["max_reward"] = max_reward
	log["min_reward"] = min_reward
	
	if custom_reward is not None:
		log["total_c_reward"] = total_c_reward
		log["avg_c_reward"] = total_c_reward / num_steps
		log["max_c_reward"] = max_c_reward
		log["min_c_reward"] = min_c_reward

	if queue is not None:
		queue.put([pid, memory, log])
	else:
		return memory, log
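
This variant adds an explicit horizon argument so an episode is truncated after a fixed number of steps even if the environment never returns done. A hypothetical call, assuming env, policy and running_state have been constructed elsewhere and that episodes should be cut at 200 steps:

memory, log = collect_samples(pid=0,
                              queue=None,
                              env=env,
                              policy=policy,
                              custom_reward=None,
                              mean_action=False,
                              render=False,
                              running_state=running_state,
                              min_batch_size=2048,
                              horizon=200)
print(log["num_episodes"], log["avg_reward"])
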
Exemple #20
0
def collect_samples(pid,
                    queue,
                    env,
                    policy,
                    custom_reward,
                    mean_action,
                    render,
                    running_state,
                    min_batch_size,
                    state_only=False,
                    opponent_policy=None,
                    alpha=None,
                    reward_type=None):
    torch.randn(pid)
    log = dict()
    if opponent_policy is None:
        memory = Memory()
    else:
        memory = TwoPlayerMemory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(100000):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    if opponent_policy is not None:
                        opponent_plays = np.random.choice(2,
                                                          p=[alpha, 1 - alpha])
                        opponent_action = opponent_policy.select_action(
                            state_var)[0].numpy()
                        player_action = policy.select_action(
                            state_var)[0].numpy()
                        if opponent_plays:
                            action = copy.deepcopy(opponent_action)
                        else:
                            action = copy.deepcopy(player_action)

                        player_action = int(
                            player_action
                        ) if policy.is_disc_action else player_action.astype(
                            np.float64)
                        opponent_action = int(
                            opponent_action
                        ) if policy.is_disc_action else opponent_action.astype(
                            np.float64)
                        """if np.isnan(player_action).any():
                            print("Player Nan")
                            player_action = np.zeros_like(player_action)
                        if np.isnan(opponent_action).any():
                            print("Opponent Nan")
                            opponent_action = np.zeros_like(opponent_action)
                        action = (1 - alpha)*opponent_action.clip(-1.0, 1.0) + alpha*player_action.clip(-1.0, 1.0)"""
                    else:
                        action = policy.select_action(state_var)[0].numpy()

            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            if not policy.is_disc_action:
                action_to_play = action.clip(-1.0, 1.0)
                next_state, reward, done, _ = env.step(action_to_play)
            else:
                next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:

                if state_only:
                    reward = custom_reward(state, next_state, reward_type)
                else:
                    reward = custom_reward(state, action, reward_type)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1
            if opponent_policy is not None:
                memory.push(state, player_action, opponent_action, action,
                            mask, next_state, reward)
            else:
                memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                if opponent_policy is not None:
                    memory.push(next_state, player_action, opponent_action,
                                action, mask, next_state, reward)
                else:
                    memory.push(next_state, action, mask, next_state, reward)
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
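
When an opponent_policy is given, each transition records both players' proposed actions as well as the action actually executed, so the buffer needs two extra fields. Below is a minimal, hypothetical TwoPlayerMemory with a field order matching the push calls above; the class used by the original repository may differ.

from collections import namedtuple

TwoPlayerTransition = namedtuple(
    'TwoPlayerTransition',
    ('state', 'player_action', 'opponent_action', 'action', 'mask', 'next_state', 'reward'))


class TwoPlayerMemory(object):
    def __init__(self):
        self.memory = []

    def push(self, *args):
        self.memory.append(TwoPlayerTransition(*args))

    def sample(self):
        # Column-wise access, e.g. batch.player_action and batch.opponent_action.
        return TwoPlayerTransition(*zip(*self.memory))

    def __len__(self):
        return len(self.memory)
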
Exemple #21
0
def collect_samples(pid, queue, env, policy, custom_reward, mean_action,
                    tensor, render, running_state, update_rs, min_batch_size):
    torch.randn(pid, )
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state, update=update_rs)
        reward_episode = 0

        for t in range(10000):
            # Variable(..., volatile=True) is deprecated; use torch.no_grad() instead.
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()
            action = int(action) if policy.is_disc_action else action.astype(np.float64)
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state, update=update_rs)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
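
running_state appears in every variant above; in samplers of this kind it is usually a running mean/variance filter over observations (often named ZFilter), with update=False used to freeze the statistics at evaluation time. The classes below are a minimal sketch of such a filter based on Welford's online update; the names and details are illustrative, not the filter shipped with these repositories.

import numpy as np


class RunningStat(object):
    """Welford online estimate of per-dimension mean and standard deviation."""

    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self._m2 = np.zeros(shape)

    def push(self, x):
        x = np.asarray(x)
        self.n += 1
        delta = x - self.mean
        self.mean = self.mean + delta / self.n
        self._m2 += delta * (x - self.mean)

    @property
    def std(self):
        var = self._m2 / (self.n - 1) if self.n > 1 else np.ones_like(self.mean)
        return np.sqrt(var)


class ZFilter(object):
    """Normalize observations with running statistics: y = (x - mean) / (std + eps)."""

    def __init__(self, shape, clip=10.0):
        self.rs = RunningStat(shape)
        self.clip = clip

    def __call__(self, x, update=True):
        if update:
            self.rs.push(x)
        y = (np.asarray(x) - self.rs.mean) / (self.rs.std + 1e-8)
        return np.clip(y, -self.clip, self.clip)
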
Exemple #22
0
def collect_samples(pid, queue, env, policy_mgr, policy_wrk, custom_reward,
                    mean_action, render, running_state, min_batch_size):
    torch.randn(pid)
    log = dict()
    memory_mgr = Memory()
    memory_wrk = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0

    avg_wrk_reward = 0
    avg_mgr_reward = 0

    mgr_steps = 0
    done_count = 0
    state = env.reset()
    while num_steps < min_batch_size:
        #state_wrk = tensor(state['observation'])
        #state = np.concatenate((state['observation'],state['desired_goal']))
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        state_mgr = tensor(
            np.concatenate(
                (state['observation'], state['desired_goal']))).unsqueeze(0)
        with torch.no_grad():
            direction = policy_mgr.select_action(state_mgr)[0]

        direction = int(direction.detach().numpy())
        curr_pos = state['achieved_goal']
        subgoal = get_target(curr_pos, direction)
        state_wrk = tensor(
            np.concatenate(
                (state['observation'], state['desired_goal'], subgoal)))

        for t in range(10000):
            with torch.no_grad():
                if mean_action:
                    # 'policy' and 'state_var' were undefined here; use the worker policy and its state.
                    action = policy_wrk(state_wrk.unsqueeze(0))[0][0].numpy()
                else:
                    action = policy_wrk.select_action(
                        state_wrk.unsqueeze(0))[0].numpy()
            next_state, reward, done, info = env.step(action)

            # dist = np.linalg.norm(info['fingertip']-info['target'])

            next_state_wrk = np.concatenate(
                (next_state['observation'], next_state['desired_goal'],
                 subgoal))
            reward_episode += reward
            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask_mgr = 0 if done else 1

            reward_wrk = -np.linalg.norm(subgoal - next_state['achieved_goal'])
            #reward_wrk = reward
            subgoal_reached = (-reward_wrk < 0.05)
            mask_wrk = 0 if (done or subgoal_reached) else 1
            #mask_wrk = 0 if (done) else 1

            memory_wrk.push(state_wrk.detach().numpy(), action, mask_wrk,
                            next_state_wrk, reward_wrk)
            avg_wrk_reward += reward_wrk

            if render:
                env.render()
            if (done or subgoal_reached):
                #if (done):
                break

            state_wrk = tensor(next_state_wrk)

        #next_state_mgr = np.concatenate((next_state['observation'],next_state['desired_goal']))
        next_state_mgr = next_state['observation']
        #reward_mgr = reward_episode - 10*np.linalg.norm(next_state['achieved_goal'] - next_state['desired_goal'])
        #reward_mgr = reward_episode/50.0 -  np.linalg.norm(subgoal - info['target'])
        reward_mgr = reward_episode / 50.0
        memory_mgr.push(
            np.concatenate((state['observation'], state['desired_goal'])),
            direction, mask_mgr, next_state_mgr, reward_mgr)

        state = next_state
        avg_mgr_reward += reward_mgr
        mgr_steps += 1

        # log stats
        num_steps += (t + 1)
        if (done):
            num_episodes += 1
            min_reward = min(min_reward, reward_episode)
            max_reward = max(max_reward, reward_episode)
            state = env.reset()
            curr_pos = state['achieved_goal']
            total_reward += reward_episode

        else:
            curr_pos = state['achieved_goal']

    log['num_steps'] = num_steps
    log['mgr_steps'] = mgr_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / (num_episodes)
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['mgr_reward'] = avg_mgr_reward / mgr_steps
    log['wrk_reward'] = avg_wrk_reward / num_steps
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory_mgr, memory_wrk, log])
    else:
        return memory_mgr, memory_wrk, log
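
In this hierarchical variant the manager picks a discrete direction, get_target turns it into a spatial subgoal near the current achieved position, and the worker is rewarded for closing the distance to that subgoal. get_target itself is not shown; the version below is purely illustrative for a 3-D goal space (axis-aligned offsets with a hypothetical step size), not the repository's definition.

import numpy as np


def get_target(curr_pos, direction, step=0.05):
    # Illustrative only: offset the current position along one of six axis-aligned directions.
    offsets = np.array([[1, 0, 0], [-1, 0, 0],
                        [0, 1, 0], [0, -1, 0],
                        [0, 0, 1], [0, 0, -1]], dtype=np.float64)
    return np.asarray(curr_pos, dtype=np.float64) + step * offsets[direction]
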
def collect_samples(pid,
                    queue,
                    env,
                    policy,
                    custom_reward,
                    mean_action,
                    render,
                    running_state,
                    min_batch_size,
                    randomise=False):
    torch.randn(pid)
    log = dict()
    memory = Memory()
    num_steps = 0
    total_reward = 0
    min_reward = 1e6
    max_reward = -1e6
    total_c_reward = 0
    min_c_reward = 1e6
    max_c_reward = -1e6
    num_episodes = 0
    best_index = 0
    worst_index = 0
    episodic_rewards = []
    while num_steps < min_batch_size:
        state = env.reset()
        if running_state is not None:
            state = running_state(state)
        reward_episode = 0

        for t in range(2048):
            state_var = tensor(state).unsqueeze(0)
            with torch.no_grad():
                if mean_action:
                    action = policy(state_var)[0][0].numpy()
                else:
                    action = policy.select_action(state_var)[0].numpy()

            action = int(action) if policy.is_disc_action else action.astype(
                np.float64)
            if randomise:
                action = env.action_space.sample()
            next_state, reward, done, _ = env.step(action)
            reward_episode += reward

            if running_state is not None:
                next_state = running_state(next_state)

            if custom_reward is not None:
                reward = custom_reward(state, action)
                total_c_reward += reward
                min_c_reward = min(min_c_reward, reward)
                max_c_reward = max(max_c_reward, reward)

            mask = 0 if done else 1

            memory.push(state, action, mask, next_state, reward)

            if render:
                env.render()
            if done:
                break

            state = next_state

        # log stats
        num_steps += (t + 1)
        num_episodes += 1
        total_reward += reward_episode
        episodic_rewards.append(reward_episode)
        min_reward = min(min_reward, reward_episode)
        max_reward = max(max_reward, reward_episode)

    log['num_steps'] = num_steps
    log['num_episodes'] = num_episodes
    log['total_reward'] = total_reward
    log['avg_reward'] = total_reward / num_episodes
    log['max_reward'] = max_reward
    log['min_reward'] = min_reward
    log['episodic_rewards'] = episodic_rewards
    if custom_reward is not None:
        log['total_c_reward'] = total_c_reward
        log['avg_c_reward'] = total_c_reward / num_steps
        log['max_c_reward'] = max_c_reward
        log['min_c_reward'] = min_c_reward

    if queue is not None:
        queue.put([pid, memory, log])
    else:
        return memory, log
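
With randomise=True the sampled policy action is discarded and env.action_space.sample() is taken instead, which is useful for gathering a random-behaviour baseline or warm-up data before training. A hypothetical call, assuming env and policy exist (observation normalization disabled here):

random_memory, random_log = collect_samples(pid=0,
                                            queue=None,
                                            env=env,
                                            policy=policy,
                                            custom_reward=None,
                                            mean_action=False,
                                            render=False,
                                            running_state=None,
                                            min_batch_size=4096,
                                            randomise=True)
print('random baseline avg reward:', random_log['avg_reward'])
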