Example #1
def evaluate(weights, args, NUM_EVALS=10, render=False):
	"""Rollout Worker runs a simulation in the environment to generate experiences and fitness values

		Parameters:
			args (object): Parameter class
			id (int): Specific Id unique to each worker spun
			task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
			result_pipe (pipe): Sender end of the pipe used to report back results
			data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
			models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
			store_transition (bool): Log experiences to exp_list?
			random_baseline (bool): Test with random action for baseline purpose?

		Returns:
			None
	"""
	env = RoverDomainPython(args, NUM_EVALS)
	model = MultiHeadActor(args.state_dim, args.action_dim, args.hidden_size, args.config.num_agents)

	# Load the evolved weights into the model (entries may be raw tensors or nn.Parameters)
	for i, param in enumerate(model.parameters()):
		try:
			param.data = weights[i]
		except Exception:
			param.data = weights[i].data



	fitness = [None for _ in range(NUM_EVALS)]
	frame = 0
	joint_state = env.reset()
	joint_state = utils.to_tensor(np.array(joint_state))

	while True: #unless done

		joint_action = [model.clean_action(joint_state[i, :], head=i).detach().numpy() for i in range(args.config.num_agents)]

		#JOINT ACTION [agent_id, universe_id, action]
		#Bound Action
		joint_action = np.array(joint_action).clip(-1.0, 1.0)
		next_state, reward, done, global_reward = env.step(joint_action)  # Simulate one step in environment
		#State --> [agent_id, universe_id, obs]
		#reward --> [agent_id, universe_id]
		#done --> [universe_id]
		#info --> [universe_id]

		next_state = utils.to_tensor(np.array(next_state))

		#Grab global reward as fitnesses
		for i, grew in enumerate(global_reward):
			if grew is not None:
				fitness[i] = grew


		joint_state = next_state
		frame += NUM_EVALS

		# Exit once every universe reports done
		if sum(done) == len(done):
			break

	return sum(fitness)/len(fitness)
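
Note: every snippet on this page leans on a small `utils` module that the examples do not include. A minimal sketch of what `utils.to_tensor` and `utils.to_numpy` presumably do, inferred from how they are called (an assumption, not the repositories' actual code):

import numpy as np
import torch


def to_tensor(ndarray):
    # Convert a NumPy array (or anything array-like) into a float32 torch tensor.
    return torch.from_numpy(np.asarray(ndarray, dtype=np.float32))


def to_numpy(tensor):
    # Detach from the autograd graph and move to CPU before converting back to NumPy.
    return tensor.detach().cpu().numpy()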
Example #2
    def evaluate(self,
                 net,
                 is_render,
                 is_action_noise=False,
                 store_transition=True):
        total_reward = 0.0

        state = self.env.reset()
        state = utils.to_tensor(state).unsqueeze(0)
        if self.args.is_cuda: state = state.cuda()
        done = False

        while not done:
            if store_transition:
                self.num_frames += 1
                self.gen_frames += 1
            if is_render: self.env.render()
            action = net.forward(state)
            action = action.clamp(-1, 1)  # clamp returns a new tensor; keep the result
            action = utils.to_numpy(action.cpu())
            if is_action_noise: action += self.ounoise.noise()

            next_state, reward, done, info = self.env.step(
                action.flatten())  # Simulate one step in environment
            next_state = utils.to_tensor(next_state).unsqueeze(0)
            if self.args.is_cuda:
                next_state = next_state.cuda()
            total_reward += reward

            if store_transition:
                self.add_experience(state, action, next_state, reward, done)
            state = next_state
        if store_transition: self.num_games += 1

        return total_reward
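
Examples #2 and #4 add `self.ounoise.noise()` to the action for exploration, but the noise generator itself is never shown. A minimal sketch, assuming a standard Ornstein-Uhlenbeck process as commonly used in DDPG-style training (the class name and hyperparameters are illustrative):

import numpy as np


class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # One Euler step of the OU process; returns an array shaped like the action.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state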
Example #3
    def evaluate(self, net, is_render, is_action_noise=False, store_transition=True):
        total_reward = 0.0

        state = self.env.reset()
        state = utils.to_tensor(state)
        state = state.to(device=device)

        done = False

        while not done:
            if store_transition: self.num_frames += 1; self.gen_frames += 1
            if is_render: self.env.render()
            state = pre_process(state.squeeze(0).cpu())
            state = utils.to_tensor(state).unsqueeze(0)
            state = state.to(device=device)
            #
            # print("input_size", input_model.size())
            # input()
            action = net.forward(state)#.unsqueeze(0)
            action = action.detach()  # insert it into the replay buffer without gradient tracking
            #action.clamp(-1,1) # already int, no need of clamp it
            #action = utils.to_numpy(action.cpu())
            #if is_action_noise: action += self.ounoise.noise() # add more randomless, exploration

            next_state, reward, done, info = self.env.step(action.cpu())#.flatten())  #Simulate one step in environment

            next_state = pre_process(next_state)#memory efficiency
            next_state = utils.to_tensor(next_state).unsqueeze(0)
            next_state = next_state.to(device=device)
            total_reward += reward
            if store_transition: self.add_experience(state, action, next_state, reward, done)
            state = next_state
        if store_transition: self.num_games += 1

        return total_reward  # return the total reward
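
Example #3 pushes observations through a `pre_process` helper "for memory efficiency" before tensorizing them. The helper is not shown; a hedged sketch, assuming it grayscales and downsamples image observations (purely illustrative, the real function may differ):

import numpy as np


def pre_process(frame, out_size=84):
    frame = np.asarray(frame, dtype=np.float32)
    if frame.ndim == 3:                      # HWC image: collapse channels to grayscale
        frame = frame.mean(axis=2)
    if frame.ndim == 2:                      # crude spatial downsampling by strided slicing
        h, w = frame.shape
        frame = frame[::max(1, h // out_size), ::max(1, w // out_size)]
    return frame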
Example #4
def evaluate(net, args, replay_memory, dict_all_returns, key, store_transition=True):
    total_reward = 0.0
    env = utils.NormalizedActions(gym.make(env_tag))
    state = env.reset()
    num_frames = 0
    state = utils.to_tensor(state).unsqueeze(0)
    # replay_buffer = replay_memory.ReplayMemory(args.buffer_size)
    # replay_memory[key] = replay_memory

    if args.is_cuda: state = state.cuda()
    done = False
    while not done:
        if store_transition: num_frames += 1
        # if render and is_render: env.render()
        action = net.forward(state)
        action.clamp(-1, 1)
        action = utils.to_numpy(action.cpu())
        # if is_action_noise: action += self.ounoise.noise()
        # print("1")

        next_state, reward, done, info = env.step(action.flatten())  # Simulate one step in environment
        next_state = utils.to_tensor(next_state).unsqueeze(0)
        if args.is_cuda:
            next_state = next_state.cuda()
        total_reward += reward

        if store_transition:
            add_experience(state, action, next_state, reward, done, replay_memory, args)
            # replay_memory[key] = replay_memory

            # if len(replay_buffer) > args.batch_size:
            #     transitions = replay_buffer.sample(args.batch_size)
            #     batch = replay_memory.Transition(*zip(*transitions))
            #     replay_queue.put(batch)
        state = next_state
Example #5
    def add_experience(self, state, action, next_state, reward, done):
        reward = utils.to_tensor(np.array([reward])).unsqueeze(0)
        reward = reward.to(device=device)
        if self.args.use_done_mask:
            done = utils.to_tensor(np.array([done]).astype('uint8')).unsqueeze(0)
            done = done.to(device=device)

        #print("--------action ", action, "state: ", state.shape, " next_state", next_state.shape)
        #input()
        self.replay_buffer.push(state, action, next_state, reward, done)
Example #6
 def add_experience(self, state, action, next_state, reward, done):
     reward = utils.to_tensor(np.array([reward])).unsqueeze(0)
     if self.args.is_cuda: reward = reward.cuda()
     if self.args.use_done_mask:
         done = utils.to_tensor(np.array([done]).astype('uint8')).unsqueeze(0)
         if self.args.is_cuda: done = done.cuda()
     action = utils.to_tensor(action)
     if self.args.is_cuda: action = action.cuda()
     self.replay_buffer.push(state, action, next_state, reward, done)
Example #7
def add_experience(state, action, next_state, reward, done, args):
    reward = utils.to_tensor(np.array([reward])).unsqueeze(0)
    if args.is_cuda: reward = reward.cuda()
    if args.use_done_mask:
        done = utils.to_tensor(np.array([done]).astype('uint8')).unsqueeze(0)
        if args.is_cuda: done = done.cuda()
    action = utils.to_tensor(action)
    if args.is_cuda: action = action.cuda()
    # replay_buffer.append(state, action, next_state, reward, done)
    # replay_queue.put((state, action, next_state, reward, done))
    # print("before put")
    return state  #(state, action, next_state, reward, done)
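
Examples #5 through #7 all end by calling `replay_buffer.push(state, action, next_state, reward, done)`; the buffer itself is not part of any snippet. A minimal sketch, assuming a plain ring buffer of transition tuples:

import random
from collections import deque


class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, next_state, reward, done):
        # Store one transition; the oldest entries are evicted once capacity is reached.
        self.buffer.append((state, action, next_state, reward, done))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)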
Example #8
    def shape_dpp(self, critic, state, action):

        Q1, Q2 = critic.forward(state, action)
        original_Q = torch.min(Q1, Q2)
        all_Q = [original_Q]

        state = utils.to_numpy(state.cpu())
        coupling = 4
        max_ind = 35

        qvals = []

        for coupling_mag in range(coupling):
            for row_id in range(len(state)):
                random_entry = random.randint(0, max_ind)
                state[row_id, random_entry] = 0.5

            shaped_state = utils.to_tensor(state).cuda()
            Q1, Q2 = critic.forward(shaped_state, action)
            all_Q.append(torch.min(Q1, Q2) / (coupling_mag + 1.0))

        all_Q = torch.cat(all_Q, 1)
        all_qvals = all_Q.cpu().data.numpy()
        all_qvals = np.expand_dims(np.max(all_qvals, 1), 1)

        multiplier = np.divide(all_qvals, original_Q.cpu().data.numpy())
        #print(all_qvals.shape, original_Q.cpu().data.numpy().shape)
        multiplier = np.clip(multiplier, 1, 100)  # np.clip is not in-place; keep the result
        multiplier = torch.Tensor(multiplier).cuda()

        #print(dpp_max.shape, all_Q.shape)

        return original_Q * multiplier, original_Q * multiplier
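
Example #8 assumes a TD3-style twin critic whose `forward(state, action)` returns two Q-value estimates. A minimal sketch of that interface (layer sizes and the class name are illustrative assumptions):

import torch
import torch.nn as nn


class TwinCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden=256):
        super().__init__()
        self.q1 = nn.Sequential(nn.Linear(state_dim + action_dim, hidden), nn.ReLU(),
                                nn.Linear(hidden, 1))
        self.q2 = nn.Sequential(nn.Linear(state_dim + action_dim, hidden), nn.ReLU(),
                                nn.Linear(hidden, 1))

    def forward(self, state, action):
        # Two independent Q heads over the concatenated state-action input.
        sa = torch.cat([state, action], dim=1)
        return self.q1(sa), self.q2(sa)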
Example #9
    def evaluate(self, agent, master_epsilon, sub_epsilon):

        total_reward = 0.0
        s = self.env.reset()
        s = utils.to_tensor(s).unsqueeze(0)
        done = False

        ms = []
        ma = []
        mns = []
        mr = []
        mdone = []

        while not done:

            #Take action
            master_action, sub_action = agent.act(s, master_epsilon)

            #Sample sub-action from sub-action dist
            if random.random() < sub_epsilon:
                sub_action = random.randint(0, self.args.action_dim)

            # Simulate one step in environment
            ns, r, done, info = self.env.step(sub_action + 1)

            ns = utils.to_tensor(ns).unsqueeze(0)
            total_reward += r

            #Add to replay buffer
            self.add_experience(s, sub_action, ns)

            #Add to master buffer (temporary)
            ms.append(s)
            ma.append(
                utils.to_tensor(np.array([master_action])).unsqueeze(0).long())
            mns.append(ns)
            mr.append(utils.to_tensor(np.array([r])).unsqueeze(0))
            mdone.append(
                utils.to_tensor(np.array([float(done)])).unsqueeze(0))

            s = ns

        return total_reward, torch.cat(ms), torch.cat(ma), torch.cat(
            mns), torch.cat(mr), torch.cat(mdone)
    def add_experience(self, state, goal, action, next_state, reward, done):
        #Format as tensors
        state = utils.to_tensor(state).unsqueeze(0)
        goal = utils.to_tensor(goal).unsqueeze(0)
        action = utils.to_tensor(action).unsqueeze(0)
        next_state = utils.to_tensor(next_state).unsqueeze(0)
        reward = utils.to_tensor(np.array([reward])).unsqueeze(0)

        #done = utils.to_tensor(np.array([done]).astype('uint8')).unsqueeze(0)

        #If storing memory in GPU, upload everything to the GPU
        if self.is_mem_cuda:
            reward = reward.cuda()
            state = state.cuda()
            goal = goal.cuda()
            action = action.cuda()
            next_state = next_state.cuda()

        self.push(state, goal, action, next_state, reward, done)
Example #11
    def take_action(self, state):
        state = utils.to_tensor(np.array(state)).unsqueeze(0)

        #Compute the actions suggested by the actors
        seed_actions = [actor.forward(state) for actor in self.actor_ensemble]

        #Perturb the seed actions to get the action samples
        action_samples = self.perturb_action(seed_actions)

        #Score and pick action using critic ensemble
        action = self.pick_actions(state, action_samples)
        return action
    def update_parameters(self, batch):
        state_batch = torch.cat(batch.state)
        next_state_batch = torch.cat(batch.next_state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        done_batch = torch.cat(batch.done)
        # Critic Update (.volatile is legacy PyTorch; compute the bootstrap target without gradients)
        vals = self.critic.forward(state_batch)
        with torch.no_grad():
            new_vals = self.critic.forward(next_state_batch) * (1 - done_batch)
            targets = reward_batch + self.gamma * new_vals
        self.critic_optim.zero_grad()
        dt = self.loss(vals, targets)
        dt.backward()
        self.critic_optim.step()

        # Actor Update
        self.actor_optim.zero_grad()
        state_batch = utils.to_tensor(utils.to_numpy(state_batch))
        targets = utils.to_tensor(utils.to_numpy(targets))
        vals = utils.to_tensor(utils.to_numpy(vals))
        action_logs = self.actor.forward(state_batch)
        entropy_loss = torch.mean(entropy(torch.exp(action_logs)))
        action_logs = F.log_softmax(action_logs, dim=1)
        dt = targets - vals
        alogs = []
        for i, action in enumerate(action_batch):
            action_i = int(action.cpu().data.numpy())
            alogs.append(action_logs[i, action_i])
        alogs = torch.stack(alogs).unsqueeze(0)  # stack the per-sample scalars into one tensor

        policy_loss = -torch.mean(dt * alogs.t())
        actor_loss = policy_loss - entropy_loss
        actor_loss.backward()
        self.actor_optim.step()
    def shape_dpp(self, critic, actor, state, sensor_model):

        Q1, _, val = critic((state), actor((state)))
        original_T = Q1 - val

        all_adv = [original_T]

        state = utils.to_numpy(state.cpu())
        #mid_index = int(180 / self.args.angle_res)
        coupling = self.args.coupling

        max_ind = int(360 / self.args.angle_res)

        perturb_index = [
            np.argwhere(state[i, 0:max_ind] != -1).flatten()
            for i in range(len(state))
        ]
        for i, entry in enumerate(perturb_index):
            np.random.shuffle(entry)
            if len(entry) < coupling:
                perturb_index[i] = np.tile(entry, (coupling, 1)).flatten()

        for coupling_mag in range(coupling):

            empty_ind = [int(entry[coupling_mag]) for entry in perturb_index]

            if sensor_model == 'density':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0
            elif sensor_model == 'closets':
                for i, ind in enumerate(empty_ind):
                    state[i, ind] = 1.0

            shaped_state = utils.to_tensor(state).cuda()

            Q1, _, val = critic((shaped_state), actor((shaped_state)))
            adv = (Q1 - val) / (coupling_mag + 1)
            all_adv.append(adv)

        all_adv = torch.cat(all_adv, 1)
        dpp_max = torch.max(all_adv, 1)[0].unsqueeze(1)

        with torch.no_grad():
            normalizer = dpp_max / original_T

        return original_T * normalizer
Example #14
 def reset(self):
     return utils.to_tensor(self._preprocess_state(
         self.env.reset())).unsqueeze(0)
Example #15
        torch.nn.init.constant_(m.bias, 0)


args = Parameters()
NUM_EVALS = 1
env = RoverDomainPython(args, NUM_EVALS)

path = 'nets/0_actor_pop20_roll50_envrover_tight_6_3_seed2023-reward'
buffer = torch.load(path)
net = MultiHeadActor(args.state_dim, args.action_dim, 100,
                     args.config.num_agents)
net.load_state_dict(buffer)
net.eval()

joint_state = env.reset()
joint_state = utils.to_tensor(np.array(joint_state))
fitness = [0 for _ in range(NUM_EVALS)]
local_fitness = [0 for _ in range(NUM_EVALS)]

while True:  #unless done

    if RANDOM_BASELINE:
        joint_action = [
            np.random.random((1, args.state_dim))
            for _ in range(args.config.num_agents)
        ]
    else:
        joint_action = [
            net.clean_action(joint_state[i, :], head=i).detach().numpy()
            for i in range(args.config.num_agents)
        ]
 def act(self, state, is_noise):
     state = utils.to_tensor(state).unsqueeze(0)
     action = self.actor.forward(state)
     action = action.detach().numpy().flatten()
     if is_noise: action += self.exploration_noise.noise()
     return action
Example #17
 def step(self, action):
     next_state, reward, done, info = self.env.step(action)
     next_state = utils.to_tensor(
         self._preprocess_state(next_state)).unsqueeze(0)
     return next_state, reward, done, info
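
Examples #14 and #17 are the reset/step halves of a thin environment wrapper that tensorizes observations. Put together, a minimal self-contained sketch (assuming the classic gym API and a trivial `_preprocess_state`; the real wrapper likely does more):

import gym
import numpy as np
import torch


def to_tensor(ndarray):
    return torch.from_numpy(np.asarray(ndarray, dtype=np.float32))


class TensorEnvWrapper:
    """Wraps a gym env so reset()/step() return batched float tensors."""

    def __init__(self, env_name):
        self.env = gym.make(env_name)

    def _preprocess_state(self, state):
        # Placeholder preprocessing; the real wrapper may normalize or crop.
        return np.asarray(state, dtype=np.float32)

    def reset(self):
        return to_tensor(self._preprocess_state(self.env.reset())).unsqueeze(0)

    def step(self, action):
        next_state, reward, done, info = self.env.step(action)
        next_state = to_tensor(self._preprocess_state(next_state)).unsqueeze(0)
        return next_state, reward, done, info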
Example #18
def rollout_worker(args, id, type, task_pipe, result_pipe,
                   predator_data_bucket, prey_data_bucket, predators_bucket,
                   prey_bucket, store_transitions, config):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

        Parameters:
            args (object): Parameter class
            id (int): Specific Id unique to each worker spun
            type (str): Rollout type ('test', 'pg' or 'evo')
            task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
            result_pipe (pipe): Sender end of the pipe used to report back results
            predator_data_bucket / prey_data_bucket (shared list objects): Shared lists managed by a manager used to store experience tuples
            predators_bucket / prey_bucket (shared list objects): Shared lists managed by a manager storing the models (actors)
            store_transitions (bool): Log experiences to the data buckets?
            config (str): Environment config name (e.g. 'simple_tag')

        Returns:
            None
    """

    if type == 'test': NUM_EVALS = args.num_test
    elif type == 'pg': NUM_EVALS = args.rollout_size
    elif type == 'evo': NUM_EVALS = 10
    else: sys.exit('Incorrect type')

    if config == 'simple_tag' or config == 'hard_tag':
        from envs.env_wrapper import SimpleTag
        env = SimpleTag(args, NUM_EVALS)

    elif config == 'simple_adversary':
        from envs.env_wrapper import SimpleAdversary
        env = SimpleAdversary(args, NUM_EVALS)

    elif config == 'simple_push':
        from envs.env_wrapper import SimplePush
        env = SimplePush(args, NUM_EVALS)
    else:
        sys.exit('Unknown Config in runner.py')
    print('Runner running config ', config)

    np.random.seed(id)
    random.seed(id)

    while True:

        teams_blueprint = task_pipe.recv()  # Wait until a signal is received to start rollout
        if teams_blueprint == 'TERMINATE': exit(0)  # Kill yourself

        # Get the current team actors
        if type == 'test' or type == 'pg':
            team = [predators_bucket[0] for _ in range(args.config.num_agents)]
        elif type == "evo":
            team = [
                predators_bucket[teams_blueprint[0]]
                for _ in range(args.config.num_agents)
            ]
        else:
            sys.exit('Incorrect type')

        if args.rollout_size == 0: store_transitions = False

        fitness = [None for _ in range(NUM_EVALS)]
        frame = 0
        prey_fitness = [None for _ in range(NUM_EVALS)]
        predator_state, prey_state = env.reset()
        prey_rollout_trajectory = []
        predator_rollout_trajectory = [[] for _ in range(3)]

        prey_state = utils.to_tensor(np.array(prey_state))
        predator_state = utils.to_tensor(np.array(predator_state))

        while True:  #unless done
            if type == 'pg':
                prey_action = [
                    prey_bucket[0].noisy_action(prey_state[i, :],
                                                head=i).detach().numpy()
                    for i in range(len(prey_state))
                ]
                predator_action = [
                    team[i].noisy_action(predator_state[i, :],
                                         head=i).detach().numpy()
                    for i in range(len(predator_state))
                ]
            else:
                prey_action = [
                    prey_bucket[0].clean_action(prey_state[i, :],
                                                head=i).detach().numpy()
                    for i in range(len(prey_state))
                ]
                predator_action = [
                    team[i].clean_action(predator_state[i, :],
                                         head=i).detach().numpy()
                    for i in range(len(predator_state))
                ]

            #JOINT ACTION [agent_id, universe_id, action]

            #Bound Action
            prey_action = np.array(prey_action).clip(-1.0, 1.0)
            predator_action = np.array(predator_action).clip(-1.0, 1.0)

            next_pred_state, next_prey_state, pred_reward, prey_reward, done, global_reward = env.step(
                predator_action,
                prey_action)  # Simulate one step in environment
            #State --> [agent_id, universe_id, obs]
            #reward --> [agent_id, universe_id]
            #done --> [universe_id]
            #info --> [universe_id]

            #if type == "test": env.universe[0].render()

            next_pred_state = utils.to_tensor(np.array(next_pred_state))
            next_prey_state = utils.to_tensor(np.array(next_prey_state))

            #Grab global reward as fitnesses
            for i, grew in enumerate(global_reward):
                if grew[0] is not None:
                    fitness[i] = grew[0]
                    prey_fitness[i] = grew[1]

            #PREDATOR
            #Push experiences to memory
            if store_transitions:
                if not args.is_matd3 and not args.is_maddpg:  #Default
                    for agent_id in range(args.config.num_agents):
                        for universe_id in range(NUM_EVALS):
                            if not done:
                                predator_rollout_trajectory[agent_id].append([
                                    np.expand_dims(
                                        utils.to_numpy(predator_state)[
                                            agent_id, universe_id, :], 0),
                                    np.expand_dims(
                                        utils.to_numpy(next_pred_state)[
                                            agent_id, universe_id, :], 0),
                                    np.expand_dims(
                                        predator_action[agent_id,
                                                        universe_id, :], 0),
                                    np.expand_dims(
                                        np.array([
                                            pred_reward[agent_id, universe_id]
                                        ],
                                                 dtype="float32"), 0),
                                    np.expand_dims(
                                        np.array([done], dtype="float32"), 0),
                                    universe_id, type
                                ])

                else:  #FOR MATD3
                    for universe_id in range(NUM_EVALS):
                        if not done:
                            predator_rollout_trajectory[0].append([
                                np.expand_dims(
                                    utils.to_numpy(predator_state)
                                    [:, universe_id, :], 0),
                                np.expand_dims(
                                    utils.to_numpy(next_pred_state)
                                    [:, universe_id, :], 0),
                                np.expand_dims(
                                    predator_action[:, universe_id, :],
                                    0),  #[batch, agent_id, :]
                                np.array([pred_reward[:, universe_id]],
                                         dtype="float32"),
                                np.expand_dims(
                                    np.array([done], dtype="float32"), 0),
                                universe_id,
                                type
                            ])

            #PREY
            for universe_id in range(NUM_EVALS):
                if not done:
                    prey_rollout_trajectory.append([
                        np.expand_dims(
                            utils.to_numpy(prey_state)[0, universe_id, :], 0),
                        np.expand_dims(
                            utils.to_numpy(next_prey_state)[0, universe_id, :],
                            0),
                        np.expand_dims(prey_action[0, universe_id, :],
                                       0),  # [batch, agent_id, :]
                        np.array([prey_reward[:, universe_id]],
                                 dtype="float32"),
                        np.expand_dims(np.array([done], dtype="float32"), 0),
                        universe_id,
                        type
                    ])

            predator_state = next_pred_state
            prey_state = next_prey_state
            frame += NUM_EVALS

            #DONE FLAG IS Received
            if done:
                #Push experiences to main
                if store_transitions:
                    for agent_id, buffer in enumerate(predator_data_bucket):
                        for entry in predator_rollout_trajectory[agent_id]:
                            temp_global_reward = fitness[entry[5]]
                            entry[5] = np.expand_dims(
                                np.array([temp_global_reward],
                                         dtype="float32"), 0)
                            buffer.append(entry)

                    #PREY
                    for buffer in prey_data_bucket:
                        for entry in prey_rollout_trajectory:
                            temp_global_reward = 0.0
                            entry[5] = np.expand_dims(
                                np.array([temp_global_reward],
                                         dtype="float32"), 0)
                            buffer.append(entry)

                break

        #Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send([teams_blueprint, [fitness, prey_fitness], frame])
def rollout_worker(id,
                   second_pid,
                   task_pipe,
                   result_pipe,
                   is_noise,
                   data_bucket,
                   model_bucket,
                   env_name,
                   noise_std,
                   ALGO,
                   trainers=None,
                   Save_net=True,
                   pomdp_adv=False):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

		Parameters:
			task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
			result_pipe (pipe): Sender end of the pipe used to report back results
			is_noise (bool): Use noise?
			data_bucket (list of shared object): A list of shared object reference to s,ns,a,r,done (replay buffer) managed by a manager that is used to store experience tuples
			model_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
			env_name (str): Environment name?
			noise_std (float): Standard deviation of Gaussian for sampling noise

		Returns:
			None
	"""
    np.random.seed(
        id)  ###make sure the random seeds across learners are different
    if ALGO == 'dis':  #make env with blue and red policy agent inside,
        assert trainers is not None
        dis_env = make_self_play_env(
            seed=id, return_policy_agent=False, trainers=trainers
        )[0]  #trainer if not None, first is the shared DQN agent, second is the best red policy
        env = EnvironmentWrapper(env_name, ALGO, dis_env,
                                 1)  #this is for red agent
    elif ALGO == 'TD3_tennis':
        pid = id + second_pid * 20  #should be larger than the number of processes
        tennis_env = make_tennis_env.TennisEnvFactory(seed=id,
                                                      no_graphics=True,
                                                      pid=pid).getEnv()[0]
        env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0)
    else:
        print("ALGO is:", ALGO)
        env = EnvironmentWrapper(env_name, ALGO)

    ###LOOP###
    while True:
        identifier, gen = task_pipe.recv(
        )  # Wait until a signal is received  to start rollout
        if identifier == 'TERMINATE':
            print('Process:', os.getpid(), ' killed')
            exit(0)  #Kill yourself

        # Get the requisite network
        net = model_bucket[identifier]

        fitness = 0.0
        total_frame = 0
        state = env.reset()

        if pomdp_adv: env.try_set_pomdp_adv()

        rollout_trajectory = []
        state = utils.to_tensor(np.array(state)).unsqueeze(0)

        while True:  # unless done
            #cannot pass this line
            action = net.Gumbel_softmax_sample_distribution(
                state, use_cuda=False) if ALGO == 'dis' else net.forward(state)
            #here action is on the simplex, while use the arg_max sample in the env step

            action = utils.to_numpy(action)
            if is_noise and not ALGO == 'dis':
                #action = (action + np.random.normal(0, noise_std, size=env.env.action_space.shape[0])).clip(env.env.action_space.low, env.env.action_space.high)
                action = (action + np.random.normal(
                    0, noise_std, size=env.env.action_space.shape[0])).clip(
                        env.env.action_space.low, env.env.action_space.high)

            next_state, reward, done, info = env.step(
                int(
                    net.turn_max_into_onehot(action).argmax().numpy().
                    __float__())) if ALGO == 'dis' else env.step(
                        action.flatten())  # Simulate one step in environment

            #if id == 1: env.render()

            next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
            fitness += reward
            #print('line 79')
            # If storing transitions
            if data_bucket is not None:  #Skip for test set
                rollout_trajectory.append([
                    utils.to_numpy(state),
                    utils.to_numpy(next_state),
                    np.float32(action),
                    np.reshape(np.float32(np.array([reward])), (1, 1)),
                    np.reshape(np.float32(np.array([float(done)])), (1, 1))
                ])
            state = next_state
            total_frame += 1

            # DONE FLAG IS Received
            #print(done)
            if done:

                # Push experiences to main
                for entry in rollout_trajectory:
                    data_bucket.append(
                        entry
                    )  #ms this data_bucket store experience, is shared accross process (and training thread?)

                break
        # Send back id, fitness, total length and shaped fitness using the result pipe
        #if gen >= 10 and id == 1:
        #	env.env.world.good_agents[0].dqn_agent_naive.save_net('./runner_blue_dqn.pth')
        if gen >= 10 and gen % 5 == 0 and Save_net:
            net.save_net('./pytorch_models/red_' + str(id) + '_net_step_' +
                         str(gen) + '.pth')

        result_pipe.send(
            [identifier, fitness, total_frame,
             env.get_blue_actual_reward()])
def rollout_worker(worker_id,
                   task_pipe,
                   result_pipe,
                   noise,
                   exp_list,
                   pop,
                   difficulty,
                   use_rs,
                   store_transition=True,
                   use_synthetic_targets=False,
                   xbias=None,
                   zbias=None,
                   phase_len=100,
                   traj_container=None,
                   ep_len=1005,
                   JGS=False,
                   num_trials=5):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

        Parameters:
            worker_id (int): Specific Id unique to each worker spun
            task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
            result_pipe (pipe): Sender end of the pipe used to report back results
            noise (object): A noise generator object
            exp_list (shared list object): A shared list object managed by a manager that is used to store experience tuples
            pop (shared list object): A shared list object managed by a manager used to store all the models (actors)
            difficulty (int): Difficulty of the task
            use_rs (bool): Use behavioral reward shaping?
            store_transition (bool): Log experiences to exp_list?

        Returns:
            None
    """

    worker_id = worker_id
    env = EnvironmentWrapper(difficulty,
                             rs=use_rs,
                             use_synthetic_targets=use_synthetic_targets,
                             xbias=xbias,
                             zbias=zbias,
                             phase_len=phase_len,
                             jgs=JGS)
    nofault_endstep = phase_len * 4
    if use_rs:
        lfoot = []
        rfoot = []
        if difficulty == 0:
            ltibia = []
            rtibia = []
            pelvis_x = []
            pelvis_y = []
            ltibia_angle = []
            lfemur_angle = []
            rtibia_angle = []
            rfemur_angle = []
            head_x = []

    while True:
        _ = task_pipe.recv(
        )  #Wait until a signal is received  to start rollout
        net = pop[worker_id]  #Get the current model state from the population

        fitnesses = []
        total_frames = 0
        for _ in range(num_trials):

            fitness = 0.0
            state = env.reset()
            state = utils.to_tensor(np.array(state)).unsqueeze(0)
            while True:  #unless done

                action = net.forward(state)
                action = utils.to_numpy(action)
                if noise is not None: action += noise.noise()

                next_state, reward, done, info = env.step(
                    action.flatten())  # Simulate one step in environment

                next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
                fitness += reward

                state = next_state

                #DONE FLAG IS Received
                if done:
                    total_frames += env.istep
                    fitnesses.append(fitness)
                    break

        lower_fit = min(fitnesses)
        total_frames /= float(num_trials)
        mean_fit = sum(fitnesses) / float(num_trials)
        max_fit = max(fitnesses)

        #Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send(
            [worker_id, lower_fit, total_frames, [mean_fit, max_fit]])
Example #21
def test_rollout_worker(args, id, type, task_pipe, result_pipe, data_bucket,
                        models_bucket, store_transitions, random_baseline):

    viz_gen = 0

    teams_blueprint = task_pipe.recv(
    )  #Wait until a signal is received  to start rollout
    team = models_bucket

    NUM_EVALS = 1

    if args.config.env_choice == 'rover_tight' or args.config.env_choice == 'rover_loose' or args.config.env_choice == 'rover_trap':
        from envs.env_wrapper import RoverDomainPython
        env = RoverDomainPython(args, NUM_EVALS)
    elif args.config.env_choice == 'rover_heterogeneous':
        from envs.env_wrapper import RoverHeterogeneous
        env = RoverHeterogeneous(args, NUM_EVALS)
    else:
        sys.exit('Incorrect env type')
    np.random.seed(id)
    random.seed(id)

    fitness = [None]
    joint_state = env.reset()
    rollout_trajectory = [[] for _ in range(args.config.num_agents)]

    joint_state = utils.to_tensor(np.array(joint_state))

    while True:  #unless done
        if random_baseline:
            joint_action = [
                np.random.random((NUM_EVALS, args.state_dim))
                for _ in range(args.config.num_agents)
            ]
        elif type == 'pg':
            if args.ps == 'trunk':
                joint_action = [
                    team[i][0].noisy_action(joint_state[i, :],
                                            head=i).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
            else:
                joint_action = [
                    team[i][0].noisy_action(
                        joint_state[i, :]).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
        else:
            if args.ps == 'trunk':
                joint_action = [
                    team[i].clean_action(joint_state[i, :],
                                         head=i).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
            else:
                joint_action = [
                    team[i].clean_action(joint_state[i, :]).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
        #JOINT ACTION [agent_id, universe_id, action]

        #Bound Action
        joint_action = np.array(joint_action).clip(-1.0, 1.0)
        next_state, reward, done, global_reward = env.step(
            joint_action)  # Simulate one step in environment

        try:
            next_state = utils.to_tensor(np.array(next_state))
        except Exception:
            print(" here is the problem with the next_state", next_state)

        if type == "test" and args.config.config == 'simple_spread':
            env.render(0)
            print(joint_action[:, 0, :])
            # print(joint_state[0])

        #Grab global reward as fitnesses (covers every evaluation: rollout_size for PG, num_evals for evo)
        for i, grew in enumerate(global_reward):
            if grew is not None:
                fitness[i] = grew

                #Reward Shaping
                if (args.config.env_choice == 'motivate'
                        or args.config.env_choice == 'rover_loose'
                        or args.config.env_choice == 'rover_tight'
                        or args.config.env_choice
                        == 'rover_heterogeneous') and type == "evo":
                    if args.config.is_gsl:  # Global subsumes local?
                        fitness[i] += sum(env.universe[i].cumulative_local)

        joint_state = next_state

        #DONE FLAG IS Received
        if sum(done) == len(done):  # if done is 1, then exit
            #Push experiences to main
            break

    # Vizualization for test sets
    if type == "test" and (args.config.env_choice == 'rover_tight'
                           or args.config.env_choice == 'rover_heterogeneous'
                           or args.config.env_choice == 'rover_loose'
                           or args.config.env_choice == 'motivate'
                           or args.config.env_choice == 'rover_trap'
                           or args.config.config == 'simple_spread'):
        env.render()
        env.viz()

    ######### todo: to integrate here
    #if args.visualization:
    #    visualize(env, test_fits)

    #Send back id, fitness, total length and shaped fitness using the result pipe
    result_pipe.send([teams_blueprint, [fitness]])
Example #22
 def add_experience(self, s, a, ns):
     onehot_a = np.zeros((self.args.action_dim))
     onehot_a[a - 1] = 1
     a = utils.to_tensor(onehot_a).unsqueeze(0)
     self.replay_buffer.push(s, a, ns)
def rollout_worker(args, worker_id, task_pipe, result_pipe, noise, data_bucket,
                   models_bucket, model_template):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

        Parameters:
            args (object): Parameter class
            worker_id (int): Specific Id unique to each worker spun
            task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
            result_pipe (pipe): Sender end of the pipe used to report back results
            noise (object): A noise generator object
            data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
            models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
            model_template (object): Template network used to instantiate the local rover models

        Returns:
            None
    """

    worker_id = worker_id
    env = Task_Rovers(args)
    models = [copy.deepcopy(model_template) for _ in range(args.num_rover)]  # deep copies: a single shared reference would be overwritten by every hard_update below
    for m in models:
        m.eval()

    while True:
        RENDER = task_pipe.recv(
        )  #Wait until a signal is received  to start rollout

        # Get the current model state from the population
        for m, bucket_model in zip(models, models_bucket):
            utils.hard_update(m, bucket_model)

        fitness = 0.0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.num_rover)]
        joint_state = utils.to_tensor(np.array(joint_state))
        while True:  #unless done

            joint_action = [
                models[i].forward(joint_state[i, :]).detach().numpy()
                for i in range(args.num_rover)
            ]
            if noise is not None:
                for action in joint_action:
                    action += noise.noise()

            next_state, reward, done, info = env.step(
                joint_action)  # Simulate one step in environment

            next_state = utils.to_tensor(np.array(next_state))
            fitness += sum(reward) / args.coupling

            #If storing transitions
            for i in range(args.num_rover):
                rollout_trajectory[i].append([
                    np.expand_dims(utils.to_numpy(joint_state)[i, :], 0),
                    np.expand_dims(np.array(joint_action)[i, :], 0),
                    np.expand_dims(utils.to_numpy(next_state)[i, :], 0),
                    np.expand_dims(np.array([reward[i]]), 0),
                    np.expand_dims(np.array([done]), 0)
                ])

            joint_state = next_state

            #DONE FLAG IS Received
            if done:
                if RENDER: env.render()
                #Push experiences to main
                for rover_id in range(args.num_rover):
                    for entry in rollout_trajectory[rover_id]:
                        for i in range(len(entry[0])):
                            data_bucket[rover_id].append([
                                entry[0], entry[1], entry[2], entry[3],
                                entry[4]
                            ])
                break

        #Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send([fitness])
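
The worker above synchronizes its local models with `utils.hard_update(m, bucket_model)`. A minimal sketch of that helper, assuming it simply copies every parameter of the source network into the target in place:

def hard_update(target, source):
    # Overwrite target's parameters with source's values (no gradient bookkeeping needed).
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)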
Example #24
def rollout_worker(args, id, type, task_pipe, result_pipe, data_bucket,
                   models_bucket, store_transitions, random_baseline):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

			Parameters:
				args (object): Parameter class
				id (int): Specific Id unique to each worker spun
				type (str): Rollout type ('test', 'pg' or 'evo')
				task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
				result_pipe (pipe): Sender end of the pipe used to report back results
				data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
				models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
				store_transitions (bool): Log experiences to data_bucket?
				random_baseline (bool): Test with random actions for baseline purposes?

		Returns:
			None
	"""

    if type == 'test': NUM_EVALS = args.num_test
    elif type == 'pg': NUM_EVALS = args.rollout_size
    elif type == 'evo': NUM_EVALS = 5
    else: sys.exit('Incorrect type')

    if args.config.env_choice in ('multiwalker', 'cassie', 'hyper', 'motivate', 'pursuit'):
        NUM_EVALS = 1

    if args.config.env_choice == 'rover_tight' or args.config.env_choice == 'rover_loose' or args.config.env_choice == 'rover_trap':
        from envs.env_wrapper import RoverDomainPython
        env = RoverDomainPython(args, NUM_EVALS)
    elif args.config.env_choice == 'motivate':
        from envs.env_wrapper import MotivateDomain
        env = MotivateDomain(args, NUM_EVALS)
    elif args.config.env_choice == 'pursuit':
        from envs.env_wrapper import Pursuit
        env = Pursuit(args, NUM_EVALS)
    elif args.config.env_choice == 'multiwalker':
        from envs.env_wrapper import MultiWalker
        env = MultiWalker(args, NUM_EVALS)
    elif args.config.env_choice == 'cassie':
        from envs.env_wrapper import Cassie
        env = Cassie(args, NUM_EVALS)
    elif args.config.env_choice == 'hyper':
        from envs.env_wrapper import PowerPlant
        env = PowerPlant(args, NUM_EVALS)
    else:
        sys.exit('Incorrect env type')
    np.random.seed(id)
    random.seed(id)

    while True:
        teams_blueprint = task_pipe.recv(
        )  #Wait until a signal is received  to start rollout
        if teams_blueprint == 'TERMINATE': exit(0)  # Kill yourself

        # Get the current team actors
        if args.ps == 'full' or args.ps == 'trunk':
            if type == 'test' or type == 'pg':
                team = [
                    models_bucket[0] for _ in range(args.config.num_agents)
                ]
            elif type == "evo":
                team = [
                    models_bucket[0][teams_blueprint[0]]
                    for _ in range(args.config.num_agents)
                ]
            else:
                sys.exit('Incorrect type')

        else:  #Heterogeneous
            if type == 'test' or type == 'pg': team = models_bucket
            elif type == "evo":
                team = [
                    models_bucket[agent_id][popn_id]
                    for agent_id, popn_id in enumerate(teams_blueprint)
                ]
            else:
                sys.exit('Incorrect type')

        if args.rollout_size == 0:
            if args.scheme == 'standard': store_transitions = False
            elif args.scheme == 'multipoint' and random.random() < 0.1 and store_transitions:
                store_transitions = True
        fitness = [None for _ in range(NUM_EVALS)]
        frame = 0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.config.num_agents)]

        joint_state = utils.to_tensor(np.array(joint_state))

        while True:  #unless done

            if random_baseline:
                joint_action = [
                    np.random.random((NUM_EVALS, args.state_dim))
                    for _ in range(args.config.num_agents)
                ]
            elif type == 'pg':
                if args.ps == 'trunk':
                    joint_action = [
                        team[i][0].noisy_action(joint_state[i, :],
                                                head=i).detach().numpy()
                        for i in range(args.config.num_agents)
                    ]
                else:
                    joint_action = [
                        team[i][0].noisy_action(
                            joint_state[i, :]).detach().numpy()
                        for i in range(args.config.num_agents)
                    ]
            else:
                if args.ps == 'trunk':
                    joint_action = [
                        team[i].clean_action(joint_state[i, :],
                                             head=i).detach().numpy()
                        for i in range(args.config.num_agents)
                    ]
                else:
                    joint_action = [
                        team[i].clean_action(
                            joint_state[i, :]).detach().numpy()
                        for i in range(args.config.num_agents)
                    ]
            #JOINT ACTION [agent_id, universe_id, action]

            #Bound Action
            joint_action = np.array(joint_action).clip(-1.0, 1.0)
            next_state, reward, done, global_reward = env.step(
                joint_action)  # Simulate one step in environment
            #State --> [agent_id, universe_id, obs]
            #reward --> [agent_id, universe_id]
            #done --> [universe_id]
            #info --> [universe_id]

            if args.config.env_choice == 'motivate' and type == "test":
                print(['%.2f' % r for r in reward], global_reward)

            next_state = utils.to_tensor(np.array(next_state))

            #Grab global reward as fitnesses
            for i, grew in enumerate(global_reward):
                if grew is not None:
                    fitness[i] = grew

                    #Reward Shaping
                    if (args.config.env_choice == 'motivate'
                            or args.config.env_choice == 'rover_loose'
                            or args.config.env_choice
                            == 'rover_tight') and type == "evo":
                        if args.config.is_gsl:  # Global subsumes local?
                            fitness[i] += sum(env.universe[i].cumulative_local)

            #Push experiences to memory
            if store_transitions:
                for agent_id in range(args.config.num_agents):
                    for universe_id in range(NUM_EVALS):
                        if not done[universe_id]:
                            rollout_trajectory[agent_id].append([
                                np.expand_dims(
                                    utils.to_numpy(joint_state)[
                                        agent_id, universe_id, :], 0),
                                np.expand_dims(
                                    utils.to_numpy(next_state)[agent_id,
                                                               universe_id, :],
                                    0),
                                np.expand_dims(
                                    joint_action[agent_id, universe_id, :], 0),
                                np.expand_dims(
                                    np.array([reward[agent_id, universe_id]],
                                             dtype="float32"), 0),
                                np.expand_dims(
                                    np.array([done[universe_id]],
                                             dtype="float32"), 0), universe_id,
                                type
                            ])

            joint_state = next_state
            frame += NUM_EVALS

            if sum(done) > 0 and sum(done) != len(done):
                k = None

            #DONE FLAG IS Received
            if sum(done) == len(done):
                #Push experiences to main
                if store_transitions:
                    if args.ps == 'full':  #Full setup with one replay buffer
                        for heap in rollout_trajectory:
                            for entry in heap:
                                temp_global_reward = fitness[entry[5]]
                                entry[5] = np.expand_dims(
                                    np.array([temp_global_reward],
                                             dtype="float32"), 0)
                                data_bucket[0].append(entry)

                    else:  #Heterogeneous
                        for agent_id, buffer in enumerate(data_bucket):
                            for entry in rollout_trajectory[agent_id]:
                                temp_global_reward = fitness[entry[5]]
                                entry[5] = np.expand_dims(
                                    np.array([temp_global_reward],
                                             dtype="float32"), 0)
                                buffer.append(entry)

                break

        #print(fitness)

        if type == "test" and random.random() < 1.0 and (
                args.config.env_choice == 'rover_tight'
                or args.config.env_choice == 'rover_loose'
                or args.config.env_choice == 'motivate'):
            env.render()
            print('Test trajectory lens',
                  [len(world.rover_path[0]) for world in env.universe])
            #print (type, id, 'Fit of rendered', ['%.2f'%f for f in fitness])

        #Send back id, fitness, total length and shaped fitness using the result pipe

        result_pipe.send([teams_blueprint, [fitness], frame])
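
For context, a hedged sketch of how a rollout_worker with the signature above is typically wired up with multiprocessing pipes. The launcher is not part of the snippet; `args`, `models_bucket` and `rollout_worker` are assumed to come from the surrounding training code, and NUM_WORKERS is a hypothetical setting:

from multiprocessing import Manager, Pipe, Process

NUM_WORKERS = 4
manager = Manager()
data_bucket = [manager.list() for _ in range(args.config.num_agents)]  # shared experience store
task_pipes = [Pipe() for _ in range(NUM_WORKERS)]
result_pipes = [Pipe() for _ in range(NUM_WORKERS)]

workers = [
    Process(target=rollout_worker,
            args=(args, i, 'evo', task_pipes[i][1], result_pipes[i][0],
                  data_bucket, models_bucket, True, False))
    for i in range(NUM_WORKERS)
]
for w in workers:
    w.daemon = True
    w.start()

# One generation: hand each worker a team blueprint, then collect fitness and frame counts.
for i in range(NUM_WORKERS):
    task_pipes[i][0].send([i])
for i in range(NUM_WORKERS):
    teams_blueprint, fitness, frames = result_pipes[i][1].recv()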
Example #25
def rollout_worker(worker_id,
                   task_pipe,
                   result_pipe,
                   noise,
                   exp_list,
                   pop,
                   difficulty,
                   use_rs,
                   store_transition=True,
                   use_synthetic_targets=False,
                   xbias=None,
                   zbias=None,
                   phase_len=100,
                   traj_container=None,
                   ep_len=1005,
                   JGS=False):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

        Parameters:
            worker_id (int): Specific Id unique to each worker spun
            task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
            result_pipe (pipe): Sender end of the pipe used to report back results
            noise (object): A noise generator object
            exp_list (shared list object): A shared list object managed by a manager that is used to store experience tuples
            pop (shared list object): A shared list object managed by a manager used to store all the models (actors)
            difficulty (int): Difficulty of the task
            use_rs (bool): Use behavioral reward shaping?
            store_transition (bool): Log experiences to exp_list?

        Returns:
            None
    """

    worker_id = worker_id
    env = EnvironmentWrapper(difficulty,
                             rs=use_rs,
                             use_synthetic_targets=use_synthetic_targets,
                             xbias=xbias,
                             zbias=zbias,
                             phase_len=phase_len,
                             jgs=JGS)
    nofault_endstep = phase_len * 4
    if use_rs:
        lfoot = []
        rfoot = []
        if difficulty == 0:
            ltibia = []
            rtibia = []
            pelvis_x = []
            pelvis_y = []
            ltibia_angle = []
            lfemur_angle = []
            rtibia_angle = []
            rfemur_angle = []
            head_x = []

    while True:
        _ = task_pipe.recv(
        )  #Wait until a signal is received  to start rollout
        net = pop[worker_id]  #Get the current model state from the population

        fitness = 0.0
        total_frame = 0
        shaped_fitness = 0.0
        state = env.reset()
        rollout_trajectory = []
        state = utils.to_tensor(np.array(state)).unsqueeze(0)
        exit_flag = True
        while True:  #unless done

            action = net.forward(state)
            action = utils.to_numpy(action)
            if noise is not None: action += noise.noise()

            next_state, reward, done, info = env.step(
                action.flatten())  # Simulate one step in environment

            if use_rs:  #If using behavioral reward shaping
                lfoot.append(env.lfoot_xyz)
                rfoot.append(env.rfoot_xyz)
                if difficulty == 0:
                    ltibia.append(env.ltibia_xyz)
                    rtibia.append(env.rtibia_xyz)
                    pelvis_y.append(env.pelvis_y)
                    pelvis_x.append(env.pelvis_x)
                    lfemur_angle.append(env.lfemur_angle)
                    ltibia_angle.append(env.ltibia_angle)
                    rfemur_angle.append(env.rfemur_angle)
                    rtibia_angle.append(env.rtibia_angle)
                    head_x.append(env.head_x)

            next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
            fitness += reward

            #If storing transitions
            if store_transition:
                rollout_trajectory.append([
                    utils.to_numpy(state), action,
                    utils.to_numpy(next_state),
                    np.reshape(np.array([reward]), (1, 1)),
                    np.reshape(np.array([-1.0]), (1, 1)),
                    np.reshape(np.array([int(done)]), (1, 1))
                ])

            state = next_state

            #DONE FLAG IS Received
            if done or (use_synthetic_targets and
                        env.istep >= nofault_endstep) or env.istep >= ep_len:
                total_frame += env.istep

                #Proximal Policy Optimization (Process and push as trajectories)
                if traj_container is not None:
                    traj_container.append(rollout_trajectory)

                if store_transition:

                    # Forgive trajectories that did not end within 2 steps of the maximum allowed length
                    if ((difficulty == 0 and env.istep < 298) or
                            (difficulty != 0 and not use_synthetic_targets
                             and env.istep < 998) or
                            (difficulty != 0 and use_synthetic_targets
                             and env.istep < (nofault_endstep - 2))):
                        for i, entry in enumerate(rollout_trajectory):
                            entry[4] = np.reshape(
                                np.array([len(rollout_trajectory) - i]),
                                (1, 1))

                    #Push experiences to main
                    for entry in rollout_trajectory:
                        exp_list.append([
                            entry[0], entry[1], entry[2], entry[3], entry[4],
                            entry[5]
                        ])
                    rollout_trajectory = []

                #Behavioral Reward Shaping
                if use_rs:
                    if difficulty == 0:  #Round 1
                        ltibia = np.array(ltibia)
                        rtibia = np.array(rtibia)
                        pelvis_y = np.array(pelvis_y)
                        pelvis_x = np.array(pelvis_x)
                        head_x = np.array(head_x)
                        lfemur_angle = np.degrees(np.array(lfemur_angle))
                        rfemur_angle = np.degrees(np.array(rfemur_angle))
                        ltibia_angle = np.degrees(np.array(ltibia_angle))
                        rtibia_angle = np.degrees(np.array(rtibia_angle))

                        #Compute Shaped fitness
                        shaped_fitness = env.istep + rs.final_footx(
                            pelvis_x, lfoot, rfoot
                        ) * 100.0  #rs.thighs_swing(lfemur_angle, rfemur_angle)/360.0 +

                        #Compute trajectory wide constraints
                        hard_shape_w = rs.pelvis_height_rs(
                            pelvis_y
                        ) * rs.foot_z_rs(lfoot, rfoot) * rs.knee_bend(
                            ltibia_angle, lfemur_angle, rtibia_angle,
                            rfemur_angle) * rs.head_behind_pelvis(head_x)

                        #Apply constraint to fitness/shaped_fitness
                        shaped_fitness = shaped_fitness * hard_shape_w if shaped_fitness > 0 else shaped_fitness
                        #fitness = fitness *  hard_shape_w if fitness > 0 else fitness

                        #Reset
                        ltibia = []
                        rtibia = []
                        pelvis_x = []
                        pelvis_y = []
                        head_x = []
                        ltibia_angle = []
                        lfemur_angle = []
                        rtibia_angle = []
                        rfemur_angle = []

                    #ROUND 2
                    else:

                        if JGS:
                            shaped_fitness = [
                                env.istep + env.xjgs, env.istep + env.zjgs,
                                env.istep - (abs(env.zjgs) * abs(env.xjgs)),
                                env.istep * 10 + env.xjgs + env.zjgs
                            ]
                            # Foot Criss-cross
                            lfoot = np.array(lfoot)
                            rfoot = np.array(rfoot)
                            criss_cross = rs.foot_z_rs(lfoot, rfoot)
                            lfoot = []
                            rfoot = []
                            fitness = criss_cross * fitness

                        else:

                            ######## Scalarization RS #######
                            if env.zminus_pen > 0:
                                zminus_fitness = fitness - env.zminus_pen * 5
                            else:
                                zminus_fitness = 0.0

                            if env.zplus_pen > 0:
                                zplus_fitness = fitness - 5 * env.zplus_pen
                            else:
                                zplus_fitness = 0.0

                            x_fitness = fitness - 5 * env.x_pen

                            #Behavioral RS
                            pelvis_swingx = rs.pelvis_swing(
                                np.array(env.vel_traj), use_synthetic_targets,
                                phase_len)
                            pelv_swing_fit = fitness + pelvis_swingx

                            #Foot Criss-cross
                            lfoot = np.array(lfoot)
                            rfoot = np.array(rfoot)
                            criss_cross = rs.foot_z_rs(lfoot, rfoot)
                            lfoot = []
                            rfoot = []
                            footz_fit = criss_cross * fitness

                            #Make the scaled fitness list
                            shaped_fitness = [
                                zplus_fitness, zminus_fitness, x_fitness,
                                pelv_swing_fit, footz_fit
                            ]

                else:
                    shaped_fitness = []

                # Exit, or reset the environment for another rollout
                if exit_flag:
                    break
                else:
                    exit_flag = True
                    state = env.reset()
                    state = utils.to_tensor(np.array(state)).unsqueeze(0)

        #Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send([worker_id, fitness, total_frame, shaped_fitness])
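
The worker above blocks on task_pipe.recv() and reports back [worker_id, fitness, total_frame, shaped_fitness] through result_pipe once an episode ends. A minimal driver sketch (not from the original repository), assuming the leading parameters follow the docstring order (worker_id, task_pipe, result_pipe, noise, exp_list, pop, difficulty, use_rs, store_transition), that rollout_worker is importable, and that multiprocessing pipes plus a Manager list provide the plumbing; launch_workers and its arguments are illustrative names.

import multiprocessing as mp


def launch_workers(population, num_workers, difficulty=0, use_rs=False):
    """Hypothetical launcher: one rollout per worker, results gathered over pipes."""
    manager = mp.Manager()
    pop = manager.list(population)  # Shared population of actors, indexed by worker_id
    exp_list = manager.list()  # Shared experience list filled by the workers
    task_pipes = [mp.Pipe() for _ in range(num_workers)]
    result_pipes = [mp.Pipe() for _ in range(num_workers)]

    workers = [
        mp.Process(target=rollout_worker,
                   args=(wid, task_pipes[wid][1], result_pipes[wid][0], None,
                         exp_list, pop, difficulty, use_rs, True))
        for wid in range(num_workers)
    ]
    for w in workers:
        w.daemon = True  # Workers loop forever; let them die with the main process
        w.start()

    # Ping every worker once, then block on the result ends
    for wid in range(num_workers):
        task_pipes[wid][0].send(True)
    results = [result_pipes[wid][1].recv() for wid in range(num_workers)]
    return results, list(exp_list)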
Example #26
0
    def act(self, state, is_noise):
        action = self.actor.forward(state)
        if is_noise:
            action += utils.to_tensor(
                self.exploration_noise.noise()).unsqueeze(0)
        return action
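
The exploration noise object is only referenced here through its noise() method. A minimal sketch of what such an object could look like, assuming an Ornstein-Uhlenbeck process; the class below is illustrative, not taken from the original code.

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process exposing the noise() call used above."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu

    def reset(self):
        # Re-center the process at the start of every episode
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state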
Example #27
0
def rollout_worker(args, id, type, task_pipe, result_pipe, data_bucket,
                   models_bucket, store_transitions, random_baseline):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

        Parameters:
            args (object): Parameter class
            id (int): Specific Id unique to each worker spun
            type (str): Rollout type - 'test', 'pg' or 'evo'
            task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
            result_pipe (pipe): Sender end of the pipe used to report back results
            data_bucket (shared list object): A shared list object managed by a manager that is used to store experience tuples
            models_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
            store_transitions (bool): Log experiences to the data bucket?
            random_baseline (bool): Test with random action for baseline purpose?

        Returns:
            None
    """

    if type == 'test': NUM_EVALS = args.num_test
    elif type == 'pg': NUM_EVALS = args.rollout_size
    elif type == 'evo':
        NUM_EVALS = 10 if args.config.env_choice != 'motivate' else 1
    else:
        sys.exit('Incorrect type')

    if args.rollout_size == 0:
        store_transitions = False  # Evolution does not need to store data

    env = RoverDomainPython(args, NUM_EVALS)
    np.random.seed(id)
    random.seed(id)

    while True:
        teams_blueprint = task_pipe.recv()  # Wait until a signal is received to start rollout
        if teams_blueprint == 'TERMINATE': exit(0)  # Terminate this worker process

        # Get the current team actors
        if type == 'test' or type == 'pg':
            team = [models_bucket[0] for _ in range(args.config.num_agents)]
        elif type == "evo":
            team = [
                models_bucket[0][teams_blueprint[0]]
                for _ in range(args.config.num_agents)
            ]
        else:
            sys.exit('Incorrect type')

        fitness = [None for _ in range(NUM_EVALS)]
        frame = 0
        joint_state = env.reset()
        rollout_trajectory = [[] for _ in range(args.config.num_agents)]
        joint_state = utils.to_tensor(np.array(joint_state))

        while True:  #unless done

            if random_baseline:
                joint_action = [
                    np.random.random((NUM_EVALS, args.state_dim))
                    for _ in range(args.config.num_agents)
                ]
            elif type == 'pg':
                joint_action = [
                    team[i][0].noisy_action(joint_state[i, :],
                                            head=i).detach().numpy()
                    for i in range(args.config.num_agents)
                ]
            else:
                joint_action = [
                    team[i].clean_action(joint_state[i, :],
                                         head=i).detach().numpy()
                    for i in range(args.config.num_agents)
                ]

            #JOINT ACTION [agent_id, universe_id, action]
            #Bound Action
            joint_action = np.array(joint_action).clip(-1.0, 1.0)
            next_state, reward, done, global_reward = env.step(
                joint_action)  # Simulate one step in environment
            #State --> [agent_id, universe_id, obs]
            #reward --> [agent_id, universe_id]
            #done --> [universe_id]
            #info --> [universe_id]

            next_state = utils.to_tensor(np.array(next_state))

            #Grab global reward as fitnesses
            for i, grew in enumerate(global_reward):
                if grew is not None:
                    fitness[i] = grew

            #Push experiences to memory
            if store_transitions:
                if not args.is_matd3 and not args.is_maddpg:  #Default for normal MERL
                    for agent_id in range(args.config.num_agents):
                        for universe_id in range(NUM_EVALS):
                            if not done[universe_id]:
                                rollout_trajectory[agent_id].append([
                                    np.expand_dims(
                                        utils.to_numpy(joint_state)[
                                            agent_id, universe_id, :], 0),
                                    np.expand_dims(
                                        utils.to_numpy(next_state)[
                                            agent_id, universe_id, :], 0),
                                    np.expand_dims(
                                        joint_action[agent_id, universe_id, :],
                                        0),
                                    np.expand_dims(
                                        np.array(
                                            [reward[agent_id, universe_id]],
                                            dtype="float32"), 0),
                                    np.expand_dims(
                                        np.array([done[universe_id]],
                                                 dtype="float32"), 0),
                                    universe_id, type
                                ])

                else:  #FOR MATD3/MADDPG - requires global state [concatenation of all observations and actions]
                    for universe_id in range(NUM_EVALS):
                        if not done[universe_id]:
                            rollout_trajectory[0].append([
                                np.expand_dims(
                                    utils.to_numpy(joint_state)
                                    [:, universe_id, :], 0),
                                np.expand_dims(
                                    utils.to_numpy(next_state)[:,
                                                               universe_id, :],
                                    0),
                                np.expand_dims(joint_action[:, universe_id, :],
                                               0),  #[batch, agent_id, :]
                                np.array([reward[:, universe_id]],
                                         dtype="float32"),
                                np.expand_dims(
                                    np.array([done[universe_id]],
                                             dtype="float32"), 0),
                                universe_id,
                                type
                            ])

            joint_state = next_state
            frame += NUM_EVALS

            #DONE FLAG IS Received
            if sum(done) == len(done):
                #Push experiences to main
                if store_transitions:
                    for agent_id, buffer in enumerate(data_bucket):
                        for entry in rollout_trajectory[agent_id]:
                            temp_global_reward = fitness[entry[5]]
                            entry[5] = np.expand_dims(
                                np.array([temp_global_reward],
                                         dtype="float32"), 0)
                            buffer.append(entry)

                break

        #Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send([teams_blueprint, [fitness], frame])
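
Each entry pushed into data_bucket above has the layout [state, next_state, action, reward, done, global_reward, type]; the universe_id slot is overwritten with the episode's global reward just before the push. A hypothetical helper (not part of the original code) that flattens a list of such per-agent entries into field-wise arrays for training:

import numpy as np


def unpack_bucket(entries):
    """Stack [s, ns, a, r, done, g, type] entries into per-field arrays."""
    states, next_states, actions = [], [], []
    rewards, dones, global_rewards = [], [], []
    for entry in entries:
        states.append(entry[0])
        next_states.append(entry[1])
        actions.append(entry[2])
        rewards.append(entry[3])
        dones.append(entry[4])
        global_rewards.append(entry[5])
    return (np.concatenate(states), np.concatenate(next_states),
            np.concatenate(actions), np.concatenate(rewards),
            np.concatenate(dones), np.concatenate(global_rewards))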
Example #28
0
def rollout_worker(id, task_pipe, result_pipe, is_noise, data_bucket,
                   model_bucket, env_name, noise_std, ALGO):
    """Rollout Worker runs a simulation in the environment to generate experiences and fitness values

        Parameters:
            id (int): Specific Id unique to each worker spun (also used to seed numpy's random generator)
            task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task
            result_pipe (pipe): Sender end of the pipe used to report back results
            is_noise (bool): Use exploration noise?
            data_bucket (list of shared object): A list of shared object reference to s,ns,a,r,done (replay buffer) managed by a manager that is used to store experience tuples
            model_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors)
            env_name (str): Environment name
            noise_std (float): Standard deviation of the Gaussian used to sample exploration noise
            ALGO (str): Algorithm name forwarded to the environment wrapper

        Returns:
            None
    """
    env = EnvironmentWrapper(env_name, ALGO)
    np.random.seed(id)  # Make sure the random seeds across learners are different

    ###LOOP###
    while True:
        identifier = task_pipe.recv()  # Wait until a signal is received to start rollout
        if identifier == 'TERMINATE': exit(0)  # Terminate this worker process

        # Get the requisite network
        net = model_bucket[identifier]

        fitness = 0.0
        total_frame = 0
        state = env.reset()
        rollout_trajectory = []
        state = utils.to_tensor(np.array(state)).unsqueeze(0)
        while True:  # unless done

            action = net.forward(state)
            action = utils.to_numpy(action)
            if is_noise:
                action = (action + np.random.normal(
                    0, noise_std, size=env.env.action_space.shape[0])).clip(
                        env.env.action_space.low, env.env.action_space.high)

            next_state, reward, done, info = env.step(
                action.flatten())  # Simulate one step in environment

            next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
            fitness += reward

            # If storing transitions
            if data_bucket is not None:  # Skip for the test set
                rollout_trajectory.append([
                    utils.to_numpy(state),
                    utils.to_numpy(next_state),
                    np.float32(action),
                    np.reshape(np.float32(np.array([reward])), (1, 1)),
                    np.reshape(np.float32(np.array([float(done)])), (1, 1))
                ])
            state = next_state
            total_frame += 1

            # DONE FLAG IS Received
            if done:

                # Push experiences to main
                for entry in rollout_trajectory:
                    data_bucket.append(entry)

                break

        # Send back id, fitness, total length and shaped fitness using the result pipe
        result_pipe.send([identifier, fitness, total_frame])
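
This worker evaluates model_bucket[identifier] each time it receives an index, and shuts down when it receives the string 'TERMINATE'. A minimal driver sketch (illustrative, not from the original code), assuming one rollout per network index, len(model_bucket) >= num_workers, and a Manager list as the shared replay store; the algo default is just a placeholder forwarded to the environment wrapper.

import multiprocessing as mp


def run_generation(model_bucket, num_workers, env_name, noise_std=0.1, algo='TD3'):
    """Hypothetical driver for the worker above."""
    manager = mp.Manager()
    data_bucket = manager.list()  # Shared replay store appended to by the workers
    shared_models = manager.list(model_bucket)

    task_pipes = [mp.Pipe() for _ in range(num_workers)]
    result_pipes = [mp.Pipe() for _ in range(num_workers)]
    procs = [
        mp.Process(target=rollout_worker,
                   args=(wid, task_pipes[wid][1], result_pipes[wid][0], True,
                         data_bucket, shared_models, env_name, noise_std, algo))
        for wid in range(num_workers)
    ]
    for p in procs:
        p.start()

    # Each worker evaluates the network whose index it receives
    for wid in range(num_workers):
        task_pipes[wid][0].send(wid)
    results = [result_pipes[wid][1].recv() for wid in range(num_workers)]  # [id, fitness, frames]

    # Shut the workers down cleanly via the TERMINATE sentinel
    for wid in range(num_workers):
        task_pipes[wid][0].send('TERMINATE')
    for p in procs:
        p.join()
    return results, list(data_bucket)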