Esempi in Python per UnityEnv.step, esempi in Python per gym_unity.envs.UnityEnv.step

Esempio n. 1

0

Mostra file

File: run_gym.py Progetto: maoche001/ml-agents

def main(env_name):
    """
    Smoke-test a Unity environment through the gym wrapper.

    Prints the environment description and (for vector observations) the
    first observation, then plays ten episodes with randomly sampled
    actions, printing the summed reward of each one. The environment is
    closed even when an error occurs.

    :param env_name: Name of the Unity environment binary to launch
    """
    env = UnityEnv(env_name, worker_id=1, use_visual=False, no_graphics=True)

    try:
        # Describe the environment that was just launched.
        print(str(env))

        first_obs = env.reset()

        # A rank-1 observation space means a plain vector observation.
        has_vector_obs = len(env.observation_space.shape) == 1
        if has_vector_obs:
            print("Agent observations look like: \n{}".format(first_obs))

        episode = 0
        while episode < 10:
            env.reset()
            total_reward = 0
            finished = False
            while not finished:
                random_action = env.action_space.sample()
                _, step_reward, finished, _ = env.step(random_action)
                total_reward += step_reward
            print("Total reward this episode: {}".format(total_reward))
            episode += 1
    finally:
        env.close()

Esempio n. 2

0

Mostra file

class Chaser_v1(Environment):
    """Expose the Chaser v1 Unity build through the project's Environment interface.

    The wrapped object is the unwrapped, visual, multi-agent ``UnityEnv``.
    """

    # Worker id given to the next UnityEnv created by this class; it is
    # incremented after every construction.
    unity_env_worker_id = 0

    def __init__(self, platform):
        """Launch the Unity build matching ``platform`` and cache its shapes.

        :param platform: an ``OSName`` member; anything other than MAC or
            WINDOWS leaves the environment filename as ``None``.
        """
        env_filename = None
        if platform == OSName.MAC:
            env_filename = EnvironmentName.CHASER_V1_MAC.value
        elif platform == OSName.WINDOWS:
            env_filename = EnvironmentName.CHASER_V1_WINDOWS.value

        unity_env = UnityEnv(
            environment_filename=env_filename,
            worker_id=Chaser_v1.unity_env_worker_id,
            use_visual=True,
            multiagent=True,
        )
        self.env = unity_env.unwrapped
        self.increase_env_worker_id()
        super(Chaser_v1, self).__init__()

        self.action_shape = self.get_action_shape()
        self.state_shape = self.get_state_shape()

        # The first three entries of the observation shape are used as
        # height, width and channel count of the CNN input.
        shape = self.state_shape
        self.cnn_input_height = shape[0]
        self.cnn_input_width = shape[1]
        self.cnn_input_channels = shape[2]

        self.observation_space = self.env.observation_space
        self.continuous = True

    @staticmethod
    def increase_env_worker_id():
        """Advance the class-wide worker id by one."""
        Chaser_v1.unity_env_worker_id = Chaser_v1.unity_env_worker_id + 1

    def get_n_states(self):
        """Return the fixed state count (3)."""
        return 3

    def get_n_actions(self):
        """Return the fixed action count (3)."""
        return 3

    def get_state_shape(self):
        """Return the shape of the wrapped environment's observation space."""
        return self.env.observation_space.shape

    def get_action_shape(self):
        """Return the shape of the wrapped environment's action space."""
        return self.env.action_space.shape

    def reset(self):
        """Reset the wrapped environment and return what it returns."""
        return self.env.reset()

    def step(self, action):
        """Advance the wrapped environment by one step.

        :return: ``(next_state, reward, adjusted_reward, done, info)``; the
            adjusted reward is currently the raw reward unchanged.
        """
        outcome = self.env.step(action)
        next_state, reward, done, info = outcome
        return next_state, reward, reward, done, info

    def close(self):
        """Close the wrapped environment."""
        self.env.close()

Esempio n. 3

0

Mostra file

File: td3_multiprocess_all.py Progetto: quantumiracle/store2

def worker(id, td3_trainer, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size, explore_steps,
           update_itr, explore_noise_scale, eval_noise_scale, reward_scale, DETERMINISTIC, hidden_dim, model_path):
    '''
    Sample experience from a Unity environment for multi-process TD3 training.

    Launches its own "./tac_follow_new" environment (worker id ``id + 15``),
    runs ``max_episodes`` episodes of at most ``max_steps`` steps, pushes
    every transition into ``replay_buffer``, runs ``update_itr`` trainer
    updates per step once the buffer holds more than ``batch_size`` items,
    and puts each episode's total reward on ``rewards_queue``.

    On KeyboardInterrupt during ``env.step`` the model is saved and the
    worker returns (previously the loop carried on with undefined or stale
    step results). The environment is always closed on exit.

    ``hidden_dim`` is accepted for interface compatibility and not used here.
    '''
    print(td3_trainer, replay_buffer)

    env_name = "./tac_follow_new"
    env = UnityEnv(env_name, worker_id=id+15, use_visual=False, use_both=True)

    # Smoothed reward history; kept across episodes so the running average
    # below has a previous value to build on.
    rewards = []

    try:
        # training loop
        for eps in range(max_episodes):
            # NOTE(review): frame_idx restarts every episode, so the random
            # exploration phase is per-episode rather than global -- confirm
            # this is intended.
            frame_idx = 0
            episode_reward = 0
            state, info = env.reset()

            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = td3_trainer.policy_net.get_action(state, deterministic = DETERMINISTIC, explore_noise_scale=explore_noise_scale)
                else:
                    action = td3_trainer.policy_net.sample_action()

                try:
                    next_state, reward, done, info = env.step(action)
                except KeyboardInterrupt:
                    # No valid transition exists at this point: save and stop
                    # instead of pushing undefined/stale values.
                    print('Finished')
                    td3_trainer.save_model(model_path)
                    return

                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

                if replay_buffer.get_length() > batch_size:
                    for _ in range(update_itr):
                        td3_trainer.update(batch_size, deterministic=DETERMINISTIC, eval_noise_scale=eval_noise_scale, reward_scale=reward_scale)

                # NOTE(review): this saves on every step of every 10th
                # episode; kept as in the original.
                if eps % 10 == 0 and eps > 0:
                    td3_trainer.save_model(model_path)

                if done:
                    break

            print('Episode: ', eps, '| Episode Reward: ', episode_reward)
            if not rewards:
                rewards.append(episode_reward)
            else:
                rewards.append(rewards[-1]*0.9+episode_reward*0.1)
            rewards_queue.put(episode_reward)

        td3_trainer.save_model(model_path)
    finally:
        env.close()

Esempio n. 4

0

Mostra file

File: test_gym.py Progetto: zouhunter/ml-agents

def test_multi_agent(mock_env):
    """A two-agent mock must be rejected in single-agent mode and return lists in multi-agent mode."""
    group_spec = create_mock_group_spec()
    step_result = create_mock_vector_step_result(num_agents=2)
    setup_mock_unityenvironment(mock_env, group_spec, step_result)

    # Two agents behind a single-agent wrapper is an error.
    with pytest.raises(UnityGymException):
        UnityEnv(" ", multiagent=False)

    env = UnityEnv(" ", use_visual=False, multiagent=True)
    first_obs = env.reset()
    assert isinstance(first_obs, list)

    per_agent_actions = [
        env.action_space.sample() for _ in range(env.number_agents)
    ]
    obs, rew, done, _ = env.step(per_agent_actions)
    for step_field in (obs, rew, done):
        assert isinstance(step_field, list)

Esempio n. 5

0

Mostra file

File: test_gym.py Progetto: E0201942/AngelHack-IBM-challenge

def test_gym_wrapper(mock_env):
    """Single-agent vector wrapper: check the types returned by reset() and step()."""
    brain_params = create_mock_brainparams()
    brain_info = create_mock_vector_braininfo()
    setup_mock_unityenvironment(mock_env, brain_params, brain_info)

    env = UnityEnv(" ", use_visual=False, multiagent=False)
    assert isinstance(env, UnityEnv)

    first_obs = env.reset()
    assert isinstance(first_obs, np.ndarray)

    sampled = env.action_space.sample()
    assert sampled.shape[0] == 2

    step_result = env.step(sampled)
    obs, rew, done, info = step_result
    expected_types = ((obs, np.ndarray), (rew, float), (done, bool), (info, dict))
    for value, expected in expected_types:
        assert isinstance(value, expected)

Esempio n. 6

0

Mostra file

File: test_gym.py Progetto: zouhunter/ml-agents

def test_gym_wrapper(mock_env):
    """Single-agent vector wrapper: step() output must fit the observation space and have the expected types."""
    group_spec = create_mock_group_spec()
    step_result = create_mock_vector_step_result()
    setup_mock_unityenvironment(mock_env, group_spec, step_result)

    env = UnityEnv(" ", use_visual=False, multiagent=False)
    assert isinstance(env, UnityEnv)

    first_obs = env.reset()
    assert isinstance(first_obs, np.ndarray)

    sampled = env.action_space.sample()
    assert sampled.shape[0] == 2

    obs, rew, done, _ = env.step(sampled)
    assert env.observation_space.contains(obs)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    # done may come back as a numpy boolean
    assert isinstance(done, (bool, np.bool_))

Esempio n. 7

0

Mostra file

    class UnityEnvWrapper(gym.Env):
        """gym.Env adapter around a single-agent, headless UnityEnv.

        Each wrapper launches its own Unity process. The Unity worker id is
        ``env_config["unity_worker_id"] + env_config.worker_index``.
        """

        def __init__(self, env_config):
            """Launch the Unity binary and mirror its action/observation spaces.

            :param env_config: configuration object that supports both
                attribute access (``vector_index``, ``worker_index``) and
                item access (``"unity_worker_id"``).
            """
            self.vector_index = env_config.vector_index
            self.worker_index = env_config.worker_index
            self.worker_id = env_config["unity_worker_id"] + env_config.worker_index
            # Name of the Unity environment binary to launch
            env_name = '/home/jim/projects/unity_ray/basic_env_linux/basic_env_linux'
            self.env = UnityEnv(env_name, worker_id=self.worker_id, use_visual=False, multiagent=False, no_graphics=True)
            self.action_space = self.env.action_space
            self.observation_space = self.env.observation_space

        def reset(self):
            """Reset the wrapped environment and return its observation."""
            return self.env.reset()

        def step(self, action):
            """Forward ``action`` to the wrapped environment and return its step result."""
            return self.env.step(action)

        def close(self):
            """Shut down the wrapped Unity environment.

            Without this override, closing the wrapper would never reach the
            Unity process it launched.
            """
            self.env.close()

Esempio n. 8

0

Mostra file

def test_multi_agent(mock_communicator, mock_launcher):
    """With a two-agent mock communicator, multi-agent mode must return per-agent lists."""
    two_agent_comm = MockCommunicator(discrete_action=False,
                                      visual_inputs=0,
                                      stack=False,
                                      num_agents=2)
    mock_communicator.return_value = two_agent_comm

    # Test for incorrect number of agents.
    with pytest.raises(UnityGymException):
        UnityEnv(' ', multiagent=False)

    env = UnityEnv(' ', use_visual=False, multiagent=True)
    first_obs = env.reset()
    assert isinstance(first_obs, list)

    per_agent_actions = [
        env.action_space.sample() for _ in range(env.number_agents)
    ]
    obs, rew, done, info = env.step(per_agent_actions)
    for step_field in (obs, rew, done):
        assert isinstance(step_field, list)
    assert isinstance(info, dict)

Esempio n. 9

0

Mostra file

File: test_gym.py Progetto: wszhs/ml-agents

def test_gym_wrapper_visual(mock_env, use_uint8):
    """Visual-observation wrapper: check reset()/step() output with one visual observation."""
    n_visual = 1
    spec = create_mock_group_spec(number_visual_observations=n_visual)
    decision_step, terminal_step = create_mock_vector_steps(
        spec, number_visual_observations=n_visual)
    setup_mock_unityenvironment(mock_env, spec, decision_step, terminal_step)

    env = UnityEnv(" ", use_visual=True, uint8_visual=use_uint8)
    assert isinstance(env, UnityEnv)

    first_obs = env.reset()
    assert isinstance(first_obs, np.ndarray)

    sampled = env.action_space.sample()
    assert sampled.shape[0] == 2

    obs, rew, done, info = env.step(sampled)
    assert env.observation_space.contains(obs)
    assert isinstance(obs, np.ndarray)
    assert isinstance(rew, float)
    # done may come back as a numpy boolean
    assert isinstance(done, (bool, np.bool_))
    assert isinstance(info, dict)

Esempio n. 10

0

Mostra file

def test_gym_wrapper(mock_communicator, mock_launcher):
    """With a one-agent mock communicator, single-agent mode must return scalar-style results."""
    one_agent_comm = MockCommunicator(discrete_action=False,
                                      visual_inputs=0,
                                      stack=False,
                                      num_agents=1)
    mock_communicator.return_value = one_agent_comm

    # Test for incorrect number of agents.
    with pytest.raises(UnityGymException):
        UnityEnv(' ', use_visual=False, multiagent=True)

    env = UnityEnv(' ', use_visual=False)
    assert isinstance(env, UnityEnv)

    first_obs = env.reset()
    assert isinstance(first_obs, np.ndarray)

    sampled = env.action_space.sample()
    assert sampled.shape[0] == 2

    obs, rew, done, info = env.step(sampled)
    expected_types = ((obs, np.ndarray), (rew, float), (done, bool), (info, dict))
    for value, expected in expected_types:
        assert isinstance(value, expected)

Esempio n. 11

0

Mostra file

File: ppo.py Progetto: quantumiracle/Store

        # Earlier experiments, kept for reference: alternative environments
        # and a short manual rollout with the global PPO policy.
        # env = gym.make('Pendulum-v0')
        # env=Reacher(render=True)
        # env = UnityEnv(env_name, worker_id=10, use_visual=True, use_both=True)

        # s, info = env.reset()
        # for t in range(100):
        #     # env.render()
        #     s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))

        # Write the global PPO model to model_path.
        GLOBAL_PPO.save(model_path)

    # Test mode: load the saved model and run it in a freshly launched
    # Unity environment.
    if args.test:
        # worker_id is drawn at random from [0, 10).
        env = UnityEnv(env_name,
                       worker_id=np.random.randint(0, 10),
                       use_visual=False,
                       use_both=True)
        env.reset()
        GLOBAL_PPO = PPO()
        GLOBAL_PPO.load(model_path)
        test_steps = 100
        test_episode = 10

        # Run test_episode rollouts of exactly test_steps steps each.
        # NOTE(review): `done` is never checked, so stepping continues even
        # if the environment reports the episode as finished; the
        # environment is also not closed in the visible code.
        for _ in range(test_episode):
            s, info = env.reset()

            for t in range(test_steps):
                # env.render()
                # Only the first six entries of the observation are passed
                # to the policy.
                a = GLOBAL_PPO.choose_action(s[:6])
                print(a)
                s, r, done, info = env.step(a)

Esempio n. 12

0

Mostra file

class Worker(object):
    def __init__(self, wid):
        """Create a rollout worker with its own Unity environment.

        :param wid: worker index; also used as the Unity worker id.
        """
        self.wid = wid
        unity_env = UnityEnv(env_name,
                             worker_id=wid,
                             use_visual=True,
                             use_both=True)
        self.env = unity_env

        # All workers act through the shared global PPO instance.
        self.ppo = GLOBAL_PPO

        # Per-frame pin coordinates collected by ImgProcess.
        self.pins_x, self.pins_y = [], []

    def ImgProcess(self, img, Done=False):
        """Detect pin centres in ``img``, record them and refresh the pin plot.

        A frame is appended to ``self.pins_x`` / ``self.pins_y`` only when
        the contour check reports a valid detection. The x history of every
        pin (position and displacement from its first frame) is drawn and
        saved to './ppo_pins.png'; the figure is cleared when ``Done`` is
        true.

        :return: ``(x, y, dx, dy)`` for the most recently stored frame,
            where ``dx``/``dy`` are its difference from the first stored
            frame.
        """
        cimg, _edges, centers = image_processing(img)
        cimg, detection_ok = contour_center_check(centers,
                                                  cimg,
                                                  NUM_PINS=NUM_PINS)
        centers = CenterRegister(centers)

        if detection_ok:  # pins detection correct
            xy_rows = np.array(centers).transpose()
            self.pins_x.append(xy_rows[0])
            self.pins_y.append(xy_rows[1])

        # One row per pin, one column per stored frame.
        x_by_pin = np.array(self.pins_x).transpose()

        latest_x = self.pins_x[-1]
        shift_x = latest_x - self.pins_x[0]
        latest_y = self.pins_y[-1]
        shift_y = latest_y - self.pins_y[0]

        plt.figure(1)
        for pin in range(NUM_PINS):
            frames = np.arange(len(self.pins_x))
            plt.subplot(211)
            plt.plot(frames, x_by_pin[pin])
            plt.title('Position')
            plt.subplot(212)
            plt.plot(frames, x_by_pin[pin] - x_by_pin[pin][0])
            plt.title('Displacement')
            plt.tight_layout()
        plt.savefig('./ppo_pins.png')
        if Done:
            plt.clf()

        return latest_x, latest_y, shift_x, shift_y

    def work(self):
        """Collect rollouts from the Unity environment for the global PPO.

        Until ``COORD`` requests a stop, each episode resets the
        environment, converts every visual frame into a state made of pin
        displacements (via ``ImgProcess``), and steps for up to ``EP_LEN``
        steps. Whenever the episode ends or the global counter reaches
        ``MIN_BATCH_SIZE``, the buffered transitions are turned into
        discounted returns and pushed to ``QUEUE``.

        Image-processing failures are reported and the last successfully
        computed displacements are reused; if none exist yet, the original
        exception is re-raised (previously this surfaced as an unrelated
        NameError, and a bare ``except`` also swallowed KeyboardInterrupt).
        """
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        # Displacements from the last successful ImgProcess call; None until
        # the first success.
        pins_dis_x = pins_dis_y = None
        while not COORD.should_stop():
            s, info = self.env.reset()

            # image processing: first channel scaled to 8-bit
            img = (s[:, :, 0] * 255).astype(np.uint8)
            try:
                _, _, pins_dis_x, pins_dis_y = self.ImgProcess(img, Done=False)
            except Exception:
                print('Image Processing Error!')
                if pins_dis_x is None:
                    raise
            s = np.concatenate((pins_dis_x, pins_dis_y))

            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            # NOTE(review): the pin history is cleared after the reset frame
            # was processed, so displacements within the episode are
            # relative to the first step frame -- confirm this is intended.
            self.pins_x = []
            self.pins_y = []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    # clear history buffer, use new policy to collect data
                    buffer_s, buffer_a, buffer_r = [], [], []
                a = self.ppo.choose_action(s)
                s_, r, done, info = self.env.step(a)

                # image processing
                img = (s_[:, :, 0] * 255).astype(np.uint8)
                # True on the last step of the episode so the figure gets
                # cleared (the old `t > EP_LEN - 1` could never be true).
                Done = t >= EP_LEN - 1
                try:
                    _, _, pins_dis_x, pins_dis_y = self.ImgProcess(img, Done)
                except Exception:
                    # Keep the previous displacements for this step.
                    print('Image Processing Error!')

                s_ = np.concatenate((pins_dis_x, pins_dis_y))

                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(
                    (r + 8) / 8)  # normalize reward, find to be useful
                s = s_
                ep_r += r

                GLOBAL_UPDATE_COUNTER += 1  # count to minimum batch size, no need to wait other workers
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []  # compute discounted reward
                    for reward_t in buffer_r[::-1]:
                        v_s_ = reward_t + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                        ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
            step_set.append(step)
            epr_set.append(ep_r)

Esempio n. 13

0

Mostra file

File: data_collect_raw.py Progetto: quantumiracle/store2

    # Number of environment steps taken per episode.
    episode_length = 150
    env_name = "./tac_touch_random2"  # Name of the Unity environment binary to launch

    # worker_id is drawn at random from [0, 10).
    env = UnityEnv(env_name,
                   worker_id=np.random.randint(0, 10),
                   use_visual=False,
                   use_both=True)

    # batch_s collects the selected observations; cnt counts them.
    batch_s = []
    cnt = 0
    for eps in range(training_episodes):
        print(eps)
        s, info = env.reset()
        # Reference values (entries 7 onward of the reset observation) that
        # later frames are compared against.
        s0 = np.array(s[7:])
        for step in range(episode_length):
            # plot(s)
            # print(np.mean(np.abs(np.array(s[7:])-s0)))  # choose 0.03
            # Keep a frame only after the first step, when its mean absolute
            # change from s0 exceeds 0.03 and entries 4-6 do not sum to zero.
            if step > 0 and np.mean(
                    np.abs(np.array(s[7:]) - s0)
            ) > 0.03 and s[4] + s[5] + s[
                    6] != 0:  # set a threshold to extract deformation frames
                batch_s.append(
                    s
                )  # dim of s total 280 (select 182 as obs): 0 object index, 1-3 rotation value, 4-6 average contact point position, 7-279 pins positions
                cnt += 1

            # The same fixed action [0] is sent on every step.
            s_, r, done, info = env.step([0])
            s = s_
    print('total number of samples: ', cnt)
    # NOTE(review): data_file is opened outside the visible code, and the
    # environment is not closed here -- confirm both are handled by the caller.
    pickle.dump(batch_s, data_file)

Esempio n. 14

0

Mostra file

File: ppo_singleThread.py Progetto: Bing-Jing/PPFinal

# Script entry point: single-threaded PPO training against the 'test.app'
# Unity build (worker id 0, visual observations).
if __name__ == "__main__":
    env = UnityEnv('test.app', 0,use_visual=True)
    ppo = PPO(env)
    all_ep_r = []
    # t counts environment steps since the last PPO update.
    t = 0
    for ep in range(EP_MAX):
        s = env.reset()
        
        ep_r = 0
        
        done = False
        while not done: 
            t+=1
            env.render()
            # choose_action returns the action together with a second value
            # that is stored in buffer_v.
            a,v = ppo.choose_action(s)
            s_, r, done, _ = env.step(a)
            # Record the transition in the PPO object's buffers.
            ppo.buffer_s.append(s)
            ppo.buffer_a.append(a)
            ppo.buffer_r.append(r)
            ppo.buffer_v.append(v)
            ppo.buffer_done.append(done)
            s = s_
            ep_r += r

            # update ppo
            # Triggered when (t + 1) is a multiple of BATCH; the step counter
            # is then restarted.
            if (t+1) % BATCH == 0:
                print("updating...")
                t = 0
                # NOTE(review): v belongs to the state before the last step,
                # not to s_ -- confirm this is the intended bootstrap value.
                v_s_ = v
                discounted_r = []
                rewards = np.array(ppo.buffer_r)

Esempio n. 15

0

Mostra file

class PPO():
    def __init__(self):
        """Set hyperparameters, launch the Unity environment and build the model.

        The environment is created in multi-agent mode with an engine
        configuration side channel; the channel's time scale is set to 100
        after the environment has been constructed. One experience buffer
        is allocated per agent.
        """
        # --- Optimisation hyperparameters ---
        self.learning_rate = 0.0003
        self.betas = (0.9, 0.999)
        self.K_epochs = 3
        self.batch_size = 256
        self.buffer_size = 2048
        self.max_steps = 100000

        # --- PPO / GAE coefficients ---
        self.gamma = 0.99
        self.tau = 0.95
        self.eps_clip = 0.2
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5

        # --- Reporting ---
        self.summary_freq = 1000

        # --- Environment ---
        self.env_name = "Environments/env1/Unity Environment"
        engine_channel = EngineConfigurationChannel()
        self.env = UnityEnv(
            self.env_name,
            worker_id=0,
            use_visual=False,
            side_channels=[engine_channel],
            no_graphics=False,
            multiagent=True,
        )
        engine_channel.set_configuration_parameters(time_scale=100)
        sizes = Utils.getActionStateSize(self.env)
        self.action_size, self.state_size = sizes
        self.n_agents = self.env.number_agents
        print("Nº of Agents: ", self.n_agents)

        # --- Model and optimiser ---
        network = ActorCritic(self.state_size, self.action_size, seed=0)
        self.model = network.to(device)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate,
                                    betas=self.betas)
        self.MseLoss = nn.MSELoss()

        # --- One experience buffer per agent ---
        self.memory = [Buffer() for _ in range(self.n_agents)]

        # Initialize time step (for updating when buffer_size is full)
        self.t_step = 1

    def train(self):
        """Run the training loop for ``max_steps`` environment steps, then save.

        On every step each agent picks an action from its own state, the
        joint action is sent to the environment, and every agent's
        transition is handed to ``self.step`` (which stores it and triggers
        learning). Scores are tracked in ``self.data`` and summarised every
        ``summary_freq`` steps.
        """
        # Initial observation: one state per agent.
        state = self.env.reset()

        self.data = Data(self.n_agents, self.summary_freq)

        for _ in range(self.max_steps):
            agent_ids = range(self.n_agents)

            # Query the policy once per agent.
            decisions = [self.act(state[i]) for i in agent_ids]
            action = [chosen for chosen, _, _ in decisions]
            logprobs = [logp for _, logp, _ in decisions]
            value = [val for _, _, val in decisions]

            # Send the joint action to the environment.
            next_state, reward, done, info = self.env.step(action)

            # Continuation mask: 1 while the episode goes on, 0 when done.
            not_done = [1 - done[i] for i in agent_ids]

            for i in agent_ids:
                self.step(state[i], action[i], reward[i], next_state[i],
                          not_done[i], logprobs[i], value[i], self.memory[i])

            self.t_step += 1
            state = next_state

            self.data.update_score(reward, value, done, self.t_step)
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        self.save()

    def save(self):
        """Write the model weights to 'Saved Models/model.pth' and emit the data results."""
        weights = self.model.state_dict()
        torch.save(weights, 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        """Sample an action for one agent.

        :param state: numpy array holding a single observation.
        :return: ``(action, log_prob, value)`` -- the sampled action index,
            the log-probability of that action, and the model's value
            output for the state.
        """
        obs = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Run the forward pass in eval mode without tracking gradients,
        # then switch the model back to training mode.
        self.model.eval()
        with torch.no_grad():
            logits, value = self.model(obs)
        self.model.train()

        probabilities = F.softmax(logits, -1)
        all_log_probs = F.log_softmax(logits, -1)

        # Draw one action and pick out its log-probability.
        action = probabilities.multinomial(num_samples=1)
        chosen_log_prob = all_log_probs.gather(1, action)

        return action, chosen_log_prob, value

    def step(self, state, action, reward, next_state, done, logprobs, value,
             memory):

        # Update model when buffer_size is full
        if memory.len_() == (self.buffer_size / self.n_agents):
            self.learn()
            for i in range(self.n_agents):
                self.memory[i].reset()

        # Save experience in buffer memory
        memory.add(state, action, reward, next_state, done, logprobs, value)

    def evaluate(self, states, next_states, actions, rewards, masks,
                 compute_gae):

        logits, values = self.model(states)
        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)
        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        values_ = values

        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        returns = []

        if (compute_gae):

            gae = torch.zeros(1, 1)

            for i in reversed(range(len(rewards))):

                # Generalized Advantage Estimation
                delta_t = rewards[i] + self.gamma * masks[i] * values[
                    i + 1].data - values[i].data
                gae = gae * self.gamma * self.tau * masks[i] + delta_t

                returns.insert(0, gae + values[i])

        return log_probs, values_, entropies, returns

    def compute_returns(self):

        returns_ = []

        for i in range(self.n_agents):

            # Get Experiences (of each agent)
            experiences = self.memory[i].get()
            states, actions, rewards, next_states, dones, logprobs_, values_ = experiences

            # Evaluate
            _, _, _, r = self.evaluate(states,
                                       next_states,
                                       actions,
                                       rewards,
                                       dones,
                                       compute_gae=True)
            returns_.append(r)

        l = []

        for i in range(len(returns_)):
            for j in range(len(returns_[0])):
                l.append(returns_[i][j])

        return l

    def learn(self):
        """Run ``K_epochs`` of clipped-PPO updates over the full experience buffer.

        Each epoch splits the ``buffer_size`` stored transitions into
        ``buffer_size // batch_size`` random mini-batches, drawn without
        replacement, and takes one optimiser step per mini-batch on
        ``-min(surr1, surr2) + value_loss_coef * MSE - entropy_coef * entropy``.

        NOTE(review): the mini-batch tensors are allocated with the default
        device while ``returns_eval`` is moved to ``device`` -- confirm the
        model accepts them when ``device`` is not the CPU.
        """
        # Get Experiences
        states, actions, rewards, next_states, dones, logprobs_, values_ = self.getExp()

        returns_eval = self.compute_returns()
        returns_eval = torch.tensor(returns_eval).to(device)
        returns_eval = returns_eval.unsqueeze(1)

        n_batches = self.buffer_size // self.batch_size

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):

            # Indices not yet used in this epoch.
            remaining = list(np.arange(self.buffer_size))

            for _ in range(n_batches):

                # Take a random batch
                indices = random.sample(remaining, self.batch_size)

                old_logprobs = torch.empty(self.batch_size, 1)
                old_values = torch.empty(self.batch_size, 1)
                old_actions = torch.empty(self.batch_size)
                old_states = torch.empty(self.batch_size, self.state_size)
                old_next_states = torch.empty(self.batch_size, self.state_size)
                returns = torch.empty(self.batch_size, 1)

                for row, idx in enumerate(indices):
                    old_logprobs[row] = logprobs_[idx]
                    old_values[row] = values_[idx]
                    old_actions[row] = actions[idx]
                    old_states[row] = states[idx]
                    old_next_states[row] = next_states[idx]
                    returns[row] = returns_eval[idx]

                old_actions = old_actions.long()

                # Drop the sampled indices so they are not drawn again this
                # epoch; one linear pass instead of a list.remove per index,
                # with the order of the remaining indices unchanged.
                used = set(indices)
                remaining = [idx for idx in remaining if idx not in used]

                # Evaluate (rewards/dones are not read when compute_gae is False)
                logprobs, state_values, dist_entropy, _ = self.evaluate(
                    old_states,
                    old_next_states,
                    old_actions,
                    rewards,
                    dones,
                    compute_gae=False)

                # Finding the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(logprobs - old_logprobs)

                # Finding Surrogate Loss:
                advantages = returns - old_values
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                    1 + self.eps_clip) * advantages

                # LOSS = ACTOR LOSS + CRITIC_DISCOUNT * CRITIC_LOSS - ENTROPY_BETA * ENTROPY
                loss = -torch.min(
                    surr1, surr2) + self.value_loss_coef * self.MseLoss(
                        state_values,
                        returns) - self.entropy_coef * dist_entropy

                # Optimizer step
                self.optimizerStep(self.optimizer, loss.mean())

    def optimizerStep(self, optimizer, loss):
        """Apply one gradient update for ``loss`` using ``optimizer``.

        Clears the optimizer's gradients, backpropagates ``loss`` and then
        takes a single optimizer step.
        """
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def getExp(self):

        states, actions, rewards, next_states, dones, logprobs, values = [], [], [], [], [], [], []

        for i in range(self.n_agents):
            experiences = self.memory[i].get()
            states.append(experiences[0])
            actions.append(experiences[1])
            rewards.append(experiences[2])
            next_states.append(experiences[3])
            dones.append(experiences[4])
            logprobs.append(experiences[5])
            values.append(experiences[6])

        states_, actions_, rewards_, next_states_, dones_, logprobs_, values_ = [], [], [], [], [], [], []

        for i in range(len(states)):
            for j in range(len(states[0])):
                states_.append(states[i][j])
                actions_.append(actions[i][j])
                rewards_.append(rewards[i][j])
                next_states_.append(next_states[i][j])
                dones_.append(dones[i][j])
                logprobs_.append(logprobs[i][j])
                values_.append(values[i][j])

        states__ = torch.empty(self.buffer_size, self.state_size)
        actions__ = torch.empty(self.buffer_size)
        next_states__ = torch.empty(self.buffer_size, self.state_size)
        dones__ = torch.empty(self.buffer_size)
        logprobs__ = torch.empty(self.buffer_size, 1, 1)
        values__ = torch.empty(self.buffer_size)

        for i in range(self.buffer_size):
            states__[i] = states_[i]
            actions__[i] = actions_[i]
            next_states__[i] = next_states_[i]
            dones__[i] = dones_[i]
            logprobs__[i] = logprobs_[i]
            values__[i] = values_[i]

        return states__, actions__, rewards_, next_states__, dones__, logprobs__, values__

Esempio n. 16

0

Mostra file

class Worker(object):
    """Rollout worker that collects experience from its own Unity environment.

    Each worker steps a private ``UnityEnv`` with the shared ``GLOBAL_PPO``
    policy, pushes batches of ``(state, action, discounted return)`` rows onto
    ``QUEUE`` and coordinates with the updater through ``ROLLING_EVENT`` and
    ``UPDATE_EVENT``. After every episode it also writes diagnostic plots.
    """

    def __init__(self, wid):
        """Launch the Unity environment for this worker.

        :param wid: worker index; also passed to ``UnityEnv`` as ``worker_id``.
        """
        self.wid = wid
        self.env = UnityEnv(env_name,
                            worker_id=wid,
                            use_visual=False,
                            use_both=True)

        # self.env=Reacher(render=True)
        self.ppo = GLOBAL_PPO

    def work(self):
        """Run episodes until ``COORD`` signals that training should stop.

        Per step: choose an action, step the environment, record the pin and
        object coordinates, and rebuild the state from the first six
        observation entries plus the relative x/z position of the pin whose
        displacement (relative to the first recorded step) is largest.
        Whenever the episode ends or ``GLOBAL_UPDATE_COUNTER`` reaches
        ``MIN_BATCH_SIZE`` the buffered transitions are turned into discounted
        returns and queued for the global update.
        """
        global GLOBAL_EP, GLOBAL_RUNNING_R, GLOBAL_UPDATE_COUNTER
        step_set = []
        epr_set = []
        step = 0
        while not COORD.should_stop():
            s, info = self.env.reset()
            s = s[:8]
            step += 1
            ep_r = 0
            buffer_s, buffer_a, buffer_r = [], [], []
            self.pins_x = []
            self.pins_y = []
            self.pins_z = []
            self.object_x = []
            self.object_y = []
            self.object_z = []
            for t in range(EP_LEN):
                if not ROLLING_EVENT.is_set():  # while global PPO is updating
                    ROLLING_EVENT.wait()  # wait until PPO is updated
                    # clear history buffer, use new policy to collect data
                    buffer_s, buffer_a, buffer_r = [], [], []

                a = self.ppo.choose_action(s)
                s_, r, done, info = self.env.step(a)

                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                # Entries 6, 9, 12, ... are read as pin x and 8, 11, 14, ... as
                # pin z; entries 0 and 2 as the object's x and z.
                pins_x = s_[6::3]
                pins_z = s_[8::3]
                self.object_x.append(s_[0])
                self.object_z.append(s_[2])
                self.pins_x.append(pins_x)
                self.pins_z.append(pins_z)

                # Squared displacement of every pin, measured relative to the
                # object and compared with the first recorded step.
                relative_x = pins_x - s_[0]
                relative_z = pins_z - s_[2]
                dis = (relative_x - (self.pins_x[0] - self.object_x[0]))**2 + (
                    relative_z - (self.pins_z[0] - self.object_z[0]))**2
                max_idx = np.argmax(dis)

                # New state: first six entries plus the relative position of
                # the pin with the largest displacement.
                s_ = np.append(s_[:6], relative_x[max_idx])
                s_ = np.append(s_, relative_z[max_idx])
                s = s_
                ep_r += r

                # count to minimum batch size, no need to wait other workers
                GLOBAL_UPDATE_COUNTER += 1
                if t == EP_LEN - 1 or GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                    # Bootstrap from the value of the latest state and
                    # accumulate discounted returns backwards in time.
                    v_s_ = self.ppo.get_v(s_)
                    discounted_r = []
                    for reward in reversed(buffer_r):
                        v_s_ = reward + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()

                    bs, ba, br = np.vstack(buffer_s), np.vstack(
                        buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    QUEUE.put(np.hstack((bs, ba, br)))  # put data in the queue
                    if GLOBAL_UPDATE_COUNTER >= MIN_BATCH_SIZE:
                        ROLLING_EVENT.clear()  # stop collecting data
                        UPDATE_EVENT.set()  # globalPPO update

                    if GLOBAL_EP >= EP_MAX:  # stop training
                        COORD.request_stop()
                        break

            if GLOBAL_EP % 50 == 0 and GLOBAL_EP > 0:
                self.ppo.save(model_path)

            self._plot_pin_trajectories()

            # record reward changes, plot later
            if len(GLOBAL_RUNNING_R) == 0:
                GLOBAL_RUNNING_R.append(ep_r)
            else:
                GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 +
                                        ep_r * 0.1)
            GLOBAL_EP += 1
            print(
                '{0:.1f}%'.format(GLOBAL_EP / EP_MAX * 100),
                '|W%i' % self.wid,
                '|Ep_r: %.2f' % ep_r,
            )
            step_set.append(step)
            epr_set.append(ep_r)
            if step % 10 == 0:  # plot every N episode; some error about main thread for plotting
                self._plot_reward_curve(step_set, epr_set)

    def _plot_pin_trajectories(self):
        """Save this episode's pin positions and displacements to ./ppo_pins.png."""
        reshape_pins_x = np.array(self.pins_x).transpose()
        reshape_pins_z = np.array(self.pins_z).transpose()
        plt.clf()
        for i in range(NUM_PINS):
            plt.subplot(411)
            plt.plot(np.arange(len(self.pins_x)), reshape_pins_x[i])
            plt.title('X-Position')
            plt.subplot(412)
            plt.plot(np.arange(len(self.pins_z)), reshape_pins_z[i])
            plt.title(
                'Y-Position')  # although it's z, to match reality, use y
            plt.subplot(413)
            # Displacement relative to the object, with the first step as origin.
            plt.plot(np.arange(len(self.pins_x)),
                     (reshape_pins_x[i] - self.object_x) -
                     (reshape_pins_x[i][0] - self.object_x[0]))
            plt.title('X-Displacement')

            plt.subplot(414)
            plt.plot(np.arange(len(self.pins_x)),
                     (reshape_pins_z[i] - self.object_z) -
                     (reshape_pins_z[i][0] - self.object_z[0]))
            plt.title('Y-Displacement')
            plt.xlabel('Time Step')
            plt.tight_layout()
        plt.savefig('./ppo_pins.png')

    @staticmethod
    def _plot_reward_curve(step_set, epr_set):
        """Save the episode-reward curve to ./tac_pins8.png (best effort)."""
        plt.clf()
        plt.plot(step_set, epr_set)
        plt.xlabel('Episode')
        plt.ylabel('Reward')
        try:
            plt.savefig('./tac_pins8.png')
        except Exception:
            # Saving stays best-effort, but KeyboardInterrupt / SystemExit are
            # no longer swallowed as they were by the bare ``except``.
            print('writing conflict!')

Esempio n. 17

0

Mostra file

            # One environment step for two agents: each agent picks an action
            # from its own state, both actions are sent in a single list, and
            # each agent stores its own transition afterwards.
            # (disabled debug output: episode number `e`)
            actionlst = []
            action1 = agent1.act(state1)
            action2 = agent2.act(state2)
            print("__Randomly Selected Action__________")
            print(action1)
            print(action2)
            actionlst.append(action1)
            actionlst.append(action2)

            # Advance the game to the next frame based on the actions.
            # NOTE(review): the original comment said "reward is 1 for every
            # frame the pole survived" (CartPole wording) - confirm whether
            # that holds for this environment.
            print("______TIME")
            print(time_t)
            next_state, reward, done, _ = env.step(actionlst)

            print("_____________reward_____________")
            print(reward)
            print("____________nextstaet agen 1_____________")
            print(next_state[0])
            print("_____________next state agent 2_______")
            print(next_state[1])
            # (disabled) next_state = np.reshape(next_state, [1, 42336])
            # Remember the previous state, action, reward, and done; index 0
            # of each returned sequence goes to agent 1 and index 1 to agent 2.
            agent1.remember(state1, action1, reward[0], next_state[0], done[0])
            agent2.remember(state2, action2, reward[1], next_state[1], done[1])

            # make next_state the new current state for the next frame.
            state1 = next_state[0]
            state2 = next_state[1]

Esempio n. 18

0

Mostra file

File: PracticeOfDQN.py Progetto: UmeHabibasaleem/Emotional-AI

def cartpole():
    """Train one DQN brain per agent in a multi-agent Unity environment.

    Runs forever: every outer iteration resets the environment and steps it
    until it returns no observations. Observation entry 12 is used (minus one)
    as the index of the brain that acts for, and learns from, that observation.
    When the first agent reports a terminal step all brains are saved to
    ``<run>brain<index>.h5``.

    The environment is closed when the function is left, including when it is
    interrupted or an error is raised.
    """
    env = UnityEnv(environment_filename=ENV_NAME, worker_id=5, use_visual=False, multiagent = True)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        agents_brain = []

        num_agents = env.number_agents
        observation_space = env.observation_space.shape[0]
        print("____________Observation_space")
        print(observation_space)
        action_space = env.action_space.n
        # NOTE(review): dqn_solver is never used below; kept in case
        # constructing it has side effects - confirm and remove.
        dqn_solver = DQNSolver(observation_space, action_space)
        for x in range ((env.number_agents)):
            agents_brain.append(DQNSolver(observation_space, action_space))
        print ("Length of BrainList:    ",len(agents_brain))
        run = 0
        state = env.reset()
        print("______INITIAL______")
        print(state)
        print("*****************************initial state for unity  envirmonet**************")
        jk = 1
        while True:
            run += 1
            state = env.reset()
            num_agents = int(state[0][-5])
            print("_____________State _______________")
            print(int(state[0][12]))
            step = 0

            print("################################This is loop################################# :" , jk)
            while True:
                step += 1
                env.render()
                agents_action = [1] * len(state)
                print(state[0])
                print("*******************Length of state******************")
                print(len(state))
                for x in range(len(state)):
                    state[x] = np.reshape(state[x], [1, observation_space])
                    agents_action[x] = agents_brain[int(state[x][0,12]) - 1].act(state[x])
                print("Agents Actions List: ",agents_action)
                state_next, reward, terminal, info = env.step(agents_action)
                # An empty observation list ends the current run.
                if len(state_next) == 0:
                    break
                agents_alive = state_next[0][-13:-5]
                print ("Agents_alive:    ", agents_alive)
                print ("Rewards:    ",reward)
                num_agents = int(state_next[0][-5])
                print ("Number of agents:   ",num_agents)
                print("_________Terminal list_______" , terminal)
                if terminal[0]:
                    print("**************************Brain saved******************************")
                    for x in range(len(agents_brain)):
                        agents_brain[x].save(str(run) + "brain" + str(x) + ".h5")

                    jk+=1
                    print("#####################################Loop is######################## :" , jk)

                for x in range(len(state_next)):
                    # state[x] already has shape [1, observation_space] from
                    # the action-selection loop above.
                    state_next[x] = np.reshape(state_next[x], [1, observation_space])
                    brain = agents_brain[int(state_next[x][0,12]) - 1]
                    brain.remember(state[x], agents_action[x], reward[x], state_next[x], terminal[x])
                    brain.experience_replay()
                state = state_next
    finally:
        env.close()

Esempio n. 19

0

Mostra file

class AC():
    """Actor-critic trainer for a single Unity environment.

    Collects transitions into a ``Buffer`` and, every ``batch_size`` steps,
    updates an ``ActorCritic`` model with a policy loss weighted by
    Generalized Advantage Estimation plus a value loss and an entropy bonus.
    """

    def __init__(self):
        """Create the environment, model, optimizer and experience buffer."""

        # Hyperparameters
        self.learning_rate = 0.0003
        self.gamma = 0.99
        self.batch_size = 256
        self.max_steps = 100000

        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5

        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=False)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents

        # Model
        self.model = ActorCritic(self.state_size, self.action_size,
                                 seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate)

        # Buffer memory
        self.memory = Buffer()

        # Initialize time step (for updating every "batch_size" time steps)
        self.t_step = 1

    def train(self):
        """Interact with the environment for ``max_steps`` steps, then save."""

        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):

            # Action of agent
            action, value = self.act(state)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Agent step
            self.step(state, action, reward, next_state, done)

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score
            reward_ = np.expand_dims(reward, axis=0)
            value_ = value.unsqueeze(0)
            done_ = np.expand_dims(done, axis=0)
            self.data.update_score(reward_, value_, done_, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        """Write the model weights to 'Saved Models/model.pth' and report results."""
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        """Load model weights from the file path ``model``."""
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        """Sample an action for ``state`` from the current policy.

        :param state: NumPy observation; a leading batch axis is added here.
        :return: ``(action, value)`` - the sampled action index tensor and the
            model's value output for the state.
        """

        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get action logits and value from the ActorCritic model without
        # tracking gradients; the model is put back in train mode afterwards.
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)

        # Sample one action from the categorical distribution
        action = prob.multinomial(num_samples=1)

        return action, value

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every ``batch_size`` time steps."""

        # Save experience in buffer memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every "batch_size" time steps
        if self.t_step % self.batch_size == 0:
            experiences = self.memory.get()
            self.learn(experiences)
            self.memory.reset()

    def learn(self, experiences):
        """Update the model from one batch of consecutive transitions.

        :param experiences: ``(states, actions, rewards, next_states)``; the
            rows are processed from last to first as one trajectory.
        """

        # Get Experiences
        states, actions, rewards, next_states = experiences

        logits, values = self.model(states)
        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)

        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        # Bootstrap from the value of the LAST next state only. Appending the
        # values of every next state would make ``values[i + 1]`` for the
        # final step point at the first next state instead of the last one.
        with torch.no_grad():
            _, next_values = self.model(next_states)
        values = torch.cat((values, next_values[-1:]))

        policy_loss = 0
        value_loss = 0
        R = values[-1]
        # Keep the accumulator on the same device as the model outputs.
        gae = torch.zeros(1, 1, device=values.device)

        for i in reversed(range(len(rewards))):

            R = self.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation (no gradient through the
            # TD error)
            delta_t = (rewards[i] + self.gamma * values[i + 1].detach() -
                       values[i].detach())
            gae = gae * self.gamma * self.tau + delta_t
            policy_loss = policy_loss - (log_probs[i] * gae) - (
                self.entropy_coef * entropies[i])

        # Loss
        loss = (policy_loss + self.value_loss_coef * value_loss)

        # Optimizer step
        self.optimizerStep(self.optimizer, loss)

    def optimizerStep(self, optimizer, loss):
        """Clear gradients, backpropagate ``loss`` and update the parameters."""
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Esempio n. 20

0

Mostra file

def cartpole():
    """Train one DQN brain per agent and log post-action states to CSV files.

    Runs forever: every outer iteration resets the environment, restarts from
    a copy of the very first observation and steps until the environment
    returns no observations or the first agent reports a terminal step.
    Observation entry 12 is used (minus one) as the index of the brain that
    acts for, and learns from, that observation. The next state of every agent
    that chose action 5 is appended to ``0sahre.csv`` and of every agent that
    chose action 6 to ``0eat.csv``.

    Both CSV files and the environment are closed when the function is left,
    including when it is interrupted or an error is raised.
    """
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=1,
                   use_visual=False,
                   multiagent=True)
    try:
        agents_brain = []
        pathname = "C:/HinaProgramm/testingFolder/Unity Environment"
        num_agents = env.number_agents
        print("Number of agents in enviroment : ", num_agents)
        observation_space = env.observation_space.shape[0]
        print("____________Observation_space______________")
        print(observation_space)
        print("__________Action Space________________")
        action_space = env.action_space.n
        print(action_space)
        # NOTE(review): dqn_solver is never used below; kept in case
        # constructing it has side effects - confirm and remove.
        dqn_solver = DQNSolver(observation_space, action_space)
        for x in range((num_agents)):
            agents_brain.append(DQNSolver(observation_space, action_space))
        print("Length of BrainList:    ", len(agents_brain))
        run = 0
        state = env.reset()
        initialstate = copy.deepcopy(state)
        jk = 1
        sharecount = 0
        eatcount = 0
        # NOTE(review): filecount is incremented below but the files are only
        # opened once, so everything lands in the "0..." files - confirm intent.
        filecount = 0
        with open(str(filecount) + "sahre.csv", 'ab') as f, \
                open(str(filecount) + "eat.csv", 'ab') as J:
            while True:
                run += 1
                env.reset()

                state = copy.deepcopy(initialstate)
                num_agents = int(state[0][-8])
                print("_numagents__________", num_agents)
                print("_____________State _______________")
                print(int(state[0][12]))
                step = 0
                print(
                    "################################This is loop################################# :",
                    jk)
                print("_____Run _______ :", run)
                while True:
                    step += 1
                    env.render()
                    agents_action = [1] * len(state)
                    for x in range(len(state)):
                        state[x] = np.reshape(state[x], [1, observation_space])
                        agents_action[x] = agents_brain[
                            int(state[x][0, 12]) - 1].act(state[x])

                    sharecount += agents_action.count(5)
                    eatcount += agents_action.count(6)

                    state_next, reward, terminal, info = env.step(agents_action)
                    for x in range(len(agents_action)):
                        if (agents_action[x] == 5):
                            new = np.asarray([state_next[x]])
                            np.savetxt(f, new, delimiter=",")
                        if (agents_action[x] == 6):
                            new = np.asarray([state_next[x]])
                            np.savetxt(J, new, delimiter=",")
                    print("_____________STATE_NEXT___________")
                    print(state_next)
                    if len(state_next) == 0:
                        filecount += 1
                        # np.savetxt rejects 0-d input, so the running counts
                        # are written as one-element rows.
                        np.savetxt(f, [sharecount], delimiter=",")
                        np.savetxt(J, [eatcount], delimiter=",")
                        break
                    agents_alive = state_next[0][-16:-8]
                    print("Agents_alive:    ", agents_alive)
                    print("Rewards:    ", reward)
                    num_agents = int(state_next[0][-8])
                    print("Number of agents:   ", num_agents)
                    if terminal[0]:
                        print(
                            "**************************Brain saved******************************"
                        )
                        for x in range(len(agents_brain)):
                            agents_brain[x].model.save(pathname + str(run) +
                                                       "brain" + str(x) + ".h5")

                        jk += 1
                        print(
                            "#####################################Loop is######################## :",
                            jk)
                        filecount += 1
                        break

                    for x in range(len(state_next)):
                        # state[x] already has shape [1, observation_space]
                        # from the action-selection loop above.
                        state_next[x] = np.reshape(state_next[x],
                                                   [1, observation_space])
                        brain = agents_brain[int(state_next[x][0, 12]) - 1]
                        brain.remember(state[x], agents_action[x], reward[x],
                                       state_next[x], terminal[x])
                        brain.experience_replay()
                    state = state_next
    finally:
        env.close()

Esempio n. 21

0

Mostra file

File: tester.py Progetto: adhipradhana/pytorch-maddpg

    # Roll out one episode of at most `max_steps` steps with the MADDPG
    # policy, accumulating rewards and pushing transitions to its memory.

    # reset environment and stack the observations into one array
    obs = env.reset()
    obs = np.stack(obs)

    # convert observation to tensor
    if isinstance(obs, np.ndarray):
        obs = torch.from_numpy(obs).float()

    total_reward = 0.0
    # running sum of the reward vector, one entry per agent
    rr = np.zeros((n_agents, ))
    for i_step in range(max_steps):
        obs = obs.type(FloatTensor)
        actions = maddpg.select_action(obs).data.cpu()
        # the environment is stepped with a plain nested list of actions
        actions_list = actions.tolist()

        obs_, reward, done, _ = env.step(actions_list)

        reward = torch.FloatTensor(reward).type(FloatTensor)
        obs_ = np.stack(obs_)
        obs_ = torch.from_numpy(obs_).float()
        # the final step is stored with no successor observation
        if i_step != max_steps - 1:
            next_obs = obs_
        else:
            next_obs = None

        total_reward += reward.sum()
        rr += reward.cpu().numpy()
        maddpg.memory.push(obs.data, actions, next_obs, reward)

        # NOTE(review): `done` is not checked, so the loop always runs for
        # max_steps iterations.
        obs = next_obs

Esempio n. 22

0

Mostra file

File: gridworld_learn_exmaple.py Progetto: flxh/ml-praktikum

import numpy as np
import sys

from gym_unity.envs import UnityEnv

# Run ten episodes of uniformly random actions in the GridWorld environment
# and print the total reward of each one.
env_name = "../env/GridWorld.x86_64"  # Name of the Unity environment binary to launch
env = UnityEnv(env_name)

try:
    # Examine environment parameters
    print(str(env))

    # Reset the environment
    initial_observation = env.reset()

    for episode in range(10):
        initial_observation = env.reset()
        done = False
        episode_rewards = 0
        while not done:
            observation, reward, done, info = env.step(env.action_space.sample())
            episode_rewards += reward
        print("Total reward this episode: {}".format(episode_rewards))
finally:
    # Always shut the Unity process down, even if an episode fails.
    env.close()

Esempio n. 23

0

Mostra file

File: data_collect_raw.py Progetto: quantumiracle/store2

    # Collect raw environment states while following a loaded TD3 policy with
    # added Gaussian action noise, then pickle the collected states.
    env_name = "./tac_real2"  # Name of the Unity environment binary to launch
    replay_buffer = ReplayBuffer(1e6)
    td3_trainer=TD3_Trainer(replay_buffer,state_dim=state_dim, action_dim=action_dim, hidden_dim=hidden_dim, policy_target_update_interval=3, action_range=20. )
    model_path = './model/td3_all'

    # worker_id is drawn at random from 0..9
    env = UnityEnv(env_name, worker_id=np.random.randint(0,10), use_visual=False, use_both=True)
    td3_trainer.load_model(model_path)
    batch_s = []
    cnt=0
    for eps in range(training_episodes):
        print(eps)
        s,info = env.reset()
        
        for step in range(episode_length):
            # store the raw state, before state_process is applied
            batch_s.append(s)
            s= state_process(s)
            a = td3_trainer.policy_net.get_action(s, deterministic = DETERMINISTIC, explore_noise_scale=0.0)
            # perturb the policy action with N(0, 5) noise per action dimension
            a+=np.random.normal(0, 5, a.shape[0])  
            s_, r, d, _ = env.step(a)
            cnt+=1
            s=s_

            # Disabled alternative: keep only frames whose mean absolute change
            # from s0 exceeds a 0.03 threshold.
            # # print(np.mean(np.abs(np.array(s[7:])-s0)))  # choose 0.03
            # if step >0 and np.mean(np.abs(np.array(s[7:])-s0))>0.03 and s[4]+s[5]+s[6]!=0:  # set a threshold to extract deformation frames
            #     batch_s.append(s)     # dim of s total 280 (select 182 as obs): 0 object index, 1-3 rotation value, 4-6 average contact point position, 7-279 pins positions
            #     cnt+=1

            
    print('total number of samples: ', cnt)
    # NOTE(review): data_file is opened outside this excerpt - presumably a
    # binary file handle; confirm it is closed afterwards.
    pickle.dump(batch_s, data_file)

Esempio n. 24

0

Mostra file

# Roll out up to MAX_T steps using select_action on the bucketed state,
# printing the running total reward; no learning update is applied.
env.render()
# Reset the environment
obv = env.reset()

# the initial state
state_0 = state_to_bucket(obv)
total_reward = 0

for t in range(MAX_T):

    # Select an action
    # NOTE(review): the second argument is 0 - presumably the exploration
    # rate, i.e. fully greedy; confirm against select_action.
    action = select_action(state_0, 0)

    # execute the action
    obv, reward, done, _ = env.step(action)

    # Observe the result
    state = state_to_bucket(obv)
    total_reward += reward
    print(state, t, total_reward)

    # Disabled Q-table update, kept for reference:
    # # Update the Q based on the result
    # best_q = np.amax(q_table[state])
    # q_table[state_0 + (action,)] += learning_rate * (reward + discount_factor * (best_q) - q_table[state_0 + (action,)])

    env.render()
    # pause one second between steps
    time.sleep(1)
    # Setting up for the next iteration
    state_0 = state
    if done:

Esempio n. 25

0

Mostra file

File: dfp.py Progetto: kaiolae/dfp_banana

    # Main training loop: one environment step per iteration.
    for t in range(total_training_timesteps):
        loss = 0
        r_t = 0
        a_t = np.zeros([action_size])

        # Epsilon Greedy
        action_idx = agent.get_action(
            s_t, m_t, goal,
            inference_goal)  #KOE: This is the forward pass through the NN.

        #KOEComment: My unity agent also skips 5 frames between actions, controlled in the Unity interface.
        #The vector space in Unity has 4 branches, with multiple actions in each! Those can also be combined!
        #I need the ANN output to be able to select all combinations.
        #TODO Believe step just wants the index of the action.

        observation, reward, done, info = env.step(action_idx)

        # Force the episode to end once the battery is exhausted.
        if battery_limited and battery < 0:
            done = True
            print("Battery empty. Stopping.")

        if (done):
            print("Game done at timestep ", t)
            # The episode score is food minus poison; track the best one.
            if ((food - poison) > max_reward):
                max_reward = (food - poison)
            GAME += 1
            # Record the per-episode statistics.
            reward_buffer.append(food - poison)
            food_buffer.append(food)
            poison_buffer.append(poison)
            battery_buffer.append(battery)
            num_batteries_buffer.append(num_batteries)

Esempio n. 26

0

Mostra file

import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines import PPO2
from gym_unity.envs import UnityEnv

# Replay a trained PPO2 agent in the Walker environment for 1000 steps.
# The raw environment gets its own name so the factory lambda does not close
# over `env`, which is rebound to the wrappers below.
unity_env = UnityEnv('./envs/Walker')
env = DummyVecEnv([lambda: unity_env])  # The algorithms require a vectorized environment to run
# Automatically normalize the input features
# NOTE(review): the normalization statistics start fresh here - confirm
# whether the statistics from training should be loaded instead.
env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

try:
    # Load the trained agent
    model = PPO2.load('./models/my-model')

    # Enjoy trained agent
    obs = env.reset()
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
finally:
    # Closing the outer wrapper also shuts down the wrapped Unity environment.
    env.close()

Esempio n. 27

0

Mostra file

        # s, info = env.reset()
        # for t in range(100):
        #     # env.render()
        #     s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))

        GLOBAL_PPO.save(model_path)

    # Test mode: load the saved PPO model and run it for `test_episode`
    # episodes of `test_steps` steps each in a visual Unity environment.
    if args.test:
        # worker_id is drawn at random from 0..9
        env = UnityEnv(env_name,
                       worker_id=np.random.randint(0, 10),
                       use_visual=True,
                       use_both=True)
        env.reset()
        GLOBAL_PPO = PPO()
        GLOBAL_PPO.load(model_path)
        test_steps = 200
        test_episode = 10

        for _ in range(test_episode):
            s, info = env.reset()
            ''''''
            # Disabled alternative: read the vector observation from info.
            # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
            # s=vector_s
            # NOTE(review): `done` is not checked, so every episode runs for
            # exactly test_steps steps.
            for t in range(test_steps):
                # env.render()
                s, r, done, info = env.step(GLOBAL_PPO.choose_action(s))
                ''''''
                # vector_s = info["brain_info"].vector_observations[0, :]  # get the vector observation
                # s=vector_s

Esempio n. 28

0

Mostra file

File: gym1.py Progetto: AzR919/Unity_ML_summer

# Time a fixed number of random-action steps in a multi-agent environment.
# NOTE(review): the banner says "no render" but no_graphics=False is passed -
# confirm which one is intended.
print("\nwith no render")

env = UnityEnv(env_name, no_graphics=False, multiagent=True, worker_id=1)
# (disabled outer benchmark loop: res = []; for j in range(10): ...)
print(str(env))
ini_obs = env.reset()

# Single source of truth for the step count, so the printed label always
# matches what was measured.
n_steps = 10000

curr_t = time.time()

for i in range(n_steps):
    # one random action per agent
    actions = [env.action_space.sample() for _ in range(env.number_agents)]
    obs, rew, done, info = env.step(actions)

res = time.time() - curr_t

print("\nTime for {} step".format(n_steps))
print(res)
print("\n\n")
"""
res = []

for j in range(10):

  ini_obs = env.reset()

  curr_t = time.time()

Esempio n. 29

0

Mostra file

File: td3_multiprocess_thre.py Progetto: quantumiracle/store2

        # Evaluate a loaded TD3 policy for 20 episodes and report the
        # per-episode rewards and their average.
        # choose env
        # env_name="./tac_follow_new4"
        # env_name="tac_follow_new4_random02"
        env_name = "tac_follow_new4_random"
        env = UnityEnv(env_name, worker_id=22, use_visual=False, use_both=True)
        td3_trainer.load_model(model_path)
        eps_r = []
        for eps in range(20):
            state, info = env.reset()
            # the episode's first raw state is passed to state_process at
            # every step of that episode
            state0 = state
            state = state_process(state, state0)
            episode_reward = 0

            for step in range(max_steps):
                # no exploration noise is added during evaluation
                action = td3_trainer.policy_net.get_action(
                    state,
                    deterministic=DETERMINISTIC,
                    explore_noise_scale=0.0)
                next_state, reward, done, info = env.step(action)
                # NOTE(review): constant +100 offset on every step reward -
                # presumably mirrors the shaping used in training; confirm.
                reward += 100
                next_state = state_process(next_state, state0)
                episode_reward += reward
                state = next_state
                if done:
                    break
            print('Episode: ', eps, '| Episode Reward: ', episode_reward)
            eps_r.append(episode_reward)

        print(eps_r)
        print(np.average(eps_r))

Esempio n. 30

0

Mostra file

def cartpole():
    """Run a multi-agent DQN loop against the Unity environment ENV_NAME.

    Creates a multi-agent ``UnityEnv``, builds one ``DQNSolver`` and a
    per-agent list of "brains", then loops over runs. Within a run it picks
    an action per agent, steps the environment, updates the bookkeeping of
    which agents are flagged in the observation, stores transitions and
    calls ``experience_replay()``. When ``terminal[0]`` is true it logs the
    step count as the score, saves every entry of ``learning_brain`` to an
    ``.h5`` file and starts the next run.

    No exit from the outer ``while True`` loop is visible in this excerpt,
    so the function does not return as far as can be seen here.
    """
    env = UnityEnv(environment_filename=ENV_NAME,
                   worker_id=2,
                   use_visual=False,
                   multiagent=True)
    score_logger = ScoreLogger(ENV_NAME)
    # Per-run bookkeeping; all of these are re-initialised at the top of
    # every run in the outer loop below.
    agents_brain = []
    agents_action = []
    index_list = []
    agents_alive = []
    count = 0
    count1 = 0
    num_agents = env.number_agents
    print("___________Number of agents in cartpole __")
    print(num_agents)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    print("__dqn solver______")
    print(dqn_solver)
    #model = tf.keras.models.load_model("")
    # NOTE(review): the SAME dqn_solver object is appended for every agent,
    # so all entries of agents_brain alias one solver -- confirm whether
    # independent solvers per agent were intended.
    for x in range((env.number_agents)):
        agents_brain.append(dqn_solver)
        print("______agentbrain____")
        print(agents_brain)
        print("_Agent action___")
        print(agents_action)

    # learning_brain is the list that gets saved to disk at the end of a run;
    # agents_brain is re-created from it (by deepcopy) at the start of each run.
    learning_brain = copy.deepcopy(agents_brain)
    run = 0
    state = env.reset()
    # Snapshot of the very first observation; every run restarts from a copy
    # of this snapshot rather than from the value returned by env.reset().
    initialstate = copy.deepcopy(state)
    while True:
        run += 1
        # The return value of this reset is discarded (see snapshot above).
        env.reset()
        print("____________STATE____________-")
        print(state[0])
        state = copy.deepcopy(initialstate)
        agents_brain = []
        agents_action = []
        index_list = []
        agents_alive = []
        count = 0
        count1 = 0
        # Element -5 of the first agent's observation is used as the agent
        # count. NOTE(review): observation layout is not shown here -- confirm.
        num_agents = int(state[0][-5])
        agents_brain = copy.deepcopy(learning_brain)
        print(learning_brain)
        print(agents_brain)
        print(state)
        #for x in range ( (env.number_agents - 1) ):

        step = 0
        while True:
            step += 1
            env.render()
            print("___________STatte Lenth_______")
            print(len(state))
            print("______selffish___")
            print(state[0])
            # Default action 1 for every agent; only the first
            # num_agents - 1 entries are overwritten by a brain below.
            agents_action = [1] * len(state)
            # Flags as they were before this step; used only for the
            # end-of-run copy into learning_brain.
            copied_agents_alive = copy.deepcopy(agents_alive)
            print("__________numagents_____")
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                agents_action[x] = agents_brain[x].act(state[x])
            print(agents_action)
            # NOTE(review): step() is called with a second positional
            # argument (num_agents); confirm the UnityEnv in use accepts it.
            state_next, reward, terminal, info = env.step(
                agents_action, num_agents)
            print("_______Reward________")
            print(reward)
            print("_____________NEXT STATE LENGTH____________")
            print(len(state_next))
            # An empty observation list ends the current run.
            if (len(state_next) == 0):
                break
            # Slice [-13:-5] of the first observation is used as the per-agent
            # flag vector and element -5 as the updated agent count.
            # NOTE(review): meaning of the flag value 1 is not shown -- confirm.
            agents_alive = state_next[0][-13:-5]
            num_agents = int(state_next[0][-5])
            print("_______num agnets in cartpole________")
            print(num_agents)
            print("_____index list")
            print(index_list)
            print(agents_alive)
            # Drop the flag positions already recorded in index_list (as it
            # stood before this step's update just below).
            agents_alive1 = np.delete(agents_alive, index_list)
            print("_______Alive agent list_______")
            print(agents_alive1)
            flag = False
            # del agents_alive[index_list[x]]
            # Record every index whose flag equals 1 in index_list, skipping
            # indices that are already present (flag marks "already present").
            for x in range(len(agents_alive)):
                if (agents_alive[x] == float(1)):
                    for y in range(len(index_list)):
                        if (index_list[y] == x):
                            flag = True
                    if (flag == False):
                        index_list.append(x)

                flag = False

            # For each remaining position flagged 1, copy that brain into
            # learning_brain at the next recorded index and mark it for
            # removal from the active brain list.
            index_to_remove = []
            for x in range(len(agents_alive1)):
                if (agents_alive1[x] == float(1)):
                    learning_brain[index_list[count]] = agents_brain[x]
                    index_to_remove.append(x)
                    count = count + 1

            agents_brain = [
                i for j, i in enumerate(agents_brain)
                if j not in index_to_remove
            ]
            print("____________AGENTS_BRAIN_________")
            print(len(agents_brain))
            print("_______________Terminal_____________")
            print(terminal)
            # End of run is decided by the first agent's terminal flag only.
            if (terminal[0] == True):
                print("Run: " + str(run) + ", exploration: " +
                      str(dqn_solver.exploration_rate) + ", score: " +
                      str(step))
                score_logger.add_score(step, run)
                # NOTE(review): agents_brain may have been shortened above;
                # indexing it with count1 here could go out of range -- confirm.
                for x in range(len(copied_agents_alive)):
                    learning_brain[x] = agents_brain[count1]
                    count1 = count1 + 1
                # Persist every brain, one file per index:
                # "<run>brain<index>.h5".
                for x in range(len(learning_brain)):
                    learning_brain[x].save(str(run) + "brain" + str(x) + ".h5")

                break

            # Store the transition of each acting agent and train on replay.
            for x in range(num_agents - 1):
                state[x] = np.reshape(state[x], [1, observation_space])
                state_next[x] = np.reshape(state_next[x],
                                           [1, observation_space])
                agents_brain[x].remember(state[x], agents_action[x], reward[x],
                                         state_next[x], terminal[x])
                agents_brain[x].experience_replay()
            state = state_next