Example 1
def worker(id, sac_trainer, ENV, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size,
           explore_steps, update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC, hidden_dim, model_path):
    '''
    the function for sampling with multi-processing
    '''

    with torch.cuda.device(id % torch.cuda.device_count()):
        sac_trainer.to_cuda()

        print(sac_trainer, replay_buffer)  # the sac_trainer objects are not the same, but all networks and optimizers inside them are shared; the replay buffer is the same one.
        if ENV == 'Reacher':
            NUM_JOINTS = 2
            LINK_LENGTH = [200, 140]
            INI_JOING_ANGLES = [0.1, 0.1]

            SCREEN_SIZE = 1000
            SPARSE_REWARD = False
            SCREEN_SHOT = False
            action_range = 10.0

            env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                          ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430], render=True, change_goal=False)
            action_dim = env.num_actions
            state_dim  = env.num_observations

        elif ENV == 'Pendulum':
            env = NormalizedActions(gym.make("Pendulum-v0"))
            action_dim = env.action_space.shape[0]
            state_dim  = env.observation_space.shape[0]
            action_range = 1.0

        frame_idx = 0
        rewards = []
        # training loop
        for eps in range(max_episodes):
            episode_reward = 0
            if ENV == 'Reacher':
                state = env.reset(SCREEN_SHOT)
            elif ENV == 'Pendulum':
                state = env.reset()
            
            for step in range(max_steps):
                if frame_idx > explore_steps:
                    action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
                else:
                    action = sac_trainer.policy_net.sample_action()
        
                try:
                    if ENV == 'Reacher':
                        next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT)
                    elif ENV == 'Pendulum':
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)
                    break  # next_state is undefined after an interrupt, so end the episode here

                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

                # if len(replay_buffer) > batch_size:
                if replay_buffer.get_length() > batch_size:
                    for i in range(update_itr):
                        _ = sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY,
                                                target_entropy=-1. * action_dim)

                if eps % 10 == 0 and eps > 0:
                    # plot(rewards, id)
                    sac_trainer.save_model(model_path)

                if done:
                    break
            print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ', episode_reward)
            # if len(rewards) == 0:
            #     rewards.append(episode_reward)
            # else:
            #     rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)
            rewards_queue.put(episode_reward)

        sac_trainer.save_model(model_path)
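
The fragment above only defines the sampling worker. For context, here is a minimal, hypothetical launch sketch; it assumes the surrounding project provides SAC_Trainer, a process-shared ReplayBuffer, and the worker function above, and every constructor signature, helper (e.g. share_memory), and hyperparameter value below is an illustrative assumption rather than the author's code.

# Hypothetical launch script (a sketch, not the author's code).
# SAC_Trainer, ReplayBuffer and worker are assumed to come from the surrounding project.
import torch.multiprocessing as mp

if __name__ == '__main__':
    mp.set_start_method('spawn', force=True)   # required when CUDA is used inside child processes

    replay_buffer = ReplayBuffer(int(1e6))     # assumed: a buffer shared across processes
    sac_trainer = SAC_Trainer(replay_buffer, hidden_dim=512, action_range=1.)  # assumed constructor
    sac_trainer.share_memory()                 # assumed helper: put network weights in shared memory

    rewards_queue = mp.Queue()                 # workers push one return per finished episode
    processes = []
    for i in range(4):                         # number of parallel samplers
        p = mp.Process(target=worker,
                       args=(i, sac_trainer, 'Pendulum', rewards_queue, replay_buffer,
                             1000, 150, 64, 200, 1, 1, True, False, 512, './model/sac'))
        p.daemon = True
        p.start()
        processes.append(p)

    rewards = []
    for _ in range(100):                       # read a fixed number of reported episode returns
        rewards.append(rewards_queue.get())    # blocks until some worker finishes an episode

    for p in processes:
        p.terminate()                          # stop the daemon samplers once enough data is collected

The point of the pattern is that the networks live in shared memory while each process owns its own environment instance, so updates performed by any worker are immediately visible to the others.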
Example 2
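(Fragment: the lines below are the body of an outer episode loop; i_episode, env, qt_opt, replay_buffer, and episode_rewards are assumed to be defined earlier in the source file.)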
            if ENV == 'Reacher':
                state = env.reset(SCREEN_SHOT)
            elif ENV == 'Pendulum':
                state = env.reset()
            episode_reward = 0

            for step in range(max_steps):
                # action = qt_opt.policy.act(state)
                action = qt_opt.cem_optimal_action(state)
                if ENV == 'Reacher':
                    next_state, reward, done, _ = env.step(
                        action, SPARSE_REWARD, SCREEN_SHOT)
                elif ENV == 'Pendulum':
                    next_state, reward, done, _ = env.step(action)
                    env.render()
                episode_reward += reward
                replay_buffer.push(state, action, reward, next_state, done)
                state = next_state

            if len(replay_buffer) > batch_size:
                qt_opt.update(batch_size)
                qt_opt.save_model(model_path)

            episode_rewards.append(episode_reward)

            if i_episode % 10 == 0:
                plot(episode_rewards)

            print('Episode: {}  | Reward:  {}'.format(i_episode,
                                                      episode_reward))
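
In the QT-Opt loop above, qt_opt.cem_optimal_action(state) selects actions by maximising the learned Q-function with the cross-entropy method instead of sampling from a policy network. The sketch below illustrates that selection step in isolation; the q_net(state_batch, action_batch) interface and all hyperparameters are assumptions for illustration, not the project's actual implementation.

import numpy as np
import torch


def cem_action(q_net, state, action_dim, n_iter=3, pop_size=64, elite_frac=0.1):
    # Cross-entropy method: iteratively refit a Gaussian over actions to the
    # candidates with the highest Q(state, action).
    mean = np.zeros(action_dim)
    std = np.ones(action_dim)
    n_elite = max(1, int(pop_size * elite_frac))
    state_batch = torch.as_tensor(state, dtype=torch.float32).repeat(pop_size, 1)
    for _ in range(n_iter):
        samples = np.random.randn(pop_size, action_dim) * std + mean      # candidate actions
        with torch.no_grad():
            q = q_net(state_batch, torch.as_tensor(samples, dtype=torch.float32)).squeeze(-1)
        elite_idx = torch.topk(q, n_elite).indices.numpy()                # best candidates
        elites = samples[elite_idx]
        mean, std = elites.mean(axis=0), elites.std(axis=0) + 1e-6        # refit the Gaussian
    return mean                                                           # greedy action estimate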
def worker(id):  # threads can read the global variables defined in the main script
    '''
    the function for sampling with multi-threading
    '''
    print(sac_trainer, replay_buffer)
    if ENV == 'Reacher':
        env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                      ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430], render=True, change_goal=False)

    elif ENV == 'Pendulum':
        env = NormalizedActions(gym.make("Pendulum-v0"))
    print(env)
    frame_idx = 0
    rewards = []
    # training loop
    for eps in range(max_episodes):
        episode_reward = 0
        if ENV == 'Reacher':
            state = env.reset(SCREEN_SHOT)
        elif ENV == 'Pendulum':
            state = env.reset()

        for step in range(max_steps):
            if frame_idx > explore_steps:
                action = sac_trainer.policy_net.get_action(
                    state, deterministic=DETERMINISTIC)
            else:
                action = sac_trainer.policy_net.sample_action()

            try:
                if ENV == 'Reacher':
                    next_state, reward, done, _ = env.step(
                        action, SPARSE_REWARD, SCREEN_SHOT)
                elif ENV == 'Pendulum':
                    next_state, reward, done, _ = env.step(action)
                    env.render()
            except KeyboardInterrupt:
                print('Finished')
                sac_trainer.save_model(model_path)
                break  # next_state is undefined after an interrupt, so end the episode here

            replay_buffer.push(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward
            frame_idx += 1

            if len(replay_buffer) > batch_size:
                for i in range(update_itr):
                    _ = sac_trainer.update(batch_size,
                                           reward_scale=10.,
                                           auto_entropy=AUTO_ENTROPY,
                                           target_entropy=-1. * action_dim)

            if eps % 10 == 0 and eps > 0:
                plot(rewards, id)
                sac_trainer.save_model(model_path)

            if done:
                break
        print('Episode: ', eps, '| Episode Reward: ', episode_reward)
        # if len(rewards) == 0: rewards.append(episode_reward)
        # else: rewards.append(rewards[-1]*0.9+episode_reward*0.1)
    sac_trainer.save_model(model_path)
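
The threaded variant relies on module-level globals rather than arguments. A minimal, hypothetical launch sketch follows; it assumes sac_trainer, replay_buffer, ENV, and the hyperparameters read inside worker are already defined at module level in the same script.

# Hypothetical launch code for the threaded worker (a sketch, not the author's code).
import threading

threads = [threading.Thread(target=worker, args=(i,)) for i in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()     # wait for every sampler thread to run its episodes

Because of Python's GIL, these threads interleave environment stepping rather than run truly in parallel; that limitation is what the multiprocessing version in Example 1 works around.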