Example no. 1
def main(params):
    # `params` is the parsed CLI configuration (e.g. vars(parser.parse_args())
    # built by the caller); the seed and the agent hyperparameters are assumed
    # to live under the 'seed' and 'agent' keys.
    config = params
    seed = config['seed']

    # env = gym.make(config['env'])
    env = make_env(config['env'])
    env.seed(seed)

    agent = PPO(env, config['agent'])
    tag = config['tag']

    # Initiate the tracker for stats
    tracker = Tracker(
        config['env'],  # env.unwrapped.spec.id,
        tag,
        seed,
        config['agent'],
        ['Epoch', 'Ep_Reward', 'Cost'])

    # Train the agent
    agent.train(tracker,
                n_episodes=config['epochs'],
                n_step=config['stepmax'],
                verbose=config['verbose'],
                params=config['agent'],
                hyperp=config)
Example no. 2
def _thunk():
    env = make_env(
        cube_goal_pose=goal_dict,
        goal_difficulty=goal_difficulty,
        action_space=action_space,
        frameskip=frameskip,
        sim=sim,
        visualization=visualization,
        reward_fn=reward_fn,
        termination_fn=termination_fn,
        initializer=initializer,
        # Make this long enough to ensure that we have "episode_length"
        # steps in the residual_state.
        episode_length=10 * episode_length,
        rank=rank,
        monitor=monitor,
    )
    if domain_randomization:
        env = RandomizedEnvWrapper(env)
    env = ResidualWrapper(env,
                          state_machine,
                          frameskip,
                          max_torque,
                          residual_state,
                          max_length=episode_length)
    env = EpisodeInfo(env)
    env.seed(seed + rank)
    return env
Example no. 3
def init(env_name, args, final_init=True):
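    """Create the requested multi-agent environment and wrap it for training."""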
    if env_name == 'levers':
        env = gym.make('Levers-v0')
        env.multi_agent_init(args.total_agents, args.nagents)
        env = GymWrapper(env)
    elif env_name == 'number_pairs':
        env = gym.make('NumberPairs-v0')
        m = args.max_message
        env.multi_agent_init(args.nagents, m)
        env = GymWrapper(env)
    elif env_name == 'predator_prey':
        env = gym.make('PredatorPrey-v0')
        if args.display:
            env.init_curses()
        env.multi_agent_init(args)
        env = GymWrapper(env)
    elif env_name == 'traffic_junction':
        from ic3net_envs.ic3net_envs.traffic_junction_env import TrafficJunctionEnv
        env = TrafficJunctionEnv()
        #env = gym.make('TrafficJunction-v0')
        if args.display:
            env.init_curses()
        env.multi_agent_init(args)
        env = GymWrapper(env)
    elif env_name == 'starcraft':
        env = gym.make('StarCraftWrapper-v0')
        env.multi_agent_init(args, final_init)
        env = GymWrapper(env.env)
    elif env_name == 'simple_tag':
        env = make_env(env_name, args)
        env = EnvWrapper(env)
    elif env_name == 'simple_spread':
        env = make_env(env_name, args)
        env = EnvWrapper(env)
    else:
        raise RuntimeError("wrong env name")
    return env
Example no. 4
def _init_env(goal_pose_dict, difficulty):
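    # Fixed evaluation settings, forwarded unchanged to make_env.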
    eval_config = {
        'action_space': 'torque_and_position',
        'frameskip': 3,
        'reward_fn': 'competition_reward',
        'termination_fn': 'no_termination',
        'initializer': 'random_init',
        'monitor': False,
        'visualization': True,
        'sim': True,
        'rank': 0
    }

    set_seed(0)
    env = make_env(goal_pose_dict, difficulty, **eval_config)
    return env
Example no. 5
def _init_env(goal_pose_json, difficulty, path=None):
    eval_config = {
        'action_space': 'torque_and_position',
        'frameskip': 3,
        'reward_fn': 'competition_reward',
        'termination_fn': 'no_termination',
        'initializer': 'bo_init',
        'monitor': False,
        'visualization': False,
        'sim': True,
        'rank': 0,
        'episode_length': EPISODE_LEN_SIM
    }

    set_seed(0)
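    # The goal pose arrives as a JSON string; decode it before building the env.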
    goal_pose_dict = json.loads(goal_pose_json)
    env = make_env(goal_pose_dict, difficulty, path=path, **eval_config)
    return env
Example no. 6
def main(args):

    env = make_env(args.scenario)
    n_agents = env.n
    n_actions = env.world.dim_p
    # env = ActionNormalizedEnv(env)
    # env = ObsEnv(env)
    n_states = env.observation_space[0].shape[0]

    torch.manual_seed(args.seed)

    if args.tensorboard and args.mode == "train":
        writer = SummaryWriter(log_dir='runs/' + args.algo + "/" +
                               args.log_dir)

    if args.algo == "bicnet":
        model = BiCNet(n_states, n_actions, n_agents, args)

    if args.algo == "commnet":
        model = CommNet(n_states, n_actions, n_agents, args)

    if args.algo == "maddpg":
        model = MADDPG(n_states, n_actions, n_agents, args)

    print(model)
    model.load_model()

    episode = 0
    total_step = 0

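    # Main loop: run episodes until args.max_episodes, pushing transitions into
    # the replay memory and updating the model at the end of each episode.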
    while episode < args.max_episodes:

        state = env.reset()

        episode += 1
        step = 0
        accum_reward = 0
        rewardA = 0
        rewardB = 0
        rewardC = 0
        while True:

            if args.mode == "train":
                action = model.choose_action(state, noisy=True)
                next_state, reward, done, info = env.step(action)

                step += 1
                total_step += 1
                reward = np.array(reward)

                rew1 = reward_from_state(next_state)
                reward = rew1 + (np.array(reward, dtype=np.float32) / 100.)
                accum_reward += sum(reward)
                rewardA += reward[0]
                rewardB += reward[1]
                rewardC += reward[2]

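                # MADDPG/CommNet store transitions as tensors in their replay
                # memory; the other algo (BiCNet) takes the raw arrays directly.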
                if args.algo == "maddpg" or args.algo == "commnet":
                    obs = torch.from_numpy(np.stack(state)).float().to(device)
                    obs_ = torch.from_numpy(
                        np.stack(next_state)).float().to(device)
                    if step != args.episode_length - 1:
                        next_obs = obs_
                    else:
                        next_obs = None
                    rw_tensor = torch.FloatTensor(reward).to(device)
                    ac_tensor = torch.FloatTensor(action).to(device)
                    if args.algo == "commnet" and next_obs is not None:
                        model.memory.push(obs.data, ac_tensor, next_obs,
                                          rw_tensor)
                    if args.algo == "maddpg":
                        model.memory.push(obs.data, ac_tensor, next_obs,
                                          rw_tensor)
                    obs = next_obs
                else:
                    model.memory(state, action, reward, next_state, done)

                state = next_state

                if args.episode_length < step or (True in done):
                    c_loss, a_loss = model.update(episode)

                    print("[Episode %05d] reward %6.4f" %
                          (episode, accum_reward))
                    if args.tensorboard:
                        writer.add_scalar(tag='agent/reward',
                                          global_step=episode,
                                          scalar_value=accum_reward.item())
                        writer.add_scalar(tag='agent/reward_0',
                                          global_step=episode,
                                          scalar_value=rewardA.item())
                        writer.add_scalar(tag='agent/reward_1',
                                          global_step=episode,
                                          scalar_value=rewardB.item())
                        writer.add_scalar(tag='agent/reward_2',
                                          global_step=episode,
                                          scalar_value=rewardC.item())
                        if c_loss and a_loss:
                            writer.add_scalars('agent/loss',
                                               global_step=episode,
                                               tag_scalar_dict={
                                                   'actor': a_loss,
                                                   'critic': c_loss
                                               })

                    if c_loss and a_loss:
                        print(" a_loss %3.2f c_loss %3.2f" % (a_loss, c_loss),
                              end='')

                    if episode % args.save_interval == 0 and args.mode == "train":
                        model.save_model(episode)

                    env.reset()
                    # model.reset()
                    break
            elif args.mode == "eval":
                action = model.choose_action(state, noisy=False)
                next_state, reward, done, info = env.step(action)
                step += 1
                total_step += 1
                state = next_state
                reward = np.array(reward)
                import time
                time.sleep(0.02)
                env.render()

                rew1 = reward_from_state(next_state)
                reward = rew1 + (np.array(reward, dtype=np.float32) / 100.)
                accum_reward += sum(reward)
                rewardA += reward[0]
                rewardB += reward[1]
                rewardC += reward[2]

                if args.episode_length < step or (True in done):
                    print("[Episode %05d] reward %6.4f " %
                          (episode, accum_reward))
                    env.reset()
                    break

    if args.tensorboard and args.mode == "train":
        writer.close()
Example no. 7
        # End of a planner method (presumably WholeBodyPlanner.plan): build the
        # path and clamp its minimum height before returning it.
        path = Path(cube_path, joint_conf_path, tip_path, grasp)
        path.set_min_height(self.env, path_min_height)
        return path


if __name__ == '__main__':
    from trifinger_simulation.tasks import move_cube
    from env.make_env import make_env

    reward_fn = 'competition_reward'
    termination_fn = 'position_close_to_goal'
    initializer = 'small_rot_init'
    env = make_env(move_cube.sample_goal(-1).to_dict(),
                   4,
                   reward_fn=reward_fn,
                   termination_fn=termination_fn,
                   initializer=initializer,
                   action_space='position',
                   sim=True,
                   visualization=True)

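    # Reset the env once and plan a whole-body path from the observed cube
    # pose to the goal pose.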
    for i in range(1):
        obs = env.reset()

        pos = obs["object_position"]
        quat = obs["object_orientation"]
        goal_pos = obs["goal_object_position"]
        goal_quat = obs["goal_object_orientation"]
        planner = WholeBodyPlanner(env)
        path = planner.plan(pos,
                            quat,
                            goal_pos,
                            goal_quat)  # final argument assumed: the goal orientation read above
Example no. 8
def main(args):

    env = make_env('simple_spread')
    # env = make_env('simple')
    # env = gym.make('Pendulum-v0')
    env = ActionNormalizedEnv(env)
    env = ObsEnv(env)

    kwargs = dict()
    kwargs['config'] = args
    torch.manual_seed(args.seed)

    if args.tensorboard:
        writer = SummaryWriter(log_dir='runs/'+args.log_dir)
    model = BiCNet(14, 2, 3, **kwargs)
    # model = BiCNet(4, 2, 1, **kwargs)

    episode = 0
    total_step = 0

    while episode < args.max_episodes:

        state = env.reset()

        episode += 1
        step = 0
        accum_reward = 0
        rewardA = 0
        rewardB = 0
        rewardC = 0

        prev_reward = np.zeros((3), dtype=np.float32)

        while True:

            # action = agent.random_action()
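            # Act randomly during warmup to fill the replay buffer, then switch
            # to the (noisy) learned policy.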
            if episode > args.warmup:
                action = model.choose_action(state, noisy=True)
            else:
                action = model.random_action()

            next_state, reward, done, info = env.step(action)
            step += 1
            total_step += 1


            reward = np.array(reward)

            # Reward shaping:
            #     - Distance to landmarks

            rew1 = reward_from_state(next_state)
            #if step % 5 == 0:
            #     rew1 -= 0.1
            reward = rew1 + (np.array(reward, dtype=np.float32) / 100.)
            accum_reward += sum(reward)
            rewardA += reward[0]
            rewardB += reward[1]
            rewardC += reward[2]

            if args.render and episode % 100 == 0:
                env.render(mode='rgb_array')
            model.memory(state, action, reward, next_state, done)

            state = next_state

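            # Train once the buffer holds a full batch, every args.steps_per_update env steps.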
            if len(model.replay_buffer) >= args.batch_size and total_step % args.steps_per_update == 0:
                model.prep_train()
                model.train()
                model.prep_eval()

            if args.episode_length < step or (True in done):
                c_loss, a_loss = model.get_loss()
                action_std = model.get_action_std()
                print("[Episode %05d] reward %6.4f eps %.4f" % (episode, accum_reward, model.epsilon), end='')
                if args.tensorboard:
                    writer.add_scalar(tag='agent/reward', global_step=episode, scalar_value=accum_reward.item())
                    writer.add_scalar(tag='agent/reward_0', global_step=episode, scalar_value=rewardA.item())
                    writer.add_scalar(tag='agent/reward_1', global_step=episode, scalar_value=rewardB.item())
                    writer.add_scalar(tag='agent/reward_2', global_step=episode, scalar_value=rewardC.item())
                    writer.add_scalar(tag='agent/epsilon', global_step=episode, scalar_value=model.epsilon)
                    if c_loss and a_loss:
                        writer.add_scalars('agent/loss', global_step=episode, tag_scalar_dict={'actor':a_loss, 'critic':c_loss})
                    if action_std:
                        writer.add_scalar(tag='agent/action_std', global_step=episode, scalar_value=action_std)
                if c_loss and a_loss:
                    print(" a_loss %3.2f c_loss %3.2f" % (a_loss, c_loss), end='')
                if action_std:
                    print(" action_std %3.2f" % (action_std), end='')


                print()
                env.reset()
                model.reset()
                break

    if args.tensorboard:
        writer.close()
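
The main() in Example no. 8 only reads a handful of attributes from args: seed, tensorboard, log_dir, max_episodes, warmup, render, batch_size, steps_per_update and episode_length. A minimal, hypothetical argparse sketch that would supply them could look like the following; the flag names mirror those attributes, and the defaults are placeholders rather than values from the original project.

import argparse

def parse_args():
    # Hypothetical CLI for Example no. 8; names match the attributes read by
    # main(), default values are placeholders.
    parser = argparse.ArgumentParser(description="BiCNet on simple_spread")
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--tensorboard', action='store_true')
    parser.add_argument('--log_dir', type=str, default='bicnet')
    parser.add_argument('--max_episodes', type=int, default=10000)
    parser.add_argument('--warmup', type=int, default=100)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--steps_per_update', type=int, default=100)
    parser.add_argument('--episode_length', type=int, default=25)
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())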