def test_agent(fname, agent, avg=100, seed=43):
    _, env_args = load_args(CONFIG_PATH)
    if fname is not None:
        # if a map file is specified, use it instead of generating a random map
        env_args["fname"] = fname
        env_args["random_map"] = False
    env = gym.make("ScavengerHuntMap-v0", **env_args)
    env.seed(seed)
    dist_list = []
    a = agent(env)
    for i in range(avg):
        print("Running %d/%d" % ((i + 1), avg), end="\r")
        obs = env.reset()
        done = False
        dist = 0
        while not done:
            act = a.next_node(obs)
            cl = env.env.map.get_current_loc()  # current node before stepping (not used below)
            obs, _, done, info = env.step(act)
            dist += info["cost"]
        dist_list.append(dist)
    return sum(dist_list) / avg, np.std(dist_list)
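
# Usage sketch (not part of the original snippet): one way test_agent might be
# called. The map path and the GreedyAgent class are hypothetical placeholders
# for whatever agent implementation exposes a next_node(obs) method.
if __name__ == "__main__":
    mean_dist, std_dist = test_agent("maps/default.json", GreedyAgent, avg=50)
    print("mean cost: %.2f (std %.2f)" % (mean_dist, std_dist))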
Example #2
print('observation space:', env.observation_space)
print('action space:', env.action_space)
env.render()
action = env.action_space.sample()
print(action)
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)
print('nb_actions', env.action_space.n)

env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dense(300))
model.add(Activation('relu'))
model.add(Dense(300))
model.add(Activation('relu'))
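
# Sketch (not from the original snippet) of how a model like the one above is
# usually finished and plugged into keras-rl's DQNAgent; this assumes keras-rl
# is the intended library, which the excerpt does not state explicitly.
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy

model.add(Dense(nb_actions))     # one output per discrete action
model.add(Activation('linear'))  # Q-values are unbounded

memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=100, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)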
Example #3
sys.path.append('./')  # assumes this .py file and the env folder live in the same directory
import env

parser = argparse.ArgumentParser(
    description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
parser.add_argument('--test', dest='test', action='store_false')
args = parser.parse_args()

if __name__ == '__main__':
    # initialize the environment
    env = gym.make(ENV_NAME)
    env = env.unwrapped

    # set random seeds so the run is reproducible
    env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    tf.random.set_seed(RANDOMSEED)

    # define the state dimension, action dimension, and action bound
    s_dim = 50
    a_dim = 50
    a_bound = env.action_space.high

    print('s_dim', s_dim)
    print('a_dim', a_dim)

    # use the DDPG algorithm
    ddpg = DDPG(a_dim, s_dim, a_bound)

    # training section:
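    # Illustrative training-loop sketch (not from the original snippet). The
    # method names choose_action / store_transition / learn, the pointer
    # attribute, and the MAX_EPISODES / MAX_EP_STEPS / MEMORY_CAPACITY / VAR
    # constants are assumptions about what this DDPG class might expose.
    for episode in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        for step in range(MAX_EP_STEPS):
            a = ddpg.choose_action(s)
            # exploration: Gaussian noise clipped to the action bound
            a = np.clip(np.random.normal(a, VAR), -a_bound, a_bound)
            s_, r, done, _ = env.step(a)
            ddpg.store_transition(s, a, r, s_)
            if ddpg.pointer > MEMORY_CAPACITY:
                ddpg.learn()
            s, ep_reward = s_, ep_reward + r
            if done:
                break
        print('Episode: {}  Reward: {:.2f}'.format(episode, ep_reward))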
Example #4
def launch(args):

    rank = MPI.COMM_WORLD.Get_rank()

    t_total_init = time.time()

    # Make the environment
    if args.algo == 'continuous':
        args.env_name = 'FetchManipulate3ObjectsContinuous-v0'
        args.multi_criteria_her = True
    else:
        args.env_name = 'FetchManipulate3Objects-v0'
    env = gym.make(args.env_name)

    # set random seeds for reproducibility
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    np.random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    torch.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())
    if args.cuda:
        torch.cuda.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())

    # get saving paths
    if rank == 0:
        logdir, model_path, bucket_path = init_storage(args)
        logger.configure(dir=logdir)
        logger.info(vars(args))

    args.env_params = get_env_params(env)

    if args.algo == 'language':
        language_goal = get_instruction()
        goal_sampler = GoalSampler(args)
    else:
        language_goal = None
        goal_sampler = GoalSampler(args)

    # Initialize RL Agent
    if args.agent == "SAC":
        policy = RLAgent(args, env.compute_reward, goal_sampler)
    else:
        raise NotImplementedError

    # Initialize Rollout Worker
    rollout_worker = RolloutWorker(env, policy, goal_sampler, args)

    # Main interaction loop
    episode_count = 0
    for epoch in range(args.n_epochs):
        t_init = time.time()

        # setup time_tracking
        time_dict = dict(goal_sampler=0,
                         rollout=0,
                         gs_update=0,
                         store=0,
                         norm_update=0,
                         policy_train=0,
                         lp_update=0,
                         eval=0,
                         epoch=0)

        # log current epoch
        if rank == 0: logger.info('\n\nEpoch #{}'.format(epoch))

        # Cycles loop
        for _ in range(args.n_cycles):

            # Sample goals
            t_i = time.time()
            goals, self_eval = goal_sampler.sample_goal(
                n_goals=args.num_rollouts_per_mpi, evaluation=False)
            if args.algo == 'language':
                language_goal_ep = np.random.choice(
                    language_goal, size=args.num_rollouts_per_mpi)
            else:
                language_goal_ep = None
            time_dict['goal_sampler'] += time.time() - t_i

            # Control biased initializations
            if epoch < args.start_biased_init:
                biased_init = False
            else:
                biased_init = args.biased_init

            # Environment interactions
            t_i = time.time()
            episodes = rollout_worker.generate_rollout(
                goals=goals,                    # list of goal configurations
                self_eval=self_eval,            # whether the agent performs self-evaluations
                true_eval=False,                # these are not offline evaluation episodes
                biased_init=biased_init,        # whether initializations should be biased
                language_goal=language_goal_ep)
            time_dict['rollout'] += time.time() - t_i

            # Goal Sampler updates
            t_i = time.time()
            episodes = goal_sampler.update(episodes, episode_count)
            time_dict['gs_update'] += time.time() - t_i

            # Storing episodes
            t_i = time.time()
            policy.store(episodes)
            time_dict['store'] += time.time() - t_i

            # Updating observation normalization
            t_i = time.time()
            for e in episodes:
                policy._update_normalizer(e)
            time_dict['norm_update'] += time.time() - t_i

            # Policy updates
            t_i = time.time()
            for _ in range(args.n_batches):
                policy.train()
            time_dict['policy_train'] += time.time() - t_i
            episode_count += args.num_rollouts_per_mpi * args.num_workers

        # Updating Learning Progress
        t_i = time.time()
        if goal_sampler.curriculum_learning and rank == 0:
            goal_sampler.update_LP()
        goal_sampler.sync()

        time_dict['lp_update'] += time.time() - t_i
        time_dict['epoch'] += time.time() - t_init
        time_dict['total'] = time.time() - t_total_init

        if args.evaluations:
            if rank == 0: logger.info('\tRunning eval ..')
            # Performing evaluations
            t_i = time.time()
            if args.algo == 'language':
                ids = np.random.choice(np.arange(35), size=len(language_goal))
                eval_goals = goal_sampler.valid_goals[ids]
            else:
                eval_goals = goal_sampler.valid_goals
            episodes = rollout_worker.generate_rollout(
                goals=eval_goals,
                self_eval=True,  # this parameter is overridden by true_eval
                true_eval=True,  # these are offline evaluation episodes
                biased_init=False,
                language_goal=language_goal)

            # Extract the results
            if args.algo == 'continuous':
                results = np.array([e['rewards'][-1] == 3.
                                    for e in episodes]).astype(int)
            elif args.algo == 'language':
                results = np.array([
                    e['language_goal'] in sentence_from_configuration(
                        config=e['ag'][-1], all=True) for e in episodes
                ]).astype(int)
            else:
                results = np.array([
                    str(e['g'][0]) == str(e['ag'][-1]) for e in episodes
                ]).astype(int)
            rewards = np.array([e['rewards'][-1] for e in episodes])
            all_results = MPI.COMM_WORLD.gather(results, root=0)
            all_rewards = MPI.COMM_WORLD.gather(rewards, root=0)
            time_dict['eval'] += time.time() - t_i

            # Logs
            if rank == 0:
                assert len(all_results) == args.num_workers  # MPI test
                av_res = np.array(all_results).mean(axis=0)
                av_rewards = np.array(all_rewards).mean(axis=0)
                global_sr = np.mean(av_res)
                log_and_save(goal_sampler, epoch, episode_count, av_res,
                             av_rewards, global_sr, time_dict)

                # Saving policy models
                if epoch % args.save_freq == 0:
                    policy.save(model_path, epoch)
                    goal_sampler.save_bucket_contents(bucket_path, epoch)
                logger.info('\tEpoch #{}: SR: {}'.format(epoch, global_sr))
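
# Hypothetical entry point (not shown in the excerpt): a launch() function like
# this one is normally started with one process per MPI worker, e.g.
#   mpirun -np <num_workers> python train.py
# get_args is an assumed argparse helper that builds the `args` namespace.
if __name__ == '__main__':
    args = get_args()
    launch(args)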
Example #5

if __name__ == '__main__':
    num_eval = 20
    path = './trained_model/'

    with open(path + 'config.json', 'r') as f:
        params = json.load(f)
    args = SimpleNamespace(**params)

    # Make the environment
    env = gym.make(args.env_name)

    # set random seeds for reproducibility
    args.seed = np.random.randint(1e6)
    env.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    np.random.seed(args.seed + MPI.COMM_WORLD.Get_rank())
    torch.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())
    if args.cuda:
        torch.cuda.manual_seed(args.seed + MPI.COMM_WORLD.Get_rank())

    args.env_params = get_env_params(env)

    goal_sampler = GoalSampler(args)

    eval_goals = goal_sampler.valid_goals
    inits = [None] * len(eval_goals)
    all_results = []

    with open(path + 'inst_to_one_hot.pkl', 'rb') as f:
Example #6
def train_dqn(size, agt, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    env = gym.make('game2048-v0', size=size, norm=FLAGS.norm)
    env.seed(1)

    if FLAGS.norm:
        channels = size * size + 2
    else:
        channels = 1
    agent = model.DQNAgent(size, channels, 4, 0, FLAGS.double_q, FLAGS.dueling)
    if FLAGS.model_file:
        print(f'load {FLAGS.model_file}')
        agent.load(FLAGS.model_file)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 10000
    eps = eps_start
    scores_window = deque(maxlen=WINDOWS_SIZE)
    rewards_window = deque(maxlen=WINDOWS_SIZE)
    scores = []
    sd_name = 'model_%dx%d.checkpoint' % (size, size)

    random = False  # force a random action on the next step when the previous move left the board unchanged
    for trial in range(1, trials + 1):
        obs = env.reset()
        stepno = 0
        rewards = 0
        loss = 0
        while True:
            stepno += 1
            total_steps += 1
            action, _ = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            random = np.all(obs == obs_)
            loss = agent.step(obs, action, reward, obs_, done)
            obs = obs_
            rewards += reward
            if done:
                break

        eps = max(eps_end, eps * eps_decay)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        scores.append(rewards)
        #  env.render()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        status = ('\rEpisode {}\t Steps: {}\t\t Average Reward: {:.2f}\t\t'
                  ' Average Scores: {:.2f}\t loss: {:.2f}\t highest: {}\t'
                  ' eps: {:.4f}').format(trial, total_steps,
                                         np.mean(rewards_window),
                                         np.mean(scores_window), loss,
                                         highest_score, eps)
        print(status, end="")
        if trial % WINDOWS_SIZE == 0:
            # keep the last line of each window on screen with a newline
            print(status)
        if trial % 1000 == 0:
            agent.save(sd_name)

    eval(env, agent, 1000, render=False)
    print(f'steps: {total_steps} avg_score: {total_scores / trials} '
          f'highest_score: {highest_score} at size: {size}')
    plot_score(scores, [])
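
# Side note on the epsilon schedule used above (not part of the original code):
# with eps = max(eps_end, eps * eps_decay) applied once per episode, epsilon
# reaches its floor after roughly log(eps_end / eps_start) / log(eps_decay)
# episodes. With the defaults (1.0 -> 0.05, decay 0.999) that is ~2994 of the
# 10000 trials, so most of training runs at the minimum exploration rate.
import math
print(math.log(0.05 / 1.0) / math.log(0.999))  # ~2994.2 episodes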