# Imports are not part of the original snippets; the module paths below are
# assumptions based on the pytorch-maml-rl repository layout.
import json

import gym
import numpy as np
import torch
from tqdm import trange

from maml_rl.baseline import LinearFeatureBaseline
from maml_rl.episode import BatchEpisodes
from maml_rl.samplers import MultiTaskSampler
from maml_rl.utils.helpers import get_policy_for_env, get_input_size


def test_sample(env_name, batch_size, num_tasks, num_steps, num_workers):
    # Environment (instantiated only to read its observation/action spaces)
    env = gym.make(env_name)
    env.close()
    # Policy and Baseline
    policy = get_policy_for_env(env)
    baseline = LinearFeatureBaseline(get_input_size(env))

    sampler = MultiTaskSampler(env_name,
                               {}, # env_kwargs
                               batch_size,
                               policy,
                               baseline,
                               num_workers=num_workers)
    tasks = sampler.sample_tasks(num_tasks=num_tasks)
    train_episodes, valid_episodes = sampler.sample(tasks,
                                                    num_steps=num_steps)
    sampler.close()

    assert len(train_episodes) == num_steps
    assert len(train_episodes[0]) == num_tasks
    assert isinstance(train_episodes[0][0], BatchEpisodes)

    assert len(valid_episodes) == num_tasks
    assert isinstance(valid_episodes[0], BatchEpisodes)
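
# A hedged sketch of driving the test above with pytest. The environment id
# and the sizes are assumptions; importing maml_rl.envs is assumed to register
# the repository's multi-task environments (e.g. '2DNavigation-v0').
import pytest
import maml_rl.envs  # noqa: F401

@pytest.mark.parametrize('num_workers', [1, 2])
def test_sample_small(num_workers):
    test_sample('2DNavigation-v0', batch_size=5, num_tasks=4,
                num_steps=1, num_workers=num_workers)
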
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        # Load the trained weights into the policy
        policy.load_state_dict(state_dict)
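    # Put the parameters in shared memory so the sampler workers can read them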
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []
    # Evaluation loop: adapt the meta-trained policy to each batch of sampled
    # tasks and record pre-/post-adaptation returns
    for batch in trange(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)
        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))
        # get_returns sums each episode's rewards over time and stacks the
        # per-task results; a runnable sketch of it follows this example.

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
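
# For reference, a runnable sketch of the helper referenced above. The
# repository's own get_returns/to_numpy may differ in detail; what matters
# for the logging code is the shape: one row of per-episode returns per task.
def get_returns(episodes):
    # episode.rewards has shape (horizon, batch_size); summing over dim=0
    # gives the undiscounted return of each of the batch_size episodes.
    return np.stack([episode.rewards.sum(dim=0).cpu().numpy()
                     for episode in episodes], axis=0)
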
Example #3
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []

    # Track returns after every inner-loop gradient step (grad0, grad1, ...).
    # Grad_Steps is undefined in the original snippet; tying it to the number
    # of adaptation steps in the config is an assumption.
    Grad_Steps = config['num-steps']
    grad_returns = [[] for _ in range(Grad_Steps)]

    for batch in trange(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        train_episodes, valid_episodes = sampler.sample(
            tasks,
            num_steps=config['num-steps'],
            fast_lr=config['fast-lr'],
            gamma=config['gamma'],
            gae_lambda=config['gae-lambda'],
            device=args.device)

        logs['tasks'].extend(tasks)

        # Record the returns obtained after each inner-loop gradient step
        for i in range(Grad_Steps):
            grad_returns[i].append(get_returns(train_episodes[i]))

        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    # Mean return after each gradient step, followed by the validation return
    for i in range(Grad_Steps):
        logs['grad' + str(i) + '_returns'] = np.concatenate(grad_returns[i],
                                                            axis=0)
    value = [logs['grad' + str(i) + '_returns'].mean()
             for i in range(Grad_Steps)]
    value.append(logs['valid_returns'].mean())
    print(value)
    print(logs['valid_returns'].mean())

    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
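
# Hedged sketch of inspecting the saved logs afterwards; 'results.npz' stands
# in for whatever path was passed as args.output. allow_pickle is required
# because the 'tasks' entry holds Python objects.
def summarize_logs(path='results.npz'):
    data = np.load(path, allow_pickle=True)
    print('pre-adaptation :', data['train_returns'].mean())
    print('post-adaptation:', data['valid_returns'].mean())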