# Normalize observations with a running mean/std filter, clipped to +/-5.
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    # Fresh networks: discrete policy for discrete action spaces,
    # Gaussian policy (with configurable log-std) otherwise.
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0],
                            log_std=args.log_std)
    value_net = Value(state_dim)
else:
    # Resume from a checkpoint: the pickle holds (policy, value, running_state).
    # NOTE(review): pickle.load on an arbitrary path executes code on load —
    # only use trusted model files.
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))

if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()

# env_dummy was only needed to read the space dimensions.
del env_dummy

"""create agent"""
agent = Agent(env_factory, policy_net, running_state=running_state,
              render=args.render, num_threads=args.num_threads)


def update_params(batch):
    # Convert the sampled trajectory batch (numpy arrays) into torch tensors.
    # NOTE(review): the function body continues past this chunk — this is
    # only the visible prefix of update_params.
    states = torch.from_numpy(np.stack(batch.state))
    actions = torch.from_numpy(np.stack(batch.action))
    rewards = torch.from_numpy(np.stack(batch.reward))
    masks = torch.from_numpy(np.stack(batch.mask).astype(np.float64))
# NOTE(review): this chunk began mid-statement; the enclosing
# "if args.model_path is None:" header below is inferred from the dangling
# "else: ... pickle.load(...)" and the identical construction pattern earlier
# in the file — confirm against the original source.
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0],
                            hidden_size=policy_size, scale_cov=args.scale_cov)
        # policy_net = Policy(state_dim, env_dummy.action_space.shape[0],
        #                     hidden_size=policy_size, log_std=0)
    value_net = Value(state_dim, hidden_size=critic_size)
    # Advantage network takes the concatenated (state, action) dimensions.
    advantage_net = Advantage((state_dim, action_dim),
                              hidden_size=advantage_size)
else:
    # Resume from a checkpoint holding all three networks plus the
    # observation filter.  NOTE(review): pickle.load executes code on load —
    # only use trusted model files.
    policy_net, value_net, advantage_net, running_state = pickle.load(
        open(args.model_path, "rb"))

if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
    advantage_net = advantage_net.cuda()

# env_dummy was only needed to read the space dimensions.
del env_dummy

# One Adam optimizer per network, sharing a single learning rate.
optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(),
                                   lr=args.learning_rate)
optimizer_advantage = torch.optim.Adam(advantage_net.parameters(),
                                       lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 4096

"""create agent"""