"""environment""" env = gym.make(args.env_name) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] is_disc_action = len(env.action_space.shape) == 0 if args.use_running_state: running_state = ZFilter((state_dim,), clip=5) # running list of states that allows to access precise mean and std else: running_state = None """seeding""" np.random.seed(args.seed) torch.manual_seed(args.seed) env.seed(args.seed) policy_net = Policy(state_dim, action_dim, log_std=args.log_std) value_net = Value(state_dim) policy_net.to(device) value_net.to(device) agent_trpo = Agent(env, policy_net, device, running_state=running_state, render=args.render, num_threads=1) def update_params_trpo(batch): # (3) states = torch.from_numpy(np.stack(batch.state)).to(args.dtype).to(device) actions = torch.from_numpy(np.stack(batch.action)).to(args.dtype).to(device) rewards = torch.from_numpy(np.stack(batch.reward)).to(args.dtype).to(device) masks = torch.from_numpy(np.stack(batch.mask)).to(args.dtype).to(device) with torch.no_grad(): values = value_net(states) # estimate value function of each state with NN
state_dim = env.observation_space.shape[0]
is_disc_action = len(env.action_space.shape) == 0
# running_state = ZFilter((state_dim,), clip=5)
running_state = None
# running_reward = ZFilter((1,), demean=False, clip=10)

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)

"""create agent"""
agent = Agent(env, policy_net, device, running_state=running_state,
              render=args.render, num_threads=args.num_threads)
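Once the agent is created, training typically alternates between sampling trajectories with the agent and updating the networks on the collected batch. The sketch below assumes a `collect_samples` method on `Agent`, an `update_params` function along the lines of `update_params_trpo` above, and illustrative argument names (`max_iter_num`, `min_batch_size`, `log_interval`) and log keys; none of these are confirmed by the excerpt.

def main_loop():
    for i_iter in range(args.max_iter_num):
        # sample trajectories until at least min_batch_size transitions are collected
        batch, log = agent.collect_samples(args.min_batch_size)
        update_params(batch)
        if i_iter % args.log_interval == 0:
            print('iter {}\tavg reward {:.2f}'.format(i_iter, log['avg_reward']))

main_loop()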