def __init__(self, noise_type='uniform', noise_scale=0.0, init_scale=0.0):
    self.noise_type = noise_type
    assert self.noise_type in ['normal', 'uniform']
    self.noise_scale = noise_scale
    self.init_scale = init_scale
    CartPoleEnv.__init__(self)
def run_cartpole_reinforce(args, log_dir="./logs/reinforce"):
    os.makedirs(log_dir, exist_ok=True)
    env = CartPoleEnv()
    agent: PolicyAgent = PolicyAgent(env.observation_space.shape[0], env.action_space.n)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    env = BenchMonitor(env, log_dir, allow_early_resets=True)
    train(env, agent, args)
    return agent, env
def test_random_agent():
    from agentos.agents import RandomAgent
    from gym.envs.classic_control import CartPoleEnv

    environment = CartPoleEnv()
    environment.reset()
    agent = RandomAgent(environment=environment)
    done = agent.advance()
    assert not done, "CartPole never finishes after one random step."
    run_agent(agent)
def run_cartpole_dqn(num_batches=1000, batch_size=32, log_dir="./logs/dqn", seed=0):
    os.makedirs(log_dir, exist_ok=True)
    env = CartPoleEnv()
    env.seed(seed)
    torch.manual_seed(seed)
    agent = CartPoleAgent(env.observation_space, env.action_space)
    from baselines.bench import Monitor as BenchMonitor
    env = BenchMonitor(env, log_dir, allow_early_resets=True)
    train(agent, env, num_batches=num_batches, batch_size=batch_size)
    return agent, env
def get():
    env = CartPoleEnv()
    for i_episode in range(10000):
        observation = env.reset()
        action = chose_action(model=model)
        while True:
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            # reshape the reward: keep the cart near the centre and the pole upright
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)
            if done:
                break
            observation = observation_
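The r1 + r2 shaping above recurs in several of these snippets; a minimal standalone sketch of it, under a hypothetical helper name shaped_reward (not defined in any of the snippets), is:

def shaped_reward(env, observation):
    # Hypothetical helper: reproduces the r1 + r2 shaping used in the snippets,
    # rewarding a cart near the centre (r1) and a pole close to upright (r2);
    # both terms go negative as the state approaches the failure thresholds.
    x, x_dot, theta, theta_dot = observation
    r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
    return r1 + r2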
def save():
    env = CartPoleEnv()
    total_steps = 0
    memory = []
    memory_counter = 0
    for i_episode in range(100):
        observation = env.reset()
        while True:
            env.render()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            memory.append(transition)
            if done:
                break
            observation = observation_
            total_steps += 1
    memory = np.array(memory)
    np.save("memory.npy", memory)
    env.close()
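As a quick check of the file written by save() above, here is a minimal sketch for reading it back and splitting the 10 columns (4 state values, action, shaped reward, 4 next-state values); the variable names are illustrative:

import numpy as np

# Minimal sketch, assuming the column layout produced by save() above.
memory = np.load("memory.npy")   # shape: (num_transitions, 10)
states = memory[:, :4]
actions = memory[:, 4].astype(int)
rewards = memory[:, 5]
next_states = memory[:, 6:]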
def go2():
    env = CartPoleEnv()
    episode_step_counter = 0
    for i_episode in range(10000):
        observation = env.reset()
        step_counter = 0
        while True:
            env.render()
            # randomly choose an action
            action = env.action_space.sample()
            # get the reward given by the environment
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)
            step_counter = step_counter + 1
            if done:
                episode_step_counter += step_counter
                # print("Episode {}: lasted {} steps".format(i_episode, step_counter))
                print("Average steps: {}".format(episode_step_counter / (i_episode + 1)))
                break
    env.close()
def run_cartpole_a2c(args: A2CParams, log_dir="./logs/a2c"):
    os.makedirs(log_dir, exist_ok=True)
    env = CartPoleEnv()
    env = BenchMonitor(env, log_dir, allow_early_resets=True)
    env = CartPoleEnvSelfReset(env)
    # env.seed(params.seed)
    # torch.manual_seed(params.seed)
    agent: CartPoleA2CAgent = CartPoleA2CAgent(env.observation_space, env.action_space, args)
    exp_mem = build_experience_memory(agent, env, args.num_rollout_steps)
    w = World(env, agent, exp_mem)
    with torch.no_grad():
        w.agent.eval()
        gather_exp_via_rollout(w.env, w.agent, w.exp_mem, args.num_rollout_steps)
    optimizer = torch.optim.Adam(agent.parameters(), args.lr)
    for k in tqdm(range(args.num_batches)):
        with torch.no_grad():
            w.agent.eval()
            rollout = collect_experiences_calc_advantage(w, args)
        train_batch(w.agent, rollout, optimizer)
    return agent, env
class TFPolicy(agentos.Policy):
    def __init__(self, tf_model):
        self.tf_model = tf_model
        self.observation_space = CartPoleEnv().observation_space
        self.action_space = CartPoleEnv().action_space

    def compute_action(self, obs):
        assert self.observation_space.contains(obs), obs
        action = self.tf_model(np.array(obs)[np.newaxis])
        env_compatible_action = int(round(action.numpy()[0][0]))
        assert self.action_space.contains(env_compatible_action), env_compatible_action
        return env_compatible_action

    def __deepcopy__(self, memo):
        return TFPolicy(keras.models.clone_model(self.tf_model))
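A minimal usage sketch for TFPolicy, assuming an illustrative Keras model (not from the source) that maps a CartPole observation to a single value in [0, 1], so that compute_action's int(round(...)) yields a valid Discrete(2) action:

from tensorflow import keras
from gym.envs.classic_control import CartPoleEnv

# Illustrative model only: 4-dim observation in, one sigmoid unit out.
model = keras.Sequential([
    keras.layers.Input(shape=(4,)),
    keras.layers.Dense(8, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])
policy = TFPolicy(model)

env = CartPoleEnv()
obs = env.reset()
print(policy.compute_action(obs))  # prints 0 or 1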
def test_order_enforcing():
    """Checks that order enforcing works as expected, raising an error before
    reset is called and not after."""
    # The reason for not using gym.make is that all environments are by default
    # wrapped in the order enforcing wrapper.
    env = CartPoleEnv()
    assert not has_wrapper(env, OrderEnforcing)

    # Assert that order enforcing works for step and render before reset
    order_enforced_env = OrderEnforcing(env)
    assert order_enforced_env._has_reset is False
    with pytest.raises(ResetNeeded):
        order_enforced_env.step(0)
    with pytest.raises(ResetNeeded):
        order_enforced_env.render(mode="rgb_array")
    assert order_enforced_env._has_reset is False

    # Assert that the errors are not raised after reset
    order_enforced_env.reset()
    assert order_enforced_env._has_reset is True
    order_enforced_env.step(0)
    order_enforced_env.render(mode="rgb_array")

    # Assert that disable_render_order_enforcing allows render even though the
    # environment has not been reset
    env = CartPoleEnv()
    env = OrderEnforcing(env, disable_render_order_enforcing=True)
    env.render(mode="rgb_array")  # no assertion error
def go():
    env = CartPoleEnv()
    total_steps = 0
    memory = []
    model = create_model()
    epsilon = 0.9
    memory_counter = 1000
    for i_episode in range(1000):
        observation = env.reset()
        ep_r = 0
        while True:
            env.render()
            if np.random.uniform() < epsilon:
                actions_value = model.predict(np.array([observation]))
                action = np.argmax(actions_value)
            else:
                action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            memory.append(transition)
            if len(memory) > memory_counter:
                xx, yy = get_data(np.array(memory), model)
                print(xx.shape)
                model.fit(xx, yy, epochs=10)
                epsilon = epsilon + 0.00001
                memory = []
                # memory_counter = memory_counter + 5
            ep_r = ep_r + reward
            if done:
                # print(ep_r)
                break
            observation = observation_
            total_steps += 1
    model.save("logs/cp.h5")
    model.summary()
    env.close()
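create_model and get_data are not defined in the go() snippet; purely as an assumption about their shape, a create_model compatible with the model.predict / model.fit calls above could look like this sketch:

from tensorflow import keras


def create_model():
    # Hypothetical stand-in for the undefined helper above: a small MLP mapping
    # the 4-dim CartPole state to one Q-value per action (2 outputs).
    model = keras.Sequential([
        keras.layers.Input(shape=(4,)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(2, activation="linear"),
    ])
    model.compile(optimizer="adam", loss="mse")
    return model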
def test_openai_gym(self):
    self.start_tests(name='openai-gym')
    self.unittest(environment=dict(environment='gym', level='CartPole-v1'), num_episodes=2)

    from gym.envs.classic_control import CartPoleEnv
    self.unittest(
        environment=dict(environment='gym', level=CartPoleEnv(), max_episode_timesteps=100),
        num_episodes=2,
    )
def run_cartpole_reinforce(args, log_dir="./logs/reinforce"):
    os.makedirs(log_dir, exist_ok=True)
    env = CartPoleEnv()
    agent = CartPoleReinforceAgent(env.observation_space.shape[0], env.action_space.n)
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    env = BenchMonitor(env, log_dir, allow_early_resets=True)
    env = CartPoleEnvSelfReset(env)
    exp_mem = build_experience_memory(agent, env, args.num_rollout_steps)
    w = World(env, agent, exp_mem)
    with torch.no_grad():
        w.agent.eval()
        gather_exp_via_rollout(w.env, w.agent, w.exp_mem, args.num_rollout_steps)
    optimizer = torch.optim.Adam(agent.parameters(), args.lr)
    for k in tqdm(range(args.num_batches)):
        with torch.no_grad():
            agent.eval()
            batch = do_rollout(w, args)
        train_batch(agent, batch, optimizer)
    return agent, env
def test_discrete_vectorized_original_equality(self):
    venv = DiscreteVectorizedCartPoleEnv()
    state, action = self.state_action
    action = (action > 0).astype(int)
    dim1, dim2 = self.dims
    venv.state = state
    vobs, vreward, vdone, _ = venv.step(action)
    env = CartPoleEnv()
    for i in range(dim1):
        for j in range(dim2):
            env.reset()
            env.state = state[i, j]
            obs, reward, done, _ = env.step(int(action[i, j, 0]))
            np.testing.assert_allclose(obs, vobs[i, j])
            np.testing.assert_allclose(reward, vreward[i, j])
            np.testing.assert_allclose(done, vdone[i, j])
class CartPoleDictEnvWrapper(gym.Env):
    def __init__(self, max_angle=12, max_num_steps=1000):
        self.env = CartPoleEnv()
        # self.env.theta_threshold_radians = max_angle * 2 * math.pi / 360
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.step_counter = 0
        self.max_num_steps = max_num_steps

    def step(self, action):
        if isinstance(action, numpy.ndarray):
            action = action[0]
        assert isinstance(action, numpy.int64)
        obs, _, done, _ = self.env.step(action)
        self.step_counter += 1
        if self.step_counter % self.max_num_steps == 0:
            done = True
        if done:
            reward = -10.0
            obs = self.env.reset()
        else:
            reward = 0.0
        return {"observation": obs, "reward": reward, "done": int(done)}

    def reset(self):
        obs = self.env.reset()
        return {"observation": obs, "reward": 0.0, "done": int(False)}

    def render(self, mode="human"):
        return self.env.render(mode)

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        return self.env.seed(seed)
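A short usage sketch for CartPoleDictEnvWrapper as defined above; the loop itself is illustrative, and the numpy.int64 cast is only there because step() asserts that exact type:

import numpy

env = CartPoleDictEnvWrapper(max_num_steps=200)
step_result = env.reset()
for _ in range(10):
    action = numpy.int64(env.action_space.sample())  # step() asserts numpy.int64
    step_result = env.step(action)
    print(step_result["observation"], step_result["reward"], step_result["done"])
env.close()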
from tensorforce.agents import Agent
from tensorforce.environments.openai_gym import OpenAIGym
from tensorforce.execution import Runner
from env_gym import SimplePendulumEnv
from gym.envs.classic_control import CartPoleEnv
import os

batch_size = 10
n_step = 2000

# Instantiate the environment
n_env = 12
list_envs = []
# env = OpenAIGym(SimplePendulumEnv())
env = OpenAIGym(CartPoleEnv())

actor_network = [
    dict(type='dense', size=128, activation='relu'),
    dict(type='dense', size=64, activation='relu'),
    dict(type='dense', size=64, activation='relu'),
]
critic_network = [
    dict(type='dense', size=128, activation='relu'),
    dict(type='dense', size=64, activation='relu'),
    dict(type='dense', size=64, activation='relu'),
]

agent = Agent.create(agent='ppo', batch_size=batch_size,
import gym
from gym.envs.classic_control import CartPoleEnv

env = CartPoleEnv()
env = env.unwrapped                 # without this there are many restrictions (e.g. step limits)
print(env.action_space)             # how many actions are available in this environment
print(env.observation_space)        # the shape of the state observation in this environment
print(env.observation_space.high)   # maximum observation values
print(env.observation_space.low)    # minimum observation values
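For the standard CartPole implementation these prints show a Discrete(2) action space and a 4-dimensional Box observation space; the exact string format depends on the gym version, but the values are roughly:

# Discrete(2)
# Box(4,)
# [ 4.8e+00  3.4e+38  4.2e-01  3.4e+38]   # cart position, cart velocity, pole angle (rad), pole angular velocity
# [-4.8e+00 -3.4e+38 -4.2e-01 -3.4e+38]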
        return int(max(0, round(self.nn(np.array(obs)[np.newaxis]).numpy()[0][0])))


class RandomTFAgent(agentos.Agent):
    def __init__(self, environment, policy):
        super().__init__(environment=environment, policy=policy)
        self.ret_vals = []

    def advance(self):
        trajs = agentos.rollout(self.policy, self.environment, max_steps=2000)
        self.ret_vals.append(sum(trajs.rewards))


if __name__ == "__main__":
    from gym.envs.classic_control import CartPoleEnv

    random_nn_agent = RandomTFAgent(
        environment=CartPoleEnv,
        policy=SingleLayerTFPolicy(
            CartPoleEnv().action_space,
            CartPoleEnv().observation_space,
        ),
    )
    agentos.run_agent(random_nn_agent, max_iters=10)
    print(f"Agent done!\n"
          f"Num rollouts: {len(random_nn_agent.ret_vals)}\n"
          f"Avg return: {np.mean(random_nn_agent.ret_vals)}\n"
          f"Max return: {max(random_nn_agent.ret_vals)}\n"
          f"Median return: {np.median(random_nn_agent.ret_vals)}\n")
for i_episode in range(10000):
    observation = env.reset()
    action = chose_action(model=model)
    while True:
        observation_, reward, done, info = env.step(action)
        x, x_dot, theta, theta_dot = observation_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2
        transition = np.hstack((observation, [action, reward], observation_))
        print()


if __name__ == '__main__':
    env = CartPoleEnv()
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)
import numpy as np
import scipy as sp
from gym.envs.classic_control import CartPoleEnv
from util.policy_nn_boltzmann import *
from util.learner_nn import *
from util.util_cartpole import *
import sys

np.set_printoptions(precision=6)
np.set_printoptions(suppress=True)

mdp = CartPoleEnv()
mdp.horizon = 200
agent_policy = nnBoltzmannPolicy(nStateFeatures=4, nActions=2, nHiddenNeurons=64, paramInitMaxVal=0.025)
agent_learner = nnGpomdpLearner(mdp, agent_policy, gamma=0.995)

# eps = collect_pendulum_episodes(mdp, agent_policy, 10, mdp.horizon)
# agent_policy.optimize_gradient(eps, 0.003)

ctlearn(agent_learner, steps=10000, nEpisodes=25, learningRate=0.003, plotGradient=True, printInfo=True)
import datetime
import os
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
from gym.envs.classic_control import CartPoleEnv
from tensorboardX import SummaryWriter

from training.dqn.dqn_agent import Agent
from utility.Scheduler import Scheduler

currentDT = datetime.datetime.now()
print(f'Start at {currentDT.strftime("%Y-%m-%d %H:%M:%S")}')
seed = 5
# np.random.seed(seed)
env = CartPoleEnv()  # gym.make("CartPole-v0")
env.seed(seed)
np.random.seed(seed)
state_size = 4
action_size = 2
STARTING_BETA = 0.6  # the higher, the more it decreases the influence of high-TD transitions
ALPHA = 0.6  # the higher, the more aggressive the sampling towards high-TD transitions
EPS_DECAY = 0.2
MIN_EPS = 0.01
current_time = currentDT.strftime('%b%d_%H-%M-%S')
comment = f"alpha={ALPHA}, min_eps={MIN_EPS}, eps_decay={EPS_DECAY}"
log_dir = os.path.join('../runs', current_time + '_' + comment)
os.mkdir(log_dir)
print(f"logging to {log_dir}")
writer = SummaryWriter(log_dir=log_dir)
def run_task(snapshot_config, *_):
    """Set up environment and algorithm and run the task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.
            If None, it will create one with default settings.
        _ : Unused parameters
    """
    th = 1.8
    g_max = 0.1
    # delta = 1e-7
    if args.env == 'CartPole':
        # CartPole
        env = TfEnv(normalize(CartPoleEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 5000
        max_length = 100
        n_timestep = 5e5
        n_counts = 5
        name = 'CartPole'
        grad_factor = 5
        th = 1.2
        # batch size 1:
        # lr = 0.1
        # w = 2
        # c = 50
        # batch size 50:
        lr = 0.75
        c = 3
        w = 2
        discount = 0.995
        path = './init/CartPole_policy.pth'
    if args.env == 'Walker':
        # Walker2d
        env = TfEnv(normalize(Walker2dEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 2
        c = 12
        grad_factor = 6
        discount = 0.999
        name = 'Walk'
        path = './init/Walk_policy.pth'
    if args.env == 'HalfCheetah':
        env = TfEnv(normalize(HalfCheetahEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 500
        n_timestep = 1e7
        n_counts = 5
        lr = 0.6
        w = 1
        c = 4
        grad_factor = 5
        th = 1.2
        g_max = 0.06
        discount = 0.999
        name = 'HalfCheetah'
        path = './init/HalfCheetah_policy.pth'
    if args.env == 'Hopper':
        # Hopper
        env = TfEnv(normalize(HopperEnv()))
        runner = LocalRunner(snapshot_config)
        batch_size = 50000
        max_length = 1000
        th = 1.5
        n_timestep = 1e7
        n_counts = 5
        lr = 0.75
        w = 1
        c = 3
        grad_factor = 6
        g_max = 0.15
        discount = 0.999
        name = 'Hopper'
        path = './init/Hopper_policy.pth'

    for i in range(n_counts):
        # print(env.spec)
        if args.env == 'CartPole':
            policy = CategoricalMLPPolicy(env.spec,
                                          hidden_sizes=[8, 8],
                                          hidden_nonlinearity=torch.tanh,
                                          output_nonlinearity=None)
        else:
            policy = GaussianMLPPolicy(env.spec,
                                       hidden_sizes=[64, 64],
                                       hidden_nonlinearity=torch.tanh,
                                       output_nonlinearity=None)
        policy.load_state_dict(torch.load(path))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = MBPG_HA(env_spec=env.spec,
                       env=env,
                       env_name=name,
                       policy=policy,
                       baseline=baseline,
                       max_path_length=max_length,
                       discount=discount,
                       grad_factor=grad_factor,
                       policy_lr=lr,
                       c=c,
                       w=w,
                       th=th,
                       g_max=g_max,
                       n_timestep=n_timestep,
                       batch_size=batch_size,
                       center_adv=True,
                       # delta=delta,
                       # decay_learning_rate=d_lr,
                       )
        runner.setup(algo, env)
        runner.train(n_epochs=100, batch_size=batch_size)
parser = argparse.ArgumentParser(
    description="Run reinforce with a simple TF policy on gym CartPole. "
    "One rollout per call to agent.advance(), "
    "200 steps per rollout.",
)
parser.add_argument(
    "max_iters",
    type=int,
    metavar="MAX_ITERS",
    help="How many times to call advance() on agent.",
)
parser.add_argument("--rollouts_per_iter", type=int, default=1)
parser.add_argument("--max_steps_per_rollout", type=int, default=200)
parser.add_argument("--discount_rate", type=float, default=0.9)
args = parser.parse_args()

reinforce_agent = ReinforceAgent(
    CartPoleEnv(),
    TwoLayerTFPolicy(),
    rollouts_per_iter=args.rollouts_per_iter,
    max_steps_per_rollout=args.max_steps_per_rollout,
    discount_rate=args.discount_rate,
)
agentos.run_agent(
    reinforce_agent,
    max_iters=args.max_iters,
)
print("Agent done!")
if reinforce_agent.ret_vals:
    print(f"Num rollouts: {len(reinforce_agent.ret_vals)}\n"
          f"Avg return: {np.mean(reinforce_agent.ret_vals)}\n"
          f"Max return: {max(reinforce_agent.ret_vals)}\n"
          f"Median return: {np.median(reinforce_agent.ret_vals)}\n")