def tests():
    parser_tests()
    a = Actor(global_env, Script([Method('method1', [], 'Hurary'),
                                  Method('method2:', ['x'], 'x')]))
    n = Number(3)
    env = Env(None, {'a': a, 'n': n})
    test_expr(env, 'a', a)
    should_raise(env, 'b', 'Unbound')
    should_raise(env, 'a method1', 'Unbound')
    should_raise(env, 'a foo', 'No matching method')
    test_expr(env, 'a method2: a', a)
    test_expr(env, 'n * n', Number(9))
    test_expr(env, 'a. n', n)
    test_expr(env, '', void_actor)
    test_expr(env, '42', Number(42))
    test_expr(env, '(a method2: 3) * 2', Number(6))
    s1 = Script([Method('run:', ['x'], 'make Foo')])
    s2 = Script([Method('multiply_by:', ['y'], 'x * y')])
    s1.get_method('run:').set_inner('Foo', s2)
    a1 = Actor(global_env, s1)
    env1 = Env(None, {'a1': a1})
    test_expr(env1, '(a1 run: 4) multiply_by: 5', Number(20))
    assert Method('f', [], 'ab. ab').mark_up_body(global_env) == 'ab. ab'
    assert Method('foo:', ['x'], 'x').get_signature() == 'foo: x'
    s = String('Hello, world!')
    env2 = Env(None, {'s': s})
    test_expr(env2, 's', String('Hello, ' + 'world!'))
    test_expr(env2, 's length', Number(13))
    test_expr(env2, 's from: 1 to: 5', String('Hello'))
    test_expr(env2, "'Hello, world!'", s)
    test_expr(env2, 'let x = 2 * 3. x * x', Number(36))
def my_if(env, args):
    cur_env = Env(env.name + '_0', env)
    cond = doeval(args[0], cur_env)
    cur_env = Env(env.name + '_1', env)
    if cond == true:
        rs = doeval(args[1], cur_env)
    else:
        rs = doeval(args[2], cur_env)
    return rs
def dvd(env, args):
    # Built-in division: evaluate the first argument, then divide it by each
    # remaining argument in turn, evaluating every argument in its own child Env.
    idx = 0
    cur_env = Env(env.name + '_' + str(idx), env)
    rs = doeval(args[0], cur_env)
    idx += 1
    for item in args[1:]:
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env).get_value()
        rs /= tmp
        idx += 1
    return rs
def compare(env, args, func):
    # Chained comparison: apply `func` to each adjacent pair of evaluated
    # arguments; return false as soon as one pair fails, true otherwise.
    idx = 0
    cur_env = Env(env.name + '_' + str(idx), env)
    pre = doeval(args[0], cur_env)
    idx += 1
    while idx < len(args):
        item = args[idx]
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env)
        if not func(float(pre.get_value()), float(tmp.get_value())):
            return false
        pre = tmp
        idx += 1
    return true
def my_not(env, args):
    idx = 0
    cur_env = Env(env.name + '_' + str(idx), env)
    rs = doeval(args[0], cur_env)  # is this correct?
    if rs.get_value():
        return false
    return true
def setup_replay(args: argparse.Namespace, env: Env) -> ExperienceReplay:
    D = ExperienceReplay(
        args.experience_size, env.observation_size, env.action_size, args.device
    )
    # Initialise dataset D with random seed episodes
    for _ in range(1, args.seed_episodes + 1):
        observation, done = env.reset(), False
        while not done:
            action = env.sample_random_action()
            next_observation, _, done, info = env.step(action)
            D.append(observation, action, info["reward_dist"], info["reward_coll"], done)
            observation = next_observation
    return D
def my_or(env, args):
    idx = 0
    for item in args:
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env)
        if tmp.get_value():
            return true
        idx += 1
    return false
def add(env, args):
    rs = 0
    idx = 0
    for item in args:
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env)
        rs += tmp.get_value()
        idx += 1
    return rs
def my_list(env, args):
    if len(args) == 0:
        return nil
    # print args
    rs_args = []
    for idx, item in enumerate(args):
        cur_env = Env(env.name + '_' + str(idx), env)
        tmp = doeval(item, cur_env)
        rs_args.append(tmp)
    return My_List(rs_args)
def evaluate(conf):
    agent_func = _import_module(ModelBase, 'models.', conf['model']['name'])
    agent = agent_func(EasyDict(conf['model']))
    envs = []
    for i in range(conf['env'].get('count', 1)):
        envs.append(Env(agent, None, EasyDict(conf['env'])))
    envs[0].play()
    envs[0].close()
def apply(self, env, args):
    # Bind the supplied arguments in a fresh child Env; if fewer arguments than
    # parameters are given, return a partially applied Func instead of evaluating.
    if len(args) > len(self.args):
        raise Exception('too many args!')
    cur_env = Env(env.name + '_0', self.env)
    for symbol, target in zip(self.args, args):
        define(cur_env, [symbol, target])
    if len(args) < len(self.args):
        new_args = copy.copy(self.args[len(args):])
        new_func = Func(new_args, self.body, cur_env)
        return new_func
    rs = nil
    for tmp_body in self.body:
        rs = doeval(tmp_body, cur_env)
    return rs
def train(conf):
    agent_func = _import_module(ModelBase, 'models.', conf['model']['name'])
    agent = agent_func(EasyDict(conf['model']))
    coach_func = _import_module(TrainBase, 'train.', conf['train']['name'])
    coach = coach_func(agent, EasyDict(conf['train']))
    assert agent is not None
    assert coach is not None
    envs = []
    for i in range(conf['env'].get('count', 1)):
        envs.append(Env(agent, coach, EasyDict(conf['env'])))
    envs[0].play()
    envs[0].close()
def doeval(ele, env):
    # Core evaluator: resolve the command (recursively if it is itself an AST),
    # then apply it to the argument expressions.
    if type(ele) == str:
        ele = AST([ele])
    # print env, ele
    cmd = ele.get_cmd()
    args = ele.get_args()
    if isinstance(cmd, AST):
        cur_env = Env(env.name + '_0', env)
        opt = doeval(cmd, cur_env)
    else:
        opt = env.search_symbol(cmd)
    if opt is None:
        print('symbol %s not found!' % cmd)
        return None
    rs = opt.apply(env, args)
    # print 'eval result: %s' % str(rs)
    return rs
tf.set_random_seed(args.seed)
gym_env.seed(args.seed)

network_input_shape = (args.history_window, args.height, args.width)
n_actions = gym_env.action_space.n
observation_shape = gym_env.observation_space.shape

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.Graph().as_default() as g, tf.Session(config=config) as sess:
    K.set_session(sess)
    main = nn(network_input_shape, n_actions)
    target = nn(network_input_shape, n_actions)
    main.compile(optimizer='rmsprop', loss='mse')
    saver = tf.train.Saver()
    load_checkpoint(saver, args.checkpoint_dir, sess)

    buf = Buffer(args.history_window, (args.height, args.width))
    obs_preprocess = lambda i: preprocess(i, args.height, args.width)
    reward_clip = lambda r: np.clip(r, -1.0, 1.0)
    env = Env(gym_env, obs_preprocess, reward_clip)
    ql = DDQN(main, target, args.batch_size, n_actions, args.gamma)

    print("Playing {} games ...".format(args.games))
    for _ in range(args.games):
        play(ql, env, buf, epsilon=args.epsilon)
def cdr(env, args):
    cur_env = Env(env.name + '_0', env)
    pair = doeval(args[0], cur_env)
    if not isinstance(pair, Pair):
        raise Exception('cdr should be used to a pair!')
    return pair.cdr()
sess.run(tf.initialize_variables([t]))

# TODO: Figure out why after a lot of steps the meta
# file becomes so large, for now set max_to_keep=1.
saver = tf.train.Saver(max_to_keep=1)
if args.resume and args.checkpoint_dir:
    load_checkpoint(saver, args.checkpoint_dir, sess)

replay = SimpleExperienceReplay(args.replay_capacity, args.batch_size,
                                args.history_window, (args.height, args.width))
buf = Buffer(args.history_window, (args.height, args.width))
obs_preprocess = lambda o: preprocess(o, args.height, args.width)
reward_clipper = lambda r: np.clip(r, -1.0, 1.0)

# wrap Gym Env so we can easily process observations
# and rewards
env = Env(gym_env, obs_preprocess, reward_clipper)

# Initialize replay with some random experiences
print("Initializing replay with {} experiences".format(args.random_start))
random_start(env, replay, args.random_start)

print("Training DDQN Agent")
if args.monitor_dir:
    if args.resume:
        env.monitor.start(args.monitor_dir, resume=True)
    else:
        env.monitor.start(args.monitor_dir, force=True)

ql = DDQN(main_model, target_model, args.batch_size, n_actions, args.gamma)
ql.update_target_weights()
import tensorflow as tf
import numpy as np
import time

from trainers.ppo_rnn_trainer_test import Trainer
from envs import Pong as Env

env = Env(stacks=4, skips=1, return_seq=True)
trainer = Trainer(env.action_space.n)

running_reward = -21
steps = 0
for e in range(100000):
    s = env.reset()
    ep_score = 0
    done = False
    start = 0.
    h = trainer.agent.initial_hidden()
    while not done:
        a, v, l, h = trainer.action(np.array(s[-1]), h)
        n_s, r, done, info = env.step(a)
        trainer.add(start, s, a, l, v, n_s, r, done)
        trainer.update(h)
        start = 1.
        s = n_s
        ep_score += r
        steps += 1
import tensorflow as tf
import numpy as np
import time

from trainers.ppo_trainer import Trainer
from envs import Atari as Env
from envs import Pong

# env_name = "LunarLanderContinuous-v2"
env_name = "LunarLander-v2"
# env_name = "BipedalWalkerHardcore-v2"
# env_name = "BipedalWalker-v2"
env = Env(env_name)
# env = Pong(img_size=64)
trainer = Trainer(env.action_space.n, env.observation_space)

running_reward = -21
steps = 0
for e in range(100000):
    s = env.reset()
    ep_score = 0
    done = False
    while not done:
        a, v, l = trainer.action(s)
        n_s, r, done, info = env.step(a)
        trainer.add(s, a, l, v, n_s, r, done)
        trainer.update()
def collect_experience(args: argparse.Namespace,
                       env: Env,
                       models: Tuple[nn.Module, nn.Module, nn.Module, nn.Module],
                       planner: nn.Module,
                       explore: bool = True,
                       desc: str = "Collecting episode") -> Dict[str, List[torch.Tensor]]:
    """Collect an episode by applying the policy on the real env."""
    # unpack models
    transition_model, _, _, encoder = models
    # storage
    experience = {
        "belief": [],
        "state": [],
        "action": [],
        "observation": [],
        "reward_dist": [],
        "reward_coll": [],
        "done": []
    }

    with torch.no_grad():
        # h[-1], s[-1], a[-1], o[0]
        belief = torch.zeros(1, args.belief_size, device=args.device)
        posterior_state = torch.zeros(1, args.state_size, device=args.device)
        action = torch.zeros(1, env.action_size, device=args.device)
        observation = env.reset()

        for _ in trange(args.max_episode_length // args.action_repeat, leave=False, desc=desc):
            # h[t] = f(h[t-1], a[t-1])
            # s[t] ~ Prob(s|h[t])
            # action and observation need an extra time dimension because the transition model uses batch operations
            belief, _, _, _, posterior_state, _, _ = transition_model.forward(
                posterior_state,
                action.unsqueeze(dim=0),
                belief,
                encoder(observation.to(device=args.device)).unsqueeze(dim=0))
            belief, posterior_state = belief.squeeze(dim=0), posterior_state.squeeze(dim=0)

            # a[t] = pi(h[t], s[t]) + noise
            # action is bounded by the action range
            action = planner(belief, posterior_state)
            if explore:
                action += args.action_noise * torch.randn_like(action)
            action.clamp_(min=env.action_range[0], max=env.action_range[1])

            # o[t+1] ~ Prob(o|x[t], a[t]), r[t+1], z[t+1]
            next_observation, _, done, info = env.step(action[0].cpu())

            # save h[t], s[t], a[t], o[t], r[t+1], z[t+1]
            experience["belief"].append(belief)
            experience["state"].append(posterior_state)
            experience["action"].append(action.cpu())
            experience["observation"].append(observation)
            experience["reward_dist"].append(info["reward_dist"])
            experience["reward_coll"].append(info["reward_coll"])
            experience["done"].append(done)

            if done:
                break
            else:
                observation = next_observation

    return experience
def is_null(env, args):
    cur_env = Env(env.name + '_0', env)
    rs = doeval(args[0], cur_env)
    if rs == nil:
        return true
    return false
def __call__(self, *args):
    return evaluate(self.body, Env(self.parms, args, self.env))
def cons(env, args):
    cur_env = Env(env.name + '_0', env)
    first = doeval(args[0], cur_env)
    cur_env = Env(env.name + '_1', env)
    second = doeval(args[1], cur_env)
    return Pair(first, second)
def define(env, args):
    symbol = args[0]
    cur_env = Env(env.name + '_0', env)
    target = doeval(args[1], cur_env)
    env.add_symbol(symbol, target)
import tensorflow as tf
import numpy as np
import time

from trainers.ppo_rnn_trainer import Trainer
from envs import Mario as Env

env = Env(stacks=1, skips=2)
trainer = Trainer(env.action_space.n)

running_reward = -21
steps = 0
for e in range(100000):
    s = env.reset()
    ep_score = 0
    done = False
    start = 0.
    h = trainer.agent.initial_hidden()
    while not done:
        a, v, l, h = trainer.action(s, h)
        n_s, r, done, info = env.step(a)
        trainer.add(start, s, h, a, l, v, n_s, r, done)
        trainer.update()
        start = 1.
        s = n_s
        ep_score += r
        steps += 1
def call(self, actor, selector, arguments):
    return self.body.eval(Env(actor.get_env(), bind(self.parameters, arguments)),
                          actor)