def game_config(args):
    if args.game == 'pong':
        actions = [0, 2, 3]
        meanings = ['NOOP', 'UP', 'DOWN']
        environment = gym.make('Pong-v0')
    elif args.game == 'breakout':
        actions = [0, 1, 2, 3]
        meanings = ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
        environment = gym.make('Breakout-v0')
    elif args.game == 'space-invaders':
        actions = [0, 1, 2, 3]
        meanings = ['NOOP', 'FIRE', 'RIGHT', 'LEFT']
        environment = gym.make('SpaceInvaders-v0')
    else:
        raise Exception('Unknown game')
    shape = environment.observation_space.shape
    # Each state stacks nb_frame_state frames at half the native resolution.
    screen = args.nb_frame_state, shape[0] // 2, shape[1] // 2
    return {
        'actions': actions,
        'meanings': meanings,
        'environment': environment,
        'state_shape': screen,
        'preprocessing': utils.preprocessing,
    }
def main(args):
    logging.info(args)

    device = 'gpu' if args.gpu else 'cpu'
    devices = device_lib.list_local_devices()
    num_gpus = len([d for d in devices if '/gpu' in d.name])

    env = gym.make(args.game)
    env = Env(env, resized_width=84, resized_height=84, agent_history_length=4)
    num_actions = len(env.gym_actions)

    global_net = Network(num_actions, -1, 'cpu')
    actor_networks = []
    for t in range(args.threads):
        # Spread workers across GPUs only when there is at most one thread per GPU.
        device_index = 0 if device == 'cpu' else (t if args.threads <= num_gpus else 0)
        n = Network(num_actions, t, device, device_index)
        n.tie_global_net(global_net)
        actor_networks.append(n)

    sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=args.threads,
                                            inter_op_parallelism_threads=args.threads))
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()
    if not os.path.exists(args.checkpoint_dir):
        os.makedirs(args.checkpoint_dir)

    threads = []
    for t, net in enumerate(actor_networks):
        e = Env(gym.make(args.game), net.width, net.height, net.depth)
        w = Worker(t, e, net, sess, saver, args.checkpoint_dir)
        w.start()
        threads.append(w)

    for t in threads:
        t.join()
def worker_func(input_queue, output_queue, device="cpu"): env_pool = [gym.make("RoboschoolHalfCheetah-v1")] # first generation -- just evaluate given single seeds parents = input_queue.get() for seed in parents: net = build_net(env_pool[0], seed).to(device) net.zero_noise(batch_size=1) reward, steps = evaluate(env_pool[0], net, device) output_queue.put((seed, reward, steps)) while True: parents = input_queue.get() if parents is None: break parents.sort() for parent_seeds, children_iter in itertools.groupby(parents, key=lambda s: s[:-1]): batch = list(children_iter) children_seeds = [b[-1] for b in batch] net = build_net(env_pool[0], parent_seeds).to(device) net.set_noise_seeds(children_seeds) batch_size = len(children_seeds) while len(env_pool) < batch_size: env_pool.append(gym.make("RoboschoolHalfCheetah-v1")) rewards, steps = evaluate_batch(env_pool[:batch_size], net, device) for seeds, reward, step in zip(batch, rewards, steps): output_queue.put((seeds, reward, step))
def __init__(self, name, grid_size=None, last_n=None, delta_preprocessing=False):
    # self.base_folder_name = os.path.dirname(os.path.realpath(__file__)).replace('environments', 'solved_environments') + '/' + name
    # TODO: simplify for all Atari games
    self.name = name
    if name == 'breakout':
        self.env = gym.make('Breakout-v0')
    elif name == 'pong':
        self.env = gym.make('Pong-v0')
    elif name == 'gridworld':
        pass
    else:
        self.env = gym.make(name)

    # gym exposes more actions for Breakout and Pong than are actually needed,
    # so life is made easier with a restricted action space of actions 2 and 3
    # ("RIGHT"/"LEFT"); see env.unwrapped.get_action_meanings().
    if name in {'breakout', 'pong'}:
        self.action_space = [2, 3]
    elif name == 'gridworld':
        pass
    else:
        self.action_space = self.env.action_space

    self.resize = tuple(grid_size)
    self.history_length = last_n
    self.history = deque(maxlen=last_n)
    self.prev_x = None
    self.delta_preprocessing = delta_preprocessing
def testGymPreprocessors(self):
    p1 = ModelCatalog.get_preprocessor(
        get_registry(), gym.make("CartPole-v0"))
    self.assertEqual(type(p1), NoPreprocessor)

    p2 = ModelCatalog.get_preprocessor(
        get_registry(), gym.make("FrozenLake-v0"))
    self.assertEqual(type(p2), OneHotPreprocessor)
def get_env(name):
    if 'Acrobot-v0' == name:
        return gym.make('Acrobot-v0')
    elif 'MountainCar-v0' == name:
        return gym.make('MountainCar-v0')
    elif 'CartPole-v0' == name:
        return gym.make('CartPole-v0')
    else:
        raise Exception('No env named %s found' % name)
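# An alternative sketch for the get_env helper above (not from the original
# code): since gym.make already rejects ids it does not know, the explicit
# if/elif dispatch can be reduced to a whitelist check plus a single call.
# The name get_env_v2 and the whitelist are illustrative assumptions.
import gym

def get_env_v2(name, supported=('Acrobot-v0', 'MountainCar-v0', 'CartPole-v0')):
    if name not in supported:
        raise Exception('No env named %s found' % name)
    return gym.make(name)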
def run(args, parser): def create_environment(env_config): # This import must happen inside the method so that worker processes import this code import roboschool return gym.make(args.env) if not args.config: # Load configuration from file config_dir = os.path.dirname(args.checkpoint) # params.json is saved in the model directory during ray training by default config_path = os.path.join(config_dir, "params.json") with open(config_path) as f: args.config = json.load(f) if not args.env: if not args.config.get("env"): parser.error("the following arguments are required: --env") args.env = args.config.get("env") ray.init() register_env(args.env, create_environment) cls = get_agent_class(args.algorithm) config = args.config config["monitor"] = False config["num_workers"] = 1 config["num_gpus"] = 0 agent = cls(env=args.env, config=config) agent.restore(args.checkpoint) num_episodes = int(args.evaluate_episodes) if args.algorithm == "DQN": env = gym.make(args.env) env = wrap_dqn(env, args.config.get("model", {})) else: env = ModelCatalog.get_preprocessor_as_wrapper(gym.make(args.env)) env = wrappers.Monitor(env, OUTPUT_DIR, force=True, video_callable=lambda episode_id: True) all_rewards = [] for episode in range(num_episodes): steps = 0 state = env.reset() done = False reward_total = 0.0 while not done: action = agent.compute_action(state) next_state, reward, done, _ = env.step(action) reward_total += reward steps += 1 state = next_state all_rewards.append(reward_total) print("Episode reward: %s. Episode steps: %s" % (reward_total, steps)) print("Mean Reward:", np.mean(all_rewards)) print("Max Reward:", np.max(all_rewards)) print("Min Reward:", np.min(all_rewards))
def main():
    import roboschool
    import gym
    import chainer

    env = gym.make('CartPole-v0')
    env.reset()
    env.step(env.action_space.sample())

    env = gym.make('RoboschoolHalfCheetah-v1')
    env.reset()
    env.step(env.action_space.sample())

    print("Your environment has been successfully set up!")
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): # Create envs. env = gym.make(env_id) env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(0))) if evaluation: eval_env = gym.make(env_id) eval_env = Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) env = Monitor(env, None) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. logger.info('seed={}, logdir={}'.format(seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) start_time = time.time() training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() logger.info('total runtime: {}s'.format(time.time() - start_time))
def _make_evs(self):
    def make_sess():
        return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))

    local = PolicyEvaluator(
        env_creator=lambda _: gym.make("CartPole-v0"),
        policy_graph=PPOPolicyGraph,
        tf_session_creator=make_sess)
    remotes = [
        PolicyEvaluator.as_remote().remote(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=PPOPolicyGraph,
            tf_session_creator=make_sess)
    ]
    return local, remotes
def main(): env = gym.make(ENV_NAME) agent = Agent(num_actions=env.action_space.n) if TRAIN: # Train mode for _ in range(NUM_EPISODES): terminal = False observation = env.reset() for _ in range(random.randint(1, NO_OP_STEPS)): last_observation = observation observation, _, _, _ = env.step(0) # Do nothing state = agent.get_initial_state(observation, last_observation) while not terminal: last_observation = observation action = agent.get_action(state) observation, reward, terminal, _ = env.step(action) # env.render() processed_observation = preprocess(observation, last_observation) state = agent.run(state, action, reward, terminal, processed_observation) else: # Test mode # env.monitor.start(ENV_NAME + '-test') for _ in range(NUM_EPISODES_AT_TEST): terminal = False observation = env.reset() for _ in range(random.randint(1, NO_OP_STEPS)): last_observation = observation observation, _, _, _ = env.step(0) # Do nothing state = agent.get_initial_state(observation, last_observation) while not terminal: last_observation = observation action = agent.get_action_at_test(state) observation, _, terminal, _ = env.step(action) env.render() processed_observation = preprocess(observation, last_observation) state = np.append(state[1:, :, :], processed_observation, axis=0)
def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        state = env.reset()  # initialize task
        for step in range(STEP):  # Train; STEP = 300
            # e-greedy action for training (includes random exploration)
            action = agent.egreedy_action(state)
            next_state, reward, done, _ = env.step(action)
            # store the transition; once enough samples are buffered, the network starts training
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break

        if episode % 100 == 0:  # Test every 100 episodes
            total_reward = 0
            for i in range(TEST):  # TEST = 20
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    # Differences at test time: 1. action(state) is fully greedy
                    # (network output only, no exploration); 2. perceive() is not
                    # called, so no training happens during evaluation.
                    action = agent.action(state)
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
def main(): env = gym.make('MountainCarContinuous-v0') ft = FeatureTransformer(env, n_components=100) D = ft.dimensions pmodel = PolicyModel(ft, D, [], []) # init = tf.global_variables_initializer() session = tf.InteractiveSession() # session.run(init) pmodel.set_session(session) pmodel.init_vars() gamma = 0.99 if 'monitor' in sys.argv: filename = os.path.basename(__file__).split('.')[0] monitor_dir = './' + filename + '_' + str(datetime.now()) env = wrappers.Monitor(env, monitor_dir) totalrewards, pmodel = random_search(env, pmodel, gamma) print("max reward:", np.max(totalrewards)) # play 100 episodes and check the average avg_totalrewards = play_multiple_episodes(env, 100, pmodel, gamma, print_iters=True) print("avg reward over 100 episodes with best models:", avg_totalrewards) plt.plot(totalrewards) plt.title("Rewards") plt.show()
def __init__(self, thread_id, master): self.thread_id = thread_id threading.Thread.__init__(self, name="thread_%d" % thread_id) self.env = AtariEnv(gym.make(flags.game)) self.master = master # local network if flags.use_lstm: self.local_net = A3CLSTMNet(self.env.state_shape, self.env.action_dim, scope="local_net_%d" % thread_id) else: self.local_net = A3CNet(self.env.state_shape, self.env.action_dim, scope="local_net_%d" % thread_id) # sync network self.sync = self.sync_network(master.shared_net) # accumulate gradients self.accum_grads = self.create_accumulate_gradients() self.do_accum_grads_ops = self.do_accumulate_gradients() self.reset_accum_grads_ops = self.reset_accumulate_gradients() # collect summaries for debugging summaries = list() summaries.append(tf.scalar_summary("entropy/%d" % self.thread_id, self.local_net.entropy)) summaries.append(tf.scalar_summary("policy_loss/%d" % self.thread_id, self.local_net.policy_loss)) summaries.append(tf.scalar_summary("value_loss/%d" % self.thread_id, self.local_net.value_loss)) summaries.append(tf.scalar_summary("total_loss/%d" % self.thread_id, self.local_net.total_loss)) # apply accumulated gradients with tf.device("/gpu:%d" % flags.gpu): self.apply_gradients = master.shared_opt.apply_gradients( zip(self.accum_grads, master.shared_net.get_vars()), global_step=master.global_step) self.summary_op = tf.merge_summary(summaries)
def _thunk(): env = gym.make(env_id) #this prints # print('here') # print (env.unwrapped) # print (env.unwrapped.get_action_meanings()) # fdsadsfa is_atari = hasattr(gym.envs, 'atari') and isinstance(env.unwrapped, gym.envs.atari.atari_env.AtariEnv) #so this overwrites the other env? so ill change it if is_atari: # env = make_atari(env_id) #took this from make_atari assert 'NoFrameskip' in env.spec.id env = NoopResetEnv(env, noop_max=30) env = MaxAndSkipEnv(env, skip=4) env.seed(seed + rank) if log_dir != '': env = bench.Monitor(env, os.path.join(log_dir, str(rank))) if is_atari: warp = False env = wrap_deepmind(env, warp=warp) env = WrapPyTorch(env) return env
def play_game(self,env=None): if env is None: env = gym.make(self.gamename) obs = env.reset() agent = self.agent obs_hist = [] reward_hist = [] action_hist = [] while True: # Execute action = agent.predict(obs) obs, reward, done, info = env.step(action) # Collect variables obs_hist.append(obs) reward_hist.append(reward) action_hist.append(action) if done: break obs_hist = np.array(obs_hist) reward_hist = np.array(reward_hist) action_hist = np.array(action_hist) #print('Game done.') full_result = {'obs' : obs_hist,'action': action_hist, 'reward' : reward_hist} # Process result according to processed_result = self.agent.process_one_game(full_result) return processed_result
def main(): env = gym.make("InvertedPendulumSwingupBulletEnv-v0") env.render(mode="human") pi = SmallReactivePolicy(env.observation_space, env.action_space) while 1: frame = 0 score = 0 restart_delay = 0 obs = env.reset() while 1: time.sleep(0.05) a = pi.act(obs) obs, r, done, _ = env.step(a) score += r frame += 1 still_open = env.render("human") if still_open==False: return if not done: continue if restart_delay==0: print("score=%0.2f in %i frames" % (score, frame)) restart_delay = 60*2 # 2 sec at 60 fps else: restart_delay -= 1 if restart_delay > 0: continue break
def test_coexistence(learn_fn, network_fn): ''' Test if more than one model can exist at a time ''' if learn_fn == 'deepq': # TODO enable multiple DQN models to be useable at the same time # github issue https://github.com/openai/baselines/issues/656 return if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: # TODO make acktr work with recurrent policies # and test # github issue: https://github.com/openai/baselines/issues/660 return env = DummyVecEnv([lambda: gym.make('CartPole-v0')]) learn = get_learn_function(learn_fn) kwargs = {} kwargs.update(network_kwargs[network_fn]) kwargs.update(learn_kwargs[learn_fn]) learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs) make_session(make_default=True, graph=tf.Graph()) model1 = learn(seed=1) make_session(make_default=True, graph=tf.Graph()) model2 = learn(seed=2) model1.step(env.observation_space.sample()) model2.step(env.observation_space.sample())
def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        action_bound = env.action_space.high
        # Ensure action bound is symmetric
        assert (env.action_space.high == -env.action_space.low).all()

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = wrappers.Monitor(
                    env, MONITOR_DIR, video_callable=False, force=True)
            else:
                env = wrappers.Monitor(env, MONITOR_DIR, force=True)

        train(sess, env, actor, critic)

        if GYM_MONITOR_EN:
            env.monitor.close()
def _setup(self, config): env = self._env_id if env: config["env"] = env if _global_registry.contains(ENV_CREATOR, env): self.env_creator = _global_registry.get(ENV_CREATOR, env) else: import gym # soft dependency self.env_creator = lambda env_config: gym.make(env) else: self.env_creator = lambda env_config: None # Merge the supplied config with the class default merged_config = copy.deepcopy(self._default_config) merged_config = deep_update(merged_config, config, self._allow_unknown_configs, self._allow_unknown_subkeys) self.raw_user_config = config self.config = merged_config Agent._validate_config(self.config) if self.config.get("log_level"): logging.getLogger("ray.rllib").setLevel(self.config["log_level"]) # TODO(ekl) setting the graph is unnecessary for PyTorch agents with tf.Graph().as_default(): self._init()
def __init__(self, env_name='HalfCheetah-v1', policy_params=None, num_workers=32, num_deltas=320, deltas_used=320, delta_std=0.02, logdir=None, rollout_length=1000, step_size=0.01, shift='constant zero', params=None, seed=123): logz.configure_output_dir(logdir) logz.save_params(params) env = gym.make(env_name) self.timesteps = 0 self.action_size = env.action_space.shape[0] self.ob_size = env.observation_space.shape[0] self.num_deltas = num_deltas self.deltas_used = deltas_used self.rollout_length = rollout_length self.step_size = step_size self.delta_std = delta_std self.logdir = logdir self.shift = shift self.params = params self.max_past_avg_reward = float('-inf') self.num_episodes_used = float('inf') # create shared table for storing noise print("Creating deltas table.") deltas_id = create_shared_noise.remote() self.deltas = SharedNoiseTable(ray.get(deltas_id), seed = seed + 3) print('Created deltas table.') # initialize workers with different random seeds print('Initializing workers.') self.num_workers = num_workers self.workers = [Worker.remote(seed + 7 * i, env_name=env_name, policy_params=policy_params, deltas=deltas_id, rollout_length=rollout_length, delta_std=delta_std) for i in range(num_workers)] # initialize policy if policy_params['type'] == 'linear': self.policy = LinearPolicy(policy_params) self.w_policy = self.policy.get_weights() else: raise NotImplementedError # initialize optimization algorithm self.optimizer = optimizers.SGD(self.w_policy, self.step_size) print("Initialization of ARS complete.")
def QTable_algo():
    env = gym.make('FrozenLake-v0')

    # initialize table with all zeros
    Q = np.zeros([env.observation_space.n, env.action_space.n])
    # set learning parameters
    lr = .85
    y = .99
    num_episodes = 2000
    # create lists to contain total rewards and steps per episode
    rList = []
    for i in range(num_episodes):
        # reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        # the Q-Table learning algorithm
        while j < 99:
            j += 1
            # choose an action by greedily (with noise) picking from Q-Table
            a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
            # get new state and reward from environment
            s1, r, d, _ = env.step(a)
            # update Q-Table with new knowledge
            Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
            rAll += r
            s = s1
            if d:
                break
        rList.append(rAll)

    print("Score over time: " + str(sum(rList) / num_episodes))
    print("Final Q-Table Values")
    print(Q)
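# A short evaluation sketch to pair with QTable_algo above (not part of the
# original snippet): it assumes the FrozenLake env and the learned Q table are
# available in scope (e.g. returned from a modified QTable_algo), and rolls the
# table out greedily with the same gym 0.x reset/step API.
def evaluate_q_table(env, Q, episodes=100):
    successes = 0
    for _ in range(episodes):
        s = env.reset()
        done = False
        while not done:
            a = np.argmax(Q[s, :])      # greedy action, no exploration noise
            s, r, done, _ = env.step(a)
        successes += r                  # FrozenLake reward is 1 only when the goal is reached
    return successes / episodes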
def main():
    env = gym.make('CartPole-v0')
    ft = FeatureTransformer(env)
    model = Model(env, ft)
    gamma = 0.99

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 500
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(env, model, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps,
                  "avg reward (last 100):", totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
def runEpisodesForAgent (self,num_episodes,numBlocks): ''' Runs numEpisodes of the agent ''' #numBlocks = 3 env = gym.make('BlocksWorld-v0') env.seed(0) env.reset() done = False # num_episodes = 1000 ep_lengths = [] n = 0 while (n<num_episodes): steps =1 done = False env.reset() next_action = [random.randint(0,numBlocks),random.randint(0,numBlocks)] while (done == False): obs, reward, done, empty = env.step (next_action) print ('Next action ' + str(next_action)) print ('Obs ' + str(obs)) next_action = self.agent.sampleAction(obs) #env.render() steps +=1 print (done) print ('New episode') ep_lengths.append(steps) n+=1 print ("Average episode length " + str(sum(ep_lengths) / float(len(ep_lengths)))) #input("Press Enter to continue...") self.ep_lengths = ep_lengths return ep_lengths
def main(): env = gym.make('MountainCarContinuous-v0') ft = FeatureTransformer(env, n_components=100) D = ft.dimensions pmodel = PolicyModel(D, ft, []) vmodel = ValueModel(D, ft, []) init = tf.global_variables_initializer() session = tf.InteractiveSession() session.run(init) pmodel.set_session(session) vmodel.set_session(session) gamma = 0.95 if 'monitor' in sys.argv: filename = os.path.basename(__file__).split('.')[0] monitor_dir = './' + filename + '_' + str(datetime.now()) env = wrappers.Monitor(env, monitor_dir) N = 50 totalrewards = np.empty(N) costs = np.empty(N) for n in range(N): totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma) totalrewards[n] = totalreward if n % 1 == 0: print("episode:", n, "total reward: %.1f" % totalreward, "num steps: %d" % num_steps, "avg reward (last 100): %.1f" % totalrewards[max(0, n-100):(n+1)].mean()) print("avg reward for last 100 episodes:", totalrewards[-100:].mean()) plt.plot(totalrewards) plt.title("Rewards") plt.show() plot_running_avg(totalrewards) plot_cost_to_go(env, vmodel)
def __init__(self, env_id, args):
    import gym
    self.gym = gym.make(env_id)
    self.obs = None
    self.terminal = None

    # OpenCV expects width first and height second
    self.dims = (args.screen_width, args.screen_height)
def train(env_id, num_frames, seed): from baselines.ppo1 import pposgd_simple, cnn_policy import baselines.common.tf_util as U rank = MPI.COMM_WORLD.Get_rank() sess = U.single_threaded_session() sess.__enter__() if rank != 0: logger.set_level(logger.DISABLED) workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = gym.make(env_id) def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank)) env.seed(workerseed) gym.logger.setLevel(logging.WARN) env = wrap_train(env) num_timesteps = int(num_frames / 4 * 1.1) env.seed(workerseed) pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_batch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear' ) env.close()
def __init__(self):
    env = gym.make(ENV)
    self.env = wrappers.Monitor(env, '/tmp/gym/cartpole_dqn', force=True)
    self.num_states = self.env.observation_space.shape[0]
    self.num_actions = self.env.action_space.n
    self.agent = Agent(self.num_states, self.num_actions)
    self.total_step = np.zeros(10)
def evaluation(session, graph_ops, saver): """ Evaluate a model. """ ckpt = tf.train.get_checkpoint_state('./model') if ckpt and ckpt.model_checkpoint_path: print (ckpt.model_checkpoint_path) else: print ("exit") saver.restore(session, ckpt.model_checkpoint_path) print("Restored model weights from ", test_model_path) monitor_env = gym.make(game) monitor_env.monitor.start("qlearning/eval") # Unpack graph ops s = graph_ops["s"] q_values = graph_ops["q_values"] # Wrap env with AtariEnvironment helper class env = AtariEnvironment(gym_env=monitor_env, action_repeat=action_repeat) for i_episode in xrange(num_eval_episodes): s_t = env.get_initial_state() ep_reward = 0 terminal = False while not terminal: monitor_env.render() readout_t = q_values.eval(session=session, feed_dict={s : [s_t]}) action_index = np.argmax(readout_t) s_t1, r_t, terminal, info = env.step(action_index) s_t = s_t1 ep_reward += r_t print(ep_reward) monitor_env.monitor.close()
return self.action_space.sample() class BiasedAgent(object): def __init__(self, action_space): self.action_space = action_space self.action_always = self.action_space.sample() def act(self, observation, reward, done): return self.action_always if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('game', nargs="?", default="CartPole-v0") args = parser.parse_args() env = gym.make(args.game) num_episodes = 20 num_maxstep = 100 agent_id = 1 if agent_id == 1: agent = RandomAgent(env.action_space) elif agent_id == 2: agent = BiasedAgent(env.action_space) reward = 0 done = False for i_episode in range(num_episodes): observation = env.reset() for t in range(num_maxstep):
import paddle import paddle.nn.functional as F import numpy as np import gym batch_size = 256 num_episodes = 100000 memory_size = 1000000 policy_delay = 2 learning_rate = 0.1 gamma = 0.99 ratio = 0.005 exploration_noise = 1e-3 epoch = 0 env = gym.make('Pendulum-v0') env.seed(1) paddle.seed(1) np.random.seed(1) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) min_val = paddle.to_tensor(1e-7).astype('float32') actor = Actor(state_dim, action_dim, max_action) actor_optimizer = paddle.optimizer.RMSProp(parameters=actor.parameters(), learning_rate=learning_rate) Q_net = Q(state_dim, action_dim) Q_optimizer = paddle.optimizer.RMSProp(parameters=Q_net.parameters(),
def test_spec_with_kwargs():
    map_name_value = "8x8"
    env = gym.make("FrozenLake-v1", map_name=map_name_value)
    assert env.spec.kwargs["map_name"] == map_name_value
def __init__(self, name, globalAC):
    self.env = gym.make(GAME).unwrapped
    self.name = name
    self.AC = ACNet(name, globalAC)
    # (tail of an action-sampling function)
    probs = tf.nn.softmax(logits).numpy()
    action = np.random.choice(6, size=1, p=probs.flatten())[0]
    return action


def pre_process(image):
    img = image[35:195]          # crop the playing field
    img = img[::2, ::2, 0]       # downsample by 2 and keep a single channel
    img[img == 144] = 0          # erase background colours
    img[img == 109] = 0
    img[img != 0] = 1            # everything else (paddles, ball) becomes 1
    return img.astype(float).ravel()


if __name__ == "__main__":
    import time
    start = time.process_time()

    env = gym.make('Pong-v4')
    print("Number of observations: {}".format(env.observation_space))
    print("Number of allowed actions: {}".format(env.action_space))
    print(tf.__version__)
    print(tf.keras.__version__)

    optimizer = tf.train.AdamOptimizer(learning_rate)
    model = create_model()
    # model.load_weights('model/agentcycle1750-agent99gamma1kepochs')
    # print(model.summary())
    # print('Model loaded successfully!')
    memory = Memory()

    import skvideo.io
    from pyvirtualdisplay import Display
    display = Display(visible=0)
    display.start()
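    # Quick shape check for pre_process above (a sketch, not in the original):
    # a raw Pong frame is 210x160x3; cropping rows 35:195 and keeping every other
    # pixel of one channel yields an 80x80 image, i.e. a flat vector of 6400 features.
    frame = env.reset()
    features = pre_process(frame)
    print(features.shape)   # (6400,)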
import copy
import random
from collections import deque

import numpy as np
import pandas as pd
import torch
from matplotlib import pylab as plt
import gym
import Box2D

env = gym.make('LunarLander-v2')
env.reset()

# -------------------------------------------------------------------------------------------------
def discretize(val, bounds, n_states):
    # Clamp to the bounds, then map linearly onto [0, n_states - 1].
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(round((n_states - 1) * ((val - bounds[0]) / (bounds[1] - bounds[0]))))
    return discrete_val


def discretize_state(vals, s_bounds, n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=int)
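# A minimal usage sketch for the helpers above (the bounds and bucket counts here
# are illustrative placeholders, not values taken from the original script):
example_bounds = [(-1.0, 1.0)] * 8          # LunarLander has an 8-dimensional observation
example_buckets = [10] * 8                  # 10 buckets per dimension
obs = env.reset()
discrete_obs = discretize_state(obs, example_bounds, example_buckets)
print(discrete_obs)                         # 8 bucket indices, each in [0, 9]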
n_action = 2 actions = np.array([0, 1]) # ---------------------------------------- # Observation # Type: Box(4) # Num | Observation | Min | Max # 0 | Cart Position | -2.4 | 2.4 # 1 | Cart Velocity | -Inf | Inf # 2 | Pole Angle | -41.8 | 41.8 # 3 | Pole Velocity | -Inf | Inf n_input = 4 observation = [] # ---------------------------------------- # Define environment/game env_name = 'CartPole-v0' env = gym.make(env_name) # ---------------------------------------- # Initialize Neural Q-Learn object AI = NeuralQLearner(n_input, actions, batch_size, epsilon, alpha, gamma) #AI.plotQ() # Initialize experience replay object exp = Experience(max_memory) # ---------------------------------------- # Train for e in range(epoch): # Get initial input observation = env.reset() observation_init = observation # Training for single episode step = 0
parser.add_argument('--vis', action='store_true', help='visualize each action or not') parser.add_argument('--discrete', dest='discrete', action='store_true', help='the actions are discrete or not') parser.add_argument('--cuda', dest='cuda', action='store_true') # parser.add_argument('--l2norm', default=0.01, type=float, help='l2 weight decay') # TODO args = parser.parse_args() # StrCat args.output with args.env if args.resume is None: args.output = get_output_folder(args.output, args.env) else: args.output = args.resume if args.env == "KukaGym": env = KukaGymEnv(renders=False, isDiscrete=True) elif args.discrete: env = gym.make(args.env) env = env.unwrapped else: env = NormalizedEnv(gym.make(args.env)) # input random seed if args.seed > 0: np.random.seed(args.seed) env.seed(args.seed) # input states count & actions count print(env.observation_space.shape, env.action_space.shape) nb_states = env.observation_space.shape[0] if args.discrete: nb_actions = env.action_space.n else:
config.gamma = 0.99 config.epsilon = 1 config.epsilon_min = 0.01 config.eps_decay = 500 config.frames = 160000 config.use_cuda = True config.learning_rate = 1e-3 config.max_buff = 1000 config.update_tar_interval = 100 config.batch_size = 128 config.print_interval = 200 config.log_interval = 200 config.win_reward = 198 # CartPole-v0 config.win_break = True env = gym.make(config.env) config.action_dim = env.action_space.n config.state_dim = env.observation_space.shape[0] agent = DDQNAgent(config) if args.train: trainer = Trainer(agent, env, config) trainer.train() elif args.test: if args.model_path is None: print('please add the model path:', '--model_path xxxx') exit(0) tester = Tester(agent, env, args.model_path) tester.test()
import gym
import numpy as np
import matplotlib.pyplot as plt

env = gym.make("CartPole-v0")

gamma = 0.99
beta = 0.00001
alpha = 0.000001
sigma = 0.001

w = np.array([0, 0, 0, 0, 0, 0, 0, 0])
delta_w = np.array([0, 0, 0, 0, 0, 0, 0, 0])
v = np.array([0, 0, 0, 0, 0, 0, 0, 0])
delta_v = np.array([0, 0, 0, 0, 0, 0, 0, 0])


def log(log_message):
    """
    DESCRIPTION:
    - Adds a log message "log_message" to a log file.
    """
    # open the log file and make sure that it's closed properly at the end of the
    # block, even if an exception occurs:
    with open("C:/Users/Fregus/log2.txt", "a") as log_file:
        # write the log message to logfile:
        log_file.write(log_message)
        log_file.write("\n")  # (so the next message is put on a new line)
disk_roll_vel = observations[0] # roll_angle = observations[2] y_linear_speed = observations[4] yaw_angle = observations[5] state_converted = [disk_roll_vel, y_linear_speed, yaw_angle] return state_converted if __name__ == '__main__': rospy.init_node('j2n6s300_gym', anonymous=True, log_level=rospy.WARN) # Create the Gym environment env = gym.make('j2n6s300Test-v3') rospy.loginfo("Gym environment done") # Set the logging system rospack = rospkg.RosPack() pkg_path = rospack.get_path('j2n6s300_ml') outdir = pkg_path + '/training_results' env = wrappers.Monitor(env, outdir, force=True) rospy.loginfo("Monitor Wrapper started") last_time_steps = numpy.ndarray(0) # Loads parameters from the ROS param server # Parameters are stored in a yaml file inside the config directory # They are loaded at runtime by the launch file Alpha = rospy.get_param("/j2n6s300/alpha")
Q_LEARNING_RATE = 1e-3 GAMMA = 0.99 DECAY = 0.995 ACTION_NOISE = 0.1 MINIMAL_SAMPLES = 10000 MAXIMAL_SAMPLES = 1000000 ITERATIONS = 100000 BATCH_SIZE = 64 MAX_EPISODE_LENGTH = 200 SAVE_CHECKPOINT_EVERY = 100 DEMO_EVERY = 100 # Environment env = gym.make("Pendulum-v0") def create_mu_network( name, output_dim, action_max, activation=tf.nn.relu, output_activation=tf.nn.tanh, trainable=True, ): return MLPNetwork([ tf.layers.Dense( units=512, activation=activation, trainable=trainable, name="W",
import copy from numpy import array from PIL import Image from improcess import AtariProcessor from improcess import HistoryStore from policy import GreedyPolicy from policy import UniformRandomPolicy from memhelpers import NNMemStore IMAGE_SIZE = (84, 84) HISTORY_LENGTH = 4 MEM_SIZE = 2000 INIT_MEM_RATIO = 0.5 env = gym.make('BreakoutDeterministic-v0') observation = env.reset() num_actions = env.action_space.n atari_processor = AtariProcessor(IMAGE_SIZE) history_store = HistoryStore(HISTORY_LENGTH, IMAGE_SIZE) greedy_selector = GreedyPolicy() random_selector = UniformRandomPolicy(num_actions) episode_end_flag = False mem_store = NNMemStore(MEM_SIZE, (84, 84, 4)) observation = env.reset() state = atari_processor.state_for_mem(observation) history_store.add_history(state) i = 0 life = False first_step = True
is_save_raw_data: bool = False is_save_analytica_data: bool = True is_save_chart_data: bool = True # program execution settings is_print_episode_idx: bool = True user_input_next_operation: str = "fly" # i frequently got ram issues running with my Geforce 1050. Thus, I disabled it. device = "cpu" #torch.device("cuda" if torch.cuda.is_available() else "cpu") # core procedure start point is_ipython = 'inline' in matplotlib.get_backend() if is_ipython: pass env = gym.make('CartPole-v0').unwrapped env.reset() rgb_array = env.render('rgb_array') processed_screen = CommonUtil.process_screen(rgb_array, device) screen_height = processed_screen.shape[2] screen_width = processed_screen.shape[3] num_of_nn_input_node: int = screen_height * screen_width * 3 tmp_torch_policy_network = BinaryOutputDeepQNetwork_5096_1024_512_128_64_16_8( num_of_nn_input_node).to(device) tmp_torch_target_network = BinaryOutputDeepQNetwork_5096_1024_512_128_64_16_8( num_of_nn_input_node).to(device) deep_q_agent: DeepQAgent = DeepQAgent(epsilon_start, epsilon_end, epsilon_decay, nn_learning_rate,
import gym
import gym_reinmav

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

env = gym.make('quadrotor2d-v0')
# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
# env = DummyVecEnv([lambda: env])

model = PPO2(MlpPolicy, env, verbose=1,
             tensorboard_log="/home/jaeyoung/dev/reinmav-gym/ppo2_quadrotor2d_tensorboard/")
model.learn(total_timesteps=300000, tb_log_name="first_run")

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular() if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='HalfCheetah-v2') parser.add_argument('--hid', type=int, default=256) parser.add_argument('--l', type=int, default=2) parser.add_argument('--gamma', type=float, default=0.99) parser.add_argument('--seed', '-s', type=int, default=0) parser.add_argument('--epochs', type=int, default=50) parser.add_argument('--exp_name', type=str, default='td3') args = parser.parse_args() from spinup.utils.run_utils import setup_logger_kwargs logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed, args.env) td3(lambda: gym.make(args.env), actor_critic=core.MLPActorCritic, ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), gamma=args.gamma, seed=args.seed, epochs=args.epochs, logger_kwargs=logger_kwargs)
def main(): import logging parser = argparse.ArgumentParser() parser.add_argument('processes', type=int) parser.add_argument('--env', type=str, default='CartPole-v0') parser.add_argument('--seed', type=int, default=None) parser.add_argument('--outdir', type=str, default=None) parser.add_argument('--t-max', type=int, default=50) parser.add_argument('--n-times-replay', type=int, default=4) parser.add_argument('--n-hidden-channels', type=int, default=100) parser.add_argument('--n-hidden-layers', type=int, default=2) parser.add_argument('--replay-capacity', type=int, default=5000) parser.add_argument('--replay-start-size', type=int, default=10**3) parser.add_argument('--disable-online-update', action='store_true') parser.add_argument('--beta', type=float, default=1e-2) parser.add_argument('--profile', action='store_true') parser.add_argument('--steps', type=int, default=8 * 10**7) parser.add_argument('--eval-interval', type=int, default=10**5) parser.add_argument('--eval-n-runs', type=int, default=10) parser.add_argument('--reward-scale-factor', type=float, default=1e-2) parser.add_argument('--rmsprop-epsilon', type=float, default=1e-2) parser.add_argument('--render', action='store_true', default=False) parser.add_argument('--lr', type=float, default=7e-4) parser.add_argument('--demo', action='store_true', default=False) parser.add_argument('--load', type=str, default='') parser.add_argument('--logger-level', type=int, default=logging.DEBUG) parser.add_argument('--monitor', action='store_true') parser.add_argument('--truncation-threshold', type=float, default=5) parser.add_argument('--trust-region-delta', type=float, default=0.1) args = parser.parse_args() logging.basicConfig(level=args.logger_level) if args.seed is not None: misc.set_random_seed(args.seed) args.outdir = experiments.prepare_output_dir(args, args.outdir) def make_env(process_idx, test): env = gym.make(args.env) if args.monitor and process_idx == 0: env = gym.wrappers.Monitor(env, args.outdir) # Scale rewards observed by agents if not test: misc.env_modifiers.make_reward_filtered( env, lambda x: x * args.reward_scale_factor) if args.render and process_idx == 0 and not test: misc.env_modifiers.make_rendered(env) return env sample_env = gym.make(args.env) timestep_limit = sample_env.spec.tags.get( 'wrapper_config.TimeLimit.max_episode_steps') obs_space = sample_env.observation_space action_space = sample_env.action_space if isinstance(action_space, spaces.Box): model = acer.ACERSDNSeparateModel( pi=policies.FCGaussianPolicy( obs_space.low.size, action_space.low.size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers, bound_mean=True, min_action=action_space.low, max_action=action_space.high), v=v_functions.FCVFunction(obs_space.low.size, n_hidden_channels=args.n_hidden_channels, n_hidden_layers=args.n_hidden_layers), adv=q_functions.FCSAQFunction( obs_space.low.size, action_space.low.size, n_hidden_channels=args.n_hidden_channels // 4, n_hidden_layers=args.n_hidden_layers), ) else: model = acer.ACERSeparateModel( pi=links.Sequence( L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu, L.Linear(args.n_hidden_channels, action_space.n, wscale=1e-3), SoftmaxDistribution), q=links.Sequence( L.Linear(obs_space.low.size, args.n_hidden_channels), F.relu, L.Linear(args.n_hidden_channels, action_space.n, wscale=1e-3), DiscreteActionValue), ) opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=args.rmsprop_epsilon, alpha=0.99) opt.setup(model) opt.add_hook(chainer.optimizer.GradientClipping(40)) 
replay_buffer = EpisodicReplayBuffer(args.replay_capacity) agent = acer.ACER(model, opt, t_max=args.t_max, gamma=0.99, replay_buffer=replay_buffer, n_times_replay=args.n_times_replay, replay_start_size=args.replay_start_size, disable_online_update=args.disable_online_update, use_trust_region=True, trust_region_delta=args.trust_region_delta, truncation_threshold=args.truncation_threshold, beta=args.beta, phi=phi) if args.load: agent.load(args.load) if args.demo: env = make_env(0, True) eval_stats = experiments.eval_performance( env=env, agent=agent, n_runs=args.eval_n_runs, max_episode_len=timestep_limit) print('n_runs: {} mean: {} median: {} stdev {}'.format( args.eval_n_runs, eval_stats['mean'], eval_stats['median'], eval_stats['stdev'])) else: experiments.train_agent_async(agent=agent, outdir=args.outdir, processes=args.processes, make_env=make_env, profile=args.profile, steps=args.steps, eval_n_runs=args.eval_n_runs, eval_interval=args.eval_interval, max_episode_len=timestep_limit)
# -*- coding: utf-8 -*-
"""
Created on Thu May 31 10:43:03 2018

@author: vw1586
"""

import gym
import numpy as np
import random
import math

## Initialize the "Cart-Pole" environment
env = gym.make('CartPole-v0')

## Defining the environment related constants
# Number of discrete states (buckets) per state dimension
NUM_BUCKETS = (1, 1, 6, 3)  # (x, x', theta, theta')
# Number of discrete actions
NUM_ACTIONS = env.action_space.n  # (left, right)
# Bounds for each discrete state
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
STATE_BOUNDS[1] = [-0.5, 0.5]
STATE_BOUNDS[3] = [-math.radians(50), math.radians(50)]
# Index of the action
ACTION_INDEX = len(NUM_BUCKETS)

## Creating a Q-Table for each state-action pair
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS, ))
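# A discretisation helper in the spirit of the setup above (a sketch, not part of
# the original snippet): it maps a continuous CartPole observation onto a tuple of
# bucket indices compatible with q_table's NUM_BUCKETS + (NUM_ACTIONS,) shape.
def state_to_bucket(state):
    bucket_indices = []
    for i in range(len(state)):
        low, high = STATE_BOUNDS[i]
        if state[i] <= low:
            bucket = 0
        elif state[i] >= high:
            bucket = NUM_BUCKETS[i] - 1
        else:
            # linear scaling of the observation onto the bucket grid
            ratio = (state[i] - low) / (high - low)
            bucket = int(round((NUM_BUCKETS[i] - 1) * ratio))
        bucket_indices.append(bucket)
    return tuple(bucket_indices)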
import gym import numpy as np env = gym.make('SafeUfoReachEnv2-v1') obs = env.reset() for i in range(50): action = np.array([20]) #env.action_space.sample() # print(type(env.ACs))# a tuple # print(f"action{action}") # print(f"env.ACs{env.ACs}") obs, reward, done, info = env.step(action) # print(obs) # print(reward) # print(done) # print(info) if done: # print(f"env.ACs{env.ACs}")\ print(obs) break print(obs) print(np.random.randint(50, int(2 / 0.02) - 1)) # print result # $ obs = env.reset() # {'observation': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.60000000e+02,
import numpy as np import gym import gym_minigrid # from gym_minigrid import wrappers rng = np.random.RandomState(1337) env_list = [e for e in gym.envs.registry.env_specs if e.startswith('MiniGrid')] print(f'{len(env_list)} environments registered') for env_name in env_list: print(f'testing {env_name}') # Load the gym environment env = gym.make(env_name, seed=1337) print(env) env.max_steps = min(env.max_steps, 200) env.reset() env.render('rgb_array') # Verify that the same seed always produces the same environment for i in range(0, 5): seed = 1337 + i env.seed(seed) grid1 = env.grid env.seed(seed) grid2 = env.grid assert grid1 == grid2
SCREEN_SIZE = 1000 SPARSE_REWARD = False SCREEN_SHOT = False action_range = 10.0 env=Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths = LINK_LENGTH, \ ini_joint_angles=INI_JOING_ANGLES, target_pos = [369,430], render=True, change_goal=False) action_space = spaces.Box(low=-1.0, high=1.0, shape=(env.num_actions, ), dtype=np.float32) state_space = spaces.Box(low=-np.inf, high=np.inf, shape=(env.num_observations, )) else: env = NormalizedActions(gym.make(ENV)) action_space = env.action_space state_space = env.observation_space action_range = 1. replay_buffer_size = 5e5 replay_buffer = ReplayBufferLSTM2(replay_buffer_size) # hyper-parameters for RL training max_episodes = 1000 max_steps = 20 if ENV == 'Reacher' else 150 # Pendulum needs 150 steps per episode to learn well, cannot handle 20 frame_idx = 0 batch_size = 2 # each sample contains an episode for lstm policy explore_steps = 0 # for random action sampling in the beginning of training update_itr = 1 hidden_dim = 512
Generate Expert Trajectories from a model """ # env_id = 'NovelGridworld-v2' # model = DQN('MlpPolicy', env_id, verbose=1) # # # Train a DQN agent for 1e5 timesteps and generate 10 trajectories # # data will be saved in a numpy archive named `expert_+env_id.npz` # generate_expert_traj(model, 'expert_'+env_id, n_timesteps=int(10), n_episodes=5) """ Generate Expert Trajectories from a human expert player """ env_id = 'NovelGridworld-v5' env = gym.make(env_id) KEY_ACTION_DICT = ENV_KEY[env_id] def print_play_keys(action_str): print("Press a key to play: ") for key, key_id in KEY_ACTION_DICT.items(): print(key, ": ", action_str[key_id]) def human_expert(_obs): """ Random agent. It samples actions randomly from the action space of the environment.
import sys
sys.path.insert(0, "../../")

import numpy as np
import gym

import algorithms as alg
from evaluate import *

env = gym.make("FrozenLake-v0")

print("\nSARSA")
alg.utils.random_seed(env, 1)
Q, history_sarsa = alg.sarsa(
    env, alpha=0.1, gamma=1, epsilon=0.4, N_episodes=10000,
    epsilon_decay=alg.utils.decay_linear)
pi = alg.utils.create_greedy_policy(Q)
print(np.array(
    [np.argmax(pi[s]) for s in range(env.nS)]).reshape(env.nrow, env.ncol))
evaluate_policy(env, pi, 10000, env.nS - 1)

print("\nQ-Learning")
alg.utils.random_seed(env, 1)
Q, history_qlearning = alg.qlearning(
    env, alpha=0.1, gamma=0.99, epsilon=0.5, N_episodes=10000,
    epsilon_decay=alg.utils.decay_linear)
pi = alg.utils.create_greedy_policy(Q)
print(np.array(
    [np.argmax(pi[s]) for s in range(env.nS)]).reshape(env.nrow, env.ncol))
evaluate_policy(env, pi, 10000, env.nS - 1)

print("\nExpected SARSA")
alg.utils.random_seed(env, 1)
def __init__(self, params): ############# ## INIT ############# # Get params, create logger, create TF session self.params = params self.logger = Logger(self.params['logdir']) self.sess = create_tf_session(self.params['use_gpu'], which_gpu=self.params['which_gpu']) # Set random seeds seed = self.params['seed'] tf.set_random_seed(seed) np.random.seed(seed) ############# ## ENV ############# # Make the gym environment self.env = gym.make(self.params['env_name']) self.env.seed(seed) # Maximum length for episodes self.params['ep_len'] = self.params[ 'ep_len'] or self.env.spec.max_episode_steps # Is this env continuous, or self.discrete? discrete = isinstance(self.env.action_space, gym.spaces.Discrete) self.params['agent_params']['discrete'] = discrete # Observation and action sizes ob_dim = self.env.observation_space.shape[0] ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[ 0] self.params['agent_params']['ac_dim'] = ac_dim self.params['agent_params']['ob_dim'] = ob_dim # simulation timestep, will be used for video saving if 'model' in dir(self.env): self.fps = 1 / self.env.model.opt.timestep else: self.fps = self.env.env.metadata['video.frames_per_second'] ############# ## AGENT ############# agent_class = self.params['agent_class'] self.agent = agent_class(self.sess, self.env, self.params['agent_params']) ############# ## INIT VARS ############# ## TODO initialize all of the TF variables (that were created by agent, etc.) ## HINT: use global_variables_initializer self.sess.run(tf.global_variables_initializer())
parser.add_argument('--lr_start', default=0.001, type=float) parser.add_argument('--lr_end', default=0.0005, type=float) parser.add_argument('--eps_start', default=1, type=float) parser.add_argument('--eps_end', default=0.1, type=float) parser.add_argument('--nsteps', default=100000, type=int, help='total steps') parser.add_argument('--framehistory', default=1, type=int, help='number of images into network') parser.add_argument('--buffersize', default=1000, type=int, help='replay buffer size') parser.add_argument('--batchsize', default=4, type=int, help='replay buffer size') args = parser.parse_args() print(args) env = gym.make(args.env) lr = args.lr_start dqn = DQN(6400, env.action_space.n) target_dqn = DQN(6400, env.action_space.n) eps_vals = np.linspace(args.eps_end, args.eps_start, args.nsteps) total_rewards = [] count = 0 run_avg = 0 running_avg_rew = [] for episode in range(args.episodes):
def discount_and_normalize_rewards(all_rewards, discount_rate): all_discounted_rewards = [] for rewards in all_rewards: all_discounted_rewards.append( helper_discount_rewards(rewards, discount_rate)) flat_rewards = np.concatenate(all_discounted_rewards) reward_mean = flat_rewards.mean() reward_std = flat_rewards.std() return [(discounted_rewards - reward_mean) / reward_std for discounted_rewards in all_discounted_rewards] env = gym.make("CartPole-v0") num_game_rounds = 100 max_game_steps = 1000 num_iterations = 100 discount_rate = 0.95 with tf.Session() as sess: new_saver = tf.train.import_meta_graph('.models/my-650-step-model.meta') new_saver.restore(sess, '.models/my-650-step-model') # sess.run(init) for iteration in range(num_iterations): print("Currently on Iteration: {} \n".format(iteration)) all_rewards = [] all_gradients = []
total_reward += reward n_steps += 1 if timestep % agent.update_interval == 0: loss = agent.learn() losses.extend(loss) rewards.append(total_reward) steps.append(n_steps) if episode % (episodes // 10) == 0 and episode != 0: print(f'{episode:5d} : {np.mean(rewards):06.2f} ' f': {np.mean(losses):06.4f} : {np.mean(steps):06.2f}') rewards = [] # losses = [0] steps = [] print(f'{episode:5d} : {np.mean(rewards):06.2f} ' f': {np.mean(losses):06.4f} : {np.mean(steps):06.2f}') return losses, rewards # }}} if __name__ == '__main__': env = gym.make('CartPole-v1') # env = gym.make('LunarLander-v2') agent = Agent(0.99, env.observation_space.shape, [env.action_space.n], update_interval=2000, K=4, c1=1.0) learn(env, agent, 1000)
def run(seed, episodes, evaluation_episodes, batch_size, gamma, inverting_gradients, initial_memory_threshold, replay_memory_size, save_dir, epsilon_steps, epsilon_final, tau_actor, tau_critic, use_ornstein_noise, learning_rate_actor, learning_rate_critic, clip_grad, layers, initialise_params, title): env = gym.make('Platform-v0') env = ScaledStateWrapper(env) initial_params_ = [3., 10., 400.] for a in range(env.action_space.spaces[0].n): initial_params_[a] = 2. * (initial_params_[a] - env.action_space.spaces[1].spaces[a].low) / ( env.action_space.spaces[1].spaces[a].high - env.action_space.spaces[1].spaces[a].low) - 1. env = PlatformFlattenedActionWrapper(env) env = ScaledParameterisedActionWrapper(env) dir = os.path.join(save_dir, title) env = Monitor(env, directory=os.path.join(dir, str(seed)), video_callable=False, write_upon_reset=False, force=True) env.seed(seed) np.random.seed(seed) agent = PADDPGAgent(observation_space=env.observation_space.spaces[0], action_space=env.action_space, batch_size=batch_size, learning_rate_actor=learning_rate_actor, learning_rate_critic=learning_rate_critic, epsilon_steps=epsilon_steps, epsilon_final=epsilon_final, gamma=gamma, clip_grad=clip_grad, tau_actor=tau_actor, tau_critic=tau_critic, initial_memory_threshold=initial_memory_threshold, use_ornstein_noise=use_ornstein_noise, replay_memory_size=replay_memory_size, inverting_gradients=inverting_gradients, adam_betas=(0.9, 0.999), critic_kwargs={'hidden_layers': layers, 'init_type': "kaiming"}, actor_kwargs={'hidden_layers': layers, 'init_type': "kaiming", 'init_std': 0.0001, 'squashing_function': False}, seed=seed) print(agent) if initialise_params: initial_weights = np.zeros((env.action_space.spaces[0].n, env.observation_space.spaces[0].shape[0])) initial_bias = np.zeros(env.action_space.spaces[0].n) for a in range(env.action_space.spaces[0].n): initial_bias[a] = initial_params_[a] agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias) max_steps = 250 total_reward = 0. returns = [] start_time = time.time() for i in range(episodes): state, _ = env.reset() state = np.array(state, dtype=np.float32, copy=False) act, act_param, all_actions, all_action_parameters = agent.act(state) action = pad_action(act, act_param) episode_reward = 0. agent.start_episode() for j in range(max_steps): ret = env.step(action) (next_state, steps), reward, terminal, _ = ret next_state = np.array(next_state, dtype=np.float32, copy=False) next_act, next_act_param, next_all_actions, next_all_action_parameters = agent.act(next_state) next_action = pad_action(next_act, next_act_param) agent.step(state, (act, act_param, all_actions, all_action_parameters), reward, next_state, (next_act, next_act_param, next_all_actions, next_all_action_parameters), terminal, steps) act, act_param, all_actions, all_action_parameters = next_act, next_act_param, next_all_actions, next_all_action_parameters action = next_action state = next_state # .copy() episode_reward += reward if terminal: break agent.end_episode() returns.append(episode_reward) total_reward += episode_reward if (i + 1) % 100 == 0: print('{0:5s} R:{1:.5f}'.format(str(i + 1), total_reward / (i + 1))) end_time = time.time() print("Took %.2f seconds" % (end_time - start_time)) env.close() returns = env.get_episode_rewards() print("Ave. return =", sum(returns) / len(returns)) print("Ave. last 100 episode return =", sum(returns[-100:]) / 100.) 
np.save(os.path.join(dir, title + "{}".format(str(seed))), returns) if evaluation_episodes > 0: print("Evaluating agent over {} episodes".format(evaluation_episodes)) agent.epsilon_final = 0. agent.epsilon = 0. agent.noise = None evaluation_returns = evaluate(env, agent, evaluation_episodes) print("Ave. evaluation return =", sum(evaluation_returns) / len(evaluation_returns)) np.save(os.path.join(dir, title + "{}e".format(str(seed))), evaluation_returns)
def env(self):
    return gym.make('orthogonal-single-boundary-v0')
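# Note (not from the original): as written, every call to env() constructs a
# brand-new environment via gym.make. If a single shared instance is intended,
# a cached variant along these lines (a sketch, assuming an ordinary class and
# Python 3.8+) avoids repeated construction:
from functools import cached_property

import gym

class CachedEnvMixin:
    @cached_property
    def env(self):
        return gym.make('orthogonal-single-boundary-v0')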
LOG_DIR = './log' N_WORKERS = 2 #multiprocessing.cpu_count() MAX_EP_STEP = 200 MAX_GLOBAL_EP = 2000 MAX_R = -1600 GLOBAL_NET_SCOPE = 'Global_Net' UPDATE_GLOBAL_ITER = 10 GAMMA = 0.9 ENTROPY_BETA = 0.01 LR_A = 0.0001 # learning rate for actor LR_C = 0.001 # learning rate for critic GLOBAL_MEAN_R = [] GLOBAL_RUNNING_R = [] GLOBAL_EP = 0 env = gym.make(GAME) N_S = env.observation_space.shape[0] N_A = env.action_space.shape[0] A_BOUND = [env.action_space.low, env.action_space.high] class ACNet(object): def __init__(self, scope, globalAC=None): if scope == GLOBAL_NET_SCOPE: # get global network with tf.variable_scope(scope): self.s = tf.placeholder(tf.float32, [None, N_S], 'S') self.a_params, self.c_params = self._build_net(scope)[-2:] else: # local net, calculate losses with tf.variable_scope(scope):