def run_env(env, n_runs=100):
    """
    Plots simulated games in an environment for visualization.

    :param env: environment to be run
    :param n_runs: how many episodes should be run
    :return: plot of each step in the environment
    """
    for i in range(n_runs):
        env.reset()
        env.show()
        done = False
        while not done:
            # for the reinforcement agent, convert the board to the state input
            state = env.agents[0].board_to_state()
            action = env.agents[0].select_action(state, 0.00)
            action = action[0, 0]  # action is unwrapped from the LongTensor
            # e.g. action = 1 -> move = ((0, 0), (0, 1))
            move = env.agents[0].action_to_move(action)
            _, done, won = env.step(move)
            env.show()
            if done and won:
                print("Won!")
            elif (done and not won) or env.steps > 20:
                print("Lost")
                break
def test(game_size, norm):
    # start_pprof_server(port=8081)
    env = gym.make('game2048-v0', size=game_size, norm=norm)
    obs = env.reset()
    rewards = 0
    step = 0
    for _ in range(1):
        start = time.time() * 1000
        while True:
            # if rendering every step is desired:
            # env.render()
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            rewards += reward
            step += 1
            if done:
                escape = time.time() * 1000 - start
                env.render()
                print(f'obs: {obs}')
                print(f'play games steps: {step} reward: {rewards} info: {info}'
                      f' use {escape:.3f}ms speed: {(step * 1000 / escape):.3f}ops/s')
                time.sleep(0.5)
                step = 0
                rewards = 0
                start = time.time() * 1000
                env.reset()
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(opt.world, opt.stage, opt.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=opt.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        # end the episode if it runs too long or the agent keeps repeating the same action
        if curr_step > opt.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def some_random_games_first():
    for episode in range(10):
        env.reset()
        for t in range(goal_steps):
            action = env.action_space()
            observation, reward, done, info = env.step(action)
            if done:
                break
def initial_population():
    training_data = []
    scores = []
    accepted_scores = []
    for _ in range(initial_games):
        env.reset()
        if _ % 100 == 0:
            print(_)
        score = 0
        game_memory = []
        prev_observation = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        for _ in range(goal_steps):
            # print(prev_observation)
            action = env.action_space()
            observation, reward, done, info = env.step(action)
            # print(action)
            if len(prev_observation) > 0:
                game_memory.append([prev_observation, action])
            prev_observation = observation
            score += reward
            # if done:
            #     break
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                # one-hot encode the chosen action (actions are numbered 1..9)
                output = [0] * 9
                output[data[1] - 1] = 1
                training_data.append([data[0], output])
        scores.append(score)
    training_data_save = np.array(training_data)
    np.save('saved2.npy', training_data_save)
    print('Average accepted score:', mean(accepted_scores))
    print('Median accepted score: ', median(accepted_scores))
    print(Counter(accepted_scores))
    return training_data
def test():
    training_data = np.load('saved2.npy')
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]), 1)
    model = neural_network_model(input_size=len(X[0]))
    model.load("model2.model")
    scores = []
    choices = []
    for each_game in range(1000):
        score = 0
        game_memory = []
        prev_obs = [0, 0, 0, 0, 0, 0, 0, 0, 0]
        env.reset()
        for _ in range(goal_steps):
            # if len(prev_obs) == 0:
            #     action = random.randrange(1, 10)
            # else:
            action = np.argmax(
                model.predict(np.array(prev_obs).reshape(-1, len(prev_obs), 1))[0])
            # if action == 9:
            #     print(action)
            #     print(prev_obs)
            # print(action)
            choices.append(action)
            new_observation, reward, done, info = env.step(action)
            print(new_observation)
            prev_obs = new_observation
            game_memory.append([new_observation, action])
            score += reward
            if done:
                break
        scores.append(score)
    print('Average Score', sum(scores) / len(scores))
    print(', '.join('Choice {}: {}'.format(i, choices.count(i) / len(choices))
                    for i in range(1, 10)))
def self_play(env, agent, return_trajectory=False, verbose=False):
    if return_trajectory:
        trajectory = []
    observation = env.reset()
    for step in itertools.count():
        board, _, player, _, _ = observation
        action, prob = agent.decide(observation, return_prob=True)
        if verbose:
            print(strfboard(observation))
            logging.info('The {} step: player {}, action {}'.format(step, player, action))
        observation, winner, done, _ = env.step(action[0])
        if return_trajectory:
            m, n = board.shape
            board = np.reshape(board, m * n)
            trajectory.append((player, board, prob))
        if done:
            if verbose:
                print(strfboard(observation))
                logging.info('Winner {}'.format(winner))
            break
    if return_trajectory:
        df_trajectory = pd.DataFrame(trajectory, columns=['player', 'board', 'prob'])
        df_trajectory['winner'] = winner
        return df_trajectory
    else:
        return winner
def train_sl(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.SarsaLambda(env.action_space)
    trials = 1 * 10000 * (size ** 2)
    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break
        env.render()
        print(f'Trial {trial} completed in {stepno} steps, '
              f'highest: {env.highest()}, rewards: {rewards}')
        stepno = 0
        rewards = 0
    print(len(agent.q_table))
def behaviour(self, candidate):
    obs = env.reset()
    done = False
    while not done:
        action = get_action(ns, obs)
        obs, reward, done, _ = env.step(action)
    return obs
def get_env_params(env):
    obs = env.reset()
    params = {'obs': obs['observation'].shape[0],
              'goal': obs['desired_goal'].shape[0],
              'action': env.action_space.shape[0],
              'action_max': env.action_space.high[0],
              'max_timesteps': env._max_episode_steps}
    return params
def main(args):
    param_str = (
        f'{args.env}_{args.algo}_rep={args.repeat}_hor={args.horizon}_prop={args.proposals}'
        f'_iter={args.iterations}_sigma={args.sigma}')
    env = gym.make(args.env)
    env = ActionRepeat(env, args.repeat)
    # Pool of workers, each with its own copy of the global environment variable
    pool = Pool(32, initializer, [env])
    if args.algo == 'gaussian':
        planner = partial(gaussian_cem,
                          pool=pool,
                          action_space=env.action_space,
                          horizon=args.horizon,
                          proposals=args.proposals,
                          topk=args.topk,
                          iterations=args.iterations)
    elif args.algo == 'nonparametric':
        planner = partial(nonparametric_cem,
                          pool=pool,
                          action_space=env.action_space,
                          horizon=args.horizon,
                          proposals=args.proposals,
                          topk=args.topk,
                          iterations=args.iterations,
                          sigma=args.sigma)
    scores = np.zeros(args.episodes)
    observations = np.zeros((args.episodes, env.num_steps + 1) + env.observation_space.shape)
    actions = np.zeros((args.episodes, env.num_steps) + env.action_space.shape)
    for i in range(args.episodes):
        logger = Logger(os.path.join(args.logdir, f'{param_str}_run{i}'))
        observations[i, 0] = env.reset()
        for t in range(env.num_steps):
            state = env.sim.get_state()
            actions[i, t] = planner(state)
            observations[i, t + 1], reward, _, _ = env.step(actions[i, t])
            scores[i] += reward
            logger.log_scalar('reward', scores[i], t)
        print(scores[i])
    print(param_str)
    print('Mean score: ', scores.mean())
    print('Standard deviation: ', scores.std())
    if args.save:
        path = os.path.join(args.savedir, args.env)
        if not os.path.exists(path):
            os.makedirs(path)
        np.save(os.path.join(path, 'obs'), observations)
        np.save(os.path.join(path, 'act'), actions)
def evaluate(self):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        action = get_action(ns, obs)
        obs, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward
def dqn(n_runs, n_episodes, max_t=300, eps_start=0.05, eps_end=1e-4, eps_decay=0.996):
    steps = np.zeros(n_episodes)
    acc_rewards = []
    scores = []
    eps = eps_start
    map_vec = env.init_map_vec()
    probMap = np.full((8, 8), 0)
    for num in map_vec:
        loc = util.num_to_loc(num, 8)
        probMap[loc[0]][loc[1]] = 1
    print(agent.probMap)
    for i_run in range(0, n_runs):  # train
        print("run: ", i_run)
        # provide the learned map
        # agent.reset()
        for i_episode in range(0, n_episodes):
            if i_episode % 500 == 0:
                print(i_episode)
            state = env.reset()
            # score = 0
            # agent.probMap = probMap
            # agent.visitMap = np.full((8, 8), 0)
            for t in range(max_t):
                success = False
                action = agent.act(state, eps)
                next_state, reward, done = env.step(action)
                # do not update the map
                agent.step(state, action, reward, next_state, done, False, True)
                state = next_state
                eps = max(eps * eps_decay, eps_end)
                # score += reward
                if done:
                    # print(env.map)
                    # print("t", t, "score", score)
                    steps[i_episode] = steps[i_episode] + t
                    success = True
                    # print(t)
                    break
            if not success:
                steps[i_episode] = steps[i_episode] + max_t
                # print(t)
            # agent.reset()
    return scores, steps, agent.probMap
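# util.num_to_loc is referenced above but not defined in these snippets. On an 8x8 grid
# a minimal sketch (an assumption about the helper, not its actual implementation) is:
def num_to_loc(num, size):
    """Hypothetical helper: convert a flat cell index into (row, col) on a size x size grid."""
    return num // size, num % size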
def objective(space):
    env = gym.make(ENV)
    env = ActionRepeat(env, int(space['repeat']))
    proposals = 1000
    iterations = 10
    # Pool of workers, each with its own copy of the global environment variable
    pool = Pool(32, initializer, [env])
    cost = 0
    env.reset()
    for _ in range(env.num_steps):
        state = env.sim.get_state()
        action = cem_planner(pool, env.action_space, state,
                             int(space['horizon']), proposals,
                             int(space['topk']), iterations)
        _, reward, _, _ = env.step(action)
        cost -= reward
    return {'loss': cost, 'status': STATUS_OK}
def get_benchmark(cards, target):
    env = Env()
    episodes = 0
    rewards = 0
    total_episodes = 500
    while episodes < total_episodes:
        if episodes % 100 == 0:
            print('running %d' % episodes)
            print(rewards / (episodes + 1))
        end = False
        env.reset()
        env.prepare(cards)
        while not end:
            r, end = target.respond(env)
            rewards += r
        if r == 1.:
            print('you win!')
        else:
            print('you lose!')
        episodes += 1
    return rewards / total_episodes
def q_learning(size, num_episodes, alpha, gamma=1.0, plot_every=100):
    """Q-Learning - TD Control

    Params
    ======
        size (int): board size of the 2048 environment
        num_episodes (int): number of episodes to run the algorithm
        alpha (float): learning rate
        gamma (float): discount factor
        plot_every (int): number of episodes to use when calculating average score
    """
    env = gym.make('game2048-v0', size=size)
    nA = env.action_space.n                 # number of actions
    Q = defaultdict(lambda: np.zeros(nA))   # initialize empty dictionary of arrays

    # monitor performance
    tmp_scores = deque(maxlen=plot_every)    # deque for keeping track of scores
    avg_scores = deque(maxlen=num_episodes)  # average scores over every plot_every episodes

    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        score = 0                                        # initialize score
        state = env.reset()                              # start episode
        state = str(state.reshape(size ** 2).tolist())
        eps = 1.0 / i_episode                            # set value of epsilon
        while True:
            action = epsilon_greedy(env, Q, state, nA, eps)     # epsilon-greedy action selection
            next_state, reward, done, info = env.step(action)   # take action A, observe R, S'
            next_state = str(next_state.reshape(size ** 2).tolist())
            score += reward                                      # add reward to agent's score
            Q[state][action] = update_Q_sarsamax(alpha, gamma, Q,
                                                 state, action, reward, next_state)
            state = next_state                                   # S <- S'
            if done:
                tmp_scores.append(score)                         # append score
                break
        print("\rEpisode {}/{}\t Average Score: {:.2f}".format(
            i_episode, num_episodes, np.mean(tmp_scores)), end="")
        if i_episode % 100 == 0:
            print("\rEpisode {}/{}".format(i_episode, num_episodes))
            sys.stdout.flush()
        if i_episode % plot_every == 0:
            avg_scores.append(np.mean(tmp_scores))

    # plot performance
    plt.plot(np.linspace(0, num_episodes, len(avg_scores), endpoint=False),
             np.asarray(avg_scores))
    plt.xlabel('Episode Number')
    plt.ylabel('Average Reward (Over Next %d Episodes)' % plot_every)
    plt.show()
    # print best plot_every-episode performance
    print('Best Average Reward over %d Episodes (Q table size: %d): ' % (plot_every, len(Q)),
          np.max(avg_scores))
    return Q
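# The snippet above calls two helpers, epsilon_greedy and update_Q_sarsamax, that are not
# defined in this file. A minimal sketch consistent with the calls above (an assumption,
# not necessarily the original implementation) could look like this:
import numpy as np


def epsilon_greedy(env, Q, state, nA, eps):
    """Assumed helper: greedy action with probability 1 - eps, otherwise a random action."""
    if np.random.random() > eps:
        return int(np.argmax(Q[state]))
    return env.action_space.sample()


def update_Q_sarsamax(alpha, gamma, Q, state, action, reward, next_state=None):
    """Assumed helper: one-step Q-learning (Sarsamax) update for Q[state][action]."""
    q_next = np.max(Q[next_state]) if next_state is not None else 0.0
    target = reward + gamma * q_next
    return Q[state][action] + alpha * (target - Q[state][action])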
def test_env(model, vis=False):
    state = env.reset()
    if vis:
        env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis:
            env.render()
        total_reward += reward
    return total_reward, env.get_score()
def train_ql(size, lr, rd, eps_start=1.0, eps_end=0.05, eps_decay=0.999):
    env = gym.make('game2048-v0', size=size)
    agent = model.QLearning(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    # trials = 1 * 100000 * (size ** 2)
    trials = 400000
    rewards_window = deque(maxlen=100)
    scores_window = deque(maxlen=100)
    eps = eps_start
    for trial in range(1, trials + 1):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            action = agent.choose_action(obs, eps)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_)
            obs = obs_
            rewards += reward
            if done:
                break
            # env.render()
        eps = max(eps_end, eps_decay * eps)
        rewards_window.append(rewards)
        scores_window.append(env.get_score())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        print('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'.format(
            trial, total_steps, np.mean(rewards_window), np.mean(scores_window), eps), end="")
        if trial % 100 == 0:
            print('\rEpisode {}\t total_steps: {}\t Average Rewards: {:.2f}\t Average Scores: {:.2f} {}'.format(
                trial, total_steps, np.mean(rewards_window), np.mean(scores_window), eps))
            eval(env, agent, 1000, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} '
          f'highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'table_len: {len(agent.q_table)} steps: {total_steps}')
def train_sarsa(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.Sarsa(env.action_space, learning_rate=lr, reward_decay=rd)
    total_steps = 0
    total_scores = 0
    highest_score = 0
    trials = 1 * 1000 * (size ** 2)
    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            total_steps += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_, True)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break
            # env.render()
        print(f'Trial {trial} completed in {stepno} steps, '
              f'highest: {env.highest()}, rewards: {rewards}', end="")
        if env.highest() >= 2 ** (size ** 2 - 1):
            highest[trial] = env.highest()
        if env.highest() >= 2 ** (size ** 2):
            targets[trial] = env.highest()
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
        stepno = 0
        rewards = 0
    eval(env, agent, render=False)
    print(f'table_len: {len(agent.q_table)} steps: {total_steps} avg_score: {total_scores / trials} '
          f'highest_score: {highest_score} at size: {size} lr: {lr} reward_decay: {rd}')
    print(f'highest len: {len(highest)} prob: {len(highest) * 1.0 / trials} '
          f'target len: {len(targets)} prob: {len(targets) * 1.0 / trials}')
def train(RL):
    acc_r = [0]
    total_steps = 0
    episode = 0
    all_reward = 0
    # observation = env.reset()
    while True:
        # if total_steps - MEMORY_SIZE > 9000: env.render()
        s, t = env.reset()
        observation = s + list(t.reshape(-1, ))
        for i in range(200):
            action = RL.choose_action(observation)
            # f_action = (action - (ACTION_SPACE - 1) / 2) / ((ACTION_SPACE - 1) / 4)  # [-2 ~ 2] float actions
            (s_, t), reward, done, info = env.step(actions[action])
            observation_ = s_ + list(t.reshape(-1, ))
            acc_r.append(reward + acc_r[-1])  # accumulated reward
            RL.store_transition(observation, action, reward, observation_)
            observation = observation_
            total_steps += 1
            all_reward += reward
            if total_steps > MEMORY_SIZE:
                RL.learn()
            if done:
                break
            # if total_steps - MEMORY_SIZE > 15000:
            #     break
        episode += 1
        if episode % 100 == 0:
            info = {'averageTotalReward': all_reward / 100}
            all_reward = 0
            for tag, value in info.items():
                logger.scalar_summary(tag, value, i)
            saver.save(sess, './ddpg.ckpt', global_step=episode + 1)
        if episode > 2000:
            break
    return RL.cost_his, acc_r
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)
    time_horizon = 20
    agent_args = {
        'discount_factor': 0.99,
        'time_horizon': time_horizon,
        'time_step': 0.02,
    }
    agent = Agent(env, agent_args)
    max_steps = 1000
    max_ep_len = min(500, env.spec.max_episode_steps)
    episodes = int(max_steps / max_ep_len)
    epochs = int(1e5)
    for epoch in range(epochs):
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0
            while True:
                step += 1
                ep_step += 1
                action = agent.get_action(state)
                next_state, reward, done, info = env.step(action)
                env.render()
                # time.sleep(0.01)
                state = next_state
                score += reward
                if done or step >= max_ep_len:
                    break
            print(score)
def eval(env, agent, times=1000, render=False):
    if False:
        write_explore(agent, 'explore_old.file')
    highest_score = 0
    total_scores = 0
    size = env.get_size()
    scores = []
    max_tiles = []
    for i in range(times):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        while True:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if render:
                print(f'action is: {action} {obs} {obs_}')
                env.render()
            if obs_ == obs:
                # env.render()
                agent.learn(obs, action, reward, obs_)
            obs = obs_
            if done:
                break
        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {total_scores / times} highest_score: {highest_score}')
    if False:
        write_explore(agent, 'explore_new.file')
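# plot_score is used by the eval functions in these snippets but is not defined here.
# A minimal matplotlib sketch matching the call plot_score(scores, max_tiles)
# (an assumption, not the original helper) might be:
import matplotlib.pyplot as plt


def plot_score(scores, max_tiles):
    """Hypothetical helper: plot per-episode scores and highest tiles side by side."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(scores)
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Score')
    ax2.plot(max_tiles)
    ax2.set_xlabel('Episode')
    ax2.set_ylabel('Highest tile')
    plt.show()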
def evaluate(time, env, agent, render=False):
    eval_reward = []
    for i in range(time):
        obs = env.reset()
        episode_reward = 0
        step = 0
        while True:
            step += 1
            action = agent.predict(obs)  # pick the greedy (best) action
            action = np.clip(action, -1, 1)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver or step >= 200:
                break
        eval_reward.append(episode_reward)
    mean_reward = np.mean(eval_reward)
    print("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    logging.warning("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    return mean_reward
def run_episode(env, agent, rpm):
    obs = env.reset()
    step = 0
    total_reward = 0
    while True:
        action = agent.predict(obs)  # sample an action
        # add Gaussian exploration noise, then clip to the valid action range
        action = np.clip(np.random.normal(action, opt["NOISE"]), -1.0, 1.0)
        next_obs, reward, done, info = env.step(action)
        rpm.append((obs, action, opt["REWARD_SCALE"] * reward, next_obs, done))
        if len(rpm) > opt["MEMORY_WARMUP_SIZE"] and (step % opt["LEARN_FREQ"]) == 0:
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(opt["BATCH_SIZE"])
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, batch_done)
        obs = next_obs
        total_reward += reward
        step += 1
        if done or step >= 200:
            break
    return step, total_reward
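# run_episode above expects a replay memory object (rpm) with append() and sample(batch_size).
# A minimal sketch under that assumption (the class actually used by the original code may differ):
import random
import collections

import numpy as np


class ReplayMemory:
    """Hypothetical minimal replay buffer matching rpm.append(exp) / rpm.sample(batch_size)."""

    def __init__(self, max_size):
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        self.buffer.append(exp)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        obs, action, reward, next_obs, done = zip(*batch)
        return (np.array(obs, dtype='float32'), np.array(action, dtype='float32'),
                np.array(reward, dtype='float32'), np.array(next_obs, dtype='float32'),
                np.array(done, dtype='float32'))

    def __len__(self):
        return len(self.buffer)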
def test_agent(fname, agent, avg=100, seed=43):
    _, env_args = load_args(CONFIG_PATH)
    if fname is not None:
        # if a map is specified, use it and disable random map generation
        env_args["fname"] = fname
        env_args["random_map"] = False
    env = gym.make("ScavengerHuntMap-v0", **env_args)
    env.seed(seed)
    dist_list = []
    a = agent(env)
    for i in range(avg):
        print("Running %d/%d" % ((i + 1), avg), end="\r")
        obs = env.reset()
        done = False
        dist = 0
        while not done:
            act = a.next_node(obs)
            cl = env.env.map.get_current_loc()
            obs, _, done, info = env.step(act)
            dist += info["cost"]
        dist_list.append(dist)
    return sum(dist_list) / avg, np.std(dist_list)
def eval(env, agent, times=1000, render=False):
    highest_score = 0
    scores = []
    max_tiles = []
    eps = 0.0
    random = False
    for i in range(times):
        obs = env.reset()
        while True:
            action, action_values = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            if render:
                env.render()
            if str(obs_) == str(obs):
                # the board did not change, so force a random action next time
                random = True
                # env.render()
                print(f'action is: {action} {reward} {action_values} {obs} {obs_}')
            else:
                random = False
            obs = obs_
            if done:
                break
        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {np.mean(scores)} highest_score: {highest_score}')
# with open('./Center/log.csv', 'w') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerow(['Episode', 'Length', 'Reward', 'IMG', 'LOG', 'SAL'])
# wr = csv.writer(open('./Center/log.csv', 'a'), quoting=csv.QUOTE_ALL)
with tf.Session(config=config) as sess:
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(init)
    for i in range(num_episodes):
        episodeBuffer = []
        sP = env.reset()
        s = processState(sP)
        d = False
        rAll = 0
        j = 0
        state = (np.zeros([1, h_size]), np.zeros([1, h_size]))
        # The Q-Network
        while j < max_epLength:
            j += 1
            if np.random.rand(1) < e:
                state1 = sess.run(mainQN.rnn_state,
                                  feed_dict={mainQN.scalarInput: [s / 255.0],
                                             mainQN.trainLength: 1,
                                             mainQN.state_in: state,
                                             mainQN.batch_size: 1})
                a = np.random.randint(0, 4)
            else:
a_bound = env.action_space.high
print('s_dim', s_dim)
print('a_dim', a_dim)

# Use the DDPG algorithm
ddpg = DDPG(a_dim, s_dim, a_bound)

# Training:
if args.train:  # train
    reward_buffer = []  # record each episode's reward to track how it evolves
    t0 = time.time()    # time the training
    for i in range(MAX_EPISODES):
        t1 = time.time()
        s = env.reset()
        ep_reward = 0  # reward accumulated in the current episode
        for j in range(MAX_EP_STEPS):
            # Add exploration noise
            a = ddpg.choose_action(s)  # simply let the actor estimate the action a
            # To keep exploring while still exploiting, noise is added differently here:
            # build a normal distribution with mean a and standard deviation sigma, then
            # sample the action from it. Because a is the mean it is the most likely value,
            # and sigma controls how spread out the samples are; sigma could also be
            # annealed over time to make the action more deterministic. The sample is then
            # clipped to the valid range.
            # Question: the original text uses N(0, sigma^2); following the reference paper
            # it should be a + N(0, sigma^2).
            a = np.clip(np.random.normal(loc=a, scale=sigma), 0, 1)
            # Interact with the environment
            s_, r, done, info = env.step(a)
            kk_seq.append(kk)
        k_seq.reverse()
        kk_seq.reverse()
        return k_seq, kk_seq

    def forward(self, x_seq, u_seq, k_seq, kk_seq):
        x_seq_hat = np.array(x_seq)
        u_seq_hat = np.array(u_seq)
        for t in range(len(u_seq)):
            control = k_seq[t] + np.matmul(kk_seq[t], (x_seq_hat[t] - x_seq[t]))
            u_seq_hat[t] = np.clip(u_seq[t] + control, -self.umax, self.umax)
            x_seq_hat[t + 1] = self.f(x_seq_hat[t], u_seq_hat[t])
        return x_seq_hat, u_seq_hat


env = gym.make('CartPoleContinuous-v0').env
obs = env.reset()

ilqr = ILqr(lambda x, u: env._state_eq(x, u),            # x(i+1) = f(x(i), u)
            lambda x, u: 0.5 * np.sum(np.square(u)),     # l(x, u)
            lambda x: 0.5 * (np.square(1.0 - np.cos(x[2]))
                             + np.square(x[1]) + np.square(x[3])),  # lf(x)
            env.max_force,
            env.observation_space.shape[0])

u_seq = [np.zeros(1) for _ in range(ilqr.pred_time)]
x_seq = [obs.copy()]
for t in range(ilqr.pred_time):
    x_seq.append(env._state_eq(x_seq[-1], u_seq[t]))

cnt = 0
while True:
    env.render(mode="rgb_array")
    # import pyglet
    # pyglet.image.get_buffer_manager().get_color_buffer().save('frame_%04d.png' % cnt)
        self.sim_counter += 1
        self.name = f'tour{self.sim_counter:03}'
        if self.verbose:
            print(f'[*] Starting a new simulation with noisy travel times: {self.name}')


if __name__ == '__main__':
    from env_rl import EnvRL

    env = EnvRL(5, seed=123456, adaptive=False)
    print('name', env.name)
    env.step(2)
    env.step(4)
    env.step(5)
    env.step(1)
    env.step(3)
    print('tour', env.tour)
    print('tour time', env.tour_time)
    print(50 * '-')

    env.reset()
    print('name', env.name)
    env.step(2)
    env.step(4)
    env.step(5)
    env.step(1)
    env.step(3)
    print('tour', env.tour)
    print('tour time', env.tour_time)