def test(game_size, norm):
    # start_pprof_server(port=8081)
    env = gym.make('game2048-v0', size=game_size, norm=norm)
    obs = env.reset()
    rewards = 0
    step = 0
    for _ in range(1):
        start = time.time() * 1000
        while True:
            # env.render()  # uncomment to render every step
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            rewards += reward
            step += 1
            if done:
                elapsed = time.time() * 1000 - start
                env.render()
                print(f'obs: {obs}')
                print(
                    f'play games steps: {step} reward: {rewards} info: {info}'
                    + f' use {elapsed:.3f}ms speed: {(step * 1000 / elapsed):.3f}ops/s'
                )
                time.sleep(0.5)
                step = 0
                rewards = 0
                start = time.time() * 1000
                env.reset()
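# Hypothetical entry point for the random-play benchmark above; assumes a
# package registering 'game2048-v0' with gym is installed (the size and norm
# values below are illustrative, not the project's defaults):
if __name__ == '__main__':
    test(game_size=4, norm=False)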
def train_sl(size, lr, rd):
    env = gym.make('game2048-v0', size=size)
    agent = model.SarsaLambda(env.action_space)
    trials = 1 * 10000 * (size ** 2)
    for trial in range(trials):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        action = agent.choose_action(obs)
        stepno = 0
        rewards = 0
        while True:
            stepno += 1
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            action_ = agent.choose_action(obs_)
            if done:
                obs_ = 'terminal'
            agent.learn(obs, action, reward, obs_, action_)
            obs = obs_
            action = action_
            rewards += reward
            if done:
                break
        env.render()
        print(f'Trial {trial} completed in {stepno} steps, '
              f'highest: {env.highest()} rewards: {rewards}')
        stepno = 0
        rewards = 0
    print(len(agent.q_table))
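# model.SarsaLambda is imported from elsewhere and not shown here. A minimal
# tabular SARSA(lambda) sketch consistent with the choose_action/learn interface
# used above (the real class takes env.action_space; hyperparameters below are
# illustrative):
from collections import defaultdict
import numpy as np

class SarsaLambdaSketch:
    def __init__(self, n_actions, lr=0.01, gamma=0.9, lam=0.9, eps=0.1):
        self.n_actions = n_actions
        self.lr, self.gamma, self.lam, self.eps = lr, gamma, lam, eps
        self.q_table = defaultdict(lambda: np.zeros(n_actions))
        self.traces = defaultdict(lambda: np.zeros(n_actions))

    def choose_action(self, s):
        # epsilon-greedy over the tabular Q values
        if np.random.rand() < self.eps:
            return np.random.randint(self.n_actions)
        return int(np.argmax(self.q_table[s]))

    def learn(self, s, a, r, s_, a_):
        # SARSA(lambda) update with accumulating eligibility traces
        target = r if s_ == 'terminal' else r + self.gamma * self.q_table[s_][a_]
        delta = target - self.q_table[s][a]
        self.traces[s][a] += 1.0
        for state in list(self.traces.keys()):
            self.q_table[state] += self.lr * delta * self.traces[state]
            self.traces[state] *= self.gamma * self.lam
        if s_ == 'terminal':
            self.traces.clear()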
def main():
    import time
    st = time.time()
    env = stage_1()
    state_size = 5
    num_actions = 9
    solver = DeepQSolver(state_size, num_actions, 2000, 100)
    epsilon = 1
    train_rewards = []
    for i in range(750):
        res = train(env, solver, epsilon)
        print("Train: Episode", i, "epsilon", epsilon,
              "time", (time.time() - st) / 60, ": Reward =", res)
        epsilon = max(epsilon * 0.90, 0.05)
        train_rewards.append(res)
    visualize(train_rewards, 'DeepQ', 'DeepQ_stage1.png')
    # st = time.time()
    # test_rewards = []
    # for i in range(100):
    #     res = train(env, solver, 0)
    #     print("Test: Episode", i, "time", (time.time() - st) / 60, ": Reward =", res)
    #     test_rewards.append(res)
    # print(f'Test: average {np.mean(test_rewards)}')
    render(env, save_path='DeepQ_stage1.mp4')
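# visualize is called above but not defined in the snippet. A plausible
# matplotlib stand-in inferred from the call site (data, title, save path):
import matplotlib.pyplot as plt

def visualize(rewards, title, save_path):
    plt.figure()
    plt.plot(rewards)
    plt.title(title)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.savefig(save_path)
    plt.close()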
def main():
    import time
    st = time.time()
    env = stage_2()  # environment
    state_size = 5
    num_actions = 9
    model = Reinforce(state_size, num_actions)
    train_rewards = []
    for i in range(2500):
        res = train(env, model)
        print(f'Train: Episode {i} time {(time.time() - st) / 60}: {res}')
        train_rewards.append(res)
    visualize(train_rewards, 'Reinforce', 'Reinforce_stage2.png')
    # st = time.time()
    # test_rewards = []
    # for i in range(100):
    #     res = test(env, model)
    #     print(f'Test: Episode {i} time {(time.time() - st) / 60}: {res}')
    #     test_rewards.append(res)
    # print(f'Test: average {np.mean(test_rewards)}')
    render(env, save_path='Reinforce_stage2.mp4')
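# The Reinforce class is not shown. A minimal sketch of the REINFORCE loss it
# presumably optimizes (PyTorch; normalizing the returns is a common but
# optional variance-reduction trick):
import torch

def reinforce_loss(log_probs, rewards, gamma=0.99):
    # discounted returns G_t, computed backward over the episode
    returns, G = [], 0.0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.insert(0, G)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # ascent on sum_t log pi(a_t | s_t) * G_t == descent on its negation
    return -(torch.stack(log_probs) * returns).sum()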
def local_test(index, opt, global_model):
    torch.manual_seed(123 + index)
    env, num_states, num_actions = create_train_env(args.world, args.stage, args.action_type)
    local_model = ActorCritic(num_states, num_actions)
    local_model.eval()
    state = torch.from_numpy(env.reset())
    done = True
    curr_step = 0
    actions = deque(maxlen=args.max_actions)
    while True:
        curr_step += 1
        if done:
            local_model.load_state_dict(global_model.state_dict())
        with torch.no_grad():
            if done:
                h_0 = torch.zeros((1, 512), dtype=torch.float)
                c_0 = torch.zeros((1, 512), dtype=torch.float)
            else:
                h_0 = h_0.detach()
                c_0 = c_0.detach()
            logits, value, h_0, c_0 = local_model(state, h_0, c_0)
        policy = F.softmax(logits, dim=1)
        action = torch.argmax(policy).item()
        state, reward, done, _ = env.step(action)
        env.render()
        actions.append(action)
        # end the episode if the step budget is exhausted or the agent has
        # repeated the same action for the whole deque (i.e. it is stuck)
        if curr_step > args.num_global_steps or actions.count(actions[0]) == actions.maxlen:
            done = True
        if done:
            curr_step = 0
            actions.clear()
            state = env.reset()
        state = torch.from_numpy(state)
def test(env):
    action = env.action_space.sample()
    obs, r, done, info = env.step(action)
    env.render()
    print('action:', action)
    print('reward:', r)
    print('done:', done)
    print('info:', info)
    print('nb_actions', env.action_space.n)
def rollout(sentence_generator, vae, sentences, inst_to_one_hot, dict_goals, valid_goals, env, policy,
            env_params, inits, goals, self_eval, true_eval, biased_init=False, animated=False):
    expressions = get_list_of_expressions()
    scores = []
    np.random.shuffle(expressions)
    for i, expression in enumerate(expressions):
        print('\nAttempting expression: ', expression)
        observation = env.unwrapped.reset_goal(np.array(goals[i]), biased_init=biased_init)
        config_initial = observation['achieved_goal'].copy()
        trial_counter = 0
        success = False
        while trial_counter < 5:
            trial_counter += 1
            goals_str = sample_vae_logic(vae, inst_to_one_hot, observation['achieved_goal'],
                                         expression, valid_goals)
            if len(goals_str) > 0:
                goal = dict_goals[np.random.choice(list(goals_str))]
                env.unwrapped.target_goal = goal.copy()
                observation = env.unwrapped._get_obs()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']
                # start to collect samples
                for t in range(env_params['max_timesteps']):
                    # run policy
                    no_noise = self_eval or true_eval
                    action = policy.act(obs.copy(), ag.copy(), g.copy(), no_noise)
                    # feed the actions into the environment
                    if animated:
                        env.render()
                    observation_new, _, _, info = env.step(action)
                    obs = observation_new['observation']
                    ag = observation_new['achieved_goal']
                config_final = ag.copy()
                true_sentences = sentence_generator(config_initial, config_final)
                if check_sentence(true_sentences, expression):
                    scores.append(trial_counter)
                    success = True
                    print('Success!')
                    break
                else:
                    print('\tFailed. Trying again.')
        if not success:
            scores.append(0)
            print('\tFailed 5 times, Moving On.')
    return scores.copy()
def test_env(model, vis=False):
    state = env.reset()
    if vis:
        env.render()
    done = False
    total_reward = 0
    while not done:
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        dist, _ = model(state)
        next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])
        state = next_state
        if vis:
            env.render()
        total_reward += reward
    return total_reward, env.get_score()
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)
    time_horizon = 20
    agent_args = {
        'discount_factor': 0.99,
        'time_horizon': time_horizon,
        'time_step': 0.02,
    }
    agent = Agent(env, agent_args)
    max_steps = 1000
    max_ep_len = min(500, env.spec.max_episode_steps)
    episodes = int(max_steps / max_ep_len)
    epochs = int(1e5)
    for epoch in range(epochs):
        ep_step = 0
        while ep_step < max_steps:
            state = env.reset()
            done = False
            score = 0
            step = 0
            while True:
                step += 1
                ep_step += 1
                action = agent.get_action(state)
                next_state, reward, done, info = env.step(action)
                env.render()
                # time.sleep(0.01)
                state = next_state
                score += reward
                if done or step >= max_ep_len:
                    break
            print(score)
def eval(env, agent, times=1000, render=False):
    if False:
        write_explore(agent, 'explore_old.file')
    highest_score = 0
    total_scores = 0
    size = env.get_size()
    scores = []
    max_tiles = []
    for i in range(times):
        obs = env.reset()
        obs = str(obs.reshape(size ** 2).tolist())
        while True:
            action = agent.choose_action(obs)
            obs_, reward, done, _ = env.step(action)
            obs_ = str(obs_.reshape(size ** 2).tolist())
            if render:
                print(f'action is: {action} {obs} {obs_}')
                env.render()
            if obs_ == obs:
                # the move did not change the board; let the agent learn from the no-op
                # env.render()
                agent.learn(obs, action, reward, obs_)
            obs = obs_
            if done:
                break
        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
        total_scores += env.get_score()
    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {total_scores / times} highest_score: {highest_score}')
    if False:
        write_explore(agent, 'explore_new.file')
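# plot_score is called above (and in the later eval variant) but never defined.
# A plausible matplotlib stand-in inferred from the call site: score curve plus
# a histogram of the maximum tile reached per episode.
import matplotlib.pyplot as plt

def plot_score(scores, max_tiles):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(scores)
    ax1.set_xlabel('episode')
    ax1.set_ylabel('score')
    ax2.hist(max_tiles, bins=16)
    ax2.set_xlabel('max tile')
    fig.tight_layout()
    plt.show()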
def evaluate(time, env, agent, render=False):
    eval_reward = []
    for i in range(time):
        obs = env.reset()
        episode_reward = 0
        step = 0
        while True:
            step += 1
            action = agent.predict(obs)  # pick the greedy action
            action = np.clip(action, -1, 1)
            obs, reward, isOver, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
            if isOver or step >= 200:
                break
        eval_reward.append(episode_reward)
    mean_reward = np.mean(eval_reward)
    print("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    logging.warning("evaluating on {} episodes with mean reward {}.".format(time, mean_reward))
    return mean_reward
def main():
    env = stage_1()
    state_size = 5
    num_actions = 9
    solver = DeepQSolver(env, state_size, num_actions, 2000, 100)
    epsilon = 0.5
    # solver.model.load()
    for i in range(500):
        res = train(solver, epsilon, replay=False)
        print("Episode :{:4d} Reward: {:6d}".format(i, res), end='\r')
        # render(env, None)
        if (i + 1) % 100 == 0:
            print()
            solver.model.save()
        epsilon = max(epsilon * 0.99, 0.05)
    # test(solver, 0.1)
    # animate_game(env)
    render(env, 'deepfourier.mp4')
def eval(env, agent, times=1000, render=False):
    highest_score = 0
    scores = []
    max_tiles = []
    eps = 0.0
    random = False
    for i in range(times):
        obs = env.reset()
        while True:
            action, action_values = agent.choose_action(obs, eps, rand=random)
            obs_, reward, done, _ = env.step(action)
            if render:
                env.render()
            if str(obs_) == str(obs):
                # the move did not change the board: fall back to a random action next step
                random = True
                # env.render()
                print(f'action is: {action} {reward} {action_values} {obs} {obs_}')
            else:
                random = False
            obs = obs_
            if done:
                break
        env.render()
        scores.append(env.get_score())
        max_tiles.append(env.highest())
        if env.get_score() > highest_score:
            highest_score = env.get_score()
    if times > 0:
        plot_score(scores, max_tiles)
        print(f'eval avg_score: {np.mean(scores)} highest_score: {highest_score}')
        # (tail of ILqr.forward: apply the control update and roll the dynamics forward)
        control = k_seq[t] + np.matmul(kk_seq[t], (x_seq_hat[t] - x_seq[t]))
        u_seq_hat[t] = np.clip(u_seq[t] + control, -self.umax, self.umax)
        x_seq_hat[t + 1] = self.f(x_seq_hat[t], u_seq_hat[t])
    return x_seq_hat, u_seq_hat


env = gym.make('CartPoleContinuous-v0').env
obs = env.reset()
ilqr = ILqr(lambda x, u: env._state_eq(x, u),  # x(i+1) = f(x(i), u)
            lambda x, u: 0.5 * np.sum(np.square(u)),  # l(x, u)
            lambda x: 0.5 * (np.square(1.0 - np.cos(x[2])) +
                             np.square(x[1]) + np.square(x[3])),  # lf(x)
            env.max_force,
            env.observation_space.shape[0])
u_seq = [np.zeros(1) for _ in range(ilqr.pred_time)]
x_seq = [obs.copy()]
for t in range(ilqr.pred_time):
    x_seq.append(env._state_eq(x_seq[-1], u_seq[t]))
cnt = 0
while True:
    env.render(mode="rgb_array")
    # import pyglet
    # pyglet.image.get_buffer_manager().get_color_buffer().save('frame_%04d.png' % cnt)
    for _ in range(3):
        k_seq, kk_seq = ilqr.backward(x_seq, u_seq)
        x_seq, u_seq = ilqr.forward(x_seq, u_seq, k_seq, kk_seq)
    print(np.array(u_seq).T)  # u_seq is a list of 1-D arrays; cast before transposing
    obs, _, _, _ = env.step(u_seq[0])
    x_seq[0] = obs.copy()
    cnt += 1
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

ENV_NAME = 'timetable-case0001-v0001'

# Get the environment and extract the number of actions.
env = gym.make(ENV_NAME)
print('observation space:', env.observation_space)
print('action space:', env.action_space)
env.render()
action = env.action_space.sample()
print(action)
obs, r, done, info = env.step(action)
print('next observation:', obs)
print('reward:', r)
print('done:', done)
print('info:', info)
print('nb_actions', env.action_space.n)

env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Next, we build a very simple model.
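# The snippet stops right after "we build a very simple model". A plausible
# continuation following the stock keras-rl DQN example (layer sizes, memory
# limit, and step counts below are illustrative, not the project's values):
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))

memory = SequentialMemory(limit=50000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
               nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)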
def rollout(sentence_generator, vae, sentences, inst_to_one_hot, dict_goals, env, policy, env_params,
            inits, goals, self_eval, true_eval, biased_init=False, animated=False):
    score = []
    for i, sentence in enumerate(sentences):
        sentence = sentence.lower()
        print('\nNew instruction: ', sentence)
        reached = False
        observation = env.unwrapped.reset_goal(np.array(goals[i]), biased_init=biased_init)
        config_initial = observation['achieved_goal'].copy()
        if sentence in inst_to_one_hot.keys():
            counter = 0
            while counter < 5:
                goal = sample_vae(vae, inst_to_one_hot, observation['achieved_goal'], sentence).flatten()
                # goal = dict_goals[np.random.choice(list(goals_str))]
                env.unwrapped.target_goal = goal.copy()
                observation = env.unwrapped._get_obs()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']
                # start to collect samples
                for t in range(env_params['max_timesteps']):
                    # run policy
                    no_noise = self_eval or true_eval
                    action = policy.act(obs.copy(), ag.copy(), g.copy(), no_noise)
                    # feed the actions into the environment
                    if animated:
                        env.render()
                    observation_new, _, _, info = env.step(action)
                    obs = observation_new['observation']
                    ag = observation_new['achieved_goal']
                counter += 1
                config_final = ag.copy()
                true_sentences = sentence_generator(config_initial, config_final)
                if sentence in true_sentences:
                    score.append(counter)
                    reached = True
                    print('\tSuccess!')
                    break
                else:
                    print('\tFailed. Trying again.')
        else:
            print('Wrong sentence.')
        if not reached:
            score.append(0)
            print('\tFailed 5 times, Moving On.')
    return np.array(score)
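# sample_vae is used in the rollouts above but not shown. A rough sketch of what
# it might do, assuming a conditional VAE with a standard-normal latent prior and
# a decode(z, cond) method plus a latent_dim attribute (all assumptions; the
# repo's actual interface may differ):
import numpy as np
import torch

def sample_vae_sketch(vae, inst_to_one_hot, current_config, sentence, n=1):
    one_hot = np.array(inst_to_one_hot[sentence.lower()])  # encode the instruction
    cond = np.concatenate([current_config, one_hot])  # condition on state + instruction
    cond = torch.from_numpy(cond).float().unsqueeze(0).repeat(n, 1)
    z = torch.randn(n, vae.latent_dim)  # sample from the prior (assumed attribute)
    with torch.no_grad():
        goal = vae.decode(z, cond)  # decode a goal configuration (assumed method)
    return goal.numpy()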
def rollout(sentence_generator, vae, sentences, inst_to_one_hot, dict_goals, env, policy, env_params,
            inits, goals, self_eval, true_eval, biased_init=False, animated=False):
    # use the first provided goal / init for this rollout
    observation = env.unwrapped.reset_goal(np.array(goals[0]), init=inits[0], biased_init=biased_init)
    counter = 0
    while counter < 50:
        sentence = np.random.choice(sentences).lower()
        reached = False
        # print(sentence)
        # env.render()
        if sentence in inst_to_one_hot.keys():
            trial_counter = 0
            config_initial = observation['achieved_goal'].copy()
            while trial_counter < 5:
                goal = sample_vae(vae, inst_to_one_hot, observation['achieved_goal'], sentence).flatten()
                # goal = dict_goals[np.random.choice(list(goals_str))]
                env.unwrapped.target_goal = goal.copy()
                observation = env.unwrapped._get_obs()
                obs = observation['observation']
                ag = observation['achieved_goal']
                g = observation['desired_goal']
                # start to collect samples
                for t in range(env_params['max_timesteps']):
                    # run policy
                    no_noise = self_eval or true_eval
                    action = policy.act(obs.copy(), ag.copy(), g.copy(), no_noise)
                    # feed the actions into the environment
                    if animated:
                        env.render()
                    observation_new, _, _, info = env.step(action)
                    obs = observation_new['observation']
                    ag = observation_new['achieved_goal']
                config_final = ag.copy()
                true_sentences = sentence_generator(config_initial, config_final)
                if sentence in true_sentences:
                    reached = True
                    counter += 1
                    break
                else:
                    trial_counter += 1
            if not reached:
                break
        else:
            print('Wrong sentence.')
    print('Counter', counter)
    return counter
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)
    x_list = []
    u_list = []
    steps = 500
    N = 10
    state = env.reset()
    sim_t = 0
    cnt = 0
    start_t = time.time()
    for i in range(steps):
        if sim_t >= cnt * dt:
            init_state = list(state)
            init_action_list = np.zeros((N + 1, u_dim))
            init_state_list = np.zeros((N + 1, x_dim))
            init_state_list[:x_dim] = init_state
            x_init = np.concatenate([init_action_list.ravel(), init_state_list.ravel()])
            lowers = np.array([-np.inf] * ((N + 1) * u_dim) + init_state + [-np.inf] * (N * x_dim))
            uppers = np.array([np.inf] * ((N + 1) * u_dim) + init_state + [np.inf] * (N * x_dim))
            bounds = Bounds(lowers, uppers)
            res = minimize(obj_func, x_init, method="SLSQP", jac=obj_jacobian,
                           bounds=bounds, constraints=[eq_cons],
                           options={'ftol': 1e-5, 'disp': False, 'maxiter': 20, 'eps': 1e-10})
            cnt += 1
        # linearly interpolate between the first two planned controls
        weight = (sim_t - (cnt - 1) * dt) / dt
        action = np.array([res.x[0] * (1 - weight) + res.x[1] * weight])
        state, reward, done, info = env.step(action)
        env.render()
        x_list.append(state)
        u_list.append(action)
        sim_t += env.unwrapped.tau
    env.close()
    print("elapsed time : {:.3f}s, simulation time : {:.3f}".format(time.time() - start_t, sim_t))

    x_list = np.array(x_list)
    u_list = np.array(u_list)
    fig_size = 6
    fig, ax_list = plt.subplots(nrows=2, ncols=1, figsize=(fig_size * 1.5, fig_size * 1.5))
    ax_list[0].plot(x_list[:, 0], label="pos")
    ax_list[0].plot(x_list[:, 1], label="pos_dot")
    ax_list[0].plot(x_list[:, 2], label="theta")
    ax_list[0].plot(x_list[:, 3], label="theta_dot")
    ax_list[0].grid()
    ax_list[0].legend()
    ax_list[0].set_title('x : state')
    ax_list[1].plot(u_list[:, 0])
    ax_list[1].grid()
    ax_list[1].set_title('u : input')
    fig.tight_layout()
    plt.savefig('result.png')
    plt.show()
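# obj_func, obj_jacobian, and eq_cons are module-level globals that the MPC loop
# above relies on but does not show. Hypothetical stand-ins below, illustrated on
# a double integrator with the same [u_0..u_N, x_0..x_N] packing as x_init
# (dimensions and costs here are illustrative only):
import numpy as np

N, u_dim, x_dim, dt = 10, 1, 2, 0.1

def unpack(z):
    u = z[:(N + 1) * u_dim].reshape(N + 1, u_dim)
    x = z[(N + 1) * u_dim:].reshape(N + 1, x_dim)
    return u, x

def obj_func(z):
    u, x = unpack(z)
    return np.sum(x ** 2) + 0.01 * np.sum(u ** 2)  # quadratic state + effort cost

def obj_jacobian(z):
    u, x = unpack(z)
    return np.concatenate([0.02 * u.ravel(), 2.0 * x.ravel()])

def dyn_residual(z):
    u, x = unpack(z)
    # Euler step of a double integrator: x_{k+1} = x_k + dt * [x_k[1], u_k]
    pred = x[:-1] + dt * np.stack([x[:-1, 1], u[:-1, 0]], axis=1)
    return (x[1:] - pred).ravel()

eq_cons = {'type': 'eq', 'fun': dyn_residual}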
def main():
    env_name = "dobro-CartPole-v0"
    env = gym.make(env_name)
    x_dim = env.observation_space.shape[0]
    u_dim = env.action_space.shape[0]
    time_horizon = 200

    #####################################
    ##### set A, B, R, Q, Qf matrix #####
    m1 = env.unwrapped.masscart
    m2 = env.unwrapped.masspole
    L = env.unwrapped.length
    g = env.unwrapped.gravity
    dt = env.unwrapped.tau
    temp_A_mat = np.eye(x_dim)
    temp_A_mat[0, 1] = dt
    temp_A_mat[2, 3] = dt
    temp_B_mat = np.array([[0.5 * dt**2, 0.0],
                           [dt, 0.0],
                           [0.0, 0.5 * dt**2],
                           [0.0, dt]])
    A_mat = np.array(
        [[0, 0, -(m2 / (m1 + m2)) * (g / (4.0 / 3.0 - m2 / (m1 + m2))), 0],
         [0, 0, g / (L * (4.0 / 3.0 - m2 / (m1 + m2))), 0]])
    B_mat = np.array([[(1.0 / (m1 + m2)) * (1 + 3.0 * m2 / (4.0 * m1 + m2))],
                      [-3.0 / (L * (4.0 * m1 + m2))]])
    A_mat = temp_A_mat + np.matmul(temp_B_mat, A_mat)
    B_mat = np.matmul(temp_B_mat, B_mat)
    R_mat = np.eye(u_dim) * 0.01
    Q_mat = np.eye(x_dim) * 1.0
    Qf_mat = np.eye(x_dim) * 100.0
    #####################################

    # declare LQR solver
    agent = Agent(x_dim, u_dim, time_horizon, A_mat, B_mat, R_mat, Q_mat, Qf_mat)

    x_list = []
    u_list = []
    state = env.reset()
    action, P_mat_list = agent.get_action(state)
    for i in range(time_horizon):
        action = -np.matmul(P_mat_list[i], state).ravel()
        state, reward, done, info = env.step(action)
        env.render()
        time.sleep(dt)
        x_list.append(state)
        u_list.append(action)
    env.close()

    x_list = np.array(x_list)
    u_list = np.array(u_list)
    fig_size = 6
    fig, ax_list = plt.subplots(nrows=2, ncols=1, figsize=(fig_size * 1.5, fig_size * 1.5))
    ax_list[0].plot(x_list[:, 0], label="pos")
    ax_list[0].plot(x_list[:, 1], label="pos_dot")
    ax_list[0].plot(x_list[:, 2], label="theta")
    ax_list[0].plot(x_list[:, 3], label="theta_dot")
    ax_list[0].grid()
    ax_list[0].legend()
    ax_list[0].set_title('x : state')
    ax_list[1].plot(u_list[:, 0])
    ax_list[1].grid()
    ax_list[1].set_title('u : input')
    fig.tight_layout()
    plt.savefig('result.png')
    plt.show()
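# The Agent class above is not shown. A minimal sketch of the finite-horizon
# Riccati recursion that could produce P_mat_list (here a list of feedback gains
# K_t with u_t = -K_t x_t); this is textbook discrete-time LQR, not necessarily
# the repo's exact implementation:
import numpy as np

def lqr_gains(A, B, Q, R, Qf, T):
    """Backward Riccati recursion; returns gains K_0..K_{T-1}."""
    P = Qf
    gains = []
    for _ in range(T):
        K = np.linalg.solve(R + B.T @ P @ B, B.T @ P @ A)
        P = Q + A.T @ P @ (A - B @ K)
        gains.append(K)
    gains.reverse()  # the recursion runs backward in time
    return gains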
#     print('Training Reward:{}'.format(reward))
# visualize(model.rewards, 'SARSA-lambda', 'try.png')
# plt.show()

# """
# Test model
# """
# for i in range(num_test_episodes):
#     model.reset_state()
#     reward = model.test(num_timesteps, render=False)
#     print('test episode: {}/{} reward: {}'.format(i + 1, num_test_episodes, reward), end='\r')
#     if (i + 1) % int(num_test_episodes / 10) == 0:
#         print()
#         print('Training Reward:{}'.format(reward))
# print('[', end='')
# rwd = model.test(num_timesteps, render=False)
# print(']', end='')

# """
# Save model for later use
# """
# model.save('weight4.npy')

model.test(num_timesteps, render=False)
render(env, 'stage1.mp4')
# animate_game(env)
# plotV(model, 'w3.npy')