    total_packet = 1000,
    # Training hyper-parameters
    gamma = 0.99,
    eps = 0.3,
    seed = 0,
    decay_rate = 0.99,
    learning_rate = 1e-4,
    batch_size = 10,
    save_freq = 2000,
    log_freq = 10,
    # Evaluation
    test_mode = "pg",
)

env = Env(default_config)
# torch.manual_seed(default_config["seed"])

# Create the agent, then check for & load a pretrained model
if default_config["test_mode"] == "pg":
    agent = pg.Agent(default_config)
    if os.path.isfile('./data/pg_send_packet.pkl'):
        print('Load Policy Network parameters ...')
        agent.s_policy.load_state_dict(torch.load('./data/pg_send_packet.pkl'))
    if os.path.isfile('./data/pg_switch_channel.pkl'):
        print('Load Policy Network parameters ...')
print("device", device) if not os.path.exists(file_path): os.makedirs(file_path) write_lr(lr) #lrのtextファイルを作成する now = datetime.datetime.now() print('{0:%Y%m%d}'.format(now)) #tensorboarx writer_x = SummaryWriter('tfbx2/' + '_' + '{0:%Y%m%d%H%M%S_}'.format(now) + model_filename + MEMO + '/') ban = Env(BANHEN, WINREN) memory = ReplayMemory(CAPACITY, ban) brain = Brain_dqn(NeuralNet_cnn, device, ban.size, ban, memory, GAMMA, BATCH_SIZE, lr, T, BANHEN, BANSIZE) match_is_continue = True #試合が継続しているかどうか train_is_continue = True #訓練を継続するか reward = 0 #報酬 step = 0 #何手目か step_sum = 0 gen_num = 0 #モデルの初期値 episode_sum = 0 #エピソードの累積 search_depth = 3 ep_random_data = 0 log_print("lrはtextファイルから読み取り")
K_o_from_json = json_data['parameters']['K_o']
max_range_from_json = json_data['parameters']['Max_range']
N_mins_from_json = json_data['parameters']['N_mins']
d_none_from_json = json_data['parameters']['d_none']
d_perf_from_json = json_data['parameters']['d_perf']
delta_expl_angle_from_json = json_data['parameters']['delta_expl_angle']
xi_max_from_json = json_data['parameters']['xi_max']

scs_from_json = SCS(json_data['beacons'][0]['ID'],
                    max_range_from_json,
                    xi_max=xi_max_from_json,
                    d_perf=d_perf_from_json,
                    d_none=d_none_from_json)
env_from_json = Env(entrance_point_from_json,
                    obstacle_corners=obstacle_corners_from_json)

start_animation_from_min_ID = 0
stop_min_ID = 20  # N_mins_from_json
scs_from_json.insert_into_environment(env_from_json)

mins2 = [
    Min(
        json_data['beacons'][i + 1]['ID'],  # i+1 because [0] is the SCS
        max_range_from_json,
        None,
        xi_max=xi_max_from_json,
        d_perf=d_perf_from_json,
        d_none=d_none_from_json,
        delta_expl_angle=delta_expl_angle_from_json)
parser.add_argument('--render', action='store_true')
parser.add_argument('--verbose', action='store_true')
parser.add_argument('--num_layers', type=int, default=2)
parser.add_argument('--num_neurons', type=int, default=15)
parser.add_argument('--repeat_num', default=0, type=int)
parser.add_argument('--update_every', type=int, default=1)
parser.add_argument('--env', default='binary', choices=['binary', 'cart', 'mount'])
parser.add_argument('--update_type', default='async', choices=['sync', 'async'])
parser.add_argument('--train_last', action='store_true', help='Only train the last neuron')
parser.add_argument('--reward_type', default='task', choices=['all', 'task', 'bio', 'bio_then_all'])
args = parser.parse_args()

# CREATE ENVIRONMENT
if args.env == 'cart':
    env = gym.make('CartPole-v1')
elif args.env == 'mount':
    env = gym.make('MountainCar-v0')
elif args.env == 'binary':
    env = Env()
else:
    sys.exit()

# BUILD NETWORK AND SET ENV THRESHOLD
network = Network(args, input_space=env.observation_space.shape, num_outputs=env.action_space.n)
stop_threshold = 75 if args.env == 'binary' else 300

for e in range(1, 20000):
    done = False
    state = env.reset()
    ep_reward = 0

    while not done:
        # take action
        action = network.forward(state)
        # if args.env == 'mount' and action == 1: action += 1
def main():
    env = Env(HEIGHT, WIDTH, Human, Zombie, Bat)
    generate(env)
    ]),
]

open_w_sq_obs = [
    np.array([
        [-1, -1],
        [-1, 12],
        [12, 12],
        [12, -1],
    ]),
    np.array([[2, 2], [2, 9], [9, 9], [9, 2]])
]

env = Env(
    np.array([0, 0]),
    obstacle_corners=open_small  # alternatives: open_large, open_w_sq_obs, obs_zig_zag, []
)
data['environment'].append(env.toJson())

# %% Parameter initialization
max_range = 3
_xi_max = 1
_d_perf = 0.1
_d_none = 2.5
_delta_expl_angle = 0  # np.pi/4, np.pi/6
_K_o = 0.9
N_mins = 6
file_path = r'json_files\ds_test_123.json'
dt = 0.01
def main(): rospy.init_node('ddpg_stage_1') env = Env(is_training) agent = DDPG(env, state_dim, action_dim) past_action = np.array([0., 0.]) print('State Dimensions: ' + str(state_dim)) print('Action Dimensions: ' + str(action_dim)) print('Action Max: ' + str(action_linear_max) + ' m/s and ' + str(action_angular_max) + ' rad/s') if is_training: print('Training mode') avg_reward_his = [] total_reward = 0 var = 1. while True: state = env.reset() one_round_step = 0 while True: a = agent.action(state) a[0] = np.clip(np.random.normal(a[0], var), 0., 1.) a[1] = np.clip(np.random.normal(a[1], var), -0.5, 0.5) state_, r, done, arrive = env.step(a, past_action) time_step = agent.perceive(state, a, r, state_, done) if arrive: result = 'Success' else: result = 'Fail' if time_step > 0: total_reward += r if time_step % 10000 == 0 and time_step > 0: print( '---------------------------------------------------') avg_reward = total_reward / 10000 print('Average_reward = ', avg_reward) avg_reward_his.append(round(avg_reward, 2)) print('Average Reward:', avg_reward_his) total_reward = 0 if time_step % 5 == 0 and time_step > exploration_decay_start_step: var *= 0.9999 past_action = a state = state_ one_round_step += 1 if arrive: print('Step: %3i' % one_round_step, '| Var: %.2f' % var, '| Time step: %i' % time_step, '|', result) one_round_step = 0 if done or one_round_step >= 500: print('Step: %3i' % one_round_step, '| Var: %.2f' % var, '| Time step: %i' % time_step, '|', result) break else: print('Testing mode') while True: state = env.reset() one_round_step = 0 while True: a = agent.action(state) a[0] = np.clip(a[0], 0., 1.) a[1] = np.clip(a[1], -0.5, 0.5) state_, r, done, arrive = env.step(a, past_action) past_action = a state = state_ one_round_step += 1 if arrive: print('Step: %3i' % one_round_step, '| Arrive!!!') one_round_step = 0 if done: print('Step: %3i' % one_round_step, '| Collision!!!') break
def play_episode(self, n_tot):
    self.beta_net.eval()
    self.beta_target.eval()
    self.pi_net.eval()
    self.pi_target.eval()
    self.vb_net.eval()
    self.vb_target.eval()
    self.q_net.eval()
    self.q_target.eval()
    self.qb_net.eval()
    self.qb_target.eval()

    env = Env()
    n_human = 120
    humans_trajectories = iter(self.data)
    softmax = torch.nn.Softmax()

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        choices = np.arange(self.global_action_space, dtype=np.int)
        mask = Variable(torch.FloatTensor(
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0]),
            requires_grad=False).cuda()

        j = 0
        temp = 1

        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            beta, phi = self.beta_net(s)
            pi, _ = self.pi_net(s)
            q, _ = self.q_net(s)
            vb, _ = self.vb_net(s)

            pi = beta.squeeze(0)
            self.greedy = False

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                # eps = np.random.rand()
                eps = 1
                # a = np.random.choice(choices)
                if self.greedy and eps > 0.01:
                    a = pi.data.cpu().numpy()
                    a = np.argmax(a)
                else:
                    a = softmax(pi / temp).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)

            q = q[0, a]
            q = q.squeeze(0)

            env.step(a)

            yield {
                'o': env.s.cpu().numpy(),
                'v': vb.squeeze(0).data.cpu().numpy(),
                'vb': vb.squeeze(0).data.cpu().numpy(),
                'qb': q.squeeze(0).data.cpu().numpy(),
                # 's': x[0, :512].data.cpu().numpy(),
                'score': env.score,
                'beta': pi.data.cpu().numpy(),
                'phi': phi.squeeze(0).data.cpu().numpy(),
                'q': q.squeeze(0).data.cpu().numpy()
            }

            j += 1

    return  # end of generator (PEP 479: an explicit `raise StopIteration` becomes a RuntimeError)
from environment import Environment as Env
from helper_funcs import print_query, print_info

ENVIRONMENT_SETTINGS_FILE = "environment_settings_test.txt"

# print_query("Please enter K vehicle loss penalty value:")
# inp = raw_input()
inp = ''

if str(inp) == '':
    env = Env(ENVIRONMENT_SETTINGS_FILE)
else:
    env = Env(ENVIRONMENT_SETTINGS_FILE, int(inp))

env.simulation()
def play_episode(self, n_tot):
    self.model.eval()
    self.model_b.eval()

    env = Env()
    n_human = 120
    humans_trajectories = iter(self.data)
    softmax = torch.nn.Softmax()

    for i in range(n_tot):
        env.reset()
        observation = next(humans_trajectories)
        trajectory = self.data[observation]
        choices = np.arange(self.global_action_space, dtype=np.int)
        mask = Variable(torch.FloatTensor([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                        requires_grad=False).cuda()

        j = 0
        temp = 1

        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)
            beta, vb, qb, _, _ = self.model_b(s, self.actions_matrix)
            pi, v, q, adv, x = self.model(s, self.actions_matrix, beta.detach())

            pi = pi.squeeze(0)
            self.greedy = False

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                eps = np.random.rand()
                # a = np.random.choice(choices)
                if self.greedy and eps > 0.1:
                    a = pi.data.cpu().numpy()
                    a = np.argmax(a)
                else:
                    a = softmax(pi / temp).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)

            q = q[0, a, 0]
            q = q.squeeze(0)
            qb = qb[0, a, 0]
            qb = qb.squeeze(0)

            env.step(a)

            yield {'o': env.s.cpu().numpy(),
                   'v': v.squeeze(0).data.cpu().numpy(),
                   'vb': vb.squeeze(0).data.cpu().numpy(),
                   'qb': qb.squeeze(0).data.cpu().numpy(),
                   's': x[0, :512].data.cpu().numpy(),
                   'score': env.score,
                   'beta': pi.data.cpu().numpy(),
                   'phi': x[0, :512].data.cpu().numpy(),
                   'q': q.squeeze(0).data.cpu().numpy()}

            j += 1

    return  # end of generator (PEP 479: an explicit `raise StopIteration` becomes a RuntimeError)
def play(self, n_tot, action_offset, player):
    self.beta_net.eval()
    self.beta_target.eval()
    self.pi_net.eval()
    self.pi_target.eval()
    self.vb_net.eval()
    self.vb_target.eval()
    self.q_net.eval()
    self.q_target.eval()
    self.qb_net.eval()
    self.qb_target.eval()

    env = Env(action_offset)
    n_human = 90
    episodes = list(self.data.keys())
    random.shuffle(episodes)
    humans_trajectories = iter(episodes)

    for i in range(n_tot):
        env.reset()
        trajectory = self.data[next(humans_trajectories)]
        choices = np.arange(self.global_action_space, dtype=np.int)
        random_choices = self.mask_q.data.cpu().numpy()
        random_choices = random_choices / random_choices.sum()

        j = 0

        while not env.t:
            s = Variable(env.s.cuda(), requires_grad=False)

            # compare strings with ==, not the identity operator `is`
            if player == 'beta':
                pi, _ = self.beta_net(s)
                pi = pi.squeeze(0)
                self.greedy = False
            elif player == 'q_b':
                pi, _ = self.qb_net(s)
                pi = pi.squeeze(0)
                self.greedy = True
            elif player == 'pi':
                pi, _ = self.pi_net(s)
                pi = pi.squeeze(0)
                self.greedy = False
            elif player == 'q_pi':
                pi, _ = self.q_net(s)
                pi = pi.squeeze(0)
                self.greedy = True
            else:
                raise NotImplementedError

            if j < n_human:
                a = trajectory[j, self.meta['action']]
            else:
                eps = np.random.rand()
                # eps = 1
                # a = np.random.choice(choices)
                if self.greedy:
                    if eps > 0.01:
                        a = (pi * self.mask_q).data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = np.random.choice(choices, p=random_choices)
                else:
                    a = F.softmax(pi + self.mask_beta, dim=0).data.cpu().numpy()
                    a = np.random.choice(choices, p=a)

            env.step(a)
            j += 1

        yield {'score': env.score, 'frames': j}

    return  # end of generator (PEP 479: an explicit `raise StopIteration` becomes a RuntimeError)
            if not args.test:
                self.replay_memory.append(
                    (history, action, reward, next_history, end))
                self.priority.append(priority)

            history = next_history
            action = next_action

            if frame > 2000:
                raise ValueError('Loop bug')

        if maxv.size()[0] > 0:
            self.writer.add_scalar('maxv', maxv.mean(), round_num)
        if actions.size()[0] > 0:
            self.writer.add_scalar('action', actions.mean(), round_num)
        self.writer.add_scalar('epsilon', self.epsilon, round_num)
        self.writer.add_scalar('frame', frame, round_num)
        gc.collect()

        if env.win:
            print("Round {} Win: reward:{}, frame:{}".format(
                round_num, reward, frame))
            self.win = True
        else:
            print("Round {} Lose: reward:{}, frame:{}".format(
                round_num, reward, frame))
            self.win = False
        return reward


if __name__ == "__main__":
    env = Env(args.height, args.width, args.frame_time)
    actor = Actor()
    actor.main()
def __init__(self, function, args):
    self.function = function
    self.lambda_list = args
    self.bindings = Env(None, None)
theta = 1.
mu = np.array([0, 0])
sigma = np.array([0, 1])

# initialize the RL agent:
agent = AgentReinforce(dim_state=dim_state,
                       dim_actions=dim_actions,
                       hidden_dims=hidden_dims,
                       optimizer=optimizer,
                       gamma=gamma)

# initialize the environment:
env = Env(start=start,
          tcost=tcost,
          horizon=horizon,
          w=w,
          theta=theta,
          mu=mu,
          sigma=sigma)

# TRAINING #
print("\n===TRAINING===\n")
trained_agent, train_loss, train_states, \
    train_actions, train_rewards = train_reinforce(agent=agent,
                                                   environment=env,
                                                   episodes=train_episodes,
                                                   policy_update=pi_update)

# SIMULATION #
""" All information on README.md """ import tensorflow as tf from environment import Env import numpy as np import time import model steps = 1000 env = Env(vision=True) ob = env.reset(relaunch=True) print(ob) ###=================== Play the game with the trained model # while True: # env = Env(vision=True) # ob = env.reset(relaunch=True) # loss = 0.0 # for i in range(steps): # image = scipy.misc.imresize(ob, [66, 200]) / 255.0 # degrees = model.y.eval(feed_dict={model.x: [image], model.keep_prob: 1.0})[0][0] # ob, reward, done, _ = env.step(act) # if done is True: # break # else: # ob_list.append(ob) # # print("PLAY WITH THE TRAINED MODEL") # print(reward_sum)
def main():
    rospy.init_node('ddpg_stage_1')
    env = Env(is_training)
    agent = DDPG(env, state_dim, action_dim)
    # import ipdb
    # ipdb.set_trace()
    past_action = np.array([0., 0.])
    print('State Dimensions: ' + str(state_dim))
    print('Action Dimensions: ' + str(action_dim))
    print('Action Max: ' + str(action_linear_max) + ' m/s and ' + str(action_angular_max) + ' rad/s')
    print('Action Min: ' + str(action_linear_min) + ' m/s and ' + str(action_angular_min) + ' rad/s')

    #####################################################################################
    # Training
    #####################################################################################
    if is_training:
        print('Training mode')
        avg_reward_his = []
        total_reward = 0
        action_var = 0.2
        success_rate = 0

        # Log path setting
        now = datetime.datetime.now()
        logdir = now.strftime('%Y-%m-%d') + '_' + now.strftime('%H-%M')
        logdir = os.path.join(log_dir, logdir)
        # tb_writer = SummaryWriter(logdir)

        # Start training
        start_time = time.time()
        for itr in range(10000):
            state = env.reset()
            # episode_reward = 0.0

            # For each episode
            for cur_step in range(max_episode_length):
                action = agent.action(state)
                action[0] = np.clip(np.random.normal(action[0], action_var), action_linear_min, action_linear_max)
                action[1] = np.clip(np.random.normal(action[1], action_var), action_angular_min, action_angular_max)

                state_, reward, done, arrive = env.step(action, past_action)
                time_step = agent.perceive(state, action, reward, state_, done)

                #############################################################################
                # debugging environment
                #############################################################################
                if is_debugging:
                    print('cur_step: {}'.format(cur_step))
                    print('action: {}'.format(action))
                    print('goal position: x:{}, y:{}'.format(
                        env.goal_position.position.x, env.goal_position.position.y))
                    print('r: {}, done: {}, arrive: {}'.format(reward, done, arrive))
                #############################################################################

                result = 'Success' if arrive else 'Fail'

                if time_step > 0:
                    total_reward += reward

                if time_step % 10000 == 0 and time_step > 0:
                    print('---------------------------------------------------')
                    avg_reward = total_reward / 10000
                    print('Average_reward: {}'.format(avg_reward))
                    avg_reward_his.append(round(avg_reward, 2))
                    # writer.add_scalar('avg_reward', avg_reward, time_step)
                    print('Overall average Reward: {}'.format(avg_reward_his))
                    total_reward = 0

                if time_step % 5 == 0 and time_step > exploration_decay_start_step:
                    action_var *= 0.9999

                past_action = action
                state = state_

                if arrive or done or cur_step >= max_episode_length:
                    if result == 'Success':
                        success_rate += 1
                    sec = time.time() - start_time
                    elapsed_time = str(datetime.timedelta(seconds=sec)).split('.')[0]
                    print('Num_episode: {}, Full steps: {}, Result: {}, Elapsed time: {}'
                          .format(itr, cur_step, result, elapsed_time))
                    if itr % 20 == 0 and itr > 0:
                        print('Total: {}/20, Success rate: {}'.format(
                            success_rate, round(success_rate / 20, 2)))
                        success_rate = 0
                    break

    #####################################################################################
    # Testing
    #####################################################################################
    else:
        print('Testing mode')
        while True:
            state = env.reset()
            one_round_step = 0

            while True:
                a = agent.action(state)
                a[0] = np.clip(a[0], 0., 1.)
                a[1] = np.clip(a[1], -0.5, 0.5)

                state_, reward, done, arrive = env.step(a, past_action)
                past_action = a
                state = state_
                one_round_step += 1

                if arrive:
                    print('Step: %3i' % one_round_step, '| Arrive!!!')
                    one_round_step = 0

                if done:
                    print('Step: %3i' % one_round_step, '| Collision!!!')
                    break
            observation = observation_

            if done:
                if episode >= ep_max - 11:
                    fid_10 = max(fid_10, fid)
                break

            step += 1

    return fid_10


if __name__ == "__main__":
    dt_ = np.pi / 20
    env = Env(
        action_space=list(range(2)),  # allow two actions
        dt=dt_)
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.99,
        replace_target_iter=200,
        memory_size=2000,
        e_greedy_increment=0.001,
    )
    fidelity = run_maze()
    print("Final_fidelity=", fidelity)
def evl(ast, env): global stack #print(ast) while True: #print(ast) if isinstance(ast, tuple): if len(ast) == 0: return ast ast = macroexpand(ast, env) if not isinstance(ast, tuple): return eval_ast(ast, env) if isinstance(ast[0], Keyword): hm = evl(ast[1], env) return hm[ast[0].name] if isinstance(ast[0], Name): if ast[0].name == "def!": value = evl(ast[2], env) return env.set(ast[1].name, value) if ast[0].name == "defmacro!": value = evl(ast[2], env) value.is_macro = True return env.set(ast[1].name, value) if ast[0].name == "let*": new_env = Env(env, [], []) binding_list = ast[1] for i in zip(binding_list[::2], binding_list[1::2]): data = evl(i[1], new_env) new_env.set(i[0], data) ast, env = ast[2], new_env continue if ast[0].name == "try*": if len(ast) < 3: return evl(ast[1], env) try: return evl(ast[1], env) except BaslException as e: new_env = Env(env, [ast[2][1].name], [e]) return evl(ast[2][2], new_env) except Exception as e: new_env = Env(env, [ast[2][1].name], [str(e)]) return evl(ast[2][2], new_env) if ast[0].name == "raise": s = "{}:{}:{}".format( env.get("*file*"), ast[0].name, ast[0].line) if isinstance( ast[0], Name) else "LAMBDA<" + ast[0] + ">" raise BaslException(evl(ast[1], env), [*env.stack, s]) if ast[0].name == "quote": return ast[1] if ast[0].name == "macroexpand": return macroexpand(ast[1], env) if ast[0].name == "quasiquoteexpand": return quasiquote(ast[1]) if ast[0].name == "quasiquote": ast = quasiquote(ast[1]) continue if ast[0].name == "do": res = None for x in ast[1:-1]: res = evl(x, env) ast = ast[-1] continue if ast[0].name == "if": if len(ast) < 3: ast, env = None, env continue res_cond = evl(ast[1], env) if type(res_cond) == bool and res_cond == True: ast = ast[2] continue if type(res_cond) == int: ast = ast[2] continue if type(res_cond) == float: ast = ast[2] continue if type(res_cond) == list: ast = ast[2] continue if type(res_cond) == tuple: ast = ast[2] continue if type(res_cond) == str: ast = ast[2] continue if type(res_cond) == Fn: ast = ast[2] continue if type(res_cond) == Keyword: ast = ast[2] continue if type(res_cond) == Name: ast = ast[2] continue if type(res_cond) == types.LambdaType: ast = ast[2] continue if type(res_cond) == Atom: ast = ast[2] continue ast = ast[3] if len(ast) >= 4 else None continue if ast[0].name == "fn*": body = ast[2] params = ast[1] func = lambda *e: evl(body, Env(env, params, e)) return Fn(body, params, env, func) [f, *args] = eval_ast(ast, env) if isinstance(f, Fn): s = "{}:{}:{}".format( env.get("*file*"), ast[0].name, ast[0].line) if isinstance( ast[0], Name) else "LAMBDA<" + display(ast[0], True) + ">" ast, env = f.ast, Env(f.env, f.params, args, s) continue if isinstance(f, types.LambdaType): return f(*args) return eval_ast(ast, env)
    length=R)
assert raw_data.shape == (R, D) and end_data.shape == (R, )

actions = np.array([-1, 0, 1])

config = tf.ConfigProto()
config.gpu_options.allow_growth = True

with tf.Session(config=config) as sess:
    # Make separate DQNs (mainDQN = behavior DQN, targetDQN = target DQN)
    mainDQN = DQN(sess, D, S, C, LR, FLAGS.model_name, net_name="main")
    targetDQN = DQN(sess, D, S, C, LR, FLAGS.model_name, net_name="target")

    if S_MODE == "train":
        env_train = Env(num_data=(0, T),
                        raw_data=raw_data,
                        end_data=end_data,
                        actions=actions,
                        input_size=D,
                        seq_size=S,
                        name="train",
                        transaction_cost=TC)
        saver = train(sess, env_train, mainDQN, targetDQN)
        saver = tf.train.Saver()
        test(sess, env_train, mainDQN, saver, df, make_csv=True)
    elif S_MODE == "test":
        env_test = Env(num_data=(T, FLAGS.raw_data_length),
                       raw_data=raw_data,
                       end_data=end_data,
                       actions=actions,
                       input_size=D,
                       seq_size=S,
                       name="train",
                       transaction_cost=TC)
    120, 121, 122, 124, 125, 126, 127, 129, 130, 136, 142, 143, 144, 145,
    146, 147, 148, 150, 151, 152, 153, 154, 155, 156, 157, 1687, 181, 182,
    183, 184, 185, 186, 199, 400, 401, 402, 403, 405, 406, 407, 408, 409,
    410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 424,
    425, 426, 427, 432
]  # hardcoding

# What we want to watch:
#   sum of incentives (mints)
#   sum of rent fees
#   and their diff.

if __name__ == "__main__":
    args = arguments.parser()
    print("> Setting:", args)

    # Env
    env = Env(num_stations, num_bikes_per_station)

    # Agent
    agents = [Agent(args.defaultBalance) for _ in range(args.users)]

    # logs
    bankrupts = list()
    incentives = list()  # mints
    fees = list()

    # logs for visualization
    log_dict = dict()
    log_dict["log_bankrupts"] = list()
    log_dict["log_total_incentive"] = list()
    log_dict["log_total_fee"] = list()
    log_dict["log_balance"] = list()
def data(self, eval_w_start, eval_w_end, eval_w_points):
    """
    Generates the state space over which the Q-value approximating network
    is to be evaluated and computes the true Q-values. Only works for
    environments with 2 periods per episode.

    Arguments
    ---------
    :param eval_w_start : float
        Lowest value of the wealth component of the state in the evaluation
        state space.
    :param eval_w_end : float
        Highest value of the wealth component of the state in the evaluation
        state space.
    :param eval_w_points : int
        Number of evenly spaced wealth components of the state between the
        lowest and highest value.

    Returns
    -------
    :returns x_train : ndarray
        All states in the evaluation state space.
    :returns y_train : ndarray
        True Q-values for all states in the evaluation state space.
    """
    # initialize the RL agent:
    agent = AgentDQN(dim_state=self.dim_state,
                     dim_actions=self.dim_actions,
                     hidden_dims=self.hidden_dims,
                     optimizer=Adam(),
                     gamma=self.gamma,
                     eps=self.eps,
                     eps_decay=self.eps_decay,
                     frozen=self.frozen,
                     pretrained=self.pretrained)

    # initialize the environment:
    env = Env(start=self.start,
              tcost=self.tcost,
              horizon=self.horizon,
              w=self.w,
              theta=self.theta,
              regimes=self.regimes)
    assert env.horizon == 2

    x1 = np.arange(0, env.horizon) / env.horizon
    x2 = np.linspace(eval_w_start, eval_w_end, eval_w_points)
    x_train = np.array(np.meshgrid(x1, x2)).T.reshape(-1, 2)

    # which regimes operate in t=0 and t=1:
    idx0 = [0 in v["periods"] for v in env.regimes.values()].index(True)
    idx1 = [1 in v["periods"] for v in env.regimes.values()].index(True)
    r0 = list(env.regimes.keys())[idx0]  # regime in t=0
    r1 = list(env.regimes.keys())[idx1]  # regime in t=1
    mu0 = env.regimes[r0]["mu"]  # log-returns at t=0
    mu1 = env.regimes[r1]["mu"]  # log-returns at t=1
    sigma0 = env.regimes[r0]["sigma"]
    sigma1 = env.regimes[r1]["sigma"]
    assert np.array_equal(mu0, mu1) is True
    assert np.array_equal(sigma0, sigma1) is True

    y_train = []
    for s in x_train:
        if s[0] == 0:
            opt_w = compute_opt_weight(env, 1)
            tq = 2 * mu0[0] + \
                (agent.action_space + opt_w) * \
                (mu0[1] - mu0[0] + sigma0[1] ** 2 / 2) - \
                0.5 * (agent.action_space ** 2 + opt_w ** 2) * \
                sigma0[1] ** 2 + np.log(s[1])
        else:
            tq = mu0[0] + agent.action_space * (mu0[1] - mu0[0]) + \
                0.5 * agent.action_space * (1 - agent.action_space) * \
                sigma0[1] ** 2 + np.log(s[1])
        y_train.append(tq)

    self.x = x_train
    self.y = np.array(y_train)
    return self.x, self.y
def play_episode(self, n_tot): self.model.eval() env = Env() n_human = 120 humans_trajectories = iter(self.data) softmax = torch.nn.Softmax() # mask = torch.FloatTensor(consts.actions_mask[args.game]) # mask = Variable(mask.cuda(), requires_grad=False) vsx = torch.FloatTensor(consts.short_bins[args.game]) vlx = torch.FloatTensor(consts.long_bins[args.game]) for i in range(n_tot): env.reset() observation = next(humans_trajectories) trajectory = self.data[observation] choices = np.arange(self.global_action_space, dtype=np.int) j = 0 while not env.t: s = Variable(env.s.cuda(), requires_grad=False) vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model( s, self.actions_matrix) beta = beta.squeeze(0) pi_l = pi_l.squeeze(0) pi_s = pi_s.squeeze(0) pi_l_tau = pi_l_tau.squeeze(0) pi_s_tau = pi_s_tau.squeeze(0) temp = 1 # consider only 3 most frequent actions beta_np = beta.data.cpu().numpy() indices = np.argsort(beta_np) maskb = Variable(torch.FloatTensor( [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), requires_grad=False).cuda() # maskb = Variable(torch.FloatTensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), # requires_grad=False).cuda() # pi = maskb * (beta / beta.max()) pi = beta self.greedy = False beta_prob = pi if j < n_human: a = trajectory[j, self.meta['action']] else: eps = np.random.rand() # a = np.random.choice(choices) if self.greedy and eps > 0.1: a = pi.data.cpu().numpy() a = np.argmax(a) else: a = softmax(pi / temp).data.cpu().numpy() a = np.random.choice(choices, p=a) env.step(a) vs = softmax(vs) vl = softmax(vl) vs = torch.sum(vsx * vs.data.cpu()) vl = torch.sum(vlx * vl.data.cpu()) yield { 'o': env.s.cpu().numpy(), 'vs': np.array([vs]), 'vl': np.array([vl]), 's': phi.data.cpu().numpy(), 'score': env.score, 'beta': beta_prob.data.cpu().numpy(), 'phi': phi.squeeze(0).data.cpu().numpy(), 'qs': qs.squeeze(0).data.cpu().numpy(), 'ql': ql.squeeze(0).data.cpu().numpy(), } j += 1 raise StopIteration
def train(self, env, checkpoint_interval, checkpoint_dir, saver, gamma=0.99): global T self.saver = saver # initialize environment time.sleep(3 * self.thread_id) env = Env(env, 84, 84, 4) print 'Starting thread ' + str(self.thread_id) terminal = False # Get initial game observation state = env.get_initial_state() # episode's reward and cost episode_reward = 0 total_cost = 0 counter = 0 while T < self.TMAX: # lists for feeding placeholders states = [] actions = [] prev_reward = [] state_values = [] t = 0 t_start = t self.sess.run(self.sync_op) while not (terminal or ((t - t_start) == self.tmax)): # forward pass of network. Get probability of all actions probs, v = self.sess.run((self.policy, self.state_value), feed_dict={self.input_state: [state]}) probs = probs[0] v = v[0][0] # print the outputs of the neural network fpr sanity chack if T % 2000 == 0: print probs print v # define list of actions. All values are zeros except , the # value of action that is executed action_list = np.zeros([self.output_size]) # choose action based on policy action_index = sample_policy_action(probs) action_list[action_index] = 1 # add state and action to list actions.append(action_list) states.append(state) state_values.append(v) # Gym executes action in game environment on behalf of actor-learner new_state, reward, terminal = env.step(action_index) # clip reward to -1, 1 clipped_reward = np.clip(reward, -1, 1) prev_reward.append(clipped_reward) # Update the state and global counters state = new_state T += 1 t += 1 counter += 1 # update episode's counter episode_reward += reward # Save model progress if T % checkpoint_interval < 200: T += 200 self.saver.save(self.sess, checkpoint_dir + "/breakout.ckpt", global_step=T) if terminal: R_t = 0 else: R_t = self.sess.run(self.state_value, feed_dict={self.input_state: [state]}) R_t = R_t[0][0] state_values.append(R_t) targets = np.zeros((t - t_start)) for i in range(t - t_start - 1, -1, -1): R_t = prev_reward[i] + gamma * R_t targets[i] = R_t # compute the advantage based on GAE # code from https://github.com/openai/universe-starter-agent delta = np.array(prev_reward) + gamma * np.array( state_values[1:]) - np.array(state_values[:-1]) advantage = scipy.signal.lfilter([1], [1, -gamma], delta[::-1], axis=0)[::-1] # update the global network cost, _ = self.sess.run( (self.loss, self.opt), feed_dict={ self.input_state: states, self.actions: actions, self.targets: targets, self.advantage: advantage }) total_cost += cost if terminal: terminal = False print "THREAD:", self.thread_id, "/ TIME", T, "/ REWARD", \ episode_reward, "/ COST", total_cost/counter episode_reward = 0 total_cost = 0 counter = 0 # Get initial game observation state = env.get_initial_state()
agent = 0

"""Lower Manhattan"""
# G = ox.load_graphml('lowermanhattan.graphml')
# G = ox.project_graph(G)
# fig, ax = ox.plot_graph(G, node_size=0, edge_linewidth=0.5)

"""San Francisco"""
# G = ox.load_graphml('sanfrancisco.graphml')
# G = ox.project_graph(G)
# fig, ax = ox.plot_graph(G, node_size=0, edge_linewidth=0.5)

"""Piedmont, California"""
G = ox.load_graphml('piedmont.graphml')
G = ox.project_graph(G)
fig, ax = ox.plot_graph(G, node_size=0, edge_linewidth=0.5)

# initialize the environment for the learning agent
env = Env(n=N, fig=fig, ax=ax, agent=agent, dt=dt, animate=False)

# initialize the Keras training model
model = Sequential()
model.add(layers.InputLayer(batch_input_shape=(1, 10)))
model.add(layers.Dense(10, activation='sigmoid'))
model.add(layers.Dense(2, activation='linear'))
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# now execute Q learning
y = 0.95
eps = 0.5
decay_factor = 0.999
num_episodes = 10
r_avg_list = []
        if state == [2, 2]:
            return 0.0
        return self.policy_table[state[0]][state[1]]

    def get_value(self, state):
        return round(self.value_table[state[0]][state[1]], 2)


def check_if_have_none_or_more_than_two_argument():
    return len(sys.argv) < 2 or len(sys.argv) > 2


def check_if_argument_value_invalid():
    return sys.argv[1] != 'i' and sys.argv[1] != 'ii' and sys.argv[1] != 'iii'


def exit_and_print_error():
    sys.exit('You should specify one argument: i, ii, or iii')


if __name__ == "__main__":
    if check_if_have_none_or_more_than_two_argument() or check_if_argument_value_invalid():
        exit_and_print_error()
    else:
        scenario = sys.argv[1]
        env = Env(scenario)
        policy_iteration = PolicyIteration(env, scenario)
        grid_world = GraphicDisplay(policy_iteration, scenario)
        grid_world.mainloop()
def evaluate(ast, env): """ Evaluates an abstract syntax tree in a given environment """ while True: # Infinite loop used for tail call optimization # First check if the AST is a macro, and if so expand it. ast = macroexpand(ast, env) # If the ast is not a list, call the mutually recursive eval_ast() function on it. if not isinstance(ast, List): return eval_ast(ast, env) # Return the AST as it is, if it's just an empty sequence, as there's nothing more to be done. if len(ast) == 0: return ast elif not isinstance(ast[0], List) and ast[0] in special_forms: # This following section deals with applying the logic of each special form. form, args = ast[0], ast[1:] # def! assigns a value to a key in the current environment. if form == Symbol("def!"): value = evaluate(args[1], env) env.define(args[0], value) return value # let* evaluates a form in a temporary environment. elif form == Symbol("let*"): var_list = args[0] if isinstance(var_list, (List, Vector)) and len(var_list) % 2 == 0: new_env = Env(outer=env) for i in range(0, len(var_list), 2): new_env.define(var_list[i], evaluate(var_list[i + 1], new_env)) env = new_env ast = args[1] continue # Tail call optimization else: raise SyntaxError("Invalid argument list supplied.") # do evaluates all the elements of the list, and returns the final evaluated one. This constructs provides a way to sequentially execute things. elif form == Symbol("do"): for expr in args[:-1]: last = evaluate(expr, env) ast = args[-1] continue # Tail call optimization # if works as you'd expect. To be noted that it only evaluates the needed argument (first argument if the condition is true, second otherwise), and is also tail call optimized. elif form == Symbol("if"): cond, true = args[0], args[1] false = args[2] if len(args) > 2 else None ev = evaluate(cond, env) if ev is False or ev == Nil(): if false is not None: ast = false continue else: return Nil() else: ast = true continue # fn* defines a lambda function, with the first argument being the parameter list, and the second being the function's body. elif form == Symbol("fn*"): params, body = args[0], args[1] def fn(*arguments): return evaluate(body, Env(env, params, arguments)) return Procedure(body, params, env, fn) # quote defers evaluation, just returning its argument as it is. elif form == Symbol('quote'): return args[0] # quasiquote enables a quoted list to have certain elements evaluted by the way of unquote and splice-unquote. elif form == Symbol('quasiquote'): ast = quasiquote(args[0]) continue # Tail call optimized # defmacro! defines a new macro in the current environment. elif form == Symbol("defmacro!"): value = evaluate(args[1], env) value.is_macro = True env.define(args[0], value) return value # macroexpand allows explicitly calling the macroexpand function. This can aid in debugging macros. elif form == Symbol("macroexpand"): return macroexpand(args[0], env) # End of special forms logic else: # First evaluate the list that holds the AST evaluated = eval_ast(ast, env) # Procedures primarily represent user defined functions if isinstance(evaluated[0], Procedure): proc = evaluated[0] ast = proc.ast env = proc.make_env(Env, evaluated[1:]) continue # Tail call optimization # Callables represent the built-in functions or fully evaluated procedures elif callable(evaluated[0]): return evaluated[0](*evaluated[1:]) # During evaluation, a Lisp list is expected to hold a function reference as its first element else: raise SyntaxError("First element of list is not a function.")
            actions = np.array(self.actions)
            action_prob = tf.reduce_sum(actions * policies, axis=1)
            cross_entropy = -tf.math.log(action_prob + 1e-5)
            loss = tf.reduce_sum(cross_entropy * discounted_rewards)
            entropy = -policies * tf.math.log(policies)

        # update the model in the direction that reduces the loss
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        self.states, self.actions, self.rewards = [], [], []
        return np.mean(entropy)


if __name__ == "__main__":
    # create the environment and the agent
    env = Env(render_speed=0.01)
    state_size = 15
    action_space = [0, 1, 2, 3, 4]
    action_size = len(action_space)

    agent = REINFORCEAgent(state_size, action_size)

    scores, episodes = [], []

    EPISODES = 200
    for e in range(EPISODES):
        done = False
        score = 0
        # reset the env
        state = env.reset()
        state = np.reshape(state, [1, state_size])
def fn(*arguments):
    return evaluate(body, Env(env, params, arguments))
        if state == [2, 2]:
            return []

        # calculate the q-value for every action and append the action(s)
        # with the maximum q-value to the action list
        for action in self.env.possible_actions:
            next_state = self.env.state_after_action(state, action)
            reward = self.env.get_reward(state, action)
            next_value = self.get_value(next_state)
            value = (reward + DISCOUNT_FACTOR * next_value)

            if value > max_value:
                action_list.clear()
                action_list.append(action)
                max_value = value
            elif value == max_value:
                action_list.append(action)

        return action_list

    def get_value(self, state):
        return round(self.value_table[state[0]][state[1]], 2)


if __name__ == "__main__":
    env = Env()
    value_iteration = ValueIteration(env)
    grid_world = GraphicDisplay(value_iteration)
    grid_world.mainloop()
def get_num_actions():
    env = gym.make(FLAGS.game)
    env = Env(env, FLAGS.width, FLAGS.height, FLAGS.history_length, FLAGS.game_type)
    num_actions = len(env.gym_actions)
    return num_actions