class Environment(threading.Thread):
    def __init__(self, brain, environment, eps_start=0, eps_end=0, eps_steps=0, render=False):
        threading.Thread.__init__(self)
        self.env = gym.make(environment)
        self.stop_signal = False
        self.render = render
        self.agent = Agent(brain, eps_start, eps_end, eps_steps)

    def runGame(self):
        R = 0
        s = utils.process(self.env.reset(), self.env.spec.id)
        n_a = 0
        old_a = None
        while True:
            time.sleep(THREAD_DELAY)
            if self.render:
                self.env.render()
            if n_a > MAX_REPEAT_ACTION:
                a = self.agent.act(s, old_a)
            else:
                a = self.agent.act(s)
            if a == old_a:
                n_a += 1
            else:
                n_a = 0
            old_a = a
            s_, r, done, info = self.env.step(a)
            s_ = utils.process(s_, self.env.spec.id)
            R += r
            self.agent.train(s, a, r, s_, done, R)
            s = s_
            if done or self.stop_signal:
                break
        print("Score:", R)

    def run(self):
        while not self.stop_signal:
            self.runGame()

    def stop(self):
        self.stop_signal = True
def initial_log(agent: Agent,
                env: ContinuousSimulation,
                writer: tf.summary.SummaryWriter = None,
                **kwargs) -> None:
    writer = optional_writer(writer)
    state = env.reset()
    context = env.unwrapped.state()
    agent.act(state, context, network='q', log_graph=True)
    agent.act(state, context, network='target', log_graph=True)
def masterprocess(self):
    env, state_size, action_size = self.env, self.state_size, self.action_size
    agent = Agent(state_size,
                  action_size,
                  number_of_agents=self.num_agents,
                  is_master=True,
                  args=self.args,
                  device="cpu")
    scores_deque = deque(maxlen=100)
    scores = []
    tqdm_bar = trange(1, self.n_trajectories, desc="Trajectories")
    episode_bar = tqdm(total=self.max_t)
    train_mode = True
    for i in tqdm_bar:
        state = env.reset(train_mode=train_mode)[self.brain_name].vector_observations
        score = 0
        for t in range(self.max_t):
            action, prob, q_value = agent.act(state[0])
            action2, prob2, q_value2 = agent.act(state[1])
            env_info = env.step([
                action.detach().cpu().data.numpy(),
                action2.detach().cpu().data.numpy()
            ])[self.brain_name]
            next_state, reward, done = (env_info.vector_observations,
                                        env_info.rewards,
                                        env_info.local_done)
            agent.step(action, reward, prob, done, q_value)
            state = next_state
            score += np.mean(reward)
            episode_bar.set_description(
                "Time Step T: {}, Score: {:.2f}".format(t, score))
            episode_bar.update()
            # if done:
            #     break
        episode_bar.reset()
        tqdm_bar.set_description("Episode: {}, Score: {:.2f}".format(i, score))
        scores_deque.append(score)
        scores.append(score)
        # train_mode = score < 10.0
        if i % 100 == 0:
            torch.save(agent.TwoHeadModel.state_dict(), 'checkpoint.pth')
        if np.mean(scores_deque) > self.termination_threshold:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i - 100, np.mean(scores_deque)))
            break
    self.scores = scores
    env.close()
def test_agent_act(self):
    """Test how an agent can act"""
    agent = Agent(3, 5, 1)
    states = np.array([[1.0, 2.0, 3.0], [0.3, 2.0, 1.0]])
    actions1 = agent.act(states, False)
    self.assertEqual((2, 5), actions1.shape)
    actions2 = agent.act(states, False)
    self.assertTrue(np.allclose(actions2, actions1))
    actions3 = agent.act(states, True)
    self.assertFalse(np.allclose(actions2, actions3))
def main(train, action_bias=0):
    environment = Environment(tickers,
                              initial_deposit=100000,
                              from_date=datetime(2004, 1, 1),
                              to_date=datetime(2010, 1, 1),
                              min_days_to_hold=min_days_to_hold,
                              max_days_to_hold=max_days_to_hold)
    agent = Agent(environment.state_size(),
                  environment.action_size(),
                  epochs=epochs,
                  gamma=0.2,
                  replay_buffer=64,
                  memory_queue_length=32)
    if train:
        for i in range(epochs):
            state = environment.reset()
            done = False
            while not done:
                action = agent.act(state)
                next_state, reward, done = environment.step(action)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
            agent.decrease_epsilon()
            LOGGER.info('Balance for current game: %d', environment.deposit)
            pprint(environment.actions)
        agent.save(environment.main_ticker + '.h5')
    else:
        agent.load(environment.main_ticker + '.h5')

    # Test on!
    test_environment = Environment(tickers,
                                   initial_deposit=100000,
                                   from_date=datetime(2010, 1, 1),
                                   to_date=datetime(2013, 1, 1),
                                   min_days_to_hold=min_days_to_hold,
                                   max_days_to_hold=max_days_to_hold,
                                   scaler=environment.scaler)
    state = test_environment.reset()
    done = False
    while not done:
        action = agent.act(state, False, action_bias)
        next_state, _, done = test_environment.step(action)
        state = next_state
    print_results_on_test_environment(test_environment)
    export_to_file(test_environment.actions)
class CartPole:
    def __init__(self):
        self.replay_batch_size = 500
        self.training_episodes = 500
        self.show_episodes = 3
        self.env = gym.make("CartPole-v1")
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.agent = Agent(self.state_size, self.action_size)

    def train(self):
        for episode in range(self.training_episodes):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            score = 0
            while not done:
                action = self.agent.act(state)
                next_state, reward, done, info = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                self.agent.remember(state, action, reward, next_state, done)
                state = next_state
                score += 1
            print("Episode #{} Score: {}".format(episode, score))
            self.agent.replay(self.replay_batch_size)

    def show(self):
        self.agent = Agent(self.state_size, self.action_size)
        self.agent.load_model()
        self.env = gym.wrappers.Monitor(self.env, 'video')
        for index_episode in range(self.show_episodes):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            score = 0
            while not done:
                self.env.render()
                time.sleep(0.01)
                action = self.agent.act(state)
                state, reward, done, info = self.env.step(action)
                state = np.reshape(state, [1, self.state_size])
                score += 1
            print("The score was: {}".format(score))
def turn(agent: Agent, agent_policy_step: str, foe: Foe, rand="random") -> (str, str):
    # Not currently very extensible. Oh well.
    agent_action = agent.act(agent_policy_step)
    new_states = (agent_action.resolve_action(foe)
                  if rand == "random" else agent_action.action_expectation(foe))
    if agent_action.target_id == "self":
        agent.update_states(new_states["target"])
    else:
        foe.update_states(new_states["target"])

    foe_action = foe.act(rand)
    new_states = (foe_action.resolve_action(agent)
                  if rand == "random" else foe_action.action_expectation(agent))
    if foe_action.target_id == "self":
        foe.update_states(new_states["target"])
    else:
        agent.update_states(new_states["target"])

    foe.decrement_cooldowns()
    foe_reaction = foe.react()
    return (agent_action, foe_reaction)
def fitnessFunction2(genotype):
    """Second version of the fitness function.

    This one optimizes for the greatest delay to action."""
    agent = Agent(genotype, Size, WeightRange, BiasRange, TimeConstMin,
                  TimeConstMax, InputWeightRange, Dt)
    # Initialized to zeros for the sake of fitness evaluation in the event no action is taken
    first_actions = np.zeros((len(Stimuli), Trials))
    for s in range(len(Stimuli)):  # Run each stimulus condition
        for t in range(Trials):  # Run all trials of the task
            actions = np.empty(Duration)
            agent.sense(Stimuli[s])  # Initial stimulus
            for step in range(Duration):  # Runtime
                agent.think()
                actions[step] = agent.act()  # Record agent action
                # Under the current experimental design, no stimulus is available
                # for the rest of the trial
                agent.sense(0)
            acted = np.where(actions == 1)
            # The 0 index is because we need the array's first size dimension
            if len(acted[0]) > 0:
                # Record when the agent's first action was
                first_actions[s, t] = acted[0][0]
            else:
                first_actions[s, t] = 0  # Not acting at all is worst fitness
    return np.average(first_actions) / Duration
def run_simulation(title="", num_plants=20, episodes=DEFAULT_EPISODES,
                   episode_length=DEFAULT_EPISODE_LENGTH):
    print("START SIMULATION")
    env = Environment(num_plants=num_plants)
    state_size = env.observation_space
    action_size = env.action_space
    agent = Agent(state_size, action_size)
    batch_size = 32

    for e in range(episodes):
        state = env.reset()
        state = numpy.reshape(state, [1, state_size])
        for time in range(episode_length):
            pour_amount = agent.act(state)
            next_state, reward, done = env.step(pour_amount)
            next_state = numpy.reshape(next_state, [1, state_size])
            agent.remember(state, pour_amount, reward, next_state, done)
            state = next_state
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        print("Episode {} of {} done...\n".format(e, episodes))

    generate_graphs(agent, env, title, num_plants, episodes, episode_length)
    print("END SIMULATION")
def main(env_name, monitor=True, load=False, seed=0, gpu=-1):
    env = gym.make(env_name)
    view_path = "./video/" + env_name
    model_path = "./model/" + env_name + "_"

    n_st = env.observation_space.shape[0]
    n_act = env.action_space.n

    agent = Agent(n_act, seed, gpu)
    if load:
        agent.load_model(model_path)
    if monitor:
        env.monitor.start(view_path, video_callable=None, force=True, seed=seed)

    for i_episode in range(10000):
        observation = env.reset()
        agent.reset_state(observation)
        ep_end = False
        q_list = []
        r_list = []
        while not ep_end:
            action = agent.act()
            observation, reward, ep_end, _ = env.step(action)
            agent.update_experience(observation, action, reward, ep_end)
            agent.train()
            q_list.append(agent.Q)
            r_list.append(reward)
            if ep_end:
                agent.save_model(model_path)
                break
        print('%i\t%i\t%f\t%i\t%f' % (i_episode, agent.step, agent.eps,
                                      sum(r_list), sum(q_list) / float(len(q_list))))

    if monitor:
        env.monitor.close()
def run_agent(num_episodes=1):
    env = UnityEnvironment(file_name="env/Reacher20.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    state_size = env_info.vector_observations.shape[1]
    num_agents = len(env_info.agents)

    agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)
    agent.actor_local.load_state_dict(
        torch.load("model/checkpoint_actor.pth", map_location='cpu'))
    agent.critic_local.load_state_dict(
        torch.load("model/checkpoint_critic.pth", map_location='cpu'))

    for i in range(num_episodes):
        scores = np.zeros(num_agents)
        env_info = env.reset(train_mode=False)[brain_name]
        states = env_info.vector_observations
        while True:
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            scores += env_info.rewards
            states = next_states
            if np.any(env_info.local_done):
                break
        print(f"{i + 1} episode, averaged score: {np.mean(scores)}")
def ddpg():
    scores = []
    env = gym.make(ENV_NAME)
    agent = Agent(state_size=2, action_size=1)
    for i_episode in range(n_episodes):
        score = 0
        done = False
        state = env.reset()
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            modified_reward = reward + \
                POTENTIAL_FUNCTION_COEF * (GAMMA * abs(next_state[1]) - abs(state[1]))
            agent.step(state, action, modified_reward, next_state, done)
            score += reward
            state = next_state
            if done:
                break
        scores.append(score)
        if i_episode % rate_of_print == 0:
            print("Episode: {}. Score: {}, Done: {}".format(
                i_episode / rate_of_print, score, done))
    return agent, scores
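# The modified_reward line in ddpg() above is potential-based reward shaping,
# F(s, s') = gamma * phi(s') - phi(s), with the potential phi(s) proportional to |s[1]|
# (the velocity component in MountainCar-style observations). A small sketch under that
# assumption, with illustrative constant values; factoring the shaping term out this way
# is algebraically identical to the inline expression and leaves the optimal policy unchanged.
GAMMA = 0.99                    # assumed discount factor
POTENTIAL_FUNCTION_COEF = 10.0  # assumed shaping scale


def potential(state):
    """Potential phi(s): reward states with higher absolute velocity."""
    return POTENTIAL_FUNCTION_COEF * abs(state[1])


def shaped_reward(reward, state, next_state):
    """Return r + gamma * phi(s') - phi(s)."""
    return reward + GAMMA * potential(next_state) - potential(state)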
def main():
    window_size = 5
    episode_count = 10
    stock_name = "GSPC_10"
    batch_size = 3

    agent = Agent(window_size)
    market = Market(window_size=window_size, stock_name=stock_name)

    start_time = time.time()
    for e in range(episode_count + 1):
        print("Episode {0}/{1}".format(e, episode_count))
        agent.reset()
        state, price_data = market.reset()

        for t in range(market.last_index):
            action, bought_price = agent.act(state, price_data)
            next_state, next_price_data, reward, done = market.get_next_state_reward(
                action, bought_price)
            agent.memory.append([state, action, reward, next_state, done])
            if len(agent.memory) > batch_size:
                agent.experience_replay(batch_size)
            state = next_state
            price_data = next_price_data
            if done:
                print("----------------------")
                print("Total Profit: {0}".format(agent.get_total_profit()))
                print("----------------------")

        if e % 10 == 0:
            if not os.path.exists("models"):
                os.mkdir("models")
            agent.model.save("models/model_ep" + str(e))

    end_time = time.time()
    training_time = end_time - start_time
    print("Training time {0}".format(training_time))
def main():
    stock_name = "GSPC_2011-03"
    model_name = "model_ep10"
    model = load_model("models/" + model_name)
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    state, price_data = market.reset()
    for t in range(market.last_data_index):
        action, bought_price = agent.act(state, price_data)
        next_state, next_price_data, reward, done = market.get_next_state_reward(
            action, bought_price)
        state = next_state
        price_data = next_price_data
        if done:
            print("----------------------------")
            print("{0} Total profit: {1}".format(stock_name, agent.get_total_profit()))
            print("----------------------------")
    plot_action_profit(market.data, agent.action_history, agent.get_total_profit())
def run(episode=100000000, is_training=True):
    env = gym.make('FlappyBird-v0')
    agent = Agent(env)
    # agent.load_net('./tb/checkpoints/2190000')

    for e in range(episode):
        ob = env.reset()
        ob = agent.preproc(ob)
        done = False
        score = step = 0
        start_time = time.time()
        while not done:
            if is_training is False:
                env.render()
            ac = agent.act(ob, is_training)
            next_ob, rew, done, _ = env.step(ac)
            # if rew == 0:
            #     rew = 0.1
            if is_training:
                ob = agent.memory(ob, ac, next_ob, rew, done)
            else:
                ob = agent.preproc(next_ob)
            score += rew
            step += 1
        agent.get_score(score)
        print('episode: {} | score: {} | fps: {}'.format(
            e, score, step / (time.time() - start_time)))
def run():
    student = Agent(17, 16, model_name=name)
    game_n = 0
    while True:
        games = 16
        for _ in range(games):
            board = TTT4()
            end = False
            game_n += 1
            turn = 0
            while not end:
                player_turn = int(board.player)
                current_board = board.board[:]
                state = get_state(current_board, player_turn)
                if turn == 0:
                    action = game_n % 16
                    ret = board.play(action)
                else:
                    action = student.act(np.array(state))
                    ret = board.play(action)
                events = get_events(state, current_board, player_turn, student.model)
                student.memory.append(events)
                reward = events[action][2]
                print(f'{game_n}: {action} {ret} reward: {reward}')
                board.print_board()
                if 'invalid' in ret:
                    break
                if 'win' in ret or 'draw' in ret:
                    end = True
                turn += 1
        student.exp_replay()
        if game_n % 160 == 0:
            student.model.save(f'keras_model/{name}_{str(int(game_n))}')
def main():
    config = Config(
        n_episodes=10000,
        max_episode_length=200,
        n_actions=2,
        n_inp_dim=4,
        n_hidden_dim=64,
        batch_size=1,
        gamma=0.99,
    )
    env = gym.make('CartPole-v0').unwrapped
    memory = Memory(config)
    agent = Agent(config)

    for _ in range(config.n_episodes):
        episode: List[Step] = []
        s = env.reset()
        final_v = 0
        for _ in range(config.max_episode_length):
            a = agent.act(s)
            s2, r, t, _ = env.step(a)
            episode.append(Step(state=s, action=a, reward=r, terminal=t))
            s = s2
            if t:
                break
        else:  # If no break
            final_v = agent.q(s).max()
        memory.store(episode, final_v)
        print(f"Reward: {sum(step.reward for step in episode)}")
        # Always train on last episode:
        agent.train(memory.episodes[-1:])
def run(env_file, model_file, num_episodes=5):
    env = UnityEnvironment(file_name=env_file)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    action_size = brain.vector_action_space_size
    env_info = env.reset(train_mode=False)[brain_name]
    state = env_info.vector_observations[0]
    state_shape = state.shape

    agent = Agent(state_shape=state_shape, action_size=action_size, seed=0)
    agent.qnetwork_local.load_state_dict(torch.load(model_file))

    for i in range(num_episodes):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        while True:
            action = agent.act(state)
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            score += reward
            state = next_state
            if done:
                break
        print("Score: {}".format(score))
    env.close()
class CartPole:
    def __init__(self):
        # Number of steps we select to learn from while replaying from memory
        self.sample_batch_size = 32
        self.episodes = 10000
        self.env = gym.make('CartPole-v1')

        # Configure model based on the environment
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n

        # Initialize the Agent
        self.agent = Agent(self.state_size, self.action_size)

    def run(self, render: bool = False):
        try:
            for index_episode in range(self.episodes):
                state = self.env.reset()
                state = np.reshape(state, [1, self.state_size])
                done = False
                index = 0
                while not done:
                    if render:
                        self.env.render()
                    action = self.agent.act(state)
                    next_state, reward, done, _ = self.env.step(action)
                    next_state = np.reshape(next_state, [1, self.state_size])
                    self.agent.remember(state, action, reward, next_state, done)
                    state = next_state
                    index += 1
                print("Episode {}# Score: {}".format(index_episode, index + 1))
                self.agent.replay(self.sample_batch_size)
        finally:
            self.agent.save_model()
def main():
    now = time.localtime()
    dir_name = '{0:04d}-{1:02d}-{2:02d}_{3:02d}-{4:02d}'.format(
        now.tm_year, now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min)
    summary = SummaryWriter(os.path.join(ROOT, 'logs/{}'.format(dir_name)))
    output_dir = os.path.join(ROOT, 'trained_models/{}'.format(dir_name))
    os.makedirs(output_dir, exist_ok=True)

    env = gym.make('Walker2DBulletEnv-v0')
    env.seed(0)
    env.render()
    agent = Agent()

    # seed = 0
    # repeat = 10000
    best_reward = 0.0
    seed_ = 0
    while True:  # for seed_ in range(seed, seed + repeat):
        observation = env.reset()
        agent.ounoise.reset()
        done = False
        actor_loss = 0.0
        critic_loss = 0.0
        reward_sum = 0.0
        agent.decay_epsilon()
        step = 0
        while not done:
            action = agent.act(observation, is_training=True)
            next_observation, reward, done, _ = env.step(action)
            agent.push_memory(observation, action, reward, next_observation, done)
            loss_a, loss_c = agent.train()
            actor_loss += loss_a
            critic_loss += loss_c
            reward_sum += reward
            observation = next_observation
            step += 1

        summary.add_scalar('actor/model_loss', actor_loss / step, seed_)
        summary.add_scalar('critic/model_loss', critic_loss / step, seed_)
        summary.add_scalar('reward', reward_sum, seed_)

        if reward_sum >= best_reward:
            torch.save(agent.actor.model.state_dict(), '{}/actor.pkl'.format(output_dir))
            torch.save(agent.critic.model.state_dict(), '{}/critic.pkl'.format(output_dir))
            torch.save(agent.actor.target_model.state_dict(), '{}/actor_t.pkl'.format(output_dir))
            torch.save(agent.critic.target_model.state_dict(), '{}/critic_t.pkl'.format(output_dir))
            with open(os.path.join(ROOT, 'logs/{}.txt'.format(dir_name)), 'a') as f:
                f.write("(Episode {}: Reward {}) The best model parameters were saved.\n".format(
                    seed_, reward_sum))
            best_reward = reward_sum
        seed_ += 1
def main():
    stock_name = "GSPC_2011-03"
    model_name = "model_ep10"
    model = load_model("models/" + model_name)
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    state, price_data = market.reset()  # ToDo: Start from an initial state
    for t in range(market.last_data_index):
        # ToDo: Get action for the current state
        action, bought_price = agent.act(state, price_data)
        # Check the action to get reward and observe next state
        next_state, next_price_data, reward, done = market.get_next_state_reward(
            action, bought_price)  # ToDo: get next state
        state = next_state
        price_data = next_price_data
        if done:
            print("--------------------------------")
            print("{0} Total Profit: {1}".format(stock_name, agent.get_total_profit()))
            print("--------------------------------")
    plot_action_profit(market.data, agent.action_history, agent.get_total_profit())
def main_eval():
    stock_name = "BABA"
    model_name = "model_ep0"
    model = load_model("models/" + model_name)
    window_size = model.layers[0].input.shape.as_list()[1]

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    state, price_data, date_data = market.reset()
    date = []
    for t in range(market.last_data_index):
        action, bought_price = agent.act(state, price_data, date_data)
        next_state, next_price_data, next_date_data, reward, done = market.get_next_state_reward(
            action, bought_price)
        state = next_state
        price_data = next_price_data
        date_data = next_date_data
        if done:
            print("--------------------")
            print("{0} Total profit: {1}".format(stock_name, agent.get_total_profit()))
            print("--------------------")
    plot_action_profit(market.data, agent.action_history, agent.get_total_profit())
    return agent.book, agent.initial_investment, agent.dates
def main():
    stock_name = "GSPC_2011-03"
    model_name = "model_ep30"
    window_size = 5

    agent = Agent(window_size, True, model_name)
    market = Market(window_size, stock_name)

    state, price_data = market.reset()  # Start from an initial state
    for t in range(market.last_data_index):
        # Get action for the current state
        action, bought_price = agent.act(state, price_data)
        # Check the action to get reward and observe next state
        next_state, next_price_data, reward, done = market.get_next_state_reward(
            action, bought_price)
        state = next_state
        price_data = next_price_data
        if done:
            print("--------------------------------")
            print("{0} Total Profit: {1}".format(stock_name, agent.get_total_profit()))
            print("--------------------------------")
    # toDo: change data
    plot_action_profit(market.data["Close"].values, agent.action_history,
                       agent.get_total_profit())
def main():
    logger = Logger()

    # ------------------------------------ENVIRONMENT---------------------------------------------
    a = Workspace(conversion_a)
    b = Workspace(conversion_b)
    workspaces = []
    workspaces.append(a)
    workspaces.append(b)
    env = Environment(workspaces)
    # ---------------------------------------------------------------------------------------------

    agent = Agent().build_agent(len(workspaces))
    sess = agent.get_session()
    logger.create_dataholder("Target")
    logger.create_dataholder("Workspace_A")
    logger.create_dataholder("Workspace_B")
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

    for i in range(config.nb_timesteps):
        Logger.write("INFO", "TIMESTEP " + str(i))
        logger.add_datapoint("Workspace_A", i, distribution_a(i))
        logger.add_datapoint("Workspace_B", i, distribution_b(i))

        actions_tensor = np.zeros((config.training_size, 1))
        rewards_tensor = np.zeros((config.training_size, 1))
        for j in range(config.training_size):
            action_elem = np.zeros(1)
            reward_elem = np.zeros(1)
            action_elem = agent.act()
            reward_elem = env.act(action_elem, i)
            actions_tensor[j][0] = action_elem
            rewards_tensor[j][0] = reward_elem

        for j in range(config.nb_batches):
            action_batch, reward_batch = utils.shuffle_batch(actions_tensor, rewards_tensor)
            loss_value, upd, resp, ww = agent.train(action_batch, reward_batch)
            Logger.write("INFO", str(loss_value))
            Logger.write("INFO", str(ww))

        total_reward = np.sum(rewards_tensor)
        reward_mean = float(total_reward) / float(config.training_size)
        Logger.write("INFO", "Total Reward of timestep " + str(i) + ': ' + str(reward_mean))
        logger.add_datapoint("Target", i, 100.0 * reward_mean)

    logger.init_plot()
    logger.plot("Target", 'o')
    logger.plot("Workspace_A", linestyle=None)
    logger.plot("Workspace_B", linestyle=None)
    logger.show()
def dqn(n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []  # list containing scores from each episode
    agent = Agent(state_size=37, action_size=4, seed=0)
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start  # initialize epsilon
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]  # print(state.shape)  # get the current state
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            env_info = env.step(action)[brain_name]  # send the action to the environment
            next_state = env_info.vector_observations[0]  # get the next state
            reward = env_info.rewards[0]  # get the reward
            done = env_info.local_done[0]  # see if episode has finished
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)))
        if np.mean(scores_window) >= 15.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                i_episode - 100, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), './ckpt/checkpoint.pth')
            print('model saved')
            break
    torch.save(agent.qnetwork_local.state_dict(), './checkpoint.pth')
    env.close()
    return scores
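# The loop above calls agent.act(state, eps) and expects an integer action chosen
# epsilon-greedily from a local Q-network. A minimal sketch of such a method, assuming a
# PyTorch qnetwork_local that maps a state vector to action values; class and attribute
# names here are illustrative assumptions, not the original implementation.
import random

import numpy as np
import torch


class EpsilonGreedyAgent:
    """Minimal sketch: epsilon-greedy action selection over a Q-network."""

    def __init__(self, qnetwork_local, action_size, device="cpu"):
        self.qnetwork_local = qnetwork_local  # torch.nn.Module: state -> action values
        self.action_size = action_size
        self.device = device

    def act(self, state, eps=0.0):
        """Return an int action for `state`, exploring with probability `eps`."""
        state_t = torch.from_numpy(
            np.asarray(state, dtype=np.float32)).unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state_t)
        self.qnetwork_local.train()
        if random.random() > eps:
            # Greedy action from the current value estimates
            return int(np.argmax(action_values.cpu().data.numpy()))
        # Uniform random exploration
        return int(random.choice(np.arange(self.action_size)))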
class PB18151853(RL_alg):
    def __init__(self, ob_space, ac_space):
        super().__init__()
        assert isinstance(ac_space, Discrete)

        self.team = ['PB17121707', 'PB17121732', 'PB18151853']  # team members' student IDs
        # self.config = get_params_from_file('src.alg.PB00000000.rl_configs', params_name='params')  # pass in parameters
        self.ac_space = ac_space
        self.state_dim = ob_space.shape
        print(self.state_dim)
        self.action_dim = ac_space.n

        # ----------------------------------------------------------
        # initialize implemented DQN models and weights
        self.agent = Agent()
        # details about the API of Model are in pytrace.nn.Model
        self.agent.qnetwork_local.load_seq_list(
            pytrace.load(join(root_path, './riverraid/best_list.pth')))
        pytrace.prYellow(
            f"load weights from: {join(root_path, './riverraid/best_list.pth')}")
        self.state = np.zeros([4, 84, 84])
        # self.state = self.WarpFrame(self.state)
        # self.state = np.stack([self.state] * 4, axis=0)
        # self.state = deque([np.zeros([84, 84, 4])], maxlen=4)

    def step(self, state):
        self.state = self.FrameStack(state, self.state)
        action = self.agent.act(self.state)
        return action

    def explore(self, obs):
        raise NotImplementedError

    def test(self):
        print('??')

    def WarpFrame(self, obs):
        """
        :param obs: The raw observation returned by env; it should be a (210 * 160 * 3) RGB frame
        :return: An (84 * 84) grayscale frame normalized to [0, 1]
        """
        frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA)
        # return frame[:, :, None]
        return frame / 255.0

    def FrameStack(self, new_obs, obs):
        """
        :param new_obs: A raw observation returned by env; it should be a (210 * 160 * 3) RGB frame
        :param obs: The stack of the past 4 (84 * 84) grayscale frames
        :return: A new stack of the past 4 (84 * 84) grayscale frames
        """
        new_obs = self.WarpFrame(new_obs)
        obs[0:3, :, :] = obs[1:, :, :]
        obs[3, :, :] = new_obs
        return obs
def train(par):
    """
    There are other hyperparameters, but I'll just look at these for now.
    """
    # Environment
    seed = 0
    env = gym.make('CartPole-v0')
    env.seed(seed)  # for comparison
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n

    # Agent
    gamma, lr, tau = par
    agent = Agent(num_states, num_actions, lr, gamma, seed_num=seed)
    agent.memory_size = 10**4
    agent.batchsize = 32
    learning_start = 2000
    agent.tau = tau

    # Train
    EPISODES = 500
    scores = []
    t1 = time.time()
    for e in range(1, EPISODES + 1):
        state = env.reset()
        reward_sum = 0
        done = False
        steps = 0
        actions = []
        while not done:
            # env.render()
            state = np.reshape(state, [1, num_states])  # reshape for keras
            action_onehot = agent.act(state)
            action_scalar = np.dot(action_onehot, range(num_actions))
            actions.append(action_scalar)
            next_state, reward, done, _ = env.step(action_scalar)
            reward_sum += reward
            agent.remember(state[0], action_onehot, reward, next_state, done)
            state = next_state
            if len(agent.memory) > learning_start:
                agent.train_models()
                agent.actor.gumbel_temperature = max(
                    0.999 * agent.actor.gumbel_temperature, 0.1)
            steps += 1

        # Learn & print results
        scores.append(reward_sum)
        # agent.save_target_weights()

    plt.plot(scores)
    figname = 'gamma_' + str(gamma) + '_lr_' + str(lr) + '_tau_' + str(tau) + '.png'
    plt.title('gamma_' + str(gamma) + '_lr_' + str(lr) + '_tau_' + str(tau))
    plt.savefig('figs/' + figname)
def create_dataset(size=5000):
    start_time = time.time()
    env = ExternalEnviroment()
    agent = Agent(n_irrelevant_actions=0)
    dataset = []
    samples_counter = 0
    max_actions = 50
    counter_per_class = np.zeros(len(AVAILABLE_ACTIONS[:-1]))
    max_per_class = int(size / (len(counter_per_class) + 1))
    counter_zero = 0
    max_zero = max_per_class
    zeros = np.zeros(140)
    g_i = 0
    while samples_counter < int(size):
        if g_i % 5 == 0:
            agent = Agent(n_irrelevant_actions=0)
        print(g_i, samples_counter, counter_per_class, counter_zero)
        if g_i % 25 == 0:
            env.reset()
        else:
            env.reset_random()
        n_tried_actions = 0
        while n_tried_actions < max_actions:
            executed = agent.act(env)
            n_tried_actions += 1
            if executed:
                inp = calc_mirror_system_input(agent.current_state,
                                               agent.next_state, agent.hunger)
                action_i = np.nonzero(agent.training_signal)
                if counter_per_class[action_i] < max_per_class and \
                        ((not np.all(inp[:-1] == 0)) or inp[-1] == 1):
                    dataset.append([inp, agent.training_signal])
                    samples_counter += 1
                    counter_per_class[action_i] += 1
                elif counter_zero < max_zero:
                    dataset.append([np.append(zeros, 0),
                                    np.zeros(len(AVAILABLE_ACTIONS[:-1]))])
                    samples_counter += 1
                    counter_zero += 1
                if agent.hunger == 0:
                    agent.hunger = 1
                    env.reset()
                    break
        g_i += 1
    for i in range(max_zero * 9):
        dataset.append([np.append(zeros, 0), np.zeros(len(AVAILABLE_ACTIONS[:-1]))])
    print("Dataset creation time: {} sec.".format(time.time() - start_time))
    dataset = np.asarray(dataset)
    inconsistent = check_dataset(dataset)
    print("Found %d inconsistent data" % len(inconsistent))
    # Delete all inconsistent rows in a single call so earlier deletions do not shift indices
    dataset = np.delete(dataset, inconsistent, axis=0)
    return np.asarray(dataset)
def train(stock_name, window_size, episode_count):
    agent = Agent(window_size)
    data = getStockDataVec(stock_name)
    l = len(data) - 1
    batch_size = 32
    punishment = -500

    for e in range(episode_count + 1):
        print("Episode " + str(e) + "/" + str(episode_count))
        state = getState(data, 0, window_size + 1)

        total_profit = 0
        agent.inventory = []
        history = []

        for t in range(l):
            action = agent.act(state)
            history.append(action)

            # sit
            next_state = getState(data, t + 1, window_size + 1)
            reward = 0

            if action == 0 and len(history) >= 50 and history[-50:] == [0] * 50:
                print("PUNISHED: 50 consecutive snoozes")
                reward = punishment
            elif action == 1:  # buy
                if len(history) >= 20 and history[-20:] == [1] * 20:
                    reward = punishment
                    print("PUNISHED: 20 consecutive buys")
                else:
                    agent.inventory.append(data[t])
                    print("Buy: " + formatPrice(data[t]))
            elif action == 2 and len(agent.inventory) > 0:  # sell
                bought_price = agent.inventory.pop(0)
                reward = (data[t] - bought_price) * 100
                total_profit += data[t] - bought_price
                print("Sell: " + formatPrice(data[t]) + " | Profit: "
                      + formatPrice(data[t] - bought_price))

            done = True if t == l - 1 else False
            agent.memory.append((state, action, reward, next_state, done))
            state = next_state

            if done:
                print("--------------------------------")
                print("Total Profit: " + formatPrice(total_profit))
                print("--------------------------------")

            if len(agent.memory) > batch_size:
                agent.expReplay(batch_size)

        if e % 10 == 0:
            agent.model.save("../models/SR_models/model_ep" + str(e))
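# The training loop above relies on the getStockDataVec/getState helpers that this style of
# Q-trader script typically uses: the state is a window of sigmoid-squashed one-step price
# differences over a list of closing prices. A minimal sketch under that assumption; it is
# not necessarily this repo's exact implementation.
import math

import numpy as np


def sigmoid(x):
    """Logistic squashing of a price difference into (0, 1)."""
    return 1.0 / (1.0 + math.exp(-x))


def getState(data, t, n):
    """n-day state ending at index t: sigmoids of consecutive price differences."""
    d = t - n + 1
    # Pad with the first price when the window starts before the series does
    block = data[d:t + 1] if d >= 0 else -d * [data[0]] + data[0:t + 1]
    return np.array([[sigmoid(block[i + 1] - block[i]) for i in range(n - 1)]])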
def train(pars):
    """
    There are other hyperparameters, but I'll just look at these for now.
    """
    alpha, tau, batchsize = pars

    # Environment
    env = gym.make('CartPole-v0')
    env.seed(0)
    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    # Agent
    lr, gamma = 3 * 10**-4, 0.99
    clipnorm, verbose = False, False
    agent = Agent(input_dim, output_dim, lr, gamma, tau, alpha, clipnorm, verbose)
    agent.memory_size = batchsize
    agent.batchsize = batchsize

    # Train
    EPISODES = 10**4
    scores = []
    t1 = time.time()
    for e in range(1, EPISODES + 1):
        state = env.reset()
        state = agent.make_tensor(state)
        reward_sum = 0
        done = False
        while not done:
            # Do main step
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward_sum += reward
            next_state = agent.make_tensor(next_state)
            agent.remember(state[0], action, reward, next_state[0], done)  # want to remember state as a vec
            state = next_state
            if e >= 2:
                agent.learn()

        # Print results
        scores.append(reward_sum)

    plt.figure()
    string = 'alpha_' + str(alpha) + '_tau_' + str(tau) + '_batchsize_' + str(batchsize)
    plt.title(string)
    plt.plot(scores, alpha=0.5)
    plt.plot(agent.window_average(scores, 100), 'r-')
    plt.savefig('figs/' + string + '.png')

    t2 = time.time()
    print('took ' + str((t2 - t1) / 60.0 / 60.0) + ' hours')
    return
def ddpg(n_episodes=250, max_t=1000, print_every=25):
    env = UnityEnvironment(file_name="env/Reacher20.app")
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]
    action_size = brain.vector_action_space_size
    state_size = env_info.vector_observations.shape[1]
    num_agents = len(env_info.agents)

    agent = Agent(state_size=state_size, action_size=action_size, random_seed=2)
    scores_deque = deque(maxlen=print_every)
    scores = []

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        agent.reset()
        score = np.zeros(num_agents)
        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for i in range(num_agents):
                agent.step(states[i], actions[i], rewards[i], next_states[i], dones[i], t)
            states = next_states
            score += rewards
            if np.any(dones):
                break
        scores_deque.append(score.mean())
        scores.append(score.mean())
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(
            i_episode, np.mean(scores_deque)), end="")
        if i_episode % print_every == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) > 30:
            print("Model trained successfully")
            torch.save(agent.actor_local.state_dict(), "model/checkpoint_actor.pth")
            torch.save(agent.critic_local.state_dict(), "model/checkpoint_critic.pth")
            break
    return scores
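# The DDPG training loop above and the earlier run_agent evaluation loop both call
# agent.act(states) and expect continuous actions for all agents, already clipped to the
# valid range. A minimal sketch of such a method, assuming a PyTorch actor_local network
# and an additive noise process; the names and the [-1, 1] bound are illustrative
# assumptions, not the original implementation.
import numpy as np
import torch


class DDPGActSketch:
    """Minimal sketch: deterministic policy with optional exploration noise."""

    def __init__(self, actor_local, noise=None, device="cpu"):
        self.actor_local = actor_local  # torch.nn.Module: states -> actions in [-1, 1]
        self.noise = noise              # e.g. an Ornstein-Uhlenbeck process with .sample()
        self.device = device

    def act(self, states, add_noise=True):
        """Return clipped actions for a batch of states (one row per agent)."""
        states_t = torch.from_numpy(
            np.asarray(states, dtype=np.float32)).to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states_t).cpu().data.numpy()
        self.actor_local.train()
        if add_noise and self.noise is not None:
            actions += self.noise.sample()
        # Keep actions inside the environment's valid range
        return np.clip(actions, -1.0, 1.0)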