def eval_agent(point):
    # TODO: (Federico) This is a mess, find a better way to convert parameters
    # point = [df, lr, memsize, updatefreq, nlayers, nunits]
    model = get_model(point[4], point[5], point[1])
    parameters = {
        "discount_factor": point[0],
        "learning_rate": point[1],
        "memory_size": point[2],
        "target_update_frequency": point[3],
        "train_start": 1000,
        "epsilon": 0.02,
        "batch_size": 32,
        "env": env,
        "full_model": model,
        # TODO: merge full_model and model into a single function that also
        # takes the number of layers. I didn't do it because there may be
        # lots of usages in the code I'm not aware of.
        "model": None
    }
    print('Evaluating at ' + str(parameters) + ', nlayers: ' + str(point[4]) +
          ', nunits: ' + str(point[5]))
    agent = DQNAgent(parameters)
    agent.train(generate_experiment_name(parameters), episode_num=1000,
                solved_score=195)
    res = agent.test(300)
    print('Evaluation result: ' + str(res))
    return res
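# One way to resolve the TODO above: a single factory that builds the complete
# model and also takes the number of layers. This is a hedged sketch assuming a
# Keras-style API; the real get_model in this codebase may look different.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

def build_full_model(n_layers, n_units, learning_rate, state_size, action_size):
    model = Sequential()
    model.add(Dense(n_units, activation='relu', input_dim=state_size))
    for _ in range(n_layers - 1):  # remaining hidden layers
        model.add(Dense(n_units, activation='relu'))
    model.add(Dense(action_size, activation='linear'))  # one Q-value per action
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model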
def __init__(self, model_class, model=None, env=None, exploration=None,
             gamma=0.99, memory_size=100000, batch_size=1,
             target_update_frequency=1000, saving_dir=None, min_mem=10000):
    """
    Base class for an LSTM DQN agent.
    :param model_class: subclass of torch.nn.Module; class reference of the model
    :param model: initial model of the policy net; may be None if loading from a checkpoint
    :param env: environment
    :param exploration: exploration object; must have a value(step) function which returns epsilon
    :param gamma: discount factor
    :param memory_size: size of the replay memory
    :param batch_size: size of the mini-batch for a one-step update
    :param target_update_frequency: frequency (in steps) for updating the target net
    :param saving_dir: directory for saving checkpoints
    :param min_mem: minimum number of stored transitions before learning starts
    """
    DQNAgent.__init__(self, model_class, model, env, exploration, gamma,
                      memory_size, batch_size, target_update_frequency,
                      saving_dir)
    self.memory = EpisodicReplayMemory(memory_size)
    self.hidden = None
    self.min_mem = min_mem
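# Hedged sketch of an exploration object satisfying the protocol documented
# above: it only needs a value(step) method that returns epsilon. The class
# and parameter names here are illustrative, not from the original codebase.
class LinearExplorationSchedule:
    def __init__(self, initial_eps=1.0, final_eps=0.02, decay_steps=10000):
        self.initial_eps = initial_eps
        self.final_eps = final_eps
        self.decay_steps = decay_steps

    def value(self, step):
        # Linearly anneal epsilon from initial_eps to final_eps over decay_steps.
        fraction = min(step / self.decay_steps, 1.0)
        return self.initial_eps + fraction * (self.final_eps - self.initial_eps)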
def __init__(self, env: gym.Env, log_frequency=1000, exploration=None, **kwargs):
    self.log_frequency = log_frequency
    self.agent = DQNAgent(action_dim=env.action_space.n,
                          state_dim=env.observation_space.shape[0],
                          **kwargs)
    self.env = env

    # Avoid a mutable default argument
    if exploration is None:
        exploration = {'algorithm': 'epsilon_greedy',
                       'decay': 'linear',
                       'initial_epsilon': 1.0,
                       'final_epsilon': 0.01,
                       'decay_timesteps': 1000}
    self.exploration_config = exploration

    # Parse the exploration dict to build the update_explo_param function
    if self.exploration_config['algorithm'] == 'epsilon_greedy':
        if self.exploration_config['decay'] == 'linear':
            update_term = (self.exploration_config['initial_epsilon'] -
                           self.exploration_config['final_epsilon']) / \
                          self.exploration_config['decay_timesteps']
            self.update_explo_param = (
                lambda epsilon: epsilon - update_term
                if epsilon > self.exploration_config['final_epsilon']
                else epsilon)
        elif self.exploration_config['decay'] == 'exponential':
            # Note: an exponential config must also provide an 'epsilon_decay' key.
            self.update_explo_param = (
                lambda epsilon: epsilon * self.exploration_config['epsilon_decay'])
        else:
            raise NotImplementedError
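# Illustrative config for the exponential branch above (hedged: the values are
# made up). The branch reads an 'epsilon_decay' key which the linear default
# dict does not contain, so the caller must supply it:
exploration = {'algorithm': 'epsilon_greedy',
               'decay': 'exponential',
               'epsilon_decay': 0.995,
               'final_epsilon': 0.01}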
def play(env, game, model_path):
    agent = DQNAgent(model_path)
    done, score = False, game.start_score
    observation = env.reset()

    # Initial history: stack the first frame four times
    state = preprocess_frame(observation)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 84, 84, 4))

    while not done:
        env.render()
        time.sleep(0.05)

        # Play action
        action = agent.choose_action(history)
        game_action = get_ingame_action(action)
        observation, reward, done, info = env.step(game_action)

        # Update history with the newest frame
        next_state = preprocess_frame(observation)
        next_state = np.reshape([next_state], (1, 84, 84, 1))
        next_history = np.append(next_state, history[:, :, :, :3], axis=3)
        history = next_history

        reward = np.clip(reward, -1., 1.)
        score += reward

    print("score: ", score)
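# Minimal sketch of the preprocess_frame helper implied by the reshapes above:
# the history is built from (84, 84) frames, so preprocessing presumably
# grayscales and resizes each RGB observation. This assumes OpenCV is
# available; the original implementation may differ.
import cv2
import numpy as np

def preprocess_frame(observation):
    gray = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
    resized = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_AREA)
    return resized.astype(np.uint8)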
class TestDQNAgent(unittest.TestCase):
    def setUp(self):
        self.state_size = 3
        self.action_size = 5
        fc = nn.Sequential(nn.Linear(self.state_size, 5), nn.ReLU(),
                           nn.Linear(5, 7), nn.ReLU(),
                           nn.Linear(7, 9), nn.ReLU(),
                           nn.Linear(9, self.action_size))
        self.main_model = QNetwork(name="my_network", fc=fc)
        self.target_model = QNetwork(name="my_network", fc=fc)
        self.agent = DQNAgent(main_model=self.main_model,
                              target_network=self.target_model,
                              memory=WeightedReplayBuffer(buffer_size=12,
                                                          batch_size=3))
        self.eps_greediness = 0.01

    def test_allruns(self):
        """No explosions?"""
        # act
        state_value = [random()] * self.agent.state_size
        self.agent.act(state=state_value, eps=self.eps_greediness)
        agent_learned = False
        while not agent_learned:
            # Force a learning step.
            agent_learned = self.agent.step(
                state=[random()] * self.agent.state_size,
                action=np.random.randint(self.agent.action_size),
                reward=random(),
                next_state=[random()] * self.agent.state_size,
                done=random() > 0.75)
def __init__(self, model_path):
    super(ReversiDisplay, self).__init__()
    self.BORD_PX_SIZE = 480
    self.keylock = False
    self.model_path = model_path
    self.title("リバーシ")  # "Reversi"
    self.geometry("{}x{}+{}+{}".format(self.BORD_PX_SIZE + 30,
                                       self.BORD_PX_SIZE + 30 + 100,
                                       self.BORD_PX_SIZE, 100))
    self.color = ["", "white", "black"]
    # {tag: position}
    self.tag2pos = {}
    # Conversion from board coordinates to tags
    self.z2tag = {}

    # Create the main (environment) class
    env = Reversi()
    self.bord_size = env.Board_Size
    self.env = env

    # Create the opponent agent
    self.agent = DQNAgent(env.enable_actions, env.name, env.Board_Size)
    self.agent.load_model(self.model_path)  # was args.model_path; use the constructor argument

    # Coordinate labels
    self.numstr = self.make_numstr()
    self.alpstr = self.make_alpstr()

    # Set up some variables
    self.set_variables()
    # Set up game board
    self.set_board()
    # Set up some buttons
    self.set_button()
def _train(opponents, train_from_scratch=False, render=False):
    env = pommerman.make('PommeFFACompetition-v0', [])

    # Exploration strategy
    exp_schedule = LinearExploration(env, config.eps_begin, config.eps_end,
                                     config.eps_nsteps)

    # Learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    # Initialize agents.
    dqn_agent = DQNAgent(env, config, exp_schedule, lr_schedule, True,
                         train_from_scratch=train_from_scratch)
    dqn_agent_index = _init_agents(env, exp_schedule, lr_schedule, opponents,
                                   dqn_agent)

    t = 1
    while t < config.nsteps_train:
        state = env.reset()
        done = False
        while not done:
            t += 1
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            if reward[dqn_agent_index] == -1 and not done:
                # Stop the episode when the training agent dies.
                dqn_agent.episode_end(-1)
                done = True
    env.close()
def __init__(self):
    self.config = None
    self.memory_debug_path = ROOT + '/memory.txt'
    self.memory_path = ROOT + '/memory.pkl'
    self.memory = []
    self.game_state_strings = []
    self.agent = DQNAgent()
    self.initial_weights = self.agent.NN.model.layers[-1].get_weights()[0]
def trainOneEpisode(self, num_episodes, max_episode_steps=100, save_freq=100,
                    render=False):
    # Reset the LSTM hidden state at the start of each episode, then defer to
    # the base-class training loop.
    self.hidden = None
    DQNAgent.trainOneEpisode(self, num_episodes, max_episode_steps, save_freq,
                             render)
def __init__(self, state_size, action_size, input_shape, memory_size,
             replay_start_step, load_model):
    DQNAgent.__init__(self, state_size, action_size, replay_start_step,
                      memory_size)
    self.input_shape = input_shape
    self.initializer = he_normal()
    if load_model is not False:
        self.load_model(load_model)
    else:
        self.__build_model()
def __init__(self, level_filepath,
             episodes=30000,
             initial_epsilon=1.,
             min_epsilon=0.1,
             exploration_ratio=0.5,
             max_steps=2000,
             render_freq=500,
             enable_render=True,
             render_fps=20,
             save_dir='checkpoints',
             enable_save=True,
             save_freq=500,
             gamma=0.99,
             batch_size=64,
             min_replay_memory_size=1000,
             replay_memory_size=100000,
             target_update_freq=5,
             seed=42):
    self.set_random_seed(seed)
    self.episodes = episodes
    self.max_steps = max_steps
    self.epsilon = initial_epsilon
    self.min_epsilon = min_epsilon
    self.exploration_ratio = exploration_ratio
    self.render_freq = render_freq
    self.enable_render = enable_render
    self.render_fps = render_fps
    self.save_dir = save_dir
    self.enable_save = enable_save
    self.save_freq = save_freq

    if enable_save and not os.path.exists(save_dir):
        os.makedirs(save_dir)

    level_loader = LevelLoader(level_filepath)

    self.agent = DQNAgent(level_loader.get_field_size(),
                          gamma=gamma,
                          batch_size=batch_size,
                          min_replay_memory_size=min_replay_memory_size,
                          replay_memory_size=replay_memory_size,
                          target_update_freq=target_update_freq)
    self.env = Snake(level_loader)
    self.summary = Summary()
    self.current_episode = 0
    self.max_average_length = 0

    # Decay epsilon linearly over the exploration phase of training
    self.epsilon_decay = (initial_epsilon - min_epsilon) / (exploration_ratio * episodes)
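# Hedged sketch of how the schedule computed above is presumably applied once
# per episode elsewhere in this trainer (the helper name is illustrative):
def decay_epsilon(epsilon, min_epsilon, epsilon_decay):
    # Linear decay, clipped at the floor.
    return max(min_epsilon, epsilon - epsilon_decay)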
def main():
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='config/global_config.json')
    parser.add_argument('--num_step', type=int, default=2000)
    parser.add_argument('--ckpt', type=str)
    parser.add_argument('--algo', type=str, default='DQN',
                        choices=['DQN', 'DDQN', 'DuelDQN'],
                        help='choose an algorithm')
    # batch_size is read below, so it needs a CLI argument (default assumed here)
    parser.add_argument('--batch_size', type=int, default=32)
    args = parser.parse_args()

    # preparing config
    # # for environment
    config = json.load(open(args.config))
    config["num_step"] = args.num_step
    cityflow_config = json.load(open(config['cityflow_config_file']))
    roadnetFile = cityflow_config['dir'] + cityflow_config['roadnetFile']
    config["lane_phase_info"] = parse_roadnet(roadnetFile)

    # # for agent
    intersection_id = "intersection_1_1"
    config["intersection_id"] = intersection_id
    # state = [vehicle_count for each start lane] + [current_phase]; the +1 is the current phase
    config["state_size"] = len(config['lane_phase_info'][intersection_id]['start_lane']) + 1
    phase_list = config['lane_phase_info'][intersection_id]['phase']
    config["action_size"] = len(phase_list)
    config["batch_size"] = args.batch_size
    logging.info(phase_list)

    # build cityflow environment
    env = CityFlowEnv(config)

    # build agent
    agent = DQNAgent(config)

    # inference
    agent.load(args.ckpt)
    env.reset()
    state = env.get_state()
    for i in range(args.num_step):
        action = agent.choose_action(state)          # index of action
        action_phase = phase_list[action]            # actual action
        next_state, reward = env.step(action_phase)  # one step
        state = next_state

        # logging
        logging.info("step:{}/{}, action:{}, reward:{}".format(
            i, args.num_step, action, reward))
def __init__(self):
    # Hyperparameters / Constants
    self.noOfEpisodes = 400
    self.ReplayMemoryQueueSize = 100000
    self.minReplayMemoryQueueSize = 10000
    self.sampleBatchSize = 1000
    self.epsilon = 1
    self.epsilonDecay = 0.99
    self.minEpsilon = 0.001
    self.discount = 0.99
    self.doRender = False
    self.gameEnv = 'MountainCar-v0'

    # Environment details
    self.env = gym.make(self.gameEnv)
    self.actionDimension = self.env.action_space.n
    self.observationDimension = self.env.observation_space.shape

    # Create our own session to share across all the Keras/TensorFlow models we use
    self.sess = tf.compat.v1.Session()

    # Replay memory to store the model's experiences with the environment
    self.replay_memory = deque(maxlen=self.ReplayMemoryQueueSize)

    # Our model for solving the MountainCar problem
    self.agent = DQNAgent(self.sess, self.actionDimension,
                          self.observationDimension)
def build_graph():
    session = tf.Session()
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    writer = tf.summary.FileWriter("/home/drl/DRL/tensorflow-reinforce/tmp/")

    # Policy parameters for the exploration policy
    epsilon = 0.9
    target_update_rate = 0.1

    dqn_agent = DQNAgent(session,
                         optimizer,
                         q_network,
                         state_dim,
                         num_actions,
                         target_update_rate=target_update_rate,
                         summary_writer=writer)

    # Switch between greedy and exploratory policy
    exploration_policy = EpsilonGreedyPolicy(dqn_agent, num_actions, epsilon)

    # Always take greedy actions according to greedy policy
    greedy_policy = EpsilonGreedyPolicy(dqn_agent, num_actions, 1.0)

    # Sampler (collect trajectories using the present dqn agent)
    num_episodes = 10
    training_sampler = Sampler(exploration_policy, env, num_episodes=num_episodes)
    testing_sampler = Sampler(greedy_policy, env, num_episodes=5)

    # Initializing ReplayBuffer
    buffer_size = 100000
    replay_buffer = ReplayBuffer(buffer_size)

    return dqn_agent, training_sampler, testing_sampler, replay_buffer
def main(args):
    agent = DQNAgent(args)
    if args.test:
        agent.restore(args.checkpoint)
        rewards = agent.evaluate(args.eval_episodes, args.final_epsilon)
        print('Reward mean: %f std: %f' % (rewards.mean(), rewards.std()))
    else:
        agent.train(args)
def load_from_checkpoint(checkpoint_fname):
    checkpoint_dict = torch.load(checkpoint_fname)
    agent = DQNAgent(checkpoint_dict['config'])
    agent.q_net.load_state_dict(checkpoint_dict['q_net'])
    agent.memory.buffer = checkpoint_dict['buffer']
    loss_hist = checkpoint_dict['loss_hist']
    avg_rewards = checkpoint_dict['avg_rewards']
    return agent, loss_hist, avg_rewards
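# Hedged companion sketch (not from the original repo): a save_checkpoint that
# writes the keys load_from_checkpoint expects. The agent attributes used here
# (config, q_net, memory.buffer) are assumptions read off the loader above.
import torch

def save_checkpoint(agent, loss_hist, avg_rewards, checkpoint_fname):
    checkpoint_dict = {
        'config': agent.config,  # assumed attribute
        'q_net': agent.q_net.state_dict(),
        'buffer': agent.memory.buffer,
        'loss_hist': loss_hist,
        'avg_rewards': avg_rewards,
    }
    torch.save(checkpoint_dict, checkpoint_fname)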
def main(args):
    with open(args.param, "r") as f:
        config = json.load(f)

    env = gym.make('Freeway-v0')
    env.seed(args.seed)
    env = FrameStack(env, config)
    print('State shape: ', env.observation_space.shape)
    print('Action size: ', env.action_space.n)

    agent = DQNAgent(state_size=200, action_size=env.action_space.n,
                     config=config)
    # agent_r.load("models-28_11_2020_22:25:27/2000-")
    env = gym.wrappers.Monitor(env, "./vid",
                               video_callable=lambda episode_id: True,
                               force=True)
    # agent.qnetwork_local.load_state_dict(torch.load('checkpoint-score80.47156817885116_epi_125.pth'))
    agent.qnetwork_local.load_state_dict(
        torch.load('search_results/models/eval-{}/_q_net.pth'.format(args.agent)))
    agent.encoder.load_state_dict(
        torch.load('search_results/models/eval-{}/_encoder.pth'.format(args.agent)))

    n_episodes = 1
    max_t = 3000
    eps = 0  # act greedily during evaluation
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            score += reward
            time.sleep(0.01)
            state = next_state
            env.render()
            if done:
                break
        print("Episode {} Reward {} Steps {}".format(i_episode, score, t))
    env.close()
def training(**kwargs):
    # Set logging level
    if kwargs['debug']:
        LOGGER.setLevel(logging.DEBUG)
    else:
        LOGGER.setLevel(logging.INFO)

    agent = DQNAgent(environment=env,
                     action_space=[0, 1, 2, 3, 4, 5, 6, 7],
                     NN_arch=kwargs['NN_arch'],
                     maxIters=kwargs['max_iters'],
                     eta=0.00001,
                     epsilon=0.4,
                     discount=0.95,
                     weights_dir=kwargs['weights_dir'],
                     mem_size=10**5)

    while True:
        agent.learn(replay=kwargs['replay'],
                    frame_skipping=kwargs['frame_skipping'],
                    batch_size=kwargs['batch_size'])
        if agent.numIters > agent.maxIters:
            break

    agent.save(agent.save_path % kwargs['max_iters'])

    # Return the agent object
    return agent
def train_model(max_episodes=50000):
    """Trains a DQN agent to play the CartPole game."""
    agent = DQNAgent()
    buffer = ReplayBuffer()
    env = gym.make("CartPole-v0")

    # Warm up the replay buffer before training starts
    for _ in range(100):
        collect_gameplay_experiences(env, agent, buffer)

    # Train the agent for up to max_episodes episodes of the game
    for epis in range(max_episodes):
        collect_gameplay_experiences(env, agent, buffer)
        gameplay_experience_batch = buffer.sample_gameplay_batch()
        loss = agent.train(gameplay_experience_batch)
        avg_reward = evaluate_training_result(env, agent)
        if epis % 20 == 0:
            agent.update_target_network()
        print("Episode {}/{} and so far the performance is {} and loss is {}"
              .format(epis, max_episodes, avg_reward, loss[0]))
    # env.close()
    print("Training Complete")
    play(env, agent)
def run_eval(dir_name: str, episodes: int = 100, render: bool = False) -> List[int]:
    agent_conf = AgentConf()
    env = Tetris()
    agent = DQNAgent(env.get_state_size(),
                     n_neurons=agent_conf.n_neurons,
                     activations=agent_conf.activations,
                     epsilon_stop_episode=agent_conf.epsilon_stop_episode,
                     mem_size=agent_conf.mem_size,
                     discount=agent_conf.discount,
                     replay_start_size=agent_conf.replay_start_size)

    # timestamp_str = "20190730-165821"
    # log_dir = f'logs/tetris-nn={str(agent_conf.n_neurons)}-mem={agent_conf.mem_size}' \
    #           f'-bs={agent_conf.batch_size}-e={agent_conf.epochs}-{timestamp_str}'
    # tetris-20190731-221411-nn=[32, 32]-mem=25000-bs=512-e=1 was a good log
    log_dir = 'logs/' + dir_name

    # Load the trained model and disable exploration for evaluation
    agent.model = load_model(f'{log_dir}/model.hdf')
    agent.epsilon = 0

    scores = []
    for episode in range(episodes):
        env.reset()
        done = False

        while not done:
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())

            # Find the action that corresponds to the best state
            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break

            _, done = env.hard_drop([best_action[0], 0], best_action[1],
                                    render=render)

        scores.append(env.score)
        # Print results at the end of the episode
        print(f'episode {episode} => {env.score}')

    return scores
def train_speed_agent(coach):
    """Takes coach (an LSTM) to modify the target reward function for the agent."""
    score = 0
    coaching_score_keep = []
    coaching_episode_keep = []
    env = gym.make('CartPole-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    coaching = DQNAgent(len(env.reset()), env.action_space.n)
    done = False
    batch_size = 32
    index = 0

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = coaching.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            index = index + 1
            coaching.remember(state, action, reward, next_state, done, index)
            score = score + reward
            success = determine_sucess(done, score)
            coaching.lstm_data(state, action, reward, next_state, done, success)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, time, coaching.epsilon))
                coaching_score_keep.append(score)
                coaching_episode_keep.append(e)
                score = 0
                break
            if len(coaching.memory) > batch_size:
                coaching.replay(batch_size, coach)

    # Was `return agent, ...`, but `agent` is undefined in this function.
    return coaching, coaching_score_keep, coaching_episode_keep
def train_expert():
    """Creates an agent that is trained to an optimal policy and captures all
    values required to train the LSTM."""
    agent_score_keep = []
    agent_episode_keep = []
    score = 0
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(len(env.reset()), env.action_space.n)
    # agent.load("./save/cartpole-dqn.h5")
    done = False
    batch_size = 32
    index = 0

    for e in range(EPISODES):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for time in range(500):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            a_reward = reward if not done else -10
            next_state = np.reshape(next_state, [1, state_size])
            index = index + 1  # was `index + 0`, which never advanced the counter
            agent.remember(state, action, a_reward, next_state, done, index)
            score = score + reward
            success = determine_sucess(done, score)
            agent.lstm_data(state, action, reward, next_state, done, success)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    e, EPISODES, score, agent.epsilon))
                agent_score_keep.append(score)
                agent_episode_keep.append(e)
                score = 0
                break
            if len(agent.memory) > batch_size:
                agent.replay(batch_size, None)

    return agent, agent_score_keep, agent_episode_keep
def create(name, env, max_schedule_time=20, verbose=False):
    """Static method to create an agent by name."""
    if name not in AgentFactory.available_agents():
        raise Exception(f'Unsupported agent: {name}')

    if name == 'baseline':
        from random_agent import RandomAgent
        return RandomAgent(env.action_space)

    if name == 'qlearning':
        from qlearning_td_agent import QLearningTDAgent
        # jobs_data is expected to be defined at module level
        return QLearningTDAgent(jobs_data=jobs_data, epsilon=.4,
                                max_schedule_time=max_schedule_time,
                                verbose=verbose)

    if name == 'dqn':
        from dqn_agent import DQNAgent
        return DQNAgent(env.observation_space, env.action_space,
                        verbose=verbose)

    return None
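# Illustrative usage (hedged: assumes create() is a staticmethod on an
# AgentFactory class, as the docstring and the available_agents() call suggest):
agent = AgentFactory.create('dqn', env, verbose=True)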
def process_conversation_POST(state_tracker_id, message):
    state_tracker = None
    if state_tracker_id in StateTracker_Container.keys():
        state_tracker = StateTracker_Container[state_tracker_id][0]
        confirm_obj = StateTracker_Container[state_tracker_id][1]
    else:
        # print("---------------------------------in model")
        state_tracker = StateTracker(database, constants)
        confirm_obj = None
        StateTracker_Container[state_tracker_id] = (state_tracker, confirm_obj)

    user_action, new_confirm_obj = process_message_to_user_request(message,
                                                                   state_tracker)
    print("-----------------------------------user action")
    print(user_action)

    # If the user makes a new request, reset the state tracker and set confirm back to None
    if user_action['request_slots'] != {}:
        state_tracker.reset()
        confirm_obj = None

    # If there is a new confirm request, overwrite the old one
    if new_confirm_obj is not None:
        confirm_obj = new_confirm_obj

    if user_action['intent'] not in ["hello", "other", "done"]:
        dqn_agent = DQNAgent(state_tracker.get_state_size(), constants)
        agent_act = get_agent_response(state_tracker, dqn_agent, user_action)
        StateTracker_Container[state_tracker_id] = (state_tracker, confirm_obj)
        agent_message = response_craft(agent_act, state_tracker, confirm_obj)
    else:
        # To prevent a KeyError
        agent_act = {
            'intent': user_action['intent'],
            'request_slots': [],
            'inform_slots': []
        }
        agent_message = random.choice(
            response_to_user_free_style[user_action['intent']])

    # If the intent is "done", reset and set confirm back to None
    if user_action['intent'] == "done":
        state_tracker.reset()
        StateTracker_Container[state_tracker_id] = (state_tracker, None)

    return agent_message, agent_act
def main():
    """Main method; runs the whole experiment."""
    env = UnityEnvironment(file_name=ENV_PATH)

    # Get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print_env_state(env=env, brain_name=brain_name, brain=brain)

    agent = DQNAgent(state_size=PARAM.STATE_SIZE, action_size=PARAM.ACTION_SIZE, seed=0)
    # agent = DDQNAgent(state_size=PARAM.STATE_SIZE, action_size=PARAM.ACTION_SIZE, seed=0)
    # agent = DDQNAgentPrioExpReplay(state_size=PARAM.STATE_SIZE, action_size=PARAM.ACTION_SIZE, seed=0)

    if not TRAIN_MODE:
        load_model_into_agent(agent)

    scores = run_agent(agent=agent, env=env, brain_name=brain_name)
    save_score_plot(scores=scores)
def experiment_wrapper(feed_units, i, num_episodes, randomize, env_type):
    from supervised_agent import SupervisedAgent
    from supervised_agent_one_step import SupervisedAgentOneStep
    from dqn_agent import DQNAgent
    from deep_exp_hyper_agent import DeepExpHyperAgent
    from deep_exp_agent import DeepExpAgent
    from deep_exp_ids_agent import DeepExpIDSAgent
    import numpy as np

    deep_exp_agents = []
    num_positive = 0
    for feed in feed_units:
        if feed.interest > 0:
            num_positive += 1

    for prior in range(0, 1):
        deep_exp_agents.append(
            DeepExpAgent(
                [k for k in range(len(feed_units))],
                'deep_exploration_{}_{}_{}'.format(num_positive,
                                                   len(feed_units), prior),
                prior_variance=10**prior,
            ))

    agents = ([
        SupervisedAgent([k for k in range(len(feed_units))],
                        'supervised_{}_{}'.format(num_positive,
                                                  len(feed_units))),
        DQNAgent([k for k in range(len(feed_units))],
                 'dqn_{}_{}'.format(num_positive, len(feed_units))),
    ] + deep_exp_agents)

    cumulative_reward = run_experiment(agents, feed_units, i, num_episodes,
                                       randomize, env_type)
    np.save(
        'ids_experiment_{}_{}_{}_{}_{}'.format(num_positive, len(feed_units),
                                               int(randomize), i, env_type),
        cumulative_reward)
def _test(opponents, match_num=20, render=True):
    env = pommerman.make('PommeFFACompetition-v0', [])

    # Exploration strategy (no exploration during testing)
    exp_schedule = LinearExploration(env, 0, 0, 1)

    # Learning rate schedule
    lr_schedule = LinearSchedule(config.lr_begin, config.lr_end,
                                 config.lr_nsteps)

    # Initialize agents.
    dqn_agent = DQNAgent(env, config, exp_schedule, lr_schedule, False)
    dqn_agent_index = _init_agents(env, exp_schedule, lr_schedule, opponents,
                                   dqn_agent)

    count = 0
    win = 0
    for _ in range(match_num):
        state = env.reset()
        done = False
        while not done:
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            if reward[0] == 1:
                win += 1
                print('win at episode %d' % count)
            if reward[dqn_agent_index] == -1 and not done:
                # Stop the episode when the testing agent dies.
                done = True
        count += 1

    print(win / count)
    env.close()
if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    actions = np.arange(nb_actions)
    policy = EpsGreedyQPolicy(eps=1., eps_decay_rate=.999, min_eps=.01)
    memory = Memory(limit=50000, maxlen=1)
    obs = env.reset()
    agent = DQNAgent(actions=actions,
                     memory=memory,
                     update_interval=200,
                     train_interval=1,
                     batch_size=32,
                     observation=obs,
                     input_shape=[len(obs)],
                     policy=policy,
                     obs_processer=obs_processer)
    agent.compile()

    result = []
    nb_episodes = 1000
    for episode in range(nb_episodes):
        agent.reset()
        observation = env.reset()
        observation = deepcopy(observation)
        agent.observe(observation)
        done = False
# Two-layer Q-network; the enclosing function signature is implied by the
# trailing `return q` in the original snippet.
def q_network(states):
    W1 = tf.get_variable("W1", [state_dim, 20],
                         initializer=tf.truncated_normal_initializer())
    b1 = tf.get_variable("b1", [20], initializer=tf.constant_initializer(0))
    h1 = tf.nn.relu(tf.matmul(states, W1) + b1)
    W2 = tf.get_variable("W2", [20, num_actions],
                         initializer=tf.truncated_normal_initializer())
    b2 = tf.get_variable("b2", [num_actions],
                         initializer=tf.constant_initializer(0))
    q = tf.matmul(h1, W2) + b2
    return q

dqn_agent = DQNAgent(q_session,
                     q_optimizer,
                     q_network,
                     state_dim,
                     num_actions,
                     target_update_rate=0.01,
                     summary_writer=q_writer,
                     summary_every=q_summary_every)

# Initializing ReplayBuffer
buffer_size = 100000
sample_size = 2**13
replay_buffer = ReplayBuffer(buffer_size)


# Training
def computing_probabilities(batch):
    probabilites = pg_reinforce.compute_action_probabilities(batch["next_states"])
    return probabilites
    # (tail of the animate(i) callback used by FuncAnimation below)
    img.set_array(state_t_1)
    plt.axis("off")
    return img,


if __name__ == "__main__":
    # args
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model_path")
    parser.add_argument("-s", "--save", dest="save", action="store_true")
    parser.set_defaults(save=False)
    args = parser.parse_args()

    # environment, agent
    env = CatchBall()
    agent = DQNAgent(env.enable_actions, env.name)
    agent.load_model(args.model_path)

    # variables
    win, lose = 0, 0
    state_t_1, reward_t, terminal = env.observe()

    # animate
    fig = plt.figure(figsize=(env.screen_n_rows / 2, env.screen_n_cols / 2))
    fig.canvas.set_window_title("{}-{}".format(env.name, agent.name))
    img = plt.imshow(state_t_1, interpolation="none", cmap="gray")
    ani = animation.FuncAnimation(fig, animate, init_func=init,
                                  interval=(1000 / env.frame_rate), blit=True)

    if args.save:
        # save animation (requires ImageMagick)
        ani_path = os.path.join(
import numpy as np

from catch_ball import CatchBall
from dqn_agent import DQNAgent


if __name__ == "__main__":
    # parameters
    n_epochs = 1000

    # environment, agent
    env = CatchBall()
    agent = DQNAgent(env.enable_actions, env.name)

    # variables
    win = 0

    for e in range(n_epochs):
        # reset
        frame = 0
        loss = 0.0
        Q_max = 0.0
        env.reset()
        state_t_1, reward_t, terminal = env.observe()

        while not terminal:
            state_t = state_t_1

            # execute action in environment
            action_t = agent.select_action(state_t, agent.exploration)
            env.execute_action(action_t)