def make_bot(un, pw, expected_opponent, team, challenge, trainer,
             epsilon=None, model_path=None, target_model_path=None):
    if trainer:
        if model_path:
            agent = DQNAgent(INPUT_SHAPE, training=False)
        else:
            agent = RandomAgent()
    else:
        agent = DQNAgent(INPUT_SHAPE,
                         epsilon=epsilon,
                         random_moves=True,
                         training=False,
                         copy_target_model=False)
        agent.load_model(model_path)
        if target_model_path is not None:
            agent.target_model = load_model(target_model_path)
        else:
            agent.target_model.set_weights(agent.model.get_weights())

    bot = BotClient(name=un,
                    password=pw,
                    expected_opponent=expected_opponent,
                    team=team,
                    challenge=challenge,
                    runType=RunType.Iterations,
                    runTypeData=1,
                    agent=agent,
                    trainer=trainer,
                    save_model=False,
                    should_write_replay=(not trainer))
    bot.start()
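# Not from the source: make_bot() above either loads a separate target network
# or hard-copies the online weights into it. A minimal standalone sketch of
# that Keras pattern (the toy architecture and shapes are hypothetical):
import numpy as np
from tensorflow.keras import layers, models

online = models.Sequential([layers.Dense(32, activation='relu', input_shape=(8,)),
                            layers.Dense(4)])
target = models.clone_model(online)       # same architecture, fresh weights
target.set_weights(online.get_weights())  # hard update, as in make_bot()
assert all(np.array_equal(a, b)
           for a, b in zip(online.get_weights(), target.get_weights()))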
def main():
    # parser = argparse.ArgumentParser(description='Run DQN on Atari SpaceInvaders')
    # parser.add_argument('--env', default='SpaceInvaders-v0', help='Atari env name')
    # parser.add_argument(
    #     '-o', '--output', default='SpaceInvaders-v0', help='Directory to save data to')
    # parser.add_argument('--seed', default=0, type=int, help='Random seed')
    #
    # parser.add_argument('--input_shape', default=(84, 84, 4), type=tuple, help='Size of each frame')
    #
    # args = parser.parse_args()
    #
    # args.output = get_output_folder(args.output, args.env)

    # vehicle_network
    veh_network = create_lstm_model(nb_time_steps, nb_input_vector, num_actions=g1)
    # Attacker network
    att_network = create_lstm_model(nb_time_steps, nb_input_vector,
                                    num_actions=gym.make(args.env).action_space.n)

    veh_agent = DQNAgent(q_network=veh_network,
                         preprocessor=core.Preprocessor(),
                         memory=core.ReplayMemory(),
                         policy=1,
                         gamma=0.1,
                         target_update_freq=100,
                         num_burn_in=100,
                         train_freq=20,
                         batch_size=32)
    att_agent = DQNAgent(q_network=att_network,
                         preprocessor=core.Preprocessor(),
                         memory=core.ReplayMemory(),
                         policy=1,
                         gamma=0.1,
                         target_update_freq=100,
                         num_burn_in=100,
                         train_freq=20,
                         batch_size=32)
    veh_agent.compile('Adam', 'mse')
    att_agent.compile('Adam', 'mse')

    env = VehicleFollowingENV()  # was missing the call parentheses
    for i_episode in range(20):
        veh_agent.fit(env, 10**6)  # was `agent.fit`; no `agent` is defined here
    # env.close()

    model_json = veh_network.to_json()  # was `q_network`, which is undefined here
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
def play_it():
    # ENV_NAME = 'CartPole-v0'
    # ENV_NAME = 'MountainCar-v0'
    ENV_NAME = 'Single_virtual-v0'

    # Get the environment and extract the number of actions.
    env = make(ENV_NAME)
    env1 = make(ENV_NAME)
    np.random.seed(123)
    env.seed(123)
    nb_actions = env.action_space.n
    model = build_model(nb_actions, env.observation_space)
    # model = build_model1(nb_actions, env.observation_space)

    # Finally, we configure and compile our agent. You can use every built-in
    # Keras optimizer and even the metrics!
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=10, target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])

    # Okay, now it's time to learn something! We visualize the training here for
    # show, but this slows down training quite a lot. You can always safely abort
    # the training prematurely using Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    dqn.save_weights(os.path.join(
        'models_weights_logs',
        'dqn_{}_weights.h5f'.format(ENV_NAME + datetime.now().strftime("%Y%m%d-%H%M%S"))),
        overwrite=True)
    # dqn.load_weights(os.path.join('models_weights_logs', 'dqn_{}_weights.h5f'.format(ENV_NAME)))

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env1, nb_episodes=5, visualize=True)
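# Not part of the source: a minimal evaluation-only sketch built on the same
# keras-rl pieces as play_it() above. The weights path is hypothetical; the
# agent must be rebuilt with the same architecture and compiled before
# load_weights().
def evaluate_it(weights_path='models_weights_logs/dqn_Single_virtual-v0_weights.h5f'):
    env = make('Single_virtual-v0')
    nb_actions = env.action_space.n
    model = build_model(nb_actions, env.observation_space)
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=10, target_model_update=1e-2,
                   policy=BoltzmannQPolicy())
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])  # compile before loading weights
    dqn.load_weights(weights_path)
    dqn.test(env, nb_episodes=5, visualize=False)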
def predict_dqn(self):
    # get size of state and action from environment
    state_size = 4
    action_size = 2
    agent = DQNAgent(state_size, action_size, load_model=True)

    done = False
    score = 0
    self.reset()
    state, _, _, _ = self.step(-1)
    state = np.reshape(state, [1, state_size])

    while not done:
        # get action for the current state and go one step in environment
        action = agent.get_action(state)
        next_state, reward, done, info = self.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        score += reward
        state = next_state

        if done or score >= 500:
            print("score:", score)
            break
def __init__(self, host, port):
    # dqn parameters
    self.state_size = 3
    self.action_size = 7
    self.done = False
    self.batch_size = 32
    self.agent = DQNAgent(self.state_size, self.action_size)
    self.state_now = np.reshape([0.10606659, -0.52737298, 0.47917915],
                                [1, self.state_size])
    self.state_last = np.reshape([0.10606659, -0.52737298, 0.47917915],
                                 [1, self.state_size])
    self.action_for_next = 0
    self.action_for_now = 0
    self.reward = 0

    # motor/steering command strings
    self.forward = "T394"
    self.left = "S450"
    self.right = "S270"
    self.backward = "T330"
    self.stop = "T370"
    self.middle = "S360"

    # socket setup
    self.server_socket = socket.socket()
    self.server_socket.bind((host, port))
    self.server_socket.listen(0)
    self.connection, self.client_address = self.server_socket.accept()
    self.connection = self.connection.makefile("rb")
    self.host_name = socket.gethostname()
    self.host_ip = socket.gethostbyname(self.host_name)
    self.temp_result = None
    self.finnal_result = None
    self.RANGE = 350
    self.WIDTH = 720
    self.time_now = 0
    self.count = 0
    self.streaming()
def __init__(self, config):
    # Create session to store trained parameters
    self.session = tf.Session()
    self.action_count = config["action_count"]
    # Create agent for training
    self.agent = DQNAgent(self.action_count)
    # Create memory to store observations
    self.memory = ExperienceMemory(config["replay_memory_size"])
    # Tools for saving and loading networks
    self.saver = tf.train.Saver()
    # Last action that agent performed
    self.last_action_index = None
    # Deque to keep track of average reward and play time
    self.game_history = GameHistory(config["match_memory_size"])
    # Deque to store losses
    self.episode_history = EpisodeHistory(config["replay_memory_size"])

    self.INITIAL_EPSILON = config["initial_epsilon"]
    self.FINAL_EPSILON = config["final_epsilon"]
    self.OBSERVE = config["observe_step_count"]
    self.EXPLORE = config["explore_step_count"]
    self.FRAME_PER_ACTION = config["frame_per_action"]
    self.GAMMA = config["gamma"]
    self.LOG_PERIOD = config["log_period"]
    self.BATCH_SIZE = config["batch_size"]
def test_dqn():
    args = DQNArgs()
    env = gym.make(args.env_name)
    agent = DQNAgent(env, QNet, SimpleNormalizer, args)
    agent.load(args.save_dir)
    for _ in range(10):
        agent.test_one_episode(True)
def run(self):
    ### create TORCS environment
    env = TorcsEnv(vision=False, throttle=True)

    ### start run according to supplied arguments
    if self.algorithm == "dqn" and self.modus == "train":
        agent = DQNAgent(env, self.track, self.numOfEpisodes)
        agent.trainAgent()
    elif self.algorithm == "dqn" and self.modus == "test":
        agent = DQNAgent(env, self.track, self.numOfEpisodes)
        agent.testAgent()
    elif self.algorithm == "ddpg" and self.modus == "train":
        agent = DDPGAgent(env, self.track, self.numOfEpisodes)
        agent.trainAgent()
    elif self.algorithm == "ddpg" and self.modus == "test":
        agent = DDPGAgent(env, self.track, self.numOfEpisodes)
        agent.testAgent()
def get_agent(env, **kwargs):
    replay_capacity = 1e6
    n_episodes = 10e7
    return DQNAgent(env=env or gym.make('CartPole-v0'),
                    n_episodes=n_episodes,
                    replay_capacity=replay_capacity,
                    **kwargs)
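# Not from the source: a hedged usage sketch for the get_agent() factory above.
# Because of `env or gym.make('CartPole-v0')`, passing None falls back to
# CartPole; any extra keyword arguments are forwarded to DQNAgent unchanged.
import gym

agent = get_agent(None)                          # defaults to CartPole-v0
custom = get_agent(gym.make('MountainCar-v0'))   # or supply your own env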
def train_dqn():
    args = DQNArgs()
    env = gym.make(args.env_name)
    agent = DQNAgent(env, QNet, SimpleNormalizer, args)
    pre_best = -1e9
    for ep in range(args.max_ep):
        agent.train_one_episode()
        if ep % args.test_interval == 0:
            r = agent.test_model()
            if r > pre_best:
                pre_best = r
                agent.save(args.save_dir)
def main():
    train_data, parameter[1]["episode_length"] = data_prepare(parameter)
    parameter[2]['action_size'], parameter[2]['state_size'], state, env = \
        create_states(parameter, train_data)

    # create model
    agent = DQNAgent(parameter)

    # train model and save
    train(agent, parameter, state, env)
    caculation(agent, env)

    # test model
    parameter[0]["mode"] = 'test'
    test_data = data_prepare(parameter)[0]
    test_env = create_states(parameter, test_data)[3]
    caculation(agent, test_env)
async def on_challenge_update(self, challenge_data):
    incoming = challenge_data.get('challengesFrom', {})
    if self.expected_opponent.lower() in incoming:
        if self.trainer:
            model_paths = [
                os.path.join(self.logs_dir, content)
                for content in os.listdir(self.logs_dir)
                if content.endswith('.model') and content.startswith('Epoch')
            ]
            if len(model_paths) > 0:
                # Pick the checkpoint with the highest epoch number,
                # e.g. 'Epoch12.model' -> 12
                sorted_model_paths = sorted(
                    model_paths,
                    key=lambda x: int(
                        os.path.basename(x).lstrip('Epoch').rstrip('.model')))
                model_to_load = sorted_model_paths[-1]
                self.log(f'Loading model {model_to_load}')
                self.agent = DQNAgent(INPUT_SHAPE, training=False)
                self.agent.load_model(model_to_load)
        await self.accept_challenge(self.expected_opponent, self.team_text)
def __init__(self, player_name=None, letter=None):
    if player_name is None:
        self.player_name = common_utils.get_random_name()
    else:
        self.player_name = player_name

    if letter is not None:
        self.letter = letter
    else:
        pass  # TODO: Handle this

    if letter == 'X':
        self.enemy_letter = 'O'
    else:
        self.enemy_letter = 'X'

    logger.debug("Initializing player {} with letter {} ...".format(
        self.player_name, self.letter))
    self.agent = DQNAgent()
def load_model(MODEL_TYPE):
    curr_model = None
    if MODEL_TYPE == "SVM":
        print("LOADING SVM...")
        curr_model = load("svm.joblib")
    elif MODEL_TYPE == "LR":
        print("LOADING LR...")
        lr = LogReg(74)  # (env.matches.shape[1])
        lr.load_weights("weights/weights-improvement-100-0.31.hdf5")
        curr_model = lr
    elif MODEL_TYPE == "DT":
        print("LOADING DT...")
        curr_model = load("dt.joblib")
    elif MODEL_TYPE == "GB":
        print("LOADING GB...")
        curr_model = load("gb.joblib")
    elif MODEL_TYPE == "RF":
        print("LOADING RF...")
        curr_model = load("rfc.joblib")
    elif MODEL_TYPE == "NB":
        print("LOADING NB...")
        curr_model = load("nb.joblib")
    elif MODEL_TYPE == "AB":
        print("LOADING AB...")
        curr_model = load("ab.joblib")
    elif MODEL_TYPE == "DQN":
        print("LOADING DQN...")
        BetNet = DQNAgent(75)
        BetNet.load("weights/betnet-weights-dqn.h5")
        curr_model = BetNet
    else:
        print("LOADING NN...")
        BetNet = Network(74)  # (env.matches.shape[1])
        # Most recent weights
        BetNet.load_weights(
            'weights/Adadelta/test9_400_Best/weights-improvement-400-0.48.hdf5')
        # PCA("weights/Adadelta/test13_100iter_reglast2/weights-improvement-100-0.52.hdf5")
        curr_model = BetNet
    return curr_model
def simulateGustsControl(self):
    '''
    Simulate the response of the controller to gusts.

    :return: A plot of the simulation.
    '''
    self.sim_time = 100
    agent = DQNAgent(self.mdp.size, self.action_size)
    agent.load(self.src)
    WH = self.wh.generateWind()
    hdg0 = 0 * TORAD * np.ones(self.wh.samples)
    state = self.mdp.initializeMDP(hdg0, WH)
    i = np.ones(0)
    v = np.ones(0)
    wind_heading = np.ones(0)
    for time in range(self.sim_time):
        WH = self.wh.generateWind()
        if time == 20:
            WH = self.wh.generateGust(10 * TORAD)
        action = agent.actDeterministically(state)
        next_state, reward = self.mdp.transition(action, WH)
        state = next_state
        i = np.concatenate([i, self.mdp.extractSimulationData()[0, :]])
        v = np.concatenate([v, self.mdp.extractSimulationData()[1, :]])
        wind_heading = np.concatenate([wind_heading, WH[0:10]])

    time_vec = np.linspace(0, self.sim_time, int(self.sim_time / self.mdp.dt))
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].plot(time_vec, i / TORAD)
    axarr[1].plot(time_vec, v)
    axarr[0].set_ylabel("angle of attack")
    axarr[1].set_ylabel("v")
    plt.show()
def simulateDQNControl(self, hdg0):
    '''
    Plots the control law of the network over a simulation.

    :param hdg0: Initial heading of the boat for the simulation.
    :return: A plot of the angle of attack and velocity during the control.
    '''
    agent = DQNAgent(self.mdp.size, self.action_size)
    agent.load(self.src)
    WH = self.wh.generateWind()
    hdg0 = hdg0 * TORAD * np.ones(self.wh.samples)
    state = self.mdp.initializeMDP(hdg0, WH)
    i = np.ones(0)
    v = np.ones(0)
    wind_heading = np.ones(0)
    for time in range(self.sim_time):
        WH = self.wh.generateWind()
        action = agent.actDeterministically(state)
        next_state, reward = self.mdp.transition(action, WH)
        state = next_state
        i = np.concatenate([i, self.mdp.extractSimulationData()[0, :]])
        v = np.concatenate([v, self.mdp.extractSimulationData()[1, :]])
        wind_heading = np.concatenate([wind_heading, WH[0:10]])

    time_vec = np.linspace(0, self.sim_time, int(self.sim_time / self.mdp.dt))
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].plot(time_vec, i / TORAD)
    axarr[1].plot(time_vec, v)
    axarr[0].set_ylabel("i [°]")
    axarr[1].set_ylabel("v [m/s]")
    axarr[0].set_xlabel("t [s]")
    axarr[1].set_xlabel("t [s]")
    plt.show()
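# Not from the source: simulateGustsControl() and simulateDQNControl() above
# share the same rollout loop, so a shared helper is one plausible refactoring.
# The method name `_rollout` and the `gust_at` parameter are hypothetical.
def _rollout(self, agent, hdg0, gust_at=None):
    WH = self.wh.generateWind()
    state = self.mdp.initializeMDP(hdg0, WH)
    i, v = np.ones(0), np.ones(0)
    for time in range(self.sim_time):
        WH = self.wh.generateWind()
        if gust_at is not None and time == gust_at:
            WH = self.wh.generateGust(10 * TORAD)
        action = agent.actDeterministically(state)
        state, reward = self.mdp.transition(action, WH)
        i = np.concatenate([i, self.mdp.extractSimulationData()[0, :]])
        v = np.concatenate([v, self.mdp.extractSimulationData()[1, :]])
    return i, v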
def build(args):
    # Params
    training = is_training(args)

    # Hack for switching number of DQN input features (see help)
    n_feats = {'all': 11, 'distance': 1}
    n_actions = 4  # we are ignoring action 0 (for now)

    # Maximum number of steps per episode
    max_steps = 8 * (args.dims[0] + args.dims[1]) - 1

    # Total feature dimension
    total_feats = n_feats[args.feats] * sum(
        [4**i for i in range(args.n_nodes + 1)])

    # Flatland Environment
    environment = FlatlandEnv(x_dim=args.dims[0],
                              y_dim=args.dims[1],
                              n_cars=args.n_agents,
                              n_acts=n_actions,
                              min_obs=-1.0,
                              max_obs=1.0,
                              n_nodes=args.n_nodes,
                              feats=args.feats)

    # Simple DQN agent
    agent = DQNAgent(alpha=0.0005,
                     gamma=0.99,
                     epsilon=1.0,
                     input_shape=total_feats,
                     sample_size=512,
                     batch_size=32,
                     n_actions=n_actions,
                     training=training)
    if not training:
        agent.load_model()

    return environment, agent, max_steps
def main():
    # vehicle_network
    veh_network = create_lstm_model(nb_time_steps, nb_input_vector, num_actions=4)
    # Attacker network
    # att_network = create_lstm_model(nb_time_steps, nb_input_vector, num_actions=4)

    veh_agent = DQNAgent(q_network=veh_network,
                         q_network2=veh_network,
                         preprocessor=core.Preprocessor(),
                         RLmemory=core.ReplayMemory(),
                         SLmemory=core.ReplayMemory(),
                         policy=1,
                         gamma=0.1,
                         target_update_freq=100,
                         num_burn_in=100,
                         train_freq=20,
                         batch_size=32)
    # att_agent = DQNAgent(q_network=att_network,
    #                      q_network2=att_network,
    #                      preprocessor=core.Preprocessor(),
    #                      memory=core.ReplayMemory(),
    #                      policy=1,
    #                      gamma=0.1,
    #                      target_update_freq=100,
    #                      num_burn_in=100,
    #                      train_freq=20,
    #                      batch_size=32)

    veh_agent.compile('Adam', 'mse')
    # att_agent.compile('Adam', 'mse')

    env = VehicleFollowingENV()
    for i_episode in range(20):
        veh_agent.fit(env=env, num_iterations=10 ** 6)
        # att_agent.fit(env, 10 ** 6)
    # env.close()

    model_json = veh_network.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
def main(argv):
    args = parser.parse_args(argv[1:])
    if args.usage == 'help':
        return parser.print_help()

    if is_environments_gen(args):
        _write_env_file(args)
    elif is_environments_list(args):
        all_registry = registry.all()
        registry_envs_name = [
            trim_env_spec_name(env.__repr__()) for env in all_registry
        ]
        for environment in registry_envs_name:
            print(environment)
    elif is_environments_act(args):
        env = gym.make(args.environment_name)

        if is_action_type('dqn', args):
            if args.pre_defined_state_size == 'nesgym':
                pre_state_size = 172032
            elif args.pre_defined_state_size == 'gym':
                pre_state_size = env.observation_space.shape[0]
            elif args.pre_defined_state_size == 'gym-atari':
                pre_state_size = 100800
            elif args.pre_defined_state_size == 'gym-atari-extend':
                pre_state_size = 120000
            elif args.pre_defined_state_size == 'gym-atari-small':
                pre_state_size = 100800
            elif args.pre_defined_state_size == 'gym-gomoku':
                pre_state_size = 361
            # state_size = (1,) + env.observation_space.shape
            state_size = pre_state_size
            action_size = env.action_space.n
            agent = DQNAgent(state_size, action_size)
            # try:
            #     agent.load('./weights/dqn_{}_{}_{}.h5'.format(
            #         args.environment_name.lower(), args.timesteps, args.i_episodes))
            # except Exception:
            #     pass
            done = False
            batch_size = 64

        i_episodes = args.i_episodes
        timesteps = args.timesteps
        factor = args.seed_factor
        for i_episode in range(i_episodes):
            state = env.reset()
            if is_action_type('dqn', args):
                state = np.reshape(state, [1, pre_state_size])
            for t in range(timesteps):
                try:
                    if args.render == 'present':
                        env.render()
                    if args.render == 'presented':
                        env.render(args.render)

                    if args.action_type == 'alternate':
                        action_choice = i_episodes * 2
                        action = random_action_space_sample_choice(
                            action_choice, env, factor)
                    elif args.action_type == 'specific':
                        action = env.action_space.sample()
                    elif args.action_type == 'conditional':
                        action_choice = i_episodes
                        action = random_action_space_sample_choice(
                            action_choice, env, factor)
                    elif args.action_type == 'numerical':
                        action = env.action_space.n
                    elif is_action_type('dqn', args) and len(state) == 5:
                        action = agent.act(state)
                    elif is_action_type('dqn', args) and len(state) != 5:
                        action = env.action_space.sample()
                    collect_stat(action, ['input', 'actions'], stats)

                    observation, reward, done, info = env.step(action)
                    if is_action_type('dqn', args):
                        reward = reward if not done else -10
                        observation = np.reshape(observation, [1, pre_state_size])
                        agent.remember(state, action, reward, observation, done)
                    state = observation
                    # collect_stat(observation, ['observation'], stats)
                    collect_stat(reward, ['rewards'], stats)
                    # collect_stat(done, ['output', 'done'], stats)
                    # collect_stat(info, ['output', 'info'], stats)

                    if done:
                        max_episodes_range = (i_episodes - 1)
                        episode_timesteps_iteration_limit = max_episodes_range - 1
                        is_latest_episode = is_filled_latest_episode_with_iteration(
                            i_episode, episode_timesteps_iteration_limit)
                        increased_timestep = increase_timestep(t)
                        print('i_episode {}'.format(i_episode))
                        print('Episode finished after {} timesteps'.format(
                            increased_timestep))
                        if is_action_type('dqn', args):
                            print('Episode: {}/{}, score: {}, e: {:.2}'.format(
                                i_episode, i_episodes, t, agent.epsilon))
                        collect_stat(t, ['output', 'timestep', 'iteration'], stats)
                        collect_stat(increased_timestep,
                                     ['output', 'timestep', 'increased'], stats)

                        is_latest_episode_to_save_state = (
                            lambda args_cached: is_latest_episode and
                            args_cached.output_stats_filename)
                        if is_latest_episode_to_save_state(args):
                            filename = args.output_stats_filename
                            pre_df = {
                                # 'observations': stats['observations'],
                                'rewards': stats['rewards'],
                                # 'done-output': stats['output']['done'],
                                # 'info-output': stats['output']['info'],
                                # 'iteration-timestep': stats['output']['timestep']['iteration'],
                                # 'increased-timestep': stats['output']['timestep']['increased'],
                                'actions-input': stats['input']['actions']
                            }
                            df = pd.DataFrame(pre_df)
                            stamp = lambda: '%s' % (int(datetime.now().timestamp()))
                            with open('data/{}-{}.csv'.format(stamp(), filename),
                                      'w') as f:
                                f.write(df.to_csv())
                                f.close()
                            print('Statistics file saved ({}.csv)!'.format(filename))
                            del df
                            del filename
                        print(check_output_env_label())
                        del is_latest_episode_to_save_state
                        del increased_timestep
                        del is_latest_episode
                        del episode_timesteps_iteration_limit
                        del max_episodes_range
                        break
                except Exception as e:
                    print('Rendering execution ({})'.format(e))
                finally:
                    print('Execution of timestep done')
            if is_action_type('dqn', args) and (len(agent.memory) > batch_size):
                agent.replay(batch_size)
            # agent.save('./weights/dqn_{}_{}_{}.h5'.format(
            #     args.environment_name.lower(), args.timesteps, args.i_episodes))
        # env.close()
    else:
        parser.print_help()
def main(): print "Creating DQN agent..." # env = gym.make("codegen-v0") set_debugger_org_frc() iters = 6300 n_goal = 0 n_goal_all = 0 time_stamp = 0 max_steps = 5 agent = DQNAgent(max_steps) agent.dqn.initial_exploration = 6000 * max_steps for iter in range(iters): print "\n********Iteration # ", iter, "***********\n" # 1 iteration env = gym.make("codegen-v0") num = random.randrange(1, 100) print "Goal Number : ", num + 1 env.my_input = num #env.goal = "['" + env.my_input + "']" env.goal = str(num + 1) code = env._reset() step_in_episode = 0 total_score = 0.0 reward = 0.0 mystate = [] my_state_new = [] # debug : the sys # sss = [] # for arg in sys.argv[1:]: # sss.append(arg) # print "sss = " , sss # while True: while step_in_episode < max_steps: # state = env.code_index_list + [-1]*(max_steps-len(env.code_index_list state = env.code_index_list[:] state += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() # state = state.tolist() # state = 1; # print "env = ",env.code_index_list # print "state = ",state # raw_input() if step_in_episode == 0: action_idx = agent.start(code, state) else: action_idx = agent.act(code, state, reward) code, reward, terminal, info = env._step(action_idx, agent.dqn.actions) state_prime = env.code_index_list[:] state_prime += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() # debug : the sys # sss = [] # for arg in sys.argv[1:]: # sss.append(arg) # print "sss = " , sss print "state : " print state print "state' : " print state_prime if step_in_episode == max_steps - 1: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 1) else: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 0) agent.dqn.experience_replay(agent.dqn.time_stamp) agent.dqn.target_model_update(agent.dqn.time_stamp, soft_update=False) total_score += reward if terminal: agent.dqn.goal_idx.append(agent.dqn.time_stamp) agent.end(reward) agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, 1) n_goal_all += 1 step_in_episode += 1 agent.dqn.time_stamp += 1 if iters - iter <= 100: n_goal += 1 break step_in_episode += 1 agent.dqn.time_stamp += 1 if iter == 1 + (agent.dqn.initial_exploration / max_steps): print "n_goal_all = ", n_goal_all print agent.dqn.goal_idx raw_input() print "n_goal : ", n_goal print "epsilon : ", agent.epsilon
curr_model = load("dt.joblib") elif MODEL_TYPE == "GB": print("LOADING GB...") curr_model = load("gb.joblib") elif MODEL_TYPE == "RF": print("LOADING RF...") curr_model = load("rfc.joblib") elif MODEL_TYPE == "NB": print("LOADING NB...") curr_model = load("nb.joblib") elif MODEL_TYPE == "AB": print("LOADING AB...") curr_model = load("ab.joblib") elif MODEL_TYPE == "DQN": print("LOADING DQN...") BetNet = DQNAgent(75) BetNet.load("weights/betnet-weights-dqn.h5") curr_model = BetNet else: print("LOADING NN...") BetNet = Network(env.matches.shape[1]) BetNet.load_weights( "weights/Adadelta/test13_100iter_reglast2/weights-improvement-100-0.52.hdf5" ) # Most recent weights curr_model = BetNet ############################################################################### #GETS THE PREDICTION VEC GIVEN MODEL def generatePrediction(mt, curr_model, to_process):
from mdp import MDP
import random

'''
MDP Parameters
'''
mdp = MDP(duration_history=3, duration_simulation=1, delta_t=0.1)

'''
Environment Parameters
'''
w = wind(mean=45 * TORAD, std=0 * TORAD, samples=10)
WH = w.generateWind()
hdg0 = 0 * np.ones(10)
mdp.initializeMDP(hdg0, WH)

agent = DQNAgent(mdp.size, action_size=2)
# agent.load("../Networks/lighter_archi")

batch_size = 50
EPISODES = 500
hdg0_rand_vec = [-3, 0, 3, 6, 9, 12, 15, 18, 21]

loss_of_episode = []
i = []
v = []
r = []
for e in range(EPISODES):
    WH = w.generateWind()
    hdg0_rand = random.choice(hdg0_rand_vec) * TORAD
    hdg0 = hdg0_rand * np.ones(10)
def optimize_agent(trial, args):
    "Optimize the model."
    model_name = args.study_name + "_" + str(trial.number)
    env_kwargs = dict()
    callback_checkpoint_kwargs = dict()
    save_dir = args.save_dir
    log_interval = args.log_interval
    num_cpus = args.num_cpus
    eval_episodes = args.eval_episodes
    n_steps = args.n_steps
    layer_normalization = args.layer_normalization
    layers = args.layers

    env_kwargs["board_size"] = 4
    env_kwargs["binary"] = not args.no_binary
    env_kwargs["extractor"] = args.extractor
    env_kwargs["seed"] = args.seed
    env_kwargs["penalty"] = args.penalty

    callback_checkpoint_kwargs["save_freq"] = args.save_freq
    callback_checkpoint_kwargs["save_path"] = args.save_dir
    callback_checkpoint_kwargs["name_prefix"] = model_name

    if args.agent == "ppo2":
        model_kwargs = trial_hiperparameter_ppo2(trial)
        model_kwargs["agent"] = "ppo2"
        model_kwargs["tensorboard_log"] = args.tensorboard_log
        model = PPO2Agent(
            model_name,
            save_dir,
            log_interval,
            num_cpus,
            eval_episodes,
            n_steps,
            layer_normalization,
            model_kwargs,
            env_kwargs,
            callback_checkpoint_kwargs,
        )
    elif args.agent == "dqn":
        # model_kwargs = trial_hiperparameter_dqn(trial)
        model_kwargs = {}
        model_kwargs["learning_rate"] = 0.0001
        model_kwargs["batch_size"] = 10000
        model_kwargs["learning_starts"] = 10000
        model_kwargs["target_network_update_freq"] = 1000
        model_kwargs["train_freq"] = 4
        model_kwargs["agent"] = "dqn"
        model_kwargs["tensorboard_log"] = args.tensorboard_log
        model_kwargs["double_q"] = True
        model_kwargs["prioritized_replay"] = True
        model_kwargs["param_noise"] = True
        print(model_kwargs)
        model = DQNAgent(
            model_name,
            save_dir,
            log_interval,
            num_cpus,
            eval_episodes,
            n_steps,
            layer_normalization,
            layers,
            args.load_path,
            args.num_timesteps_log,
            model_kwargs,
            env_kwargs,
            callback_checkpoint_kwargs,
        )
    elif args.agent == "acer":
        model_kwargs = trial_hiperparameter_acer(trial)
        model_kwargs["agent"] = "acer"
        model_kwargs["tensorboard_log"] = args.tensorboard_log
        model_kwargs["replay_start"] = 2000
        model = ACERAgent(
            model_name,
            save_dir,
            log_interval,
            num_cpus,
            eval_episodes,
            n_steps,
            layer_normalization,
            model_kwargs,
            env_kwargs,
            callback_checkpoint_kwargs,
        )
    else:
        raise ValueError("Choose a valid agent model")  # was missing `raise`

    model.train()
    total_score = model.test()
    return total_score
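# Not from the source: optimize_agent() takes an Optuna-style `trial`, so it is
# presumably driven by an Optuna study roughly like this hedged sketch; `args`
# comes from the project's own argument parser, and n_trials is illustrative.
import optuna

study = optuna.create_study(study_name=args.study_name, direction="maximize")
study.optimize(lambda trial: optimize_agent(trial, args), n_trials=20)
print(study.best_params)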
def __init__(self, simulator):
    self.agent = DQNAgent(25, 6)
    self.agent.load("./save/car-100-dqn.h5")
    self.simulator = simulator
    self.agent.epsilon = 0
if __name__ == '__main__':
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    # device = torch.device('cpu')
    print(device)

    env = make_env(seed)
    state_shape = env.observation_space.shape
    n_actions = env.action_space.n
    state = env.reset()

    agent = DQNAgent(state_shape, n_actions, epsilon=0.9).to(device)
    # agent.load_state_dict(torch.load('dqn.weights'))
    target_network = DQNAgent(state_shape, n_actions).to(device)
    target_network.load_state_dict(agent.state_dict())
    opt = torch.optim.Adam(agent.parameters(), lr=1e-4)

    exp_replay = ReplayBuffer(buffer_size)
    print('test_buffer')
    for i in range(100):
        play_and_record(state, agent, env, exp_replay, n_steps=10**2)
        if len(exp_replay) == buffer_size:
            break
    print(len(exp_replay))

    state = env.reset()
    for step in trange(step, total_steps + 1):
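# Not from the source: a hedged sketch of the TD training step that presumably
# fills the truncated loop above. It assumes the DQNAgent module maps a batch of
# states to Q-values, and that exp_replay.sample(batch_size) returns arrays
# (states, actions, rewards, next_states, dones); both are assumptions.
import torch.nn.functional as F

def td_loss_step(batch_size=32, gamma=0.99):
    states, actions, rewards, next_states, dones = exp_replay.sample(batch_size)
    states = torch.as_tensor(states, dtype=torch.float32, device=device)
    actions = torch.as_tensor(actions, dtype=torch.int64, device=device)
    rewards = torch.as_tensor(rewards, dtype=torch.float32, device=device)
    next_states = torch.as_tensor(next_states, dtype=torch.float32, device=device)
    dones = torch.as_tensor(dones, dtype=torch.float32, device=device)

    # Q(s, a) for the actions actually taken
    q_taken = agent(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    # bootstrapped target from the frozen target network
    with torch.no_grad():
        q_next = target_network(next_states).max(dim=1).values
    target = rewards + gamma * q_next * (1 - dones)

    loss = F.mse_loss(q_taken, target)
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()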
optimizer = Adam(learning_rate=0.001)
memory = SequentialMemory(limit=20000, window_length=WINDOW_LENGTH)
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                              attr='eps',
                              value_max=1.0,
                              value_min=0.1,
                              value_test=0.05,
                              nb_steps=1000000)
dqn = DQNAgent(model=model,
               nb_actions=3,
               policy=policy,
               memory=memory,
               nb_steps_warmup=2000,
               gamma=0.95,
               target_model_update=2000,
               train_interval=1,
               delta_clip=1.0)
dqn.compile(optimizer, metrics=['mae'])

env = Tetris()
start = time.time()
weights_filename = 'dqn_{}_tetris_weights.h5'.format(start)
checkpoint_weights_filename = 'dqn_{}_tetris_weights_.h5'.format(start)
log_filename = 'dqn_tetris_log.json'
callbacks = [
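# Not from the source: the snippet above cuts off at `callbacks = [`. A
# plausible continuation with keras-rl's stock callbacks and training call
# looks roughly like this; the interval and nb_steps values are guesses.
from rl.callbacks import FileLogger, ModelIntervalCheckpoint

callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000),
    FileLogger(log_filename, interval=100),
]
dqn.fit(env, callbacks=callbacks, nb_steps=1000000, log_interval=10000)
dqn.save_weights(weights_filename, overwrite=True)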
def create_dqn_agent(num_states, num_actions):
    return DQNAgent(num_states, num_actions)
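# Not from the source: a hedged usage sketch for the factory above, assuming a
# gym environment with a flat observation vector and a discrete action space.
import gym

env = gym.make('CartPole-v0')
agent = create_dqn_agent(env.observation_space.shape[0], env.action_space.n)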
        debug_log(f'file content with syntax error\n{s}')
        debug_log('')
        for i in range(5):
            try:
                os.remove(file_path)
                break
            except PermissionError:
                debug_log('Permission error when removing the file')
                time.sleep(1)

#NOTE: train
#NOTE: create/load DQN and target DQN in main thread
keras.backend.clear_session()
agent = DQNAgent(INPUT_SHAPE,
                 training=True,
                 replay_memory=minibatch,
                 copy_target_model=False)
agent.target_model = load_model(target_model_path)

#NOTE: train newly loaded model on new data
if len(minibatch) > 0:
    minibatch_history = agent.train_only(len(minibatch), len(minibatch))
    if minibatch_history is None:
        debug_log('ERROR: Unable to train on iteration\'s data')
    replay_memory.extend(minibatch)
else:
    debug_log('WARNING: Skipping minibatch training since no new data was found')

#NOTE: train newly loaded model on random selection of old data
agent.replay_memory = replay_memory
sum_loss = 0
if len(replay_memory) > MIN_REPLAY_MEMORY_SIZE:
def main(): print "Creating DQN agent..." iters = 10000 n_goal = 0 n_goal_all = 0 time_stamp = 0 ############################################################ # print x # max_steps = 3 # actions = ["print", " ", "x"] ############################################################ ############################################################ # print x+1 max_steps = 5 actions = ["print", " ", "x", "+", "1"] ############################################################ agent = DQNAgent(max_steps, actions) agent.dqn.initial_exploration = iters * 0.6 results = [] policy_frozen = False wins_file = "wins.txt" with io.FileIO(wins_file, "w") as file: file.write("Winning codes:\n") for iter in range(iters): print "\n\n::{}::".format(iter) if iter == 4300: # 2300: policy_frozen = True env = gym.make("codegen-v0") num = random.randrange(1, 100) env.my_input = num ############################################################ # print x # env.goal = str(num) ############################################################ ############################################################ # print x+1 env.goal = str(num + 1) ############################################################ code = env._reset() step_in_episode = 0 total_score = 0.0 reward = 0.0 mystate = [] my_state_new = [] while step_in_episode < max_steps: state = env.code_index_list[:] state += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() if step_in_episode == 0: action_idx = agent.start(code, state, policy_frozen) else: action_idx = agent.act(code, state, reward) code, reward, terminal, info = env._step(action_idx, agent.dqn.actions) state_prime = env.code_index_list[:] state_prime += np.zeros([ max_steps - len(env.code_index_list), agent.dqn.code_idx_size ], dtype=int).tolist() agent.dqn.experience_replay(agent.dqn.time_stamp) if step_in_episode == max_steps - 1 or terminal: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, True) if terminal: agent.dqn.goal_idx.append(agent.dqn.time_stamp) agent.dqn.time_stamp += 1 else: agent.dqn.stock_experience(agent.dqn.time_stamp, state, action_idx, reward, state_prime, False) total_score += reward if terminal: agent.end(reward) n_goal_all += 1 step_in_episode += 1 if iters - iter <= 100: n_goal += 1 step_in_episode += 1 if iter >= 100: results = results[1:] if reward >= 1: print "WIN" results.append(1.0) with io.FileIO(wins_file, "a") as f: f.write( "\n=====================\n{}\n=====================\n\n". format(code)) f.flush() os.fsync(f) else: results.append(0.0) total_iters = 100 if iter >= 100 else iter + 1 print "TOTAL {:.2f}% of wins in last {} iters, sum: {}, total good: {}".format( 100 * sum(results) / total_iters, total_iters, sum(results), len(agent.dqn.goal_idx)) if iter == 1 + agent.dqn.initial_exploration: print "n_goal_all = ", n_goal_all print agent.dqn.goal_idx raw_input() print "n_goal : ", n_goal print "epsilon : ", agent.epsilon
kwargs['ACT_EVERY'] = int(kwargs['ACT_EVERY'])
kwargs['SEED'] = int(kwargs['SEED'])

# get environment
env = UnityEnvironment(file_name="Banana_Windows_x86_64/Banana.exe")

# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# get environment state and action space sizes
state = env_info.vector_observations[0]
state_size = len(state)
action_size = brain.vector_action_space_size

# make agent
agent = DQNAgent(state_size=state_size, action_size=action_size, **kwargs)

# load trained agent's weights
weights_name = test_name + '-weights.pth'
weights_path = os.path.join(test_results_path, weights_name)
agent.qnetwork_local.load_state_dict(
    torch.load(weights_path, map_location=lambda storage, loc: storage))

# navigate
navigate(env, agent, brain_name)