def test_dqn(self):
    import roomai.sevenking
    env = roomai.sevenking.SevenKingEnv()
    model = ExampleModel()
    dqn = DqnAlgorithm()
    opponents = [roomai.common.RandomPlayer() for i in range(2)]
    dqn.train(model=model, env=env, params={})
    dqn.eval(model=model, env=env, opponents=opponents, params={})
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0
    maxReward = 1.0  # running maximum raw reward used to scale clipped rewards (assumed initial value)

    while environment.getStepNumber() - stepStart < minEpochSteps:
        startTime = lastLogTime = time.time()
        stateReward = 0
        state = None
        while not environment.isGameOver():
            # Choose next action (epsilon-greedy; epsilon anneals linearly during training)
            if evalWithEpsilon is None:
                epsilon = max(.1, 1.0 - 0.9 * environment.getStepNumber() / 1e6)
            else:
                epsilon = evalWithEpsilon

            if state is None or random.random() > (1 - epsilon):
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(state.getScreens(), (1, 84, 84, 4))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)

            # Record experience in replay memory and train
            if isTraining and oldState is not None:
                maxReward = reward if reward > maxReward else maxReward
                clippedReward = min(1, max(-1, reward)) / maxReward
                replayMemory.addSample(replay.Sample(oldState, action, clippedReward, state, isTerminal))

                if environment.getStepNumber() > args.observation_steps and environment.getEpisodeStepNumber() % 4 == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())

            if time.time() - lastLogTime > 60:
                print(' ...frame %d' % environment.getEpisodeFrameNumber())
                lastLogTime = time.time()

            if isTerminal:
                state = None

        episodeTime = time.time() - startTime
        print('%s %d ended with score: %d (%d frames in %fs for %d fps)' %
              ('Episode' if isTraining else 'Eval', environment.getGameNumber(),
               environment.getGameScore(), environment.getEpisodeFrameNumber(),
               episodeTime, environment.getEpisodeFrameNumber() / episodeTime))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()

    # return the average score
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps:
        startTime = lastLogTime = time.time()
        stateReward = 0
        state = None
        while not environment.isGameOver():
            # Choose next action
            if evalWithEpsilon is None:
                epsilon = max(.1, 1.0 - 0.9 * environment.getStepNumber() / 1e6)
            else:
                epsilon = evalWithEpsilon

            if state is None or random.random() > (1 - epsilon):
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(state.getScreens(), (1, 84, 84, 4))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)

            # Record experience in replay memory and train
            if isTraining and oldState is not None:
                clippedReward = min(1, max(-1, reward))
                replayMemory.addSample(replay.Sample(oldState, action, clippedReward, state, isTerminal))

                if environment.getStepNumber() > args.observation_steps and environment.getEpisodeStepNumber() % 4 == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())

            if time.time() - lastLogTime > 60:
                print(' ...frame %d' % environment.getEpisodeFrameNumber())
                lastLogTime = time.time()

            if isTerminal:
                state = None

        episodeTime = time.time() - startTime
        print('%s %d ended with score: %d (%d frames in %fs for %d fps)' %
              ('Episode' if isTraining else 'Eval', environment.getGameNumber(),
               environment.getGameScore(), environment.getEpisodeFrameNumber(),
               episodeTime, environment.getEpisodeFrameNumber() / episodeTime))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()

    # return the average score
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)
def train_and_score(self, environment, memory, epochs=500):
    env = gym.make(environment)
    output_shape = env.action_space.n
    input_shape = env.observation_space.shape
    model = self.compile_model(input_shape, output_shape)
    memory = copy.deepcopy(memory)
    score = train(environment, model, memory, epochs)
    return score
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default='a3c', type=str, help="dqn, drqn, a3c")
    ##############################################
    parser.add_argument("--game", default='BreakoutDeterministic', type=str)
    ################ Visual Attention ################
    parser.add_argument("--tis", default='False', type=str)
    ################ Value Based ################
    parser.add_argument("--double", default='False', type=str)
    parser.add_argument("--dueling", default='False', type=str)
    ########### DRQN ###########
    parser.add_argument("--drqn_skill", default='norm', type=str, help="norm, doom")
    ################ Policy Based ################
    parser.add_argument("--num_cpu", default=8, type=int)
    parser.add_argument("--all_cpu", type=str)
    ############### Common Arguments ###############
    parser.add_argument("--train_time", default=24, type=int)
    parser.add_argument("--report_path", type=str)
    parser.add_argument("--model_path", type=str)
    parser.add_argument("--report_file_name", type=str)
    args = parser.parse_args()
    ##############################################
    if args.all_cpu == "True":
        args.num_cpu = multiprocessing.cpu_count()

    args.report_file_name = args.game + "_" + args.model + ".txt"
    args.report_path = "./report/"
    args.model_path = "./model/" + args.game + "/"
    make_path(args.report_path)
    make_path(args.model_path)

    if args.model == 'dqn':
        dqn.train(args)
    if args.model == 'drqn':
        drqn.train(args)
    if args.model == 'a3c':
        a3c.train(args)
def run_c_learn():
    pretrained_max_epsilon = 0.5
    exp_name = get_exp_name("curriculum", "dqn")

    one_eight_path = "reward_shaping_large_43_8.p"
    one_eight_env = construct_curriculum_env(0)
    one_eight_model_path = os.path.join("models", exp_name, "one_eight.pt")

    quarter_path = "reward_shaping_large_quarter.p"
    quarter_env = construct_curriculum_env(1)
    quarter_model_path = os.path.join("models", exp_name, "quarter.pt")

    half_path = "reward_shaping_large_half.p"
    half_env = construct_curriculum_env(2)
    half_model_path = os.path.join("models", exp_name, "half.pt")

    full_path = "reward_shaping_large.p"
    full_env = construct_curriculum_env(3)
    full_model_path = os.path.join("models", exp_name, "full.pt")

    print("=" * 20)
    print("Start one-eighth curriculum learning")
    print("=" * 20)
    one_eight_model = get_model(os.path.join("models", "best_one_eight", "one_eight.pt"))
    # one_eight_model = None
    if one_eight_model is None:
        while one_eight_model is None:
            one_eight_model = train(ConvDQN, one_eight_env, pretrained=None,
                                    reward_shaping_p=one_eight_path, input_t_max=6)
        ensure_path(one_eight_model_path)
        save_model(one_eight_model, one_eight_model_path)

    print("Test one eight curriculum learning")
    test(one_eight_model, one_eight_env, input_tmax=6, max_episodes=100)

    print("=" * 20)
    print("Start quarter curriculum learning")
    print("=" * 20)
    quarter_model = get_model(os.path.join("models", "best_quarter", "quarter.pt"))
    # quarter_model = None
    if quarter_model is None:
        while quarter_model is None:
            quarter_model = train(ConvDQN, quarter_env, pretrained=one_eight_model,
                                  reward_shaping_p=quarter_path, input_t_max=13,
                                  max_epsilon=pretrained_max_epsilon)
        ensure_path(quarter_model_path)
        save_model(quarter_model, quarter_model_path)

    print("Test quarter curriculum learning")
    test(quarter_model, quarter_env, input_tmax=13, max_episodes=100)

    print("=" * 20)
    print("Start half curriculum learning")
    print("=" * 20)
    half_model = None
    if half_model is None:
        while half_model is None:
            half_model = train(ConvDQN, half_env, pretrained=quarter_model,
                               reward_shaping_p=half_path,
                               max_epsilon=pretrained_max_epsilon, input_t_max=25)
        ensure_path(half_model_path)
        save_model(half_model, half_model_path)

    print("Test half curriculum learning")
    test(half_model, half_env, input_tmax=25, max_episodes=100)

    print("=" * 20)
    print("Start full curriculum learning")
    print("=" * 20)
    full_model = None
    if full_model is None:
        while full_model is None:
            full_model = train(ConvDQN, full_env, pretrained=half_model,
                               reward_shaping_p=full_path,
                               max_epsilon=pretrained_max_epsilon, input_t_max=50)
        ensure_path(full_model_path)
        save_model(full_model, full_model_path)

    print("Test full curriculum learning")
    test(full_model, full_env, input_tmax=50, max_episodes=100)
                self.action_feats: next_action_feats
            })
            #print ("q = %s"%(q.__str__()))
            reward_plus_gamma_q.append(experience.reward + self.gamma * np.max(q))
            info_feats.append(experience.info_feat)
            action_feats.append(experience.action_feat)

        _, loss, q = self.sess.run(
            (self.train_op, self.loss, self.q),
            feed_dict={
                self.info_feats: info_feats,
                self.action_feats: action_feats,
                self.reward_plus_gamma_q: reward_plus_gamma_q
            })
        logger.debug("reward_plus_gamma_q = %s" % (reward_plus_gamma_q.__str__()))
        logger.debug("loss = %f" % (loss))
        logger.debug("q = %s" % (q.__str__()))


if __name__ == "__main__":
    env = roomai.sevenking.SevenKingEnv()
    model = SevenKingModel_ThreePlayers()
    dqn = dqn.DqnAlgorithm()
    dqn.train(env=env, model=model, params={"num_normal_players": 3})
    opponents = [roomai.common.RandomPlayer() for i in range(2)]
    scores = dqn.eval(model=model, env=env, opponents=opponents)
    print(scores)
import gym
from gym import envs
import argparse
from gym_env.gym_apple_grid.envs.apple_grid_env import AppleGridEnv
from dqn import train

parser = argparse.ArgumentParser()
parser.add_argument('--grid_size_x', type=int, default=12)
parser.add_argument('--grid_size_y', type=int, default=12)
parser.add_argument('--apple_count', type=int, default=20)
parser.add_argument('--agent_count', type=int, default=2)
parser.add_argument('--observation_size', type=int, default=10)
parser.add_argument('--num_episodes', type=int, default=250)
parser.add_argument('--exp_steps', type=int, default=500)
args = parser.parse_args()

env = AppleGridEnv()
env.init_env(dimensions=[args.grid_size_x, args.grid_size_y],
             num_apples=args.apple_count,
             num_actors=args.agent_count,
             episode_steps=args.exp_steps,
             obs_window_size=args.observation_size)
train(env, args, is_rendering=False)
def run_epoch(min_epoch_steps, eval_with_epsilon=None):
    global train_epsilon
    global train_episodes
    global eval_episodes
    global episode_train_reward_list
    global episode_eval_reward_list

    is_training = True if eval_with_epsilon is None else False
    step_start = environment.get_step_number()
    start_game_number = environment.get_game_number()
    epoch_total_score = 0
    stuck_count = 0
    time_list = []

    while environment.get_step_number() - step_start < min_epoch_steps and not stop:
        start_time = datetime.datetime.now()  # episode timer used by the logging below
        state_reward = 0
        state = None
        episode_losses = []
        save_net = False
        while not environment.is_game_over() and not stop:
            # epsilon selection and update
            if is_training:
                epsilon = train_epsilon
                if train_epsilon > args.epsilon_min:
                    train_epsilon = train_epsilon * args.epsilon_decay
                    if train_epsilon < args.epsilon_min:
                        train_epsilon = args.epsilon_min
            else:
                epsilon = eval_with_epsilon

            # action selection
            if state is None or random.random() < epsilon:
                action = random.randrange(environment.get_num_actions())
            else:
                action = dqn.inference(state.get_data())

            # We can't skip frames as in a game: we have to wait for the environment
            # to evolve, but we don't want to waste GPU time, so we run a training
            # sweep (which takes some time) instead of simply sleeping.
            old_state = state
            for i in range(0, args.history_length * (args.repeat_action + 1)):
                if environment.get_step_number() % args.save_model_freq == 0:
                    save_net = True

                # Make the move
                reward, state, is_terminal = environment.step(action)

                # train
                if is_training and old_state is not None:
                    if environment.get_step_number() > args.observation_steps:
                        if args.show_gpu_time:
                            start_time_train = datetime.datetime.now()
                        batch = replay_memory.draw_batch(args.batch_size)
                        loss = dqn.train(batch, environment.get_step_number())
                        episode_losses.append(loss)
                        if args.show_gpu_time:
                            training_time = (datetime.datetime.now() - start_time_train).total_seconds()
                            time_list.insert(0, training_time)
                            if len(time_list) > 100:
                                time_list = time_list[:-1]
                            print("Training time: %fs, Avg time:%fs" % (training_time, np.mean(time_list)))
                        if args.slowdown_cycle:
                            time.sleep(args.gpu_time)
                    else:
                        time.sleep(args.gpu_time)
                else:
                    time.sleep(args.gpu_time)

                if is_terminal:
                    break

            # Record experience in replay memory
            if is_training and old_state is not None:
                replay_memory.add_sample(
                    replay.Sample(old_state, action, reward, state, is_terminal))

            if is_terminal:
                state = None

            if args.simulator:
                if reward == -1:
                    stuck_count = stuck_count + 1
                else:
                    stuck_count = 0
                if stuck_count > 2:
                    print("Car stuck, resetting simulator position...")
                    environment.control.reset_simulator()
                    stuck_count = 0

        if save_net:
            dqn.save_network()

        #################################
        # logging
        #################################
        episode_time = datetime.datetime.now() - start_time

        if is_training:
            train_episodes += 1
            episode_train_reward_list.insert(0, environment.get_game_score())
            if len(episode_train_reward_list) > 100:
                episode_train_reward_list = episode_train_reward_list[:-1]
            avg_rewards = np.mean(episode_train_reward_list)

            episode_avg_loss = 0
            if episode_losses:
                episode_avg_loss = np.mean(episode_losses)

            log = ('Episode %d ended with score: %.2f (%s elapsed) (step: %d). '
                   'Avg score: %.2f Avg loss: %.5f' %
                   (environment.get_game_number(), environment.get_game_score(),
                    str(episode_time), environment.get_step_number(),
                    avg_rewards, episode_avg_loss))
            print(log)
            print(" epsilon " + str(train_epsilon))

            if args.logging:
                with summary_writer.as_default():
                    tf.summary.scalar('train episode reward', environment.get_game_score(), step=train_episodes)
                    tf.summary.scalar('train avg reward(100)', avg_rewards, step=train_episodes)
                    tf.summary.scalar('average loss', episode_avg_loss, step=train_episodes)
                    tf.summary.scalar('epsilon', train_epsilon, step=train_episodes)
                    tf.summary.scalar('steps', environment.get_step_number(), step=train_episodes)
        else:
            eval_episodes += 1
            episode_eval_reward_list.insert(0, environment.get_game_score())
            if len(episode_eval_reward_list) > 100:
                episode_eval_reward_list = episode_eval_reward_list[:-1]
            avg_rewards = np.mean(episode_eval_reward_list)

            log = ('Eval %d ended with score: %.2f (%s elapsed) (step: %d). Avg score: %.2f' %
                   (environment.get_game_number(), environment.get_game_score(),
                    str(episode_time), environment.get_step_number(), avg_rewards))
            print(log)

            if args.logging:
                with summary_writer.as_default():
                    tf.summary.scalar('eval episode reward', environment.get_game_score(), step=eval_episodes)
                    tf.summary.scalar('eval avg reward(100)', avg_rewards, step=eval_episodes)

        epoch_total_score += environment.get_game_score()
        environment.reset_game()

        while pause and not stop:
            time.sleep(1)

    if environment.get_game_number() - start_game_number == 0:
        return 0
    return epoch_total_score / (environment.get_game_number() - start_game_number)
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    global train_epsilon
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps and not stop:
        startTime = datetime.datetime.now()  # episode timer used by the logging below
        stateReward = 0
        state = None
        while not environment.isGameOver() and not stop:
            # Choose next action (epsilon-greedy)
            if evalWithEpsilon is None:
                epsilon = train_epsilon
            else:
                epsilon = evalWithEpsilon

            # Anneal the training epsilon towards its minimum
            if train_epsilon > args.epsilon_min:
                train_epsilon = train_epsilon * args.epsilon_decay
                if train_epsilon < args.epsilon_min:
                    train_epsilon = args.epsilon_min

            if state is None or random.random() < epsilon:
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(
                    state.getScreens(),
                    (1, State.IMAGE_SIZE, State.IMAGE_SIZE, args.frame))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)

            # Record experience in replay memory and train
            if isTraining and oldState is not None:
                clippedReward = min(1, max(-1, reward))
                replayMemory.addSample(
                    replay.Sample(oldState, action, clippedReward, state, isTerminal))

                if environment.getStepNumber() > args.observation_steps and \
                        environment.getEpisodeStepNumber() % args.frame == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())

            if isTerminal:
                state = None

        episodeTime = datetime.datetime.now() - startTime
        print('%s %d ended with score: %d (%s elapsed)' %
              ('Episode' if isTraining else 'Eval', environment.getGameNumber(),
               environment.getGameScore(), str(episodeTime)))
        if isTraining:
            print("epsilon " + str(train_epsilon))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()

    # return the average score
    if environment.getGameNumber() - startGameNumber == 0:
        return 0
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)
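The epoch loops above use two different exploration schedules: the earlier 84x84 Atari loops anneal epsilon linearly against the global step count, while the last two decay it multiplicatively on every step. A side-by-side sketch of the two patterns (the constants below are illustrative, not taken from any of the projects):

def linear_epsilon(step, min_eps=0.1, anneal_steps=1e6):
    # linear anneal from 1.0 down to min_eps over anneal_steps, as in the Atari loops above
    return max(min_eps, 1.0 - 0.9 * step / anneal_steps)

def multiplicative_epsilon(prev_eps, min_eps=0.05, decay=0.999):
    # per-step geometric decay towards min_eps, as in the loops driven by args.epsilon_decay
    return max(min_eps, prev_eps * decay)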
def train():
    rnd_seed = 0
    np.random.seed(rnd_seed)
    tf.set_random_seed(rnd_seed)

    sim = GymSim('CartPole-v0', 5000, seed=rnd_seed)
    sim.act_sample_batch(5000, FLAGS.sample_neg_ratio)  # bootstrap with random actions
    sim.print_stats()
    #embed()
    #sys.exit()

    q_network = MLN(sim.INPUT_DIM, sim.ACTION_DIM)
    target_network = MLN(sim.INPUT_DIM, sim.ACTION_DIM, name_scope='target')

    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        action_pl = tf.placeholder(tf.int64, name='action_pl')
        reward_pl = tf.placeholder(tf.float32, name='reward_pl')
        state_pl = tf.placeholder(tf.float32, (None, sim.INPUT_DIM), name='state_pl')
        observ_pl = tf.placeholder(tf.float32, (None, sim.INPUT_DIM), name='observ_pl')

        action_q = q_network.inference(state_pl)
        target_q = tf.stop_gradient(target_network.inference(observ_pl))
        target_q_pt = tf.Print(target_q, [target_q])
        action_q_pt = tf.Print(action_q, [action_q])

        loss = dqn.td_loss(action_pl, sim.ACTION_DIM, action_q, reward_pl, target_q)
        train_op = dqn.train(FLAGS.learning_rate, loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        action_op = tf.argmax(action_q, 1, name='action_op')
        copy_var = q_network.copy_to('target')

        init = tf.initialize_all_variables()
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        # initialize variables
        sess.run(init)
        summary_writer = tf.train.SummaryWriter(
            os.path.join(FLAGS.train_dir, 'logs'), sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            if step % 4 == 0:
                sess.run(copy_var)

            feed = sim.feed_batch(state_pl, action_pl, reward_pl, observ_pl,
                                  FLAGS.batch_size)
            _, loss_value = sess.run([train_op, loss], feed_dict=feed)
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step > FLAGS.sample_after:
                pred_act = sess.run(action_op, feed_dict={state_pl: sim.state})
                pred_act = pred_act[0]
                sim.act_sample_once(pred_act, neg_ratio=FLAGS.sample_neg_ratio,
                                    append_db=True)

            # visualization
            if step % 1000 == 0 and step != 0:
                sim.reset()
                survive = 0
                for _ in range(200):
                    pred_act = sess.run(action_op, feed_dict={state_pl: sim.state})
                    pred_act = pred_act[0]
                    done = sim.act_demo(pred_act)
                    if not done:
                        survive += 1
                    else:
                        print('Survived for %i frame' % survive)
                        survive = 0

            #if step % 100 == 0:
            #    summary_str = sess.run(summary_op)
            #    summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
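The graph above delegates the TD objective to dqn.td_loss, whose body is not shown. A minimal sketch of the usual one-hot-masked TD loss under those argument names; this is an assumption about the common pattern, not the project's implementation, and it ignores terminal-state masking just as the call above passes no done flag:

import tensorflow as tf  # TF1-style ops, matching the snippet above

def td_loss_sketch(action_pl, num_actions, action_q, reward_pl, target_q, gamma=0.99):
    action_mask = tf.one_hot(action_pl, num_actions)                 # select Q(s, a) for the taken action
    q_sa = tf.reduce_sum(action_q * action_mask, axis=1)
    target = reward_pl + gamma * tf.reduce_max(target_q, axis=1)     # r + gamma * max_a' Q_target(s', a')
    return tf.reduce_mean(tf.square(target - q_sa))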
    state = game.getLogBoard()
    while not game.Lost():
        action = agent.selectAction(state, counter)
        next_state, reward, done, max_tile = game.step(action)
        buffer.store([state, action, next_state, reward, done])
        state = next_state
        # sleep(0.1)
        # os.system('clear')
        # print(next_state)
        counter += 1

        # NN training
        if counter > 50:
            train(buffer, agent)

        iteration_max_tile = max(iteration_max_tile, max_tile)
        if max_tile > super_max_tile:
            super_max_tile = max_tile

    # update target NN
    iteration_sum_maxtile += iteration_max_tile
    if episodes % agent.target_update_freq == 0:
        agent.target_nn.load_state_dict(agent.nn.state_dict())
        print("Episode {}.\t iterations avg max tile {}.\t Max tile so far {}\t Epsilon {}"
              .format(episodes, iteration_sum_maxtile / agent.target_update_freq,
                      super_max_tile, agent.eps_scheduled(it)))
        iteration_max_tile = -1
        iteration_sum_maxtile = 0
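The loop above calls a train(buffer, agent) helper that is not shown. A minimal sketch of the standard PyTorch DQN update such a helper typically performs; the attribute and method names (agent.optimizer, agent.gamma, buffer.sample, the tensor layout of the batch) are assumptions, since only agent.nn, agent.target_nn, and buffer.store appear in the snippet:

import torch
import torch.nn.functional as F

def train_sketch(buffer, agent, batch_size=64):
    # sample a batch of transitions as tensors: states [B, dims], actions/rewards/dones [B]
    states, actions, next_states, rewards, dones = buffer.sample(batch_size)
    q_sa = agent.nn(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)    # Q(s, a) for taken actions
    with torch.no_grad():
        next_q = agent.target_nn(next_states).max(dim=1).values                 # max_a' Q_target(s', a')
        target = rewards + agent.gamma * next_q * (1.0 - dones.float())         # no bootstrap past terminal states
    loss = F.mse_loss(q_sa, target)
    agent.optimizer.zero_grad()
    loss.backward()
    agent.optimizer.step()
    return loss.item()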
def get_model(input_shape, output_shape):
    model = Sequential()
    model.add(Dense(16, activation='tanh', input_shape=input_shape))
    model.add(Dense(64, activation='elu'))
    model.add(Dense(16, activation='sigmoid'))
    model.add(Dense(output_shape, activation="linear"))
    model.compile(loss="MSE", optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


if __name__ == '__main__':
    args = get_args()
    environment = args.environment
    env = gym.make(environment)
    output_shape = env.action_space.n
    input_shape = env.observation_space.shape
    epochs = 500

    memory = fill_memory(environment)
    model = get_model(input_shape, output_shape)
    score = train(environment, model, memory, epochs)
    print(score)
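The script above relies on a fill_memory helper that is not shown. A minimal sketch of what such a helper usually does, assuming the classic Gym API (env.reset() returning only the observation) and a plain list as the replay memory; the function name and transition-tuple layout are illustrative, not the project's actual code:

import gym

def fill_memory_sketch(environment, size=10000):
    # collect `size` random-policy transitions as (state, action, reward, next_state, done) tuples
    env = gym.make(environment)
    memory = []
    state = env.reset()
    while len(memory) < size:
        action = env.action_space.sample()              # uniform random action
        next_state, reward, done, _ = env.step(action)
        memory.append((state, action, reward, next_state, done))
        state = env.reset() if done else next_state
    return memory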
import dqn
import argparse

parser = argparse.ArgumentParser(description='Train a DQN agent on a Gym environment.')
parser.add_argument('--env_name', type=str, help='Gym environment name (defaults to Breakout-v0)')
args = parser.parse_args()

env_name = args.env_name
if env_name is None:
    env_name = "Breakout-v0"

dqn = dqn.DQN(env_name)
dqn.train()
        loss=nn.L2Loss,
        renderFreq=envDisplayFreq)

    settingsInfo = settings.copy()
    settingsInfo['env'] = envName
    settingsInfo['loss'] = settingsInfo['loss'].__name__
    stats.append(settingsInfo)
    stats.append({'solve criteria': solveCriteria})
    stats.append([])

    print('++ TRAINING: {} | solve criteria: {} ++'.format(envName, solveCriteria))
    print('Training Settings:')
    pp.pprint(settingsInfo)

    for e, t, eps, l, r, Q in dqn.train(**settings):
        rewards.append(r)
        avg = np.mean(rewards)
        stats[-1].append((t, eps, l, r, avg))

        alert = None
        if avg > avgRewards:
            avgRewards = avg
            alert = '*R*'

        if alert or e % envPrintFreq == 0:
            print('[TRAINING ({:.2%})] e:{} | t:{} | eps:{:,.3f} | l:{:,.3f} | r:{:,.3f} | avg:{:,.3f} | {}'
                  .format((t + 1) / steps, e, t, eps, l, r, avg, alert))

    nn.save(Q.topology, modelPath)