Example #1
    def test_dqn(self):
        import roomai.sevenking
        # Train a DQN model on the SevenKing environment, then evaluate it
        # against two random opponents.
        env = roomai.sevenking.SevenKingEnv()
        model = ExampleModel()
        dqn = DqnAlgorithm()
        opponents = [roomai.common.RandomPlayer() for i in range(2)]
        dqn.train(model=model, env=env, params={})
        dqn.eval(model=model, env=env, opponents=opponents, params={})
Example #2
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0
    maxReward = 1  # running maximum of raw rewards; start at 1 so the normalisation below never divides by zero

    while environment.getStepNumber() - stepStart < minEpochSteps:
    
        startTime = lastLogTime = time.time()
        stateReward = 0
        state = None
        
        while not environment.isGameOver():
      
            # Choose next action
            if evalWithEpsilon is None:
                epsilon = max(.1, 1.0 - 0.9 * environment.getStepNumber() / 1e6)
            else:
                epsilon = evalWithEpsilon

            if state is None or random.random() > (1 - epsilon):
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(state.getScreens(), (1, 84, 84, 4))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)
            
            # Record experience in replay memory and train
            if isTraining and oldState is not None:

                maxReward = reward if reward > maxReward else maxReward

                clippedReward = min(1, max(-1, reward)) / maxReward
                replayMemory.addSample(replay.Sample(oldState, action, clippedReward, state, isTerminal))

                if environment.getStepNumber() > args.observation_steps and environment.getEpisodeStepNumber() % 4 == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())
        
            if time.time() - lastLogTime > 60:
                print('  ...frame %d' % environment.getEpisodeFrameNumber())
                lastLogTime = time.time()

            if isTerminal:
                state = None

        episodeTime = time.time() - startTime
        print('%s %d ended with score: %d (%d frames in %fs for %d fps)' %
            ('Episode' if isTraining else 'Eval', environment.getGameNumber(), environment.getGameScore(),
            environment.getEpisodeFrameNumber(), episodeTime, environment.getEpisodeFrameNumber() / episodeTime))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()
    
    # return the average score
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)
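The ε-greedy exploration in Examples #2 and #3 anneals ε linearly from 1.0 down to a floor of 0.1 over the first one million environment steps. A minimal standalone sketch of that schedule (the function name and keyword defaults are illustrative, not part of the example):

def linear_epsilon(step, eps_start=1.0, eps_end=0.1, anneal_steps=1_000_000):
    """Linearly anneal epsilon from eps_start to eps_end over anneal_steps."""
    return max(eps_end, eps_start - (eps_start - eps_end) * step / anneal_steps)

# Equivalent to the epsilon = max(.1, 1.0 - 0.9 * step / 1e6) line used above
assert abs(linear_epsilon(0) - 1.0) < 1e-9
assert abs(linear_epsilon(500_000) - 0.55) < 1e-9
assert abs(linear_epsilon(2_000_000) - 0.1) < 1e-9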
Example #3
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps:
    
        startTime = lastLogTime = time.time()
        stateReward = 0
        state = None
        
        while not environment.isGameOver():
      
            # Choose next action
            if evalWithEpsilon is None:
                epsilon = max(.1, 1.0 - 0.9 * environment.getStepNumber() / 1e6)
            else:
                epsilon = evalWithEpsilon

            if state is None or random.random() > (1 - epsilon):
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(state.getScreens(), (1, 84, 84, 4))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)
            
            # Record experience in replay memory and train
            if isTraining and oldState is not None:
                clippedReward = min(1, max(-1, reward))
                replayMemory.addSample(replay.Sample(oldState, action, clippedReward, state, isTerminal))

                if environment.getStepNumber() > args.observation_steps and environment.getEpisodeStepNumber() % 4 == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())
        
            if time.time() - lastLogTime > 60:
                print('  ...frame %d' % environment.getEpisodeFrameNumber())
                lastLogTime = time.time()

            if isTerminal:
                state = None

        episodeTime = time.time() - startTime
        print('%s %d ended with score: %d (%d frames in %fs for %d fps)' %
            ('Episode' if isTraining else 'Eval', environment.getGameNumber(), environment.getGameScore(),
            environment.getEpisodeFrameNumber(), episodeTime, environment.getEpisodeFrameNumber() / episodeTime))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()
    
    # return the average score
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)
Example #4
    def train_and_score(self, environment, memory, epochs=500):
        env = gym.make(environment)
        output_shape = env.action_space.n
        input_shape = env.observation_space.shape

        model = self.compile_model(input_shape, output_shape)
        memory = copy.deepcopy(memory)
        score = train(environment, model, memory, epochs)

        return score
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        default='a3c',
                        type=str,
                        help="dqn, drqn, a3c")
    ##############################################
    parser.add_argument("--game", default='BreakoutDeterministic', type=str)
    ################  Visual Attention  ################
    parser.add_argument("--tis", default='False', type=str)
    ################  Value Based  ################
    parser.add_argument("--double", default='False', type=str)
    parser.add_argument("--dueling", default='False', type=str)
    ###########  DRQN  ###########
    parser.add_argument("--drqn_skill",
                        default='norm',
                        type=str,
                        help="norm, doom")
    ################  Policy Based  ################
    parser.add_argument("--num_cpu", default=8, type=int)
    parser.add_argument("--all_cpu", type=str)
    ###############  Common Arguments  ###############
    parser.add_argument("--train_time", default=24, type=int)
    parser.add_argument("--report_path", type=str)
    parser.add_argument("--model_path", type=str)
    parser.add_argument("--report_file_name", type=str)
    args = parser.parse_args()
    ##############################################
    if args.all_cpu == "True": args.num_cpu = multiprocessing.cpu_count()
    args.report_file_name = args.game + "_" + args.model + ".txt"
    args.report_path = "./report/"
    args.model_path = "./model/" + args.game + "/"
    make_path(args.report_path)
    make_path(args.model_path)

    if args.model == 'dqn': dqn.train(args)
    if args.model == 'drqn': drqn.train(args)
    if args.model == 'a3c': a3c.train(args)
Example #6
def run_c_learn():
    pretrained_max_epsilon = 0.5
    exp_name = get_exp_name("curriculum", "dqn")

    one_eight_path = "reward_shaping_large_43_8.p"
    one_eight_env = construct_curriculum_env(0)
    one_eight_model_path = os.path.join("models", exp_name, "one_eight.pt")

    quarter_path = "reward_shaping_large_quarter.p"
    quarter_env = construct_curriculum_env(1)
    quarter_model_path = os.path.join("models", exp_name, "quarter.pt")

    half_path = "reward_shaping_large_half.p"
    half_env = construct_curriculum_env(2)
    half_model_path = os.path.join("models", exp_name, "half.pt")

    full_path = "reward_shaping_large.p"
    full_env = construct_curriculum_env(3)
    full_model_path = os.path.join("models", exp_name, "full.pt")

    print("=" * 20)
    print("Start octive curriculum learning")
    print("=" * 20)
    one_eight_model = get_model(os.path.join("models", "best_one_eight", "one_eight.pt"))
    # one_eight_model = None
    if one_eight_model is None:
        while one_eight_model is None:
            one_eight_model = train(ConvDQN, one_eight_env, pretrained=None, reward_shaping_p=one_eight_path, input_t_max=6)
        ensure_path(one_eight_model_path)
        save_model(one_eight_model, one_eight_model_path)
    print("Test one eight curriculum learning")
    test(one_eight_model, one_eight_env, input_tmax=6, max_episodes=100)

    print("=" * 20)
    print("Start quarter curriculum learning")
    print("=" * 20)
    quarter_model = get_model(os.path.join("models", "best_quarter", "quarter.pt"))
    # quarter_model = None
    if quarter_model is None:
        while quarter_model is None:
            quarter_model = train(ConvDQN, quarter_env, pretrained=one_eight_model, reward_shaping_p=quarter_path,
                                  input_t_max=13, max_epsilon=pretrained_max_epsilon)
        ensure_path(quarter_model_path)
        save_model(quarter_model, quarter_model_path)
    print("Test quarter curriculum learning")
    test(quarter_model, quarter_env, input_tmax=13, max_episodes=100)

    print("=" * 20)
    print("Start half curriculum learning")
    print("=" * 20)
    half_model = None
    if half_model is None:
        while half_model is None:
            half_model = train(ConvDQN, half_env, pretrained=quarter_model, reward_shaping_p=half_path,
                               max_epsilon=pretrained_max_epsilon, input_t_max=25)
        ensure_path(half_model_path)
        save_model(half_model, half_model_path)
    print("Test half curriculum learning")
    test(half_model, half_env, input_tmax=25, max_episodes=100)

    print("=" * 20)
    print("Start full curriculum learning")
    print("=" * 20)
    full_model = None
    if full_model is None:
        while full_model is None:
            full_model = train(ConvDQN, full_env, pretrained=half_model, reward_shaping_p=full_path,
                               max_epsilon=pretrained_max_epsilon, input_t_max=50)
        ensure_path(full_model_path)
        save_model(full_model, full_model_path)
    print("Test full curriculum learning")
    test(full_model, full_env, input_tmax=50, max_episodes=100)
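ensure_path, save_model and get_model are helpers defined outside this snippet. A plausible PyTorch-based sketch, assuming the trained models are torch.nn.Module instances serialised as whole objects (an assumption; the real helpers may differ):

import os
import torch

def ensure_path(path):
    # Create the parent directory of a checkpoint path if it does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)

def save_model(model, path):
    torch.save(model, path)

def get_model(path):
    # Return the saved model, or None when no checkpoint exists
    # (the caller above treats None as "train from scratch").
    return torch.load(path) if os.path.exists(path) else None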
Example #7
                                  self.action_feats: next_action_feats
                              })
            #print ("q = %s"%(q.__str__()))
            reward_plus_gamma_q.append(experience.reward +
                                       self.gamma * np.max(q))
            info_feats.append(experience.info_feat)
            action_feats.append(experience.action_feat)

        _, loss, q = self.sess.run(
            (self.train_op, self.loss, self.q),
            feed_dict={
                self.info_feats: info_feats,
                self.action_feats: action_feats,
                self.reward_plus_gamma_q: reward_plus_gamma_q
            })
        logger.debug("reward_plus_gamma_q = %s" %
                     (reward_plus_gamma_q.__str__()))
        logger.debug("loss = %f" % (loss))
        logger.debug("q = %s" % (q.__str__()))


if __name__ == "__main__":
    env = roomai.sevenking.SevenKingEnv()
    model = SevenKingModel_ThreePlayers()
    dqn = dqn.DqnAlgorithm()
    dqn.train(env=env, model=model, params={"num_normal_players": 3})

    opponents = [roomai.common.RandomPlayer() for i in range(2)]
    scores = dqn.eval(model=model, env=env, opponents=opponents)
    print(scores)
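The quantity accumulated in reward_plus_gamma_q is the standard one-step Q-learning target, r + γ·max_a' Q(s', a'). A small NumPy sketch of that computation for a batch of transitions (the array names are illustrative, and the terminal mask is a common refinement not present in the snippet above):

import numpy as np

def td_targets(rewards, next_q_values, gamma=0.99, terminals=None):
    """One-step TD targets: r + gamma * max_a' Q(s', a'), with the bootstrap zeroed on terminal states."""
    bootstrap = np.max(next_q_values, axis=1)
    if terminals is not None:
        bootstrap = bootstrap * (1.0 - terminals.astype(np.float32))
    return rewards + gamma * bootstrap

# Two transitions with three actions each -> targets [1.36, 0.0]
td_targets(np.array([1.0, 0.0]),
           np.array([[0.1, 0.4, 0.2], [0.3, 0.0, 0.5]]),
           gamma=0.9,
           terminals=np.array([0, 1]))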
Example #8
import gym
from gym import envs
import argparse
from gym_env.gym_apple_grid.envs.apple_grid_env import AppleGridEnv
from dqn import train

parser = argparse.ArgumentParser()
parser.add_argument('--grid_size_x', type=int, default=12)
parser.add_argument('--grid_size_y', type=int, default=12)
parser.add_argument('--apple_count', type=int, default=20)
parser.add_argument('--agent_count', type=int, default=2)
parser.add_argument('--observation_size', type=int, default=10)
parser.add_argument('--num_episodes', type=int, default=250)
parser.add_argument('--exp_steps', type=int, default=500)

args = parser.parse_args()

env = AppleGridEnv()
env.init_env(dimensions=[args.grid_size_x, args.grid_size_y],
             num_apples=args.apple_count,
             num_actors=args.agent_count,
             episode_steps=args.exp_steps,
             obs_window_size=args.observation_size)

train(env, args, is_rendering=False)
Example #9
def run_epoch(min_epoch_steps, eval_with_epsilon=None):
    global train_epsilon
    global train_episodes
    global eval_episodes
    global episode_train_reward_list
    global episode_eval_reward_list
    is_training = True if eval_with_epsilon is None else False
    step_start = environment.get_step_number()
    start_game_number = environment.get_game_number()
    epoch_total_score = 0
    stuck_count = 0
    time_list = []

    while environment.get_step_number() - step_start < min_epoch_steps and not stop:
        state_reward = 0
        state = None

        episode_losses = []
        save_net = False
        while not environment.is_game_over() and not stop:
            # epsilon selection and update
            if is_training:
                epsilon = train_epsilon
                if train_epsilon > args.epsilon_min:
                    train_epsilon = train_epsilon * args.epsilon_decay
                    if train_epsilon < args.epsilon_min:
                        train_epsilon = args.epsilon_min
            else:
                epsilon = eval_with_epsilon

            # action selection
            if state is None or random.random() < epsilon:
                action = random.randrange(environment.get_num_actions())
            else:
                action = dqn.inference(state.get_data())

            # Unlike an emulated game, frames cannot be skipped here: the environment
            # evolves in real time. Rather than wasting that time (and GPU cycles) in a
            # sleep, the wait is spent on a training sweep, which itself takes a while.
            old_state = state
            for i in range(0, args.history_length * (args.repeat_action + 1)):

                if environment.get_step_number() % args.save_model_freq == 0:
                    save_net = True

                # Make the move
                reward, state, is_terminal = environment.step(action)

                # train
                if is_training and old_state is not None:
                    if environment.get_step_number() > args.observation_steps:
                        if args.show_gpu_time:
                            start_time_train = datetime.datetime.now()
                        batch = replay_memory.draw_batch(args.batch_size)
                        loss = dqn.train(batch, environment.get_step_number())
                        episode_losses.append(loss)
                        if args.show_gpu_time:
                            training_time = (datetime.datetime.now() -
                                             start_time_train).total_seconds()
                            time_list.insert(0, training_time)
                            if len(time_list) > 100:
                                time_list = time_list[:-1]
                            print("Training time: %fs, Avg time:%fs" %
                                  (training_time, np.mean(time_list)))
                        if args.slowdown_cycle:
                            time.sleep(args.gpu_time)
                    else:
                        time.sleep(args.gpu_time)
                else:
                    time.sleep(args.gpu_time)

                if is_terminal:
                    break

            # Record experience in replay memory
            if is_training and old_state is not None:
                replay_memory.add_sample(
                    replay.Sample(old_state, action, reward, state,
                                  is_terminal))

            if is_terminal:
                state = None

            if args.simulator:
                if reward == -1:
                    stuck_count = stuck_count + 1
                else:
                    stuck_count = 0
                if stuck_count > 2:
                    print("Car stuck, resetting simulator position...")
                    environment.control.reset_simulator()
                    stuck_count = 0

        if save_net:
            dqn.save_network()

        #################################
        # logging
        #################################

        episode_time = datetime.datetime.now() - start_time

        if is_training:
            train_episodes += 1
            episode_train_reward_list.insert(0, environment.get_game_score())
            if len(episode_train_reward_list) > 100:
                episode_train_reward_list = episode_train_reward_list[:-1]
            avg_rewards = np.mean(episode_train_reward_list)

            episode_avg_loss = 0
            if episode_losses:
                episode_avg_loss = np.mean(episode_losses)

            log = (
                'Episode %d ended with score: %.2f (%s elapsed) (step: %d). Avg score: %.2f Avg loss: %.5f'
                % (environment.get_game_number(), environment.get_game_score(),
                   str(episode_time), environment.get_step_number(),
                   avg_rewards, episode_avg_loss))
            print(log)
            print("   epsilon " + str(train_epsilon))
            if args.logging:
                with summary_writer.as_default():
                    tf.summary.scalar('train episode reward',
                                      environment.get_game_score(),
                                      step=train_episodes)
                    tf.summary.scalar('train avg reward(100)',
                                      avg_rewards,
                                      step=train_episodes)
                    tf.summary.scalar('average loss',
                                      episode_avg_loss,
                                      step=train_episodes)
                    tf.summary.scalar('epsilon',
                                      train_epsilon,
                                      step=train_episodes)
                    tf.summary.scalar('steps',
                                      environment.get_step_number(),
                                      step=train_episodes)
        else:
            eval_episodes += 1
            episode_eval_reward_list.insert(0, environment.get_game_score())
            if len(episode_eval_reward_list) > 100:
                episode_eval_reward_list = episode_eval_reward_list[:-1]
            avg_rewards = np.mean(episode_eval_reward_list)

            log = (
                'Eval %d ended with score: %.2f (%s elapsed) (step: %d). Avg score: %.2f'
                % (environment.get_game_number(), environment.get_game_score(),
                   str(episode_time), environment.get_step_number(),
                   avg_rewards))
            print(log)
            if args.logging:
                with summary_writer.as_default():
                    tf.summary.scalar('eval episode reward',
                                      environment.get_game_score(),
                                      step=eval_episodes)
                    tf.summary.scalar('eval avg reward(100)',
                                      avg_rewards,
                                      step=eval_episodes)

        epoch_total_score += environment.get_game_score()
        environment.reset_game()

        while pause and not stop:
            time.sleep(1)

    if environment.get_game_number() - start_game_number == 0:
        return 0
    return epoch_total_score / (environment.get_game_number() -
                                start_game_number)
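replay_memory (with add_sample/draw_batch, or addSample/drawBatch in the camelCase examples) is constructed outside these functions. A minimal sketch of such a buffer, assuming plain uniform sampling (the class and method names mirror the snake_case variant above but are otherwise illustrative):

import random
from collections import deque, namedtuple

Sample = namedtuple('Sample', ['old_state', 'action', 'reward', 'new_state', 'is_terminal'])

class ReplayMemory:
    """Fixed-capacity FIFO buffer with uniform random sampling."""

    def __init__(self, capacity=100_000):
        self._buffer = deque(maxlen=capacity)

    def add_sample(self, sample):
        self._buffer.append(sample)

    def draw_batch(self, batch_size):
        # Uniform sampling without replacement; callers only draw batches after
        # the observation phase, so the buffer is large enough by then.
        return random.sample(list(self._buffer), batch_size)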
Example #10
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    global train_epsilon
    stepStart = environment.getStepNumber()
    isTraining = True if evalWithEpsilon is None else False
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps and not stop:
        stateReward = 0
        state = None

        while not environment.isGameOver() and not stop:
            # Choose next action
            if evalWithEpsilon is None:
                epsilon = train_epsilon
            else:
                epsilon = evalWithEpsilon

            if train_epsilon > args.epsilon_min:
                train_epsilon = train_epsilon * args.epsilon_decay
                if train_epsilon < args.epsilon_min:
                    train_epsilon = args.epsilon_min

            if state is None or random.random() < (epsilon):
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(
                    state.getScreens(),
                    (1, State.IMAGE_SIZE, State.IMAGE_SIZE, args.frame))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)

            # Record experience in replay memory and train
            if isTraining and oldState is not None:
                clippedReward = min(1, max(-1, reward))
                replayMemory.addSample(
                    replay.Sample(oldState, action, clippedReward, state,
                                  isTerminal))

                if environment.getStepNumber() > args.observation_steps and environment.getEpisodeStepNumber() % args.frame == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())

            if isTerminal:
                state = None

        episodeTime = datetime.datetime.now() - startTime
        print(
            '%s %d ended with score: %d (%s elapsed)' %
            ('Episode' if isTraining else 'Eval', environment.getGameNumber(),
             environment.getGameScore(), str(episodeTime)))
        if isTraining:
            print("epsilon " + str(train_epsilon))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()

    # return the average score
    if environment.getGameNumber() - startGameNumber == 0:
        return 0
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)
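Unlike the linear schedule in Examples #2 and #3, this variant decays ε geometrically by a factor of epsilon_decay per step down to a floor of epsilon_min. A standalone sketch of that update (the default values are illustrative, not taken from args):

def decay_epsilon(epsilon, epsilon_decay=0.999995, epsilon_min=0.1):
    """Multiplicative epsilon decay with a lower bound, as in the loop above."""
    return max(epsilon_min, epsilon * epsilon_decay)

epsilon = 1.0
for _ in range(1_000_000):
    epsilon = decay_epsilon(epsilon)  # with these defaults, epsilon hits the 0.1 floor after roughly 460k steps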
Example #11
def train():
    rnd_seed = 0
    np.random.seed(rnd_seed)
    tf.set_random_seed(rnd_seed)
    sim = GymSim('CartPole-v0', 5000, seed=rnd_seed)
    sim.act_sample_batch(
        5000, FLAGS.sample_neg_ratio)  # bootstrap with random actions
    sim.print_stats()
    #embed()
    #sys.exit()
    q_network = MLN(sim.INPUT_DIM, sim.ACTION_DIM)
    target_network = MLN(sim.INPUT_DIM, sim.ACTION_DIM, name_scope='target')

    with tf.Graph().as_default():

        global_step = tf.Variable(0, trainable=False)

        action_pl = tf.placeholder(tf.int64, name='action_pl')
        reward_pl = tf.placeholder(tf.float32, name='reward_pl')
        state_pl = tf.placeholder(tf.float32, (None, sim.INPUT_DIM),
                                  name='state_pl')
        observ_pl = tf.placeholder(tf.float32, (None, sim.INPUT_DIM),
                                   name='observ_pl')

        action_q = q_network.inference(state_pl)
        target_q = tf.stop_gradient(target_network.inference(observ_pl))
        target_q_pt = tf.Print(target_q, [target_q])
        action_q_pt = tf.Print(action_q, [action_q])

        loss = dqn.td_loss(action_pl, sim.ACTION_DIM, action_q, reward_pl,
                           target_q)

        train_op = dqn.train(FLAGS.learning_rate, loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_all_summaries()

        action_op = tf.argmax(action_q, 1, name='action_op')

        copy_var = q_network.copy_to('target')

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        #initialize variables
        sess.run(init)

        summary_writer = tf.train.SummaryWriter(
            os.path.join(FLAGS.train_dir, 'logs'), sess.graph)

    for step in xrange(FLAGS.max_steps):
        start_time = time.time()

        if step % 4 == 0:
            sess.run(copy_var)

        feed = sim.feed_batch(state_pl, action_pl, reward_pl, observ_pl,
                              FLAGS.batch_size)

        _, loss_value = sess.run([train_op, loss], feed_dict=feed)

        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
            num_examples_per_step = FLAGS.batch_size
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = float(duration)

            format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                          'sec/batch)')
            print(format_str % (datetime.now(), step, loss_value,
                                examples_per_sec, sec_per_batch))

        if step > FLAGS.sample_after:
            pred_act = sess.run(action_op, feed_dict={state_pl: sim.state})
            pred_act = pred_act[0]
            sim.act_sample_once(pred_act,
                                neg_ratio=FLAGS.sample_neg_ratio,
                                append_db=True)

        # visualization
        if step % 1000 == 0 and step != 0:
            sim.reset()
            survive = 0
            for _ in range(200):
                pred_act = sess.run(action_op, feed_dict={state_pl: sim.state})
                pred_act = pred_act[0]
                done = sim.act_demo(pred_act)
                if not done:
                    survive += 1
                else:
                    print('Survived for %i frame' % survive)
                    survive = 0

        #if step % 100 == 0:
        #    summary_str = sess.run(summary_op)
        #    summary_writer.add_summary(summary_str, step)

        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=step)
Example #12
    state = game.getLogBoard()

    while not game.Lost():

        action = agent.selectAction(state, counter)
        next_state, reward, done, max_tile = game.step(action)
        buffer.store([state, action, next_state, reward, done])
        state = next_state
        # sleep(0.1)
        # os.system('clear')
        # print(next_state)
        counter += 1

        # NN training
        if counter > 50:
            train(buffer, agent)
        iteration_max_tile = max(iteration_max_tile, max_tile)

        if max_tile > super_max_tile:
            super_max_tile = max_tile
    # update target NN
    iteration_sum_maxtile += iteration_max_tile
    if episodes % agent.target_update_freq == 0:
        agent.target_nn.load_state_dict(agent.nn.state_dict())
        print(
            "Episode {}.\t iterations avg max tile {}.\t Max tile so far {}\t Epislon {}"
            .format(episodes, iteration_sum_maxtile / agent.target_update_freq,
                    super_max_tile, agent.eps_scheduled(it)))
        iteration_max_tile = -1
        iteration_sum_maxtile = 0
Example #13

def get_model(input_shape, output_shape):
    model = Sequential()

    model.add(Dense(16, activation='tanh', input_shape=input_shape))
    model.add(Dense(64, activation='elu'))
    model.add(Dense(16, activation='sigmoid'))

    model.add(Dense(output_shape, activation="linear"))
    model.compile(loss="MSE", optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


if __name__ == '__main__':
    args = get_args()

    environment = args.environment
    env = gym.make(environment)

    output_shape = env.action_space.n
    input_shape = env.observation_space.shape

    epochs = 500
    memory = fill_memory(environment)
    model = get_model(input_shape, output_shape)

    score = train(environment, model, memory, epochs)
    print(score)
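fill_memory is defined elsewhere; it pre-populates the replay memory before training starts. A hypothetical sketch, assuming the classic Gym step API (4-tuple returns) and a random behaviour policy (the function name comes from the example, everything else is an assumption):

import gym

def fill_memory(environment, size=10_000):
    """Collect transitions with a random policy to seed the replay memory."""
    env = gym.make(environment)
    memory = []
    state = env.reset()
    while len(memory) < size:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        memory.append((state, action, reward, next_state, done))
        state = env.reset() if done else next_state
    return memory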
Example #14
def train():
    rnd_seed = 0
    np.random.seed(rnd_seed)
    tf.set_random_seed(rnd_seed)
    sim = GymSim('CartPole-v0', 5000, seed=rnd_seed)
    sim.act_sample_batch(5000, FLAGS.sample_neg_ratio) # bootstrap with random actions
    sim.print_stats()
    #embed()
    #sys.exit()
    q_network = MLN(sim.INPUT_DIM, sim.ACTION_DIM)
    target_network = MLN(sim.INPUT_DIM, sim.ACTION_DIM, name_scope='target')

    with tf.Graph().as_default():

        global_step = tf.Variable(0, trainable=False)

        action_pl = tf.placeholder(tf.int64, name='action_pl')
        reward_pl = tf.placeholder(tf.float32, name='reward_pl')
        state_pl  = tf.placeholder(tf.float32, (None, sim.INPUT_DIM), name='state_pl')
        observ_pl = tf.placeholder(tf.float32, (None, sim.INPUT_DIM), name='observ_pl')

        action_q = q_network.inference(state_pl)
        target_q = tf.stop_gradient(target_network.inference(observ_pl))
        target_q_pt = tf.Print(target_q, [target_q])
        action_q_pt = tf.Print(action_q, [action_q])

        loss = dqn.td_loss(action_pl, sim.ACTION_DIM, action_q, reward_pl, target_q)

        train_op = dqn.train(FLAGS.learning_rate, loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        summary_op = tf.merge_all_summaries()

        action_op = tf.argmax(action_q, 1, name='action_op')

        copy_var = q_network.copy_to('target')

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))

        #initialize variables
        sess.run(init)

        summary_writer = tf.train.SummaryWriter(os.path.join(FLAGS.train_dir, 'logs')
                                                , sess.graph)

    for step in xrange(FLAGS.max_steps):
        start_time = time.time()

        if step % 4 == 0:
            sess.run(copy_var)

        feed = sim.feed_batch(state_pl,
                              action_pl,
                              reward_pl,
                              observ_pl,
                              FLAGS.batch_size)

        _, loss_value = sess.run([train_op, loss],
                                 feed_dict = feed)

        duration = time.time() - start_time

        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        if step % 10 == 0:
            num_examples_per_step = FLAGS.batch_size
            examples_per_sec = num_examples_per_step / duration
            sec_per_batch = float(duration)

            format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
            print (format_str % (datetime.now(), step, loss_value,
                                examples_per_sec, sec_per_batch))

        if step > FLAGS.sample_after:
            pred_act = sess.run(action_op,
                                feed_dict={state_pl: sim.state})
            pred_act = pred_act[0]
            sim.act_sample_once(pred_act, neg_ratio=FLAGS.sample_neg_ratio,
                                append_db=True)


        # visualization
        if step % 1000 == 0 and step != 0:
            sim.reset()
            survive = 0
            for _ in range(200):
                pred_act = sess.run(action_op,
                                    feed_dict={state_pl: sim.state})
                pred_act = pred_act[0]
                done = sim.act_demo(pred_act)
                if not done:
                    survive += 1
                else:
                    print('Survived for %i frame' % survive)
                    survive = 0

        #if step % 100 == 0:
        #    summary_str = sess.run(summary_op)
        #    summary_writer.add_summary(summary_str, step)

        # Save the model checkpoint periodically.
        if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=step)
Example #15
import dqn
import argparse

parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('--env_name', type=str, help='an integer for the accumulator')
args = parser.parse_args()

env_name = args.env_name
if env_name is None:
    env_name = "Breakout-v0"

dqn = dqn.DQN(env_name)
dqn.train()
Example #16
                        loss=nn.L2Loss,
                        renderFreq=envDisplayFreq)

        settingsInfo = settings.copy()
        settingsInfo['env'] = envName
        settingsInfo['loss'] = settingsInfo['loss'].__name__
        stats.append(settingsInfo)
        stats.append({'solve criteria': solveCriteria})
        stats.append([])

        print('++ TRAINING: {} | solve criteria: {} ++'.format(
            envName, solveCriteria))
        print('Training Settings:')
        pp.pprint(settingsInfo)

        for e, t, eps, l, r, Q in dqn.train(**settings):
            rewards.append(r)
            avg = np.mean(rewards)
            stats[-1].append((t, eps, l, r, avg))
            alert = None

            if avg > avgRewards:
                avgRewards = avg
                alert = '*R*'

            if alert or e % envPrintFreq == 0:
                print(
                    '[TRAINING ({:.2%})] e:{} | t:{} | eps:{:,.3f} | l:{:,.3f} | r:{:,.3f} | avg:{:,.3f} | {}'
                    .format((t + 1) / steps, e, t, eps, l, r, avg, alert))

        nn.save(Q.topology, modelPath)