Example 1
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    stepStart = environment.getStepNumber()
    isTraining = evalWithEpsilon is None
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0
    maxReward = 1  # largest raw reward seen so far; starts at 1 so the normalisation below never divides by zero

    while environment.getStepNumber() - stepStart < minEpochSteps:
    
        startTime = lastLogTime = time.time()
        stateReward = 0
        state = None
        
        while not environment.isGameOver():
      
            # Choose next action
            if evalWithEpsilon is None:
                # Linearly anneal epsilon from 1.0 down to 0.1 over the first million steps
                epsilon = max(.1, 1.0 - 0.9 * environment.getStepNumber() / 1e6)
            else:
                epsilon = evalWithEpsilon

            if state is None or random.random() > (1 - epsilon):
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(state.getScreens(), (1, 84, 84, 4))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)
            
            # Record experience in replay memory and train
            if isTraining and oldState is not None:

                # Track the largest raw reward seen and normalise the clipped reward by it
                maxReward = max(maxReward, reward)

                clippedReward = min(1, max(-1, reward)) / maxReward
                replayMemory.addSample(replay.Sample(oldState, action, clippedReward, state, isTerminal))

                if environment.getStepNumber() > args.observation_steps and environment.getEpisodeStepNumber() % 4 == 0:
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())
        
            if time.time() - lastLogTime > 60:
                print('  ...frame %d' % environment.getEpisodeFrameNumber())
                lastLogTime = time.time()

            if isTerminal:
                state = None

        episodeTime = time.time() - startTime
        print('%s %d ended with score: %d (%d frames in %fs for %d fps)' %
            ('Episode' if isTraining else 'Eval', environment.getGameNumber(), environment.getGameScore(),
            environment.getEpisodeFrameNumber(), episodeTime, environment.getEpisodeFrameNumber() / episodeTime))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()
    
    # return the average score
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)
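
For context, a function like runEpoch is usually driven by an outer loop that alternates training and evaluation epochs. The sketch below is illustrative only: the step budgets, args.train_steps, and dqn.saveModel() are assumptions, not part of the example above.

# Hypothetical driver loop (illustrative only; not part of the example above).
TRAIN_EPOCH_STEPS = 250000   # assumed training budget per epoch
EVAL_EPOCH_STEPS = 12500     # assumed evaluation budget per epoch

bestEvalScore = float('-inf')
while environment.getStepNumber() < args.train_steps:                # args.train_steps is assumed
    runEpoch(TRAIN_EPOCH_STEPS)                                      # train with annealed epsilon
    evalScore = runEpoch(EVAL_EPOCH_STEPS, evalWithEpsilon=.05)      # near-greedy evaluation
    if evalScore > bestEvalScore:
        bestEvalScore = evalScore
        dqn.saveModel()                                              # hypothetical checkpoint hook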
Example 2
def run_epoch(min_epoch_steps, eval_with_epsilon=None):
    global train_epsilon
    global train_episodes
    global eval_episodes
    global episode_train_reward_list
    global episode_eval_reward_list
    is_training = eval_with_epsilon is None
    step_start = environment.get_step_number()
    start_game_number = environment.get_game_number()
    epoch_total_score = 0
    stuck_count = 0
    time_list = []

    while environment.get_step_number() - step_start < min_epoch_steps and not stop:
        start_time = datetime.datetime.now()
        state_reward = 0
        state = None

        episode_losses = []
        save_net = False
        while not environment.is_game_over() and not stop:
            # epsilon selection and update
            if is_training:
                epsilon = train_epsilon
                if train_epsilon > args.epsilon_min:
                    train_epsilon = train_epsilon * args.epsilon_decay
                    if train_epsilon < args.epsilon_min:
                        train_epsilon = args.epsilon_min
            else:
                epsilon = eval_with_epsilon

            # action selection
            if state is None or random.random() < epsilon:
                action = random.randrange(environment.get_num_actions())
            else:
                action = dqn.inference(state.get_data())

            # We cannot skip frames as in an emulated game: the environment evolves in
            # real time, so we have to wait for it. Rather than idling the GPU with a
            # plain sleep, that waiting time is spent on a training sweep below.
            old_state = state
            for i in range(0, args.history_length * (args.repeat_action + 1)):

                if environment.get_step_number() % args.save_model_freq == 0:
                    save_net = True

                # Make the move
                reward, state, is_terminal = environment.step(action)

                # train
                if is_training and old_state is not None:
                    if environment.get_step_number() > args.observation_steps:
                        if args.show_gpu_time:
                            start_time_train = datetime.datetime.now()
                        batch = replay_memory.draw_batch(args.batch_size)
                        loss = dqn.train(batch, environment.get_step_number())
                        episode_losses.append(loss)
                        if args.show_gpu_time:
                            training_time = (datetime.datetime.now() -
                                             start_time_train).total_seconds()
                            time_list.insert(0, training_time)
                            if len(time_list) > 100:
                                time_list = time_list[:-1]
                            print("Training time: %fs, Avg time:%fs" %
                                  (training_time, np.mean(time_list)))
                        if args.slowdown_cycle:
                            time.sleep(args.gpu_time)
                    else:
                        time.sleep(args.gpu_time)
                else:
                    time.sleep(args.gpu_time)

                if is_terminal:
                    break

            # Record experience in replay memory
            if is_training and old_state is not None:
                replay_memory.add_sample(
                    replay.Sample(old_state, action, reward, state,
                                  is_terminal))

            if is_terminal:
                state = None

            if args.simulator:
                if reward == -1:
                    stuck_count = stuck_count + 1
                else:
                    stuck_count = 0
                if stuck_count > 2:
                    print("Car stuck, resetting simulator position...")
                    environment.control.reset_simulator()
                    stuck_count = 0

        if save_net:
            dqn.save_network()

        #################################
        # logging
        #################################

        episode_time = datetime.datetime.now() - start_time

        if is_training:
            train_episodes += 1
            episode_train_reward_list.insert(0, environment.get_game_score())
            if len(episode_train_reward_list) > 100:
                episode_train_reward_list = episode_train_reward_list[:-1]
            avg_rewards = np.mean(episode_train_reward_list)

            episode_avg_loss = 0
            if episode_losses:
                episode_avg_loss = np.mean(episode_losses)

            log = (
                'Episode %d ended with score: %.2f (%s elapsed) (step: %d). Avg score: %.2f Avg loss: %.5f'
                % (environment.get_game_number(), environment.get_game_score(),
                   str(episode_time), environment.get_step_number(),
                   avg_rewards, episode_avg_loss))
            print(log)
            print("   epsilon " + str(train_epsilon))
            if args.logging:
                with summary_writer.as_default():
                    tf.summary.scalar('train episode reward',
                                      environment.get_game_score(),
                                      step=train_episodes)
                    tf.summary.scalar('train avg reward(100)',
                                      avg_rewards,
                                      step=train_episodes)
                    tf.summary.scalar('average loss',
                                      episode_avg_loss,
                                      step=train_episodes)
                    tf.summary.scalar('epsilon',
                                      train_epsilon,
                                      step=train_episodes)
                    tf.summary.scalar('steps',
                                      environment.get_step_number(),
                                      step=train_episodes)
        else:
            eval_episodes += 1
            episode_eval_reward_list.insert(0, environment.get_game_score())
            if len(episode_eval_reward_list) > 100:
                episode_eval_reward_list = episode_eval_reward_list[:-1]
            avg_rewards = np.mean(episode_eval_reward_list)

            log = (
                'Eval %d ended with score: %.2f (%s elapsed) (step: %d). Avg score: %.2f'
                % (environment.get_game_number(), environment.get_game_score(),
                   str(episode_time), environment.get_step_number(),
                   avg_rewards))
            print(log)
            if args.logging:
                with summary_writer.as_default():
                    tf.summary.scalar('eval episode reward',
                                      environment.get_game_score(),
                                      step=eval_episodes)
                    tf.summary.scalar('eval avg reward(100)',
                                      avg_rewards,
                                      step=eval_episodes)

        epoch_total_score += environment.get_game_score()
        environment.reset_game()

        while pause and not stop:
            time.sleep(1)

    if environment.get_game_number() - start_game_number == 0:
        return 0
    return epoch_total_score / (environment.get_game_number() -
                                start_game_number)
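
A quick sanity check on the multiplicative epsilon schedule used above (train_epsilon is scaled by args.epsilon_decay at every action choice and floored at args.epsilon_min): the floor is reached after roughly log(epsilon_min / epsilon_start) / log(epsilon_decay) decisions. A standalone sketch with assumed hyperparameter values:

import math

epsilon_start = 1.0      # assumed initial train_epsilon
epsilon_min = 0.1        # assumed args.epsilon_min
epsilon_decay = 0.9999   # assumed args.epsilon_decay

# Number of action selections before the schedule reaches its floor.
steps_to_floor = math.ceil(math.log(epsilon_min / epsilon_start) / math.log(epsilon_decay))
print(steps_to_floor)    # about 23000 with the values above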
Example 3
def runEpoch(minEpochSteps, evalWithEpsilon=None):
    global train_epsilon
    stepStart = environment.getStepNumber()
    isTraining = evalWithEpsilon is None
    startGameNumber = environment.getGameNumber()
    epochTotalScore = 0

    while environment.getStepNumber() - stepStart < minEpochSteps and not stop:
        startTime = datetime.datetime.now()
        stateReward = 0
        state = None

        while not environment.isGameOver() and not stop:
            # Choose next action
            if evalWithEpsilon is None:
                epsilon = train_epsilon
                # Decay the training epsilon towards its floor (only while training)
                if train_epsilon > args.epsilon_min:
                    train_epsilon = train_epsilon * args.epsilon_decay
                    if train_epsilon < args.epsilon_min:
                        train_epsilon = args.epsilon_min
            else:
                epsilon = evalWithEpsilon

            if state is None or random.random() < epsilon:
                action = random.randrange(environment.getNumActions())
            else:
                screens = np.reshape(
                    state.getScreens(),
                    (1, State.IMAGE_SIZE, State.IMAGE_SIZE, args.frame))
                action = dqn.inference(screens)

            # Make the move
            oldState = state
            reward, state, isTerminal = environment.step(action)

            # Record experience in replay memory and train
            if isTraining and oldState is not None:
                clippedReward = min(1, max(-1, reward))
                replayMemory.addSample(
                    replay.Sample(oldState, action, clippedReward, state,
                                  isTerminal))

                if (environment.getStepNumber() > args.observation_steps
                        and environment.getEpisodeStepNumber() % args.frame == 0):
                    batch = replayMemory.drawBatch(32)
                    dqn.train(batch, environment.getStepNumber())

            if isTerminal:
                state = None

        episodeTime = datetime.datetime.now() - startTime
        print(
            '%s %d ended with score: %d (%s elapsed)' %
            ('Episode' if isTraining else 'Eval', environment.getGameNumber(),
             environment.getGameScore(), str(episodeTime)))
        if isTraining:
            print("epsilon " + str(train_epsilon))
        epochTotalScore += environment.getGameScore()
        environment.resetGame()

    # return the average score
    if environment.getGameNumber() - startGameNumber == 0:
        return 0
    return epochTotalScore / (environment.getGameNumber() - startGameNumber)
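
The clippedReward = min(1, max(-1, reward)) line in this example is the standard DQN reward clipping: keeping rewards in [-1, 1] lets a single learning rate work across environments with very different reward scales. An equivalent vectorised form, purely for illustration:

import numpy as np

rewards = np.array([-30.0, -0.5, 0.0, 2.0, 100.0])   # made-up raw rewards
clipped = np.clip(rewards, -1.0, 1.0)
print(clipped)   # [-1.  -0.5  0.   1.   1. ]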