Example #1
 def __push_state(self, state):
     # Push the current state to the history.
     # Oldest state in history is discarded.
     sg = state.astype(np.float32)
     sg = np.expand_dims(sg, 0)
     sg = utils.preprocess_state(sg)
     self.state_hist[0, :, :, 1:] = self.state_hist[0, :, :, :-1]
     self.state_hist[0, :, :, 0] = sg[0]
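The `utils.preprocess_state` helper called in this and the next example is not shown on this page. As a rough sketch only (the project's actual implementation may differ), a minimal variant for RGB game frames converts them to grayscale and rescales to [0, 1]:

import numpy as np

def preprocess_state(states):
    # Hypothetical stand-in for utils.preprocess_state: turn a batch of RGB
    # frames of shape (N, H, W, 3) into grayscale frames (N, H, W) in [0, 1].
    states = np.asarray(states, dtype=np.float32)
    weights = np.array([0.2125, 0.7154, 0.0721], dtype=np.float32)
    return (states @ weights) / 255.0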
Example #2
def preprocess_data(X, y, hist_len, shuffle):
    """ Preprocess states and actions from expert dataset before feeding them to the agent """
    print('Preprocessing states. Shape:', X.shape)
    utils.check_invalid_actions(y)
    y_pp = utils.transl_action_env2agent(y)
    X_pp = utils.preprocess_state(X)
    X_pp, y_pp = utils.stack_history(X_pp, y_pp, hist_len, shuffle=shuffle)
    return X_pp, y_pp
Example #3
def agent(obs_dict, config_dict):
    global prev_direction

    env = make('hungry_geese')
    # agent = QAgent(rows=11, columns=11, num_actions=3)
    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    model_name = ''
    agent.load_model_weights('models/' + model_name + '.h5')

    state = preprocess_state(obs_dict, prev_direction)
    action = agent.select_action(state)
    direction = get_direction(prev_direction, action)
    prev_direction = direction
    return env.specification.action.enum[direction]
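For a quick local sanity check, an agent function with this signature can be passed directly to `kaggle_environments`; the opponent and render mode below are illustrative choices, not part of the original submission:

from kaggle_environments import make

# Hypothetical local rollout of the agent defined above against a built-in bot.
env = make('hungry_geese', debug=True)
env.run([agent, 'greedy'])
print(env.render(mode='ansi'))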
Example #4
 def select_action(self, state):
     action_prob = np.zeros(self.n_action, np.float32)
     action_prob.fill(self.eps / self.n_action)
     max_q, max_q_index = self.qNetwork(Variable(state.to(
         self.args.device))).data.cpu().max(1)
     action_prob[max_q_index[0]] += 1 - self.eps
     action = np.random.choice(self.arr_actions, p=action_prob)
     next_state, reward, done, _ = self.env.step(action)
     next_state = torch.cat(
         [state.narrow(1, 1, 3),
          preprocess_state(next_state, self.env)], 1)
     self.memory.push(
         (state, torch.LongTensor([int(action)]), torch.Tensor([reward]),
          next_state, torch.Tensor([done])))
     return next_state, reward, done, max_q[0]
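The `narrow`/`cat` pair above slides the four-frame window: channel 0 (the oldest frame) is dropped and the freshly preprocessed frame is appended last. A minimal shape check, assuming 84x84 frames and the (batch, frames, height, width) layout used here:

import torch

state = torch.zeros(1, 4, 84, 84)       # four stacked frames, oldest at channel 0
new_frame = torch.ones(1, 1, 84, 84)    # stand-in for the freshly preprocessed frame
next_state = torch.cat([state.narrow(1, 1, 3), new_frame], 1)
print(next_state.shape)                 # torch.Size([1, 4, 84, 84])
print(next_state[0, -1].mean().item())  # 1.0 -> the newest frame now sits in the last channel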
Example #5
def ppo_train(model_name, load_model=False, actor_filename=None, critic_filename=None, optimizer_filename=None):
    print("PPO -- Training")

    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    memory = Memory()

    if load_model:
        agent.load_model_weights(actor_filename, critic_filename)
        agent.load_optimizer_weights(optimizer_filename)

    episode = 0
    start_episode = 0
    end_episode = 50000
    reward_threshold = None
    threshold_reached = False
    epochs = 4
    batch_size = 128
    current_frame = 0

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            current_frame += 1
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_action(state, training=True)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            memory.add(state, action, reward, next_state, float(done))

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

            if current_frame % batch_size == 0:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = memory.get_all_samples()
                    agent.fit(states, actions, rewards, next_states, dones)
                memory.clear()
                agent.update_networks()

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if reward_threshold:
            if len(last_1000_ep_reward) == 1000:
                if np.mean(last_1000_ep_reward) >= reward_threshold:
                    print("You solved the task after" + str(episode) + "episodes")
                    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                             'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
                    threshold_reached = True
                    break

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_action(state)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                     'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5',
                             'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    if threshold_reached:
        plt.plot([i for i in range(start_episode + 1000, episode, 1000)], training_rewards)
    else:
        plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title("Reward")
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
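A hypothetical entry point for this training script; the model name is illustrative:

if __name__ == '__main__':
    ppo_train('ppo_baseline', load_model=False)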
Example #6
 def reset(self):
     return torch.cat([preprocess_state(self.env.reset(), self.env)] * 4, 1)
Example #7
def main():

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w',
                        action='store_true',
                        dest='warm_start',
                        default=False)

    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 10000
    gamma = 0.95
    lr = 1e-4  # LSTM Update: works well in 1st iteration
    target_score = 21.0  # Temperature Update: specific to Pong

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                  weight_decay=0.1)  #LSTM Change: lr = 1e-4

        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:

        data_file = 'results/{}.p'.format(game)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, prior_eps)
            with open(model_file, 'rb') as f:
                # Model Save and Load Update: Include both model and optim parameters
                saved_model = pickle.load(f)
                model, optimizer = saved_model

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=lr,
                                      weight_decay=0.1)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()

    for ep in range(max_episodes):

        # Temperature Update: specific to Pong
        # Anneal temperature from 2.0 down to 0.8 based on how far running reward is from
        # target score
        if running_reward is None:
            model.temperature = 2.0  # Start with temp = 2.0 (Explore)
        else:
            # Specific to Pong - running reward starts at -21, so we encourage the agent
            # to explore. temp = 0.8 + 1.2*[21-(-21)]/42 = 2.0
            # As it gets closer to 0, temp = 0.8 + 1.2(21-0)/42 = 1.4
            # As it gets to 14, temp = 0.8 + 1.2(21-14)/42 = 1.0
            model.temperature = max(
                0.8, 0.8 + (target_score - running_reward) / 42 * 1.2)

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        for frame in range(max_frames):

            # Select action
            # LSTM Change: Need to cycle hx and cx thru select_action
            action, log_prob, state_value, (hx, cx) = select_action(
                model, state, (hx, cx), cuda)
            model.saved_actions.append((log_prob, state_value))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Add reward to reward buffer
            model.rewards.append(reward)
            reward_sum += reward

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict oldest diff add new diff to state
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state

            if done:
                break

        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01

        running_rewards.append(running_reward)

        verbose_str = 'Episode {} complete'.format(ep + prior_eps + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        # Temperature Update: Track temp
        if (ep + prior_eps + 1) % 5 == 0:
            verbose_str += '\tTemp = {:.4}'.format(model.temperature)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Update model
        finish_episode(model, optimizer, gamma, cuda)

        if (ep + prior_eps + 1) % 500 == 0:
            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, ep + prior_eps + 1)
            data_file = 'results/{}.p'.format(game)
            with open(model_file, 'wb') as f:
                # Model Save and Load Update: Include both model and optim parameters
                pickle.dump((model.cpu(), optimizer), f)

            if cuda:
                model = model.cuda()

            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
Example #8
    env = Env()
    model = Model()
    tt = Time()

    steps = 0
    break_flag = 0
    for i_episode in range(MAX_EPISODE):
        if break_flag:
            break

        s, position = env.reset(return_s_pos=1)
        s = preprocess_state(s, position, env)

        # actions = [0, 0, 0, 1, 1, 1, -1]
        t1 = Time()
        ep_r = 0
        for i in range(MAX_STEP):
            if (tt.stop_alt('s')):
                print('----- break! -----')
                break_flag = 1
                break

            steps += 1
            # a = actions[i]        # len(actions)
            a = model.choose_action(s)
            s_, r, done, info = env.step(a)
Example #9
def main():

    model_name = 'dqn'

    # Parse arguments
    game, warm_start, render = parse_arguments()

    # Initialize environment/model
    data = initialize(game, model_name, warm_start)
    env, model, optimizer, criterion, memory_buffer, cuda, running_reward, running_rewards = data

    # Initialize constants
    max_episodes = 500000
    batch_size = 10
    gamma = 0.95
    num_frames = 4

    for ep in range(max_episodes):
        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)
        reward_sum = 0.0

        while True:
            # render frame if render argument was passed
            if render:
                env.render()

            # Select action
            action = select_epilson_greedy_action(model, state, ep, cuda)

            # Perform step
            next_state, reward, done, info = env.step(action)
            next_state = preprocess_state(next_state)
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]

            reward_sum += reward

            # Add transition to replay memory
            transition = Transition(state, action, next_state, reward, done)
            memory_buffer.push(transition)

            # Update state
            state = next_state

            # Sample mini-batch from replay memory_buffer
            batch = memory_buffer.sample(batch_size, replace=True)

            # Compute targets
            targets = np.zeros((batch_size, ), dtype=float)
            for i, transition in enumerate(batch):
                targets[i] = transition.reward
                if not transition.done:
                    next_state = transition.next_state
                    num_frames, height, width = next_state.shape
                    next_state = next_state.reshape(-1, num_frames, height,
                                                    width)
                    next_state = torch.FloatTensor(next_state)

                    if cuda:
                        next_state = next_state.cuda()

                    next_state = Variable(next_state)
                    targets[i] += gamma * model(next_state).data.max(1)[0]

            targets = torch.FloatTensor(targets)

            if cuda:
                targets = targets.cuda()

            targets = Variable(targets)

            # Compute predictions
            model.zero_grad()

            states = [transition.state for transition in batch]
            states = torch.FloatTensor(states)

            if cuda:
                states = states.cuda()

            states = Variable(states)

            actions = [int(transition.action) for transition in batch]
            actions = torch.LongTensor(actions)

            if cuda:
                actions = actions.cuda()

            actions = Variable(actions)

            outputs = model(states).gather(1, actions.unsqueeze(1))

            # Perform gradient descent step
            loss = criterion(outputs.view(batch_size), targets)
            loss.backward()
            # Clip gradient at 20,000
            # torch.nn.utils.clip_grad_norm(model.parameters(), 20000)
            optimizer.step()

            if done:
                break

        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01

        running_rewards.append(running_reward)

        verbose_str = 'Episode {} complete'.format(ep + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Save model every 1000 episodes
        if (ep + 1) % 1000 == 0:
            model_file = 'saved_models/{}_{}_ep_{}.p'.format(
                game, model_name, ep + 1)

            with open(model_file, 'wb') as f:
                pickle.dump((model.cpu(), optimizer, memory_buffer), f)

            if cuda:
                model = model.cuda()

            data_file = 'results/{}_{}.p'.format(game, model_name)

            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
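The per-transition loop above builds the standard one-step Q-learning target: y = r for terminal transitions and y = r + gamma * max_a Q(s', a) otherwise. A vectorized sketch of the same computation, assuming batched float tensors for `rewards`, `dones`, and `next_states` (names are illustrative):

import torch

def q_targets(model, next_states, rewards, dones, gamma):
    # One-step targets: r + gamma * max_a Q(s', a), cut off at episode ends.
    with torch.no_grad():
        max_next_q = model(next_states).max(1)[0]
    return rewards + gamma * max_next_q * (1.0 - dones)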
Example #10
def main():

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w', action='store_true', dest='warm_start',
                        default=False)

    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 6000   # limit episode to 6000 game steps
    gamma = 0.95
    lr = 1e-4  # LSTM Update: works well in 1st iteration
    target_score = 21.0  # Temperature Update: specific to Pong

    # Truncated Backprop (TBP) Update:
    # Slides 41-44, CS231N 2017 Lecture 10.
    # Run forward and backward through chunks of the sequence instead of the whole
    # sequence, while the hidden values hx and cx are carried forward in time.
    chunk_size = 768

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=0.1)  #LSTM Change: lr = 1e-4

        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:

        data_file = 'results/acl-batch_{}_cs_{}.p'.format(game, chunk_size)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/acl-batch_{}_cs_{}_ep_{}.p'.format(
                                                                game, chunk_size, 
                                                                prior_eps)
            with open(model_file, 'rb') as f:
                # Model Save and Load Update: Include both model and optim parameters
                saved_model = pickle.load(f)
                model, optimizer = saved_model

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                      weight_decay=0.1)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()


    for ep in range(max_episodes):   # Truncated Backprop(TBP) Update: For every episode

        # Anneal temperature from 1.8 down to 1.0 over 100000 episodes
        model.temperature = max(0.8, 1.8 - 0.8 * ((ep+prior_eps) / 1.0e5))

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state]*num_frames)

        done = False   # TBP Update: init done

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        grad_norm = 0.0  # Track grad norm for the episode

        while not done:  # TBP Update: if episode is not done

            # TBP Update: Forward a fixed number of game steps thru CNN-LSTM
            for frame in range(chunk_size):

                # env.render()    # For initial debugging
            
                # Select action
                # LSTM Change: Need to cycle hx and cx thru select_action
                action, log_prob, state_value, (hx, cx) = select_action(
                    model, state, (hx, cx), cuda)
                model.saved_actions.append((log_prob, state_value))

                # Perform step
                next_state, reward, done, info = env.step(action)

                # Add reward to reward buffer
                model.rewards.append(reward)
                reward_sum += reward

                # Compute latest state
                next_state = preprocess_state(next_state)

                # Evict oldest diff add new diff to state
                next_state = np.stack([next_state]*num_frames)
                next_state[1:, :, :] = state[:-1, :, :]
                state = next_state

                if done:
                    break


            # TBP Update: Backprop the fixed number of game steps back thru CNN-LSTM, and perform
            # an update on the parameters of the Actor-Critic.
            if frame > chunk_size/4:   
                grad_norm = finish_chunk(model, optimizer, gamma, cuda)

                # print (grad_norm, frame)   # for debugging nan problem

                # TBP Update: hidden values are carried forward
                cx = Variable(cx.data)   
                hx = Variable(hx.data)

        # TBP Update: At this point, the episode is done. We need to do some bookkeeping
            
        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01

        running_rewards.append(running_reward)

        verbose_str = 'Episode {} complete'.format(ep+prior_eps+1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        # Temperature Update: Track temp
        if (ep+prior_eps+1) % 5 == 0: 
            verbose_str += '\tTemp = {:.4}'.format(model.temperature) 
            verbose_str += '\tGrad norm:{}'.format(grad_norm)   
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()


        # Periodically save model and optimizer parameters, and statistics
        if (ep+prior_eps+1) % 100 == 0: 
            model_file = 'saved_models/acl-batch_{}_cs_{}_ep_{}.p'.format(
                                                                game, chunk_size, 
                                                                ep+prior_eps+1)
            data_file = 'results/acl-batch_{}_cs_{}.p'.format(game, chunk_size)
            with open(model_file, 'wb') as f:
                # Model Save and Load Update: Include both model and optim parameters 
                pickle.dump((model.cpu(), optimizer), f)

            if cuda:
                model = model.cuda()

            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
Example #11
def ddqn_train(model_name,
               load_model=False,
               model_filename=None,
               optimizer_filename=None):
    print("DDQN -- Training")

    env = make('hungry_geese')
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = DDQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)

    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(
                env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)

            obs_dict = next_obs_dict
            prev_direction = direction

            ep_reward += reward

        if len(buffer) >= batch_size:
            for _ in range(epochs):
                states, actions, rewards, next_states, dones = buffer.get_samples(
                    batch_size)
                agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))

            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ddqn_' + model_name + '_' +
                                     str(episode) + '.h5')
            agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                         str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ddqn_' + model_name + '_' +
                             str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                 str(end_episode) + '_optimizer.npy')

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             training_rewards)
    plt.title('Reward')
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
Example #12
            if (r_p != 0):
                d_p = r_d / r_p
            else:
                d_p = -1
            # ------------- r_d and r_p
            print(
                'r: {:5.3f}, r_d: {:5.3f}, r_p: {:5.3f}, d/p: {:-8.3f}'.format(
                    r, r_d, r_p, d_p))

            ################### ------------------- pre_process_image
            if (not done):  # plot cv_img
                position_ = info[0]
                # print('NOT ------------------------ done', position_)

                # if (arg.preprocess_state):
                s_ = preprocess_state(s_, position_, env, resize=arg.resize)
                # s_ = preprocess_state(s_)

                # add points
                # if(arg.resize):
                #     for img in s_:
                #         add_rects(img=img, point=position_, env=env)
                #         img = cv2.resize(img, arg.wind_conv_wh, interpolation=cv2.INTER_AREA)
                #         # print(len(s_), env.num_frames)
                # else:
                #     img = s_[-1]
                #     add_rects(img=img, point=position_, env=env)

                # plot
                if (arg.show_pre_image and cv_img(s_[-1])):
                    break_flag = 1
Example #13
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-f', action='store', dest='filename', default=None)
    parser.add_argument('-d', action='store', dest='foldername', default=None)

    args = parser.parse_args()
    game = args.game
    model_file = args.filename
    foldername = args.foldername

    # Initialize environment
    render = True
    env = gym.make(game)

    env = gym.wrappers.Monitor(env,
                               foldername,
                               video_callable=lambda episode_id: True,
                               force=True)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1  # Just render 1 episode
    max_frames = 10000

    # Initialize model
    try:
        with open(model_file, 'rb') as f:
            # Model Save and Load Update: Include both model and optim parameters
            # saved_model = torch.load(model_file,map_location=lambda storage, loc:storage)
            saved_model = pickle.load(f)

            if hasattr(saved_model, '__iter__'):
                model, _ = saved_model
            else:
                model = saved_model

    except OSError:
        print('Model file not found.')
        return

    model.temperature = 1.0  # When we play, we sample as usual.

    for ep in range(max_episodes):

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))

        for frame in range(max_frames):

            env.render()

            # Select action
            # LSTM Change: Need to cycle hx and cx thru select_action
            action, log_prob, state_value, (hx, cx) = select_action(
                model, state, (hx, cx))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict oldest diff add new diff to state
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state

            if done:
                break

    env.env.close()
Example #14
def main():

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w',
                        action='store_true',
                        dest='warm_start',
                        default=False)

    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 10000
    gamma = 0.95

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)

        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:

        data_file = 'results/{}.p'.format(game)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, prior_eps)
            with open(model_file, 'rb') as f:
                model = pickle.load(f)

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames, num_actions=num_actions)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()

    optimizer = optim.RMSprop(model.parameters(), lr=1e-4,
                              weight_decay=0.1)  #LSTM Change: lr = 1e-4

    for ep in range(max_episodes):
        # Anneal temperature from 2.0 down to 0.5 over 10000 episodes
        model.temperature = max(0.5, 2.0 - 1.5 * ((ep + prior_eps) / 1.0e4))

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        for frame in range(max_frames):

            # Select action
            # LSTM Change: Need to cycle hx and cx thru select_action
            action, log_prob, state_value, (hx, cx) = select_action(
                model, state, (hx, cx), cuda)
            model.saved_actions.append((log_prob, state_value))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Add reward to reward buffer
            model.rewards.append(reward)
            reward_sum += reward

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict oldest diff add new diff to state
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state

            if done:
                break

        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01

        running_rewards.append(running_reward)

        verbose_str = 'Episode {} complete'.format(ep + prior_eps + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Update model
        finish_episode(model, optimizer, gamma, cuda)

        if (ep + prior_eps + 1) % 500 == 0:
            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, ep + prior_eps + 1)
            data_file = 'results/{}.p'.format(game)
            with open(model_file, 'wb') as f:
                pickle.dump(model.cpu(), f)

            if cuda:
                model = model.cuda()

            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
Example #15
def main():

    # Parse arguments
    game, model_name, warm_start, render = parse_arguments()

    # Initialize environment/model
    data = initialize(game, model_name, warm_start)
    env, model, optimizer, cuda, running_reward, running_rewards = data

    # Initialize constants
    max_episodes = 500000
    max_frames = 10000
    gamma = 0.95
    num_frames = 4

    for ep in range(len(running_rewards), max_episodes):
        # Anneal temperature from 1.8 down to 0.8 over 20,000 episodes
        model.temperature = max(0.8, 1.8 - 1.0 * ((ep) / 2.0e4))

        # Reset LSTM hidden units when episode begins
        if model_name == 'a2c-lstm':
            cx = Variable(torch.zeros(1, 100))
            hx = Variable(torch.zeros(1, 100))
            if cuda:
                cx = cx.cuda()
                hx = hx.cuda()

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        reward_sum = 0.0
        for frame in range(max_frames):
            # render frame if render argument was passed
            if render:
                env.render()
            # Select action
            if model_name == 'a2c-lstm':
                result = select_action_lstm(model, state, (hx, cx), cuda)
                action, log_prob, state_value, (hx, cx) = result

            else:
                result = select_action(model, state, cuda)
                action, log_prob, state_value = result

            model.saved_actions.append((log_prob, state_value))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Add reward to reward buffer
            model.rewards.append(reward)
            reward_sum += reward

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict oldest frame add new frame to state
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state

            if done:
                break

        # Compute/display episode statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01

        running_rewards.append(running_reward)

        verbose_str = 'Episode {} complete'.format(ep + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Update model
        backpropagate(model, optimizer, gamma, cuda)

        # Save model every 1000 episodes
        if (ep + 1) % 1000 == 0:
            model_file = 'saved_models/{}_{}_ep_{}.p'.format(
                game, model_name, ep + 1)

            with open(model_file, 'wb') as f:
                pickle.dump((model.cpu(), optimizer), f)

            if cuda:
                model = model.cuda()

            data_file = 'results/{}_{}.p'.format(game, model_name)

            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)