def main():

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w', action='store_true', dest='warm_start',
                        default=False)

    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 6000   # limit episode to 6000 game steps (not enforced in this chunked version)
    gamma = 0.95
    lr = 1e-4   # LSTM Update: worked well in the 1st iteration
    target_score = 21.0  # Temperature Update: specific to Pong (this version anneals by episode instead)

    # Truncated Backprop (TBP) Update:
    # Slides 41-44, CS231N_2017 Lecture 10
    # Run forward and backward through fixed-size chunks of the sequence instead of the
    # whole sequence, while the hidden values hx and cx are carried forward in time
    # across chunks (see the finish_chunk sketch after this example).
    chunk_size = 768

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                  weight_decay=0.1)  # LSTM Change: lr = 1e-4

        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:

        data_file = 'results/acl-batch_{}_cs_{}.p'.format(game, chunk_size)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/acl-batch_{}_cs_{}_ep_{}.p'.format(
                                                                game, chunk_size, 
                                                                prior_eps)
            with open(model_file, 'rb') as f:
                # Model Save and Load Update: Include both model and optim parameters
                saved_model = pickle.load(f)
                model, optimizer = saved_model

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                      weight_decay=0.1)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()


    for ep in range(max_episodes):   # Truncated Backprop(TBP) Update: For every episode

        # Anneal temperature linearly from 1.8 (reaching 1.0 at 100000 episodes), with a floor of 0.8
        model.temperature = max(0.8, 1.8 - 0.8 * ((ep+prior_eps) / 1.0e5))

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state]*num_frames)

        done = False   # TBP Update: init done

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        grad_norm = 0.0  # Track grad norm for the episode

        while not done:  # TBP Update: if episode is not done

            # TBP Update: Forward a fixed number of game steps thru CNN-LSTM
            for frame in range(chunk_size):

                # env.render()    # For initial debugging
            
                # Select action
                # LSTM Change: Need to cycle hx and cx thru select_action
                action, log_prob, state_value, (hx, cx) = select_action(
                    model, state, (hx, cx), cuda)
                model.saved_actions.append((log_prob, state_value))

                # Perform step
                next_state, reward, done, info = env.step(action)

                # Add reward to reward buffer
                model.rewards.append(reward)
                reward_sum += reward

                # Compute latest state
                next_state = preprocess_state(next_state)

                # Evict the oldest frame and insert the newest preprocessed frame at index 0
                next_state = np.stack([next_state]*num_frames)
                next_state[1:, :, :] = state[:-1, :, :]
                state = next_state

                if done:
                    break


            # TBP Update: Backprop the fixed number of game steps back thru CNN-LSTM, and perform
            # an update on the parameters of the Actor-Critic.
            if frame > chunk_size/4:   
                grad_norm = finish_chunk(model, optimizer, gamma, cuda)

                # print (grad_norm, frame)   # for debugging nan problem

                # TBP Update: hidden values are carried forward to the next chunk, but
                # detached from the graph so gradients do not flow across chunk boundaries
                cx = Variable(cx.data)
                hx = Variable(hx.data)

        # TBP Update: At this point, the episode is done. We need to do some bookkeeping
            
        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01

        running_rewards.append(running_reward)

        verbose_str = 'Episode {} complete'.format(ep+prior_eps+1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        # Temperature Update: Track temp
        if (ep+prior_eps+1) % 5 == 0: 
            verbose_str += '\tTemp = {:.4}'.format(model.temperature) 
            verbose_str += '\tGrad norm:{}'.format(grad_norm)   
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()


        # Periodically save model and optimizer parameters, and statistics
        if (ep+prior_eps+1) % 100 == 0: 
            model_file = 'saved_models/acl-batch_{}_cs_{}_ep_{}.p'.format(
                                                                game, chunk_size, 
                                                                ep+prior_eps+1)
            data_file = 'results/acl-batch_{}_cs_{}.p'.format(game, chunk_size)
            with open(model_file, 'wb') as f:
                # Model Save and Load Update: Include both model and optim parameters 
                pickle.dump((model.cpu(), optimizer), f)

            if cuda:
                model = model.cuda()

            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
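
finish_chunk is not included in this listing, so the following is only a minimal sketch of what the chunk update might look like, assuming it computes discounted returns over the buffered rewards, forms the usual actor-critic loss, clips gradients, and returns the gradient norm that the training loop prints. The normalization constant, clipping threshold, and use of the current (rather than Variable-era) PyTorch API are assumptions.

# A minimal sketch of what the chunk update inside finish_chunk might look like (assumed, not the original).
import torch
import torch.nn.functional as F


def finish_chunk_sketch(model, optimizer, gamma, cuda, max_grad_norm=40.0):
    # Discounted returns, computed backwards over the rewards buffered during the chunk
    R = 0.0
    returns = []
    for r in reversed(model.rewards):
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns, dtype=torch.float32)
    if cuda:
        returns = returns.cuda()
    # Normalizing the returns keeps the advantage scale stable across chunks (assumed choice)
    returns = (returns - returns.mean()) / (returns.std() + 1e-6)

    policy_losses, value_losses = [], []
    for (log_prob, value), ret in zip(model.saved_actions, returns):
        advantage = ret - value.item()
        policy_losses.append(-log_prob * advantage)                  # actor term
        value_losses.append(F.smooth_l1_loss(value.squeeze(), ret))  # critic term

    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()

    # Clip gradients and report the pre-clip norm, which the episode loop above prints
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()

    # Clear the chunk buffers so the next chunk starts fresh
    del model.rewards[:]
    del model.saved_actions[:]
    return grad_norm
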
Example No. 2
def main():

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w',
                        action='store_true',
                        dest='warm_start',
                        default=False)

    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 10000
    gamma = 0.95
    lr = 1e-4  # LSTM Update: worked well in the 1st iteration
    target_score = 21.0  # Temperature Update: specific to Pong

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)
        optimizer = optim.RMSprop(model.parameters(), lr=lr,
                                  weight_decay=0.1)  # LSTM Change: lr = 1e-4

        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:

        data_file = 'results/{}.p'.format(game)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, prior_eps)
            with open(model_file, 'rb') as f:
                # Model Save and Load Update: Include both model and optim parameters
                saved_model = pickle.load(f)
                model, optimizer = saved_model

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames, num_actions=num_actions)
            optimizer = optim.RMSprop(model.parameters(),
                                      lr=lr,
                                      weight_decay=0.1)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()

    for ep in range(max_episodes):

        # Temperature Update: specific to Pong
        # Anneal temperature from 2.0 down to a floor of 0.8, based on how far the running
        # reward is from the target score (the select_action sketch after this example
        # shows how the temperature is applied)
        if running_reward is None:
            model.temperature = 2.0  # Start with temp = 2.0 (Explore)
        else:
            # Specific to Pong - the running reward starts at -21, so the agent starts
            # out exploring: temp = 0.8 + 1.2*[21-(-21)]/42 = 2.0
            # As it gets closer to 0, temp = 0.8 + 1.2*(21-0)/42 = 1.4
            # As it gets to 14, temp = 0.8 + 1.2*(21-14)/42 = 1.0
            model.temperature = max(
                0.8, 0.8 + (target_score - running_reward) / 42 * 1.2)

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        for frame in range(max_frames):

            # Select action
            # LSTM Change: Need to cycle hx and cx thru select_action
            action, log_prob, state_value, (hx, cx) = select_action(
                model, state, (hx, cx), cuda)
            model.saved_actions.append((log_prob, state_value))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Add reward to reward buffer
            model.rewards.append(reward)
            reward_sum += reward

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict the oldest frame and insert the newest preprocessed frame at index 0
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state

            if done:
                break

        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01

        running_rewards.append(running_reward)

        verbose_str = 'Episode {} complete'.format(ep + prior_eps + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        # Temperature Update: Track temp
        if (ep + prior_eps + 1) % 5 == 0:
            verbose_str += '\tTemp = {:.4}'.format(model.temperature)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Update model
        finish_episode(model, optimizer, gamma, cuda)

        if (ep + prior_eps + 1) % 500 == 0:
            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, ep + prior_eps + 1)
            data_file = 'results/{}.p'.format(game)
            with open(model_file, 'wb') as f:
                # Model Save and Load Update: Include both model and optim parameters
                pickle.dump((model.cpu(), optimizer), f)

            if cuda:
                model = model.cuda()

            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
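
select_action is also not shown in this listing. The sketch below illustrates how it might cycle the LSTM hidden state and apply the temperature to the policy logits; the forward-pass interface of Policy (logits, state value, updated (hx, cx)) is an assumption.

# A minimal sketch of how select_action might cycle the LSTM state and apply the temperature (assumed interface).
import torch
from torch.distributions import Categorical


def select_action_sketch(model, state, hidden, cuda):
    hx, cx = hidden
    state_t = torch.from_numpy(state).float().unsqueeze(0)  # shape (1, num_frames, H, W)
    if cuda:
        state_t = state_t.cuda()

    # Assumed forward signature: action logits, state value, and the updated LSTM state
    logits, state_value, (hx, cx) = model(state_t, (hx, cx))

    # Temperature-scaled softmax: a higher temperature flattens the distribution (more
    # exploration), a lower temperature sharpens it (more exploitation)
    probs = torch.softmax(logits / model.temperature, dim=-1)
    dist = Categorical(probs)
    action = dist.sample()

    return action.item(), dist.log_prob(action), state_value, (hx, cx)
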
Example No. 3
def main():

    # Parse arguments
    parser = argparse.ArgumentParser()

    parser.add_argument('-g', action='store', dest='game')
    parser.add_argument('-w',
                        action='store_true',
                        dest='warm_start',
                        default=False)

    args = parser.parse_args()
    game = args.game
    warm_start = args.warm_start

    # Initialize environment
    env = gym.make(game)
    num_actions = env.action_space.n

    # Initialize constants
    num_frames = 4
    max_episodes = 1000000
    max_frames = 10000
    gamma = 0.95

    # Cold start
    if not warm_start:
        # Initialize model
        model = Policy(input_channels=num_frames, num_actions=num_actions)

        # Initialize statistics
        running_reward = None
        running_rewards = []
        prior_eps = 0

    # Warm start
    if warm_start:

        data_file = 'results/{}.p'.format(game)

        try:
            with open(data_file, 'rb') as f:
                running_rewards = pickle.load(f)
                running_reward = running_rewards[-1]

            prior_eps = len(running_rewards)

            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, prior_eps)
            with open(model_file, 'rb') as f:
                model = pickle.load(f)

        except OSError:
            print('Saved file not found. Creating new cold start model.')
            model = Policy(input_channels=num_frames, num_actions=num_actions)
            running_reward = None
            running_rewards = []
            prior_eps = 0

    cuda = torch.cuda.is_available()

    if cuda:
        model = model.cuda()

    optimizer = optim.RMSprop(model.parameters(), lr=1e-4,
                              weight_decay=0.1)  # LSTM Change: lr = 1e-4

    for ep in range(max_episodes):
        # Anneal temperature from 2.0 down to 0.5 over 10000 episodes
        model.temperature = max(0.5, 2.0 - 1.5 * ((ep + prior_eps) / 1.0e4))

        state = env.reset()
        state = preprocess_state(state)
        state = np.stack([state] * num_frames)

        # LSTM change - reset LSTM hidden units when episode begins
        cx = Variable(torch.zeros(1, 256))
        hx = Variable(torch.zeros(1, 256))
        if cuda:
            cx = cx.cuda()
            hx = hx.cuda()

        reward_sum = 0.0
        for frame in range(max_frames):

            # Select action
            # LSTM Change: Need to cycle hx and cx thru select_action
            action, log_prob, state_value, (hx, cx) = select_action(
                model, state, (hx, cx), cuda)
            model.saved_actions.append((log_prob, state_value))

            # Perform step
            next_state, reward, done, info = env.step(action)

            # Add reward to reward buffer
            model.rewards.append(reward)
            reward_sum += reward

            # Compute latest state
            next_state = preprocess_state(next_state)

            # Evict the oldest frame and insert the newest preprocessed frame at index 0
            next_state = np.stack([next_state] * num_frames)
            next_state[1:, :, :] = state[:-1, :, :]
            state = next_state

            if done:
                break

        # Compute/display statistics
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01

        running_rewards.append(running_reward)

        verbose_str = 'Episode {} complete'.format(ep + prior_eps + 1)
        verbose_str += '\tReward total:{}'.format(reward_sum)
        verbose_str += '\tRunning mean: {:.4}'.format(running_reward)
        sys.stdout.write('\r' + verbose_str)
        sys.stdout.flush()

        # Update model
        finish_episode(model, optimizer, gamma, cuda)

        if (ep + prior_eps + 1) % 500 == 0:
            model_file = 'saved_models/actor_critic_{}_ep_{}.p'.format(
                game, ep + prior_eps + 1)
            data_file = 'results/{}.p'.format(game)
            with open(model_file, 'wb') as f:
                pickle.dump(model.cpu(), f)

            if cuda:
                model = model.cuda()

            with open(data_file, 'wb') as f:
                pickle.dump(running_rewards, f)
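
preprocess_state is likewise not part of this listing. Below is a minimal sketch of a Pong-style preprocessing step, assuming the standard 210x160x3 Atari frame; the crop bounds and background color values are assumptions borrowed from the common Pong preprocessing recipe and may not match the original code.

# A minimal sketch of a Pong-style preprocess_state (assumed crop bounds and thresholds).
import numpy as np


def preprocess_state_sketch(frame):
    frame = frame[35:195]                           # crop out the score bar and bottom border
    frame = frame[::2, ::2, 0].astype(np.float32)   # downsample by 2 and keep a single channel
    frame[frame == 144] = 0                         # erase one background color
    frame[frame == 109] = 0                         # erase the other background color
    frame[frame != 0] = 1                           # paddles and ball become 1
    return frame                                    # 80x80 float array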