Example #1
def __init__(self):
    game_name, game_mode, render, total_step_limit, total_run_limit, clip = self._args()
    env_name = game_name + "Deterministic-v4"  # Handles frame skipping (4) at every iteration
    # env = MainGymWrapper.wrap(gym.make(env_name))
    env = Breakout()
    self._main_loop(self._game_model(game_mode, game_name, env.actions),
                    env, render, total_step_limit, total_run_limit, clip)
Example #2
    print('####################################################\n'
          'WARNING: debug flag is set, output will not be saved\n'
          '####################################################')

logger = Logger(debug=args.debug, append=args.environment)
atexit.register(exit_handler)  # Make sure to always save the model when exiting

# Variables
test_scores = []
test_mean_q = []
test_states = []

# Setup
from breakout_env import Breakout
env = Breakout({})
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
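# Build the DQN agent; all hyperparameters come from the parsed command-line arguments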
DQA = DQAgent(env.actions,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)

# Initial logging
Example #3
import itertools

from cv2 import VideoWriter, VideoWriter_fourcc, imshow, waitKey

from breakout_env import Breakout


def simple_agent(env):
    # NOTE: the imports, function signature, and ball lookup are reconstructed here so
    # this truncated snippet is self-contained; the original file may differ slightly.
    ball_pos = env.ball.center()
    paddle_pos = env.paddle.center()

    if paddle_pos[1] < ball_pos[1]:
        return 2
    else:
        return 3


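# 30 fps grayscale video writer for 160x210 frames (declared but not used in the loop below)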
vid = VideoWriter('demo.avi', VideoWriter_fourcc(*"XVID"), float(30),
                  (160, 210), False)

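# Custom Breakout configuration; the commented-out keys are left at their defaults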
env = Breakout({
    'max_step': 1000,
    # 'lifes': 7,
    'ball_speed': [5, -2],
    # 'ball_size': [5, 5],
    # 'ball_color': 200,
    # 'paddle_width': 50,
    'paddle_speed': 5
})

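# Play one demo episode with the hand-coded agent, displaying every observation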
for ep in range(1):
    obs = env.reset()
    for t in itertools.count():
        # action = random.randint(0, env.actions - 1)
        action = simple_agent(env)
        obs, reward, done, _ = env.step(action)
        print('Episode: {}, Step: {}, Reward: {}, Done: {}'.format(
            ep, t, reward, done))
        imshow('obs', obs)
        waitKey(1)
Example #4
def evaluate(DQA, args, logger):
    global max_mean_score

    evaluation_csv = 'evaluation.csv'
    logger.to_csv(evaluation_csv, 'length,score')
    env = Breakout({})
    scores = list()
    frame_counter = 0
    episode = 0

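    # Keep running evaluation episodes until the validation frame budget is used up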
    while frame_counter < args.validation_frames:
        remaining_random_actions = args.initial_random_actions
        obs = utils.preprocess_observation(env.reset())

        frame_counter += 1
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        t = 0
        score = 0

        # Start episode
        while True:
            # Render the game if video output is not suppressed
            if args.video:
                env.render()

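            # Query the agent in test mode, forcing random actions while any of the
            # initial random actions remain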
            action = DQA.get_action(np.asarray([current_state]),
                                    testing=True,
                                    force_random=remaining_random_actions > 0)
            obs, reward, done, info = env.step(action)
            obs = utils.preprocess_observation(obs)
            current_state = utils.get_next_state(current_state, obs)

            if remaining_random_actions > 0:
                remaining_random_actions -= 1

            score += reward
            t += 1
            frame_counter += 1

            # End episode
            if done or t > args.max_episode_length:
                episode += 1
                print('Episode %d end\n---------------\nFrame counter: %d\n' %
                      (episode, frame_counter))
                print('Length: %d, Score: %f\n\n' % (t, score))
                # Save episode data in the evaluation csv
                logger.to_csv(evaluation_csv, [t, score])
                break

        scores.append([t, score])

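    # Pick one of the highest-scoring episodes (ties broken at random) to return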
    scores = np.asarray(scores)
    max_indices = np.argwhere(scores[:, 1] == np.max(scores[:, 1])).ravel()
    max_idx = np.random.choice(max_indices)

    # Save best model (compare the mean score, i.e. the second column of scores)
    if max_mean_score < np.mean(scores[:, 1]):
        max_mean_score = np.mean(scores[:, 1])
        DQA.DQN.save(append='_best')

    return scores[max_idx, :].ravel()
Example #5
    print('####################################################\n'
          'WARNING: debug flag is set, output will not be saved\n'
          '####################################################')

logger = Logger(debug=args.debug, append=args.environment)
atexit.register(exit_handler)  # Make sure to always save the model when exiting

# Variables
test_scores = []
test_mean_q = []
test_states = []

# Setup
from breakout_env import Breakout
env = Breakout({'ball_color': 255 - 143,
                'paddle_color': 255 - 143,
                'bricks_color': [255 - 200, 255 - 180, 255 - 160, 255 - 140, 255 - 120, 255 - 100]})
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
DQA = DQAgent(env.actions,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)
Example #6
def train(shared_model, shared_optimizer, rank, args, info):
    # env = gym.make(args.env)  # make a local (unshared) environment
    env = Breakout({})
    # env = Breakout({'paddle_width': 7})
    # env.seed(args.seed + rank)
    torch.manual_seed(args.seed + rank) # seed everything
    model = NNPolicy(channels=1, memsize=args.hidden, num_actions=args.num_actions) # a local/unshared model
    state = torch.tensor(prepro(env.reset())) # get first state

    start_time = last_disp_time = time.time()
    episode_length, epr, eploss, done  = 0, 0, 0, True # bookkeeping

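    # Each outer iteration collects an args.rnn_steps rollout and then applies its
    # gradients to the shared model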
    while info['frames'][0] <= 8e7 or args.test: # openai baselines uses 40M frames...we'll use 80M
        model.load_state_dict(shared_model.state_dict()) # sync with shared model

        hx = torch.zeros(1, 256) if done else hx.detach()  # rnn activation vector
        values, logps, actions, rewards = [], [], [], [] # save values for computing gradients

        for step in range(args.rnn_steps):
            episode_length += 1
            value, logit, hx = model((state.view(1,1,80,80), hx))
            logp = F.log_softmax(logit, dim=-1)

            # (greedy alternative for test mode: action = logp.max(1)[1].data)
            action = torch.exp(logp).multinomial(num_samples=1).data[0]
            state, reward, done, _ = env.step(action.numpy()[0])
            if args.render:
                #env.render()
                imshow('state', state)
                waitKey(1)
                #vid.write(state)

            state = torch.tensor(prepro(state)) ; epr += reward
            reward = np.clip(reward, -1, 1) # reward
            done = done or episode_length >= 1e4 # don't play one episode for too long
            
            info['frames'].add_(1) ; num_frames = int(info['frames'].item())
            if num_frames % 2e5 == 0: # save every 200k frames
                printlog(args, '\n\t{:.0f}lac frames: saved model\n'.format(num_frames/1e5))
                torch.save(shared_model.state_dict(), args.save_dir+'model.{:.0f}.tar'.format(num_frames/1e5))

            if done: # update shared data
                info['episodes'] += 1
                interp = 1 if info['episodes'][0] == 1 else 1 - args.horizon
                info['run_epr'].mul_(1-interp).add_(interp * epr)
                info['run_loss'].mul_(1-interp).add_(interp * eploss)

            if rank == 0 and time.time() - last_disp_time > 20: # print info ~ every 20 seconds
                elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                printlog(args, 'time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}, run loss {:.2f}'
                    .format(elapsed, info['episodes'].item(), num_frames/1e6,
                    info['run_epr'].item(), info['run_loss'].item()))
                last_disp_time = time.time()

            if done: # reset bookkeeping at the end of an episode
                episode_length, epr, eploss = 0, 0, 0
                state = torch.tensor(prepro(env.reset()))

            values.append(value) ; logps.append(logp) ; actions.append(action) ; rewards.append(reward)

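        # Bootstrap the return with the critic's value of the last state unless the episode ended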
        next_value = torch.zeros(1,1) if done else model((state.unsqueeze(0), hx))[0]
        values.append(next_value.detach())

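        # Actor-critic loss over the collected rollout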
        loss = cost_func(args, torch.cat(values), torch.cat(logps), torch.cat(actions), np.asarray(rewards))
        eploss += loss.item()
        shared_optimizer.zero_grad() ; loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        for param, shared_param in zip(model.parameters(), shared_model.parameters()):
            if shared_param.grad is None: shared_param._grad = param.grad # sync gradients with shared model
        shared_optimizer.step()
Example #7
        for param, shared_param in zip(model.parameters(), shared_model.parameters()):
            if shared_param.grad is None: shared_param._grad = param.grad # sync gradients with shared model
        shared_optimizer.step()

if __name__ == "__main__":
    if sys.version_info[0] > 2:
        mp.set_start_method('spawn') # this must not be in global scope
    elif sys.platform == 'linux' or sys.platform == 'linux2':
        raise RuntimeError("Must be using Python 3 with linux!") # or else you get a deadlock in conv2d
    
    args = get_args()
    args.save_dir = '{}/'.format(args.env.lower()) # keep the directory structure simple
    if args.render:  args.processes = 1 ; args.test = True # render mode -> test mode w one process
    if args.test:  args.lr = 0 # don't train in test mode
    args.num_actions = Breakout().actions # get the action space of this game
    os.makedirs(args.save_dir, exist_ok=True) # make dir to save models etc.

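    # The policy and the SharedAdam optimizer live in shared memory so every worker updates the same parameters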
    torch.manual_seed(args.seed)
    shared_model = NNPolicy(channels=1, memsize=args.hidden, num_actions=args.num_actions).share_memory()
    shared_optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)

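    # Shared bookkeeping tensors (running reward/loss, episode and frame counters) visible to all workers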
    info = {k: torch.DoubleTensor([0]).share_memory_() for k in ['run_epr', 'run_loss', 'episodes', 'frames']}
    info['frames'] += shared_model.try_load(args.save_dir) * 1e5
    if int(info['frames'].item()) == 0: printlog(args,'', end='', mode='w') # clear log file
    
    processes = []
    for rank in range(args.processes):
        p = mp.Process(target=train, args=(shared_model, shared_optimizer, rank, args, info))
        p.start() ; processes.append(p)
    for p in processes: p.join()