def __init__(self):
    """Entry point: read CLI options, build the Breakout environment, and
    hand control to the main training/playing loop."""
    (game_name, game_mode, render,
     total_step_limit, total_run_limit, clip) = self._args()
    # Gym naming convention: "Deterministic-v4" handles frame skipping (4)
    # at every iteration. Kept for the commented-out gym path below.
    env_name = game_name + "Deterministic-v4"
    # env = MainGymWrapper.wrap(gym.make(env_name))  # Breakout()
    env = Breakout()
    model = self._game_model(game_mode, game_name, env.actions)
    self._main_loop(model, env, render, total_step_limit, total_run_limit, clip)
# NOTE(review): top-level script setup (lines were collapsed in this chunk).
# The warning print is presumably guarded by an `if args.debug:` that sits
# outside this chunk -- confirm against the full file.
print('####################################################' \
      'WARNING: debug flag is set, output will not be saved' \
      '####################################################')
logger = Logger(debug=args.debug, append=args.environment)
atexit.register(exit_handler)  # Make sure to always save the model when exiting

# Variables
test_scores = []
test_mean_q = []
test_states = []

# Setup
from breakout_env import Breakout
env = Breakout({})
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
DQA = DQAgent(env.actions,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)
# Initial logging
    # NOTE(review): this is the tail of a function not fully visible in this
    # chunk (apparently `simple_agent(env)`, called below); its `def` line and
    # the computation of `ball_pos` are missing from view -- do not assume.
    # Returns action 2 or 3 depending on paddle vs. ball vertical position.
    paddle_pos = env.paddle.center()
    if paddle_pos[1] < ball_pos[1]:
        return 2
    else:
        return 3


# Demo script: play one episode with the simple agent and display frames.
# NOTE(review): `vid` is created but never written to in this chunk -- confirm
# whether a `vid.write(...)` call was intended (one exists commented out in
# the A3C train loop elsewhere in this file).
vid = VideoWriter('demo.avi', VideoWriter_fourcc(*"XVID"), float(30), (160, 210), False)
env = Breakout({
    'max_step': 1000,
    # 'lifes': 7,
    'ball_speed': [5, -2],
    # 'ball_size': [5, 5],
    # 'ball_color': 200,
    # 'paddle_width': 50,
    'paddle_speed': 5
})
for ep in range(1):
    obs = env.reset()
    for t in itertools.count():
        # action = random.randint(0, env.actions - 1)
        action = simple_agent(env)
        obs, reward, done, _ = env.step(action)
        # NOTE(review): "Epsoide" typo in the runtime output string left as-is
        # (runtime text; fixing it would change program output).
        print('Epsoide: {}, Step: {}, Reward: {}, Done: {}'.format(
            ep, t, reward, done))
        imshow('obs', obs)
        waitKey(1)
        # NOTE(review): `done` is printed but this itertools.count() loop never
        # breaks on it; presumably the env's 'max_step' cap ends the run, but a
        # `break` on done looks intended -- confirm.
def evaluate(DQA, args, logger):
    """Evaluate the agent for ~args.validation_frames frames on Breakout.

    Plays greedy (testing) episodes, logs each episode's length and score to
    'evaluation.csv', saves the network as '_best' whenever the mean episode
    score improves on the global `max_mean_score`, and returns the
    [length, score] pair of the best-scoring episode as a 1-D array.
    """
    global max_mean_score
    evaluation_csv = 'evaluation.csv'
    logger.to_csv(evaluation_csv, 'length,score')
    env = Breakout({})
    scores = list()
    frame_counter = 0

    while frame_counter < args.validation_frames:
        remaining_random_actions = args.initial_random_actions
        obs = utils.preprocess_observation(env.reset())
        frame_counter += 1
        # Initialize the first state with the same 4 images
        current_state = np.array([obs, obs, obs, obs])
        t = 0
        episode = 0
        score = 0
        # Start episode
        while True:
            # Render the game if video output is not suppressed
            if args.video:
                env.render()
            action = DQA.get_action(np.asarray([current_state]),
                                    testing=True,
                                    force_random=remaining_random_actions > 0)
            obs, reward, done, info = env.step(action)
            obs = utils.preprocess_observation(obs)
            current_state = utils.get_next_state(current_state, obs)
            if remaining_random_actions > 0:
                remaining_random_actions -= 1
            score += reward
            t += 1
            frame_counter += 1
            # End episode
            if done or t > args.max_episode_length:
                episode += 1
                print('Episode %d end\n---------------\nFrame counter: %d\n'
                      % (episode, frame_counter))
                print('Length: %d\n, Score: %f\n\n' % (t, score))
                # Save episode data in the evaluation csv
                logger.to_csv(evaluation_csv, [t, score])
                break
        scores.append([t, score])

    scores = np.asarray(scores)  # shape (episodes, 2): columns are [length, score]
    max_indices = np.argwhere(scores[:, 1] == np.max(scores[:, 1])).ravel()
    max_idx = np.random.choice(max_indices)

    # Save best model.
    # BUG FIX: previously `np.mean(scores)` averaged over BOTH columns, so
    # episode lengths (typically much larger than scores) dominated the
    # "best mean score" criterion; average the score column only.
    mean_score = np.mean(scores[:, 1])
    if max_mean_score < mean_score:
        max_mean_score = mean_score
        DQA.DQN.save(append='_best')

    return scores[max_idx, :].ravel()
# NOTE(review): second setup variant (collapsed lines) -- near-duplicate of the
# other setup block in this file, but with all sprite colors inverted
# (255 - default); presumably a robustness experiment against color changes --
# confirm intent. The warning print is presumably guarded by `if args.debug:`
# outside this chunk.
print('####################################################' \
      'WARNING: debug flag is set, output will not be saved' \
      '####################################################')
logger = Logger(debug=args.debug, append=args.environment)
atexit.register(exit_handler)  # Make sure to always save the model when exiting

# Variables
test_scores = []
test_mean_q = []
test_states = []

# Setup
from breakout_env import Breakout
env = Breakout({'ball_color': 255 - 143,
                'paddle_color': 255 - 143,
                'bricks_color': [255 - 200, 255 - 180, 255 - 160,
                                 255 - 140, 255 - 120, 255 - 100]})
network_input_shape = (4, 110, 84)  # Dimension ordering: 'th' (channels first)
DQA = DQAgent(env.actions,
              network_input_shape,
              replay_memory_size=args.replay_memory_size,
              minibatch_size=args.minibatch_size,
              learning_rate=args.learning_rate,
              discount_factor=args.discount_factor,
              dropout_prob=args.dropout,
              epsilon=args.epsilon,
              epsilon_decrease_rate=args.epsilon_decrease,
              min_epsilon=args.min_epsilon,
              load_path=args.load,
              logger=logger)
def train(shared_model, shared_optimizer, rank, args, info):
    """A3C worker process: roll out `args.rnn_steps` env steps at a time in a
    private Breakout environment, compute the actor-critic loss via
    `cost_func`, and push gradients into the shared model.

    Args:
        shared_model: NNPolicy whose parameters live in shared memory.
        shared_optimizer: optimizer stepping the shared parameters.
        rank: worker index; worker 0 also prints periodic progress.
        args: parsed options (seed, hidden, num_actions, rnn_steps, horizon,
              render, test, save_dir, ...).
        info: dict of shared torch tensors used as cross-process counters
              ('frames', 'episodes', 'run_epr', 'run_loss').
    """
    #env = gym.make(args.env) # make a local (unshared) environment
    env = Breakout({})  #{args}
    #env = Breakout({'paddle_width': 7})
    #env.seed(args.seed + rank)
    torch.manual_seed(args.seed + rank)  # seed everything
    model = NNPolicy(channels=1, memsize=args.hidden, num_actions=args.num_actions)  # a local/unshared model
    state = torch.tensor(prepro(env.reset()))  # get first state

    start_time = last_disp_time = time.time()
    episode_length, epr, eploss, done = 0, 0, 0, True  # bookkeeping

    while info['frames'][0] <= 8e7 or args.test:  # openai baselines uses 40M frames... we'll use 80M
        model.load_state_dict(shared_model.state_dict())  # sync with shared model

        hx = torch.zeros(1, 256) if done else hx.detach()  # rnn activation vector
        values, logps, actions, rewards = [], [], [], []  # save values for computing gradients

        for step in range(args.rnn_steps):
            episode_length += 1
            value, logit, hx = model((state.view(1, 1, 80, 80), hx))
            logp = F.log_softmax(logit, dim=-1)

            # Sample an action from the policy distribution.
            action = torch.exp(logp).multinomial(num_samples=1).data[0]  #logp.max(1)[1].data if args.test else
            state, reward, done, _ = env.step(action.numpy()[0])
            if args.render:
                #env.render()
                imshow('state', state)
                waitKey(1)
                #vid.write(state)
            state = torch.tensor(prepro(state)) ; epr += reward
            reward = np.clip(reward, -1, 1)  # clip reward to [-1, 1]
            done = done or episode_length >= 1e4  # don't play one episode for too long

            info['frames'].add_(1) ; num_frames = int(info['frames'].item())
            if num_frames % 2e5 == 0:  # save every 200k frames
                printlog(args, '\n\t{:.0f}lac frames: saved model\n'.format(num_frames / 1e5))
                torch.save(shared_model.state_dict(), args.save_dir + 'model.{:.0f}.tar'.format(num_frames / 1e5))

            if done:  # update shared data
                info['episodes'] += 1
                # First episode seeds the running stats, afterwards exponential smoothing.
                interp = 1 if info['episodes'][0] == 1 else 1 - args.horizon
                info['run_epr'].mul_(1 - interp).add_(interp * epr)
                info['run_loss'].mul_(1 - interp).add_(interp * eploss)

            if rank == 0 and time.time() - last_disp_time > 20:  # print info ~ every minute
                elapsed = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))
                printlog(args, 'time {}, episodes {:.0f}, frames {:.1f}M, mean epr {:.2f}, run loss {:.2f}'
                         .format(elapsed, info['episodes'].item(), num_frames / 1e6,
                                 info['run_epr'].item(), info['run_loss'].item()))
                last_disp_time = time.time()

            if done:  # reset bookkeeping and environment at episode end
                episode_length, epr, eploss = 0, 0, 0
                state = torch.tensor(prepro(env.reset()))

            values.append(value) ; logps.append(logp) ; actions.append(action) ; rewards.append(reward)

        # Bootstrap value for the final state (zero if the episode ended).
        next_value = torch.zeros(1, 1) if done else model((state.unsqueeze(0), hx))[0]
        values.append(next_value.detach())

        loss = cost_func(args, torch.cat(values), torch.cat(logps), torch.cat(actions), np.asarray(rewards))
        eploss += loss.item()
        shared_optimizer.zero_grad() ; loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 40)

        for param, shared_param in zip(model.parameters(), shared_model.parameters()):
            if shared_param.grad is None: shared_param._grad = param.grad  # sync gradients with shared model
        shared_optimizer.step()
# NOTE(review): a gradient-sync loop that preceded this guard was a verbatim
# duplicate of the last three lines of train() (chunk-overlap artifact, and a
# NameError at module level); removed here.
if __name__ == "__main__":
    if sys.version_info[0] > 2:
        mp.set_start_method('spawn')  # this must not be in global scope
    elif sys.platform == 'linux' or sys.platform == 'linux2':
        # BUG FIX: `raise "..."` raises TypeError("exceptions must derive from
        # BaseException") in Python 3; raise a real exception with the message.
        raise RuntimeError("Must be using Python 3 with linux!")  # or else you get a deadlock in conv2d

    args = get_args()
    args.save_dir = '{}/'.format(args.env.lower())  # keep the directory structure simple
    if args.render:
        args.processes = 1
        args.test = True  # render mode -> test mode w one process
    if args.test:
        args.lr = 0  # don't train in render/test mode
    args.num_actions = Breakout().actions  # get the action space of this game
    os.makedirs(args.save_dir, exist_ok=True)  # make dir to save models etc. (idiomatic, race-free)

    torch.manual_seed(args.seed)
    shared_model = NNPolicy(channels=1, memsize=args.hidden,
                            num_actions=args.num_actions).share_memory()
    shared_optimizer = SharedAdam(shared_model.parameters(), lr=args.lr)

    # Cross-process counters/statistics in shared memory.
    info = {k: torch.DoubleTensor([0]).share_memory_()
            for k in ['run_epr', 'run_loss', 'episodes', 'frames']}
    info['frames'] += shared_model.try_load(args.save_dir) * 1e5
    if int(info['frames'].item()) == 0:
        printlog(args, '', end='', mode='w')  # clear log file

    processes = []
    for rank in range(args.processes):
        p = mp.Process(target=train, args=(shared_model, shared_optimizer, rank, args, info))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()