def drop_piece(self, action, old_board):
    board = old_board.clone()
    # Start the piece at the top row; the action supplies column and rotation.
    board.rel_x, board.rel_y, board.rotation_idx = 1, *action
    # Slide the piece down one row at a time until it can drop no further.
    while env.is_available(board, (1, 0)):
        board = env.make(board, env.EMPTY)
        board.rel_x += 1
    # Stamp the piece at its resting position and refresh the display.
    board = env.make(board, env.PIECE)
    self.board = board.area
    self.update()
    self.root.update()
    time.sleep(self.speed)
def main():
    args = arg_parser()
    if args.mode == "train":
        env = environment.make(args.env, args)
        # Build the shared model that all worker processes will update.
        if args.networks == "MLP":
            nn = MLP(env.observation_space.shape[0], env.action_space,
                     args.n_frames)
        elif args.networks == "CONV":
            nn = CONV(args.n_frames, env.action_space)
        optimizer = SharedAdam(nn.parameters())

        threads = []
        # One process evaluates the shared model while the workers train it.
        thread = mp.Process(target=test, args=(args, nn))
        thread.start()
        threads.append(thread)
        for i in range(0, args.n_workers):
            thread = mp.Process(target=train, args=(i, args, nn, optimizer))
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
    elif args.mode == "test":
        evaluate(args)
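# Hedged sketch (an assumption, not necessarily this repo's implementation):
# SharedAdam, used in main() above, is taken to be the usual A3C pattern of an
# Adam optimizer whose state tensors are allocated up front and placed in
# shared memory, so every worker process updates the same first/second-moment
# estimates for the shared model's parameters.
import math

import torch


class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # Allocate the optimizer state eagerly and move it into shared memory
        # so that forked workers see (and update) the same tensors.
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1).share_memory_()
                state['exp_avg'] = torch.zeros_like(p.data).share_memory_()
                state['exp_avg_sq'] = torch.zeros_like(p.data).share_memory_()

    def step(self, closure=None):
        loss = closure() if closure is not None else None
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1
                step = float(state['step'][0])

                if group['weight_decay'] != 0:
                    grad = grad + group['weight_decay'] * p.data

                # Standard Adam moment updates, done in place on shared state.
                exp_avg.mul_(beta1).add_(grad * (1 - beta1))
                exp_avg_sq.mul_(beta2).add_(grad * grad * (1 - beta2))

                denom = exp_avg_sq.sqrt().add_(group['eps'])
                bias_correction1 = 1 - beta1 ** step
                bias_correction2 = 1 - beta2 ** step
                step_size = (group['lr'] * math.sqrt(bias_correction2)
                             / bias_correction1)

                p.data.add_(-step_size * exp_avg / denom)
        return loss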
def test(args, nn):
    ptitle('Test Agent')
    log = {}
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger(
        '{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    env = environment.make(args.env, args)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.model = MLP(player.env.observation_space.shape[0],
                       player.env.action_space, args.n_frames)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.eval()
    max_score = 0

    while True:
        if player.done:
            # Refresh the local copy of the shared model before each episode.
            player.model.load_state_dict(nn.state_dict())

        player.action_test()
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, reward {1}, average reward {2:.4f}".format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, reward_mean))

            # Checkpoint whenever the latest episode ties or beats the best
            # score seen so far.
            if reward_sum >= max_score:
                max_score = reward_sum
                state_to_save = player.model.state_dict()
                torch.save(state_to_save, '{}.dat'.format(args.model_save_dir))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()
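# Hedged sketch (assumption): setup_logger, called by test() and evaluate(),
# is assumed to be a small helper that attaches a file handler to a named
# logger so each run writes its own log file. The formatter and log level
# chosen here are illustrative, not taken from the repo.
import logging


def setup_logger(logger_name, log_file, level=logging.INFO):
    logger = logging.getLogger(logger_name)
    formatter = logging.Formatter('%(asctime)s : %(message)s')

    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(formatter)
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)

    logger.setLevel(level)
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)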
def evaluate(args):
    torch.set_default_tensor_type('torch.FloatTensor')
    saved_state = torch.load(
        '{}.dat'.format(args.model_load_dir),
        map_location=lambda storage, loc: storage)

    log = {}
    setup_logger('{}_eval_log'.format(args.env),
                 r'{0}{1}_eval_log'.format(args.log, args.env))
    log['{}_eval_log'.format(args.env)] = logging.getLogger(
        '{}_eval_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_eval_log'.format(args.env)].info(
            '{0}: {1}'.format(k, d_args[k]))

    env = environment.make("{}".format(args.env), args)
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    if args.networks == "MLP":
        player.model = MLP(env.observation_space.shape[0], env.action_space,
                           args.n_frames)
    elif args.networks == "CONV":
        player.model = CONV(args.n_frames, env.action_space)

    # Record every evaluation episode with the gym Monitor wrapper.
    player.env = gym.wrappers.Monitor(
        player.env, "{}_monitor".format(args.env),
        lambda episode_id: True, force=True)

    player.model.load_state_dict(saved_state)
    player.model.eval()

    for i_episode in range(args.rollout):
        player.state = player.env.reset()
        player.state = torch.from_numpy(player.state).float()
        player.eps_len = 0
        reward_sum = 0
        while True:
            if args.render:
                player.env.render()
            player.action_test()
            reward_sum += player.reward

            if player.done:
                num_tests += 1
                reward_total_sum += reward_sum
                reward_mean = reward_total_sum / num_tests
                log['{}_eval_log'.format(args.env)].info(
                    "reward, {0}, average reward, {1:.4f}".format(
                        reward_sum, reward_mean))
                break
if __name__ == "__main__":
    logging.disable(logging.NOTSET)
    nElevator = int(sys.argv[1])
    nFloor = int(sys.argv[2])
    spawnRates = [1/360] + [1/360]*(nFloor-1)
    avgWeight = 135
    weightLimit = 1200
    loadTime = 1
    beta = 0.01
    lr = 1e-4

    """ Initialize environment and optimizers """
    # initialize environment
    env = gym.make(nElevator, nFloor, spawnRates, avgWeight, weightLimit,
                   loadTime)
    obssize = env.observation_space_size
    actsize = env.action_space_size
    print("state space dimension", obssize)
    print("action space size", actsize)

    # initialize tensorflow session
    sess = tf.Session()

    # initialize an optimizer for each elevator
    optimizer_list = []
    for i in range(nElevator):
        optimizer_list.append(tf.train.AdamOptimizer(lr))

    # initialize a NNet for each elevator
    Q = []
def main():
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--cuda", default=True, action='store_true',
    #                     help='Enable cuda')
    # parser.add_argument("--env", default=DEFAULT_ENV_NAME,
    #                     help='Name of the environment, default = ' + DEFAULT_ENV_NAME)
    # parser.add_argument('--reward', type=float, default=MEAN_REWARD_BOUND,
    #                     help='Mean reward boundary for stopping of training, default = %.2f' % (MEAN_REWARD_BOUND))
    # args = parser.parse_args()
    # device = torch.device('cuda' if args.cuda else 'cpu')
    device = torch.device('cuda')

    env = make('glovedatarms.npy', 'labels2.npy', 4, 1, -1)
    net = dqn.DQN(env.observation_shape, env.n_actions).to(device)
    tgt_net = dqn.DQN(env.observation_shape, env.n_actions).to(device)
    print(net)

    # Resume from the best checkpoint and the previously saved replay memory.
    net.load_state_dict(torch.load('best.dat'))
    buffer = ExperienceBuffer(REPLAY_SIZE)
    file = open('replay_mem.obj', 'rb')
    buffer = pickle.load(file)

    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None

    while True:
        frame_idx += 1
        # Linearly anneal epsilon from EPSILON_START down to EPSILON_FINAL.
        epsilon = max(EPSILON_FINAL,
                      EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)

        reward = agent.play_step(net, epsilon, device)
        if reward is not None:
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            mean_reward = np.mean(total_rewards[-100:])
            print("%d: done %d passes, mean reward %.3f, eps %.2f, speed %.2f f/s"
                  % (frame_idx, len(total_rewards), mean_reward, epsilon, speed))

            if best_mean_reward is None or best_mean_reward < mean_reward:
                # Save the improved model together with the replay memory.
                torch.save(net.state_dict(), 'best.dat')
                filehandler = open('replay_mem.obj', 'wb')
                pickle.dump(buffer, filehandler)
                if best_mean_reward is not None:
                    print("Best mean reward updated %.3f -> %.3f, model saved"
                          % (best_mean_reward, mean_reward))
                best_mean_reward = mean_reward
            # Stop once the running mean reward clears the target bound.
            if mean_reward > MEAN_REWARD_BOUND:
                print("Solved in %d frames!" % frame_idx)
                break

        if len(buffer) < REPLAY_START_SIZE:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            # Periodically sync the target network with the online network.
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()
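# Hedged sketch (assumption): calc_loss, called in the training loop above, is
# assumed to be the standard DQN temporal-difference loss computed on a batch
# of (state, action, reward, done, next_state) transitions sampled from the
# replay buffer, with the target network providing the bootstrapped values.
# GAMMA and the exact batch layout returned by buffer.sample() are assumptions.
import numpy as np
import torch
import torch.nn as nn

GAMMA = 0.99  # assumed discount factor


def calc_loss(batch, net, tgt_net, device="cpu"):
    states, actions, rewards, dones, next_states = batch

    states_v = torch.tensor(states).to(device)
    next_states_v = torch.tensor(next_states).to(device)
    actions_v = torch.tensor(actions).to(device)
    rewards_v = torch.tensor(rewards).to(device)
    done_mask = torch.BoolTensor(dones).to(device)

    # Q(s, a) for the actions that were actually taken.
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    # max_a' Q_target(s', a'), with terminal states masked to zero.
    next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()

    expected_state_action_values = rewards_v + GAMMA * next_state_values
    return nn.MSELoss()(state_action_values, expected_state_action_values)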
T.ToTensor()])


def get_screen(env, device):
    # transpose into torch order (CHW)
    screen = env.render(mode='rgb_array').transpose((2, 0, 1))
    # Strip off the top and bottom of the screen
    screen = screen[:, 160:320]
    # Convert to float, rescale, convert to torch tensor
    # (this doesn't require a copy)
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Resize, and add a batch dimension (BCHW)
    return resize(screen).unsqueeze(0).to(device)


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    from environment import make

    env = make("unity")
    env.start()
    plt.figure()
    plt.imshow(get_screen(env, torch.device("cpu")).cpu().squeeze(0).permute(
        1, 2, 0).numpy(), interpolation='none')
    plt.title('Example extracted screen')
    plt.show()
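# Hedged sketch (assumption): the stray "T.ToTensor()])" at the top of this
# file is the tail of a torchvision transform pipeline defined just before
# this excerpt. A definition compatible with how get_screen() uses `resize`
# might look like the following; the resize target size is an assumption.
import torchvision.transforms as T
from PIL import Image

resize = T.Compose([T.ToPILImage(),
                    T.Resize(40, interpolation=Image.BICUBIC),
                    T.ToTensor()])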
def train(rank, args, nn, optimizer):
    ptitle('Training Agent: {}'.format(rank))
    env = environment.make(args.env, args)
    env.seed(RANDOM_SEED + rank)

    player = Agent(None, env, args, None)
    player.model = MLP(player.env.observation_space.shape[0],
                       player.env.action_space, args.n_frames)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    while True:
        # Sync this worker's model with the shared parameters.
        player.model.load_state_dict(nn.state_dict())
        if player.done:
            player.cx = Variable(torch.zeros(1, 128))
            player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        # Roll out up to n_steps transitions before computing the losses.
        for step in range(args.n_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            # Bootstrap the return from the value of the last state reached.
            state = player.state
            value, _, _, _ = player.model(
                (Variable(state), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        ensure_shared_grads(player.model, nn, gpu=False)
        optimizer.step()
        player.clear_actions()
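# Hedged sketch (assumption): ensure_shared_grads, called at the end of
# train(), is assumed to copy each worker's locally computed gradients onto
# the shared model's parameters so the shared optimizer can apply them, in the
# style of the common A3C reference implementations.
def ensure_shared_grads(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            # Another worker already wrote a gradient for this step; skip.
            return
        if not gpu:
            shared_param._grad = param.grad
        else:
            shared_param._grad = param.grad.cpu()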