def main():
    # Render the first few solved boards as static PNG snapshots.
    visualize_n_boards = 5
    for i in range(visualize_n_boards):
        env = ppad.PAD(board=boards[i])
        env.episode2gif(path=os.environ['PYTHONPATH'] + '/visualizations/solved_board' + str(i + 1) + '.png',
                        shrink=8,
                        ext='png')
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
"""
from importlib import reload

import ppad
# Reload ppad so that interactive edits to the package are picked up.
ppad = reload(ppad)
from ppad.pad.utils import episode2gif

SOMEPATH = 'yourpath'

# Example 1: Visualize directly from the environment itself.
env = ppad.PAD()
for _ in range(100):
    env.step(action=env.action_space.sample(), verbose=True)
env.visualize(filename=SOMEPATH + '/random_sampling.gif')
env.step(action='pass', verbose=True)

# Example 2: Visualize using the episode information.
# Generate observations and actions with any method, as long as they are in the specified format.
# Here they come from "smart data"; steps=-1 means each trajectory terminates on zero combo.
observations, actions, rewards = ppad.smart_data(boards=1, permutations=1, trajectories=1, steps=-1)
episode2gif(observations, actions, filename=SOMEPATH + '/smart_data.gif')
def smart_data(boards=1, permutations=1, trajectories=1, steps=100,
               discount=True, gamma=0.9, log10=True,
               allowed_orbs=solved_boards.allowed_orbs):
    """
    Generate smart training data in a format that can be fed directly into the learning agent.
    Generation 1 of the smart training data is derived from human-solved boards and random sampling.
    :param boards: The number of solved boards to randomly draw from.
    :param permutations: The number of orb-identity permutations to perform for each board.
    :param trajectories: The number of trajectories to generate from each permutation of each chosen board.
    :param steps: The number of steps to generate in each trajectory. If steps == -1, a trajectory
                  terminates when and only when there are no more combos on the board.
    :param discount: True to apply discounting to the rewards.
    :param gamma: Discount rate.
    :param log10: True to take log10 of the rewards during discounting.
    :param allowed_orbs: A list of allowed orb identities.
    :return: observations, actions and rewards, defined exactly as the same-named variables in ppad.pad.game.
    """
    observations_sd = []
    actions_sd = []
    rewards_sd = []
    env = ppad.PAD(skyfall_damage=False)

    if boards < 0 or boards > len(solved_boards.boards):
        raise Exception('Invalid input value for boards = {0}.'.format(boards))
    if trajectories < 0:
        raise Exception('Invalid input value for trajectories = {0}.'.format(trajectories))
    if permutations < 0:
        raise Exception('Invalid input value for permutations = {0}.'.format(permutations))

    board_indices = random.sample(range(0, len(solved_boards.boards)), boards)
    for index in board_indices:
        current_board = solved_boards.boards[index]
        for _ in range(permutations):
            # The permutations generated this way are not unique.
            current_permutation = random.sample(allowed_orbs, len(allowed_orbs))
            current_board = permutation_mapping(original_board=current_board,
                                                original_orbs=solved_boards.allowed_orbs,
                                                mapping=current_permutation)
            for _ in range(trajectories):
                # First, cancel the solved board once to obtain the terminal reward of this trajectory.
                env.reset(board=current_board)
                env.step('pass')
                final_reward = env.rewards[-1]

                # Then un-solve the board with random moves; the episode is reversed below.
                env.reset(board=current_board)
                if steps != -1:
                    for _ in range(steps - 1):
                        action = env.action_space.sample()
                        env.step(action)
                else:
                    # When steps == -1, reverse-sample the board until the first time no combo is left.
                    combos = [0]
                    while len(combos) > 0:
                        action = env.action_space.sample()
                        env.step(action)
                        combos = pad_utils.cancel(np.copy(env.board))
                env.step('pass')

                observations_sd.append(revert_observations(env.observations))
                actions_sd.append(revert_actions(env.actions))
                rewards_sd.append(revert_rewards(len(env.actions), final_reward))

    if discount:
        discounted_rewards_list = []
        for rewards_one_traj in rewards_sd:
            discounted_rewards_list.append(ppad.discount(rewards=rewards_one_traj, gamma=gamma, log10=log10))
        rewards_sd = discounted_rewards_list

    return observations_sd, actions_sd, rewards_sd
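

# A minimal usage sketch (assumed entry point, not part of the original module);
# the parameter values below are illustrative only.
if __name__ == '__main__':
    # 2 boards x 3 permutations x 5 trajectories = 30 training episodes,
    # each 20 steps long, with discounted (log10-scaled) rewards.
    obs_sd, act_sd, rew_sd = smart_data(boards=2, permutations=3, trajectories=5, steps=20)
    print('Generated {0} trajectories; the first one has {1} actions.'.format(len(act_sd), len(act_sd[0])))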
ID2ACTION = {0: 'up', 1: 'down', 2: 'left', 3: 'right', 4: 'pass'}
NON_PASS_ACTIONS = {'up', 'down', 'left', 'right'}

############################
# 2. Set-up.
############################
# Agent initialization.
sess = tf.Session()
agent = Agent01(sess,
                conv_layers=((2, 128), (2, 64)),
                dense_layers=(64, 32, 5),
                learning_rate=0.0001)
agent.copy_A_to_B()

# Environment initialization.
env = ppad.PAD(skyfall_damage=False)

# (s, a, r) tuples.
sar_data = []

# Metrics variables.
beta = BETA_INIT
print('BETA value at the end of training:',
      BETA_INIT * BETA_INCREASE_RATE ** (STEPS / BETA_INCREASE_FREQ))
total_loss = 0
total_rmse = 0
total_reward = 0
total_new_data_points = 0
total_actions = 0
total_episodes = 0
max_reward = 0
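
# The printed value above follows from beta's exponential schedule: beta is
# multiplied by BETA_INCREASE_RATE once every BETA_INCREASE_FREQ steps. A
# hypothetical helper (not part of the original script) that evaluates that
# schedule at an arbitrary step:
def beta_at_step(step, beta_init=BETA_INIT, rate=BETA_INCREASE_RATE, freq=BETA_INCREASE_FREQ):
    """Annealed beta value after `step` training steps (assumed schedule)."""
    return beta_init * rate ** (step / freq)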