def test_episodes(self):
    # Test the model on a fixed number of episodes.
    # Success is determined by the final step's reward reaching MIN_FINAL_REWARD_FOR_SUCCESS,
    # which works for BabyAI and Sokoban, but is not appropriate for many environments.
    NUM_EPISODES_TO_TEST = spec.val("NUM_EPISODES_TO_TEST")
    MIN_FINAL_REWARD_FOR_SUCCESS = spec.val("MIN_FINAL_REWARD_FOR_SUCCESS")
    self.create_results_output_file()
    spec.output_to_file(self.output_filename)
    num_wins = 0
    num_episodes_tested = 0
    self.output("Testing on {} episodes.".format(NUM_EPISODES_TO_TEST))
    start_time = time.time()
    for episode_id in range(NUM_EPISODES_TO_TEST):
        torch.manual_seed(AGENT_RANDOM_SEED)
        final_reward, steps = self.test_on_episode(episode_id)
        if final_reward >= MIN_FINAL_REWARD_FOR_SUCCESS:
            num_wins += 1
        num_episodes_tested += 1
        # Report progress ten times over the course of testing.
        if (num_episodes_tested % (NUM_EPISODES_TO_TEST / 10) == 0):
            self.output('{:4d} / {:5d} = {:5.1f}%'.format(
                num_wins, num_episodes_tested,
                100.0 * num_wins / num_episodes_tested))
    self.output("Time: {:3.1f} min".format((time.time() - start_time) / 60.))
    self.output("Success rate = {} / {} episodes = {:5.1f}%".format(
        num_wins, num_episodes_tested,
        100.0 * num_wins / num_episodes_tested))
def create_environment(self, seed=None):
    # Each new environment should be listed here.
    if ENV == "Pathfinding_Env":
        from environments.pathfinding import Pathfinding_Env
        environment = Pathfinding_Env(seed)
    elif ENV == "BabyAI_Env":
        from environments.babyai import BabyAI_Env
        environment = BabyAI_Env(seed)
        HELDOUT_TESTING = spec.val("HELDOUT_TESTING")
        self.heldout_testing = HELDOUT_TESTING
    elif ENV == "Sokoban_Env":
        from environments.sokoban import Sokoban_Env
        environment = Sokoban_Env(seed)
    else:
        print("Environment {} not found.".format(ENV))
        exit(0)
    self.observation_space = environment.observation_space
    self.action_space = environment.action_space
    return environment
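# Sketch only (not part of the repo): adding a new environment means appending one more
# branch that follows the same pattern. "My_New_Env" and environments.my_new_env are
# hypothetical names; the class is assumed to expose observation_space and action_space
# like the environments above.
#
#     elif ENV == "My_New_Env":
#         from environments.my_new_env import My_New_Env
#         environment = My_New_Env(seed)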
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import collections
import numpy as np
import copy

from utils.spec_reader import spec

hidden_size = spec.val("DNC_HIDDEN_SIZE")
memory_size = spec.val("DNC_MEMORY_SIZE")
word_size = spec.val("DNC_WORD_SIZE")
num_write_heads = spec.val("DNC_NUM_WRITE_HEADS")
num_read_heads = spec.val("DNC_READ_HEADS")
num_read_modes = 1 + 2 * num_write_heads
batch_size = 1

TemporalLinkageState = collections.namedtuple(
    'TemporalLinkageState', ('link', 'precedence_weights'))
AccessState = collections.namedtuple(
    'AccessState',
    ('memory', 'read_weights', 'write_weights', 'linkage', 'usage'))

_EPSILON = 0.001


class MemoryModule(nn.Module):
    def __init__(self):
        super(MemoryModule, self).__init__()
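# For context: in the DNC (Graves et al., 2016), each read head combines one
# content-based lookup mode with a forward and a backward temporal-link mode per
# write head, which is where num_read_modes = 1 + 2 * num_write_heads comes from.
# With hypothetical spec values (for illustration only):
#   DNC_NUM_WRITE_HEADS = 1  ->  num_read_modes = 1 + 2 * 1 = 3
#   DNC_NUM_WRITE_HEADS = 2  ->  num_read_modes = 1 + 2 * 2 = 5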
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
import torch.nn.functional as F

from agents.networks.shared.general import LinearLayer, SeparateActorCriticLayers
from utils.spec_reader import spec

NUM_RNN_UNITS = spec.val("NUM_RNN_UNITS")
AC_HIDDEN_LAYER_SIZE = spec.val("AC_HIDDEN_LAYER_SIZE")
OBS_EMBED_SIZE = spec.val("OBS_EMBED_SIZE")


class GRU_Network(nn.Module):
    def __init__(self, input_size, action_space_size):
        super(GRU_Network, self).__init__()
        next_input_size = input_size
        if OBS_EMBED_SIZE > 0:
            self.obs_emb = LinearLayer(next_input_size, OBS_EMBED_SIZE)
            next_input_size = OBS_EMBED_SIZE
        self.num_rnn_units = NUM_RNN_UNITS
        self.rnn = nn.GRUCell(next_input_size, self.num_rnn_units)
        self.actor_critic_layers = SeparateActorCriticLayers(
            self.num_rnn_units, 2, AC_HIDDEN_LAYER_SIZE, action_space_size)

    def forward(self, obs, old_state):
        tens = torch.FloatTensor(obs).unsqueeze(0)
        if OBS_EMBED_SIZE > 0:
            tens = self.obs_emb(tens)
        new_state = self.rnn(tens, old_state)
        policy, value_est = self.actor_critic_layers(new_state)
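# Minimal usage sketch (illustration only, not part of the repo). The body of forward()
# is cut off above, so the assumption here is that it returns (policy, value_est, new_state).
import numpy as np
import torch

def run_dummy_episode(net, obs_size, num_steps=5):
    state = torch.zeros(1, net.num_rnn_units)  # nn.GRUCell hidden state: (batch, hidden_size)
    for _ in range(num_steps):
        obs = np.random.rand(obs_size).astype(np.float32)  # stand-in observation vector
        policy, value_est, state = net(obs, state)
    return policy, value_est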
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import numpy as np
import random

from utils.spec_reader import spec

NUM_PATTERNS = spec.val("NUM_PATTERNS")
PATTERN_LENGTH = 7


class Link(object):
    def __init__(self, graph, src, tar):
        self.graph = graph
        self.src = src
        self.tar = tar
        assert (graph.path_len[src][tar] == 0)
        graph.path_len[src][tar] = 1

    def output(self):
        print("Link {} -> {}".format(self.src, self.tar))


class Graph(object):
    def __init__(self, rand):
        self.rand = rand

    def reset(self):
        # Links are represented by objects, but nodes are represented by indices.
        # Keep in mind that the nodes in this graph do not (necessarily) correspond to transformer nodes.
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
import numpy as np

from agents.networks.shared.transformer import Transformer
from agents.networks.shared.general import LinearLayer
from agents.networks.shared.general import SeparateActorCriticLayers
from agents.networks.shared.general import SharedActorCriticLayers
from utils.graph import Graph
from utils.spec_reader import spec

V2 = spec.val("V2")
WMG_ATTENTION_HEAD_SIZE = spec.val("WMG_ATTENTION_HEAD_SIZE")
WMG_NUM_ATTENTION_HEADS = spec.val("WMG_NUM_ATTENTION_HEADS")
WMG_NUM_LAYERS = spec.val("WMG_NUM_LAYERS")
WMG_HIDDEN_SIZE = spec.val("WMG_HIDDEN_SIZE")
AC_HIDDEN_LAYER_SIZE = spec.val("AC_HIDDEN_LAYER_SIZE")
WMG_MAX_OBS = spec.val("WMG_MAX_OBS")
WMG_MAX_MEMOS = spec.val("WMG_MAX_MEMOS")

# Set WMG_MAX_MEMOS > 0 for attention over Memos, stored in a StateMatrix.
# Set WMG_MAX_OBS > 0 for attention over past observations, stored in a StateMatrix.
# Attention over both would require two separate instances of StateMatrix.
# The WMG experiments did not explore this combination, so there's only one StateMatrix.
if WMG_MAX_MEMOS:
    # StateMatrix contains Memos.
    S = WMG_MAX_MEMOS  # Maximum number of state vectors stored in the matrix.
    WMG_MEMO_SIZE = spec.val("WMG_MEMO_SIZE")
    assert WMG_MAX_OBS == 0
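# Illustration only (hypothetical values, not from any shipped spec): the check above
# allows Memos or past observations in the single StateMatrix, but not both at once.
#   Memo-based WMG:           WMG_MAX_MEMOS = 8, WMG_MEMO_SIZE = 32, WMG_MAX_OBS = 0
#   Observation-history WMG:  WMG_MAX_MEMOS = 0, WMG_MAX_OBS = 8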
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from utils.spec_reader import spec

AGENT_RANDOM_SEED = spec.val("AGENT_RANDOM_SEED")
A3C_T_MAX = spec.val("A3C_T_MAX")
LEARNING_RATE = spec.val("LEARNING_RATE")
DISCOUNT_FACTOR = spec.val("DISCOUNT_FACTOR")
GRADIENT_CLIP = spec.val("GRADIENT_CLIP")
WEIGHT_DECAY = spec.val("WEIGHT_DECAY")
AGENT_NET = spec.val("AGENT_NET")
ENTROPY_TERM_STRENGTH = spec.val("ENTROPY_TERM_STRENGTH")
REWARD_SCALE = spec.val("REWARD_SCALE")
ADAM_EPS = spec.val("ADAM_EPS")
ANNEAL_LR = spec.val("ANNEAL_LR")
if ANNEAL_LR:
    LR_GAMMA = spec.val("LR_GAMMA")
    from torch.optim.lr_scheduler import StepLR

torch.manual_seed(AGENT_RANDOM_SEED)


class A3cAgent(object):
    ''' A single-worker version of Asynchronous Advantage Actor-Critic (Mnih et al., 2016). '''
    def __init__(self, observation_space_size, action_space_size):
        if AGENT_NET == "GRU_Network":
            from agents.networks.gru import GRU_Network
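# Sketch only (not the repo's code): one typical way the hyperparameters read above get
# wired into an optimizer, with a StepLR schedule when ANNEAL_LR is set. The model
# argument and the step_size=1 choice are hypothetical.
import torch.optim as optim

def build_optimizer(model):
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE,
                           eps=ADAM_EPS, weight_decay=WEIGHT_DECAY)
    scheduler = StepLR(optimizer, step_size=1, gamma=LR_GAMMA) if ANNEAL_LR else None
    return optimizer, scheduler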
# Adapted from https://github.com/mpSchrader/gym-sokoban
# Max-Philipp B. Schrader, 2018.

import os
from os import listdir
from os import path
from os.path import isfile, join
import random
import zipfile
import numpy as np

from utils.graph import Graph
from utils.graph import Entity
from utils.spec_reader import spec

SOKOBAN_MAX_STEPS = spec.val("SOKOBAN_MAX_STEPS")
SOKOBAN_DIFFICULTY = spec.val("SOKOBAN_DIFFICULTY")
SOKOBAN_SPLIT = spec.val("SOKOBAN_SPLIT")
SOKOBAN_ROOM_OVERRIDE = spec.val("SOKOBAN_ROOM_OVERRIDE")
SOKOBAN_BOXES_REQUIRED = spec.val("SOKOBAN_BOXES_REQUIRED")
SOKOBAN_OBSERVATION_FORMAT = spec.val("SOKOBAN_OBSERVATION_FORMAT")
SOKOBAN_REWARD_PER_STEP = spec.val("SOKOBAN_REWARD_PER_STEP")
SOKOBAN_REWARD_SUCCESS = spec.val("SOKOBAN_REWARD_SUCCESS")

# In the original Sokoban images, each tile is one pixel,
# a cell is 8 pixels (tiles), and a puzzle is 10x10 cells.
PIXELS_PER_TILE = 6
TILES_PER_CELL = 8
PUZZLE_SCALE = PIXELS_PER_TILE * TILES_PER_CELL
PUZZLE_SIZE = 10

# Cell state codes
WALL = 0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import gym
import babyai  # This registers the 19 MiniGrid levels.
import numpy as np

from utils.spec_reader import spec

BABYAI_ENV_LEVEL = spec.val("BABYAI_ENV_LEVEL")
USE_SUCCESS_RATE = spec.val("USE_SUCCESS_RATE")  # Used by post-processing files.
SUCCESS_RATE_THRESHOLD = spec.val("SUCCESS_RATE_THRESHOLD")
HELDOUT_TESTING = spec.val("HELDOUT_TESTING")
NUM_TEST_EPISODES = spec.val("NUM_TEST_EPISODES")
BINARY_REWARD = spec.val("BINARY_REWARD")
OBS_ENCODER = spec.val("OBS_ENCODER")

assert USE_SUCCESS_RATE

color_list = ['red', 'green', 'blue', 'purple', 'yellow', 'grey']
color_index_dict = {}
for i in range(len(color_list)):
    color_index_dict[color_list[i]] = i

cell_object_base_index = 4

action_list = ['go', 'pick', 'put', 'open']
action_index_dict = {}
for i in range(len(action_list)):
    action_index_dict[action_list[i]] = i

objtype_list = ['door', 'key', 'ball', 'box']
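# For reference, the lookup tables built above work out to:
#   color_index_dict  = {'red': 0, 'green': 1, 'blue': 2, 'purple': 3, 'yellow': 4, 'grey': 5}
#   action_index_dict = {'go': 0, 'pick': 1, 'put': 2, 'open': 3}
# (objtype_list would presumably be indexed the same way; that loop falls outside this excerpt.)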
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import os
import time
import datetime
import pytz
import platform

import torch
import numpy as np

from utils.spec_reader import spec

TYPE_OF_RUN = spec.val("TYPE_OF_RUN")
ENV = spec.val("ENV")
LOAD_MODEL_FROM = spec.val("LOAD_MODEL_FROM")
SAVE_MODELS_TO = spec.val("SAVE_MODELS_TO")
ENV_RANDOM_SEED = spec.val("ENV_RANDOM_SEED")
AGENT_RANDOM_SEED = spec.val("AGENT_RANDOM_SEED")
REPORTING_INTERVAL = spec.val("REPORTING_INTERVAL")
TOTAL_STEPS = spec.val("TOTAL_STEPS")
ANNEAL_LR = spec.val("ANNEAL_LR")
if ANNEAL_LR:
    ANNEALING_START = spec.val("ANNEALING_START")

from agents.a3c import A3cAgent


class Worker(object):
    def __init__(self):
        torch.manual_seed(AGENT_RANDOM_SEED)
        self.start_time = time.time()
        self.heldout_testing = False