Example #1
import numpy as np
import torch
import torch.optim as optim
from torchvision.transforms import Compose, ToPILImage, Grayscale, Resize, ToTensor

# The project-specific classes used below (ActorCriticLSTM, AEHash, SimHash, LPLGraph,
# Memory) and the hyperparameters are assumed to be defined/imported elsewhere.

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
actor_critic = ActorCriticLSTM(actor_layer_sizes, critic_1_layer_sizes, critic_2_extra_input=1, use_lstm=False, grayscale=grayscale, device=device).to(device)

# Create AE Hashing model and optimizers
ae_hash = AEHash(len_AE_hashcode, 4 if stacked else 1, noise_scale, saturating_weight, device=device).to(device)
ae_hash_optim = optim.Adam(ae_hash.parameters())

# Create SimHash
sim_hash = SimHash(len_AE_hashcode, len_SimHash_hashcode)

# Create LPLGraph
graph = LPLGraph(len_SimHash_hashcode, actor_layer_sizes[-1], max_reward, num_particles=num_particles)

# Set up action counter to infer the dominating action
act_counter = np.zeros((actor_layer_sizes[-1],), dtype=np.int32)

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, 'cpu')     # Keep the memory buffer on the CPU to save GPU memory

# Set up pixel observation preprocessing
transform = Compose([
    ToPILImage(),
    Grayscale(num_output_channels=1),   # Turn frame into grayscale
    Resize((52, 52)),
    ToTensor()
])
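
# --- Usage sketch (assumption): how the preprocessing pipeline above is applied to one
# raw frame before it is fed to the networks. `dummy_frame` stands in for an HxWxC
# uint8 observation such as an Atari frame; the actual environment call may differ. ---
dummy_frame = np.zeros((210, 160, 3), dtype=np.uint8)
state = transform(dummy_frame).unsqueeze(0).to(device)   # tensor of shape (1, 1, 52, 52)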
Example #2
import numpy as np

# The Gym environment `env` and the project classes SimHash and LPLGraph are assumed
# to be created/imported elsewhere.

# Define the observation normalization function: map state vector values to the range [-1., 1.]
def state_normalize(s):
    # Obtain environment observation space limit
    high = env.observation_space.high
    low = env.observation_space.low
    return ((s - low) / (high - low)) * 2 - 1
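
# Sanity check (assumption: `env` exposes a Gym Box observation space with finite
# bounds, which the function above already relies on): the lower bound should map
# to -1 and the upper bound to +1.
assert np.allclose(state_normalize(env.observation_space.low), -1.0)
assert np.allclose(state_normalize(env.observation_space.high), 1.0)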


# Create Hashing function
simhash = SimHash(input_size,
                  len_hashcode,
                  preprocessor=state_normalize if use_preprocessor else None)

# Create LPL Graph
graph = LPLGraph(len_hashcode,
                 output_size,
                 maximum_reward=max_reward,
                 num_particles=num_particles)

# Set up action counter to infer the dominating action
act_counter = np.zeros((output_size, ), dtype=np.int32)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0,
    "extrinsic value net loss": [],
Example #3
import numpy as np
import torch
import torch.optim as optim

# The project classes (PolicyNet, Memory, SimHash, LPLGraph), the target `device`, and
# the hyperparameters are assumed to be defined/imported elsewhere.

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)  # Policy network

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters())

# Set up SimHash
# Assume input_size is the dimension of the raw observation
simhash = SimHash(input_size, len_hashcode)
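
# --- Sketch: what a SimHash discretizer like the one above might look like. ---
# Assumption: this follows the standard count-based-exploration SimHash
# (code = sign(A @ s), with A drawn from a standard Gaussian); the repository's actual
# SimHash class may differ in its details. `SimHashSketch` is a hypothetical name.
class SimHashSketch:
    def __init__(self, input_size, len_hashcode, preprocessor=None, seed=0):
        rng = np.random.default_rng(seed)
        # Fixed random projection matrix; each row defines one bit of the hash code
        self.A = rng.standard_normal((len_hashcode, input_size))
        self.preprocessor = preprocessor

    def hash(self, s):
        if self.preprocessor is not None:
            s = self.preprocessor(s)
        s = np.asarray(s, dtype=np.float64).reshape(-1)
        # One bit per projection: 1 if the projection is positive, else 0
        return tuple((self.A @ s > 0).astype(np.int8).tolist())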

# Set up LPLgraph
# Assume output_size is the number of actions in a discrete environment
graph = LPLGraph(len_hashcode, output_size, max_reward)

# Set up a counter to infer the dominating action among all states that map to the same discrete state encoding
act_counter = np.zeros((output_size, ), dtype=np.int32)
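
# Usage sketch (assumption): the dominating action for a hashed state is simply the
# most frequently taken action recorded in the counter above.
dominating_action = int(np.argmax(act_counter))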

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0
}
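
# --- Checkpoint sketch (assumption): checkpoints are written with torch.save; the file
# name and the exact set of saved fields are illustrative, not the project's format. ---
checkpoint = {
    "policy_net_state_dict": policy_net.state_dict(),
    "optimizer_state_dict": policynet_optimizer.state_dict(),
    "training_info": training_info,
}
torch.save(checkpoint, "checkpoint.pt")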