# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
actor_critic = ActorCriticLSTM(actor_layer_sizes, critic_1_layer_sizes, critic_2_extra_input=1,
                               use_lstm=False, grayscale=grayscale, device=device).to(device)

# Create AE Hashing model and its optimizer
ae_hash = AEHash(len_AE_hashcode, 4 if stacked else 1, noise_scale, saturating_weight, device=device).to(device)
ae_hash_optim = optim.Adam(ae_hash.parameters())

# Create SimHash
sim_hash = SimHash(len_AE_hashcode, len_SimHash_hashcode)

# Create LPLGraph
graph = LPLGraph(len_SimHash_hashcode, actor_layer_sizes[-1], max_reward, num_particles=num_particles)

# Set up action counter to infer the dominating action
act_counter = np.zeros((actor_layer_sizes[-1],), dtype=np.int32)

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, 'cpu')     # Put memory on cpu to save space

# Set up pixel observation preprocessing
transform = Compose([
    ToPILImage(),
    Grayscale(num_output_channels=1),   # Turn frame into grayscale
    Resize((52, 52)),
    ToTensor()
])
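# --- Illustrative sketch (not from the repo) ---
# AEHash above appears to implement learned, autoencoder-based hashing of frames: a sigmoid
# bottleneck is perturbed with noise and pushed toward saturation so that rounding it yields a
# stable binary code, which SimHash then shortens before indexing the LPLGraph. The class below
# is a minimal sketch of that idea, assuming 52x52 inputs as produced by the transform above;
# the layer sizes, method names, and loss terms are illustrative guesses, not the actual AEHash API.
import torch
import torch.nn as nn

class AEHashSketch(nn.Module):
    def __init__(self, code_len, in_channels, noise_scale=0.3, saturating_weight=10.0):
        super().__init__()
        self.noise_scale = noise_scale
        self.saturating_weight = saturating_weight
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=6, stride=2), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=6, stride=2), nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 10 * 10, code_len),   # 52x52 input -> 24x24 -> 10x10 feature maps
            nn.Sigmoid()
        )
        self.decoder = nn.Sequential(
            nn.Linear(code_len, in_channels * 52 * 52),
            nn.Sigmoid()
        )

    def forward(self, x):
        b = self.encoder(x)                                         # bottleneck activations in (0, 1)
        if self.training:
            b = b + self.noise_scale * (torch.rand_like(b) - 0.5)   # noise makes rounding robust
        recon = self.decoder(b).view_as(x)
        return b, recon

    def hash(self, x):
        # Round the (noise-free) bottleneck to obtain the binary hash code
        with torch.no_grad():
            return (self.encoder(x) > 0.5).to(torch.int8)

    def loss(self, x):
        b, recon = self(x)
        rec_loss = nn.functional.mse_loss(recon, x)
        # Saturating term: push every bottleneck unit toward 0 or 1
        sat_loss = torch.minimum((1 - b) ** 2, b ** 2).mean()
        return rec_loss + self.saturating_weight * sat_loss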
# Define observation normalization function. Normalize state vector values to the range [-1., 1.]
def state_normalize(s):
    # Obtain environment observation space limits
    high = env.observation_space.high
    low = env.observation_space.low
    return ((s - low) / (high - low)) * 2 - 1

# Create hashing function
simhash = SimHash(input_size, len_hashcode, preprocessor=state_normalize if use_preprocessor else None)

# Create LPL Graph
graph = LPLGraph(len_hashcode, output_size, maximum_reward=max_reward, num_particles=num_particles)

# Set up action counter to infer the dominating action
act_counter = np.zeros((output_size,), dtype=np.int32)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0,
    "extrinsic value net loss": [],
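# --- Illustrative sketch (not from the repo) ---
# SimHash discretizes a continuous observation by taking the sign of a fixed random Gaussian
# projection; observations that land on the same binary code share one visit counter, which is
# what makes count-based exploration possible in continuous state spaces. The class below is a
# minimal sketch of that idea with the same preprocessor hook as above; the method names and
# the bonus formula are illustrative, not the actual SimHash API.
import numpy as np

class SimHashSketch:
    def __init__(self, input_size, len_hashcode, preprocessor=None):
        self.A = np.random.randn(len_hashcode, input_size)   # fixed random projection matrix
        self.preprocessor = preprocessor
        self.counts = {}                                     # visit counts keyed by binary code

    def hash(self, state):
        s = self.preprocessor(state) if self.preprocessor is not None else state
        return tuple((self.A @ np.asarray(s, dtype=np.float64) > 0).astype(np.int8))

    def count(self, state):
        # Increment and return the visit count of the discretized state
        code = self.hash(state)
        self.counts[code] = self.counts.get(code, 0) + 1
        return self.counts[code]

# A count-based exploration bonus would then typically be beta / sqrt(n(hash(s))), e.g.:
# bonus = beta / np.sqrt(simhash_sketch.count(observation))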
# Create the model
policy_net = PolicyNet(layer_sizes).to(device)      # Policy network

# Set up memory
memory = Memory(capacity, GAMMA, LAMBDA, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters())

# Set up SimHash
# Assume input_size is the dimension of the raw observation
simhash = SimHash(input_size, len_hashcode)

# Set up LPLGraph
# Assume that in discrete environments, output_size is the number of actions
graph = LPLGraph(len_hashcode, output_size, max_reward)

# Set up counter to infer the dominating action among all states that share one discrete state encoding
act_counter = np.zeros((output_size,), dtype=np.int32)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
    "epoch mean rewards": [],
    "max reward achieved": 0,
    "past %d epochs mean reward" % num_avg_epoch: 0
}
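# --- Illustrative sketch (not from the repo) ---
# Memory is constructed with GAMMA and LAMBDA, which suggests it computes generalized advantage
# estimates (GAE) when an episode is finalized. The helper below is a minimal sketch of that
# computation under this assumption; gae_advantages is an illustrative name, not part of the
# Memory class.
import numpy as np

def gae_advantages(rewards, values, gamma, lam, last_value=0.0):
    # rewards[t] is r_t; values[t] is a baseline estimate of V(s_t); last_value bootstraps
    # the value of the state following the final transition (0 for terminal states).
    values = list(values) + [last_value]
    advantages = np.zeros(len(rewards))
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]   # one-step TD residual
        gae = delta + gamma * lam * gae                          # exponentially weighted sum
        advantages[t] = gae
    returns = advantages + np.asarray(values[:-1])               # value regression targets
    return advantages, returns

# Example usage:
# adv, ret = gae_advantages([1.0, 0.0, 1.0], [0.5, 0.4, 0.6], gamma=0.99, lam=0.95)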