# Imports assumed from context (not shown in the original snippet):
# ProjectQ supplies MainEngine, gym supplies the FrozenLake environment.
import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
from projectq import MainEngine

quantum_engine = MainEngine()
#message = 'DataEspresso'
#send_full_message(message=message, quantum_engine=quantum_engine)

#env = gym.make('FrozenLake-v0')
env = FrozenLakeEnv(is_slippery=False)

# Q-table: one row per state, one column per action
Q = np.zeros([env.observation_space.n, env.action_space.n])
lr = .8    # learning rate
y = .95    # discount factor
num_episodes = 2000

#jList = []
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # The Q-Table learning algorithm
    while j < 99:
        j += 1
        # what we need from the robot: state, reward, done?, _
        # send the action to the robot
        # ====================================================
        # EARTH SIDE OF THE CONNECTION
        # Pick the greedy action plus decaying exploration noise
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        # ====================================================
        robot_a = int(
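# ---------------------------------------------------------------------------
# The snippet above is cut off at `robot_a = int(`, so its update step is
# missing. As a hedged, standalone illustration (not the original code), the
# helper below shows the standard tabular Q-learning backup such a loop
# applies once the "robot" returns the next state s1, reward r and done flag.
# `q_update` and `Q_demo` are hypothetical names introduced here.
import numpy as np

def q_update(Q, s, a, r, s1, lr=0.8, y=0.95):
    """One Bellman backup: move Q[s, a] toward r + y * max_a' Q[s1, a']."""
    Q[s, a] += lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
    return Q

# Tiny self-check on a 2-state, 2-action table.
Q_demo = np.zeros((2, 2))
Q_demo = q_update(Q_demo, s=0, a=1, r=1.0, s1=1)
assert np.isclose(Q_demo[0, 1], 0.8)  # 0 + 0.8 * (1.0 + 0.95 * 0 - 0)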
def begin_grading():
    print("\x1b[43m")

def end_grading():
    print("\x1b[0m")

# Seed RNGs so you get the same printouts as me
env.seed(0)
from gym.spaces import prng
prng.seed(10)

# Generate the episode
env.reset()
for t in range(100):
    env.render()
    a = env.action_space.sample()
    ob, rew, done, _ = env.step(a)
    if done:
        break
assert done
env.render()

class MDP(object):
    def __init__(self, P, nS, nA, desc=None):
        self.P = P    # state transition and reward probabilities, explained below
        self.nS = nS  # number of states
        self.nA = nA  # number of actions
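# ---------------------------------------------------------------------------
# Hedged illustration (not in the original listing): the `P` argument of MDP
# is assumed to map state -> action -> a list of (probability, next_state,
# reward) tuples, the usual layout for gym-style tabular dynamics. The helper
# below, with the made-up names `one_step_lookahead` and `P_demo`, shows how a
# one-step Bellman backup would consume such a table.
import numpy as np

def one_step_lookahead(P, nS, nA, V, gamma=0.95):
    """Return Q[s, a] = sum over transitions of p * (r + gamma * V[s'])."""
    Q = np.zeros((nS, nA))
    for s in range(nS):
        for a in range(nA):
            for prob, next_s, reward in P[s][a]:
                Q[s, a] += prob * (reward + gamma * V[next_s])
    return Q

# Tiny 2-state, 1-action chain: state 0 steps to state 1 with reward 1.
P_demo = {0: {0: [(1.0, 1, 1.0)]}, 1: {0: [(1.0, 1, 0.0)]}}
mdp_demo = MDP(P_demo, nS=2, nA=1)
print(one_step_lookahead(mdp_demo.P, mdp_demo.nS, mdp_demo.nA, V=np.zeros(2)))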
# Relies on numpy as np, torch, a to_categorical one-hot helper, and the
# project's FrozenLakeEnv, Memory, PPO, RunningMeanStd and
# get_intrinsic_rewards* helpers, which are defined elsewhere.
def game(N_episodes, AI_type, Intrinsic_type):
    ############## Hyperparameters ##############
    env = FrozenLakeEnv()
    #memory = Memory(max_size=300)
    ppo = 0
    #n_episodes = number_of_episodes
    #n_actions = env.action_space.n
    #intrinsic = intrinsic
    #print(n_actions)
    #n_agents = 1
    #n_episodes = number_of_episodes
    #state_size = env.observation_space.n
    #env_name = "LunarLander-v2"   # creating environment
    state_dim = env.observation_space.n
    action_dim = env.action_space.n
    render = False
    solved_reward = 230         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = N_episodes   # max training episodes
    max_timesteps = 100         # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 200       # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    samp_rewards = []
    avg_rewards = []
    best_avg_reward = -np.inf
    n_agents = 1
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    avg_reward = 0
    ppo.memcount.delete()
    state_size = env.observation_space.n

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd()
    norm_step = 5000

    # Pre Run: collect observations from random actions to initialise obs_rms
    next_obs = []
    for _ in range(norm_step):
        action_norm = np.random.randint(0, action_dim)
        state_norm, reward_norm, done_norm, _ = env.step(action_norm)
        state_norm = to_categorical(state_norm, state_size)  # optional
        next_obs.append(state_norm)
    obs_rms.update(next_obs)
    #print(obs_rms.mean)

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        state = to_categorical(state, state_size)
        done = False
        t = 0
        episode_reward = 0
        intrinsic_rewards = 0
        reward = 0
        for t in range(max_timesteps):  # while not done:
            timestep += 1
            t += 1
            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            state = to_categorical(state, state_size)
            #========================================================
            if ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "1"):
                intrinsic_rewards = get_intrinsic_rewards(AI_type, state, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards1", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "2"):
                intrinsic_rewards = get_intrinsic_rewards2(AI_type, state, action, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards2", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "3"):
                intrinsic_rewards = get_intrinsic_rewards3(AI_type, state, action, ppo, n_agents, reward, 1)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards3", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "4"):
                intrinsic_rewards = get_intrinsic_rewards4(AI_type, state, action, ppo, n_agents, reward * 10, t, 100, 0.99)
                #print("intrinsic_rewards---", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "5"):
                intrinsic_rewards = get_intrinsic_rewards5(AI_type, state, ppo, n_agents, 1, 16)
                #print("intrinsic_rewards5", intrinsic_rewards)
            else:
                intrinsic_rewards = 0

            #reward_sum = reward + intrinsic_rewards
            reward_sum = reward
            #===========================================================
            # Saving reward and is_terminal:
            memory.rewards.append(reward_sum)
            #temp_int = memory.intrinsic_rewards.data.numpy()
            #temp_int = memory.intrinsic_rewards
            #print(temp_int)
            memory.intrinsic_rewards.append(intrinsic_rewards)
            memory.is_terminals.append(done)
            """
            try:
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1 ** 2, count1)
                adv_int = (memory.intrinsic_rewards - reward_rms.mean) / np.sqrt(reward_rms.var)
            except:
                adv_int = 0
            """
            """
            print(temp_int.data.numpy())
            mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
            reward_rms.update_from_moments(mean1, std1 ** 2, count1)
            adv_int = (memory.intrinsic_rewards - reward_rms.mean) / np.sqrt(reward_rms.var)
            """

            # update if its time
            if timestep % update_timestep == 0:
                temp_int = memory.intrinsic_rewards
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1 ** 2, count1)
                # normalise intrinsic returns by their running std
                # (np.asarray so the division broadcasts over the stored list)
                adv_int = np.asarray(temp_int) / np.sqrt(reward_rms.var)
                ppo.update(memory, adv_int)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            episode_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            #break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

        samp_rewards.append(episode_reward)
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards[-100:])
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

        print("Total reward in episode {} = {}".format(i_episode, episode_reward))
        print("Best_avg_reward =", np.round(best_avg_reward, 3),
              "Average_rewards =", np.round(avg_reward, 3))

    #env.save_replay()
    env.close()
    return avg_rewards, best_avg_reward, samp_rewards, "0"
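# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original source): `game` returns the
# running 100-episode averages, the best such average, the per-episode rewards
# and a placeholder string. A call plus a simple learning-curve plot might look
# like this; the matplotlib plotting is an assumption added for illustration.
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    avg_rewards, best_avg, samp_rewards, _ = game(
        N_episodes=2000, AI_type="PPO", Intrinsic_type="1")

    print("Best 100-episode average reward:", best_avg)
    plt.plot(samp_rewards, label="episode reward")
    plt.plot(range(100, 100 + len(avg_rewards)), avg_rewards,
             label="100-episode average")
    plt.xlabel("episode")
    plt.ylabel("reward")
    plt.legend()
    plt.show()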