    ball_y = state[1]
    ball_speed_x = state[2]
    ball_speed_y = state[3]
    paddle_x = state[4]

    index += 2000 * ball_speed_x
    index += 1000 * ball_speed_y
    index += 100 * ball_x
    index += 10 * ball_y
    index += 1 * paddle_x
    return index


# Init environment
env = Game()

# Create Q table
action_space_size = env.n_actions
state_space_size = env.n_states_disc
q_table = np.zeros((state_space_size, action_space_size))  # n_states x n_actions

# Hyper parameters
num_episodes = 10000
max_steps_per_episode = 1000000
learning_rate = 0.1     # alpha
discount_rate = 0.99    # gamma
exploration_rate = 1    # epsilon
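
# Minimal sketch of the training step these hyperparameters drive: the standard
# tabular Q-learning update
#   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
# with epsilon-greedy exploration. env.reset() returning a discrete state index and
# env.step(action) returning (next_state, reward, done) are assumptions about the
# Game interface, not guaranteed by this listing.
import random

for episode in range(num_episodes):
    state = env.reset()                              # assumed: discrete state index
    for _ in range(max_steps_per_episode):
        # Epsilon-greedy action selection
        if random.random() < exploration_rate:
            action = random.randrange(action_space_size)
        else:
            action = int(np.argmax(q_table[state]))

        next_state, reward, done = env.step(action)  # assumed interface

        # Tabular Q-learning update
        td_target = reward + discount_rate * np.max(q_table[next_state])
        q_table[state, action] += learning_rate * (td_target - q_table[state, action])

        state = next_state
        if done:
            break
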
lambda_2 = 1.0     # weight of the supervised large-margin loss
lambda_3 = 1e-5    # weight of the L2 regularization loss
l_margin = 0.8     # margin used in the supervised loss
e_d = 0.005        # priority bonus for demonstration data
# TODO: try with 1, as that is what the paper said
e_a = 0.001        # priority bonus for agent-generated data

# Training Parameters
lr = 0.002
batch_size = 64
epsilon = 1.0      # Decreased over time
max_exploration_rate = 1
min_exploration_rate = 0.001
exploration_decay_rate = 0.01

# Initialize Environment
env = Game()

# Initialize Agent
agent = Agent(lr=lr, eps=epsilon, gamma=gamma, max_memory=max_memory,
              n_steps=n_steps, batch_size=batch_size, tau=tau,
              lambda_1=lambda_1, lambda_2=lambda_2, lambda_3=lambda_3,
              l_margin=l_margin)

# Load Demonstration Data