def learn(self):
    """Train a DDPG agent on ``self.env`` with epsilon-greedy exploration.

    Runs ``self.episodes`` episodes. Each step either queries the agent's
    policy or samples a random action (with brake clamped and steering
    doubled — env-specific shaping, presumably CarRacing; TODO confirm),
    pushes the transition into replay memory, and updates the networks
    every 64 steps. Epsilon decays per step down to a floor of 0.1, and an
    episode is cut short as soon as its cumulative reward goes negative.

    Returns:
        The trained actor network of the DDPG agent.
    """
    learner = DDPGAgent(
        env=self.env,
        replay_memory_size=REPLAY_MEMORY_SIZE,
        learning_rate=LEARNING_RATE,
        batch_size=MINIBATCH_SIZE,
        gamma=GAMMA,
        tau=TAU,
    )
    # NOTE(review): only 'scores' is ever appended below; 'avg'/'min'/'max'
    # stay empty — possibly filled by a caller, or dead. Verify.
    stats = {'scores': [], 'avg': [], 'min': [], 'max': []}
    for episode in tqdm(range(1, self.episodes + 1), ascii=True, unit='episodes'):
        print(self.epsilon)
        # [policy picks, random picks] for this episode
        pick_counts = [0, 0]
        state = self.convert_gray(self.env.reset())
        done = False
        total_reward = 0
        step_count = 0
        while not done:
            step_count += 1
            if np.random.random() <= self.epsilon:
                # Explore: random action with env-specific shaping.
                pick_counts[1] += 1
                action = self.env.action_space.sample()
                action[2] = 0.2 if action[2] > 0.2 else action[2]
                action[1] *= 2
            else:
                # Exploit the current policy.
                pick_counts[0] += 1
                action = learner.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            # Render only on reporting episodes to keep training fast.
            if episode % self.results_every_n_episodes == 0:
                self.env.render()
            total_reward += reward
            next_state = self.convert_gray(next_state)
            learner.memory.push(state, action, reward, next_state)
            # Batch the network updates instead of updating every step.
            if step_count % 64 == 0:
                learner.update()
            state = next_state
            # Per-step epsilon decay with a soft floor at 0.1 (the last
            # decrement may dip slightly below it).
            if self.epsilon > 0.1:
                self.epsilon -= self.epsilon_decay_value
            # Abort hopeless episodes early.
            if total_reward < 0:
                break
        print(pick_counts)
        print(total_reward)
        stats['scores'].append(total_reward)
    self.env.close()
    return learner.actor
## extract data state = env_data["observation"] goal = env_data["desired_goal"] ## logging rewards episode_reward = 0 for step in range(STEPS_PER_EPISODE): ## normalize state and goal # state = normalizer(state, 5.0) # goal = normalizer(goal, 5.0) ## get action from behavioural policy action = agent.get_action(state, goal) if action is not None: action = noise.get_action(action, step) else: action = env.action_space.sample() time.sleep(0.002) next_state, reward, _, _ = env.step(action) env.render() episode_reward += reward ## store transition - Standard Experience Replay state_rep = np.concatenate((state, goal), axis=0) next_state_rep = np.concatenate((next_state["observation"], goal), axis=0)
# Minibatch size used for each agent.train() call once the replay buffer
# holds enough samples.
batch_size = 128
rewards = []       # per-episode returns (appended later, off-screen — verify)
avg_rewards = []   # running averages (appended later, off-screen — verify)
for episode in range(50):
    state = env.reset()
    # Dict-observation unwrapping is disabled; env presumably returns a flat
    # array here — confirm against the env in use.
    #state = state['observation']
    #print("state = ", state)
    # Reset the Ornstein-Uhlenbeck exploration noise at each episode start.
    ou_noise.reset()
    episode_reward = 0
    for step in range(500):
        # Only render the last 5 episodes to keep training fast.
        if episode >= 45:
            env.render()
        #action = agent.get_action(state, ou_noise)
        action = agent.get_action(state, ou_noise)
        #print("action = ", action)
        new_state, reward, done, _ = env.step(action)
        #print("new state =", new_state)
        #new_state = new_state['observation']
        #print("new state =", new_state)
        agent.memory.push(state, action, reward, new_state, done)
        # Train only once the buffer can fill a minibatch.
        if len(agent.memory) > batch_size:
            agent.train(batch_size)
        state = new_state
        episode_reward += reward
        if done:
            # NOTE(review): fragment is truncated here — the body of this
            # branch continues beyond the visible source.
            if episode == 0: