Example #1
    def learn(self):
        # build the DDPG agent from the hyperparameter constants
        agent = DDPGAgent(
            env=self.env,
            replay_memory_size=REPLAY_MEMORY_SIZE,
            learning_rate=LEARNING_RATE,
            batch_size=MINIBATCH_SIZE,
            gamma=GAMMA,
            tau=TAU
        )

        stats = {'scores': [], 'avg': [], 'min': [], 'max': []}
        for ep in tqdm(range(1, self.episodes + 1), ascii=True, unit='episodes'):

            print(self.epsilon)
            action_stats = [0, 0]  # [policy actions, random actions]
            current_state = self.env.reset()
            current_state = self.convert_gray(current_state)  # preprocess the observation to grayscale

            done = False
            score = 0
            steps = 0

            while not done:
                steps += 1

                if np.random.random() > self.epsilon:
                    # exploit: let the DDPG actor pick the action
                    action_stats[0] += 1
                    action = agent.get_action(current_state)
                else:
                    # explore: sample a random action, capping action[2] and doubling action[1]
                    action_stats[1] += 1
                    action = self.env.action_space.sample()
                    action[2] = min(action[2], 0.2)
                    action[1] = action[1] * 2

                new_state, reward, done, _ = self.env.step(action)
                if ep % self.results_every_n_episodes == 0:
                    self.env.render()

                score += reward

                new_state = self.convert_gray(new_state)

                # store the transition in the replay buffer
                agent.memory.push(current_state, action, reward, new_state)

                # run a training update every 64 environment steps
                if steps % 64 == 0:
                    agent.update()

                current_state = new_state

                # decay epsilon towards a floor of 0.1
                if self.epsilon > 0.1:
                    self.epsilon -= self.epsilon_decay_value

                # end the episode early once the score turns negative
                if score < 0:
                    break

            print(action_stats)
            print(score)
            stats['scores'].append(score)
        self.env.close()
        return agent.actor
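
Inside the episode loop, agent.update() is where the actor and critic actually learn from replayed minibatches; in DDPG that step conventionally finishes with a Polyak (soft) update of the target networks controlled by the tau value passed to the constructor. The function below is only a minimal sketch of that soft update, assuming PyTorch networks; the function name and the way it would be called are assumptions, not part of the example.

import torch

def soft_update(online_net, target_net, tau):
    # Polyak averaging: target <- tau * online + (1 - tau) * target, parameter-wise
    with torch.no_grad():
        for param, target_param in zip(online_net.parameters(), target_net.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# hypothetical use at the end of agent.update():
# soft_update(agent.actor, agent.actor_target, TAU)
# soft_update(agent.critic, agent.critic_target, TAU)

A small tau (e.g. 0.001) keeps the target networks changing slowly, which is what keeps the critic's bootstrapped targets stable enough to learn from.
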
Example #2
		
		## extract data
		state = env_data["observation"]
		goal = env_data["desired_goal"]

		## logging rewards
		episode_reward = 0

		for step in range(STEPS_PER_EPISODE):
			## normalize state and goal
			# state = normalizer(state, 5.0)
			# goal = normalizer(goal, 5.0)

			## get action from behavioural policy
			action = agent.get_action(state, goal)
			if action is not None:
				action = noise.get_action(action, step)
			else:
				action = env.action_space.sample()

			time.sleep(0.002)
			next_state, reward, _, _ = env.step(action)
			
			env.render()

			episode_reward += reward

			## store transition - Standard Experience Replay
			state_rep = np.concatenate((state, goal), axis=0)
			next_state_rep = np.concatenate((next_state["observation"], goal), axis=0)
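
The snippet above passes the policy's action through noise.get_action(action, step), and the loop that follows does the same with an ou_noise object; neither helper is shown. In DDPG the usual choice is Ornstein-Uhlenbeck exploration noise, and the class below is only a minimal sketch of what such a helper might look like; its name, constructor arguments, and defaults are assumptions rather than code taken from the examples.

import numpy as np

class OUNoise:
    # Ornstein-Uhlenbeck process: temporally correlated noise for continuous actions
    def __init__(self, action_space, mu=0.0, theta=0.15, sigma=0.3):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.low = action_space.low
        self.high = action_space.high
        self.dim = action_space.shape[0]
        self.reset()

    def reset(self):
        # restart the process at its long-run mean (typically once per episode)
        self.state = np.ones(self.dim) * self.mu

    def get_action(self, action, t=0):
        # evolve the noise one step and add it to the action, keeping it within bounds
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.dim)
        self.state = self.state + dx
        return np.clip(action + self.state, self.low, self.high)

In the module-level training loop below, ou_noise.reset() would restart this process at the start of every episode so that exploration noise does not drift across episode boundaries.
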
batch_size = 128
rewards = []
avg_rewards = []

for episode in range(50):
    state = env.reset()
    #state = state['observation']
    #print("state = ", state)
    ou_noise.reset()
    episode_reward = 0

    for step in range(500):
        # render only the last few episodes
        if episode >= 45:
            env.render()
        #action = agent.get_action(state, ou_noise)
        action = agent.get_action(state, ou_noise)
        #print("action = ", action)
        new_state, reward, done, _ = env.step(action)
        #print("new state =", new_state)
        #new_state = new_state['observation']
        #print("new state =", new_state)
        # store the transition in the replay buffer
        agent.memory.push(state, action, reward, new_state, done)

        # train once the buffer holds at least one full minibatch
        if len(agent.memory) > batch_size:
            agent.train(batch_size)

        state = new_state
        episode_reward += reward

        if done:
            if episode == 0: