Example no. 1
from collections import deque

import numpy as np
import torch

# QNetwork and the module-level `env` are defined elsewhere in the project.


def train(agent, n_episodes=2000, max_t=1000,
          eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    """Deep Q-Learning.

    Params
    ======
        agent: agent to train
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): starting value of epsilon, for epsilon-greedy action selection
        eps_end (float): minimum value of epsilon
        eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
    """
    scores = []                        # list containing scores from each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    eps = eps_start                    # initialize epsilon

    qnetwork_star = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork_star.load_state_dict(torch.load('dqn_16.pth'))
    qnetwork_star.eval()
    Q_threshold = 0.0
    savenumber = 0

    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)

            # shared autonomy: query the pretrained expert network without
            # overwriting the numpy state that gets stored in the replay buffer
            to_add = True
            with torch.no_grad():
                state_tensor = torch.from_numpy(state).float()
                Q_values = qnetwork_star(state_tensor).data.numpy()
                action_star = np.argmax(Q_values)
            # Q-value gap between the expert's preferred action and the agent's
            loss = Q_values[action_star] - Q_values[action]
            if loss > Q_threshold:
                to_add = False
                action = action_star

            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done, to_add)
            state = next_state
            score += reward
            if done:
                break
        Q_threshold += 0.1
        scores_window.append(score)       # save most recent score
        scores.append(score)              # save most recent score
        eps = max(eps_end, eps_decay*eps) # decrease epsilon
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)), end="")
        if i_episode % 100 == 0:
            savename = "assisted_" + str(savenumber) + ".pkl"
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_window)))
            torch.save(agent.qnetwork_local.state_dict(), savename)
            savenumber += 1
    return scores
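
A minimal usage sketch (not from the original source): it assumes the LunarLander-v2 gym environment that train() reads through the module-level env, plus an Agent class whose step() accepts the extra to_add flag, as in Example no. 5 below; the output filename is a placeholder.

import gym
import torch

env = gym.make('LunarLander-v2')   # train() reads this module-level env
env.seed(0)
agent = Agent(state_size=8, action_size=4, seed=0)   # Agent as in Example no. 5
scores = train(agent, n_episodes=500)
torch.save(agent.qnetwork_local.state_dict(), 'assisted_final.pth')   # placeholder filename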
Example no. 2
def main():

    env = gym.make("LanderCustom-v0")

    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()

    human = MLP()
    human.load_state_dict(torch.load('expert_bc.pt'))
    human.eval()
    softmax = torch.nn.Softmax(dim=0)

    episodes = 30
    scores = []
    Q_threshold = 1e-2

    for episode in range(episodes):

        if episode < 10:
            force_x = 0.0
        elif episode < 20:
            force_x = +500.0
        else:
            force_x = -500.0

        env.start_state(force_x, 0.0)
        state = env.reset()
        score = 0

        while True:

            with torch.no_grad():
                state = torch.from_numpy(state).float()
                Q_values = qnetwork(state).data.numpy()
                action_pred_dist = softmax(human(state).data).numpy()
            action_star = np.argmax(Q_values)
            action = np.random.choice(np.arange(4), p=action_pred_dist)

            loss = Q_values[action_star] - Q_values[action]
            # if loss > Q_threshold:
            #     action = action_star

            # env.render()
            state, reward, done, _ = env.step(action)
            score += reward
            if done:
                print("episode: ", episode, "score: ", score)
                break

        scores.append(score)

    env.close()
    print("The average score is: ", np.mean(np.array(scores)))
Example no. 3
def rollout(force_x, Q_threshold, modelname):

    env = gym.make("LanderCustom-v0")
    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()
    softmax = torch.nn.Softmax(dim=0)

    if modelname is not None:
        human = MLP()
        human.load_state_dict(torch.load(modelname))
        human.eval()

    env.start_state(force_x, 0.0)
    state = env.reset()
    score = 0
    dataset = []

    while True:

        # get robot and human actions
        with torch.no_grad():
            state = torch.from_numpy(state).float()
            Q_values = qnetwork(state).data.numpy()
            action_star = np.argmax(Q_values)
            action = np.random.choice(np.arange(4))
            if modelname is not None:
                action_pred_dist = softmax(human(state).data).numpy()
                action = np.random.choice(np.arange(4), p=action_pred_dist)

        # save data
        loss = Q_values[action_star] - Q_values[action]
        dataset.append(list(state.numpy()) + [action, loss, action_star])

        # shared autonomy
        if loss > Q_threshold:
            action = action_star

        # update environment
        # env.render()
        state, reward, done, _ = env.step(action)
        score += reward
        if done:
            # print("score: ", score)
            break

    env.close()
    return score, dataset
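
A hedged usage sketch for rollout(): it sweeps the same wind forces as Example no. 2 and reuses that example's expert_bc.pt behavioral-cloning model and 1e-2 threshold; the output filenames are placeholders.

import pickle

for force_x in [0.0, 500.0, -500.0]:
    score, dataset = rollout(force_x, Q_threshold=1e-2, modelname='expert_bc.pt')
    print('force_x:', force_x, 'score:', score)
    pickle.dump(dataset, open('rollout_fx_{}.pkl'.format(force_x), 'wb'))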
Example no. 4
class DQNAgent():
    """Interacts with and learns from the environment."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 hidden_layers=[64, 64],
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 learning_rate=5e-4,
                 update_every=4):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            hidden_layers (list of int; optional): number of nodes in each hidden layer
            buffer_size (int ; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            learning_rate (float; optional): learning rate
            update_every (int; optional): how often to update the network
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = learning_rate
        self.update_every = update_every

        # detect GPU device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        model_params = [state_size, action_size, seed, hidden_layers]
        self.qnetwork_local = QNetwork(*model_params).to(self.device)
        self.qnetwork_target = QNetwork(*model_params).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=self.lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, seed, self.device)
        # Initialize time step (for updating every self.update_every steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every self.update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Calculate target value
        self.qnetwork_target.eval()
        with torch.no_grad():
            Q_dash = self.qnetwork_target(next_states)
            Q_dash_max = torch.max(Q_dash, dim=1, keepdim=True)[0]
            y = rewards + gamma * Q_dash_max * (1 - dones)
        self.qnetwork_target.train()

        # Predict Q-value
        self.optimizer.zero_grad()
        Q = self.qnetwork_local(states)
        y_pred = Q.gather(1, actions)

        # Sum of squared TD errors
        loss = torch.sum((y - y_pred)**2)

        # Optimize
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
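
A minimal training-loop sketch for DQNAgent (not from the original source); the gym id, episode count, and epsilon schedule are illustrative and mirror the train() function of Example no. 1.

import gym
import numpy as np
from collections import deque

env = gym.make('LunarLander-v2')
agent = DQNAgent(state_size=8, action_size=4, seed=0)
eps = 1.0
scores_window = deque(maxlen=100)
for i_episode in range(1, 501):
    state = env.reset()
    score = 0
    for t in range(1000):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    scores_window.append(score)
    eps = max(0.01, 0.995 * eps)   # decay epsilon once per episode
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
        i_episode, np.mean(scores_window)), end="")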
Example no. 5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done, to_add):
        # Save experience in replay memory
        if to_add:
            self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
Example no. 6
class Agent():
    """Basic experience replay agent."""
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 buffer_size=int(1e5),
                 batch_size=64,
                 gamma=0.99,
                 tau=1e-3,
                 lr=5e-4,
                 update_every=4,
                 checkpoint_file='checkpoint.pth'):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            buffer_size (int; optional): replay buffer size
            batch_size (int; optional): minibatch size
            gamma (float; optional): discount factor
            tau (float; optional): for soft update of target parameters
            lr (float; optional): learning rate
            update_every (int; optional): how often to update the network
            checkpoint_file (str; optional): path used by save() and load()
            
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr = lr
        self.update_every = update_every
        self.checkpoint_file = checkpoint_file
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(self.device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(self.device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lr)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed,
                                   self.device)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.gamma)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target(next_states).detach().max(
            1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def train(self,
              env,
              n_episodes=2000,
              max_t=1000,
              eps_start=1.0,
              eps_end=0.01,
              eps_decay=0.995):
        """Train Agent by playing simulator

        Params
        ======
            n_episodes (int): maximum number of training episodes
            max_t (int): maximum number of timesteps per episode
            eps_start (float): starting value of epsilon, for epsilon-greedy action selection
            eps_end (float): minimum value of epsilon
            eps_decay (float): multiplicative factor (per episode) for decreasing epsilon
        """
        scores = []  # list containing scores from each episode
        moving_avgs = []  # list of moving averages
        scores_window = deque(maxlen=100)  # last 100 scores
        brain_name = env.brain_names[0]  # get the environment's default brain name
        env_info = env.reset(
            train_mode=False)[brain_name]  # initialize the environment
        eps = eps_start  # initialize epsilon
        for i_episode in range(1, n_episodes + 1):
            env_info = env.reset(train_mode=True)[brain_name]
            state = env_info.vector_observations[0]  # get the next state
            score = 0
            for t in range(max_t):
                action = self.act(state, eps).astype(int)
                env_info = env.step(action)[brain_name]
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)  # save most recent score
            moving_avg = np.mean(scores_window)  # calculate moving average
            moving_avgs.append(moving_avg)  # save most recent moving average
            eps = max(eps_end, eps_decay * eps)  # decrease epsilon
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_window)),
                  end="")
            if i_episode % 100 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                    i_episode, moving_avg))
            if moving_avg >= 13.0:
                print(
                    '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                    .format(i_episode - 100, moving_avg))
                self.save()
                break
        return scores, moving_avgs

    def test(self, env, num_episodes=10):
        brain_name = env.brain_names[0]
        scores = []  # list of scores
        avg_scores = []  # list of average scores
        for i_episode in range(1, num_episodes + 1):
            env_info = env.reset(
                train_mode=False)[brain_name]  # reset the environment
            state = env_info.vector_observations[0]  # get the current state
            score = 0  # initialize the score
            t = 1
            while True:
                action = self.act(state, eps=0)  # select an action
                env_info = env.step(action)[
                    brain_name]  # send the action to the environment
                next_state = env_info.vector_observations[
                    0]  # get the next state
                reward = env_info.rewards[0]  # get the reward
                done = env_info.local_done[0]  # see if episode has finished
                score += reward  # update the score
                state = next_state  # roll over the state to next time step
                # print('episode: {}, step: {}, reward: {}, score: {}, scores: {}'.format(i_episode, t, reward, score, scores))
                t += 1
                if done:  # exit loop if episode finished
                    scores.append(score)
                    avg_scores.append(np.mean(scores))
                    print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                        i_episode, np.mean(scores)))
                    break
        return scores, avg_scores

    def save(self):
        """Save the local Q-network weights to self.checkpoint_file."""
        torch.save(self.qnetwork_local.state_dict(), self.checkpoint_file)

    def load(self):
        """Load the local Q-network weights from self.checkpoint_file."""
        self.qnetwork_local.load_state_dict(torch.load(self.checkpoint_file))
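
A hedged usage sketch for this agent: its train() and test() methods expect a Unity ML-Agents environment exposing brain_names and vector_observations. The file path, state_size=37, and action_size=4 are assumptions (the 13.0 solve threshold suggests the Banana-collector task).

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='Banana.app')   # placeholder path
agent = Agent(state_size=37, action_size=4, seed=0,
              checkpoint_file='checkpoint.pth')
scores, moving_avgs = agent.train(env, n_episodes=2000)
agent.load()
agent.test(env, num_episodes=10)
env.close()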
Example no. 7
def main():

    env = gym.make("LanderCustom-v0")
    fx_init = float(sys.argv[1])
    Q_threshold = float(sys.argv[2])
    savename = 'test1.pkl'

    joystick = Joystick()
    qnetwork = QNetwork(state_size=8, action_size=4, seed=0)
    qnetwork.load_state_dict(torch.load('basic_lander.pth'))
    qnetwork.eval()

    human = MLP()
    human.load_state_dict(torch.load('mlp_model.pt'))
    human.eval()

    episodes = 10
    scores = []
    data = []
    env.start_state(fx_init, 0.0)

    for episode in range(episodes):

        state = env.reset()
        env.render()
        score = 0

        # wait for the human to press the start button on the joystick
        while True:

            action, start, stop = joystick.input()
            if start:
                break

        while True:

            action, start, stop = joystick.input()
            data.append(list(state) + [action])

            with torch.no_grad():
                state = torch.from_numpy(state).float()
                Q_values = qnetwork(state).data.numpy()
                action_pred_dist = human(state).data.numpy()
            action_star = np.argmax(Q_values)
            action_pred = np.argmax(action_pred_dist)

            # action = action_pred

            loss = Q_values[action_star] - Q_values[action]
            if loss > Q_threshold:
                action = action_star

            env.render()
            state, reward, done, _ = env.step(action)
            score += reward

            if done or stop:
                print(episode, score)
                # pickle.dump(data, open(savename, "wb" ))
                break
            time.sleep(0.025)

        scores.append(score)

    env.close()
    print(scores)
Example no. 8
        score_train, experiences = rollout(sa, mydata)
        score_test, _ = rollout(1e3, mydata)
        mytrainscore.append(score_train)
        mytestscore.append(score_test)
        learned_experiences = []
        corrections = 0
        for item in experiences:
            state, loss, accepted, action, action_star = (
                item[0:4], item[4], item[5], item[6], item[7])
            if not accepted:
                corrections += 1
            if loss > 0.5:
                learned_experiences.append(state + [action_star])
        mydata = mydata + learned_experiences
        print(count, " TrainScore: ", score_train, " TestScore: ", \
            score_test, " Correction: ", corrections, " Data: ", len(mydata))
    return mytrainscore, mytestscore


if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    env.seed(0)
    qnetwork = QNetwork(state_size=4, action_size=2, seed=0)
    qnetwork.load_state_dict(torch.load('models/dqn_cartpole.pth'))
    qnetwork.eval()
    scores = []
    for idx in range(25):
        mytrainscore, mytestscore = experience_matching()
        scores.append(mytestscore)
    pickle.dump(scores, open("results/failure/SomeSA.pkl", "wb"))
Example no. 9
def play(n_episodes=100, max_t=1000):
    # For the first training run, eps_start was 1.0; for the second, 0.5.
    # max_t was also lowered to 500 at one point, because landing would
    # otherwise take a long time.

    possible_actions = [
        # don't move
        [0, 0],

        # up
        [0.1, 0],
        [0.2, 0],
        [0.3, 0],
        [0.5, 0],
        [0.6, 0],
        [0.7, 0],
        [0.8, 0],
        [0.9, 0],
        [1, 0],

        # left
        [0, -0.6],
        [0, -0.7],
        [0, -0.8],
        [0, -0.9],

        # right
        [0, 0.6],
        [0, 0.7],
        [0, 0.8],
        [0, 0.9],

        # up-left
        # [0.8, -0.8],
        # [0.8, -0.65],
        # [0.6, -0.8],
        # [0.6, -0.65],
        # [0.7, -0.8],
        # [0.7, -0.65],

        # # up-right
        # [0.8, 0.8],
        # [0.8, 0.65],
        # [0.6, 0.8],
        # [0.6, 0.65],
        # [0.7, 0.8],
        # [0.7, 0.65]
    ]

    env = gym.make('LunarLanderContinuous-v2')
    # env = gym.make('LunarLander-v2')
    # action_space = env.action_space
    # print("action size=", action_space)
    n_actions = len(possible_actions)
    the_ship = QNetwork(state_size=8, action_size=n_actions)
    if loaded_model:  # `loaded_model` is expected to be a module-level checkpoint path
        the_ship.load_state_dict(torch.load(loaded_model))

    the_ship.eval()
    scores_window = deque(maxlen=100)
    state_arr = []
    scores = []

    # run n games

    for i_episode in range(1, n_episodes + 1):

        state = env.reset()
        score = 0

        # sample random transitions

        for t in range(max_t):
            state_arr = [state]
            input_s = torch.tensor(state_arr, dtype=torch.float32)
            with torch.no_grad():           # inference only; no gradients needed
                qs = the_ship(input_s)
            _, action = torch.max(qs, 1)    # greedy action
            action = action.numpy()[0]

            env.render()
            action_continious = discrete_action_to_continious_array(
                possible_actions, action)
            next_state, reward, done, _ = env.step(action_continious)

            score += reward

            state = next_state

            if done:
                break

        scores_window.append(score)
        scores.append(score)

        mean_score = np.mean(scores_window)

        print('\rEpisode {}\tAverage Score: {:.2f}, Last reward: {}'.format(
            i_episode, mean_score, reward),
              end="")
        if i_episode % 100 == 0:
            print(
                '\rEpisode {}\tAverage Score: {:.2f}, Last reward: {}'.format(
                    i_episode, mean_score, reward))
    return scores
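
The helper discrete_action_to_continious_array is referenced above but not included in this excerpt; a minimal sketch of the assumed behaviour (indexing into possible_actions and returning a float array for LunarLanderContinuous-v2):

import numpy as np

def discrete_action_to_continious_array(possible_actions, action):
    # Map a discrete action index to its continuous [main, lateral] thrust pair.
    return np.array(possible_actions[action], dtype=np.float32)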