Example #1
 def __init__(self):
     self.agent = DQN()
     self.last_state = np.zeros(6 * NUM_A + 7 * NUM_F, dtype=int)
     self.last_action = 0
     self.last_througout = 0
     self.last_reward = 0
     self.key = []
Example #2
def init_agents(sess, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs):
    agents = [
        DQN(sess, 0, info_state_size, num_actions, hidden_layers_sizes,
            **kwargs),
        agent.RandomAgent(1)
    ]
    sess.run(tf.global_variables_initializer())

    return agents
Example #3
 def initialize_policy(self):
     if self.args.policy == 'dqn':
         q_network = FlattenMlp(input_size=self.args.augmented_obs_dim,
                                output_size=self.args.act_space.n,
                                hidden_sizes=self.args.dqn_layers).to(
                                    ptu.device)
         self.agent = DQN(
             q_network,
             # optimiser_vae=self.optimizer_vae,
             lr=self.args.policy_lr,
             gamma=self.args.gamma,
             tau=self.args.soft_target_tau,
         ).to(ptu.device)
     else:
         # assert self.args.act_space.__class__.__name__ == "Box", (
         #     "Can't train SAC with discrete action space!")
         q1_network = FlattenMlp(
             input_size=self.args.augmented_obs_dim + self.args.action_dim,
             output_size=1,
             hidden_sizes=self.args.dqn_layers).to(ptu.device)
         q2_network = FlattenMlp(
             input_size=self.args.augmented_obs_dim + self.args.action_dim,
             output_size=1,
             hidden_sizes=self.args.dqn_layers).to(ptu.device)
         policy = TanhGaussianPolicy(
             obs_dim=self.args.augmented_obs_dim,
             action_dim=self.args.action_dim,
             hidden_sizes=self.args.policy_layers).to(ptu.device)
         self.agent = SAC(
             policy,
             q1_network,
             q2_network,
             actor_lr=self.args.actor_lr,
             critic_lr=self.args.critic_lr,
             gamma=self.args.gamma,
             tau=self.args.soft_target_tau,
             use_cql=self.args.use_cql if 'use_cql' in self.args else False,
             alpha_cql=self.args.alpha_cql
             if 'alpha_cql' in self.args else None,
             entropy_alpha=self.args.entropy_alpha,
             automatic_entropy_tuning=self.args.automatic_entropy_tuning,
             alpha_lr=self.args.alpha_lr,
             clip_grad_value=self.args.clip_grad_value,
         ).to(ptu.device)
Example #4
def dqn_train(env_name, device="cpu", seed=0):
    from algorithms.dqn import DQN

    hyper = copy.deepcopy(HYPER_PARAM["dqn"][env_name])
    pixel_input = hyper.pop("pixel_input")
    buffer_size = hyper.pop("buffer_size")

    env = gym.make(env_name)

    dqn = DQN(env, pixel_input, buffer_size, device, seed=seed)
    dqn.learn(**hyper)

    dqn.save(f"{ROOT}/pretrain/{env_name}/dqn.pth")
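Example #4 pops pixel_input and buffer_size out of a per-environment dictionary and forwards everything that is left to dqn.learn(**hyper). The following is a minimal sketch of what one HYPER_PARAM entry could look like; only pixel_input and buffer_size are implied by the snippet, and the remaining keys are hypothetical learn() keyword arguments.

# Hypothetical HYPER_PARAM layout matching the pops in dqn_train() above.
HYPER_PARAM = {
    "dqn": {
        "CartPole-v1": {
            "pixel_input": False,   # popped and passed to the DQN constructor
            "buffer_size": 50000,   # popped and passed to the DQN constructor
            # everything left over is forwarded to dqn.learn(**hyper)
            "total_steps": 100000,  # assumed learn() argument
            "batch_size": 64,       # assumed learn() argument
            "gamma": 0.99,          # assumed learn() argument
        },
    },
}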
Example #5
def main():
    # initialize OpenAI Gym env and dqn agent
    #   env = gym.make(ENV_NAME)
    env = AutoEnv()
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            print(action)
            print(toMeterList(action, env.action_space.content))
            meter_l = toMeterList(action, env.action_space.content)
            n_action = genAction(state, meter_l)
            next_state, reward, done, _ = env.step(n_action)
            print("State:")
            print_state(state)
            print("Next State: ")
            print_state(next_state)
            print("Env Reward: ", reward)
            # Define reward for agent
            # reward = -1 if done else 0.1
            reward = rewardReg(reward)
            print("Agent Reward: ", reward)
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    #   env.render()
                    action = agent.action(state)  # direct action for test
                    meter_l = toMeterList(action, env.action_space.content)
                    print(action, meter_l)
                    n_action = genAction(state, meter_l)
                    state, reward, done, _ = env.step(n_action)
                    print("Reward: ", reward)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:',
                  ave_reward)
Example #6
def dqn_eval(env_name):
    from algorithms.dqn import DQN

    env = gym.make(env_name)
    dqn = DQN.load(f"{ROOT}/pretrain/{env_name}/dqn.pth", env=env)
    env = dqn.env

    for _ in range(5):
        obs = env.reset()
        env.render()
        ep_reward = 0
        while True:
            action = dqn.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            ep_reward += reward

            if done:
                print(f"reward: {ep_reward}")
                break
Example #7
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

# Get screen size so that we can initialize layers correctly based on the shape
# returned from OpenAI Gym. Typical dimensions at this point are close to 3x40x90,
# which is the result of a clamped and down-scaled render buffer in get_screen().
init_screen = get_screen(env, device)
_, _, screen_height, screen_width = init_screen.shape

# Get number of actions from gym action space
n_actions = env.action_space.n

# Init the policy and the target net
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
if resume_model:
    policy_net.load_state_dict(torch.load(model_path))

target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Init the optimizer
optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000, Transition)

steps_done = 0

episode_durations = []
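The fragment above defines EPS_START, EPS_END, EPS_DECAY and steps_done but does not show the action-selection routine that consumes them. Below is a minimal epsilon-greedy sketch in the style of the PyTorch DQN tutorial, assuming the policy_net, n_actions and device names defined above; it is an illustration, not the original script's code.

import math
import random

import torch

def select_action(state):
    # Epsilon decays exponentially from EPS_START towards EPS_END
    # at a rate controlled by EPS_DECAY as steps_done grows.
    global steps_done
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1.0 * steps_done / EPS_DECAY)
    steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Exploit: pick the action with the largest predicted Q-value.
            return policy_net(state).max(1)[1].view(1, 1)
    # Explore: sample a uniformly random action.
    return torch.tensor([[random.randrange(n_actions)]],
                        device=device, dtype=torch.long)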
Example #8
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "epsilon_decay_duration": int(0.6 * FLAGS.num_train_episodes),
        "epsilon_start": 0.8,
        "epsilon_end": 0.001,
        "learning_rate": 1e-3,
        "learn_every": FLAGS.learn_every,
        "batch_size": 128,
        "max_global_gradient_norm": 10,
    }
    import agent.agent as agent
    ret = [0]
    max_len = 2000

    with tf.Session() as sess:
        # agents = [DQN(sess, _idx, info_state_size,
        #                   num_actions, hidden_layers_sizes, **kwargs) for _idx in range(2)]  # for self play
        agents = [
            agent.RandomAgent(1),
            DQN(sess, 1, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs)
        ]
        sess.run(tf.global_variables_initializer())
        # train the agent
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.save_every == 0:
                if not os.path.exists("saved_model/random_vs_dqn"):
                    os.mkdir('saved_model/random_vs_dqn')
                agents[1].save(checkpoint_root='saved_model/random_vs_dqn',
                               checkpoint_name='random_vs_dqn_{}'.format(ep + 1))
                print('saved %d' % (ep + 1))
            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                agent_output = agents[player_id].step(time_step)
                action_list = agent_output.action
                # print(action_list)
                time_step = env.step(action_list)
            for agent in agents:
                agent.step(time_step)
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]

        # evaluate the trained agent
        agents[1].restore("saved_model/random_vs_dqn/random_vs_dqn_10000")
        ret = []
        for ep in range(FLAGS.num_eval):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:
                    agent_output = agents[player_id].step(time_step)
                else:
                    agent_output = agents[player_id].step(
                        time_step,
                        is_evaluation=True,
                        add_transition_record=False)
                action_list = agent_output.action
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            # for agent in agents:
            agents[0].step(time_step)
            agents[1].step(time_step,
                           is_evaluation=True,
                           add_transition_record=False)
            ret.append(time_step.rewards[0])
        print(np.mean(ret))
        # print(ret)

    print('Time elapsed:', time.time() - begin)
Example #9
n_actions = mdps[0].action_space.n

layers = [l1]
if l2 > 0:
    layers.append(l2)

if not dqn:
    # Create BellmanOperator
    operator = MellowBellmanOperator(kappa, tau, xi, mdps[0].gamma, state_dim,
                                     action_dim)
    # Create Q Function
    Q = MLPQFunction(state_dim, n_actions, layers=layers)
else:
    Q, operator = DQN(state_dim,
                      action_dim,
                      n_actions,
                      mdps[0].gamma,
                      layers=layers)


def run(mdp, seed=None, idx=0):
    Q._w = ws[idx]
    return learn(mdp,
                 Q,
                 operator,
                 max_iter=max_iter,
                 buffer_size=buffer_size,
                 batch_size=batch_size,
                 alpha=alpha,
                 train_freq=train_freq,
                 eval_freq=eval_freq,
Example #10
if l2 > 0:
    layers.append(l2)

if not dqn:
    # Create BellmanOperator
    operator = MellowBellmanOperator(kappa, tau, xi, temp_mdp.gamma, state_dim,
                                     action_dim)
    # Create Q Function
    Q = MLPQFunction(state_dim,
                     n_actions,
                     layers=layers,
                     activation=activation)
else:
    Q, operator = DQN(state_dim,
                      action_dim,
                      n_actions,
                      temp_mdp.gamma,
                      layers=layers)


def run(data, seed=None):
    return learn(Q,
                 operator,
                 data,
                 demand,
                 min_env_flow,
                 max_iter=max_iter,
                 buffer_size=buffer_size,
                 batch_size=batch_size,
                 alpha=alpha,
                 train_freq=train_freq,
Example #11
def main(unused_argv):
    begin = time.time()
    env = Go()
    info_state_size = env.state_size
    num_actions = env.action_size

    hidden_layers_sizes = [int(l) for l in FLAGS.hidden_layers_sizes]
    kwargs = {
        "replay_buffer_capacity": FLAGS.replay_buffer_capacity,
        "epsilon_decay_duration": int(0.6*FLAGS.num_train_episodes),
        "epsilon_start": 0.8,
        "epsilon_end": 0.001,
        "learning_rate": 1e-3,
        "learn_every": FLAGS.learn_every,
        "batch_size": 128,
        "max_global_gradient_norm": 10,
    }
    import agent.agent as agent
    ret = [0]
    max_len = 2000

    with tf.Session() as sess:
        agents = [
            DQN(sess, 0, info_state_size, num_actions, hidden_layers_sizes,
                **kwargs),
            agent.Random_Rollout_MCTS_Agent(max_simulations=50)
        ]
        sess.run(tf.global_variables_initializer())

        # train the agent
        for ep in range(FLAGS.num_train_episodes):
            if (ep + 1) % FLAGS.eval_every == 0:
                losses = agents[0].loss
                logging.info("Episodes: {}: Losses: {}, Rewards: {}".format(ep + 1, losses, np.mean(ret)))
                with open('log/log_{}_{}'.format(os.environ.get('BOARD_SIZE'), begin), 'a+') as log_file:
                    log_file.write("{}, {}\n".format(ep + 1, np.mean(ret)))
            if (ep + 1) % FLAGS.save_every == 0:
                if not os.path.exists("saved_model"):
                    os.mkdir('saved_model')
                agents[0].save(checkpoint_root='saved_model', checkpoint_name='{}'.format(ep+1))
            time_step = env.reset()  # a go.Position object
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:
                    agent_output = agents[player_id].step(time_step)
                else:
                    agent_output = agents[player_id].step(time_step, env)
                action_list = agent_output.action
                time_step = env.step(action_list)
            # for agent in agents:
            agents[0].step(time_step)
            agents[1].step(time_step, env)
            if len(ret) < max_len:
                ret.append(time_step.rewards[0])
            else:
                ret[ep % max_len] = time_step.rewards[0]

        # evaluate the trained agent
        agents[0].restore("saved_model/10000")
        ret = []
        for ep in range(FLAGS.num_eval):
            time_step = env.reset()
            while not time_step.last():
                player_id = time_step.observations["current_player"]
                if player_id == 0:
                    agent_output = agents[player_id].step(time_step, is_evaluation=True, add_transition_record=False)
                else:
                    agent_output = agents[player_id].step(time_step, env)
                action_list = agent_output.action
                time_step = env.step(action_list)

            # Episode is over, step all agents with final info state.
            # for agent in agents:
            agents[0].step(time_step, is_evaluation=True, add_transition_record=False)
            agents[1].step(time_step, env)
            ret.append(time_step.rewards[0])
        print(np.mean(ret))

    print('Time elapsed:', time.time()-begin)
Example #12
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 17 18:55:03 2019

@author: clytie
"""

if __name__ == "__main__":
    import numpy as np
    import time
    from tqdm import tqdm
    from env.dist_env import BreakoutEnv
    from algorithms.dqn import DQN

    DQNetwork = DQN(4, (84, 84, 4),
                    epsilon_schedule=lambda x: 0,
                    save_path="./dqn_log")
    env = BreakoutEnv(4999, num_envs=1, mode="test")
    env_ids, states, _, _ = env.start()
    for _ in tqdm(range(10000)):
        time.sleep(0.1)
        actions = DQNetwork.get_action(np.asarray(states))
        env_ids, states, _, _ = env.step(env_ids, actions)
    env.close()
Example #13
    def initialize_policy(self):

        if self.args.policy == 'dqn':
            assert self.args.act_space.__class__.__name__ == "Discrete", (
                "Can't train DQN with continuous action space!")
            q_network = FlattenMlp(input_size=self.args.obs_dim,
                                   output_size=self.args.act_space.n,
                                   hidden_sizes=self.args.dqn_layers)
            self.agent = DQN(
                q_network,
                # optimiser_vae=self.optimizer_vae,
                lr=self.args.policy_lr,
                gamma=self.args.gamma,
                eps_init=self.args.dqn_epsilon_init,
                eps_final=self.args.dqn_epsilon_final,
                exploration_iters=self.args.dqn_exploration_iters,
                tau=self.args.soft_target_tau,
            ).to(ptu.device)
        # elif self.args.policy == 'ddqn':
        #     assert self.args.act_space.__class__.__name__ == "Discrete", (
        #         "Can't train DDQN with continuous action space!")
        #     q_network = FlattenMlp(input_size=self.args.obs_dim,
        #                            output_size=self.args.act_space.n,
        #                            hidden_sizes=self.args.dqn_layers)
        #     self.agent = DoubleDQN(
        #         q_network,
        #         # optimiser_vae=self.optimizer_vae,
        #         lr=self.args.policy_lr,
        #         eps_optim=self.args.dqn_eps,
        #         alpha_optim=self.args.dqn_alpha,
        #         gamma=self.args.gamma,
        #         eps_init=self.args.dqn_epsilon_init,
        #         eps_final=self.args.dqn_epsilon_final,
        #         exploration_iters=self.args.dqn_exploration_iters,
        #         tau=self.args.soft_target_tau,
        #     ).to(ptu.device)
        elif self.args.policy == 'sac':
            assert self.args.act_space.__class__.__name__ == "Box", (
                "Can't train SAC with discrete action space!")
            q1_network = FlattenMlp(input_size=self.args.obs_dim +
                                    self.args.action_dim,
                                    output_size=1,
                                    hidden_sizes=self.args.dqn_layers)
            q2_network = FlattenMlp(input_size=self.args.obs_dim +
                                    self.args.action_dim,
                                    output_size=1,
                                    hidden_sizes=self.args.dqn_layers)
            policy = TanhGaussianPolicy(obs_dim=self.args.obs_dim,
                                        action_dim=self.args.action_dim,
                                        hidden_sizes=self.args.policy_layers)
            self.agent = SAC(
                policy,
                q1_network,
                q2_network,
                actor_lr=self.args.actor_lr,
                critic_lr=self.args.critic_lr,
                gamma=self.args.gamma,
                tau=self.args.soft_target_tau,
                entropy_alpha=self.args.entropy_alpha,
                automatic_entropy_tuning=self.args.automatic_entropy_tuning,
                alpha_lr=self.args.alpha_lr).to(ptu.device)
        else:
            raise NotImplementedError
Example #14
class DQNscheduler():
    def __init__(self):
        self.agent = DQN()
        self.last_state = np.zeros(6 * NUM_A + 7 * NUM_F, dtype=int)
        self.last_action = 0
        self.last_througout = 0
        self.last_reward = 0
        self.key = []

    def train(self, actq, cptq):
        '''
        Generate a control strategy and train the model based on the current flow information.
        Input:
            actq: the information of active flows
            cptq: the information of completed flows
        '''
        # state
        state = self.stateparser(actq, cptq)

        # get action
        action = self.agent.egreedy_action(state)  # e-greedy action for train

        # reward
        current_throughout = self.throughout(cptq)
        if self.last_througout == 0:
            if current_throughout == 0:
                reward = 0
            else:
                reward = 1
        else:
            reward = current_throughout / self.last_througout
            if reward > 1:
                # (0, 1) U (1, +)
                reward = reward / 10
            else:
                # (-1, 0)
                reward = reward - 1

        done = False

        if reward != 0:
            # train on the previous transition
            self.agent.perceive(self.last_state, self.last_action,
                                self.last_reward, state, done)

        # record state, action, reward, and throughput
        self.last_state = state
        self.last_action = action
        self.last_reward = reward
        self.last_througout = current_throughout

        # analyze the meaning of the action
        ret = self.actionparser(action)
        infostr = self.getinfo(state, action, reward)

        return ret, infostr

    def throughout(self, cptq):
        '''
        Compute the total bandwidth of the completed flows.
        Input:
            cptq: the information of completed flows
        '''
        res = 0.0
        for index, row in cptq.iterrows():
            res += row['size'] / row['duration']
        return res

    def stateparser(self, actq, cptq):
        '''
        Convert the active and completed flow information into a 1*136 state vector.
        Input:
            actq: the information of active flows
            cptq: the information of completed flows
        '''
        temp = actq.sort_values(by='sentsize')
        active_num = NUM_A
        finished_num = NUM_F
        state = np.zeros(active_num * 6 + finished_num * 7, dtype=np.int)
        i = 0
        self.key = []
        self.qindex_list = []
        for index, row in temp.iterrows():
            if i >= active_num:  # keep at most NUM_A active flows
                break
            else:
                state[6 * i] = row['src']
                state[6 * i + 1] = row['dst']
                state[6 * i + 2] = row['protocol']
                state[6 * i + 3] = row['sp']
                state[6 * i + 4] = row['dp']
                state[6 * i + 5] = row['priority']
                self.key.append(index)
                self.qindex_list.append(row['qindex'])
            i += 1
        i = active_num
        for index, row in cptq.iterrows():
            state[6 * active_num + 7 * (i - active_num)] = row['src']
            state[6 * active_num + 7 * (i - active_num) + 1] = row['dst']
            state[6 * active_num + 7 * (i - active_num) + 2] = row['protocol']
            state[6 * active_num + 7 * (i - active_num) + 3] = row['sp']
            state[6 * active_num + 7 * (i - active_num) + 4] = row['dp']
            state[6 * active_num + 7 * (i - active_num) + 5] = row['duration']
            state[6 * active_num + 7 * (i - active_num) + 6] = row['size']
            i += 1
        return state

    def actionparser(self, action):
        '''
        Convert an 11-bit action integer into per-flow control information.
        Input:
            action: 11-bit integer encoding the action
        '''
        bstr = ('{:0%sb}' % (NUM_A)).format(action)
        res = {}
        for i in range(len(self.key)):
            res[self.key[i]] = int(bstr[-1 - i])
        return res

    def getinfo(self, state, action, reward):
        '''
        Generate the log info for this training step.
        Input:
            state: current state vector
            action: chosen action
            reward: current reward
        '''
        infostr = ''
        line = '%50s\n' % (50 * '*')
        rewardstr = 'Evaluation Reward: %f\n' % reward
        policy = 'State and action:\n'
        bstr = ('{:0%sb}' % (NUM_A)).format(action)
        for i in range(NUM_A):
            if i >= len(self.key):
                break
            else:
                policy += 'Queue index:%d, Five tuple={%d,%d,%d,%d,%d}, priority: %d, action:%s\n' % (
                    self.qindex_list[i], state[6 * i], state[6 * i + 1],
                    state[6 * i + 2], state[6 * i + 3], state[6 * i + 4],
                    state[6 * i + 5], bstr[-1 - i])
        infostr = line + rewardstr + policy + line
        return infostr
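Example #14's actionparser treats the agent's integer action as a NUM_A-bit binary string, with one bit per active flow (least-significant bit first). A standalone illustration of that mapping with made-up values (NUM_A and the flow keys below are hypothetical):

# Illustration of the bit mapping used by actionparser(): the i-th flow key
# receives the i-th bit of the action, counted from the right.
NUM_A = 4                                      # hypothetical flow count
keys = ['flow0', 'flow1', 'flow2', 'flow3']    # made-up flow identifiers
action = 5                                     # binary 0101

bstr = ('{:0%sb}' % NUM_A).format(action)
decisions = {keys[i]: int(bstr[-1 - i]) for i in range(len(keys))}

print(bstr)        # 0101
print(decisions)   # {'flow0': 1, 'flow1': 0, 'flow2': 1, 'flow3': 0}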
Example #15
    import logging

    import numpy as np
    from tqdm import tqdm
    from env.dist_env import BreakoutEnv
    from algorithms.dqn import ReplayBuffer, DQN

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s|%(levelname)s|%(message)s')

    memory = ReplayBuffer(max_size=500000)
    env = BreakoutEnv(49999, num_envs=20)
    env_ids, states, rewards, dones = env.start()
    print("pre-train: ")
    for _ in tqdm(range(5000)):
        env_ids, states, rewards, dones = env.step(
            env_ids, np.random.randint(env.action_space, size=env.num_srd))
    trajs = env.get_episodes()

    memory.add(trajs)
    DQNetwork = DQN(env.action_space, env.state_space, save_path="./dqn_log")

    print("start train: ")
    for step in range(10000000):
        for _ in range(20):
            actions = DQNetwork.get_action(np.asarray(states))
            env_ids, states, rewards, dones = env.step(env_ids, actions)
        if step % 10 == 0:
            logging.info(
                f'>>>>{env.mean_reward}, nth_step{step}, buffer{len(memory)}')
        trajs = env.get_episodes()
        memory.add(trajs)
        for _ in range(10):
            batch_samples = memory.sample(32)
            DQNetwork.update(batch_samples, sw_dir="dqn")
Example #16
def main(params):
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    # declare environment
    is_goal = True
    if params['environment'] == 'acrobot_simple':
        env = SimpleAcrobotEnv(stochastic=False, max_steps=400, mean_goal=-1.5)
        s, goal = env.reset()
    elif params['environment'] == 'windy_grid_world':
        env = GridworldEnv()
        s, goal = env.perform_reset()
    else:
        env = gym.make(params['environment'])
        s = env.reset()
        goal = s
        is_goal = False

    state_shape = s.shape[0] + goal.shape[0]

    # select type of experience replay using the parameters
    if params['buffer'] == ReplayBuffer:
        buffer = ReplayBuffer(params['buffer_size'])
        loss_function = params['loss_function']()
    elif params['buffer'] == PrioritizedReplayBuffer:
        buffer = PrioritizedReplayBuffer(params['buffer_size'], params['PER_alpha'], params['PER_beta'])
        loss_function = params['loss_function'](reduction='none')
    elif params['buffer'] == HindsightReplayBuffer:
        buffer = HindsightReplayBuffer(params['buffer_size'])
        loss_function = params['loss_function']()
    elif params['buffer'] == PrioritizedHindsightReplayBuffer:
        buffer = PrioritizedHindsightReplayBuffer(params['buffer_size'], params['PER_alpha'], params['PER_beta'])
        loss_function = params['loss_function'](reduction='none')
    else:
        raise ValueError('Buffer type not found.')

    # select learning algorithm using the parameters
    if params['algorithm'] == DQN:
        algorithm = DQN(state_shape,
                        env.action_space.n,
                        loss_function=loss_function,
                        optimizer=params['optimizer'],
                        lr=params['lr'],
                        gamma=params['gamma'],
                        epsilon_delta=1 / (params['epsilon_delta_end'] * params['train_steps']),
                        epsilon_min=params['epsilon_min'])
    elif params['algorithm'] == algo_DQN:
        algorithm = algo_DQN()
    else:
        raise ValueError('Algorithm type not found.')

    losses = []
    returns = []
    train_steps = 0
    episodes_length = []
    episodes_length_test = []

    print('Starting to train:', type(buffer))
    test_lengths = test(algorithm, env)
    episodes_length_test.append(test_lengths)

    while train_steps < params['train_steps']:
        if isinstance(env, GridworldEnv):
            obs_t, goal = env.perform_reset()
        elif is_goal:
            obs_t, goal = env.reset()
        else:
            obs_t = env.reset()
            goal = np.zeros_like(obs_t)

        t = 0
        episode_loss = []
        episode_rewards = []
        episode_transitions = []
        while train_steps < params['train_steps']:
            # env.render()
            action = algorithm.predict(np.hstack((obs_t, goal)))
            t += 1
            if isinstance(env, GridworldEnv):
                obs_tp1, reward, done, _ = env.perform_step(action)
                transition = (obs_t, goal, action, reward, obs_tp1, done)
            elif is_goal:
                obs_tp1, reward, done, _, gr = env.step(action)
                transition = (obs_t, goal, action, reward, obs_tp1, gr, done)
            else:
                obs_tp1, reward, done, _ = env.step(action)
                transition = (obs_t, goal, action, reward, obs_tp1, done)
            episode_transitions.append(transition)
            episode_rewards.append(reward)
            if len(buffer) >= params['batch_size']:
                loss = update(algorithm, buffer, params, train_steps)
                train_steps += 1
                episode_loss.append(loss)
                if train_steps % params['test_every'] == 0:
                    test_lengths = test(algorithm, env)
                    episodes_length_test.append(test_lengths)
            # termination condition
            if done:
                episodes_length.append(t)
                break

            obs_t = obs_tp1

        special_goal = isinstance(env, (CustomAcrobotEnv, SimpleAcrobotEnv))
        add_transitions_to_buffer(episode_transitions, buffer, special_goal=special_goal)
        losses.append(np.mean(episode_loss))
        returns.append(np.sum(episode_rewards))

    env.close()
    return episodes_length_test, returns, losses
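The training loop above collects goal-augmented transitions and hands them to add_transitions_to_buffer, whose body is not shown. The sketch below shows the kind of "final"-goal hindsight relabelling such a helper commonly performs for a HindsightReplayBuffer; it assumes the (obs, goal, action, reward, next_obs, [goal_reached,] done) tuple layout used above and a buffer.add(transition) method, and it is an assumption rather than the repository's actual implementation.

import numpy as np

def add_transitions_to_buffer_sketch(episode_transitions, buffer):
    # Hindsight ("final" strategy) relabelling sketch: store each transition
    # with its original goal, then again with the episode's final achieved
    # observation substituted as the goal.
    if not episode_transitions:
        return
    final_obs = episode_transitions[-1][4]          # next_obs of the last step
    for obs, goal, action, reward, next_obs, *rest in episode_transitions:
        done = rest[-1]
        buffer.add((obs, goal, action, reward, next_obs, done))
        # Sparse hindsight reward: 0 when the relabelled goal is reached.
        hindsight_reward = 0.0 if np.allclose(next_obs, final_obs) else -1.0
        buffer.add((obs, final_obs, action, hindsight_reward, next_obs, done))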