Example #1
def eval_greedy_agent(env, model):
    """Run one episode with the greedy policy (epsilon = 0) and return its total reward."""
    episode_total_reward = 0
    num_actions = env.action_space.n
    state = env.reset()
    while True:
        # epsilon = 0: purely greedy action selection
        action = epsilon_greedy_act(num_actions, state, model, 0)
        next_state, rew, done, _ = env.step(action)
        state = next_state
        episode_total_reward += rew
        if done:
            break
    # reset so the environment is ready for subsequent calls
    env.reset()
    return episode_total_reward
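A minimal usage sketch: `make_env` and `model` are placeholders (not part of the original code), and the old four-value Gym `step`/`reset` API used above is assumed.

import numpy as np

# Hypothetical usage of eval_greedy_agent; make_env() and model are placeholders.
env = make_env()
greedy_returns = [eval_greedy_agent(env, model) for _ in range(10)]
print('mean greedy return over 10 episodes:', np.mean(greedy_returns))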
Example #2
def train_with_e_learning(env,
                          model,
                          e_model,
                          add_ucb=False,
                          add_bonus=False,
                          beta=1,
                          eps_params=None,
                          e_lr=1e-5,
                          lr=1e-5,
                          replay_buffer_size=1000,
                          gamma=0.99,
                          gamma_E=0.9,
                          max_steps=100,
                          max_num_episodes=1,
                          learning_starts_in_steps=100,
                          train_freq_in_steps=1,
                          e_train_freq_in_steps=1,
                          update_freq_in_steps=10,
                          e_update_freq_in_steps=10,
                          seed=None,
                          log_dir='logs',
                          write_logs=None,
                          act_type='epsilon_greedy',
                          target_type='standard',
                          print_freq=1,
                          batch_size=32,
                          do_pretraining=False,
                          chain_criterion=False,
                          return_states=False,
                          return_trajectory=False,
                          trajectory_path=''):

    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)
        env.seed(seed)

    num_actions = env.action_space.n
    dim_states = env.observation_space.shape[0]

    create_empty_directory(log_dir)
    tensorboard_directory = log_dir + '/tensorboard_logs'
    descr_file_name = log_dir + '/parameters.csv'

    with open(descr_file_name, 'w') as file:
        file.write('dim_states: {}\n'.format(dim_states))
        file.write('seed: {}\n'.format(seed))
        file.write('action selection type: {}\n'.format(act_type))
        file.write('target q-function type: {}\n'.format(target_type))
        file.write('Without epsilon: {}\n'.format(eps_params is None))

    # create empty directories for writing logs
    create_empty_directory(tensorboard_directory)
    # create a summary writer
    writer = SummaryWriter(tensorboard_directory)

    # define models
    target_model = copy.deepcopy(model)
    target_e_model = copy.deepcopy(e_model)

    # define optimizers: Adam for the Q-model, RMSprop for the E-model
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    e_optimizer = torch.optim.RMSprop(e_model.parameters(),
                                      momentum=0.9,
                                      lr=e_lr)

    # define schedule of epsilon in epsilon-greedy exploration
    if eps_params is not None:
        schedule_timesteps = int(eps_params['exploration_fraction'] *
                                 max_num_episodes)
        print('schedule_timesteps', schedule_timesteps)
        eps_shedule = LinearSchedule(
            schedule_timesteps=schedule_timesteps,
            initial_p=1.0,
            final_p=eps_params['exploration_final_eps'])
    else:
        eps_shedule = None

    # create replay buffers
    replay_buffer = ReplayBuffer(replay_buffer_size, seed=seed)

    num_episodes = 0
    sum_rewards_per_episode = [0]
    list_rewards_per_episode = [[]]
    state_history = []
    trajectory = []
    state = env.reset()
    break_flag = False

    t = 1
    max_state = 0
    for episode in range(max_num_episodes):
        ucb_work = 0
        episode_trajectory = []
        if len(sum_rewards_per_episode) > max_num_episodes:
            break

        # chain criterion: stop early once the last 10 completed episodes all reached a return of 10
        if (chain_criterion and len(sum_rewards_per_episode) > 10
                and t > learning_starts_in_steps):
            if np.sum(sum_rewards_per_episode[-11:]) == 10 * 10:
                break

        eps_t = eps_shedule.value(episode + 1) if eps_shedule else 0

        if add_ucb:
            if torch.cuda.is_available():
                e_values = e_model.forward(
                    convert_to_var(state)).cpu().data.numpy()
                q_values = model.forward(
                    convert_to_var(state)).cpu().data.numpy()
            else:
                e_values = e_model.forward(convert_to_var(state)).data.numpy()
                q_values = model.forward(convert_to_var(state)).data.numpy()

            # recover a pseudo-count from the E-values: if E starts at 1/2 and is
            # multiplied by (1 - e_lr) per visit, then n = log(2 * E) / log(1 - e_lr)
            cnt = np.log(e_values) / np.log(1 - e_lr) + np.log(2) / np.log(
                1 - e_lr)
            ucb = np.sqrt(2 * np.log(t) / cnt)
            ucb *= beta

            if q_values.argmax() != (q_values + ucb).argmax():
                ucb_work += 1
        else:
            ucb = None

        action = epsilon_greedy_act(num_actions, state, model, eps_t, ucb=ucb)
        episode_steps = 0
        max_episode_state = 0
        while True:
            next_state, rew, done, _ = env.step(action)
            if add_bonus:
                if torch.cuda.is_available():
                    e_values = e_model.forward(
                        convert_to_var(state)).cpu().data.numpy()
                else:
                    e_values = e_model.forward(
                        convert_to_var(state)).data.numpy()
                cnt = np.log(e_values) / np.log(1 - e_lr) + np.log(2) / np.log(
                    1 - e_lr)
                rew_ = rew + beta / cnt[action]
            else:
                rew_ = rew

            sum_rewards_per_episode[-1] += rew
            list_rewards_per_episode[-1].append(rew)
            state_history.append(state)
            t += 1

            if add_ucb and t > learning_starts_in_steps:
                # UCB bonus for the next action, computed from the state the
                # agent has just reached (next_state)
                if torch.cuda.is_available():
                    e_values = e_model.forward(
                        convert_to_var(next_state)).cpu().data.numpy()
                    q_values = model.forward(
                        convert_to_var(next_state)).cpu().data.numpy()
                else:
                    e_values = e_model.forward(
                        convert_to_var(next_state)).data.numpy()
                    q_values = model.forward(
                        convert_to_var(next_state)).data.numpy()
                cnt = np.log(e_values) / np.log(1 - e_lr) + np.log(2) / np.log(
                    1 - e_lr)
                ucb = np.sqrt(2 * np.log(t) / cnt)
                ucb *= beta
                if q_values.argmax() != (q_values + ucb).argmax():
                    t_ucb_max = t
                    ucb_work += 1
            else:
                ucb = None
            # the next action is selected from next_state (SARSA-style on-policy choice)
            if act_type == 'epsilon_greedy':
                next_action = epsilon_greedy_act(num_actions,
                                                 next_state,
                                                 model,
                                                 eps_t,
                                                 ucb=ucb)
            elif act_type == 'lll_epsilon_greedy':
                next_action = lll_epsilon_greedy_act(num_actions,
                                                     next_state,
                                                     model,
                                                     e_model,
                                                     e_lr,
                                                     eps_t,
                                                     ucb=ucb)
            else:
                raise ValueError('Unknown act_type: {}'.format(act_type))

            replay_buffer.add(state, action, rew_, done, next_state,
                              next_action)
            trajectory.append(
                (state, action, rew_, done, next_state, next_action))

            batch = [
                np.array([state]),
                np.array([action]),
                np.array([rew_]),
                np.array([done]),
                np.array([next_state]),
                np.array([next_action])
            ]
            # online SARSA update of the E-model on the single current transition
            e_loss = sarsa_loss(e_optimizer, e_model, target_e_model, batch,
                                gamma_E)

            state = next_state
            action = next_action

            if t > learning_starts_in_steps and t % train_freq_in_steps == 0:
                batch = replay_buffer.sample(batch_size)
                loss = dqn_loss(optimizer,
                                model,
                                target_model,
                                batch,
                                gamma,
                                target_type=target_type)
            else:
                loss = 0

            if t > learning_starts_in_steps and t % e_train_freq_in_steps == 0:
                batch = replay_buffer.sample(batch_size)
                e_loss = sarsa_loss(e_optimizer, e_model, target_e_model,
                                    batch, gamma_E)
            else:
                e_loss = 0

            if t > learning_starts_in_steps and t % update_freq_in_steps == 0:
                target_model = copy.deepcopy(model)

            if t > learning_starts_in_steps and t % e_update_freq_in_steps == 0:
                target_e_model = copy.deepcopy(e_model)

            if write_logs:
                write_tensorboard_logs(locals())
            episode_steps += 1
            if done:
                if print_freq is not None:
                    print('t: {}, Episode: {}, sum: {:.2f}, eps: {:.2f}'.\
                          format(t, num_episodes, sum_rewards_per_episode[-1], eps_t))
                    if add_ucb:
                        print('ucb in episode: {}'.format(ucb_work))

                num_episodes += 1
                sum_rewards_per_episode.append(0)
                state_history.append(next_state)
                state = env.reset()
                break

    state_history = np.array(state_history)
    if return_trajectory:
        states = []
        actions = []
        rewards = []
        dones = []
        next_states = []
        next_actions = []
        for transition in trajectory:
            state, action, rew_, done, next_state, next_action = transition
            states.append(state)
            actions.append(action)
            rewards.append(rew_)
            dones.append(done)
            next_states.append(next_state)
            next_actions.append(next_action)
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        dones = np.array(dones)
        next_states = np.array(next_states)
        next_actions = np.array(next_actions)

        np.save(trajectory_path + '/_states', states)
        np.save(trajectory_path + '/_actions', actions)
        np.save(trajectory_path + '/_rewards', rewards)
        np.save(trajectory_path + '/_dones', dones)
        np.save(trajectory_path + '/_next_states', next_states)
        np.save(trajectory_path + '/_next_actions', next_actions)

    if return_states:
        if done:
            return sum_rewards_per_episode[:-1], num_episodes - 1, state_history
        else:
            return sum_rewards_per_episode, num_episodes, state_history
    else:
        if done:
            return sum_rewards_per_episode[:-1], num_episodes - 1
        else:
            return sum_rewards_per_episode, num_episodes
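The pseudo-count used for the UCB bonus above can be read as inverting E = 0.5 * (1 - e_lr)**n (the initial value 1/2 is an assumption inferred from the + log(2) term), since log(E)/log(1-e_lr) + log(2)/log(1-e_lr) = log(2E)/log(1-e_lr) = n. A small standalone check of that inversion, separate from the training code:

import numpy as np

# Sketch: if an E-value starts at 0.5 and is multiplied by (1 - e_lr) once per
# visit, the expression used above recovers the visit count n.
e_lr = 1e-5
n_true = 1000
e_value = 0.5 * (1 - e_lr) ** n_true
cnt = np.log(e_value) / np.log(1 - e_lr) + np.log(2) / np.log(1 - e_lr)
print(round(cnt))  # 1000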
Example #3
def train(
    env,
    model,
    eps_params=None,
    lr=1e-5,
    replay_buffer_size=1000,
    gamma=0.99,
    max_steps=100,
    max_num_episodes=1,
    learning_starts_in_steps=100,
    train_freq_in_steps=1,
    update_freq_in_steps=10,
    seed=None,
    log_dir='logs',
    write_logs=None,
    act_type='epsilon_greedy',
    target_type='standard',
    print_freq=1,
    batch_size=32,
    return_states=False,
):

    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    num_actions = env.action_space.n
    dim_states = env.observation_space.shape[0]

    create_empty_directory(log_dir)
    tensorboard_directory = log_dir + '/tensorboard_logs'
    descr_file_name = log_dir + '/parameters.csv'

    with open(descr_file_name, 'w') as file:
        file.write('dim_states: {}\n'.format(dim_states))
        file.write('seed: {}\n'.format(seed))
        file.write('action selection type: {}\n'.format(act_type))
        file.write('target q-function type: {}\n'.format(target_type))
        file.write('Without epsilon: {}\n'.format(eps_params is None))

    # create empty directories for writing logs
    create_empty_directory(tensorboard_directory)

    # create a summary writer
    writer = SummaryWriter(tensorboard_directory)

    # define models
    target_model = copy.deepcopy(model)

    # define optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # define schedule of epsilon in epsilon-greedy exploration
    if eps_params is not None:
        schedule_timesteps = int(eps_params['exploration_fraction'] *
                                 max_num_episodes)
        print('schedule_timesteps', schedule_timesteps)
        eps_shedule = LinearSchedule(
            schedule_timesteps=schedule_timesteps,
            initial_p=1.0,
            final_p=eps_params['exploration_final_eps'])
    else:
        eps_shedule = None

    # create replay buffers
    replay_buffer = ReplayBuffer(replay_buffer_size, seed=seed)

    num_episodes = 0
    sum_rewards_per_episode = [0]
    list_rewards_per_episode = [[]]
    state_history = []
    count_history = []
    state = env.reset()
    break_flag = False

    for t in range(max_steps):
        eps_t = eps_shedule.value(num_episodes) if eps_shedule else 0

        action = epsilon_greedy_act(num_actions, state, model, eps_t)
        next_state, rew, done, _ = env.step(action)
        # next_action is unused by the DQN loss; store a placeholder instead of the info dict
        replay_buffer.add(state, action, rew, done, next_state, -1)

        sum_rewards_per_episode[-1] += rew
        list_rewards_per_episode[-1].append(rew)
        state_history.append(state)

        state = next_state

        if t > learning_starts_in_steps and t % train_freq_in_steps == 0:
            batch = replay_buffer.sample(batch_size)
            loss = dqn_loss(optimizer,
                            model,
                            target_model,
                            batch,
                            gamma,
                            target_type=target_type)
        else:
            loss = 0
            exploration_loss = 0

        if t > learning_starts_in_steps and t % update_freq_in_steps == 0:
            target_model = copy.deepcopy(model)

        if write_logs:
            write_tensorboard_logs(locals())

        if done:
            if print_freq is not None:
                print('t: {}, Episode: {}, sum: {}, eps: {:.2f}'.\
                      format(t, num_episodes, sum_rewards_per_episode[-1], eps_t))
            num_episodes += 1
            sum_rewards_per_episode.append(0)

            state = env.reset()

        if len(sum_rewards_per_episode) > max_num_episodes:
            break

    state_history = np.array(state_history)

    if return_states:
        if done:
            return sum_rewards_per_episode[:-1], num_episodes - 1, state_history
        else:
            return sum_rewards_per_episode, num_episodes, state_history
    else:
        if done:
            return sum_rewards_per_episode[:-1], num_episodes - 1
        else:
            return sum_rewards_per_episode, num_episodes
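A hypothetical call sketch for train(): the eps_params keys match the ones read above, while `env` and `model` are assumed to be an already-constructed Gym-style environment and Q-network (placeholders here).

# Hypothetical invocation of train(); env and model are placeholders defined elsewhere.
eps_params = {'exploration_fraction': 0.5, 'exploration_final_eps': 0.05}
rewards, n_episodes = train(env,
                            model,
                            eps_params=eps_params,
                            lr=1e-4,
                            replay_buffer_size=10000,
                            max_steps=50000,
                            max_num_episodes=200,
                            seed=0)
print('episodes finished:', n_episodes)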
Example #4
def train_tabular(
    env,
    model,
    eps_params=None,
    alpha_params=None,
    lr=1e-5,
    replay_buffer_size=1000,
    gamma=0.99,
    max_steps=100,
    learning_starts_in_steps=100,
    train_freq_in_steps=1,
    update_freq_in_steps=10,
    plot_freq=1,
    eval_freq=5,
    seed=None,
    log_dir='logs',
    write_logs=None,
    count_based_exploration_type='state_action',
    act_type='epsilon_greedy',
    target_type='standard',
    reward_shaping_type=None,
    do_pretraining=False,
    print_freq=1,
    batch_size=32,
):

    if seed is not None:
        set_seed(seed, env)

    num_actions = env.action_space.n
    dim_states = env.observation_space.shape[0]
    n_all_states = env.get_all_states().shape[0]

    if write_logs:
        create_empty_directory(log_dir)
        tensorboard_directory = log_dir + '/tensorboard_logs'
        images_directory = log_dir + '/images_logs'
        descr_file_name = log_dir + '/parameters.csv'

        with open(descr_file_name, 'w') as file:
            file.write('dim_states: {}\n'.format(dim_states))
            file.write('seed: {}\n'.format(seed))
            file.write('action selection type: {}\n'.format(act_type))
            file.write('target q-function type: {}\n'.format(target_type))
            file.write(
                'reward addition type: {}\n'.format(reward_shaping_type))
            file.write('Without epsilon: {}\n'.format(eps_params is None))
            file.write(
                'count-based exploration type (rewards in exploration model): {}\n'
                .format(count_based_exploration_type))

        # create empty directories for writing logs
        create_empty_directory(images_directory)
        create_empty_directory(tensorboard_directory)

        # create a summary writer
        writer = SummaryWriter(tensorboard_directory)
    else:
        writer = None

    # define models
    if do_pretraining:
        pretrain(model,
                 env.get_all_states(),
                 num_actions,
                 eps=1e-6,
                 max_steps=int(1e5),
                 writer=writer)
    target_model = copy.deepcopy(model)

    # define optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # define schedule of epsilon in epsilon-greedy exploration
    if eps_params is not None:
        schedule_timesteps = int(eps_params['exploration_fraction'] *
                                 max_steps)
        eps_shedule = LinearSchedule(
            schedule_timesteps=schedule_timesteps,
            initial_p=1.0,
            final_p=eps_params['exploration_final_eps'])
    else:
        eps_shedule = None

    if alpha_params is not None:
        schedule_timesteps = int(alpha_params['fraction'] * max_steps)
        alpha_shedule = LinearSchedule(schedule_timesteps=schedule_timesteps,
                                       initial_p=alpha_params['initial_alpha'],
                                       final_p=alpha_params['final_alpha'])
    else:
        alpha_shedule = None

    # create replay buffers
    replay_buffer = ReplayBuffer(replay_buffer_size, seed=seed)

    num_episodes = 0
    sum_rewards_per_episode = [0]
    list_rewards_per_episode = [[]]
    state_action_count = np.zeros((n_all_states, num_actions))
    count_good_rewards = 0
    episode_visitations = np.zeros((n_all_states, num_actions))
    episode_history = []
    state = env.reset()
    state_history = []
    state_ids = []
    count_history = []
    break_flag = False

    for t in range(max_steps):
        eps_t = eps_shedule.value(t) if eps_shedule else 0
        alpha_t = alpha_shedule.value(t) if alpha_shedule else 1

        if act_type == 'epsilon_greedy':
            action = epsilon_greedy_act(num_actions, state, model, eps_t)
            entropy = 0

        elif act_type == 'ucb-1':
            s_id = env.convert_state_to_id(state)
            n_s = max(1, state_action_count[s_id].sum())
            n_sa = np.maximum(1, state_action_count[s_id])
            ucb = np.sqrt(2 * np.log(n_s) / n_sa)
            action = epsilon_greedy_act(num_actions,
                                        state,
                                        model,
                                        eps_t,
                                        ucb=alpha_t * ucb)
            entropy = 0

        elif act_type == 'ucb-2':
            s_id = env.convert_state_to_id(state)
            ucb = 1 / np.sqrt(1 + state_action_count[s_id])
            action = epsilon_greedy_act(num_actions,
                                        state,
                                        model,
                                        eps_t,
                                        ucb=alpha_t * ucb)
            entropy = 0
        next_state, rew, done, _ = env.step(action)
        rew_addition = count_rew_addition(state_action_count,
                                          env.convert_state_to_id(state),
                                          env.convert_state_to_id(next_state),
                                          action, reward_shaping_type)

        replay_buffer.add(state, action, rew + rew_addition, done, next_state,
                          -1)

        state_action_count[env.convert_state_to_id(state)][action] += 1
        episode_visitations[env.convert_state_to_id(state)][action] += 1

        state_history.append(state)
        # copy the table so each history entry is a snapshot, not an alias
        count_history.append(state_action_count.copy())
        state_ids.append(env.convert_state_to_id(state))

        if rew == 1:
            count_good_rewards += 1
        sum_rewards_per_episode[-1] += rew
        list_rewards_per_episode[-1].append(rew)

        state = next_state

        if t > learning_starts_in_steps and t % train_freq_in_steps == 0:
            batch = replay_buffer.sample(batch_size)
            loss = dqn_loss(optimizer,
                            model,
                            target_model,
                            batch,
                            gamma,
                            target_type=target_type)
        else:
            loss = 0
            exploration_loss = 0

        if t > learning_starts_in_steps and t % update_freq_in_steps == 0:
            target_model = copy.deepcopy(model)

        if write_logs:
            write_tensorboard_tabular_logs(locals())

        if done:
            if print_freq is not None:
                print('Episode:', num_episodes, sum_rewards_per_episode[-1])
            # early stop once the last 100 episode returns sum to 100 * 10
            if np.sum(sum_rewards_per_episode[-100:]) == 100 * 10:
                break

            episode_history = []
            episode_visitations = np.zeros((n_all_states, num_actions))
            num_episodes += 1
            sum_rewards_per_episode.append(0)
            list_rewards_per_episode.append([])
            state = env.reset()

    if done:
        return sum_rewards_per_episode[-2], num_episodes
    else:
        return sum_rewards_per_episode[-1], num_episodes
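A hypothetical call sketch for the tabular variant: the eps_params and alpha_params keys match the ones read above, and `chain_env`/`q_model` are placeholders for a tabular environment (exposing get_all_states / convert_state_to_id as used in the function) and a Q-network defined elsewhere.

# Hypothetical invocation of train_tabular(); chain_env and q_model are placeholders.
eps_params = {'exploration_fraction': 0.3, 'exploration_final_eps': 0.01}
alpha_params = {'fraction': 0.5, 'initial_alpha': 1.0, 'final_alpha': 0.1}
last_return, n_episodes = train_tabular(chain_env,
                                        q_model,
                                        eps_params=eps_params,
                                        alpha_params=alpha_params,
                                        act_type='ucb-1',
                                        max_steps=20000,
                                        learning_starts_in_steps=500,
                                        seed=0)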
Example #5
def train_tabular_with_e_learning(env,
                                  model,
                                  e_model,
                                  e_lr=1e-4,
                                  gamma_E=0.9,
                                  eps_params=None,
                                  alpha_params=None,
                                  lr=1e-5,
                                  replay_buffer_size=1000,
                                  gamma=0.99,
                                  max_steps=100,
                                  max_num_episodes=1,
                                  learning_starts_in_steps=100,
                                  train_freq_in_steps=1,
                                  e_train_freq_in_steps=1,
                                  update_freq_in_steps=10,
                                  e_update_freq_in_steps=10,
                                  plot_freq=1,
                                  eval_freq=5,
                                  seed=None,
                                  log_dir='logs',
                                  write_logs=None,
                                  count_based_exploration_type='state_action',
                                  act_type='epsilon_greedy',
                                  target_type='standard',
                                  reward_shaping_type=None,
                                  do_pretraining=False,
                                  print_freq=1,
                                  batch_size=32):
    if seed is not None:
        set_seed(seed, env)

    num_actions = env.action_space.n
    dim_states = env.observation_space.shape[0]
    n_all_states = env.get_all_states().shape[0]

    create_empty_directory(log_dir)
    tensorboard_directory = log_dir + '/tensorboard_logs'
    images_directory = log_dir + '/images_logs'
    descr_file_name = log_dir + '/parameters.csv'

    with open(descr_file_name, 'w') as file:
        file.write('dim_states: {}\n'.format(dim_states))
        file.write('seed: {}\n'.format(seed))
        file.write('action selection type: {}\n'.format(act_type))
        file.write('target q-function type: {}\n'.format(target_type))
        file.write('reward addition type: {}\n'.format(reward_shaping_type))
        file.write('Without epsilon: {}\n'.format(eps_params is None))
        file.write(
            'count-based exploration type (rewards in exploration model): {}\n'
            .format(count_based_exploration_type))

    # create empty directories for writing logs
    create_empty_directory(images_directory)
    create_empty_directory(tensorboard_directory)

    # create a summary writer
    writer = SummaryWriter(tensorboard_directory)

    # define models
    if do_pretraining:
        pretrain(model,
                 env.get_all_states(),
                 num_actions,
                 eps=1e-6,
                 max_steps=int(1e5),
                 writer=writer)
    target_model = copy.deepcopy(model)
    target_e_model = copy.deepcopy(e_model)
    # define optimizers: Adam for both the Q-model and the E-model
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    e_optimizer = torch.optim.Adam(e_model.parameters(), lr=e_lr)

    # define schedule of epsilon in epsilon-greedy exploration
    if eps_params is not None:
        schedule_timesteps = int(eps_params['exploration_fraction'] *
                                 max_steps)
        eps_shedule = LinearSchedule(
            schedule_timesteps=schedule_timesteps,
            initial_p=1.0,
            final_p=eps_params['exploration_final_eps'])
    else:
        eps_shedule = None

    if alpha_params is not None:
        schedule_timesteps = int(alpha_params['fraction'] * max_steps)
        alpha_shedule = LinearSchedule(schedule_timesteps=schedule_timesteps,
                                       initial_p=alpha_params['initial_alpha'],
                                       final_p=alpha_params['final_alpha'])
    else:
        alpha_shedule = None

    # create replay buffers
    replay_buffer = ReplayBuffer(replay_buffer_size, seed=seed)

    num_episodes = 0
    sum_rewards_per_episode = [0]
    list_rewards_per_episode = [[]]
    state_action_count = np.zeros((n_all_states, num_actions))
    count_good_rewards = 0
    episode_visitations = np.zeros((n_all_states, num_actions))
    episode_history = []
    state = env.reset()
    break_flag = False

    corr_coefs = []

    t = 0
    for i_episode in range(max_num_episodes):
        state = env.reset()

        eps_t = eps_shedule.value(t) if eps_shedule else 0
        alpha_t = alpha_shedule.value(t) if alpha_shedule else 1

        if act_type == 'epsilon_greedy':
            action = epsilon_greedy_act(num_actions, state, model, eps_t)
            entropy = 0
            episode_history.append((env.convert_state_to_id(state), action))

        elif act_type == 'ucb-1':
            s_id = env.convert_state_to_id(state)
            n_s = max(1, state_action_count[s_id].sum())
            n_sa = np.maximum(1, state_action_count[s_id])
            ucb = np.sqrt(2 * np.log(n_s) / n_sa)
            action = epsilon_greedy_act(num_actions,
                                        state,
                                        model,
                                        eps_t,
                                        ucb=alpha_t * ucb)
            entropy = 0

        elif act_type == 'ucb-2':
            s_id = env.convert_state_to_id(state)
            ucb = 1 / np.sqrt(1 + state_action_count[s_id])
            action = epsilon_greedy_act(num_actions,
                                        state,
                                        model,
                                        eps_t,
                                        ucb=alpha_t * ucb)
            entropy = 0

        while True:
            next_state, rew, done, _ = env.step(action)
            rew_addition = count_rew_addition(
                state_action_count, env.convert_state_to_id(state),
                env.convert_state_to_id(next_state), action,
                reward_shaping_type)

            eps_t = eps_shedule.value(t) if eps_shedule else 0
            alpha_t = alpha_shedule.value(t) if alpha_shedule else 1
            # the next action is chosen from the state the agent has just reached
            if act_type == 'epsilon_greedy':
                next_action = epsilon_greedy_act(num_actions, next_state,
                                                 model, eps_t)
                entropy = 0
                episode_history.append(
                    (env.convert_state_to_id(state), action))
            elif act_type == 'ucb-1':
                s_id = env.convert_state_to_id(next_state)
                n_s = max(1, state_action_count[s_id].sum())
                n_sa = np.maximum(1, state_action_count[s_id])
                ucb = np.sqrt(2 * np.log(n_s) / n_sa)
                next_action = epsilon_greedy_act(num_actions,
                                                 next_state,
                                                 model,
                                                 eps_t,
                                                 ucb=alpha_t * ucb)
                entropy = 0
            elif act_type == 'ucb-2':
                s_id = env.convert_state_to_id(next_state)
                ucb = 1 / np.sqrt(1 + state_action_count[s_id])
                next_action = epsilon_greedy_act(num_actions,
                                                 next_state,
                                                 model,
                                                 eps_t,
                                                 ucb=alpha_t * ucb)
                entropy = 0
            else:
                raise ValueError(
                    'Unknown action selection type: {}'.format(act_type))

            replay_buffer.add(state, action, rew + rew_addition, done,
                              next_state, next_action)

            state_action_count[env.convert_state_to_id(state)][action] += 1
            episode_visitations[env.convert_state_to_id(state)][action] += 1

            if rew == 1:
                count_good_rewards += 1
            sum_rewards_per_episode[-1] += rew
            list_rewards_per_episode[-1].append(rew)

            if t > learning_starts_in_steps:
                batch = [
                    np.array([state]),
                    np.array([action]),
                    np.array([rew]),
                    np.array([done]),
                    np.array([next_state]),
                    np.array([next_action])
                ]
                e_loss = sarsa_loss(e_optimizer, e_model, target_e_model,
                                    batch, gamma_E)

            if t > learning_starts_in_steps and t % train_freq_in_steps == 0:
                batch = replay_buffer.sample(batch_size)
                loss = dqn_loss(optimizer,
                                model,
                                target_model,
                                batch,
                                gamma,
                                target_type=target_type)
            else:
                loss = 0

            if t > learning_starts_in_steps and t % e_train_freq_in_steps == 0:
                batch = replay_buffer.sample(batch_size)
                # debug: show how often each state id appears in the sampled batch
                states = batch[0]
                states_ids = []
                for st in states:
                    states_ids.append(env.convert_state_to_id(st))
                states_ids = np.array(states_ids)
                unique, counts = np.unique(states_ids, return_counts=True)
                tmp = [(u, c) for u, c in zip(unique, counts)]
                print(tmp)
                e_loss = sarsa_loss(e_optimizer, e_model, target_e_model,
                                    batch, gamma_E)
            else:
                e_loss = 0

            if t > learning_starts_in_steps and t % update_freq_in_steps == 0:
                target_model = copy.deepcopy(model)

            if t > learning_starts_in_steps and t % e_update_freq_in_steps == 0:
                target_e_model = copy.deepcopy(e_model)

            if done:
                all_states_var = convert_to_var(env.get_all_states())
                if torch.cuda.is_available():
                    e_values = e_model.forward(
                        all_states_var).cpu().data.numpy()
                else:
                    e_values = e_model.forward(all_states_var).data.numpy()
                # recover pseudo-counts from the E-values: n = log(2 * E) / log(1 - e_lr)
                e_counters = np.log(e_values) / np.log(1 - e_lr) + np.log(
                    2) / np.log(1 - e_lr)
                plot_visitations_with_e_values(state_action_count, e_values,
                                               e_counters, num_episodes, t,
                                               images_directory)
                if t > 3 * learning_starts_in_steps:
                    corr_coefs.append(
                        np.corrcoef(state_action_count.flatten(),
                                    e_counters.flatten())[0][1])

            state = next_state
            action = next_action
            t += 1
            if done:
                if print_freq is not None:
                    print('t: {}, Episode: {}, sum: {}, eps: {:.2f}'. \
                          format(t, num_episodes, sum_rewards_per_episode[-1], eps_t))
                episode_history = []
                episode_visitations = np.zeros((n_all_states, num_actions))
                num_episodes += 1
                sum_rewards_per_episode.append(0)
                list_rewards_per_episode.append([])
                state = env.reset()
                break
    torch.save(e_model, 'e_model')
    plot_corr_hist(corr_coefs)
    np.save('corr_coefs', np.array(corr_coefs))
    return sum_rewards_per_episode[-2], num_episodes
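The correlation diagnostic above compares true visit counts with counts recovered from the E-model. A standalone sketch of that comparison on synthetic data (not the author's evaluation; the arrays below are made up for illustration):

import numpy as np

# Sketch of the np.corrcoef diagnostic used above, on synthetic counts.
rng = np.random.RandomState(0)
true_counts = rng.randint(0, 50, size=(6, 2)).astype(float)        # per (state, action)
e_counters = true_counts + rng.normal(0, 2.0, true_counts.shape)   # noisy estimate
corr = np.corrcoef(true_counts.flatten(), e_counters.flatten())[0][1]
print('correlation between true and E-derived counts:', corr)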