Example #1
import numpy as np
from timeit import default_timer as timer

from datetime import timedelta
import math

from agents.DQN_NSTEP import Model
from utils.hyperparameters import Config
from utils.plot import plot, save_plot
from utils.wrappers import wrap_pytorch, make_atari, wrap_deepmind

# Agent name
agent_name = "DQN"
# Load the configuration
config = Config()
# Record the start time
start = timer()
# Declare the environment as Pong
env_id = "PongNoFrameskip-v4"
env = make_atari(env_id)
env = wrap_deepmind(env, frame_stack=False)
env = wrap_pytorch(env)
# Build the model
model = Model(env=env, config=config)
# Episode reward
episode_reward = 0
# Get the initial observation of the episode
observation = env.reset()
# max_frames = int(config.MAX_FRAMES / 50)
# Maximum number of frames
max_frames = config.MAX_FRAMES
Example #2
    log_dir = './results/' + exp_name + '/'
    env_id = 'PongNoFrameskip-v4'

    try:
        os.makedirs(log_dir)
    except OSError:
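        # the directory already exists; remove any stale logs from a previous run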
        files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))\
            + glob.glob(os.path.join(log_dir, '*td.csv')) \
            + glob.glob(os.path.join(log_dir, '*sig_param_mag.csv'))  \
            + glob.glob(os.path.join(log_dir, '*action_log.csv'))

        for f in files:
            os.remove(f)

    # Config
    config = Config()
    config.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    # Special Configuration
    config.SIGMA_INIT = 0.0
    config.N_STEPS = 3

    # Env
    env = PrepareAtariEnv(env_id, log_dir)

    # Agent
    agent = C51DuelAgent(config, env, log_dir, static_policy=False)

    # Begin Interaction & Learning
Example #3
from matplotlib import pyplot as plt
from IPython.display import clear_output
import numpy as np

from utils.hyperparameters import Config

cfg = Config()


# Plot rewards, losses, and sigma
def plot(frame_idx, rewards, losses, sigma, elapsed_time, nstep=1, name="DQN"):
    clear_output(True)
    plt.figure(figsize=(20, 5))
    plt.subplot(131)
    plt.title('%s. step: %d. frame %s. reward: %s. time: %s' %
              (name, nstep, frame_idx, np.mean(rewards[-10:]), elapsed_time))
    plt.plot(rewards)
    if losses:
        plt.subplot(132)
        plt.title("loss")
        plt.plot(losses)
    if sigma:
        plt.subplot(133)
        plt.title('noisy param magnitude')
        plt.plot(sigma)
    plt.show()


def save_plot(frame_idx,
              rewards,
              losses,
Example #4
import glob
import os

from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

from utils.hyperparameters import Config
from utils.plot import plot
from utils.wrappers import make_env_a2c_atari
from agents.A2C import Model as A2C

log_dir = ""
try:
    os.makedirs(log_dir)
except OSError:
    files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
    for f in files:
        os.remove(f)

config = Config()

# ppo control
config.ppo_epoch = 3
config.num_mini_batch = 32
config.ppo_clip_param = 0.1

# a2c control
config.num_agents = 4
config.rollout = 128
config.USE_GAE = True
config.gae_tau = 0.95

# misc agent variables
config.GAMMA = 0.99
config.LR = 7e-4
Example #5
def main(stdcsr=None):
    def log(s, end="\n"):
        if stdcsr:
            stdcsr.addstr(s + end)
        else:
            print(s, end=end)

    def refresh():
        if stdcsr:
            stdcsr.refresh()

    env = MatrisEnv(no_display=not pygame_gui, real_tick=False)
    config = Config()
    model_path = "saved_agents/value_iteration_model.pth"  # "saved_agents/V2/agent_11000/model.pth"
    body_list = [TetrisBodyV2]
    agent = Agent(env=env,
                  config=config,
                  body=body_list[0],
                  use_target=False,
                  static_policy=True,
                  use_data_parallel=False)
    # load model
    ckpt = torch.load(model_path, map_location=lambda storage, loc: storage)
    # agent.model.load_state_dict(ckpt)
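    # strip the 'module.' prefix that nn.DataParallel adds to state-dict keys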
    agent.model.load_state_dict(
        {k.replace('module.', ''): v
         for k, v in ckpt.items()})

    episode_reward = 0
    rounds = 0
    lines = 0
    epsilon = 0
    observation = env.reset()

    start_time = time.time()
    for frame_idx in range(1, config.MAX_FRAMES + 1):
        env.render()
        prev_observation = observation
        action = agent.get_action(epsilon)
        observation, reward, done, info = env.step(action)
        observation = None if done else observation

        episode_reward += reward
        rounds += 1
        lines = info['lines']
        score = info['score']
        if done:
            print("done")

        if not pygame_gui:
            print_observation(observation, stdcsr)
            current_value = agent.observation_value(prev_observation)
            next_value = agent.observation_value(observation)
            log("[{:5}/{} {:.0f} secs] State value: {:<5.1f}  Target value: {:<5.1f} ({:=5.1f} + {:=5.1f})  Action: {:<2}"
                .format(frame_idx, config.MAX_FRAMES,
                        time.time() - start_time, current_value,
                        next_value + reward, reward, next_value, action))
            log("Game: {}  Round: {}  Episode reward: {:<5.1f}  Cleared lines: {:<4}  Loss: {:<.1f}  Epsilon: {:<.3f}"
                .format(len(agent.lines), rounds, episode_reward, lines,
                        agent.losses[-1][1] if len(agent.losses) > 0 else 0.0,
                        epsilon))
            refresh()

        if done:
            observation = env.reset()
            assert observation is not None
            agent.append_episode_reward(frame_idx, episode_reward)
            agent.append_rounds(frame_idx, rounds)
            agent.append_lines(frame_idx, lines)
            game = len(agent.lines)
            episode_reward = 0
            rounds = 0
            lines = 0
    env.close()
Example #6
def main(stdcsr=None):
    def log(s, end="\n"):
        if stdcsr:
            stdcsr.addstr(s + end)
        else:
            print(s, end=end)

    def refresh():
        if stdcsr:
            stdcsr.refresh()

    writer = SummaryWriter()
    mp_pool = Pool(cpu_count())
    env = MatrisEnv(no_display=True,
                    real_tick=False,
                    reward_functions=reward_functions,
                    mp_pool=mp_pool)
    save_dir = os.path.join(
        "./saved_agents",
        datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))
    config = Config()
    config.reward_type = "live/1"  # can be ['']
    config.EXP_REPLAY_SIZE = 5000
    config.BATCH_SIZE = 256
    config.LEARN_START = config.BATCH_SIZE
    config.TRAIN_FREQ = 1
    config.TARGET_NET_UPDATE_FREQ = 100
    config.GAMMA = 0.99
    config.SAVE_FREQ = 100
    config.LR = 1e-3
    config.epsilon_start = 0.0
    config.epsilon_final = 0.0
    config.reward_functions = reward_functions
    body_list = [TetrisBodyV2]
    env = MatrisEnv(no_display=True,
                    real_tick=False,
                    reward_functions=reward_functions,
                    reward_type=config.reward_type)
    agent = Agent(env=env, config=config, body=body_list[0], use_target=False)
    writer.add_text('cfg', pprint.pformat(config.__dict__))

    episode_reward = 0
    rounds = 0
    lines = 0
    observation = env.reset()
    start_time = time.time()
    for frame_idx in range(1, config.MAX_FRAMES + 1):
        env.render()
        epsilon = config.epsilon_by_frame(frame_idx)

        action = agent.get_action(epsilon)
        prev_observation = observation
        observation, reward, done, info = env.step(action)
        observation = None if done else observation
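        # a None next-state marks the transition as terminal in the replay buffer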

        agent.append_to_replay(prev_observation, action, reward, observation)
        if frame_idx >= config.LEARN_START and frame_idx % config.TRAIN_FREQ == 0:
            agent.update(frame_idx)
            writer.add_scalar('loss', agent.losses[-1][1], agent.losses[-1][0])

        if frame_idx >= config.LEARN_START and frame_idx % config.TARGET_NET_UPDATE_FREQ == 0:
            agent.update_target_network()

        episode_reward += reward
        rounds += 1
        lines = info['lines']
        score = info['score']

        print_observation(observation, env.get_color_state(), stdcsr)
        current_value = agent.observation_value(prev_observation)
        next_value = agent.observation_value(observation)
        log("[ {} / {} {:.0f} secs] State value: {:<5.1f}  Target value: {:<5.1f} ({:=5.1f} + {} * {:=5.1f})  Action: {:<2}"
            .format(frame_idx, config.MAX_FRAMES,
                    time.time() - start_time, current_value,
                    config.GAMMA * next_value + reward, reward, config.GAMMA,
                    next_value, action))
        log("Game: {}  Cleared lines and Round: {} / {}  Episode reward: {:<5.1f}  Loss: {:<.1f}  Epsilon: {:<.3f}"
            .format(len(agent.lines), lines, rounds, episode_reward,
                    agent.losses[-1][1] if len(agent.losses) > 0 else 0.0,
                    epsilon))
        refresh()

        writer.add_scalars(
            'state_values', {
                'state_value': current_value,
                'next_state_value': next_value,
                'reward': reward,
            }, frame_idx)
        writer.add_scalar('epsilon', epsilon, frame_idx)

        if done:
            observation = env.reset()
            assert observation is not None
            agent.append_episode_reward(frame_idx, episode_reward)
            agent.append_rounds(frame_idx, rounds)
            agent.append_lines(frame_idx, lines)
            game = len(agent.lines)
            writer.add_scalar('game/rounds', rounds, game)
            writer.add_scalar('game/scores', score, game)
            writer.add_scalar('game/cleared_lines', lines, game)
            writer.add_scalar('game/episode_reward', episode_reward, game)
            episode_reward = 0
            rounds = 0
            lines = 0
            agent.save(os.path.join(save_dir, f"agent_{game}"))

        # if frame_idx % config.SAVE_FREQ == 0:
        #     agent.save(f'./saved_agents/agent_{frame_idx}')

    agent.save('./saved_agents/final')
    env.close()
Example #7
import torch
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt

import random
from timeit import default_timer as timer
from datetime import timedelta
import math

from utils.wrappers import make_atari, wrap_deepmind, wrap_pytorch
from utils.hyperparameters import Config
from agents.BaseAgent import BaseAgent

# Hyperparameters
config = Config()

config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# epsilon variables
config.epsilon_start = 1.0
config.epsilon_final = 0.01
config.epsilon_decay = 30000
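# exponentially anneal epsilon from epsilon_start toward epsilon_final;
# epsilon_decay is the decay time constant in frames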
config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (
    config.epsilon_start - config.epsilon_final) * math.exp(
        -1. * frame_idx / config.epsilon_decay)

# misc agent variables
config.GAMMA = 0.99
config.LR = 1e-4
Example #8
import torch.nn as nn
from torch.autograd import Variable
from torch.distributions import Categorical

from MaTris.gym_matris import MatrisEnv
from networks.network_bodies import TetrisBodyV2
from utils.hyperparameters import Config
from utils.board_utils import print_observation
import utils.board_utils as bu
try:
    from curses import wrapper
except ImportError:
    print("your env does not support the curses package")

render = True
test = False
config = Config()
device = config.device
config.BATCH_SIZE = 10
config.GAMMA = 0.99


class PGbaseline(nn.Module):
    def __init__(self, input_shape, body=TetrisBodyV2, num_actions=2):
        super(PGbaseline, self).__init__()
        self.net_body = body(input_shape)

        in_features = self.net_body.feature_size()
        self.action_head = nn.Linear(
            in_features, num_actions
        )  # action 1: static, action 2: move up, action 3: move down
        self.value_head = nn.Linear(in_features, 1)
Example #9
def DQN_experiment(env, batch_size, max_frames, log_dir):
    LOG_DIR = log_dir + "DQN/"
    ATARI_ENV = env
    monitor = GPUMonitor()

    ## Hyperparameters
    config = Config()
    config.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    # epsilon variables
    config.epsilon_start = 1.0
    config.epsilon_final = 0.01
    config.epsilon_decay = 30000
    config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (
        config.epsilon_start - config.epsilon_final) * math.exp(
            -1. * frame_idx / config.epsilon_decay)

    # misc agent variables
    config.GAMMA = 0.99
    config.LR = 1e-4

    # memory
    config.TARGET_NET_UPDATE_FREQ = 1000
    config.EXP_REPLAY_SIZE = 100000
    config.BATCH_SIZE = batch_size

    # Learning control variables
    # config.LEARN_START = 10000
    config.LEARN_START = 100
    # config.MAX_FRAMES  = 1000000
    config.MAX_FRAMES = max_frames
    config.UPDATE_FREQ = 1

    start = timer()
    log_dir = LOG_DIR
    try:
        os.makedirs(log_dir)
    except OSError:
        files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    env_id = ATARI_ENV
    env = make_atari(env_id)
    env = bench.Monitor(env, os.path.join(log_dir, env_id))
    env = GPUMonitorWrapper(monitor, env, os.path.join(log_dir, env_id))
    env = wrap_deepmind(env,
                        episode_life=True,
                        clip_rewards=True,
                        frame_stack=False,
                        scale=True)
    env = WrapPyTorch(env)
    model = Model(env=env, config=config, log_dir=log_dir)

    episode_reward = 0

    observation = env.reset()
    for frame_idx in range(1, config.MAX_FRAMES + 1):
        epsilon = config.epsilon_by_frame(frame_idx)

        action = model.get_action(observation, epsilon)
        prev_observation = observation
        observation, reward, done, _ = env.step(action)
        observation = None if done else observation

        model.update(prev_observation, action, reward, observation, frame_idx)
        episode_reward += reward

        if done:
            # Episode ended
            model.finish_nstep()
            model.reset_hx()
            observation = env.reset()
            # model.save_gpu_info(gpu_info)
            model.save_reward(episode_reward)
            episode_reward = 0

        if frame_idx % 1000 == 0:
            try:
                clear_output(True)
                dtime = int(timer() - start)
                plot_reward(log_dir,
                            env_id,
                            'DQN',
                            config.MAX_FRAMES,
                            bin_size=10,
                            smooth=1,
                            time=timedelta(seconds=dtime))
                plot_gpu(log_dir,
                         env_id,
                         'DQN',
                         config.MAX_FRAMES,
                         bin_size=10,
                         smooth=1,
                         time=timedelta(seconds=dtime))
            except IOError:
                pass

    model.save_w()
    env.close()
    return
Example #10
    log_dir = './results/' + exp_name + '/'
    env_id = 'PongNoFrameskip-v4'

    try:
        os.makedirs(log_dir)
    except OSError:
        files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))\
            + glob.glob(os.path.join(log_dir, '*td.csv')) \
            + glob.glob(os.path.join(log_dir, '*sig_param_mag.csv'))  \
            + glob.glob(os.path.join(log_dir, '*action_log.csv'))

        for f in files:
            os.remove(f)

    # Config
    config = Config()
    config.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")

    # Special Configuration
    config.USE_NOISY_NETS = True
    config.USE_PRIORITY_REPLAY = True
    config.SIGMA_INIT = 0.5
    config.N_STEPS = 3
    config.MAX_FRAMES = 800000

    # Env
    env = PrepareAtariEnv(env_id, log_dir)

    # Agent
    agent = RainbowAgent(config, env, log_dir, static_policy=False)
Example #11
def A2C_experiment(env, batch_size, max_frames, log_dir):
    log_dir = log_dir+"A2C/"

    try:
        os.makedirs(log_dir)
    except OSError:
        files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
        for f in files:
            os.remove(f)

    config = Config()

    # a2c control
    config.num_agents = 16
    config.rollout = 5

    # misc agent variables
    config.GAMMA = 0.99
    config.LR = 7e-4
    config.entropy_loss_weight = 0.01
    config.value_loss_weight = 0.5

    # batch size
    config.BATCH_SIZE = batch_size

    # Number of updates in 10000000 frames
    # config.MAX_FRAMES = int(1e7 / config.num_agents / config.rollout)
    config.MAX_FRAMES = int(max_frames / config.num_agents / config.rollout)

    # training loop
    seed = 1

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    torch.set_num_threads(1)
    #monitor = GPUMonitor()
    env_id = env
    envs = [make_env_a2c_atari(env_id, seed, i, log_dir) for i in range(config.num_agents)]
    envs = SubprocVecEnv(envs) if config.num_agents > 1 else DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * 4, *obs_shape[1:])

    model = Model(log_dir, env=envs, config=config)

    current_obs = torch.zeros(config.num_agents, *obs_shape,
                              device=config.device, dtype=torch.float)
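    # current_obs stacks the last 4 frames along the channel dimension;
    # update_current_obs shifts out the oldest frame and appends the newest one.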

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs.astype(np.float32)).to(config.device)
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    model.rollouts.observations[0].copy_(current_obs)

    episode_rewards = np.zeros(config.num_agents, dtype=np.float64)
    final_rewards = np.zeros(config.num_agents, dtype=np.float64)

    start = timer()
    print_step = 1
    print_threshold = 10
    #os.remove('./log/A2C/logs.csv')
    for frame_idx in range(1, config.MAX_FRAMES + 1):
        for step in range(config.rollout):
            with torch.no_grad():
                values, actions, action_log_prob = model.get_action(model.rollouts.observations[step])
            cpu_actions = actions.view(-1).cpu().numpy()

            obs, reward, done, _ = envs.step(cpu_actions)

            episode_rewards += reward
            masks = 1. - done.astype(np.float32)
            final_rewards *= masks
            final_rewards += (1. - masks) * episode_rewards
            episode_rewards *= masks

            rewards = torch.from_numpy(reward.astype(np.float32)).view(-1, 1).to(config.device)
            masks = torch.from_numpy(masks).to(config.device).view(-1, 1)

            current_obs *= masks.view(-1, 1, 1, 1)
            update_current_obs(obs)

            model.rollouts.insert(current_obs, actions.view(-1, 1), action_log_prob, values, rewards, masks)

        with torch.no_grad():
            next_value = model.get_values(model.rollouts.observations[-1])

        model.rollouts.compute_returns(next_value, config.GAMMA)

        value_loss, action_loss, dist_entropy = model.update(model.rollouts)

        model.rollouts.after_update()

        if frame_idx % 100 == 0:
            try:
                clear_output()
                end = timer()
                total_num_steps = (frame_idx + 1) * config.num_agents * config.rollout
                #df = pd.DataFrame({'frame': frame_idx, 'timesteps': total_num_steps, 'fps': int(total_num_steps / (end - start)),
                #                   'mean reward': np.mean(final_rewards), 'median reward': np.median(final_rewards),
                #                   'min reward': np.min(final_rewards), 'max rewards': np.max(final_rewards),
                #                   'entropy': dist_entropy, 'value loss': value_loss, 'action loss': action_loss})
                #if not os.path.isfile('./log/A2C/logs.csv'):
                #    df.to_csv('./log/A2C/logs.csv', header='column_names')
                #else:
                #    df.to_csv('./log/A2C/logs.csv', mode='a', header=False)
                #with open("./log/A2C/logs.txt", "a") as myfile:
                #    myfile.write(
                #        "Frame {}, Num Timesteps {}, FPS {},"
                #        "Mean/Median Reward {:.1f}/{:.1f}, Min/Max Reward {:.1f}/{:.1f},"
                #        "Entropy {:.5f}, Value Loss {:.5f}, Policy Loss {:.5f}".
                #     format(frame_idx, total_num_steps,
                #            int(total_num_steps / (end - start)),
                #            np.mean(final_rewards),
                #            np.median(final_rewards),
                #            np.min(final_rewards),
                #            np.max(final_rewards), dist_entropy,
                #            value_loss, action_loss))
                plot(log_dir, env_id, 'A2C',
                     config.MAX_FRAMES * config.num_agents * config.rollout)

                dtime = int(timer() - start)
                plot_gpu(log_dir, env_id, 'A2C', config.MAX_FRAMES * config.num_agents * config.rollout, bin_size=10,
                         smooth=1, time=timedelta(seconds=dtime))
            except IOError:
                pass

    model.save_w()
    envs.close()
    return
Example #12
from IPython.display import clear_output
import matplotlib
#matplotlib.use("agg")
from matplotlib import pyplot as plt
#%matplotlib inline

from timeit import default_timer as timer
from datetime import timedelta
import math

from utils.wrappers import *
from utils.hyperparameters import Config
from agents.DQN import Model

config = Config()

#algorithm control
config.USE_NOISY_NETS = False
config.USE_PRIORITY_REPLAY = False

#Multi-step returns
config.N_STEPS = 1

#epsilon variables
config.epsilon_start = 1.0
config.epsilon_final = 0.01
config.epsilon_decay = 500
config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (
    config.epsilon_start - config.epsilon_final) * math.exp(
        -1. * frame_idx / config.epsilon_decay)
Example #13
import math

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from IPython.display import clear_output
from matplotlib import pyplot as plt

from utils.wrappers import *
from agents.DQN import Model as DQN_Agent
from utils.ReplayMemory import ExperienceReplayMemory

from utils.hyperparameters import Config
from utils.plot import plot_all_data

param = Config()

param.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = param.device

#epsilon variables
param.epsilon_start = 1.0
param.epsilon_final = 0.01
param.epsilon_decay = 90000
param.epsilon_by_frame = lambda frame_idx: param.epsilon_final + (
    param.epsilon_start - param.epsilon_final) * math.exp(-1. * frame_idx /
                                                          param.epsilon_decay)

#misc agent variables
param.GAMMA = 0.99
param.LR = 1e-4
Example #14
import os
import os.path as osp
import gym
import glob
import numpy as np
from timeit import default_timer as timer
import torch

from utils.wrappers import *
from utils.hyperparameters import Config

from model import Model

config = Config()
config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config.MAX_FRAMES = 500000  # how long is the model trained
config.VALID_EPISODES = 50  # how many episodes to test

# test settings
student_path = 'pretrained_weights/'
test_dir = 'log/test/'

if __name__ == '__main__':

    os.makedirs(test_dir, exist_ok=True)

    start = timer()

    valid_env_name_list = ["AirRaidNoFrameskip-v4", "CarnivalNoFrameskip-v4",
                           "DemonAttackNoFrameskip-v4", "AssaultNoFrameskip-v4"]
Example #15
                    'best_timestep': best_timestep,
                }, True, 'checkpoint-episode-%d.pth.tar' % episode_idx)
        elif episode_idx % config.save_checkpoint_freq == 0:
            save_checkpoint(
                {
                    'episode': episode_idx,
                    'epsilon': epsilon,
                    'state_dict': agent.model.state_dict(),
                    'time_step': ave_time,
                }, False, 'checkpoint-episode-%d.pth.tar' % episode_idx)
        else:
            continue

        print('save checkpoint, episode={}, ave time step={:.2f}'.format(
            episode_idx, ave_time))


if __name__ == '__main__':
    env = game.GameState()
    args = parser.parse_args()
    config = Config()
    config.resume_file = ""
    config.resume = (config.resume_file != "")

    if args.train:
        agent = Agent(config)
        train(agent, config)
    else:
        #play("checkpoint-episode-18000.pth.tar", config)
        play("model_best.pth.tar", config)
Example #16
import torch
import numpy as np

from IPython.display import clear_output
from matplotlib import pyplot as plt

import random
from timeit import default_timer as timer
from datetime import timedelta
import math

from utils.hyperparameters import Config
from BaseAgent import BaseAgent
import mazeworld

config = Config()

config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#epsilon variables
config.epsilon_start = 0.1
config.epsilon_final = 0.1
config.epsilon_decay = 50000
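# with epsilon_start == epsilon_final, the schedule below stays constant at 0.1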
config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (
    config.epsilon_start - config.epsilon_final) * math.exp(
        -1. * frame_idx / config.epsilon_decay)

#misc agent variables
config.GAMMA = 1
config.LR = 0.005