Code Example #1
def worker(worker_id, master_end, worker_end):
    master_end.close()  # Close the master end so the worker only uses its own end of the pipe
    env = retro.make("StreetFighterIISpecialChampionEdition-Genesis",
                     state='rm-easy')
    env = Discretizer(env)
    env = wrap_deepmind(env, scale=True)
    env.seed(worker_id)

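    # Command loop: receive (cmd, data) tuples from the master process over the
    # pipe and send the result back through the worker end.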
    while True:
        cmd, data = worker_end.recv()
        if cmd == 'step':
            ob, reward, done, info = env.step(data)
            if done:
                ob = env.reset()
            worker_end.send((ob, reward, done, info))
        elif cmd == 'reset':
            ob = env.reset()
            worker_end.send(ob)
        elif cmd == 'close':
            worker_end.close()
            break
        elif cmd == 'get_total_rewards':
            episode_rewards = env.get_total_rewards()
            worker_end.send(episode_rewards)
        else:
            raise NotImplementedError
Code Example #2
def make_atari_env(name, seed):
    from gym.envs.atari.atari_env import AtariEnv
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = monitor(env, name)
    env = wrap_deepmind(env)
    env.seed(seed)
    return env
Code Example #3
File: utils.py Project: abri-simond/RL
def play_game(env=wrap_deepmind(gym.make("Pong-v0"), frame_stack=True),
              agent=None, skipframe=4, th=0, maxstep=5000, render=False,
              memory=ReplayMemory(50000)):
    cum_reward = 0.0
    render_frames = []
    state = env.reset()
    

    for i in range(maxstep):
        # take action:
        action = agent(state, th = th)
        reward = 0
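        # Repeat the chosen action for `skipframe` frames, accumulating the
        # reward and stopping early if the episode ends.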
        for _ in range(skipframe):
            next_state, r, ended, info = env.step(action)
            reward += r
            if ended:
                break
        
        cum_reward += float(reward)
        
        # push to replay buffer:
        memory.push(state, action, next_state, reward, ended)
        state = next_state
        
        if render:
            if i % 1 == 0:
                render_frames.append(torch.from_numpy(env.render(mode="rgb_array")).unsqueeze(0))
        if ended == 1:
            break
            
    out = {'cum_reward' : cum_reward, 'steps' :  i}
    if render:
        out['frames'] = torch.cat(render_frames).permute(3,0,1,2).unsqueeze(0)
    return out
Code Example #4
def env_fn():
    env = gym.make('{}NoFrameskip-v4'.format(args.env_name))
    env.seed(args.seed + rank)
    env = Monitor(
        env, osp.join(args.log_dir, "{}.monitor.json".format(rank)))
    gym.logger.setLevel(logging.WARN)
    return wrap_deepmind(env, num_skips=args.num_skips)
Code Example #5
def train_model(num_frames):
    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env,episode_life=True, frame_stack=True)
    train_results = results.results(globals())

    cumulative_frames = 0
    best_score = -50
    games = 0
    full_loss = []
    rewards = []
    while 1:
        state = env.reset()
        done = False
        cum_reward = 0
        cum_loss = []
        while not done:
            action = select_action(torch.tensor(np.array(state).reshape(-1, 4, HEIGHT, WIDTH)).to(device), cumulative_frames)

            next_state, reward, done, _ = env.step(action)

            memory.add(state, action, reward, next_state, done)

            state = next_state
            if cumulative_frames % TRAIN_FREQUENCY == 0 and cumulative_frames > LEARNING_STARTS:
                loss = optimize_model(cumulative_frames)
                cum_loss.append(loss)
            
            cum_reward += reward
            cumulative_frames += 1
        
            if cumulative_frames % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())

        if best_score < cum_reward:
            best_score = cum_reward
        if len(cum_loss) == 0:
            full_loss.append(0)
        else:
            full_loss.append(np.mean(cum_loss))
        rewards.append(cum_reward)
        games += 1

        if games % 10 == 0:
            print("=============================================")
            print("Game: {} | Frame {}".format(games, cumulative_frames))
            print("Final reward: {}".format(cum_reward))
            print("Epsilon after: {}".format(EPSILON))
            print("Best High Score: {}".format(best_score))
            print("Avg Loss Last 100 games: {}".format(
                np.mean(full_loss[-100:])))
            print("Avg Reward Last 100 games: {}".format(
                np.mean(rewards[-100:])))

        train_results.record(cumulative_frames, games, EPSILON, cum_reward, full_loss[-1])

        if np.mean(rewards[-100:]) >= 18 and cumulative_frames > LEARNING_STARTS:
            break

    torch.save(target_net.state_dict(), PATH)
    train_results.close()
Code Example #6
def make_atari_env(name, seed):
    from gym.wrappers.monitor import Monitor
    from gym.envs.atari.atari_env import AtariEnv
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = Monitor(env, 'videos/', force=True, video_callable=lambda e: False)
    env = wrappers.wrap_deepmind(env)
    env.seed(seed)
    return env
Code Example #7
def make_atari_env(name, history_len):
    from gym.envs.atari.atari_env import AtariEnv
    from gym.wrappers.monitor import Monitor
    env = AtariEnv(game=name, frameskip=4, obs_type='image')
    env = Monitor(env, 'videos/', force=True, video_callable=lambda e: False)
    env = wrappers.wrap_deepmind(env)
    env = wrappers.HistoryWrapper(env, history_len)
    env.seed(utils.random_seed())
    return env
Code Example #8
File: register.py Project: mynsng/A2C-SF2
def create_env(num):

    if num % 4 == 0 or num % 4 == 1:
        env = retro.make("StreetFighterIISpecialChampionEdition-Genesis",
                         state='rm')
        env = Discretizer(env)

        #expt_dir = config.log_path + '/monitor'
        #env = gym.wrappers.Monitor(env, expt_dir, force=True, video_callable=False)
        env = wrap_deepmind(env, scale=True)

    else:
        env = retro.make("StreetFighterIISpecialChampionEdition-Genesis",
                         state='rm-easy')
        env = Discretizer(env)

        #expt_dir = config.log_path + '/monitor'
        #env = gym.wrappers.Monitor(env, expt_dir, force=True, video_callable=False)
        env = wrap_deepmind(env, scale=True)
    '''    
    elif num % 4 == 2:
        env = retro.make("StreetFighterIISpecialChampionEdition-Genesis", state = 'rm')
        env = Discretizer(env)

        #expt_dir = config.log_path + '/monitor'
        #env = gym.wrappers.Monitor(env, expt_dir, force=True, video_callable=False)
        env = wrap_deepmind(env, scale = True)
        
    else:
        env = retro.make("StreetFighterIISpecialChampionEdition-Genesis", state = 're-easy')
        env = Discretizer(env)

        #expt_dir = config.log_path + '/monitor'
        #env = gym.wrappers.Monitor(env, expt_dir, force=True, video_callable=False)
        env = wrap_deepmind(env, scale = True)
     '''
    #    if config.num_actors >1 :
    #    env = ParallelEnv(num_processes = config.num_actors)

    return env
Code Example #9
def create_super_mario_env():
    import gym
    from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
    import gym_super_mario_bros
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
    env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    #env = wrappers.MaxAndSkipEnv(env, skip=4)
    env = wrappers.wrap_deepmind(env,
                                 episode_life=False,
                                 clip_rewards=False,
                                 frame_stack=True,
                                 scale=True)
    return env
Code Example #10
def create_env(config):
    env_name = config.env

    env = gym.make(env_name)

    expt_dir = config.log_path + '/monitor'
    env = gym.wrappers.Monitor(env, expt_dir, force=True, video_callable=False)
    env = wrap_deepmind(env, config.episode_life, config.preprocess,
                        config.max_and_skip, config.clip_rewards,
                        config.no_op_reset, config.scale)

    if config.num_actors > 1:
        env = ParallelEnv(num_processes=config.num_actors, env=env)

    return env
Code Example #11
def inference(episodes, model, env_name):
    env = make_atari(env_name)
    env = wrap_deepmind(env, episode_life=True, frame_stack=True)
    for _ in range(episodes):
        observation = env.reset()
        done = False
        while not done:
            time.sleep(0.05)
            env.render()
            observation = torch.tensor(np.array(observation).reshape(-1, 4, HEIGHT, WIDTH)).to(device)
            with torch.no_grad():
                action = model(observation).max(1)[1].item()
                observation, reward, done, _ = env.step(action)
                if reward != 0:
                    print(reward)
Code Example #12
# run python -i test.py for testing stuff in shell
import torch
import numpy as np
import gym
from wrappers import make_atari, wrap_deepmind
from utils import LinearSchedule, Replay

env = wrap_deepmind(make_atari('BreakoutNoFrameskip-v4'))
state = env.reset()
state = np.array(state)
r = Replay(50, 3, False)
for i in range(100):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    r.add(state, action, reward, next_state, done)
    state = next_state
s, a, r, ns, d = r.sample_tensor()
Code Example #13
File: DQN.py Project: attoucha/INF8225-projet
        next_state = np.expand_dims(next_state, 0)

        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(
            *random.sample(self.buffer, batch_size))
        return np.concatenate(state), action, reward, np.concatenate(
            next_state), done

    def __len__(self):
        return len(self.buffer)


env = make_atari("PongNoFrameskip-v4")
env = wrap_deepmind(env, frame_stack=True, pytorch_img=True)

memory = ReplayMemory(100000)


class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()

        self.input_shape = input_shape
        self.num_actions = num_actions

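        # Convolutional feature extractor; filter sizes and strides follow the
        # DQN architecture of Mnih et al. (2015).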
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
Code Example #14
def main():
    #Make OpenAI gym environment + wrappers
    date_time = now.strftime("_%H:%M:%S_%m-%d-%Y")
    env = gym.make("PongNoFrameskip-v4")
    env = gym.wrappers.Monitor(env, './data_dqn_ataripong' + date_time)
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)  #skip 4 frames & max over last_obs
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)  #obs shape = num_channels x width x height
    obs_space_shape = env.observation_space.shape[0]
    action_space_shape = env.action_space.n

    #Set random seeds
    seed = 6582
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    #Initialize Replay Memory (Line 1)
    replay_memory = ReplayMemory(max_size=100000)

    #Make Q-Network and Target Q-Network (Lines 2 & 3)
    qnet = Atari_Dueling_DQN(obs_space_shape, action_space_shape).to(device)
    target_qnet = Atari_Dueling_DQN(obs_space_shape,
                                    action_space_shape).to(device)
    target_qnet.load_state_dict(qnet.state_dict())

    #Training Parameters (Changes from Mnih et al. outlined in README.md)
    optimizer = optim.Adam(qnet.parameters())
    num_frames = 1400000
    gamma = 0.99
    replay_start_size = 50000
    target_network_update_freq = 10000

    #Train
    obs = env.reset()
    num_episodes = 0
    for t in range(1, num_frames + 1):
        epsilon = epsilon_at_t(t)

        #-------------------------------------------------------------------
        #Take one step in the environment & add to Replay Memory (Line 7-11)
        #-------------------------------------------------------------------
        torch.set_grad_enabled(False)
        #Select action with epsilon-greedy exploration (Line 7,8)
        if random.random() > epsilon:
            ts_obs = torch.from_numpy(obs.astype(
                np.float32)).unsqueeze(0).to(device)
            ts_qvals = qnet(ts_obs)
            action = ts_qvals.max(-1)[1].item()
        else:
            action = random.randrange(action_space_shape)
        torch.set_grad_enabled(True)

        #Execute action and get reward + next_obs (Line 9, 10)
        next_obs, reward, done, _ = env.step(action)

        #Store transition in Replay Memory
        replay_memory.add(obs, next_obs, action, reward, done)

        obs = next_obs

        if done:
            obs = env.reset()
            num_episodes += 1

        #Populate Replay Memory with <replay_start_size> experiences before learning
        if t > replay_start_size:
            #---------------------------------------------
            #Sample batch & compute loss & update network (Lines 12 - 15)
            #---------------------------------------------
            obs_minibatch, next_obs_minibatch, actions_minibatch, rewards_minibatch, done_minibatch = replay_memory.sample(
            )

            ts_obs, ts_rewards, ts_next_obs, ts_done = map(
                lambda x: torch.FloatTensor(x).to(device), [
                    obs_minibatch, rewards_minibatch, next_obs_minibatch,
                    done_minibatch
                ])
            ts_actions = torch.LongTensor(actions_minibatch).to(device)

            torch.set_grad_enabled(False)
            # Compute Target Values (as per Double-DQN update rule)
            ts_next_qvals_outer = qnet(
                ts_next_obs)  #(32, 2) (outer Qnet, evaluates value)
            ts_next_qvals_inner = target_qnet(
                ts_next_obs)  #(32, 2) (inner Qnet, evaluates action)
            ts_next_action_inner = ts_next_qvals_inner.argmax(
                -1, keepdim=True)  #(32, 1)
            ts_next_action_qvals_outer = ts_next_qvals_outer.gather(
                -1, ts_next_action_inner).view(
                    -1)  #(32, ) (use inner actions to evaluate outer Q values)
            ts_target_q = ts_rewards + gamma * ts_next_action_qvals_outer * (
                1 - ts_done)
            torch.set_grad_enabled(True)

            #Compute predicted
            ts_pred_q = qnet(ts_obs).gather(-1, ts_actions.unsqueeze(-1)).view(-1)  #(32,)

            #Calculate Loss & Perform gradient descent (Line 14)
            loss = F.smooth_l1_loss(ts_pred_q, ts_target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            #Update target network ever <target_network_update_freq> steps (Line 15)
            if t % target_network_update_freq == 0:
                target_qnet.load_state_dict(qnet.state_dict())

        #Log to Terminal
        episode_rewards = env.env.env.env.env.env.env.env.get_episode_rewards()
        print('Timesteps', t, 'Episode', num_episodes, 'Mean Reward',
              np.mean(episode_rewards[-100:]))
    env.env.close()
Code Example #15
        if render:
            time_to_sleep = wait_time - (time.time() - start_time)
            if time_to_sleep > 0:
                time.sleep(time_to_sleep)

    return total_reward


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Render on graphics card(cuda:0).")
    parser.add_argument("--env",
                        default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    parser.add_argument("-m", "--model", help="DQN")
    args = parser.parse_args()

    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")

    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, False, False, True)

    net = model.DQN(4, env.action_space.n).to(device)
    net.load_state_dict(torch.load(args.model))

    score = play(env, net, True, device)
    print(f"Score: {score}")
Code Example #16
def train_dqn(env_name,
              save_path,
              double=False,
              dueling=False,
              notebook=False):
    env = wrap_deepmind(make_atari(env_name))
    num_actions = env.action_space.n
    print('Num actions: {}'.format(num_actions))
    if dueling:
        model = DuelingNet(out_size=num_actions)
        target_model = DuelingNet(out_size=num_actions)
    else:
        model = DQN(out_size=num_actions)
        target_model = DQN(out_size=num_actions)
    criterion = nn.SmoothL1Loss()
    print('Created models')

    cuda = False
    if torch.cuda.is_available():
        cuda = True
        model = model.cuda()
        target_model = target_model.cuda()
        print('GPU: {}'.format(torch.cuda.get_device_name(0)))

    model.apply(init_weights)
    target_model.apply(init_weights)
    optimizer = optim.Adam(model.parameters())  #, lr=0.00001)
    print('Initalized models')

    schedule = LinearSchedule(P.start_eps, P.end_eps, P.steps_eps)
    replay = Replay(P.replay_size, P.batch_size, cuda)
    state = env.reset()
    num_updates = 0
    eps_reward = 0
    rewards = []
    losses = []
    # populate replay with random policy
    print('Populating replay')
    for i in tqdm(range(P.replay_start_size), desc='Populating replay'):
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        replay.add(state, action, reward, next_state, done)
        state = next_state
        if done:
            state = env.reset()
    print('Starting training')
    state = env.reset()
    last_eps = 0  # index into rewards/losses of the last progress printout
    for i in tqdm(range(P.num_steps), desc='Total steps'):
        if schedule.choose_random():
            action = env.action_space.sample()
        else:
            model_input = torch.from_numpy(np.array(state)[None, :]).type(
                torch.FloatTensor)
            if cuda:
                model_input = model_input.cuda()
            q_values = model(model_input)
            action = int(q_values.argmax(1)[0])
        next_state, reward, done, _ = env.step(action)
        eps_reward += reward
        replay.add(state, action, reward, next_state, done)
        state = next_state
        if i % P.update_freq == 0:
            loss = compute_loss(replay, optimizer, model, target_model,
                                P.gamma, criterion, double)
            num_updates += 1
            if num_updates % P.target_update_freq == 0:
                target_model.load_state_dict(model.state_dict())
        if done:
            rewards.append(eps_reward)
            losses.append(loss.item())
            eps_reward = 0
            state = env.reset()
        if i % P.print_every == 0 and i > 0:
            print('Step: {}'.format(i))
            print('Average episode reward: {}'.format(
                sum(rewards[last_eps:]) / len(rewards[last_eps:])))
            print('Loss: {}'.format(
                sum(losses[last_eps:]) / len(losses[last_eps:])))
            last_eps = len(losses)
        if i % P.plot_every == 0 and i > 0:
            plot(i, rewards, losses, notebook, save_path)
            # if i % P.save_every == 0 and i > 0:
            torch.save(model, 'experiments/{}/{}_model'.format(save_path, i))
            pickle.dump(
                losses,
                open("experiments/{}/{}_losses.p".format(save_path, i), "wb"))
            pickle.dump(
                rewards,
                open("experiments/{}/{}_rewards.p".format(save_path, i), "wb"))
Code Example #17
    epsilon_decay = 30000
    num_frames = 1000000
    batch_size = 32
    learning_rate = 0.0001

    # create environment
    # env_id = "PongNoFrameskip-v4"
    # env_id = 'SpaceInvadersNoFrameskip-v4'
    # env_id = 'MsPacmanNoFrameskip-v4'
    # env_id = 'VideoPinballNoFrameskip-v4'
    # env_id = 'MontezumaRevengeNoFrameskip-v4'
    # env_id = 'QbertNoFrameskip-v4'
    env_id = sys.argv[1]
    env    = make_atari(env_id)
    # env = gym.wrappers.Monitor(env, 'stats', video_callable=lambda episode_id: False, force=True, resume=False)
    env    = wrap_deepmind(env)
    env    = wrap_pytorch(env)

    # create networks
    current_model = CnnDQN(env.observation_space.shape, env.action_space.n)
    target_model  = CnnDQN(env.observation_space.shape, env.action_space.n)
    if USE_CUDA:
        current_model = current_model.cuda()
        target_model  = target_model.cuda()

    # setup optimizer
    optimizer = optim.Adam(current_model.parameters(), lr = learning_rate)

    # initialize replay memory
    replay_buffer = ReplayBuffer(100000)
Code Example #18
File: train_atari.py Project: abri-simond/RL
             'batch_size' : 32,
             'lr' : 0.0001,
            'GAMMA' : 0.95,
            'replay_buffer' : 500000,
             'end_eps' : 0.1,
            'exp_length' : 2000000}
    param['version'] = ", ".join([ "{}:{}".format(key,val) for key, val in param.items()]) + " "+str(datetime.datetime.now())[:16]
    print(param['version'])

    memory = utils.ReplayMemory(param['replay_buffer'])
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    eps = utils.EpsilonDecay(start_eps = 1.0, end_eps = param['end_eps'], length = param['exp_length'])
    writer = SummaryWriter(log_dir = "tensorboard/" + param['version'])
    checkpoint = utils.CheckpointIfBetter(param, device)

    env = wrap_deepmind(gym.make(param['env']), frame_stack = True)
    dqn = model.DQN(num_actions = env.action_space.n).to(device)
    target_dqn = copy.deepcopy(dqn)
    
    def dqn_epsilon_agent(state, net = dqn, th = 0.05):
        if random.random() > th:
            yhat = net(default_states_preprocessor(state))
            return int(yhat.argmax().cpu().numpy())
        else:
            return env.action_space.sample()

    optimizer = optim.Adam(dqn.parameters(), lr = param['lr'])

    # Warmup buffer
    for _ in range(5):
        game = utils.play_game(env, agent = dqn_epsilon_agent, th = eps.get(0), memory = memory)
Code Example #19
File: train.py Project: Abhipanda4/Battle-Of-DQNs
def get_env():
    env = make_atari("PongNoFrameskip-v4")
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)
    return env
Code Example #20
import gym
import tensorflow as tf
from wrappers import wrap_deepmind
from agent import Agent
import time

env = gym.make('Breakout-v0')
env = wrap_deepmind(env, frame_stack=True, scale=True)
action_size = env.action_space.n

# Reset the graph
tf.reset_default_graph()

#Create our agent
agent = Agent(action_size)

count = 0
with tf.Session() as sess:
    total_test_rewards = []

    saver = tf.train.Saver()
    # Load the model
    saver.restore(sess, "./model.ckpt")

    for episode in range(10):
        total_rewards = 0

        state = env.reset()

        print("****************************************************")
        print("EPISODE ", episode)
Code Example #21
def get_env(env_id, frame_stack):
    env = make_atari(env_id)
    env = wrap_deepmind(env, frame_stack)
    env = wrap_pytorch(env)
    return env
Code Example #22
def main():
    env_id = "PongNoFrameskip-v4"
    env = make_atari(env_id)
    env = wrap_deepmind(env)
    env = wrap_pytorch(env)

    observation_space = env.observation_space.shape
    action_space = env.action_space.n

    model = CnnDQN(observation_space, action_space)

    if USE_CUDA:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters())

    replay_buffer = ReplayBuffer(1000)

    batch_size = 32
    gamma = 0.99
    replay_initial = 100
    num_frames = 14000

    losses = []
    all_rewards = []
    x_axis1 = []
    x_axis2= []
    episode_reward = 0

    epsilon_start = 1.0
    epsilon_final = 0.01
    epsilon_decay = 30000

    # Require the exploration rate to decrease as the number of iterations increases
    epsilon_by_frame = lambda frame_idx: epsilon_final + (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

    state = env.reset()

    for frame_idx in range(1, num_frames + 1):
        # Display the animation
        env.render()
        epsilon = epsilon_by_frame(frame_idx)
        action = model.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward

        if done:
            state = env.reset()
            x_axis1.append(frame_idx)
            all_rewards.append(episode_reward)
            episode_reward = 0

        if frame_idx+1 > replay_initial:
            loss = compute_td_loss(model, optimizer, replay_buffer, gamma, batch_size)
            x_axis2.append(frame_idx)
            losses.append(np.array(loss.data.cpu()))

        if frame_idx % 100 == 0:
            plt.figure(1)
            plt.subplot(121)
            plt.plot(x_axis1, all_rewards)
            plt.subplot(122)
            plt.plot(x_axis2, losses)
            plt.show()

    env.close()
Code Example #23
    expected_state_action_values = expected_state_action_values.float()
    predicted_state_action_values = predicted_state_action_values.float()
    return predicted_state_action_values, expected_state_action_values


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    parser.add_argument("--env", default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    args = parser.parse_args()
    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")

    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, episode_life=False, frame_stack=True)
    exp_buffer = ExperienceBuffer(REPLAY_MEMORY_SIZE)
    agent = Agent(env, exp_buffer)

    net = model.DQN(AGENT_HIST_LENGTH, env.action_space.n).to(device)
    tgt_net = model.DQN(AGENT_HIST_LENGTH, env.action_space.n).to(device)
    tgt_net.load_state_dict(net.state_dict())

    criterion = nn.MSELoss()
    optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE,
                              momentum=GRAD_MOMENTUM, eps=MIN_SQ_GRAD)

    writer = SummaryWriter(comment="-" + args.env)

    remaining_time_buffer = collections.deque(maxlen=100)
    last_100_rewards_training = collections.deque(maxlen=100)