Example #1
def main():
    """메인."""
    # 환경 생성
    env = make_env(ENV_NAME)
    net = DQN(env.observation_space.shape, env.action_space.n)
    net.apply(weights_init)
    tgt_net = DQN(env.observation_space.shape, env.action_space.n)
    tgt_net.load_state_dict(net.state_dict())

    if PRIORITIZED:
        memory = PrioReplayBuffer(PRIO_BUF_SIZE)
    else:
        memory = ReplayBuffer(SEND_SIZE)

    # Create the agent with a fixed epsilon
    epsilon = EPS_BASE**(1 + actor_id / (num_actor - 1) * EPS_ALPHA)
    agent = Agent(env, memory, epsilon, PRIORITIZED)
    log("Actor {} - epsilon {:.5f}".format(actor_id, epsilon))

    # Initialize zmq
    context, lrn_sock, buf_sock = init_zmq()
    # Receive the initial weights from the learner before starting
    net, tgt_net = receive_model(lrn_sock, net, tgt_net, True)

    #
    # Simulation
    #
    episode = frame_idx = 0
    p_time = p_frame = None
    p_reward = -50.0

    while True:
        frame_idx += 1

        # Advance one step (includes the reset when an episode ends)
        reward = agent.play_step(net, tgt_net, epsilon, frame_idx)

        # A reward means the episode has ended
        if reward is not None:
            episode += 1
            p_reward = reward

        # Send
        if frame_idx % SEND_FREQ == 0:
            # Training-related info
            if p_time is None:
                speed = 0.0
            else:
                speed = (frame_idx - p_frame) / (time.time() - p_time)
            info = ActorInfo(episode, frame_idx, p_reward, speed)
            # Send the replay data along with the info
            agent.send_replay(buf_sock, info)
            # Action selection counts
            agent.show_action_rate()

            p_time = time.time()
            p_frame = frame_idx

            # Receive the updated model
            net, tgt_net = receive_model(lrn_sock, net, tgt_net, False)
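The fixed per-actor epsilon above follows the Ape-X-style schedule epsilon_i = EPS_BASE ** (1 + i / (N - 1) * EPS_ALPHA). A minimal standalone sketch of that schedule; the constants below are illustrative, not taken from the example:

# Sketch only: EPS_BASE, EPS_ALPHA and the actor count are illustrative values.
EPS_BASE, EPS_ALPHA = 0.4, 7
num_actor = 4
for actor_id in range(num_actor):
    epsilon = EPS_BASE ** (1 + actor_id / (num_actor - 1) * EPS_ALPHA)
    print("Actor {} - epsilon {:.5f}".format(actor_id, epsilon))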
Example #2
def train(agent, env_name):
    writer = SummaryWriter()

    if env_name in ['MountainCar-v0', 'CartPole-v0']:
        sction = 1
        env = gym.make(env_name)
    elif env_name in ["PongNoFrameskip-v4"]:
        sction = 2
        env = make_env(env_name)

    step = 0

    for i in range(99999):
        agent.epsilon = 1 / (i * 0.1 + 1)
        done = False
        state = env.reset()
        score = 0
        step_per_q_value = 0
        step_per_loss = 0
        sum_of_q_value = 0
        agent.n_step_buffer.reset()
        agent.save_model('model/model')
        total_loss = 0
        while not done:
            step += 1

            if i % 10 == 0:
                env.render()
            action, q_value = agent.get_action(state, agent.epsilon)
            if q_value is not None:
                sum_of_q_value += q_value
                step_per_q_value += 1

            if sction == 1:
                next_state, reward, done, info = env.step(action)
            elif sction == 2:
                next_state, reward, done, info = env.step(action + 1)

            score += reward
            agent.append_to_memory(state, next_state, reward, done, action)
            state = next_state

            if step > agent.batch_size:
                if step % agent.train_size == 0:
                    step_per_loss += 1
                    loss = agent.update()
                    total_loss += loss
                if step % agent.update_size == 0:
                    agent.update_parameter()
        writer.add_scalar('data/step', step, i)
        writer.add_scalar('data/score', score, i)
        writer.add_scalar('data/epsilon', agent.epsilon, i)

        if step_per_q_value != 0:
            writer.add_scalar('data/average_of_q_value',
                              sum_of_q_value / step_per_q_value, i)
        if step_per_loss != 0:
            writer.add_scalar('data/loss', total_loss / step_per_loss, i)
        print(score, i)
Example #3
def main():
    env = make_env('PongNoFrameskip-v4')
    policy_net = PolicyNet(env.observation_space.shape,
                           env.action_space.n).to(torch.device('cuda'))
    base_net = Baseline(env.observation_space.shape).to(torch.device('cuda'))
    policy_net.load_state_dict(torch.load('./policynet'))
    base_net.load_state_dict(torch.load('./basenet'))
    agent = Agent(policy_net, base_net)
    agent.train(env, 16, 20000, 0.98, 5)
Example #4
def main():
    """메인."""
    # 환경 생성
    env = make_env(ENV_NAME)
    set_random_seed(env, actor_id)
    net = A2C(env.observation_space.shape, env.action_space.n)
    net.apply(weights_init)
    memory = ReplayBuffer(SEND_SIZE)
    agent = Agent(env, memory, NUM_UNROLL)
    log("Actor {}".format(actor_id))

    # Initialize zmq
    context, lrn_sock, buf_sock = init_zmq()
    # Receive the initial weights from the learner before starting
    net = receive_model(lrn_sock, net, True)

    #
    # Simulation
    #
    episode = frame_idx = 0
    p_time = p_frame = None
    p_reward = -50.0

    while True:
        frame_idx += 1

        # Advance one step (includes the reset when an episode ends)
        ep_reward = agent.play_step(net, frame_idx)

        # An episode reward means the episode has ended
        if ep_reward is not None:
            episode += 1
            p_reward = ep_reward
            log("Episode finished! reward {}".format(ep_reward))

        # Send
        if frame_idx % SEND_FREQ == 0:
            # Training-related info
            if p_time is None:
                speed = 0.0
            else:
                speed = (frame_idx - p_frame) / (time.time() - p_time)
            info = ActorInfo(episode, frame_idx, p_reward, speed)
            # Send the replay data along with the info
            agent.send_replay(buf_sock, info)
            # Action selection counts
            agent.show_action_rate()

            p_time = time.time()
            p_frame = frame_idx

            # Receive the updated model
            net = receive_model(lrn_sock, net, False)
Example #5
def run_experiment(params, log_dir, local_log_path, random_seed=None):

    # create env and add specific configurations to Malmo
    env = make_env(params["DEFAULT_ENV_NAME"])
    env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001)])
    env.configure(allowDiscreteMovement=["move",
                                         "turn"])  # , log_level="INFO")
    env.configure(videoResolution=[420, 420])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)

    while True:
        action = env.action_space.sample()
        new_state, reward, is_done, _ = env.step(action)
        env.render('human')
        if is_done:
            env.reset()
Example #6
def train_agent(device: Any) -> None:
    """
    Train agent using embedder and embedded checkpoints.

    TODO Fix docstrings once finished.

    """
    # Load embedded network
    tdc = TDC().to(device)
    load_tdc(tdc)

    # Create checkpoints
    loader = get_checkpoint_loader()
    checkpoints: List[torch.Tensor] = get_checkpoints(tdc, loader)

    # Create environment
    env = make_env(tdc, checkpoints)

    # TODO Temporarily added to disable flake8 error
    print(env)
Example #7
def test(agent, env_name):
    if env_name in ['MountainCar-v0', 'CartPole-v0']:
        sction = 1
        env = gym.make(env_name)
    elif env_name in ["PongNoFrameskip-v4"]:
        sction = 2
        env = make_env(env_name)
    agent.load_model('model/model')
    while True:
        state = env.reset()
        done = False
        while not done:
            env.render()
            time.sleep(0.01)
            action, q_value = agent.get_action(state, 0)
            if sction == 1:
                next_state, reward, done, _ = env.step(action)
            elif sction == 2:
                next_state, reward, done, _ = env.step(action + 1)

            state = next_state
Example #8
def prep_agent(params, log_dir, local_log, random_seed, trial, agent_id, port):

    # define device on which to run
    device = torch.device(params["DEVICE"])

    # create env and add specific configurations to Malmo
    env = make_env(params["DEFAULT_ENV_NAME"])
    env.configure(client_pool=[('127.0.0.1', int(port))]) # fix port
    env.configure(allowDiscreteMovement=["move", "turn"]) # , log_level="INFO")
    env.configure(videoResolution=[84,84])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)

    if random_seed:
        env.seed(random_seed)

    print(colored("Observation Space: ", COLORS[agent_id]), colored(env.observation_space, COLORS[agent_id]))
    print(colored("Action Space: ", COLORS[agent_id]), colored(env.action_space, COLORS[agent_id]))

    # initialize agent
    bufer = ExperienceBufferGridImage(params["REPLAY_SIZE"])
    # buffer = ExperienceBuffer(params["REPLAY_SIZE"])            
    net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
    tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
    epsilon = params["EPSILON_START"]
    gamma = params["GAMMA"]
    tau = params["SOFT_UPDATE_TAU"]
    agent = Agent('agent' + str(agent_id), env, bufer, net, tgt_net, gamma, epsilon, tau, 
        trial, log_dir, params)

    # other variables
    agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
    agent.print_color = COLORS[agent_id]

    # fill buffer with initial size - don't count these episodes
    agent.fill_buffer()

    return agent
Example #9
def make_gif(extension):
    agent = Agent(alpha=1e-4,
                  gamma=0.99,
                  n_actions=3,
                  action_map={
                      0: 0,
                      1: 4,
                      2: 5
                  },
                  mem_size=25000,
                  batch_size=32,
                  replace=0,
                  input_dims=(4, 80, 80),
                  epsilon=0.02,
                  epsilon_dec=0,
                  epsilon_min=0,
                  load_from_checkpoint=False)
    agent.load_models(extension=extension)

    frames = []
    done = False
    env = make_env("PongNoFrameskip-v4")
    observation = env.reset()
    i = 0
    while not done:
        if i % 3 == 0:
            frames.append(Image.fromarray(env.render(mode='rgb_array')))
        action = agent.get_action(observation)
        move = agent.action_map[action]
        new_observation, reward, done, info = env.step(move)
        observation = new_observation
        i += 1

    with open(f'{extension}.gif', 'wb') as f:  # change the path if necessary
        im = Image.new('RGB', frames[0].size)
        im.save(f, save_all=True, append_images=frames)
Example #10
    'NB_FRAMES': 10000,
    'BATCH_SIZE': 32,
    'DISCOUNT': 0.99,
    'TARGET_UPDATE_STEPS': 100,
    'LEARNING_RATE': 1e-3,
    'REPLAY_BUFFER_SIZE': 1000,
    'MIN_REPLAY_BUFFER_SIZE': 100,
    'EPSILON_START': 1,
    'EPSILON_END': 0.1,
    'EPSILON_DECAY_DURATION': 5000,
}
# Allow changing hyperparameters from command-line arguments
args = get_args(default_args=args_dict)

# Create wrapped environment
env = make_env(args.ENV_ID)

# Set Seed
set_seed(env, args.SEED)

# GPU or CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Create agent
agent = Agent(env, device, args)

# Train agent for args.NB_FRAMES
agent.train()

# Save agent
agent.save()
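The EPSILON_START, EPSILON_END and EPSILON_DECAY_DURATION entries above suggest an annealed exploration rate. A common implementation is a per-frame linear decay; the sketch below is only an illustration and may differ from what this Agent actually does:

def linear_epsilon(frame_idx, start=1.0, end=0.1, decay_duration=5000):
    # Anneal epsilon linearly from `start` to `end` over `decay_duration` frames, then hold it.
    return max(end, start - (start - end) * frame_idx / decay_duration)

print(linear_epsilon(0), linear_epsilon(2500), linear_epsilon(10000))  # 1.0 0.55 0.1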
Example #11
                        "--env",
                        default=DEFAULT_ENV_NAME,
                        help="Environment name to use"
                        "default=" + DEFAULT_ENV_NAME)
    parser.add_argument("-r",
                        "--record",
                        help="Directory to store video "
                        "recording")
    parser.add_argument("--no-visualize",
                        default=True,
                        action='store_false',
                        dest='visualize',
                        help="Disable visualization of the game play")
    args = parser.parse_args()

    env = make_env(args.env)
    if args.record:
        env = gym.wrappers.Monitor(env, args.record)
    net = torch.load(args.model, map_location={'cuda:0': 'cpu'})

    state = env.reset()
    total_reward = 0.0
    dead = False
    start_life = 5
    c = collections.Counter()

    while True:
        start_ts = time.time()
        if args.visualize:
            env.render()
        state_v = torch.tensor(np.array([state], copy=False))
Example #12
def main():
    # Get Atari games.
    env = make_env()
    # Run training
    atari_learn(env, num_timesteps=2e8)
    # d = local_log._getvalue()
    # with open( local_log_path , "w") as f:
    #     json.dump(d, f)

    # Inform experiment is done
    print("Experiment complet. Results found at: " + local_log_path)
    


Example #13
def prep_agent(params, log_dir, local_log, random_seed, trial, agent_id, port, give, receive, req_give, req_receive):

    # define device on which to run
    device = torch.device(params["DEVICE"])

    # create env and add specific configurations to Malmo
    env = make_env(params["DEFAULT_ENV_NAME"])
    env.configure(client_pool=[('127.0.0.1', int(port))]) # fix port
    env.configure(allowDiscreteMovement=["move", "turn"]) # , log_level="INFO")
    env.configure(videoResolution=[84,84])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)

    if random_seed:
        env.seed(random_seed)

    print(colored("Observation Space: ", COLORS[agent_id]), colored(env.observation_space, COLORS[agent_id]))
    print(colored("Action Space: ", COLORS[agent_id]), colored(env.action_space, COLORS[agent_id]))

    # initialize agent
    bufer = ExperienceBufferGridImage(params["REPLAY_SIZE"])
    # buffer = ExperienceBuffer(params["REPLAY_SIZE"])            
Example #14
        print(f"episode score: {total_return}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default='pong')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--cpu', action='store_true')
    parser.add_argument('--evaluate', type=str, default=None)
    parser.add_argument('--resume', type=str, default=None, nargs=2)
    args = parser.parse_args()
    params = HPS[args.task]

    device = torch.device('cpu') if args.cpu else torch.device('cuda')

    env = make_env(params.env_name)
    obs_shape = env.observation_space.shape
    nb_actions = env.action_space.n

    if params.net_type == 'conv':
        net = CategoricalDQN((params.frame_stack, *obs_shape), nb_actions)
    agent = CategoricalDQNAgent(
        net=net,
        nb_actions=nb_actions,
        gamma=params.gamma,
        device=device,
    )

    if args.evaluate:
        agent.net.load_state_dict(torch.load(args.evaluate))
        env = make_env(params.env_name, episodic=False)
Example #15
def DQN_experiment(params, log_dir, random_seed=None):

    # define device on which to run
    device = torch.device(params["DEVICE"])

    # fix replay start size to be equal to replay size
    params["REPLAY_START_SIZE"] = params["REPLAY_SIZE"]

    ## initialize global variables
    # initialize local log trackers 
    log_episodes_count = []
    log_ma_steps = []
    log_md_steps = []
    log_ma_rewards = []
    log_md_rewards = []

    colors=['green','red','blue','yellow','cyan','magenta','grey','white']

    # try several times and average the results to compensate for stochasticity
    for trial in range(params["NUM_TRIALS"]):

        # initialize environment
        agents = []

        # need to be one env per agent
        env = make_env(params["DEFAULT_ENV_NAME"])
        if random_seed:
            env.seed(random_seed)

        # initialize agents
        for idx in range(params["NUM_AGENTS"]):

            # initialize agent
            buffer = ExperienceBuffer(params["REPLAY_SIZE"], env)
            net = DQN(env.observation_space.shape[0], env.action_space.n, params["DEVICE"]).to(device)
            tgt_net = DQN(env.observation_space.shape[0], env.action_space.n, params["DEVICE"]).to(device)
            epsilon = params["EPSILON_START"]
            gamma = params["GAMMA"]
            tau = params["SOFT_UPDATE_TAU"]
            agent = Agent('agent' + str(idx+1), env, buffer, net, tgt_net, gamma, epsilon, tau, trial, log_dir)

            # other variables
            agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
            agent.print_color = colors[idx]

            agents.append(agent)    


        ######### training loop
        ################################

        ts = time.time() # track start time


        ######### 1. Filling the replay buffer
        ################################

        # both agents fill their buffer prior to experience
        for agent in agents:
            while True:
            
                # add frame count
                agent.frame_idx+= 1

                # play step
                episode_over, done_reward = agent.play_step(device=device)
                if params["DEBUG"]: agent.record()

                # check if minimum buffer size has been achieved. if not, move on, do not do learning
                if len(agent.exp_buffer) >= params["REPLAY_START_SIZE"]:
                    agent.reset()
                    break    


        ######### 2. The agents start alternating
        ################################

        episode_start = time.time()        
        ep_count = 0
        # while all agents have not completed:    
        while sum(map(lambda agent:agent.completed, agents)) != len(agents):

            ep_count += 1

            # agents alternate
            for agent in agents:

                ## Before 2 agents perform, act, do one round of experience share
                # given a sharing interval and it is not the first episode
                if params["SHARING"] and ep_count % params["SHARING_INTERVAL"] == 0 and ep_count > 0:

                    # agent 1 requests
                    student, teacher = agents[0], agents[1]
                    transfer_mask = student.request_share(threshold=0)
                    transfer_batch = teacher.exp_buffer.sample_with_mask(student.steps[-1], transfer_mask)
                    student.exp_buffer.extend(transfer_batch)

                    # agent 2 requests
                    student, teacher = agents[1], agents[0]
                    transfer_mask = student.request_share(threshold=0)
                    transfer_batch = teacher.exp_buffer.sample_with_mask(student.steps[-1], transfer_mask)
                    student.exp_buffer.extend(transfer_batch)


                # check if agent has not completed the task already
                # if it does, go to the next agent
                if not agent.completed:

                    # play until episode is over
                    episode_over = False
                    while not episode_over:

                        # add frame count
                        agent.frame_idx+= 1

                        # play step
                        episode_over, done_reward = agent.play_step(device=device)

                        if done_reward is not None:

                            # calculate speed
                            agent.speed = (agent.frame_idx - agent.ts_frame) / (time.time() - ts)
                            agent.ts_frame = agent.frame_idx
                            ts = time.time()

                            # get time between episodes

                            ## verify completion and report metrics
                            if params["INDEPENDENT_EVALUATION"]:

                                if len(agent.total_rewards) % params["TRACKING_INTERVAL"] == 0:
                                    agent.test_rewards = []
                                    evaluation_start = time.time()
                                    for _ in range(100):
                                        done_reward = False
                                        while not done_reward:
                                            _, done_reward = agent.play_step(device=device, test=True)
                                        agent.test_rewards.append(done_reward)
                                    evaluation_time = time.time() - evaluation_start

                                    # only report after one episode ends
                                    agent.mean_reward = np.mean(agent.test_rewards)
                                    agent.std_reward = np.std(agent.test_rewards)

                                    # calculate elapsed time
                                    episode_end = time.time()
                                    episode_speed = params["TRACKING_INTERVAL"] / (episode_end - episode_start) 
                                    episode_start = time.time()

                                    # report
                                    print(colored("%s, %d: done %d episodes, mean reward %.2f, std reward %.2f, eps %.2f, speed %d f/s, ep_speed %.2f e/s, eval_time %.2f s" % (
                                        agent.alias, agent.frame_idx, len(agent.total_rewards), agent.mean_reward, agent.std_reward, agent.epsilon, agent.speed, episode_speed, evaluation_time
                                    ), agent.print_color))
                                    
                                    ## check if reward has improved from last iteration
                                    if agent.mean_reward is not None:
                                        if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                                            print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                            # save final version
                                            # torch.save(agent.net.state_dict(), "weights/" + params["DEFAULT_ENV_NAME"] + "-" + agent.alias + "-best.dat")
                                            # mark as completed
                                            agent.completed = True
                                            # save local log
                                            log_episodes_count[agent.alias].append(len(agent.total_rewards))
                                            log_steps[agent.alias].append(len(agent.total_rewards))

                            ## approach to track evaluation using moving averages:
                            else:
                                # only report after one episode ends
                                agent.mean_reward = np.mean(agent.total_rewards[-params["NUMBER_EPISODES_MEAN"]:])
                                agent.std_reward = np.std(agent.total_rewards[-params["NUMBER_EPISODES_MEAN"]:])

                                # calculate elapsed time
                                episode_end = time.time()
                                episode_speed = 1 / (episode_end - episode_start)
                                episode_start = time.time()

                                # report
                                if len(agent.total_rewards) % params["TRACKING_INTERVAL"] == 0:
                                    print(colored("%s, %d: done %d episodes, mean reward %.2f, std reward %.2f, eps %.2f, speed %d f/s, ep_speed %.2f e/s" % (
                                        agent.alias, agent.frame_idx, len(agent.total_rewards), agent.mean_reward, agent.std_reward, agent.epsilon, agent.speed, episode_speed
                                    ), agent.print_color))
                                
                                ## check if reward has improved from last iteration
                                if agent.mean_reward is not None:
                                    if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                                        print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                        # save final version
                                        # torch.save(agent.net.state_dict(), "weights/" + params["DEFAULT_ENV_NAME"] + "-" + agent.alias + "-best.dat")
                                        # mark as completed
                                        agent.completed = True
                                        # save local log
                                        log_episodes_count.append(len(agent.total_rewards))
                                        log_ma_rewards.append(np.mean(agent.total_rewards[-params["REPORTING_INTERVAL"]:]))
                                        log_md_rewards.append(np.std(agent.total_rewards[-params["REPORTING_INTERVAL"]:]))
                                        log_ma_steps.append(np.mean(agent.total_steps[-params["REPORTING_INTERVAL"]:]))
                                        log_md_steps.append(np.std(agent.total_steps[-params["REPORTING_INTERVAL"]:]))

                        # if no sign of converging, also break
                        # but don't store the result
                        if len(agent.total_rewards) > params["MAX_GAMES_PLAYED"]:
                            agent.completed = True

                        # decay epsilon after the first episodes that fill the buffer
                        # decay epsilon linearly on frames
                        agent.epsilon = max(params["EPSILON_FINAL"], params["EPSILON_START"] - (agent.frame_idx-params["REPLAY_START_SIZE"]) / params["EPSILON_DECAY_LAST_FRAME"])
                            
                        # update at every frame using soft updates
                        if params["SOFT"]:
                            agent.soft_update_target_network()
                        else:                        
                            if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                                agent.tgt_net.load_state_dict(agent.net.state_dict())
                            
                        ## learn
                        # zero gradients
                        agent.optimizer.zero_grad()
                        # sample from buffer
                        batch = agent.exp_buffer.sample(params["BATCH_SIZE"])
                        # calculate loss
                        # decide to leave it on the agent as a static method, instead of floating around
                        loss_t = agent.calc_loss(batch, device=device)
                        # calculate gradients
                        loss_t.backward()
                        # gradient clipping
                        if params["GRADIENT_CLIPPING"]: nn.utils.clip_grad_norm_(net.parameters(), params["GRAD_L2_CLIP"])
                        # optimize
                        agent.optimizer.step()

                        # track agent parameters, including loss function
                        # detach loss before extracting value - not sure if needed, but better safe than sorry
                        if params["DEBUG"]: agent.record(loss_t.detach().item())


    for agent in agents:
        agent.writer.close()

    # return local log with results
    local_log = {
        "episodes_count": log_episodes_count,
        "ma_steps": log_ma_steps,
        "md_steps": log_md_steps,
        "ma_rewards": log_ma_rewards,
        "md_rewards": log_md_rewards
    }
    return local_log
Example #16
from networks import *
from wrappers import make_env
import random

import sys

import pyglet

path = "checkpoints/%s.pt" % sys.argv[1]

q = OCNN()
#q.load_state_dict(torch.load(path,map_location=torch.device('cpu')))
q.eval()
env = gym.make('BreakoutNoFrameskip-v0')
env = make_env(env,
               noop_max=10)  # sometimes it no-ops until the ball is unsaveable

obs = env.reset()
action = 0

kold = None


def update(dt):
    global obs, action, kold
    env.render()
    k = q(torch.cat(obs).unsqueeze(0))
    kk = k.tolist()
    if kold != kk:
        print(*('- '[i > 0] + str(round(abs(i), 3)).ljust(5, '0')
                for i in kk[0]))
Example #17
def main():
    """메인 함수."""
    # 환경 생성
    env = make_env(ENV_NAME)
    device = get_device()
    net = DQN(env.observation_space.shape, env.action_space.n).to(device)
    net.apply(weights_init)
    tgt_net = DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net.load_state_dict(net.state_dict())
    writer = SummaryWriter(comment="-" + ENV_NAME)
    log(net)

    # Initialize ZMQ
    context, act_sock, buf_sock = init_zmq()
    # Wait for user input before starting
    log("Press Enter when the actors are ready: ")
    input()

    # Publish the initial model to start the actors
    log("sending parameters to actors…")
    publish_model(net, tgt_net, act_sock)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    # optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE,
    #                             momentum=0.9)
    # scheduler = ReduceLROnPlateau(optimizer, 'min')

    fps = q_max = 0.0
    p_time = idxs = errors = None
    train_cnt = 1
    max_reward = -1000
    while True:

        # Request a new training batch from the buffer
        log("request new batch {}.".format(train_cnt))
        st = time.time()
        if PRIORITIZED:
            # Send the errors for the previous batch
            payload = pickle.dumps((idxs, errors))
            if errors is not None:
                priority = np.mean(errors)
        else:
            payload = b''
        buf_sock.send(payload)
        payload = buf_sock.recv()
        log("receive batch elapse {:.2f}".format(time.time() - st))

        if payload == b'not enough':
            # Not enough data for a batch yet
            log("not enough data to batch.")
            time.sleep(1)
        else:
            # Train on the batch
            st = time.time()
            train_cnt += 1

            if PRIORITIZED:
                exps, idxs, ainfos, binfo = pickle.loads(payload)
                batch = Experience(*map(np.concatenate, zip(*exps)))
            else:
                batch, ainfos, binfo = pickle.loads(payload)

            loss_t, errors, q_maxs = calc_loss(batch,
                                               net,
                                               tgt_net,
                                               device=device)
            optimizer.zero_grad()
            loss_t.backward()
            # scheduler.step(float(loss_t))
            q_max = q_maxs.mean()

            # gradient clipping (must run before the optimizer step to take effect)
            for param in net.parameters():
                param.grad.data.clamp_(-GRADIENT_CLIP, GRADIENT_CLIP)

            optimizer.step()

            # Update the target network
            if train_cnt % SYNC_TARGET_FREQ == 0:
                log("sync target network")
                log(net.state_dict()['conv.0.weight'][0][0])
                tgt_net.load_state_dict(net.state_dict())

            if train_cnt % SHOW_FREQ == 0:
                # Post to TensorBoard
                # for name, param in net.named_parameters():
                #    writer.add_histogram("learner/" + name,
                #                         param.clone().cpu().data.numpy(),
                #                         train_cnt)
                writer.add_scalar("learner/loss", float(loss_t), train_cnt)
                writer.add_scalar("learner/Qmax", q_max, train_cnt)
                if PRIORITIZED:
                    writer.add_scalar("learner/priority", priority, train_cnt)
                writer.add_scalar("buffer/replay", binfo.replay, train_cnt)
                for ano, ainfo in ainfos.items():
                    writer.add_scalar("actor/{}-reward".format(ano),
                                      ainfo.reward, ainfo.frame)

            # Save the model with the best reward
            _max_reward = np.max([ainfo.reward for ainfo in ainfos.values()])
            if _max_reward > max_reward and train_cnt % SAVE_FREQ == 0:
                log("save best model - reward {:.2f}".format(_max_reward))
                torch.save(net, ENV_NAME + "-best.dat")
                max_reward = _max_reward

        # Publish the model
        if train_cnt % PUBLISH_FREQ == 0:
            publish_model(net, tgt_net, act_sock)

        if p_time is not None:
            elapsed = time.time() - p_time
            fps = 1.0 / elapsed
            log("train elapsed {:.2f} speed {:.2f} f/s".format(elapsed, fps))

        p_time = time.time()

    writer.close()
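Note that per-parameter gradient clamping only affects the update when it runs between loss.backward() and optimizer.step(), as in the corrected ordering above. A minimal self-contained sketch of that ordering (the Linear layer and constants are placeholders, not the example's DQN):

import torch
import torch.nn as nn
import torch.optim as optim

GRADIENT_CLIP = 0.1                      # illustrative value
net = nn.Linear(4, 2)                    # stand-in for the DQN
optimizer = optim.Adam(net.parameters(), lr=1e-4)

loss = net(torch.randn(8, 4)).pow(2).mean()
optimizer.zero_grad()
loss.backward()
for param in net.parameters():           # clamp gradients in place...
    param.grad.data.clamp_(-GRADIENT_CLIP, GRADIENT_CLIP)
optimizer.step()                         # ...before the parameters are updated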
Example #18
            layer.register_forward_hook(hook_fn)
    return visualization


def modify(t, i):
    t[i] += torch.randn_like(t[i]) * 0.05


if __name__ == "__main__":
    path = "checkpoints/%s.pt" % sys.argv[1]

    q = OCNN()
    q.load_state_dict(torch.load(path, map_location=torch.device('cpu')))
    q.eval()
    env = gym.make('BreakoutNoFrameskip-v0')
    env = make_env(env)

    d = hook_layers(q)

    obs = env.reset()
    done = False

    R = 0
    c = 0

    c1 = 0
    c2 = 0
    c3 = 0

    for i in range(1000):
        state = torch.cat(obs).unsqueeze(0)
Example #19
            "target_update": 1000,
            "num_episodes": 1500,
            "batch_size": 32,
            "replay_initial": 10000,
            "capacity": 100000,
            "max_nb_elements": 4,
        },
    }

    params = HYPERPARAMS["breakout"]

    scores, eps_history = [], []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    env = wrappers.make_env(params["env_name"])

    policy_network = model.NoisyDQN(env.observation_space.shape,
                                    env.action_space.n).to(device)
    target_network = ptan.agent.TargetNet(policy_network)
    optimizer = optim.Adam(policy_network.parameters(),
                           lr=params["learning_rate"])

    action_selector = ptan.actions.ArgmaxActionSelector()
    agent = ptan.agent.DQNAgent(policy_network, action_selector, device)

    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=params["gamma"], steps_count=1)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=params["capacity"])
Example #20
def main():

    logger.configure('{}{}_logs'.format(filePath, envName))
    for k, v in C.items():
        logger.record_tabular(k, v)
    logger.dump_tabular()

    logger.log('MsPacman')

    #Start the session
    sess = tf.InteractiveSession()

    train_env = make_env(C['env_id'], C['noop_max'])
    eval_env = make_env(C['env_id'], C['noop_max'])

    #Initialize variables to record outputs
    train_track = [0.0]
    eval_track = []
    best_reward = 0

    train_reward = tf.placeholder(tf.float32)
    eval_reward = tf.placeholder(tf.float32)
    train_env = make_env(C['env_id'], C['noop_max'])
    eval_env = make_env(C['env_id'], C['noop_max'])
    agent = Agent(train_env, C)

    train_fs = reset_fs()
    train_s = train_env.reset()
    best_reward = 0
    train_mean = []
    eval_mean = []

    train_summary = tf.summary.scalar('train_reward', train_reward)
    eval_summary = tf.summary.scalar('eval_reward', eval_reward)
    writer = tf.summary.FileWriter('{}{}_summary'.format(filePath, envName),
                                   sess.graph)
    sess.run(tf.global_variables_initializer())

    agent.net.update_target_network()

    for it in range(C['iterations']):

        train_fs.append(train_s)

        train_a = agent.act(np.transpose(train_fs, (1, 2, 0)))
        ns, train_r, train_d, _ = train_env.step(train_a)
        #print('Iteration ',it, ' Reward ', train_r)
        train_track[-1] += train_r
        agent.record(train_s, train_a, train_r, float(train_d), it)
        train_s = ns

        if train_d:
            if train_env.env.env.was_real_done:  # one env for MsPacman, Freeway (No Fire action)
                if len(train_track) % 100 == 0:
                    mean = np.mean(train_track[-100:])
                    train_mean.append(mean)
                    summary = sess.run(train_summary,
                                       feed_dict={train_reward: mean})
                    writer.add_summary(summary, it)
                    logger.record_tabular('steps', it)
                    logger.record_tabular('episode', len(train_track))
                    logger.record_tabular('epsilon', 100 * agent.epsilon)
                    logger.record_tabular('learning rate', agent.lr)
                    logger.record_tabular('Mean Reward 100 episodes', mean)
                    logger.dump_tabular()
                    with open(resultPath + 'reward_atari_base.pk1', 'wb') as f:
                        pickle.dump(train_track,
                                    f,
                                    protocol=pickle.HIGHEST_PROTOCOL)

                train_track.append(0.0)

            train_fs = reset_fs()
            train_s = train_env.reset()

        if (it + 1) % C['eval_freq'] == 0:

            for i in range(C['eval_episodes']):
                temp_video = []
                eval_track.append(0.0)
                eval_fs = reset_fs()
                eval_s = eval_env.reset()
                while True:
                    temp_video.append(eval_s)
                    eval_fs.append(eval_s)
                    eval_a = agent.greedy_act(np.transpose(eval_fs, (1, 2, 0)))
                    eval_s, eval_r, eval_d, _ = eval_env.step(eval_a)
                    eval_track[-1] += eval_r

                    if eval_env.env.env.was_real_done:
                        break
                    if eval_d:
                        eval_fs = reset_fs()
                        eval_s = eval_env.reset()

                if eval_track[-1] > best_reward:
                    best_reward = eval_track[-1]
                    best_video = temp_video
                    with open(resultPath + 'video_atari_base.pk1', 'wb') as f:
                        pickle.dump(best_video,
                                    f,
                                    protocol=pickle.HIGHEST_PROTOCOL)

            eval_mean.append(np.mean(eval_track[-C['eval_episodes']:]))
            summary = sess.run(eval_summary,
                               feed_dict={
                                   eval_reward:
                                   np.mean(eval_track[-C['eval_episodes']:])
                               })
            writer.add_summary(summary, it)

        if it == 1000000:
            outputs = agent.net.get_outputs(np.transpose(train_fs, (1, 2, 0)))
            with open(resultPath + 'outputs.pk1', 'wb') as f:
                pickle.dump(outputs, f, protocol=pickle.HIGHEST_PROTOCOL)
            with open(resultPath + 'outputs_screen.pk1', 'wb') as f:
                pickle.dump(train_fs, f, protocol=pickle.HIGHEST_PROTOCOL)

    with open(resultPath + 'reward_atari_base.pk1', 'wb') as f:
        pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(resultPath + 'trainMean_atari_base.pk1', 'wb') as f:
        pickle.dump(train_mean, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(resultPath + 'evalMean_atari_base.pk1', 'wb') as f:
        pickle.dump(eval_mean, f, protocol=pickle.HIGHEST_PROTOCOL)
    agent.net.save(filePath + '{}_model2'.format(C['env_id']))
    sess.close()
Example #21
def main():

  #Adding configuration file details into logger
  logger.configure('{}{}_logs'.format(filePath, envName))
  for k, v in C.items():
    logger.record_tabular(k, v)
  logger.dump_tabular()        

  logger.log('Practice DQN with Dense 512') 

  sess = tf.InteractiveSession()
  train_env = make_env(C['env_id'], C['noop_max'])
  eval_env = make_env(C['env_id'], C['noop_max'])
  train_s = train_env.reset()
  agent = Agent(train_env, C)

  train_reward = tf.placeholder(tf.float32)
  eval_reward = tf.placeholder(tf.float32)
  train_summary = tf.summary.scalar('train_reward', train_reward)
  eval_summary = tf.summary.scalar('eval_reward', eval_reward)
  writer = tf.summary.FileWriter('{}{}_summary'.format(filePath, envName), sess.graph)

  sess.run(tf.global_variables_initializer())

  #Practice
  for it in range(C['pre_iterations']):
    train_a = agent.act_pre()
    ns, train_r, train_d, _ = train_env.step(train_a)
    agent.record(train_s, train_a, train_r, float(train_d), it, True)
    train_s = ns
    if train_d:
      train_s = train_env.reset()
 
  logger.log('Pre-training completed')

  #Initializing Online RL training network 
  agent.net.initialize_online_network()
  train_track = [0.0]
  eval_track = []
  best_reward = 0
  
  train_fs = reset_fs()
  train_s = train_env.reset()
  best_reward = 0
  train_mean = []
  eval_mean = []
  
  
  agent.net.update_target_network()
  
  #RL training
  for it in range(C['iterations']):
    
    train_fs.append(train_s)
    
    train_a = agent.act(np.transpose(train_fs, (1,2,0)))
    ns, train_r, train_d, _ = train_env.step(train_a)
    train_track[-1]+= train_r
    agent.record(train_s, train_a, train_r, float(train_d), it, False)
    train_s = ns

    if train_d:
      if train_env.env.env.was_real_done:
        if len(train_track) % 100 == 0:

          #records statistics to logger and tensorboard
          train_mean.append(np.mean(train_track[-100:]))
          summary = sess.run(train_summary, feed_dict={train_reward:np.mean(train_track[-100:])})
          writer.add_summary(summary, it)
          logger.record_tabular('steps', it)
          logger.record_tabular('episode', len(train_track))
          logger.record_tabular('epsilon', 100*agent.epsilon)
          logger.record_tabular('learning rate', agent.lr)
          logger.record_tabular('Mean Reward 100 episodes', np.mean(train_track[-100:]))
          logger.dump_tabular()
          with open(resultPath + 'reward_atari_practice.pk1', 'wb') as f:
              pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL)
        train_track.append(0.0)
      
      train_fs = reset_fs()
      train_s = train_env.reset()
        
    #Evaluation
    if (it+1)%C['eval_freq'] == 0:
        
      for i in range(C['eval_episodes']):
        temp_video = []
        eval_track.append(0.0)
        eval_fs = reset_fs()
        eval_s = eval_env.reset()
        while True:
          temp_video.append(eval_s)
          eval_fs.append(eval_s)
          eval_a = agent.greedy_act(np.transpose(eval_fs, (1,2,0)))
          eval_s, eval_r, eval_d, _ = eval_env.step(eval_a)
          eval_track[-1] += eval_r
          if eval_env.env.env.was_real_done:
            break
          if eval_d:
            eval_fs = reset_fs()
            eval_s = eval_env.reset()
                
        if eval_track[-1] > best_reward:
          best_reward = eval_track[-1]
          best_video = temp_video
          with open(resultPath + 'video_atari_practice.pk1', 'wb') as f:
              pickle.dump(best_video, f, protocol=pickle.HIGHEST_PROTOCOL)
              
      eval_mean.append(np.mean(eval_track[-C['eval_episodes']:]))
      logger.log('Evaluate mean reward: {:.2f}, max reward: {:.2f}, std: {:.2f}'.format(np.mean(eval_track[-C['eval_episodes']:]), np.max(eval_track[-C['eval_episodes']:]), np.std(eval_track[-C['eval_episodes']:])))
      summary = sess.run(eval_summary, feed_dict={eval_reward:np.mean(eval_track[-C['eval_episodes']:])})
      writer.add_summary(summary, it)
      with open(resultPath + 'eval_reward_atari_practice.pk1', 'wb') as f:
          pickle.dump(eval_track, f, protocol=pickle.HIGHEST_PROTOCOL)      

    #Storing current state and outputs from Convolution layers
    if it%1000000 == 0:
      outputs = agent.net.get_outputs(np.transpose(train_fs, (1,2,0)))
      with open(resultPath+str(it)+'outputs.pk1', 'wb') as f:
        pickle.dump(outputs, f, protocol=pickle.HIGHEST_PROTOCOL)
      with open(resultPath+str(it)+'outputs_screen.pk1', 'wb') as f:
        pickle.dump(train_fs, f, protocol=pickle.HIGHEST_PROTOCOL)

  #Storing required outputs as pickle files        
  with open(resultPath + 'reward_atari_practice.pk1', 'wb') as f:
    pickle.dump(train_track, f, protocol=pickle.HIGHEST_PROTOCOL)
  with open(resultPath + 'trainMean_atari_practice.pk1', 'wb') as f:
    pickle.dump(train_mean, f, protocol=pickle.HIGHEST_PROTOCOL)
  with open(resultPath+ 'evalMean_atari_practice.pk1', 'wb') as f:
    pickle.dump(eval_mean, f, protocol=pickle.HIGHEST_PROTOCOL)        
  agent.net.save(filePath + '{}_model2'.format(C['env_id']))
  sess.close()
Example #22
def train(params, log_dir, local_log, random_seed, trial):

    # define device on which to run
    device = torch.device(params["DEVICE"])

    # create env and add specific configurations to Malmo
    env = make_env(params["DEFAULT_ENV_NAME"])
    # port = int('1000'+str(aid))
    # env.configure(client_pool=[('127.0.0.1', port)])
    # env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001), ('127.0.0.1', 10002)])
    env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001)])
    env.configure(allowDiscreteMovement=["move", "turn"]) # , log_level="INFO")
    env.configure(videoResolution=[84,84])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)

    if random_seed:
        env.seed(random_seed)

    print("Observation Space: ", env.observation_space)
    print("Action Space: ", env.action_space)

    agents = []
    for aid in range(params["NUM_AGENTS"]):

        # initialize buffer
        if params["SHARING"] and params["PRIORITIZED_SHARING"]:
            bufer = ExperienceBufferGridImage(params["REPLAY_SIZE"])
        else:
            bufer = ExperienceBuffer(params["REPLAY_SIZE"])            

        # initialize agent        
        net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
        tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
        epsilon = params["EPSILON_START"]
        gamma = params["GAMMA"]
        tau = params["SOFT_UPDATE_TAU"]
        agent = Agent('agent' + str(aid), env, bufer, net, tgt_net, gamma, epsilon, tau, 
            trial, log_dir, params)

        # other variables
        agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
        agent.print_color = COLORS[aid]

        local_log[agent.alias+"-"+str(trial)] = {"rewards": [],"steps": []}

        # fill buffer with initial size - don't count these episodes
        agent.fill_buffer()

        agents.append(agent)

    # training loop
    ep_count = 0
    while sum(map(lambda agent:agent.completed, agents)) != len(agents):

        # overall count of episodes
        ep_count += 1

        # sharing
        if params["SHARING"] and ep_count % params["SHARING_INTERVAL"] == 0 and ep_count > 0:
            if params["PRIORITIZED_SHARING"]:
                share(agents, params["BATCH_SIZE_TRANSFER"], params["REPLAY_START_SIZE"], params["SHARING_THRESHOLD"])
            else:
                share_no_mask(agents, params["BATCH_SIZE_TRANSFER"], params["REPLAY_START_SIZE"])

        # each agent does one episode
        for agent in agents:

            ## Before 2 agents perform, act, do one round of experience share
            # given a sharing interval and it is not the first episode
            if not agent.completed:

                episode_over = False
                episode_start = time.time()        
                while not episode_over:

                    # play step
                    frame_start = time.time()
                    episode_over, done_reward = agent.play_step(device=device)
                    agent.frame_idx+= 1

                    #### Following methods run on an episode basis
                    if done_reward is not None:

                        # calculate episode speed
                        agent.ep_speed = 1 / (time.time() - episode_start)
                        # reset trackers
                        episode_start = time.time()

                        # save to local log as well
                        local_log[agent.alias+"-"+str(trial)]["rewards"].append(agent.total_rewards[-1])
                        local_log[agent.alias+"-"+str(trial)]["steps"].append(agent.total_steps[-1])

                        if params["INDEPENDENT_EVALUATION"]:
                            offline_evaluation(params, agent)
                        else:
                            online_evaluation(params, agent)

                        ## check if problem has been solved
                        # need a minimum number of episodes to evaluate
                        if len(agent.total_rewards) >= params["NUMBER_EPISODES_MEAN"]:                                
                            # and mean reward has to go above boundary
                            if agent.mean_reward >= params["MEAN_REWARD_BOUND"]:
                                    print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color))
                                    agent.completed = True

                        # if no sign of converging, also break
                        if len(agent.total_rewards) >= params["MAX_GAMES_PLAYED"]:
                            agent.completed = True

                    #### Following methods run on a frame basis
                    # decay epsilon linearly on frames
                    agent.epsilon = max(params["EPSILON_FINAL"], params["EPSILON_START"] - \
                        agent.frame_idx / params["EPSILON_DECAY_LAST_FRAME"])
                    
                    # update at every frame using soft updates
                    if params["SOFT"]:
                        agent.soft_update_target_network()
                    # or hard updates
                    else:
                        if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                            agent.hard_update_target_network()
                    
                    ## learn
                    loss_t = agent.learn(device)

                    # record
                    agent.frame_speed = 1 / (time.time() - frame_start)
                    if params["DEBUG"]: 
                        agent.record_frame(loss_t.detach().item()) # detach required?


    # del bufer to force gc later, occupies too much memory
    del bufer
    for agent in agents:
        del agent.exp_buffer
    # closes tensorboard writer
    agent.writer.close()
Example #23
    lr = 0.001
    gamma = 0.99
    eps_start = 1
    eps_end = 0.01
    eps_decay = 0.001
    target_update = 10
    num_episodes = 1000
    batch_size = 256
    capacity = 1000000
    max_nb_elements = 4
    scores, eps_history = [], []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    env = wrappers.make_env("Breakout-v0")

    strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)

    agent = Agent(env.action_space.n, strategy, device)
    memory = ReplayMemory(capacity)

    policy_network = DQN(env.action_space.n, lr).to(device)
    target_network = DQN(env.action_space.n, lr).to(device)

    target_network.load_state_dict(policy_network.state_dict())
    target_network.eval()

    optimizer = optim.Adam(params=policy_network.parameters(), lr=lr)

    for episode in range(num_episodes):
Example #24
File: run.py, Project: UT-ECLAIR/RL
update_freq = args.update_freq
epsilon_start = args.epsilon_start
epsilon_final = args.epsilon_final
epsilon_decay = args.epsilon_decay
lr = args.lr
epsilon = lambda frame_num: max(
    epsilon_start - (epsilon_start - epsilon_final) *
    (frame_num / epsilon_decay), epsilon_final)

load_model = args.load_model
checkpoint_save = args.checkpoint_save
log = args.log

# make the environment
env = wrappers.make_env(game,
                        clip_rewards=clip_rewards,
                        frame_stack=frame_stack,
                        frame_skip=frame_skip)

# Model settings
output_dim = env.action_space.n
input_dims = (frame_stack, 84, 84)

# make the agent
agent = agent.Agent(memory_capacity, gamma, input_dims, output_dim, lr)

if load_model:
    print(load_model)
    agent.load_variables(direc=load_model,
                         copy_model_to_target=True,
                         load_mem=True)
Example #25
env = gym.make("Pong-ram-v4")
env = env.unwrapped
dqn = DQN(env, inputlen, cnn, fc, gamma = 0.9, learning_rate = 0.0001,
          epoch = 100000, replay = 10000, update_round = 1000, render = 0)
'''

#Pong CNN
inputlen = 4
cnn = [
    (32, 8, 0, 4, 1, 0),
    (64, 4, 0, 2, 1, 0),
    (64, 3, 0, 1, 1, 0),
]
n_atoms = 51
fc = [7 * 7 * 64, 1000, 6 * n_atoms]
env = wrappers.make_env('PongNoFrameskip-v4')

dqn = DQN(env,
          inputlen,
          cnn,
          fc,
          gamma=0.99,
          learning_rate=0.0001,
          eps=[1, 0.00001, 0.02],
          epoch=100000,
          replay=10000,
          update_round=1000,
          render=-1,
          batch_size=32,
          n_atoms=n_atoms,
          value_min=-21,
Example #26
    state_action_values = net(states_v).gather(
        1, actions_v.unsqueeze(-1)).squeeze(-1)
    next_state_actions = net(next_states_v).max(1)[1]
    next_state_values = tgt_net(next_states_v).gather(
        1, next_state_actions.unsqueeze(-1)).squeeze(-1)
    #next_state_values = tgt_net(next_states_v).max(1)[0]
    next_state_values[done_mask] = 0.0
    next_state_values = next_state_values.detach()

    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)


if __name__ == "__main__":

    env = wrappers.make_env(ENV_NAME)

    device = torch.device("cpu")

    net = dqn_model.DuelingDQN(env.observation_space.shape,
                               env.action_space.n).to(device)
    tgt_net = dqn_model.DuelingDQN(env.observation_space.shape,
                                   env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + ENV_NAME)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
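The loss function at the top of this example builds a Double DQN target: the online network selects the greedy next action and the target network evaluates it (the commented-out line shows the plain DQN alternative). A minimal sketch of just that target computation, using stand-in linear layers and random tensors rather than the example's dueling networks:

import torch
import torch.nn as nn

GAMMA = 0.99
net, tgt_net = nn.Linear(4, 3), nn.Linear(4, 3)     # stand-ins for the dueling networks
next_states_v = torch.randn(8, 4)
rewards_v = torch.randn(8)
done_mask = torch.zeros(8, dtype=torch.bool)

next_actions = net(next_states_v).max(1)[1]                      # online net picks the actions
next_values = tgt_net(next_states_v).gather(
    1, next_actions.unsqueeze(-1)).squeeze(-1)                   # target net evaluates them
next_values[done_mask] = 0.0
expected_values = rewards_v + GAMMA * next_values.detach()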
Example #27
def train(params, log_dir, local_log, random_seed, trial, agent_id):

    # define device on which to run
    device = torch.device(params["DEVICE"])

    # create env and add specific configurations to Malmo
    env = make_env(params["DEFAULT_ENV_NAME"])
    env.configure(client_pool=[('127.0.0.1', 10000), ('127.0.0.1', 10001)])
    env.configure(allowDiscreteMovement=["move", "turn"]) # , log_level="INFO")
    env.configure(videoResolution=[84,84])
    env.configure(stack_frames=4)
    env = wrap_env_malmo(env)

    if random_seed:
        env.seed(random_seed)

    print("Observation Space: ", env.observation_space)
    print("Action Space: ", env.action_space)

    # initialize agent
    bufer = ExperienceBuffer(params["REPLAY_SIZE"])
    # buffer = ExperienceBuffer(params["REPLAY_SIZE"])            
    net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
    tgt_net = DQN(env.observation_space.shape, env.action_space.n, params["DEVICE"]).to(device)
    epsilon = params["EPSILON_START"]
    gamma = params["GAMMA"]
    tau = params["SOFT_UPDATE_TAU"]
    agent = Agent('agent' + str(agent_id), env, bufer, net, tgt_net, gamma, epsilon, tau, 
        trial, log_dir, params)

    # other variables
    agent.optimizer = optim.Adam(agent.net.parameters(), lr=params["LEARNING_RATE"])
    agent.print_color = COLORS[agent_id]

    local_log[agent.alias+"-"+str(trial)] = {"rewards": [],"steps": []}

    # fill buffer with initial size - don't count these episodes
    agent.fill_buffer()

    # training loop
    ep_count = 0
    while not agent.completed:

        ep_count += 1

        episode_over = False
        episode_start = time.time()        
        while not episode_over:

            # play step
            frame_start = time.time()
            episode_over, done_reward = agent.play_step(device=device)
            agent.frame_idx+= 1

            #### Following methods run on an episode basis
            if done_reward is not None:

                # calculate episode speed
                agent.ep_speed = time.time() - episode_start
                # reset trackers
                episode_start = time.time()

                # save to local log as well
                local_log[agent.alias+"-"+str(trial)]["rewards"].append(agent.total_rewards[-1])
                local_log[agent.alias+"-"+str(trial)]["steps"].append(agent.total_steps[-1])

                if params["INDEPENDENT_EVALUATION"]:
                    offline_evaluation(params, agent)
                else:
                    online_evaluation(params, agent)

                ## check if problem has been solved
                if agent.mean_reward is not None:
                    if agent.mean_reward > params["MEAN_REWARD_BOUND"]:
                        print(colored("%s solved in %d episodes!" % (agent.alias, len(agent.total_rewards)), agent.print_color))
                        agent.completed = True

                # if no sign of converging, also break
                if len(agent.total_rewards) >= params["MAX_GAMES_PLAYED"]:
                    agent.completed = True

            #### Following methods run on a frame basis
            # decay epsilon linearly on frames
            agent.epsilon = max(
                params["EPSILON_FINAL"],
                params["EPSILON_START"]
                - (agent.frame_idx - params["REPLAY_START_SIZE"]) / params["EPSILON_DECAY_LAST_FRAME"])
            
            # update at every frame using soft updates
            if params["SOFT"]:
                agent.soft_update_target_network()
            # or hard updates
            else:
                if agent.frame_idx % params["SYNC_TARGET_FRAMES"] == 0:
                    agent.hard_update_target_network()
            
            ## learn
            loss_t = agent.learn(device)

            # record
            agent.frame_speed = 1000 / (time.time() - frame_start)
            if params["DEBUG"]: 
                agent.record_frame(loss_t.detach().item())  # .item() already returns a detached Python float; .detach() is redundant but harmless


    # delete the replay buffer explicitly so it can be garbage-collected; it uses a lot of memory
    del buffer
    # closes tensorboard writer
    agent.writer.close()
Example No. 28
def train(env_name='PongNoFrameskip-v4',
          gamma=0.99,
          batch_size=32,
          replay_size=1000000,
          replay_start_size=50000,
          learning_rate=0.00025,
          adam_epsilon=0.0000001,
          sync_target_frames=10000,
          epsilon_decay_last_frame=1000000,
          epsilon_start=1.0,
          epsilon_final=0.1,
          train_frames=50000000,
          train_rewards=495,
          n_steps=3,
          save_checkpoints=True,
          run_name=None,
          use_double=True,
          use_dense=None,
          dueling=False,
          priority_replay=None,
          categorical=None,
          record=False,
          random_seed=None,
          index=0):
    n_atoms = v_min = v_max = None
    use_categorical = False
    if categorical is not None:
        use_categorical = True
        n_atoms = categorical['n_atoms']
        v_min = categorical['v'][0]
        v_max = categorical['v'][1]

    alpha = beta = None
    use_priority_replay = False
    if priority_replay is not None:
        use_priority_replay = True
        alpha = priority_replay['alpha']
        beta = priority_replay['beta']

    print(f'Training DQN on {env_name} environment')
    print(f'Params: gamma:{gamma}, batch_size:{batch_size}, replay_size:{replay_size}')
    print(f'        replay_start_size: {replay_start_size}, learning_rate:{learning_rate}')
    print(f'        sync_target_frames: {sync_target_frames}, epsilon_decay_last_frame:{epsilon_decay_last_frame}')
    print(f'        epsilon_start: {epsilon_start}, epsilon_final: {epsilon_final}, train_frames: {train_frames}')
    print(f'        train_rewards: {train_rewards}, n_steps: {n_steps}, save_checkpoints: {save_checkpoints}')
    print(f'        run_name: {run_name}, use_double: {use_double}, use_dense: {use_dense}, dueling: {dueling}')
    if use_categorical:
        print(f'        categorical - n_atoms: {n_atoms}, v_min: {v_min}, v_max: {v_max}')
    if use_priority_replay:
        print(f'        priority buffer - alpha: {alpha} beta: {beta}')
    print(f'        random_seed: {random_seed}, index: {index}')
    f_name = env_name + "_" + run_name if run_name is not None else env_name
    env = wrappers.make_env(env_name, record, f_name)
    if random_seed is not None:
        tf.random.set_seed(random_seed)
        env.seed(random_seed)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=adam_epsilon)
    agent = Agent(env, replay_size, optimizer, batch_size, n_steps, gamma, use_double, use_dense, dueling,
                  use_categorical, n_atoms, v_min, v_max, train_frames if train_frames is not None else 5000000)
    if save_checkpoints:
        agent.load_checkpoint(f'checkpoints/{f_name}/checkpoint')

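    # training bookkeeping: episode rewards, frame counter, and speed/timestamp trackers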
    total_rewards = []
    rewards_mean_std = []
    frame_idx = 0
    count = 0
    update_count = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None

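    # main loop: one environment step per iteration; learning starts once the buffer is filled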
    while True:
        frame_idx += 1
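        # decay epsilon linearly from epsilon_start to epsilon_final over epsilon_decay_last_frame frames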
        epsilon = max(epsilon_final, epsilon_start - frame_idx / epsilon_decay_last_frame)

        reward = agent.play_step(epsilon)
        if reward is not None:
            count += 1
            total_rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            mean_reward = np.mean(total_rewards[-100:])
            print(f'{index}:{frame_idx}: done {count} games, mean reward: {mean_reward}, eps {epsilon}, speed: {speed}')
            if best_mean_reward is None or best_mean_reward < mean_reward:
                # Save network
                if best_mean_reward is not None:
                    if save_checkpoints:
                        agent.save_checkpoint(f'./checkpoints/{f_name}/checkpoint')
                    print(f'Best mean reward updated {best_mean_reward} -> {mean_reward}, model saved')
                best_mean_reward = mean_reward
            if train_frames is not None:
                if frame_idx >= train_frames:
                    print(f'Trained for {frame_idx} frames. Done.')
                    break
            if train_rewards is not None:
                if mean_reward >= train_rewards:
                    print(f'Reached reward: {mean_reward}. Done.')
                    break

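        # skip learning until the replay buffer holds at least replay_start_size transitions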
        if agent.buffer_size() < replay_start_size:
            continue

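        # periodically copy the online-network weights into the target network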
        if frame_idx % sync_target_frames == 0:
            agent.sync_weights()
        agent.step(gamma, update_count % 1000 == 0)
        update_count += 1
        rewards_mean_std.append({'reward': total_rewards[-1],
                                 'step': update_count})
    env.close()
    plot.directory_check('./plots')
    plot.plot(rewards_mean_std, f'./plots/{f_name}.png', f_name)
    plot.directory_check('./data')
    plot.save(rewards_mean_std, f'./data/{f_name}.csv')
Example No. 29
    return nn.MSELoss()(Qs, Qtarget)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--cuda',
                        default=False,
                        action='store_true',
                        help='Enable CUDA')
    parser.add_argument('--env', default=DEFAULT_ENV_NAME)
    parser.add_argument('--reward', type=float, default=MEAN_REWARD_BOUND,
                        help='Mean reward bound at which training stops')
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = wrappers.make_env(args.env)

    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    target_net = dqn_model.DQN(env.observation_space.shape,
                               env.action_space.n).to(device)
    writer = SummaryWriter(comment='-' + args.env)
    print(net)

    buffer = ReplayBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
Example No. 30
def train(args):
    print(args)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.return_function == "GAE":
        return_function = GAE
    elif args.return_function == "Q":
        return_function = Q
    elif args.return_function == "A":
        return_function = A
    else:
        raise ValueError(f"Unknown return function: {args.return_function}")

    MONTE_CARLO = (args.num_steps == 200)

    envs = SubprocVecEnv(
        [make_env(args.env, i + args.num_envs) for i in range(args.num_envs)],
        MONTE_CARLO)
    test_env = gym.make(args.env)
    test_env.seed(args.seed + args.num_envs)
    policy = ActorCriticMLP(input_dim=envs.observation_space.shape[0],
                            n_acts=envs.action_space.n)
    optim = torch.optim.Adam(params=policy.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)

    test_rewards = []
    steps = 1

    obs = torch.from_numpy(envs.reset())
    while steps < args.max_steps:
        logp_actions = []
        state_values = []
        rewards = []
        masks = []

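        # collect a rollout of up to num_steps transitions from the parallel environments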
        for _ in range(args.num_steps):
            probs, state_value = policy.forward(obs)
            dist = Categorical(probs)
            action = dist.sample()

            obs, reward, done, _ = envs.step(action.numpy())

            logp_actions.append(dist.log_prob(action).unsqueeze(1))
            state_values.append(state_value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1))
            obs = torch.from_numpy(obs)
            steps += 1

            if steps % args.test_every == 0:
                test_reward = np.mean(
                    [test(test_env, policy) for _ in range(10)])
                test_rewards.append(test_reward)
                print(f"Running reward at timestep {steps}: and {test_reward}")

            if (1 - done).sum() == 0:
                break

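        # bootstrap the value of the last state unless every environment has finished its episode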
        next_value = 0
        if (1 - done).sum() != 0:
            _, next_value = policy(obs)

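        # compute returns with the selected return function and take one policy-gradient step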
        returns = return_function(next_value, rewards, masks, state_values,
                                  args)
        loss = policy_gradient(logp_actions, returns)

        optim.zero_grad()
        loss.backward()

        optim.step()
        # if monte carlo, we need to reset the environment by hand
        if MONTE_CARLO:
            obs = torch.from_numpy(envs.reset())
    return test_rewards