Example #1
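Constructor of a DQN trainer: it builds the wrapped environment, sets the usual hyperparameters, and creates the replay buffer, online and target networks, Adam optimizer, and a TensorBoard writer.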
    def __init__(self, env_name):
        self.device = torch.device('cuda')
        self.env_name = env_name
        self.env = wrappers.make_env(env_name)

        self.gamma = 0.99
        self.batch_size = 32
        self.replay_buffer_size = 10000
        self.replay_start_size = 10000
        self.learning_rate = 1e-4
        self.update_target_interval = 1000

        self.epsilon_start = 1.0
        self.epsilon_end = 0.02
        self.epsilon_period = 100000

        self.reward_bound = 19.5

        self.replay_buffer = replay_buffer.ReplayBuffer(
            self.replay_buffer_size)
        self.network = dqn_model.DQNModel(self.env.observation_space.shape,
                                          self.env.action_space.n).to(
                                              self.device)
        self.target_network = dqn_model.DQNModel(
            self.env.observation_space.shape,
            self.env.action_space.n).to(self.device)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learning_rate)

        print(self.network)

        self.writer = SummaryWriter(comment='dqn' + self.env_name)

        self.total_rewards = []
        self.frame_index = 0
Example #2
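A main entry point that parses --cuda and --env, builds the environment and the online and target DQNs, sets up the experience buffer, agent, and optimizer, and then hands everything to train().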
def main(to_train, save_path):
    torch.manual_seed(1234)
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Enable cuda computation")
    parser.add_argument("--env",
                        default=DEFAULT_ENV_NAME,
                        help="default env name")
    args = parser.parse_args()
    device = torch.device(
        "cuda" if args.cuda and torch.cuda.is_available() else "cpu")

    os.makedirs(save_path, exist_ok=True)
    env = wrappers.make_env(args.env)
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    target_net = dqn_model.DQN(env.observation_space.shape,
                               env.action_space.n).to(device)
    print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(),
                           lr=LEARNING_RATE)  # only need one optimizer

    if to_train:
        train(env, net, target_net, buffer, agent, optimizer, device,
              save_path)
Example #3
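A constructor using hyperparameters close to the original DQN paper: RMSprop with a 1M-transition replay buffer, a target network synced every 10000 steps, training every 4 frames, and a SmoothL1 (Huber) loss.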
    def __init__(self, env_name):
        self.env = wrappers.make_env(env_name)

        self.env_name = env_name
        self.device = torch.device('cuda')

        self.learning_rate = 2.5e-4
        self.stabilizer = 0.01
        self.gradient_momentum = 0.95
        self.gamma = 0.99
        self.batch_size = 32
        self.replay_start_size = 50000
        self.replay_buffer_size = 1000000
        self.update_target_interval = 10000
        self.training_frequency = 4

        self.epsilon_start = 1.0
        self.epsilon_end = 0.05
        self.epsilon_period = 1000000

        self.network = dqn_model.DQNModel(self.env.observation_space.shape, self.env.action_space.n).to(self.device)
        self.target_network = dqn_model.DQNModel(self.env.observation_space.shape, self.env.action_space.n).to(self.device)
        self.replay_buffer = replay_buffer.ReplayBuffer(self.replay_buffer_size)
        self.optimizer = optim.RMSprop(self.network.parameters(),
                                       lr=self.learning_rate,
                                       alpha=self.gradient_momentum,
                                       eps=self.stabilizer)
        self.loss_criterion = nn.SmoothL1Loss()

        print(self.network)

        self.writer = SummaryWriter(comment='dqn' + self.env_name)
        self.total_rewards = []
        self.best_mean_reward = None

        self.steps = 0
Example #4
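Constructor of a combined agent: Dueling DQN networks paired with an n-step prioritized replay buffer (alpha/beta schedule), with explicit seeding of PyTorch, NumPy, and the environment for reproducibility.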
    def __init__(self, env_name):
        self.env = wrappers.make_env(env_name)
        self.env_name = env_name
        self.device = torch.device('cuda')
        self.rollout_length = 2

        torch.manual_seed(2)
        np.random.seed(2)
        self.env.seed(2)
        self.env.action_space.seed(2)
        self.env.observation_space.seed(2)

        self.learning_rate = 1e-4
        self.gamma = 0.99
        self.batch_size = 32
        self.replay_start_size = 10000
        self.replay_buffer_size = 10000
        self.update_target_interval = 1000

        self.epsilon_start = 1.0
        self.epsilon_end = 0.02
        self.epsilon_period = 100000

        self.alpha = 0.6
        self.beta_start = 0.4
        self.beta_period = 100000
        self.beta = self.beta_start

        self.network = dqn_model.DuelingDQNModel(
            self.env.observation_space.shape,
            self.env.action_space.n).to(self.device)
        self.target_network = dqn_model.DuelingDQNModel(
            self.env.observation_space.shape,
            self.env.action_space.n).to(self.device)
        self.replay_buffer = replay_buffer.NStepPriorityBuffer(
            self.replay_buffer_size, self.rollout_length, self.gamma,
            self.alpha)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learning_rate)

        print(self.network)

        self.writer = SummaryWriter(comment='combineddqn' + self.env_name)
        self.total_rewards = []
        self.best_mean_reward = None

        self.steps = 0
Example #5
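Constructor of a NoisyNet DQN variant; exploration comes from the noisy layers, so no epsilon schedule appears in this fragment.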
    def __init__(self, env_name):
        self.env = wrappers.make_env(env_name)
        self.device = torch.device('cuda')
        self.env_name = env_name

        torch.manual_seed(2)
        np.random.seed(2)
        self.env.seed(2)
        self.env.action_space.seed(2)
        self.env.observation_space.seed(2)

        self.replay_buffer_size = 10000
        self.replay_start_size = 10000
        self.update_interval = 1000
        self.learning_rate = 1e-4
        self.gamma = 0.99
        self.batch_size = 32

        n_actions = self.env.action_space.n
        input_shape = self.env.observation_space.shape
        self.network = dqn_model.DQNNoisyModel(input_shape,
                                               n_actions).to(self.device)
        self.target_network = dqn_model.DQNNoisyModel(
            input_shape, n_actions).to(self.device)
        self.replay_buffer = replay_buffer.ReplayBuffer(
            self.replay_buffer_size)
        self.optimizer = optim.Adam(self.network.parameters(),
                                    lr=self.learning_rate)

        print(self.network)

        self.writer = SummaryWriter(comment='noisy_dqn' + env_name)
        self.total_rewards = []
        self.best_mean_reward = None

        self.steps = 0
Example #6
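The tail of a loss function that forms the Bellman target and returns the MSE against the predicted Q-values, followed by the script setup: argument parsing, environment, networks, buffer, agent, and optimizer.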
    expected_state_action_values = next_state_values * GAMMA + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)


if __name__ == "__main__":
    mkdir('.', 'checkpoints')
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda")
    parser.add_argument("--env", default=DEFAULT_ENV_NAME,
                        help="Name of the environment, default=" + DEFAULT_ENV_NAME)
    parser.add_argument("--reward", type=float, default=MEAN_REWARD_GOAL,
                        help="Mean reward goal to stop training, default=%.2f" % MEAN_REWARD_GOAL)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    env = wrappers.make_env(args.env)

    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    writer = SummaryWriter(comment="-" + args.env)
    print(net)

    buffer = ExperienceBuffer(REPLAY_BUFFER_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
Example #7
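An evaluation harness that lets a random policy play MsPacman: it clears a bug log, resets the environment for each game, steps through the intro with no-op actions, and then tracks bug flags.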
            print(s)
            return True
    return False

# **********************************************************************************************************************
# *                                                   1000 episodes                                                    *
# **********************************************************************************************************************


if __name__ == "__main__":
    print('\n\n*******************************************************')
    print("* Random model's playing 1000 episodes of MsPacman... *")
    print('*******************************************************\n')

    DEFAULT_ENV_NAME = "MsPacmanNoFrameskip-v4"
    env = wrappers.make_env(DEFAULT_ENV_NAME)
    f = open('bug_log_Random.txt', 'w+')
    f.close()

    for game in range(10):

        state = env.reset()
        total_reward = 0.0
        # wait for the game to start
        for i in range(65):
            state, reward, is_done, _ = env.step(0)

        bug_flags = [False, False, False, False]
        count_actions = 0
        while True:
Example #8
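A complete training loop: epsilon decay (or a fixed epsilon), playing one step per frame with an optional FPS cap, logging mean reward and speed, saving the best model, syncing the target network, and optimizing on sampled batches.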
def main(cuda: bool, env_name: str, reward_stop: float, render: bool,
         weights_fn: str, fps: float, epsilon_fixed: float, no_learn: bool):
    device = torch.device("cuda" if cuda else "cpu")
    # create environment
    env: gym.Env = wrappers.make_env(env_name)

    # create both neural networks
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape,
                            env.action_space.n).to(device)

    if weights_fn:
        assert os.path.isfile(weights_fn), "File {0} does not exist.".format(
            weights_fn)
        state_dict = torch.load(weights_fn, map_location=device)
        net.load_state_dict(state_dict)
        tgt_net.load_state_dict(state_dict)

    # create summary writer for tensorboard
    writer = SummaryWriter(comment="-" + env_name)

    # create buffer and agent and init epsilon
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer, render=render)

    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards: List[float] = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward: Optional[float] = None

    while True:
        frame_idx += 1
        # update epsilon
        if epsilon_fixed:
            epsilon = epsilon_fixed
        else:
            epsilon = max(EPSILON_FINAL,
                          EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
        # play one step
        t_step_0 = time.time()
        reward = agent.play_step(net, epsilon, device)
        if fps:
            while time.time() - t_step_0 < 1 / fps:
                time.sleep(0.01)

        if reward is not None:
            # add reward to total and calculate mean
            total_rewards.append(reward)
            mean_reward = np.mean(total_rewards[-100:])

            # meter speed
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()

            # print and write information
            print(
                "%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s"
                % (frame_idx, len(total_rewards), float(mean_reward), epsilon,
                   speed))
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", mean_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            if best_mean_reward is None or best_mean_reward < mean_reward:
                torch.save(net.state_dict(), env_name + "-best.dat")
                if best_mean_reward is not None:
                    print(
                        "Best mean reward updated %.3f -> %.3f, model saved" %
                        (best_mean_reward, float(mean_reward)))
                best_mean_reward = float(mean_reward)
            if mean_reward > reward_stop:
                print("Solved in {0} frames!".format(frame_idx))
                break

        if len(buffer) < REPLAY_START_SIZE or no_learn:
            continue

        # sync target net with training net
        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        loss_t = calc_loss(batch, net, tgt_net, device=device)
        loss_t.backward()
        optimizer.step()

    writer.close()
Example #9
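A playback fragment: it parses model and environment arguments, picks CPU or CUDA from args.mode, loads trained weights into a VanillaDQN, and evaluates greedily under torch.no_grad().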
                        "--model",
                        required=False,
                        help="Model file to load")
    parser.add_argument("-e",
                        "--env",
                        default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" +
                        DEFAULT_ENV_NAME)
    args = parser.parse_args()

    env_name = DEFAULT_ENV_NAME if args.env is None else args.env

    device = torch.device("cpu" if (args.mode is None) or (
        args.mode == "c") else "cuda")

    env = wrappers.make_env(env_name, lives=True, fire=True)
    print("{} environment".format(env_name))
    print(env.unwrapped.get_action_meanings())

    model = DEFAULT_MODEL_NAME if (args.model is None) else args.model
    net = VanillaDQN(env.observation_space.shape,
                     env.action_space.n).to(device)
    net.load_state_dict(torch.load(model, map_location=torch.device(device)))
    total_reward = 0.0
    total_steps = 0
    obs = env.reset()

    net.eval()
    with torch.no_grad():
        while True:
            start_ts = time.time()
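Example #10
A Pong playback script: it optionally records video with gym.wrappers.Monitor, loads a saved DQN, renders the game if requested, and picks the greedy action from the network's Q-values.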
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
FPS = 25


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, help="Model file to load")
    parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" + DEFAULT_ENV_NAME)
    parser.add_argument("-r", "--record", help="Directory to store video recording")
    parser.add_argument("--no-visualize", default=True, action='store_false', dest='visualize',
                        help="Disable visualization of the game play")
    args = parser.parse_args()

    env = wrappers.make_env(args.env)
    if args.record:
        env = gym.wrappers.Monitor(env, args.record)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage))

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()

    while True:
        start_ts = time.time()
        if args.visualize:
            env.render()
        state_v = torch.tensor(np.array([state], copy=False))
        q_vals = net(state_v).data.numpy()[0]
Example #11
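A one-line factory that builds the default environment through the project's wrappers.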
def make_env():
    return wrappers.make_env(DEFAULT_ENV_NAME)
Example #12
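A ptan-based setup: an epsilon-greedy action selector with an epsilon tracker, a first-last experience source with configurable n-step unrolling, an experience replay buffer, and a RewardTracker driving the training loop.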

if __name__ == "__main__":
    params = common.HYPERPARAMS['pong']
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=True, action="store_true", help="Enable cuda")
    parser.add_argument("--n", default=1,  help="how many steps to unroll from the bellman equation")
    parser.add_argument("--double", default=False, action="store_true", help="Enable double DQN")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")
    unrolling_steps = int(args.n) if args.n else 1
    print('Bellman unrolling steps: ' + str(unrolling_steps))
    double = args.double if args.double else False
    #env = gym.make(params['env_name'])
    #ptan.common.wrappers.wrap_dqn(env)
    env = wrappers.make_env(params['env_name'])
    writer = SummaryWriter(comment="-" + params['run_name'] + "-basic")
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = ptan.agent.TargetNet(net)
    selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params['epsilon_start'])
    epsilon_tracker = common.EpsilonTracker(selector, params)
    agent = ptan.agent.DQNAgent(net, selector, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params['gamma'], steps_count=unrolling_steps)
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params['replay_size'])
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
    frame_idx = 0
    eval_states = None
    with common.RewardTracker(writer, params['stop_reward']) as reward_tracker:
        while True:
            frame_idx += 1
            buffer.populate(1)  # where all the magic happens!
Example #13
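A training setup that adds a StepLR learning-rate scheduler, reward/loss clipping flags, and extra bookkeeping for losses and episode statistics on top of the usual environment, networks, buffer, and agent.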
                        default=is_cuda,
                        action="store_true",
                        help="Enable cuda")
    parser.add_argument("--env",
                        default=DEFAULT_ENV_NAME,
                        help="Name of the environment, default=" +
                        DEFAULT_ENV_NAME)
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")
    make_dir(out_dir)  # save models
    make_dir(data_dir)  # save data
    print('device:', device, 'BETA: ', BETA, 'NAME: ', NAME)
    print('clip_reward:', IS_CLIP_REWARD, 'clip_loss:', IS_CLIP_LOSS,
          'is_huber:', is_huber)

    env = wrappers.make_env(args.env, SKIP_FRAME, STACK_FRAME)
    net = dqn_model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)
    tgt_net = dqn_model.DQN(env.observation_space.shape,
                            env.action_space.n).to(device)
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)

    epsilon = EPSILON_START
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    scheduler = StepLR(optimizer, step_size=10000, gamma=LR_GAMMA)
    optimizer_tgt = optim.Adam(tgt_net.parameters(), lr=LEARNING_RATE)
    total_rewards, mean_return, train_times, total_losses = [], [], [], []
    frame_idx, ts_frame, episode, ep_time, ep_loss, best_mean_reward = 0, 0, 0, 0, 0, -21
    ts = time.time()
Example #14
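Argument handling for an evaluation/training script: it selects the device from args.mode, builds the environment, optionally overrides the reward bound, and instantiates the vanilla network and its target copy.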
    parser.add_argument("-r",
                        "--reward",
                        help="set the reward bound for specific env:",
                        type=int)

    args = parser.parse_args()

    # ------------------------------
    # using the args

    # Make our device the GPU
    device = torch.device("cpu" if (args.mode is None) or (
        args.mode == "c") else "cuda")

    # Create the environment with wrappers
    env = wrappers.make_env(DEFAULT_ENV_NAME, lives=False, fire=True)
    print(env.unwrapped.get_action_meanings())
    if args.reward is not None:
        REWARD_BOUND = args.reward

    # Initialise the neural network, which will try to learn the Q values
    # Initialise the target network, which provides a copy of the network weights
    # from previous training iterations
    if args.model == 0 or args.model is None:
        print("Using vanilla network")
        net = nnmodels.VanillaDQN(env.observation_space.shape,
                                  env.action_space.n).to(device)
        tgtNet = nnmodels.VanillaDQN(env.observation_space.shape,
                                     env.action_space.n).to(device)
    # -------------------------
Example #15
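A two-player evaluation script: it loads two trained models, creates the multi_side_ai scenario, and plays them against each other with a small fixed epsilon.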
    state, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        return True
    return False


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m1", "--model1", required=True, help="Model player 1 file to load")
    parser.add_argument("-m2", "--model2", required=True, help="Model player 2 file to load")
    parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" + DEFAULT_ENV_NAME)
    args = parser.parse_args()

    env = wrappers.make_env(args.env, gui=True, scenario="multi_side_ai")
    net1 = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net2 = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net1.load_state_dict(torch.load(args.model1, map_location=lambda storage, loc: storage))
    net2.load_state_dict(torch.load(args.model2, map_location=lambda storage, loc: storage))

    state1 = env.reset()
    state2 = state1
    total_reward1 = 0.0
    total_reward2 = 0.0
    counter1 = collections.Counter()
    counter2 = collections.Counter()
    
    epsilon = 0.2
    frame = 0
    while True:
Example #16
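A playback script for the side1_pass scenario: it loads a trained model and repeatedly picks the greedy action, counting how often each action is chosen.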
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-m",
                        "--model",
                        required=True,
                        help="Model file to load")
    parser.add_argument("-e",
                        "--env",
                        default=DEFAULT_ENV_NAME,
                        help="Environment name to use, default=" +
                        DEFAULT_ENV_NAME)
    args = parser.parse_args()

    env = wrappers.make_env(args.env,
                            gui=True,
                            scenario="side1_pass",
                            variations=4)
    net = dqn_model.DQN(env.observation_space.shape, env.action_space.n)
    net.load_state_dict(
        torch.load(args.model, map_location=lambda storage, loc: storage))

    state = env.reset()
    total_reward = 0.0
    c = collections.Counter()

    while True:
        start_ts = time.time()
        state_v = torch.tensor(np.array([state], copy=False))
        q_vals = net(state_v).data.numpy()[0]
        action = np.argmax(q_vals)
        c[action] += 1