Example #1
    def __init__(self, env, gamma, tau, buffer_maxlen, value_lr, q_lr, policy_lr):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_range = [env.action_space.low, env.action_space.high]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNet(self.state_dim).to(device)
        self.target_value_net = ValueNet(self.state_dim).to(device)
        self.q1_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.q2_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.policy_net = PolicyNet(self.state_dim, self.action_dim).to(device)

        # Load the target value network parameters
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # Initialize the optimizer
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q2_net.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # Initialize the buffer
        self.buffer = ReplayBeffer(buffer_maxlen)
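
The constructor above relies on a ReplayBeffer class that is not shown in this listing. Below is a minimal sketch of a buffer with the interface these examples use (push transitions, then sample(batch_size) as in Example #29); the names and details are assumptions, not the original implementation.

import random
from collections import deque

class ReplayBufferSketch:
    def __init__(self, maxlen):
        # Bounded FIFO storage for transitions
        self.buffer = deque(maxlen=maxlen)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a minibatch and unzip it into separate lists
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(list, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)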
Example #2
    def __init__(self, args):
        self.args = args
        self.pool = multiprocessing.Pool(args.worker)
        env = gym.make(args.env_name)
        o_dim = env.observation_space.shape[0]
        s_dim = 128
        a_dim = env.action_space.n

        self.population_status = []
        for _ in range(args.population):
            individual_status = {}
            name = ''.join(
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8))
            individual_status['name'] = name
            env_name = self.args.env_name
            individual_status['env_name'] = env_name
            policy_net = PolicyNet(o_dim, s_dim, a_dim)
            policy_net.share_memory()
            individual_status['policy_net'] = policy_net
            evaluate_net = EvaluateNet(o_dim, s_dim, a_dim)
            evaluate_net.share_memory()
            individual_status['evolution_net'] = evaluate_net
            steps = self.args.step_per_generation
            individual_status['steps'] = steps
            self.population_status.append(individual_status)
Example #3
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.model = PolicyNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=self.config.learning_rate)
Example #4
 def __init__(self, env, lr, gamma, save_path, load=False):
     self.mean = []
     self.std = []
     self.env = env
     s_len = self.env.observation_space_shape[0]
     a_len = self.env.action_space_n
     self.model = PolicyNet(lr, s_len, a_len)
     self.gamma = gamma
     self.save_path = save_path
     if load:
         self.model.load(self.save_path)
     return
Example #5
 def __init__(self, model_file=None):
     self.model_dir = "models/"
     if not model_file:
         model_file = self.find_latest_model("model")
     if model_file == "init":
         self.net = PolicyNet()
     else:
         self.net = PolicyNet(model_file)
     print("using model from", model_file)
     self.data = []
     self.data_buffer = [[], []]
     self.old_net = None
     self.best_net = self.net
Example #6
    def __init__(self, config):
        self.config = config

        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
Example #7
    def __init__(self, config):
        self.config = config

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.actor_lr)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.critic_lr)
Example #8
class A3CLocal:
    def __init__(self, config):
        self.config = config

        # Replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)

    # Select an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Add a transition to the replay memory
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Retrieve and clear the replay memory
    def get_replay(self):
        # Arrange the history into array form
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        return states, actions, rewards, next_states

    # Copy the global network's weights to the local network
    def update_local_model(self, actor_dict, critic_dict):
        self.actor.load_state_dict(actor_dict)
        self.critic.load_state_dict(critic_dict)

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
Example #9
def load_checkpoint(file_dir, i_epoch, layer_sizes, input_size, device='cuda'):
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch),
                            map_location=device)

    policy_net = PolicyNet(layer_sizes).to(device)
    value_net = ValueNet(input_size).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()
    value_net.load_state_dict(checkpoint["value_net"])
    value_net.train()

    policy_lr = checkpoint["policy_lr"]
    valuenet_lr = checkpoint["valuenet_lr"]

    policynet_optim = optim.Adam(policy_net.parameters(), lr=policy_lr)
    policynet_optim.load_state_dict(checkpoint["policynet_optim"])
    valuenet_optim = optim.Adam(value_net.parameters(), lr=valuenet_lr)
    valuenet_optim.load_state_dict(checkpoint["valuenet_optim"])

    checkpoint.pop("policy_net")
    checkpoint.pop("value_net")
    checkpoint.pop("policynet_optim")
    checkpoint.pop("valuenet_optim")
    checkpoint.pop("i_epoch")
    checkpoint.pop("policy_lr")
    checkpoint.pop("valuenet_lr")

    return policy_net, value_net, policynet_optim, valuenet_optim, checkpoint
Example #10
def load_checkpoint(file_dir, i_epoch, layer_sizes, device='cuda'):
    checkpoint = torch.load(os.path.join(file_dir, "ckpt_eps%d.pt" % i_epoch))

    policy_net = PolicyNet(layer_sizes).to(device)
    policy_net.load_state_dict(checkpoint["policy_net"])
    policy_net.train()

    learning_rate = checkpoint["learning_rate"]

    optimizer = optim.Adam(policy_net.parameters())
    # optimizer = optim.SGD(policy_net.parameters(), lr=learning_rate)
    optimizer.load_state_dict(checkpoint["optimizer"])

    checkpoint.pop("policy_net")
    checkpoint.pop("optimizer")
    checkpoint.pop("i_epoch")
    checkpoint.pop("learning_rate")

    return policy_net, optimizer, checkpoint
Example #11
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    save_dir       = "./save"
    device         = "cuda:0" if torch.cuda.is_available() else "cpu"

    #Create environment
    #----------------------------
    env = gym.make(args.env)

    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    print(policy_net)

    #Load model
    #----------------------------
    model_path = os.path.join(save_dir, "{}.pt".format(args.env))

    if os.path.exists(model_path):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(model_path)
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        print("Error: No model saved")
        sys.exit(1)

    #Start playing
    #----------------------------
    policy_net.eval()

    with torch.no_grad():
        for it in range(10):
            ob = env.reset()
            total_reward = 0
            length = 0

            while True:
                env.render()
                ob_tensor = torch.tensor(np.expand_dims(ob, axis=0), dtype=torch.float32, device=device)
                action = policy_net.action_step(ob_tensor, deterministic=True).cpu().numpy()
                ob, reward, done, info = env.step(action[0])
                total_reward += reward
                length += 1

                if done:
                    print("Total reward = {:.6f}, length = {:d}".format(total_reward, length))
                    break

    env.close()
Example #12
class Reinforce(object):
    # Implementation of the policy gradient method REINFORCE.

    def __init__(self, env, lr, gamma, save_path, load=False):
        self.mean = []
        self.std = []
        self.env = env
        s_len = self.env.observation_space_shape[0]
        a_len = self.env.action_space_n
        self.model = PolicyNet(lr, s_len, a_len)
        self.gamma = gamma
        self.save_path = save_path
        if load:
            self.model.load(self.save_path)
        return

    def train(self):
        # Trains the model on a single episode using REINFORCE.
        # Uses the generate_episode() method to generate training data.
        K = 100
        print("pretrain test:")
        self.test()
        print("training")
        # generate an episode
        for i in range(10000000):
            s, ava, a, r = self.generate_episode()
            s = np.array(s)
            r = np.array(r)
            r /= 100.0
            T = len(r)
            G = np.zeros(shape=(T, ), dtype=np.float32)
            G[T - 1] = r[T - 1]
            for t in range(T - 2, -1, -1):
                G[t] = self.gamma * G[t + 1] + r[t]
            for j in range(6):
                self.model.fit(s, ava, a, G)
            if (i + 1) % K == 0:
                mean, std = self.test()
                self.mean.append(mean)
                self.std.append(std)
                with open('mean_np_array.pickle', 'wb+') as f:
                    pickle.dump(self.mean, f)
                with open('std_np_array.pickle', 'wb+') as f:
                    pickle.dump(self.std, f)
                self.model.save(self.save_path)
        self.model.save(self.save_path)
        return

    def generate_episode_fast(self):
        s = self.env.reset()
        done = False
        states = []
        avas = []
        rewards = []
        predict_fn = self.model.predict
        step_fn = self.env.step
        s_append_fn = states.append
        r_append_fn = rewards.append
        while not done:
            ava = self.env.ava()
            a = predict_fn(s.reshape([1, -1]), ava.reshape([1, -1]))
            s_append_fn(s)
            avas.append(ava)
            s, r, done, _ = step_fn(a)
            r_append_fn(r)
        return states, avas, rewards

    def generate_episode(self):
        s = self.env.reset()
        done = False
        states = []
        rewards = []
        actions = []
        avas = []
        predict_fn = self.model.predict
        step_fn = self.env.step
        s_append_fn = states.append
        r_append_fn = rewards.append
        a_append_fn = actions.append
        while not done:
            ava = self.env.ava()

            a = predict_fn(s.reshape([1, -1]), ava.reshape([1, -1]))
            if a >= 1352:
                p, p_norm = self.model.sess.run(
                    [self.model.p, self.model.p_norm], {
                        self.model.s: s.reshape([1, -1]),
                        self.model.ava: ava.reshape([1, -1])
                    })
                print(a)
                print(ava)
                print(sum(ava))
            a_append_fn(a)
            s_append_fn(s)
            avas.append(ava)
            s, r, done, _ = step_fn(a)
            r_append_fn(r)
        return states, avas, actions, rewards

    def generate_episode_render(self):
        s = self.env.reset()
        done = False
        states = []
        avas = []
        rewards = []
        predict_fn = self.model.predict
        step_fn = self.env.step
        s_append_fn = states.append
        r_append_fn = rewards.append
        while not done:
            ava = self.env.ava()
            a = predict_fn(s.reshape([1, -1]), ava.reshape([1, -1]))
            s_append_fn(s)
            avas.append(ava)
            s, r, done, _ = step_fn(a)
            r_append_fn(r)
        self.env.render()
        return states, avas, rewards

    def test(self):
        r = []
        for i in range(100):
            _, _, ri = self.generate_episode_fast()
            ri = sum(ri)
            r.append(ri)

        std = np.std(r)
        mean = np.mean(r)
        print('r =', mean, "+-", std)
        return mean, std

    def test_render(self):
        r = []
        for i in range(100):
            _, _, ri = self.generate_episode_render()
            ri = sum(ri)
            r.append(ri)
        steps = len(r)
        std = np.std(r)
        mean = np.mean(r)
        print('r =', mean, "+-", std, "steps", steps)
        return mean, std
Example #13
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_real, a_real = pkl.load(open(expert_path, "rb"))
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim),
                                         dtype=np.float32)

                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1

                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                dis_net,
                a_dim,
                beta,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device,
                conti=args.conti)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net, dis_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values,
            mb_advs, mb_returns, mb_old_a_logps, sa_real)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance(
            )
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps        = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time     = {:.2f} sec".format(n_sec))
            print("FPS              = {:d}".format(fps))
            print("actor loss       = {:.6f}".format(pg_loss))
            print("critic loss      = {:.6f}".format(v_loss))
            print("dis loss         = {:.6f}".format(dis_loss))
            print("entropy          = {:.6f}".format(ent))
            print("avg_kl           = {:.6f}".format(avg_kl))
            print("beta             = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return      = {:.6f}".format(mean_return))
            print("mean length      = {:.2f}".format(mean_len))
            print("dis_real         = {:.3f}".format(dis_real))
            print("dis_fake         = {:.3f}".format(dis_fake))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "beta": agent.beta,
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict(),
                    "DiscriminatorNet": dis_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
Example #14
from model import PolicyNet, ValueNet
import torch
from torch import optim, distributions
import torch.nn.functional as F

env = gym.make("CartPole-v1")
# observation = env.reset()
# print(observation)
# print(env.observation_space)

MAXSTEP = 100
BATCHSIZE = 16
EPOCH = 1000
GAMMA = 0.99

policy_net = PolicyNet()
value_net = ValueNet()

policy_net.cuda()
value_net.cuda()
opt1 = optim.Adam(policy_net.parameters(), lr=1e-3)
opt2 = optim.Adam(value_net.parameters(), lr=1e-3)


# train one epoch
def train_step():

    observ_batch = []
    reward_batch = []
    action_batch = []
    mask_batch = []
Example #15
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    env_id = args.env
    mb_size = 256
    lr = 1e-5
    n_iter = 100000
    disp_step = 1000
    save_step = 10000
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create environment
    #----------------------------
    env = gym.make(env_id)

    if args.conti:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_traj, a_traj = pkl.load(open(expert_path, "rb"))
        s_traj = np.concatenate(s_traj, 0)
        a_traj = np.concatenate(a_traj, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    opt = torch.optim.Adam(policy_net.parameters(), lr)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()

    for it in range(start_it, n_iter + 1):
        #Train
        mb_obs, mb_actions = sample_batch(s_traj, a_traj, mb_size)
        mb_a_logps, mb_ents = policy_net.evaluate(
            torch.from_numpy(mb_obs).to(device),
            torch.from_numpy(mb_actions).to(device))
        loss = -mb_a_logps.mean()

        opt.zero_grad()
        loss.backward()
        opt.step()

        #Print the result
        if it % disp_step == 0:
            print("[{:5d} / {:5d}] Elapsed time = {:.2f}, actor loss = {:.6f}".
                  format(it, n_iter,
                         time.time() - t_start, loss.item()))

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save({
                "it": it,
                "PolicyNet": policy_net.state_dict()
            }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
Example #16
class A3CGlobal:
    def __init__(self, config):
        self.config = config

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.actor_lr)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.critic_lr)

    # Compute the returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards),
                              dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at each timestep
    def train_model(self, states, actions, rewards, next_states, done):
        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states,
                                   dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)

        # Compute the returns
        returns = self.get_returns(rewards, done, next_values[-1])

        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss

    # Function that updates the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()

    # Function that updates the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
Example #17
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="BipedalWalker-v3")
    parser.add_argument("--discrete", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    clip_val = 0.2
    sample_mb_size = 64
    sample_n_epoch = 4
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.01
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 10000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    #Create environment
    #----------------------------
    env = gym.make(args.env)

    if args.discrete:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.n
    else:
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]

    if args.unwrap:
        env = env.unwrapped

    runner = EnvRunner(s_dim,
                       a_dim,
                       gamma,
                       lamb,
                       max_step=2048,
                       device=device,
                       conti=not args.discrete)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=not args.discrete).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()
    mean_total_reward = 0
    mean_length = 0

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns, mb_rewards = runner.run(
                env, policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs,
                                           mb_actions, mb_values, mb_advs,
                                           mb_returns, mb_old_a_logps)
        mean_total_reward += mb_rewards.sum()
        mean_length += len(mb_obs)
        print("[Episode {:4d}] total reward = {:.6f}, length = {:d}".format(
            it, mb_rewards.sum(), len(mb_obs)))

        #Print the result
        if it % disp_step == 0:
            print("\n[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Elapsed time = {:.2f} sec".format(time.time() - t_start))
            print("actor loss   = {:.6f}".format(pg_loss))
            print("critic loss  = {:.6f}".format(v_loss))
            print("entropy      = {:.6f}".format(ent))
            print("mean return  = {:.6f}".format(mean_total_reward /
                                                 disp_step))
            print("mean length  = {:.2f}".format(mean_length / disp_step))
            print()

            agent.lr_decay(it, n_iter)
            mean_total_reward = 0
            mean_length = 0

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
Example #18

if __name__ == "__main__":
    writer = SummaryWriter("./log")

    env = gym.make("Pong-v0")
    MAXSTEP = 6
    NWORKERS = 4
    EPOCHSTEP = 4000 * 1024 // (MAXSTEP * BATCHSIZE * NWORKERS)  # roughly 4000 updates per epoch, as in the A3C paper
    print("1 epoch contains {} steps".format(EPOCHSTEP))
    NEPOCH = 100 * EPOCHSTEP
    GAMMA = 0.99
    NFRAMES = 4

    policy_net = PolicyNet(NFRAMES)
    policy_net.cuda()
    policy_net.share_memory()  # make it store in shared memory
    opt = optim.RMSprop(policy_net.parameters(), lr=5e-4, alpha=0.99, eps=1e-5)

    samplers = [
        EnvSampler(env, policy_net, NFRAMES, MAXSTEP, GAMMA)
        for _ in range(NWORKERS)
    ]
    global_step = 0

    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    event = ctx.Event()

    workers = []
Example #19
def main():
	#Parse arguments
	#----------------------------
	parser = argparse.ArgumentParser()
	parser.add_argument("--env", default="CartPole-v0")
	parser.add_argument("--conti", action="store_true")
	parser.add_argument("--unwrap", action="store_true")
	args = parser.parse_args()

	#Parameters
	#----------------------------
	env_id   = args.env
	save_dir = "./save"
	device   = "cuda:0"

	#Create environment
	#----------------------------
	env = gym.make(env_id)
	
	if args.conti:
		s_dim = env.observation_space.shape[0]
		a_dim = env.action_space.shape[0]
	else:
		s_dim = env.observation_space.shape[0]
		a_dim = env.action_space.n
	
	if args.unwrap:
		env = env.unwrapped

	#Create model
	#----------------------------
	policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)

	#Load model
	#----------------------------
	if os.path.exists(os.path.join(save_dir, "{}.pt".format(env_id))):
		print("Loading the model ... ", end="")
		checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(env_id)))
		policy_net.load_state_dict(checkpoint["PolicyNet"])
		print("Done.")
	else:
		print("Error: No model saved")

	#Start playing
	#----------------------------
	policy_net.eval()

	for it in range(100):
		ob  = env.reset()
		ret = 0

		while True:
			env.render()
			action = policy_net.action_step(torch.from_numpy(np.expand_dims(ob.__array__(), axis=0)).float().to(device), deterministic=True)
			ob, reward, done, info = env.step(action.cpu().detach().numpy()[0])
			ret += reward

			if done:
				print("return = {:.4f}".format(ret))
				break

	env.close()
Example #20
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs,
                                           mb_actions, mb_values, mb_advs,
                                           mb_returns, mb_old_a_logps)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps    = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS          = {:d}".format(fps))
            print("actor loss   = {:.6f}".format(pg_loss))
            print("critic loss  = {:.6f}".format(v_loss))
            print("entropy      = {:.6f}".format(ent))
            print("mean return  = {:.6f}".format(mean_return))
            print("mean length  = {:.2f}".format(mean_len))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
Example #21
class A2CAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.actor = PolicyNet(self.config.n_state, self.config.n_action)
        self.actor.to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.actor_lr)

        # Create the value network
        self.critic = ValueNet(self.config.n_state, 1)
        self.critic.to(device)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.critic_lr)

    # Select an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.actor(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute the returns
    def get_returns(self, rewards, done, next_value):
        returns = torch.zeros(len(rewards), dtype=torch.float).to(self.config.device)
        R = 0 if done else next_value
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        return returns

    # Update the policy and value networks at each timestep
    def train_model(self, done):
        # Arrange the history into array form
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions, dtype=torch.float).to(self.config.device)
        next_states = torch.tensor(next_states, dtype=torch.float).to(self.config.device)

        next_values = self.critic(next_states).view(-1)

        # Compute the returns
        returns = self.get_returns(rewards, done, next_values[-1])

        values = self.critic(states).view(-1)

        # Train the value network
        critic_loss = self.train_critic(values, returns)
        # Train the policy network
        actor_loss = self.train_actor(states, actions, returns - values)

        return actor_loss, critic_loss
    
    # Function that updates the policy network
    def train_actor(self, states, actions, advantages):
        policy = self.actor(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * advantages.detach()
        actor_loss = -torch.mean(cross_entropy)

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item()
    
    # Function that updates the value network
    def train_critic(self, values, targets):
        critic_loss = torch.mean(torch.pow(targets - values, 2))

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        return critic_loss.item()

    # Save the model weights to a file
    def save(self):
        torch.save(self.actor.state_dict(), self.config.save_file + ".actor")
        torch.save(self.critic.state_dict(), self.config.save_file + ".critic")
    
    # Load the model weights from a file
    def load(self):
        self.actor.load_state_dict(torch.load(self.config.save_file + ".actor"))
        self.critic.load_state_dict(torch.load(self.config.save_file + ".critic"))
    
    # Release GPU memory
    def close(self):
        del self.actor
        del self.critic
Example #22
input_size = 4
hidden_size = 128
num_layers = 2
dropout = 0.85
seq_len = 1024

state_size = 3
num_actions = 3
act_lim = 1

batch_size = 32

######################################################

encoder = Encoder(input_size, batch_size, hidden_size, num_layers, dropout)
policy_net = PolicyNet(state_size, num_actions, act_lim, batch_size,
                       hidden_size)

print("Encoder network: ", encoder)
print("Policy network: ", policy_net)
print()

# Test encoder
test_input = torch.randn((batch_size, seq_len, input_size))
print("test_input shape: ", test_input.shape)

encoding = encoder(test_input)
print("encoding shape: ", encoding.shape)

# Test Policy Net
# One step forward propagation
state = torch.randn((batch_size, state_size))
Example #23
# Turn on pyplot's interactive mode
# VERY IMPORTANT because otherwise the training stats plot will halt
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes, action_lim).to(device)  # Policy network
value_net = ValueNet(input_size).to(device)  # Value network

# Set up memory
memory = Memory(capacity, device)

# Set up optimizer
policynet_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)
valuenet_optimizer = optim.Adam(value_net.parameters(), lr=valuenet_lr)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {
    "epoch mean durations": [],
Example #24
def main():
	#Parse arguments
	#----------------------------
	parser = argparse.ArgumentParser()
	parser.add_argument("--env", default="CartPole-v0")
	parser.add_argument("--conti", action="store_true")
	parser.add_argument("--render", action="store_true")
	parser.add_argument("--unwrap", action="store_true")
	parser.add_argument("--episode", default=1000)
	args = parser.parse_args()

	#Parameters
	#----------------------------
	env_id    = args.env
	save_dir  = "./save"
	device    = "cuda:0"
	n_episode = args.episode

	#Create environment
	#----------------------------
	env = gym.make(env_id)

	if args.conti:
		s_dim = env.observation_space.shape[0]
		a_dim = env.action_space.shape[0]
	else:
		s_dim = env.observation_space.shape[0]
		a_dim = env.action_space.n

	if args.unwrap:
		env = env.unwrapped

	#Create model
	#----------------------------
	policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)

	#Load model
	#----------------------------
	if os.path.exists(os.path.join(save_dir, "{}.pt".format(env_id))):
		print("Loading the model ... ", end="")
		checkpoint = torch.load(os.path.join(save_dir, "{}.pt".format(env_id)))
		policy_net.load_state_dict(checkpoint["PolicyNet"])
		print("Done.")
	else:
		print("Error: No model saved")

	#Start playing
	#----------------------------
	policy_net.eval()
	s_traj = []
	a_traj = []

	for i_episode in range(n_episode):
		ob  = env.reset()
		ret = 0
		s_traj.append([])
		a_traj.append([])

		while True:
			if args.render:
				env.render()

			action = policy_net.action_step(torch.FloatTensor(np.expand_dims(ob, axis=0)).to(device), deterministic=True)
			action = action.cpu().detach().numpy()[0]

			s_traj[i_episode].append(ob)
			a_traj[i_episode].append(action)

			ob, reward, done, info = env.step(action)
			ret += reward

			if done:
				s_traj[i_episode] = np.array(s_traj[i_episode], dtype=np.float32)

				if args.conti:
					a_traj[i_episode] = np.array(a_traj[i_episode], dtype=np.float32)
				else:
					a_traj[i_episode] = np.array(a_traj[i_episode], dtype=np.int32)

				print("{:d}: return = {:.4f}, len = {:d}".format(i_episode, ret, len(s_traj[i_episode])))
				break

	#s_traj: (n_episode, timesteps, s_dim)
	#a_traj: (n_episode, timesteps, a_dim) or (n_episode, timesteps)
	print("Saving the trajectories ... ", end="")
	pkl.dump((s_traj, a_traj), open(os.path.join(save_dir, "{}_traj.pkl".format(env_id)), "wb"))
	print("Done.")
	env.close()
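
The pickle written above is the expert-trajectory file that Examples #13 and #15 read back with pkl.load. A minimal sketch of loading it, assuming the default CartPole-v0 environment and the ./save directory used in this script:

import os
import pickle as pkl

# Path mirrors the dump call above; adjust the name if a different --env was used
traj_path = os.path.join("./save", "CartPole-v0_traj.pkl")
with open(traj_path, "rb") as f:
    s_traj, a_traj = pkl.load(f)
print(len(s_traj), "episodes,", s_traj[0].shape, "states in the first episode")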
Example #25
def main():
    env = FrozenSeaEnv(H, W, P, 'uniform', SEED)
    net = PolicyNet(env.get_obs_size(), env.get_n_actions())
    print(net)
    test_play(env, net, 10)
Example #26
class Trainer:
    actions = {
        'j': 0,
        't': 1,
        'by': 2,
        'b': 3,
        'bt': 4,
        'fs': 5,
        'fm': 6,
        'fg': 7,
        'fk': 8,
        's': 9,
        'gs': 10,
        'mj': 11,
        'k': 12,
        'xd': 13
    }  # maps action names to their indices

    def __init__(self, model_file=None):
        self.model_dir = "models/"
        if not model_file:
            model_file = self.find_latest_model("model")
        if model_file == "init":
            self.net = PolicyNet()
        else:
            self.net = PolicyNet(model_file)
        print("using model from", model_file)
        self.data = []
        self.data_buffer = [[], []]
        self.old_net = None
        self.best_net = self.net

    def find_latest_model(self, prefix):
        biggest_timestamp = 0
        latest_model = ""
        for filename in os.listdir(self.model_dir):
            pat = prefix + r"(.+)\.h5$"
            match = re.match(pat, filename)
            if match:
                timestamp = int(match.group(1))
                if timestamp > biggest_timestamp:
                    biggest_timestamp = timestamp
                    latest_model = filename
        return self.model_dir + latest_model if latest_model else None

    @staticmethod
    def one_hot(action):
        p = Player(0)
        coded_action = [0 for i in range(14)]
        coded_action[list(p.all_actions).index(action)] = 1
        return coded_action

    @staticmethod
    def abstract_state(game: Game, player: int):
        """
        jue(10) ta(5) bingYing(bool) bing(10) baoTou(5) defended_rush(5)
        then last three steps of each using one-hot coding
        following the order of "my resources, his resources, my steps, his steps"
        """
        state = []
        for i in [player, 1 - player]:  # self first, then the opponent
            p = game.players[i]
            state += [
                p.jue / 10, p.tower / 5,
                int(p.camp), p.soldier / 10, p.baotou / 5, p.defended_rush / 5
            ]
        for i in [player, 1 - player]:
            his = game.histories[i]
            processed_his = [his[0]] * 2 + his
            for action in processed_his[-3:]:
                state += Trainer.one_hot(action)
        return state

    def self_play(self):
        game = Game(2)
        game.do(0, "j")
        game.do(1, "j")
        game.settle()
        cache = [[[], []], [[], []]]
        while True:
            for i in range(2):
                # p0, p1 = game.players[0], game.players[1]
                act, state = self.choose_action(game, i, self.net)
                game.do(i, act, [1 - i])
                cache[i][0].append(state)
                cache[i][1].append(self.one_hot(act))
            game.settle()
            if len(game.players) == 1:
                winner = list(game.players.keys())[0]
                break
        self.data += cache[winner]
        self.data_buffer[0] += cache[winner][0]
        self.data_buffer[1] += cache[winner][1]

    def progress(self, data_size=512):
        print("progress")
        while len(self.data_buffer[0]) < data_size:
            self.self_play()
        self.old_net = self.net.copy()
        self.net.train_step(*self.data_buffer)
        self.data_buffer = [[], []]
        self.evaluate(self.old_net, self.net, 1, True)
        return self.evaluate(self.old_net, self.net, 20)

    def evaluate(self, old_net, new_net, count=100, verbose=0):
        winners = [0, 0]
        for t in range(count):
            game = Game(2)
            game.do(0, "j")
            game.do(1, "j")
            game.settle()
            while True:
                for i in range(2):
                    # p0, p1 = game.players[0], game.players[1]
                    net = [old_net, new_net][i]
                    act, _ = self.choose_action(game, i, net)
                    game.do(i, act, [1 - i])
                    if verbose:
                        print(i, act)
                game.settle()
                if verbose:
                    print(game.info())
                if len(game.players) == 1:
                    winner = list(game.players.keys())[0]
                    winners[winner] += 1
                    break
        if sum(winners) != count:
            raise ValueError
        return winners[1] / sum(winners), winners

    @staticmethod
    def choose_action(game, i, net):
        p = game.players[i]
        state = Trainer.abstract_state(game, i)
        actions = net.predict(state, noise=0.25)
        # choose max
        max_val = 0
        act = None
        available = p.aActions()
        for action in available:
            val = actions[Trainer.actions[action]]
            if val > max_val:
                max_val = val
                act = action
        return act, state

    def main(self):
        count = 0
        while True:
            result = self.progress(2048)
            print(result)
            count += 1
            if count % 5 == 0:
                self.net.save_model()
Example #27
class PGAgent:
    def __init__(self, config):
        self.config = config

        # replay memory
        self.replay_memory = deque(maxlen=self.config.n_replay_memory)

        # Create the policy network
        self.model = PolicyNet(self.config.n_state, self.config.n_action)
        self.model.to(device)
        self.model_optimizer = torch.optim.Adam(self.model.parameters(),
                                                lr=self.config.learning_rate)

    # Select an action stochastically from the policy network's output
    def get_action(self, state):
        state = torch.tensor(state, dtype=torch.float).to(device)
        policy = self.model(state)
        policy = policy.detach().cpu().numpy()[0]
        return np.random.choice(self.config.n_action, 1, p=policy)[0]

    # Append to the history
    def append_replay(self, state, action, reward, next_state):
        act = np.zeros(self.config.n_action)
        act[action] = 1
        self.replay_memory.append((state, act, reward, next_state))

    # Compute the returns
    def get_returns(self, rewards):
        returns = torch.zeros(len(rewards),
                              dtype=torch.float).to(self.config.device)
        R = 0
        for i in reversed(range(0, len(rewards))):
            R = rewards[i] + self.config.discount_factor * R
            returns[i] = R
        if 1 < len(returns):
            returns -= torch.mean(returns)
            returns /= (torch.std(returns) + 1.e-7)
        return returns

    # Update the policy network from the collected history
    def train_model(self):
        # Arrange the history into array form
        replay_memory = np.array(self.replay_memory)
        self.replay_memory.clear()
        states = np.vstack(replay_memory[:, 0])
        actions = list(replay_memory[:, 1])
        rewards = list(replay_memory[:, 2])
        next_states = list(replay_memory[:, 3])

        states = torch.tensor(states, dtype=torch.float).to(self.config.device)
        actions = torch.tensor(actions,
                               dtype=torch.float).to(self.config.device)

        # Compute the returns
        returns = self.get_returns(rewards)

        loss = self.train_policy(states, actions, returns)

        return loss

    # Function that updates the policy network
    def train_policy(self, states, actions, returns):
        policy = self.model(states)
        action_prob = torch.sum(actions * policy, dim=1)
        cross_entropy = torch.log(action_prob + 1.e-7) * returns
        loss = -torch.mean(cross_entropy)

        self.model_optimizer.zero_grad()
        loss.backward()
        self.model_optimizer.step()

        return loss.item()

    # Save the model weights to a file
    def save(self):
        torch.save(self.model.state_dict(), self.config.save_file)

    # Load the model weights from a file
    def load(self):
        self.model.load_state_dict(torch.load(self.config.save_file))

    # Release GPU memory
    def close(self):
        del self.model
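
The backward recursion in get_returns above can be checked in isolation. A small sketch, with the normalization step omitted and gamma fixed at 0.99 (illustrative values only):

import torch

def discounted_returns(rewards, gamma=0.99):
    # Same backward recursion as PGAgent.get_returns, without normalization
    returns = torch.zeros(len(rewards), dtype=torch.float)
    R = 0.0
    for i in reversed(range(len(rewards))):
        R = rewards[i] + gamma * R
        returns[i] = R
    return returns

print(discounted_returns([1.0, 1.0, 1.0]))  # tensor([2.9701, 1.9900, 1.0000])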
Example #28
# Turn on pyplot's interactive mode
# VERY IMPORTANT because otherwise the training stats plot will halt
plt.ion()

# Create OpenAI gym environment
env = gym.make(env_name)
if is_unwrapped:
    env = env.unwrapped

# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Current usable device is: ", device)

# Create the model
policy_net = PolicyNet(layer_sizes).to(device)

# Set up optimizer - Minimal
optimizer = optim.Adam(policy_net.parameters())
# optimizer = optim.SGD(policy_net.parameters(), lr=learning_rate)

###################################################################
# Start training

# Dictionary for extra training information to save to checkpoints
training_info = {"epoch mean durations" : [],
                 "epoch mean rewards" : [],
                 "max reward achieved": 0,
                 "past %d epochs mean reward" %  (num_avg_epoch): 0,}

# Batch that records trajectories
Example #29
class SAC:
    def __init__(self, env, gamma, tau, buffer_maxlen, value_lr, q_lr, policy_lr):

        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_range = [env.action_space.low, env.action_space.high]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau

        # initialize networks
        self.value_net = ValueNet(self.state_dim).to(device)
        self.target_value_net = ValueNet(self.state_dim).to(device)
        self.q1_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.q2_net = SoftQNet(self.state_dim, self.action_dim).to(device)
        self.policy_net = PolicyNet(self.state_dim, self.action_dim).to(device)

        # Load the target value network parameters
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # Initialize the optimizer
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=value_lr)
        self.q1_optimizer = optim.Adam(self.q1_net.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q2_net.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # Initialize the buffer
        self.buffer = ReplayBeffer(buffer_maxlen)

    def get_action(self, state):
        action = self.policy_net.action(state)
        action = action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
                 (self.action_range[1] + self.action_range[0]) / 2.0

        return action

    def update(self, batch_size):
        state, action, reward, next_state, done = self.buffer.sample(batch_size)
        new_action, log_prob = self.policy_net.evaluate(state)

        # V value loss
        value = self.value_net(state)
        new_q1_value = self.q1_net(state, new_action)
        new_q2_value = self.q2_net(state, new_action)
        next_value = torch.min(new_q1_value, new_q2_value) - log_prob
        value_loss = F.mse_loss(value, next_value.detach())

        # Soft Q loss
        q1_value = self.q1_net(state, action)
        q2_value = self.q2_net(state, action)
        target_value = self.target_value_net(next_state)
        target_q_value = reward + done * self.gamma * target_value
        q1_value_loss = F.mse_loss(q1_value, target_q_value.detach())
        q2_value_loss = F.mse_loss(q2_value, target_q_value.detach())

        # Policy loss
        policy_loss = (log_prob - torch.min(new_q1_value, new_q2_value)).mean()

        # Update v
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        # Update Soft q
        self.q1_optimizer.zero_grad()
        self.q2_optimizer.zero_grad()
        q1_value_loss.backward()
        q2_value_loss.backward()
        self.q1_optimizer.step()
        self.q2_optimizer.step()

        # Update Policy
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update target networks
        for target_param, param in zip(self.target_value_net.parameters(), self.value_net.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
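
The Polyak averaging loop at the end of update() repeats the loop from __init__ verbatim. A small helper that performs the same soft update could factor it out; this is a sketch, not part of the original code:

def soft_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target, as in SAC.update() above
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# Usage inside update(): soft_update(self.target_value_net, self.value_net, self.tau)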