def main():
    learning_rate = 0.001
    discount = 0.995
    beta = 0.4
    eps = 0.05
    K_epoch = 3
    num_steps = 128

    envs = [make_env() for _ in range(num_envs)]
    envs = SubprocVecEnv(envs)
    model = CNNTradingAgent(num_features=envs.reset().shape[-1],
                            n_actions=2 * n_action_intervals + 1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    print_interval = 10

    scores_list = []
    loss_list = []
    for n_epi in range(1, 10001):  # run 10,000 training episodes
        loss = 0.0
        log_probs, states, actions, rewards, next_state, masks, values = collect_trajectories(
            envs, model, num_steps)

        # raise Exception("True" if torch.any(torch.isnan(torch.stack(states))) else "False")
        if beta > 0.01:
            beta *= discount
        for _ in range(K_epoch):
            L = -clipped_surrogate(envs, model, log_probs, states, actions,
                                   rewards, discount, eps, beta)

            optimizer.zero_grad()
            L.backward()
            optimizer.step()

            loss += L.item()
            del L

        score = np.asarray(rewards).sum(axis=0).mean()
        scores_list.append(score)
        loss_list.append(loss)

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.4f}, loss : {:.6f}".format(
                n_epi, score / print_interval, loss / print_interval))
            print("actions : ", torch.cat(actions))

        if n_epi % save_interval == 0:
            torch.save(model.state_dict(),
                       os.path.join(save_location, f'TradingGym_{n_epi}.pth'))
            torch.save(scores_list,
                       os.path.join(save_location, f"{n_epi}_scores.pth"))
            # plt.plot(scores_list)
            # plt.title("Reward")
            # plt.grid(True)
            # plt.savefig(os.path.join(save_location,f'{n_epi}_ppo.png'))
            # plt.close()

    del envs
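# collect_trajectories() is imported from elsewhere in this project and is not shown above. The
# sketch below is a hypothetical stand-in that matches how it is called (per-step lists gathered
# from the vectorized envs); it assumes model(state) -> (action_probs, value), so the real
# CNNTradingAgent interface may differ.
from torch.distributions import Categorical

def collect_trajectories(envs, model, num_steps):
    log_probs, states, actions, rewards, masks, values = [], [], [], [], [], []
    state = envs.reset()
    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        probs, value = model(state)          # assumption: the model returns (probs, value)
        dist = Categorical(probs)
        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
        log_probs.append(dist.log_prob(action))
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))
        values.append(value)
        state = next_state
    return log_probs, states, actions, rewards, next_state, masks, values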
Example No. 2
def run(num_envs=16,
        hidden_dim=256,
        batch_size=1024,
        iterations=1000,
        log_interval=10,
        runs=1):
    envs = [tl.make_nh_waypoint_3d() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    t_env = tenv.WaypointEnv3D()
    state_dim = t_env.observation_space.shape[0]
    action_dim = t_env.action_space.shape[0]
    path = os.getcwd() + "/nh_waypoint_3d/"
    for i in range(runs):
        agent = ag.Agent(state_dim, hidden_dim, action_dim, dim=3)
        opt = torch.optim.Adam(agent.parameters(), lr=1e-4)
        ep, rew, agent = tl.train_mp(envs,
                                     t_env,
                                     agent,
                                     opt,
                                     batch_size,
                                     iterations,
                                     log_interval,
                                     render=False,
                                     fname=path + "gaussian_" + str(2))
        if i == 0:
            csv_input = pd.DataFrame()
            csv_input["timesteps"] = ep
        csv_input["run" + str(i)] = rew
        csv_input.to_csv(path + "data.csv", index=False)
Example No. 3
    def __init__(self, numOfEnvs):
        
        self.testRewards = []
        
#         self.num_envs = 16
#         self.num_envs = numOfEnvs
        self.num_envs = 6
        
        self.env_name = "Pendulum-v0"
        self.env = gym.make(self.env_name)
        
        self.envs = [self.make_env() for i in range(self.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        
        self.num_inputs  = self.envs.observation_space.shape[0]
        self.num_outputs = self.envs.action_space.shape[0]

        #Hyper params:
        self.hidden_size      = 256
        self.lr               = 3e-3

        self.model = ActorCritic(self.num_inputs, self.num_outputs, self.hidden_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
Example No. 4
def make_envs(num_envs=16, env_name="CartPole-v0"):
    '''Create multiple sub-environments.'''
    def make_env():
        def _thunk():
            env = gym.make(env_name)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    return envs
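# Minimal usage sketch for the helper above (hypothetical, not part of the original file); the
# __main__ guard matters because SubprocVecEnv spawns worker processes.
if __name__ == '__main__':
    vec_envs = make_envs(num_envs=4, env_name="CartPole-v0")
    obs = vec_envs.reset()                                   # one observation row per sub-env
    actions = [vec_envs.action_space.sample() for _ in range(4)]
    obs, rewards, dones, infos = vec_envs.step(actions)
    vec_envs.close()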
Example No. 5
def run(num_envs=16,
        hidden_dim=256,
        batch_size=1024,
        iterations=1000,
        log_interval=10,
        runs=3):
    envs = [tl.make_term_3d() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    t_env = tenv.TrajectoryEnvTerm()
    t_env.num_fut_wp = int(cfg.waypoints - 1)
    state_size = 5 + 15 * (t_env.num_fut_wp + 1)
    t_env.observation_space = gym.spaces.Box(-1, 1, shape=(state_size, ))

    state_dim = t_env.observation_space.shape[0]
    action_dim = t_env.action_space.shape[0]
    path = os.getcwd() + "/_3d/term_3d/"
    for i in range(runs):
        agent = ag.Agent(state_dim,
                         hidden_dim,
                         action_dim,
                         dim=3,
                         lookahead=lookahead)
        opt = torch.optim.Adam(agent.parameters(), lr=cfg.lr)
        ep, rew, term_rew, agent = tl.train_term_mp(envs,
                                                    t_env,
                                                    agent,
                                                    opt,
                                                    batch_size,
                                                    iterations,
                                                    log_interval,
                                                    render=False,
                                                    fname=path + wps + "-wps")
        if i == 0:
            csv_input = pd.DataFrame()
            csv_input["iterations"] = ep
            term_csv_input = pd.DataFrame()
            term_csv_input["iterations"] = ep
        csv_input["run" + str(i)] = rew
        term_csv_input["run" + str(i)] = term_rew
        csv_input.to_csv(path + "data_wp-" + wps + ".csv", index=False)
        term_csv_input.to_csv(path + "term_data_wp-" + wps + ".csv",
                              index=False)
Example No. 6
    board[2] = observation['bomb_blast_strength']    
    return board

def makeTrainingObservation():
    env = Pomme(**config["env_kwargs"])
    agents = {}
    for agent_id in range(num_players):
        agent = TrainingAgent(config["agent"](agent_id, config["game_type"]))              
        agents[agent_id] = agent
    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)
    return env

if __name__ == '__main__':
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)    
    
    state_shape = (3,11,11) 
    num_actions = envs.action_space.n

     #a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e6)

    #rmsprop hyperparams:
    lr    = 7e-4
    eps   = 1e-5
Example No. 7
def main():
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.n

    model = ActorCritic(num_inputs, num_outputs, hidden_size,
                        hd2_size).to(device)
    optimizer = optim.Adam(model.parameters())

    max_frames = 10000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()

    while frame_idx < max_frames:

        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0

        for _ in range(num_steps):
            state = torch.FloatTensor(state).to(device)
            dist, value = model(state)

            action = dist.sample()
            next_state, reward, done, _ = envs.step(action.cpu().numpy())

            log_prob = dist.log_prob(action)
            entropy += dist.entropy().mean()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

            state = next_state
            frame_idx += 1

        next_state = torch.FloatTensor(next_state).to(device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)

        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()

        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy

        print(f'\rframe: {frame_idx}\t loss: {loss}', end='')
        if frame_idx % 100 == 0:
            rewards, scores = map(
                list, zip(*((test_env(model, False) for _ in range(10)))))
            avg_rewards = np.mean(rewards)
            avg_scores = np.mean(scores)
            print(
                f'\rframe: {frame_idx}\t avg_rewards: {avg_rewards:.2f}\t avg_scores: {avg_scores:.2f}\t loss: {loss}'
            )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    for _ in range(10):
        test_env(model, True)
    envs.close()
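# compute_returns() is not defined in this snippet. A minimal sketch that matches the call above
# (bootstrapping from next_value and masking at episode boundaries); the discount factor gamma is
# an assumed default here.
def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns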
Example No. 8
import matplotlib.pyplot as plt
import matplotlib.animation as animation

from a2c import ActorCritic
from policy import *


def env_fn():
    env = gym.make('cube-x3-v0')
    env.unwrapped._refreshScrambleParameters(1, 2, scramble_easy=True)
    return env


actions = env_fn().unwrapped.action_list

envs = SubprocVecEnv([env_fn])

obs = envs.reset()
envs.render(0)

action_list = []

fig = plt.figure()
ims = []

im = plt.imshow(cube_gym.onehotToRGB(obs[0]))
ims.append([im])

with tf.Session() as sess:

    actor_critic = ActorCritic(sess, CnnPolicy, envs.observation_space.shape,
Example No. 9
import sys  
sys.path.append('./common')
from common.multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"

def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)


# Neural Network
def init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean=0., std=0.1)
        nn.init.constant_(m.bias, 0.1)

class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):
        super(ActorCritic, self).__init__()
        
        self.critic = nn.Sequential(
Example No. 10
def train(policy, save_name, load_count = 0, summarize=True, load_path=None, log_path = './logs'):
    
    #Minigrid maze env
    env_name = "MiniGrid-BlockMaze-v0"
    def make_env(env_name):
        return lambda: gym_minigrid.wrappers.PadImgObsWrapper(gym.make(env_name))

    envs = [make_env(env_name) for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
                ac_space, policy, summarize)
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (N_ENVS*N_STEPS, nw, nh, nc)

        dones = [False for _ in range(N_ENVS)]
        nbatch = N_ENVS * N_STEPS

        episode_rewards = np.zeros((N_ENVS, ))
        final_rewards   = np.zeros((N_ENVS, ))

        for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
            # mb stands for mini batch
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
            for n in range(N_STEPS):
                actions, values, _ = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, _ = envs.step(actions)

                #print(obs[0:3, :,:,0])

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)

            mb_dones.append(dones)

            #batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            #discount/bootstrap off value fn
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards+[value], d+[0], GAMMA)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, GAMMA)
                mb_rewards[n] = rewards

            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()

            if summarize:
                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update,
                        summary_op)
                writer.add_summary(summary, update)
            else:
                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update)

            if update % LOG_INTERVAL == 0 or update == 1:
                print('%i): %.4f, %.4f, %.4f' % (update, policy_loss, value_loss, policy_entropy))
                print(final_rewards.mean())

            if update % SAVE_INTERVAL == 0:
                print('Saving model')
                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

        actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
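# discount_with_dones() is used above but defined elsewhere; a sketch consistent with how it is
# called (n-step discounting that resets at episode boundaries), in the style of the OpenAI
# baselines a2c utilities:
def discount_with_dones(rewards, dones, gamma):
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)   # zero out the running return when an episode ended
        discounted.append(r)
    return discounted[::-1]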
Example No. 11
from common.multiprocessing_env import SubprocVecEnv

num_envs = 16
env_name = "Pendulum-v0"


def make_env():
    def make():
        env = gym.make(env_name)
        return env

    return make


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)

STATE_DIM = env.observation_space.shape[0]
ACTION_DIM = env.action_space.shape[0]
ACTION_MAX = env.action_space.high[0]
SAMPLE_NUMS = 100
TARGET_UPDATE_STEP = 10
CLIP_PARAM = 0.3

FloatTensor = torch.FloatTensor
LongTensor = torch.LongTensor
ByteTensor = torch.ByteTensor
Tensor = FloatTensor
Example No. 12
logger = Logger()

def make_cuda(input):
    if USE_CUDA:
        return input.cuda()
    return input

def make_env():   
    def _thunk():        
        env = Key_Collect()
        return env
    return _thunk

if __name__ == '__main__': # important for windows systems if subprocesses are run
    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)    
    
    state_shape = envs.observation_space.shape 
    num_actions = envs.action_space.n    
    
    #a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 10
    num_frames = int(1e6)
    

    #Init a2c and rmsprop   
    actor_critic = ActorCritic(state_shape, num_actions)            
Example No. 13
class Ppo:
    
    def __init__(self, numOfEnvs):
        
        self.testRewards = []
        
#         self.num_envs = 16
#         self.num_envs = numOfEnvs
        self.num_envs = 6
        
        self.env_name = "Pendulum-v0"
        self.env = gym.make(self.env_name)
        
        self.envs = [self.make_env() for i in range(self.num_envs)]
        self.envs = SubprocVecEnv(self.envs)
        
        self.num_inputs  = self.envs.observation_space.shape[0]
        self.num_outputs = self.envs.action_space.shape[0]

        #Hyper params:
        self.hidden_size      = 256
        self.lr               = 3e-3

        self.model = ActorCritic(self.num_inputs, self.num_outputs, self.hidden_size).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

            
    def make_env(self):
        def _thunk():
            env = gym.make(self.env_name)
            return env

        return _thunk        

#     def compute_gae(self, next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    def compute_gae(self, next_value, rewards, masks, values, g, t):
        
        gamma = float(g)
        tau = float(t)

        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns
    
    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):
                dist, value = self.model(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                actor_loss  = - torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()

                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        return loss
                
    def plot(self, frame_idx, rewards):
        clear_output(True)
        plt.figure(figsize=(20,5))
        plt.subplot(131)
        plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))
        plt.plot(rewards)
        plt.show()
#         plt.savefig("{0}/{1}_rewardGraph.png".format(saveGraphPath, frame_idx))
        
    def test_env(self, vis=False):
        state = self.env.reset()
        if vis: self.env.render()
        done = False
        total_reward = 0
        while not done:
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            dist, _ = self.model(state)
            next_state, reward, done, _ = self.env.step(dist.sample().cpu().numpy()[0])
            state = next_state
            if vis: self.env.render()
            total_reward += reward
        return total_reward
                
    def main(self, inputVals):
        gam = inputVals[0]
        lam = inputVals[1]
        
        print ("Gam: ", gam)
        print ("Lam: ", lam)
        
        num_inputs  = self.envs.observation_space.shape[0]
        num_outputs = self.envs.action_space.shape[0]

        #Hyper params:
#         hidden_size      = 256
#         lr               = 3e-3
        num_steps        = 20
        mini_batch_size  = 5
        ppo_epochs       = 4
        threshold_reward = -200

#         model = a.ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
#         optimizer = optim.Adam(self.model.parameters(), lr=lr)
        
        max_frames = 12000
#         max_frames = 2000
        frame_idx  = 0
        self.test_rewards = []
        
        state = self.envs.reset()
        early_stop = False

        while frame_idx < max_frames and not early_stop:

            log_probs = []
            values    = []
            states    = []
            actions   = []
            rewards   = []
            masks     = []
            entropy = 0

            for _ in range(num_steps):
                state = torch.FloatTensor(state).to(device)
                dist, value = self.model(state)

                action = dist.sample()
                next_state, reward, done, _ = self.envs.step(action.cpu().numpy())

                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))

                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1

                if frame_idx % 1000 == 0:
                    test_reward = np.mean([self.test_env() for _ in range(10)])
                    self.test_rewards.append(test_reward)
                    self.plot(frame_idx, self.test_rewards)
                    if test_reward > threshold_reward: early_stop = True
                    print ("rewards: ", test_reward)


            next_state = torch.FloatTensor(next_state).to(device)
            _, next_value = self.model(next_state)
            returns = self.compute_gae(next_value, rewards, masks, values, gam, lam)

            returns   = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values    = torch.cat(values).detach()
            states    = torch.cat(states)
            actions   = torch.cat(actions)
            advantage = returns - values

            lastLoss = self.ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)
#             print ("loss: ", [lastLoss])
            
#         re = rewards[-1].cpu()
#         print ("RE: ", np.asarray(re))
#         return (np.asarray(re))
        return lastLoss.item()
Example No. 14
from hyperparameters import *
from common.multiprocessing_env import SubprocVecEnv
from model import Net, Brain
from envs import make_env
from tqdm import tqdm
import numpy as np

seed_num = 1
torch.manual_seed(seed_num)
if use_cuda:
    torch.cuda.manual_seed(seed_num)

# Build the execution environments
torch.set_num_threads(seed_num)
envs = [make_env(ENV_NAME, seed_num, i) for i in range(NUM_PROCESSES)]
envs = SubprocVecEnv(envs)  # multiprocess execution environment

n_out = envs.action_space.n  # the number of actions is 4
actor_critic = Net(n_out).to(device)  # use GPU
global_brain = Brain(actor_critic)

# Create variables for storing rollout information
obs_shape = envs.observation_space.shape  # (1, 84, 84)
obs_shape = (obs_shape[0] * NUM_STACK_FRAME,
                *obs_shape[1:])  # (4, 84, 84)
# torch.Size([16, 4, 84, 84])
current_obs = torch.zeros(NUM_PROCESSES, *obs_shape).to(device)
rollouts = RolloutStorage(
    NUM_ADVANCED_STEP, NUM_PROCESSES, obs_shape)  # rollouts object
episode_rewards = torch.zeros([NUM_PROCESSES, 1])  # rewards accumulated in the current episode
final_rewards = torch.zeros([NUM_PROCESSES, 1])  # total reward of the last completed episode
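# RolloutStorage is used above but its definition is not part of this snippet. Below is a minimal
# hypothetical sketch with the interface implied here (a fixed-length advanced-step buffer plus
# discounted returns); the project's real class may store more fields.
class RolloutStorage:
    def __init__(self, num_steps, num_processes, obs_shape):
        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_shape).to(device)
        self.masks = torch.ones(num_steps + 1, num_processes, 1).to(device)
        self.rewards = torch.zeros(num_steps, num_processes, 1).to(device)
        self.actions = torch.zeros(num_steps, num_processes, 1).long().to(device)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1).to(device)
        self.num_steps = num_steps
        self.index = 0

    def insert(self, current_obs, action, reward, mask):
        self.observations[self.index + 1].copy_(current_obs)
        self.masks[self.index + 1].copy_(mask)
        self.rewards[self.index].copy_(reward)
        self.actions[self.index].copy_(action)
        self.index = (self.index + 1) % self.num_steps

    def after_update(self):
        # carry the last observation/mask over as the start of the next rollout
        self.observations[0].copy_(self.observations[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, gamma):
        # discounted returns bootstrapped from the critic's value of the last observation
        self.returns[-1] = next_value
        for step in reversed(range(self.rewards.size(0))):
            self.returns[step] = self.rewards[step] + gamma * self.returns[step + 1] * self.masks[step + 1]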
Example No. 15
File: GAE.py  Project: yhjflower/RL
        returns.insert(0, gae + values[step])
    return returns
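# Only the tail of the GAE computation survives in this excerpt. For reference, the full function
# looks like the compute_gae shown in Example No. 13 on this page (gamma and tau defaults assumed):
def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    values = values + [next_value]
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns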


if __name__ == '__main__':
    num_envs = 8

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)
    num_inputs = envs.observation_space.shape[0]
    num_outputs = envs.action_space.shape[0]

    hidden_size = 256
    lr = 3e-2
    num_steps = 20

    model = ActorCritic(num_inputs, num_outputs, hidden_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr)

    max_frames = 100000
    frame_idx = 0
    test_rewards = []

    state = envs.reset()
Example No. 16
File: main.py  Project: km01/myrl

env_name = 'Pendulum-v0'
gamma = 0.9
num_envs = 12  # errors can occur if num_envs is too large
max_frame = 50000
actor_lr = 0.0003
critic_lr = 0.001
max_grad_norm = 0.7
n_steps = 50
max_episode_steps = 500
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


if __name__ == '__main__':
    envs = SubprocVecEnv([make_env(env_name) for i in range(num_envs)])
    envs.set_max_episode_steps(max_episode_steps)
    actor = Actor().to(device)
    critic = Critic().to(device)
    a_solver = optim.Adam(actor.parameters(), lr=actor_lr)
    c_solver = optim.Adam(critic.parameters(), lr=critic_lr)

    frame_count = 0
    rewards = [[0.] for _ in range(num_envs)]
    global_rewards = []

    obs_gotten = None

    while frame_count < max_frame:

        cache = {'obs': [], 'acts': [], 'rews': [], 'dones': []}
Example No. 17
n_updates = 4

frame_idx = 0
scores_list = []


def make_env():
    def _thunk():
        env = gym.make(env_name)
        return env

    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)


class ActorCritic(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=2,
                               out_channels=4,
                               kernel_size=6,
                               stride=2,
                               bias=False)
        nn.init.orthogonal_(self.conv1.weight, np.sqrt(2))
        #The second convolution layer takes a 20x20 frame and produces a 9x9 frame
        self.conv2 = nn.Conv2d(
Example No. 18
def train(env_fn=None,
          spectrum=False,
          vae_arch=None,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          kl_coeff=0.5,
          lr=7e-4,
          log_interval=100,
          summarize=True,
          vae_load_path=None,
          a2c_load_path=None,
          log_path=None,
          cpu_cores=1):

    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:

        actor_critic = RandomActorCritic(sess, a2c_arch, ob_space, ac_space,
                                         nenvs, nsteps)

        if a2c_load_path is not None:
            actor_critic.load(a2c_load_path)
            print('Loaded a2c')
        else:
            actor_critic.epsilon = -1
            print('WARNING: No Actor Critic Model loaded. Using Random Agent')

        vae = VariationalAutoEncoder(sess, vae_arch, ob_space, ac_space, lr,
                                     kl_coeff, summarize)

        load_count = 0
        if vae_load_path is not None:
            vae.load(vae_load_path)

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        print('VAE Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1,
                            int(max_iters) + 1),
                      ascii=True,
                      desc='VarAutoEncoder'):

            mb_s, mb_a, mb_r, mb_ns, mb_d = [], [], [], [], []

            for s, a, r, ns, d in model_play_games(actor_critic, envs, nsteps):
                mb_s.append(s)
                mb_a.append(a)
                mb_r.append(r)
                mb_ns.append(ns)
                mb_d.append(d)

            mb_s = np.concatenate(mb_s)
            mb_a = np.concatenate(mb_a)
            mb_r = np.concatenate(mb_r)
            mb_ns = np.concatenate(mb_ns)
            mb_d = np.concatenate(mb_d)

            if summarize:
                loss, recon_loss, kl_loss, _, smy = vae.train(
                    mb_s, mb_a, mb_ns, mb_r, summary_op)
                writer.add_summary(smy, i)
            else:
                loss, recon_loss, kl_loss, _ = vae.train(
                    mb_s, mb_a, mb_ns, mb_r)

            if i % log_interval == 0:
                vae.save(log_path, i)

        vae.save(log_path, 'final')
        print('Variational AutoEncoder is finished training')
Example No. 19
    np.random.seed(2019)

    from common.multiprocessing_env import SubprocVecEnv

    num_envs = 1  #original is 16
    env_name = "Pendulum-v0"

    def make_env():
        def _thunk():
            env = gym.make(env_name)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    env = gym.make(env_name)

    class ActorCritic:
        def __init__(self,
                     sess,
                     obs,
                     acs,
                     hidden_size,
                     name,
                     trainable,
                     init_std=1.0):
            self.sess = sess
            self.obs = obs
            self.acs = acs
Example No. 20
def train(env_fn=None,
          spectrum=False,
          a2c_arch=None,
          nenvs=16,
          nsteps=100,
          max_iters=1e6,
          gamma=0.99,
          pg_coeff=1.0,
          vf_coeff=0.5,
          ent_coeff=0.01,
          max_grad_norm=0.5,
          lr=7e-4,
          alpha=0.99,
          epsilon=1e-5,
          log_interval=100,
          summarize=True,
          load_path=None,
          log_path=None,
          cpu_cores=1):

    # Construct the vectorized parallel environments
    envs = [env_fn for _ in range(nenvs)]
    envs = SubprocVecEnv(envs)

    # Set some random seeds for the environment
    envs.seed(0)
    if spectrum:
        envs.spectrum()

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=cpu_cores,
                               intra_op_parallelism_threads=cpu_cores)
    tf_config.gpu_options.allow_growth = True

    with tf.Session(config=tf_config) as sess:

        actor_critic = ActorCritic(sess, a2c_arch, ob_space, ac_space,
                                   pg_coeff, vf_coeff, ent_coeff,
                                   max_grad_norm, lr, alpha, epsilon,
                                   summarize)

        load_count = 0
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (-1, nw, nh, nc)

        dones = [False for _ in range(nenvs)]

        episode_rewards = np.zeros((nenvs, ))
        final_rewards = np.zeros((nenvs, ))

        print('a2c Training Start!')
        print('Model will be saved on intervals of %i' % (log_interval))
        for i in tqdm(range(load_count + 1,
                            int(max_iters) + 1),
                      ascii=True,
                      desc='ActorCritic'):

            # Create the minibatch lists
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_depth = [], [], [], [], [], []
            total_reward = 0

            for n in range(nsteps):

                # Get the actions and values from the actor critic, we don't need neglogp
                actions, values, neglogp = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, info = envs.step(actions)
                total_reward += np.sum(rewards)

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)
                mb_depth.append(
                    np.array(
                        [info_item['scramble_depth'] for info_item in info]))

            mb_dones.append(dones)

            # Convert batch steps to batch rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(
                1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards,
                                    dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.float32).swapaxes(1, 0)
            mb_depth = np.asarray(mb_depth, dtype=np.int32).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # discounting
            for n, (rewards, d,
                    value) in enumerate(zip(mb_rewards, mb_dones,
                                            last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0],
                                                  gamma)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, gamma)
                mb_rewards[n] = rewards

            # Flatten the whole minibatch
            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()
            mb_depth = mb_depth.flatten()

            # Save the information to tensorboard
            if summarize:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i, summary_op)
                writer.add_summary(summary, i)
            else:
                loss, policy_loss, value_loss, policy_ent, mrew, mdp, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values,
                    mb_depth, i)

            if i % log_interval == 0:
                actor_critic.save(log_path, i)

        actor_critic.save(log_path, 'final')
        print('a2c model is finished training')
Example No. 21
                # plot(frame_idx, test_rewards)
        next_state = torch.FloatTensor(next_state).to(cfg.device)
        _, next_value = model(next_state)
        returns = compute_returns(next_value, rewards, masks)
        log_probs = torch.cat(log_probs)
        returns = torch.cat(returns).detach()
        values = torch.cat(values)
        advantage = returns - values
        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = advantage.pow(2).mean()
        loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return test_rewards, test_ma_rewards


if __name__ == "__main__":
    cfg = A2CConfig()
    envs = [make_envs(cfg.env) for i in range(cfg.n_envs)]
    envs = SubprocVecEnv(envs)  # 8 env
    rewards, ma_rewards = train(cfg, envs)
    make_dir(cfg.result_path, cfg.model_path)
    save_results(rewards, ma_rewards, tag='train', path=cfg.result_path)
    plot_rewards(rewards,
                 ma_rewards,
                 tag="train",
                 env=cfg.env,
                 algo=cfg.algo,
                 path=cfg.result_path)
Example No. 22
# Create Environments
num_envs = 4
env_name = 'CartPole-v0'


def make_env():
    def _thunk():
        env = gym.make(env_name)
        env.seed(seed)
        return env

    return _thunk


envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

env = gym.make(env_name)
env.seed(seed)


# Neural Network
class NonSpikingLIFNode(neuron.LIFNode):
    def forward(self, dv: torch.Tensor):
        self.neuronal_charge(dv)
        # self.neuronal_fire()
        # self.neuronal_reset()
        return self.v

Example No. 23
        target_param.data.copy_(target_param.data * (1.0 - soft_tau) +
                                param.data * soft_tau)


def make_env(env_id):
    def _thunk():
        '''Required to run SubprocVecEnv, the multiprocess environment'''
        env = gym.make(env_id)
        env = NormalizedActions(env)
        return env

    return _thunk


envs = [make_env("Pendulum-v0") for i in range(NUM_PROCESS)]
envs = SubprocVecEnv(envs)  # multiprocess execution environment
ou_noise = OUNoise(envs.action_space)

state_dim = envs.observation_space.shape[0]
action_dim = envs.action_space.shape[0]
hidden_dim = 256

value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

target_value_net = ValueNetwork(state_dim, action_dim, hidden_dim).to(device)
target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim).to(device)

for target_param, param in zip(target_value_net.parameters(),
                               value_net.parameters()):
    target_param.data.copy_(param.data)
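# OUNoise is instantiated above but not defined in this excerpt. A minimal hypothetical sketch of
# an Ornstein-Uhlenbeck exploration-noise helper with the constructor used here (the project's
# real class may add sigma decay and a time argument):
import numpy as np

class OUNoise:
    def __init__(self, action_space, mu=0.0, theta=0.15, sigma=0.3):
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.low = action_space.low
        self.high = action_space.high
        self.dim = action_space.shape[0]
        self.reset()

    def reset(self):
        self.state = np.ones(self.dim) * self.mu

    def evolve_state(self):
        # mean-reverting random walk: dx = theta * (mu - x) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action):
        # add temporally correlated noise and clip to the action bounds
        return np.clip(action + self.evolve_state(), self.low, self.high)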
Example No. 24



num_envs = 4

def make_env():
    def _thunk():
        env = SokobanEnv()
        return env

    return _thunk

envs = [make_env() for i in range(num_envs)]
envs = SubprocVecEnv(envs)

state_shape = envs.observation_space.shape

#a2c hyperparams:
gamma = 0.99
entropy_coef = 0.01
value_loss_coef = 0.5
max_grad_norm = 0.5
num_steps = 120
num_batch = int(10e5)

#rmsprop hyperparams:
lr    = 7e-4
eps   = 1e-5
alpha = 0.99
Example No. 25
            break
    env_t.close()


env_name = 'CartPole-v1'
gamma = 0.99
num_envs = 8
PENALTY = -1.0
n_step = 4
max_frame = 50000
lr = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == '__main__':

    envs = SubprocVecEnv([make_env(env_name) for i in range(num_envs)])
    net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
    actor = Actor(4, 128, 2).to(device)
    critic = Critic(4, 128).to(device)
    solver = optim.Adam(
        list(actor.parameters()) + list(critic.parameters()), lr)

    duration = []
    frame_count = 0
    lifespan = [[0] for _ in range(num_envs)]
    s_gotten = None

    while frame_count * n_step < max_frame:
        obs_l, acts_l, rews_l, dones_l, probs_l = [], [], [], [], []
        accept_sample = [True for _ in range(num_envs)]
        for _ in range(n_step):
Example No. 26
File: main.py  Project: km01/myrl
gamma = 0.99
batch_size = 64
lr = 0.001
initial_exploration = 1000
update_target = 200
replay_memory_capacity = 30000
max_frame = 100000
PENALTY = -1.0
num_envs = 8


if __name__ == '__main__':
    import warnings
    warnings.filterwarnings("ignore", category=UserWarning)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    envs = SubprocVecEnv([make_env(env_name) for i in range(num_envs)])
    net = nn.Sequential(nn.Linear(4, 128), nn.ReLU(), nn.Linear(128, 2))
    agent = Model(net, 2).to(device)
    solver = optim.Adam(agent.parameters())
    memory = Memory(replay_memory_capacity)

    eps = 1.0
    duration = []
    frame_count = 0
    lifespan = [[0] for _ in range(num_envs)]
    s_gotten = None
    while frame_count < max_frame:
        s = envs.reset() if s_gotten is None else s_gotten
        preprocessed_s = torch.FloatTensor(s)
        a = agent.response(preprocessed_s, eps)
        s_gotten, r, done, _ = envs.step(a)