def start_or_resume_from_checkpoint():
    """
    Create actor, critic, actor optimizer and critic optimizer from scratch
    or load from latest checkpoint if it exists.
    """
    max_checkpoint_iteration = get_last_checkpoint_iteration()

    obsv_dim, action_dim, continuous_action_space = get_env_space()
    actor = Actor(obsv_dim,
                  action_dim,
                  continuous_action_space=continuous_action_space,
                  trainable_std_dev=hp.trainable_std_dev,
                  init_log_std_dev=hp.init_log_std_dev)
    critic = Critic(obsv_dim)
    var_critic = Var_Critic(obsv_dim)

    actor_optimizer = optim.AdamW(actor.parameters(), lr=hp.actor_learning_rate)
    critic_optimizer = optim.AdamW(critic.parameters(), lr=hp.critic_learning_rate)

    var_critic_optimizer = optim.AdamW(var_critic.parameters(), lr=hp.var_critic_learning_rate)

    stop_conditions = StopConditions()

    # If max checkpoint iteration is greater than zero initialise training with the checkpoint.
    if max_checkpoint_iteration > 0:
        actor_state_dict, critic_state_dict, var_critic_state_dict, \
        actor_optimizer_state_dict, critic_optimizer_state_dict, \
        var_critic_optimizer_state_dict, stop_conditions = load_checkpoint(max_checkpoint_iteration)

        actor.load_state_dict(actor_state_dict, strict=True)
        critic.load_state_dict(critic_state_dict, strict=True)
        var_critic.load_state_dict(var_critic_state_dict, strict=True)

        actor_optimizer.load_state_dict(actor_optimizer_state_dict)
        critic_optimizer.load_state_dict(critic_optimizer_state_dict)
        var_critic_optimizer.load_state_dict(var_critic_optimizer_state_dict)

        '''We have to manually move the optimizer states to TRAIN_DEVICE
        since the optimizer doesn't yet have a "to" method.'''

        for state in actor_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(TRAIN_DEVICE)

        for state in critic_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(TRAIN_DEVICE)

        for state in var_critic_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(TRAIN_DEVICE)

    return actor, critic, var_critic, actor_optimizer, critic_optimizer, var_critic_optimizer, \
           max_checkpoint_iteration, stop_conditions
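
# The three identical device-transfer loops above could be collapsed into a small
# helper. A minimal sketch (assuming only tensor-valued optimizer state, as is the
# case for AdamW):
def move_optimizer_state_to(optimizer, device):
    """Move all tensor-valued optimizer state to `device`;
    torch.optim optimizers still have no `.to()` method of their own."""
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)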
Example #2
    def __init__(self, envs, testing_envs, seed, variance_limit = 0.25):
        self.seed = seed
        self.successes = []
        self.testing_envs = testing_envs
        self.envs = envs
        self.variance_limit = variance_limit
        
        training_envs_per_dof = int(len(self.envs.envs)/3)
        
        self.training_env_seq = [4]*training_envs_per_dof + [5]*training_envs_per_dof + [6]*training_envs_per_dof
        self.testing_env_seq = [4]*10 + [5]*10 + [6]*10

        if p.mode == "retrain":
            self.training_env_seq = self.testing_env_seq

        self.device = torch.device(p.device)
        # create the network
        self.actor = Actor().to(self.device)
        self.critic = Critic().to(self.device)

        if p.mode == 'retrain':
            self.actor.load_state_dict(torch.load("actor_seed_{}".format(seed)))
            self.critic.load_state_dict(torch.load("critic_seed_{}".format(seed)))

        # build up the target network
        self.actor_target = Actor().to(self.device)
        self.critic_target = Critic().to(self.device)
        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
        # replay buffer with HER sampler
        self.buffer = replay_buffer(seed)

        if p.mode == 'retrain':
            self.buffer.load_normalizers()
            print("loading done")

        self.training_data, self.testing_data = {}, {}
        for env in self.envs.envs:
            self.training_data[env.name] = []
        for env in self.testing_envs.envs:
            self.testing_data[env.name] = []

        try:
            os.mkdir("Generated_data")
        except FileExistsError:
            pass
    def __init__(self, positions_per_level, move_probabilities, initial_vacancy_fraction, firing_schedule):
        """
        :param positions_per_level: list of positions per level; list of ints
                                    e.g. [10,20,30] == 10 positions in level 1, 20 in level 2, etc.
        :param move_probabilities: dict of move probabilities for agents, specific format below
                                    NB: vector of actor retirement probs is per level
                                    e.g. [retire prob level 1, retire prob level 2, retire prob level 3]
                                    while vector of move probs for vacancies is for vacancies, in order of
                                    [don't move, retire, move in same level, move down level]
                                    e.g. {"actor retirement probs": [0.1, 0.1, 0.1],
                                          "vacancy move probs": [0.3, 0.1, 0.3, 0.3]}
        :param initial_vacancy_fraction: float in [0,1] telling us what fraction of positions in each level
                                         should be vacant at model initialisation
        :param firing_schedule: dict indicating what retirement probabilities should be at given steps (form below)
                                this facilitates one-off changes where portions of levels are emptied of actors
                                e.g. {"steps": {5, 10}, "level-retire probability": [(1, 0.4), (2, 0.4), (3, 0.6)]}
        """
        super().__init__()
        # set parameters
        self.num_levels = len(positions_per_level)
        self.positions_per_level = positions_per_level
        self.move_probabilities = move_probabilities
        self.vacancy_fraction = initial_vacancy_fraction
        self.firing_schedule = firing_schedule

        self.per_step_movement = {"actor": 0, "vacancy": 0}

        self.schedule = SimultaneousActivation(self)
        self.running = True
        self.datacollector = DataCollector(
            model_reporters={"agent_counts": get_agent_counts,
                             "percent_vacant_per_level": get_percent_vacancy_per_level,
                             "mean_lengths": get_sequence_and_vacancy_mean_lengths,
                             "mean_lengths_std": get_sequence_and_vacancy_length_stdev,
                             "mean_spell_lengths": get_mean_spell_lengths,
                             "mean_spell_length_stdev": get_stdev_spell_lengths,
                             "total mobility": get_total_mobility})

        # make positions and populate them with agents
        self.positions = {i: {} for i in range(1, self.num_levels + 1)}
        for i in range(self.num_levels):
            vacancies = fraction_of_list(initial_vacancy_fraction, self.positions_per_level[i])
            for j in range(self.positions_per_level[i]):
                position_id = str(i + 1) + '-' + str(j + 1)  # position ID = level-position number
                p = Position(position_id, self)
                self.positions[i + 1][position_id] = p
                # make entity
                agent = Vacancy(uuid4(), self) if vacancies[j] else Actor(uuid4(), self)
                self.schedule.add(agent)
                # associate it with position
                agent.position = p.unique_id
                p.dual = [agent.unique_id, agent.type]
                # update logs
                agent.log.append(p.unique_id)
                p.log.append(agent.unique_id)
        self.retiree_spots = set()
        self.desired_positions = []
        self.retirees = {"actor": {}, "vacancy": {}}
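
# A hypothetical usage sketch for the constructor above, following the documented
# parameter formats. "VacancyChainModel" is a stand-in name (the class statement is
# not shown in this snippet), and model.step() assumes the usual Mesa Model.step method.
model = VacancyChainModel(
    positions_per_level=[10, 20, 30],
    move_probabilities={"actor retirement probs": [0.1, 0.1, 0.1],
                        "vacancy move probs": [0.3, 0.1, 0.3, 0.3]},
    initial_vacancy_fraction=0.2,
    firing_schedule={"steps": {5, 10},
                     "level-retire probability": [(1, 0.4), (2, 0.4), (3, 0.6)]})
for _ in range(50):
    model.step()
# the DataCollector set up in __init__ can then be queried, e.g.:
results = model.datacollector.get_model_vars_dataframe()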
Example #4
    def __init__(self, env, seed):
        self.seed = seed
        self.successes = []
        self.epochs = []
        self.env = env
        self.device = torch.device(p.device)
        # create the network
        self.actor = Actor(self.env.ob_shape, self.env.goal_shape,
                           self.env.action_shape).to(self.device)
        self.critic = Critic(self.env.ob_shape, self.env.goal_shape,
                             self.env.action_shape).to(self.device)
        # build up the target network
        self.actor_target = Actor(self.env.ob_shape, self.env.goal_shape,
                                  self.env.action_shape).to(self.device)
        self.critic_target = Critic(self.env.ob_shape, self.env.goal_shape,
                                    self.env.action_shape).to(self.device)
        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
        # replay buffer with HER sampler
        self.buffer = replay_buffer(self.env.ob_shape, self.env.action_shape)
Example #5
def main(epochs):

    # create model
    model = Actor()

    # load data
    data_path = os.path.join(script_path, 'data.pkl')
    with open(data_path, 'rb') as handler:
        data_dict = pickle.load(handler)
    data = data_dict['data']
    label = data_dict['label']
    '''OHLC is in dataframe label. indicators are in dataframe data.'''
    plot_series(label['c'])
    '''
    train-test split: 70% train, 30% test,
    so the buy-and-hold curve plotted here may differ from the full buy-and-hold curve
    '''
    split = int(.7 * len(data))

    train_data = data.iloc[:split]
    train_label = label.iloc[:split]
    test_data = data.iloc[split:]
    test_label = label.iloc[split:]

    # train
    ret_lst = []
    for i_ep in range(epochs):
        '''pick an arbitrary 96-step window, so idx must be less than the length minus 96'''
        idx = np.random.randint(len(train_data) - 96)
        '''sample a consecutive window of 96 steps;
        there are 3 actions in total, and the traded position is 0-4 bitcoins,
        with the initial one chosen at random
        '''
        env = Env(train_data[idx:idx + 96],
                  train_label[idx:idx + 96],
                  init_act=np.random.randint(5))
        ret = roll_out(env=env, model=model, train_mode=True)
        ret_lst.append(ret)

    # test
    env = Env(test_data, test_label, init_act=0)
    _, r_lst, p_lst, P_lst = roll_out(env=env, model=model, train_mode=False)

    # plot
    plot_result('Bitcoin', r_lst, p_lst)
Example #6
def main(epochs):

    # create model
    model = Actor()

    # load data
    data_path = os.path.join(script_path, 'BTCUSD-15Min-Data.pkl')
    with open(data_path, 'rb') as handler:
        data_dict = pickle.load(handler)
    data = data_dict['data']
    label = data_dict['label']

    # train-test split 70% and 30%
    split = int(.7 * len(data))

    train_data = data.iloc[:split]
    train_label = label.iloc[:split]
    test_data = data.iloc[split:]
    test_label = label.iloc[split:]

    # train
    ret_lst = []
    for i_ep in range(epochs):
        idx = np.random.randint(len(train_data) - 96)

        # sample a consecutive window of 96 steps
        env = Env(train_data[idx:idx + 96],
                  train_label[idx:idx + 96],
                  init_act=np.random.randint(5))
        ret = roll_out(env=env, model=model, train_mode=True)
        ret_lst.append(ret)

    # test
    env = Env(test_data, test_label, init_act=0)
    _, r_lst, p_lst, P_lst = roll_out(env=env, model=model, train_mode=False)

    # plot
    plot_result('Bitcoin', r_lst, p_lst)
def main(args):

    # Set path to save result
    gym_dir = './' + args['env'] + '_' + args['variation'] + '/gym'

    # Set random seed for reproducibility
    np.random.seed(int(args['seed']))
    tf.set_random_seed(int(args['seed']))

    with tf.Session() as sess:

        # Load environment
        env = gym.make(args['env'])
        env.seed(int(args['seed']))

        # get size of action and state (i.e. output and input for the agent)
        obs = env.reset()
        observation_dim = obs['observation'].shape[0]
        achieved_goal_dim = obs['achieved_goal'].shape[0]
        desired_goal_dim = obs['desired_goal'].shape[0]
        assert achieved_goal_dim == desired_goal_dim

        # state size = observation size + goal size
        state_dim = observation_dim + desired_goal_dim
        action_dim = env.action_space.shape[0]
        action_highbound = env.action_space.high

        # print out parameters
        print('Parameters:')
        print('Observation Size=', observation_dim)
        print('Goal Size=', desired_goal_dim)
        print('State Size =', state_dim)
        print('Action Size =', action_dim)
        print('Action Upper Boundary =', action_highbound)

        # save to monitor if render
        if args['render']:
            env = gym.wrappers.Monitor(env, gym_dir, force=True)
        else:
            env = gym.wrappers.Monitor(env, gym_dir, video_callable=False, force=True)

        # create actor
        actor = Actor(sess, state_dim, action_dim, action_highbound,
                      float(args['actor_lr']), float(args['tau']),
                      int(args['batch_size']), int(args['hidden_size']))

        # create critic
        critic = Critic(sess, state_dim, action_dim,
                        float(args['critic_lr']), float(args['tau']),
                        float(args['gamma']),
                        actor.n_actor_vars,
                        int(args['hidden_size']))

        # noise
        actor_noise = Noise(mu=np.zeros(action_dim))

        # train the network
        if not args['test']:
            train(sess, env, args, actor, critic, actor_noise, desired_goal_dim, achieved_goal_dim, observation_dim)
        else:
            test(sess, env, args, actor, critic, desired_goal_dim, achieved_goal_dim, observation_dim)

        # close gym
        env.close()

        # close session
        sess.close()
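
# A hypothetical command-line entry point for main(args) above. The argument names
# mirror the dictionary keys the function reads; the default values are placeholders.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='FetchReach-v1')
    parser.add_argument('--variation', default='baseline')
    parser.add_argument('--seed', default=1)
    parser.add_argument('--actor_lr', default=1e-4)
    parser.add_argument('--critic_lr', default=1e-3)
    parser.add_argument('--tau', default=0.05)
    parser.add_argument('--gamma', default=0.98)
    parser.add_argument('--batch_size', default=128)
    parser.add_argument('--hidden_size', default=256)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--test', action='store_true')
    main(vars(parser.parse_args()))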
Example #8
class Trainer:
    def __init__(self, env, seed):
        self.seed = seed
        self.successes = []
        self.epochs = []
        self.env = env
        self.device = torch.device(p.device)
        # create the network
        self.actor = Actor(self.env.ob_shape, self.env.goal_shape,
                           self.env.action_shape).to(self.device)
        self.critic = Critic(self.env.ob_shape, self.env.goal_shape,
                             self.env.action_shape).to(self.device)
        # build up the target network
        self.actor_target = Actor(self.env.ob_shape, self.env.goal_shape,
                                  self.env.action_shape).to(self.device)
        self.critic_target = Critic(self.env.ob_shape, self.env.goal_shape,
                                    self.env.action_shape).to(self.device)
        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
        # replay buffer with HER sampler
        self.buffer = replay_buffer(self.env.ob_shape, self.env.action_shape)

    def start(self):
        for self.epoch in range(p.n_epochs):
            for _ in range(p.n_cycles):
                mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions = [], [], [], [], [], []
                for _ in range(1):
                    # reset the rollouts
                    ep_obs, ep_ag, ep_g, ep_obs_next, ep_ag_next, ep_actions = [], [], [], [], [], []
                    # reset the environment
                    observation = self.env.reset()
                    obs = observation['observation']
                    ag = observation['achieved_goal']
                    g = observation['desired_goal']
                    # start to collect samples
                    for t in range(p.max_episode_steps):
                        with torch.no_grad():
                            obs_norm, g_norm = self.normalize(obs, g)
                            pi = self.actor(obs_norm, g_norm)
                            action = self.add_noise(pi)
                        # feed the actions into the environment
                        observation_new, _, _, info = self.env.step(action)
                        obs_new = observation_new['observation']
                        ag_new = observation_new['achieved_goal']
                        # append rollouts
                        ep_obs.append(obs.copy())
                        ep_ag.append(ag.copy())
                        ep_g.append(g.copy())
                        ep_obs_next.append(obs_new.copy())
                        ep_ag_next.append(ag_new.copy())
                        ep_actions.append(action.copy())
                        # re-assign the observation
                        obs = obs_new
                        ag = ag_new

                    mb_obs.append(ep_obs)
                    mb_ag.append(ep_ag)
                    mb_g.append(ep_g)
                    mb_obs_next.append(ep_obs_next)
                    mb_ag_next.append(ep_ag_next)
                    mb_actions.append(ep_actions)

                # convert them into arrays
                mb_obs = np.array(mb_obs)
                mb_ag = np.array(mb_ag)
                mb_g = np.array(mb_g)
                mb_obs_next = np.array(mb_obs_next)
                mb_ag_next = np.array(mb_ag_next)
                mb_actions = np.array(mb_actions)
                # store the episodes
                self.buffer.store_episode(
                    [mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions])
                self.buffer.update_normalizer(
                    [mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions])
                for _ in range(p.update_per_episode):
                    # train the network
                    c_loss, a_loss = self.update_network()
                # soft update
                self.soft_update_target_network()
            # start to do the evaluation
            success_rate = self.eval_agent()
            print('[{}] epoch: {}, seed: {}, eval success rate is: {}'.format(
                self.env.name, self.epoch, self.seed, success_rate))
            self.save_csv(self.epoch, success_rate)
            if len(self.successes) >= 10:
                if sum(self.successes[-10:]) == 10.0:
                    break

    def save_csv(self, epoch, success_rate):
        try:
            os.mkdir("Generated_data")
        except FileExistsError:
            pass

        self.epochs.append(epoch + 1)
        self.successes.append(success_rate)

        di = {}
        di['epochs'] = self.epochs
        di["success_rate"] = self.successes

        frame = pd.DataFrame(di)
        frame.to_csv("Generated_data/{}_{}.csv".format(self.env.name,
                                                       self.seed))

    def normalize(self, obs, g):
        obs_norm = self.buffer.o_norm.normalize(obs)
        g_norm = self.buffer.g_norm.normalize(g)
        obs_norm = torch.FloatTensor(obs_norm).to(self.device)
        g_norm = torch.FloatTensor(g_norm).to(self.device)
        # return the normalized observation and goal as tensors
        return obs_norm, g_norm

    # this function will choose action for the agent and do the exploration
    def add_noise(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add the gaussian
        action += p.noise_eps * np.random.randn(*action.shape)
        action = np.clip(action, -1.0, 1.0)
        # random actions...
        random_actions = np.random.uniform(low=-1.0,
                                           high=1.0,
                                           size=self.env.action_shape)
        # with probability random_eps, replace the action with the random one
        action += np.random.binomial(1, p.random_eps,
                                     1)[0] * (random_actions - action)
        return action

    # soft update
    def soft_update_target_network(self):
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data +
                                    p.polyak * target_param.data)

        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data +
                                    p.polyak * target_param.data)

    # update the network
    def update_network(self):
        # sample the episodes
        transitions = self.buffer.sample()
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions[
            'obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self.buffer.preproc_og(o, g)
        transitions['obs_next'], transitions[
            'g_next'] = self.buffer.preproc_og(o_next, g)

        # start to do the update
        obs_norm, g_norm = self.normalize(transitions['obs'], transitions['g'])
        obs_next_norm, g_next_norm = self.normalize(transitions['obs_next'],
                                                    transitions['g_next'])

        actions_tensor = torch.FloatTensor(transitions['actions']).to(
            self.device)
        r_tensor = torch.FloatTensor(transitions['r']).to(self.device)

        with torch.no_grad():
            # compute the TD target with the target networks
            actions_next = self.actor_target(obs_next_norm, g_next_norm)
            q_next_value = self.critic_target(obs_next_norm, g_next_norm,
                                              actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + p.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - p.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)

        # the q loss
        real_q_value = self.critic(obs_norm, g_norm, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss
        actions_real = self.actor(obs_norm, g_norm)
        actor_loss = -self.critic(obs_norm, g_norm, actions_real).mean()
        self.a1 = actor_loss
        self.a2 = (actions_real).pow(2).mean()
        self.actions_real = actions_real
        actor_loss += (actions_real).pow(2).mean()

        # start to update the network
        self.actor_optim.zero_grad()
        actor_loss.backward()

        # update the critic_network
        self.critic_optim.zero_grad()
        critic_loss.backward()

        self.actor_optim.step()
        self.critic_optim.step()

        return critic_loss.item(), actor_loss.item()

    # do the evaluation
    def eval_agent(self):
        total_success_rate = []
        for _ in range(p.testing_eps):
            total_success_rate.append(0.0)
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, _, _, info = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                if info["is_success"]:
                    break
            total_success_rate[-1] = info['is_success']
        total_success_rate = round(np.array(total_success_rate).mean(), 2)
        return total_success_rate
Example #9
from agent import Actor, Learner
import os
import numpy as np
import importlib
import util

config = get_config()
os.environ['CUDA_VISIBLE_DEVICES'] = config['visible_device']
environment = importlib.import_module('environment.' + config['task_name'])

ds = np.zeros([config['imsize']**2, config['A_size'], config['C_size'], 2],
              dtype=int)  # np.int was removed from NumPy; the builtin int is equivalent
learner = Learner(ds)
tracker = util.Tracker()

env = environment.Env()
actor = Actor(learner.main_net)
preobs = env.reset()
rew_tmp = []

for frame_idx in range(config['max_training_step']):
    option = actor.act(preobs)
    postobs, reward, done, info = env.step(option)
    learner.main_net.add_sample(preobs[1], option, postobs[1])
    learner.buffer.add_tmp(preobs, reward, option)
    preobs = postobs
    rew_tmp.append(reward)
    if done:
        sum_rew = 0
        learner.buffer.popall()
        for r in reversed(rew_tmp):
            sum_rew = config['gamma'] * sum_rew + r
Example #10
def train(args):
    T_SIZE = 500
    SET_POINT = 50

    t = np.linspace(0, 50, num=T_SIZE)
    SP = np.ones(T_SIZE) * SET_POINT

    env = PIDModel(ku=1.396, tu=3.28, t=t, SP=SP)

    actor = Actor()
    critic = Critic()
    agent = Agent(env,
                  actor_lr=args["ACTOR_LEARNING_RATE"],
                  critic_lr=args["CRITIC_LEARNING_RATE"],
                  actor_model=actor,
                  critic_model=critic,
                  device=args["DEVICE"],
                  gamma=args["GAMMA"])

    stats = {"episode_reward": deque([]), "del_ts": []}

    if args["LOAD_PREVIOUS"]:
        print("Loading previously trained model")
        agent.load()

    for i in range(args["NUM_EPISODES"]):
        print("Starting episode", i)
        state = env.reset()
        total = 0

        agent.start_episode()
        state, _, __ = env.step((0.5, 0.5, 3.5))  # Initial random state

        num_step = 0
        done = False
        while not done:
            action = agent.get_action(state)

            # Exploration strategy
            gauss_noise = np.random.normal(0,
                                           args["exploration_stddev"],
                                           size=3)
            target_action = action + torch.Tensor(gauss_noise)
            target_action = agent.actor_model.clamp_action(target_action)

            new_state, reward, done = env.step(target_action.detach().numpy())
            transition = Transition(reward=reward,
                                    state=state,
                                    action=action,
                                    target_action=target_action,
                                    next_state=new_state)
            agent.step(transition)

            if (num_step % args["PRINT_EVERY"] == 0):
                print("\tStep", num_step, "for episode", i)
                print("\t", action, target_action)
                print("\tReward accumulated:", total)

            assert (type(target_action) == torch.Tensor)
            assert (target_action.requires_grad)
            assert (action.requires_grad)

            total += reward
            state = new_state
            num_step += 1

        # Learn from this episode
        agent.learn()

        if i % 1 == 0:
            agent.save()
            stats["episode_reward"].append(total / num_step)
            stats["del_ts"].extend(agent.get_episode_stats()[1])

            print("Reward is ", total, "and average reward is",
                  total / num_step)

    return stats
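
# A hypothetical invocation of train() above. The dictionary keys are exactly the
# ones the function reads; the values are placeholders.
if __name__ == "__main__":
    stats = train({
        "ACTOR_LEARNING_RATE": 1e-4,
        "CRITIC_LEARNING_RATE": 1e-3,
        "DEVICE": "cpu",
        "GAMMA": 0.99,
        "NUM_EPISODES": 100,
        "LOAD_PREVIOUS": False,
        "exploration_stddev": 0.1,
        "PRINT_EVERY": 50,
    })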
Example #11
class Trainer:
    def __init__(self, envs, testing_envs, seed, variance_limit = 0.25):
        self.seed = seed
        self.successes = []
        self.testing_envs = testing_envs
        self.envs = envs
        self.variance_limit = variance_limit
        
        training_envs_per_dof = int(len(self.envs.envs)/3)
        
        self.training_env_seq = [4]*training_envs_per_dof + [5]*training_envs_per_dof + [6]*training_envs_per_dof
        self.testing_env_seq = [4]*10 + [5]*10 + [6]*10

        if p.mode == "retrain":
            self.training_env_seq = self.testing_env_seq

        self.device = torch.device(p.device)
        # create the network
        self.actor = Actor().to(self.device)
        self.critic = Critic().to(self.device)

        if p.mode == 'retrain':
            self.actor.load_state_dict(torch.load("actor_seed_{}".format(seed)))
            self.critic.load_state_dict(torch.load("critic_seed_{}".format(seed)))

        # build up the target network
        self.actor_target = Actor().to(self.device)
        self.critic_target = Critic().to(self.device)
        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        # create the optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)
        # replay buffer with HER sampler
        self.buffer = replay_buffer(seed)

        if p.mode == 'retrain':
            self.buffer.load_normalizers()
            print("loading done")

        self.training_data, self.testing_data = {}, {}
        for env in self.envs.envs:
            self.training_data[env.name] = []
        for env in self.testing_envs.envs:
            self.testing_data[env.name] = []

        try:
            os.mkdir("Generated_data")
        except FileExistsError:
            pass
        
    def start(self):
        if p.mode == "retrain":
            for self.epoch in range(-10, 0):
                training_success_rate, testing_success_rate = self.eval_agent()     
                self.log_data(training_success_rate, testing_success_rate)

        else:
            for self.epoch in range(p.n_epochs):
                for _ in range(p.n_cycles):
                    # reset the rollouts
                    ep_obs, ep_ag, ep_g, ep_obs_next, ep_ag_next, ep_actions, ep_seq = [], [], [], [], [], [], []
                    # reset the environment
                    observation = self.envs.reset()
                    obs = observation['observation']
                    ag = observation['achieved_goal']
                    g = observation['desired_goal']

                    # start to collect samples
                    for t in range(p.max_episode_steps):
                        with torch.no_grad():
                            obs_norm, g_norm = self.normalize(obs, g)
                            pi = self.actor(obs_norm, g_norm, self.training_env_seq)
                            action = self.add_noise(pi)
                          
                        # feed the actions into the environment
                        observation_new, info = self.envs.step(action)
                        obs_new = observation_new['observation']
                        ag_new = observation_new['achieved_goal']
                        # append rollouts
                        ep_obs.append(obs.copy())
                        ep_ag.append(ag.copy())
                        ep_g.append(g.copy())
                        ep_obs_next.append(obs_new.copy())
                        ep_ag_next.append(ag_new.copy())
                        ep_actions.append(action.copy())
                        ep_seq.append(self.training_env_seq)
                        # re-assign the observation
                        obs = obs_new
                        ag = ag_new

                    # convert them into arrays
                    ep_obs = np.array(ep_obs).swapaxes(0,1)
                    ep_ag = np.array(ep_ag).swapaxes(0,1)
                    ep_g = np.array(ep_g).swapaxes(0,1)
                    ep_obs_next = np.array(ep_obs_next).swapaxes(0,1)
                    ep_ag_next = np.array(ep_ag_next).swapaxes(0,1)
                    ep_actions = np.array(ep_actions).swapaxes(0,1)
                    ep_seq = np.array(ep_seq).swapaxes(0,1)

                    for i in range(ep_obs.shape[0]):
                        # store the episodes
                        self.buffer.store_episode(
                            [np.expand_dims(ep_obs[i], 0), np.expand_dims(ep_ag[i], 0),
                             np.expand_dims(ep_g[i], 0), np.expand_dims(ep_obs_next[i], 0),
                             np.expand_dims(ep_ag_next[i], 0), np.expand_dims(ep_actions[i], 0),
                             np.expand_dims(ep_seq[i], 0)])
                        self.buffer.update_normalizer(
                            [np.expand_dims(ep_obs[i], 0), np.expand_dims(ep_ag[i], 0),
                             np.expand_dims(ep_g[i], 0), np.expand_dims(ep_obs_next[i], 0),
                             np.expand_dims(ep_ag_next[i], 0), np.expand_dims(ep_actions[i], 0),
                             np.expand_dims(ep_seq[i], 0)])
                   
                    for _ in range(p.update_per_episode):
                        # train the network
                        c_loss, a_loss = self.update_network()
                        
                    # soft update
                    self.soft_update_target_network()

                training_success_rate, testing_success_rate = self.eval_agent()
                self.log_data(training_success_rate, testing_success_rate)
            
                torch.save(self.actor.state_dict(), "actor_seed_{}".format(self.seed))
                torch.save(self.critic.state_dict(), "critic_seed_{}".format(self.seed))
                self.buffer.save_normalizers()

    def log_data(self, training_data, testing_data):
        os.system("clear")
        print("Epoch: {}".format(self.epoch))
        print("Training_data: ")
        end = "\t"

        for i, env in enumerate(self.envs.envs):
            print(env.name, training_data[i], end=end)
            self.training_data[env.name].append(training_data[i])
            end = "\t" if end=="\n" else "\n"
        print(end="\n\n")
        
        frame = pd.DataFrame(self.training_data)
        if self.variance_limit == 0.25:
            frame.to_csv("Generated_data/" + p.mode + "ing_data_{}.csv".format(self.seed))
        else:
            frame.to_csv("Generated_data/" + p.mode + "ing_data_{}_{}.csv".format(self.variance_limit, self.seed))

        print("Testing_data: ")
        end = "\t"
        for i, env in enumerate(self.testing_envs.envs):
            print(env.name, testing_data[i], end=end)
            self.testing_data[env.name].append(testing_data[i])
            end = "\t" if end=="\n" else "\n"
        print(end="\n\n")

        frame = pd.DataFrame(self.testing_data)
        if self.variance_limit == 0.25:
            frame.to_csv("Generated_data/" + p.mode + "ing_test_data_{}.csv".format(self.seed))
        else:
            frame.to_csv("Generated_data/" + p.mode + "ing_test_data_{}_{}.csv".format(self.variance_limit, self.seed))

    def normalize(self, obs, g):
        obs_norm = self.buffer.o_norm.normalize(obs)
        g_norm = self.buffer.g_norm.normalize(g)
        obs_norm = torch.FloatTensor(obs_norm).to(self.device)
        g_norm = torch.FloatTensor(g_norm).to(self.device)
        # return the normalized observation and goal as tensors
        return obs_norm, g_norm
    
    # this function will choose action for the agent and do the exploration
    def add_noise(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add the gaussian
        action += p.noise_eps * np.random.randn(*action.shape)
        action = np.clip(action, -1.0, 1.0)
        # random actions...
        random_actions = np.random.uniform(low = -1.0, high = 1.0, size=p.max_dof)
        # with probability random_eps, replace the action with the random one
        action += np.random.binomial(1, p.random_eps, 1)[0] * (random_actions - action)
        return action

    # soft update
    def soft_update_target_network(self):
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data + p.polyak * target_param.data)

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data + p.polyak * target_param.data)

    # update the network
    def update_network(self):
        # sample the episodes
        transitions = self.buffer.sample()
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self.buffer.preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self.buffer.preproc_og(o_next, g)
        seq = transitions['seq']

        # start to do the update
        obs_norm, g_norm = self.normalize(transitions['obs'], transitions['g'])
        obs_next_norm, g_next_norm = self.normalize(transitions['obs_next'], transitions['g_next'])
        
        actions_tensor = torch.FloatTensor(transitions['actions']).to(self.device)
        r_tensor = torch.FloatTensor(transitions['r']).to(self.device)
           
        with torch.no_grad():
            # compute the TD target with the target networks
            r_tensor = r_tensor.view(p.batch_size)
            actions_next = self.actor_target(obs_next_norm, g_next_norm, seq)
            q_next_value = self.critic_target(obs_next_norm, g_next_norm, actions_next, seq)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + p.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - p.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
        # the q loss
        real_q_value = self.critic(obs_norm, g_norm, actions_tensor, seq)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss
        actions_real = self.actor(obs_norm, g_norm, seq)
        actor_loss = -self.critic(obs_norm, g_norm, actions_real, seq).mean()
        self.a1 = actor_loss
        self.a2 = (actions_real).pow(2).mean()
        self.actions_real = actions_real  
        actor_loss += (actions_real).pow(2).mean()
        
        # start to update the network
        self.actor_optim.zero_grad()
        actor_loss.backward()

        # update the critic_network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        
        self.actor_optim.step()
        self.critic_optim.step()

        return critic_loss.item(), actor_loss.item()

    # do the evaluation
    def eval_agent(self):
        training_success_rate = np.array([0.0] * len(self.envs.envs))
        
        for _ in range(p.testing_eps):
            successes = np.array([0.0]*len(self.envs.envs))
            observation = self.envs.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm, self.training_env_seq)
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, info = self.envs.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                successes = successes + info['is_success']

            successes = np.array([1.0 if i else 0.0 for i in successes])
            training_success_rate = training_success_rate + successes
        training_success_rate = training_success_rate/p.testing_eps
        
        testing_success_rate = np.array([0.0] * len(self.testing_envs.envs))    
        for _ in range(p.testing_eps):
            successes = np.array([0.0]*len(self.testing_envs.envs))
            observation = self.testing_envs.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm, self.testing_env_seq)
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, info = self.testing_envs.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                successes = successes + info['is_success']

            successes = np.array([1.0 if i else 0.0 for i in successes])
            testing_success_rate = testing_success_rate + successes
        testing_success_rate = testing_success_rate/p.testing_eps

        return training_success_rate, testing_success_rate
Example #12
File: ac.py  Project: n4m4g/python_project
def main():
    np.random.seed(2)
    torch.manual_seed(2)

    # Hyperparameters
    MAX_EPISODE = 3000

    # renders environment if total episode reward
    # is greater than this threshold
    DISPLAY_REWARD_THRESHOLD = 200
    MAX_EP_STEPS = 1000  # maximum time step in one episode
    RENDER = False  # rendering wastes time
    GAMMA = 0.9  # reward discount in TD error
    LR_A = 0.001  # learning rate for actor
    LR_C = 0.005  # learning rate for critic

    env = gym.make('CartPole-v0')
    env.seed(1)  # reproducible
    env = env.unwrapped
    pprint(env.__dict__)

    N_F = env.observation_space.shape[0]
    N_A = env.action_space.n
    print(f"N_F: {N_F}, N_A: {N_A}")
    # N_F: 4, N_A: 2

    actor = Actor(N_F, N_A, LR_A)
    critic = Critic(N_F, LR_C, GAMMA)
    running_reward = 0

    for episode in range(MAX_EPISODE):
        s = env.reset()
        # s : np.ndarray with shape (4,)
        t = 0
        track_r = []

        while True:
            if RENDER:
                env.render()

            # actor chooses an action
            a = actor.choose_action(s)
            # a : scalar
            s_, r, done, info = env.step(a)
            # s_ : np.ndarray with shape (4,)
            # r : float

            # done means the pole has fallen;
            # give a negative reward for the final step
            if done:
                r = -20

            track_r.append(r)

            td_err = critic.learn(s, r, s_)
            # td_err : torch.Tensor with shape (1, 1)
            actor.learn(s, a, td_err)

            s = s_
            t += 1

            if done or t >= MAX_EP_STEPS:
                ep_rs_sum = sum(track_r)

                if episode == 0:
                    running_reward = ep_rs_sum
                else:
                    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05

                if running_reward > DISPLAY_REWARD_THRESHOLD:
                    RENDER = True

                print(f"Episode: {episode+1}")
                print(f"\treward: {ep_rs_sum}")
                print(f"\trunning_reward: {running_reward:.1f}")
                print(f"\tdone: {done}")
                break
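
# For reference: the value critic.learn(s, r, s_) returns and actor.learn consumes is
# presumably the one-step TD error. A minimal sketch of that quantity (the Actor and
# Critic implementations themselves are not shown in this snippet):
def one_step_td_error(v_s, v_s_next, r, gamma=0.9):
    """delta = r + gamma * V(s') - V(s); a positive delta makes the taken action more likely."""
    return r + gamma * v_s_next - v_s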
Example #13
        tracker.writer.add_scalar("loss", float(loss.detach().cpu().numpy()),
                                  epoch * X_train.shape[0] + i)

model_path = os.path.join(tracker.path, 'model')
torch.save(
    {
        'net': net.state_dict(),
        'ds': ds,
        'optimizer': optimizer.state_dict()
    }, model_path)

ac = 0
psum = 0
for i in range(0, X_test.shape[0], config['batch_size']):
    j = i + config['batch_size']
    if j > X_test.shape[0]: break
    x = torch.cuda.FloatTensor(X_test[i:j]).permute(0, 3, 1, 2)
    s1 = torch.cuda.LongTensor(S1_test[i:j])
    s2 = torch.cuda.LongTensor(S2_test[i:j])
    Q = net(x, s1, s2)
    label = y_test[i:j].flatten()
    pre = Q.view(-1, config['A_size']).max(1)[1].cpu().numpy()
    ac += np.where(pre == label)[0].shape[0]
    psum += label.shape[0]
print('PA:', ac / psum)

agent = Actor(net, eps=False)
env = environment.Env()
sr, mean, std = util.test_game(env, agent, tracker, 0,
                               config['test_final_game'])
print('test sr:%f, reward: mean:%f std:%f' % (sr, mean, std))