def start_or_resume_from_checkpoint():
    """
    Create actor, critic, actor optimizer and critic optimizer from scratch,
    or load from the latest checkpoint if one exists.
    """
    max_checkpoint_iteration = get_last_checkpoint_iteration()

    obsv_dim, action_dim, continuous_action_space = get_env_space()
    actor = Actor(obsv_dim,
                  action_dim,
                  continuous_action_space=continuous_action_space,
                  trainable_std_dev=hp.trainable_std_dev,
                  init_log_std_dev=hp.init_log_std_dev)
    critic = Critic(obsv_dim)
    var_critic = Var_Critic(obsv_dim)

    actor_optimizer = optim.AdamW(actor.parameters(), lr=hp.actor_learning_rate)
    critic_optimizer = optim.AdamW(critic.parameters(), lr=hp.critic_learning_rate)
    var_critic_optimizer = optim.AdamW(var_critic.parameters(), lr=hp.var_critic_learning_rate)

    stop_conditions = StopConditions()

    # If the latest checkpoint iteration is greater than zero, initialise training from the checkpoint.
    if max_checkpoint_iteration > 0:
        actor_state_dict, critic_state_dict, var_critic_state_dict, \
            actor_optimizer_state_dict, critic_optimizer_state_dict, \
            var_critic_optimizer_state_dict, stop_conditions = load_checkpoint(max_checkpoint_iteration)

        actor.load_state_dict(actor_state_dict, strict=True)
        critic.load_state_dict(critic_state_dict, strict=True)
        var_critic.load_state_dict(var_critic_state_dict, strict=True)
        actor_optimizer.load_state_dict(actor_optimizer_state_dict)
        critic_optimizer.load_state_dict(critic_optimizer_state_dict)
        var_critic_optimizer.load_state_dict(var_critic_optimizer_state_dict)

        # We have to move the optimizer states to TRAIN_DEVICE manually since
        # the optimizer doesn't yet have a "to" method.
        for state in actor_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(TRAIN_DEVICE)

        for state in critic_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(TRAIN_DEVICE)

        for state in var_critic_optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(TRAIN_DEVICE)

    return actor, critic, var_critic, actor_optimizer, critic_optimizer, var_critic_optimizer, \
        max_checkpoint_iteration, stop_conditions
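# The three device-transfer loops above are identical; a small helper could keep the
# resume path shorter. This is an illustrative sketch, not part of the original module:
# the helper name `optimizer_to` is hypothetical, and it assumes a standard torch.optim
# optimizer whose state values are plain tensors.
def optimizer_to(optimizer, device):
    """Move all tensor states of `optimizer` in place to `device`."""
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Example use inside the checkpoint-resume branch:
#     for opt in (actor_optimizer, critic_optimizer, var_critic_optimizer):
#         optimizer_to(opt, TRAIN_DEVICE)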
def __init__(self, positions_per_level, move_probabilities, initial_vacancy_fraction, firing_schedule):
    """
    :param positions_per_level: list of ints, number of positions per level,
        e.g. [10, 20, 30] == 10 positions in level 1, 20 in level 2, etc.
    :param move_probabilities: dict of move probabilities for agents, in the format below.
        NB: the vector of actor retirement probs is per level,
        e.g. [retire prob level 1, retire prob level 2, retire prob level 3],
        while the vector of move probs for vacancies is in the order
        [don't move, retire, move in same level, move down level],
        e.g. {"actor retirement probs": [0.1, 0.1, 0.1],
              "vacancy move probs": [0.3, 0.1, 0.3, 0.3]}
    :param initial_vacancy_fraction: float in [0, 1], the fraction of positions in each
        level that should be vacant at model initialisation
    :param firing_schedule: dict indicating what retirement probabilities should be at given
        steps (form below); this facilitates one-off changes where portions of levels are
        emptied of actors,
        e.g. {"steps": {5, 10}, "level-retire probability": [(1, 0.4), (2, 0.4), (3, 0.6)]}
    """
    super().__init__()

    # set parameters
    self.num_levels = len(positions_per_level)
    self.positions_per_level = positions_per_level
    self.move_probabilities = move_probabilities
    self.vacancy_fraction = initial_vacancy_fraction
    self.firing_schedule = firing_schedule
    self.per_step_movement = {"actor": 0, "vacancy": 0}
    self.schedule = SimultaneousActivation(self)
    self.running = True
    self.datacollector = DataCollector(
        model_reporters={"agent_counts": get_agent_counts,
                         "percent_vacant_per_level": get_percent_vacancy_per_level,
                         "mean_lengths": get_sequence_and_vacancy_mean_lengths,
                         "mean_lengths_std": get_sequence_and_vacancy_length_stdev,
                         "mean_spell_lengths": get_mean_spell_lengths,
                         "mean_spell_length_stdev": get_stdev_spell_lengths,
                         "total mobility": get_total_mobility})

    # make positions and populate them with agents
    self.positions = {i: {} for i in range(1, self.num_levels + 1)}
    for i in range(self.num_levels):
        vacancies = fraction_of_list(initial_vacancy_fraction, self.positions_per_level[i])
        for j in range(self.positions_per_level[i]):
            position_id = str(i + 1) + '-' + str(j + 1)  # position ID = level-position number
            p = Position(position_id, self)
            self.positions[i + 1][position_id] = p
            # make entity
            agent = Vacancy(uuid4(), self) if vacancies[j] else Actor(uuid4(), self)
            self.schedule.add(agent)
            # associate it with the position
            agent.position = p.unique_id
            p.dual = [agent.unique_id, agent.type]
            # update logs
            agent.log.append(p.unique_id)
            p.log.append(agent.unique_id)

    self.retiree_spots = set()
    self.desired_positions = []
    self.retirees = {"actor": {}, "vacancy": {}}
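# Illustrative only (not part of the original module): the enclosing class name is not
# shown in this excerpt, so the example below simply documents the argument formats
# described in the docstring above. `ModelClass` stands in for whatever the surrounding
# class is called.
def build_example_model(ModelClass):
    return ModelClass(
        positions_per_level=[10, 20, 30],  # three levels
        move_probabilities={
            "actor retirement probs": [0.1, 0.1, 0.1],   # one retirement prob per level
            "vacancy move probs": [0.3, 0.1, 0.3, 0.3],  # don't move, retire, same level, down level
        },
        initial_vacancy_fraction=0.2,      # 20% of positions start vacant
        firing_schedule={"steps": {5, 10},
                         "level-retire probability": [(1, 0.4), (2, 0.4), (3, 0.6)]},
    )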
def main(epochs):
    # create model
    model = Actor()

    # load data
    data_path = os.path.join(script_path, 'data.pkl')
    with open(data_path, 'rb') as handler:
        data_dict = pickle.load(handler)
    data = data_dict['data']
    label = data_dict['label']
    # OHLC is in dataframe `label`; indicators are in dataframe `data`.
    plot_series(label['c'])

    # train-test split: 70% / 30%, so the plotted buy-and-hold curve may differ
    # from buy-and-hold over the full series.
    split = int(.7 * len(data))
    train_data = data.ix[:split]
    train_label = label.ix[:split]
    test_data = data.ix[split:]
    test_label = label.ix[split:]

    # train
    ret_lst = []
    for i_ep in range(epochs):
        # pick an arbitrary 96-step window, so idx must be less than len(train_data) - 96
        idx = np.random.randint(len(train_data) - 96)
        # sample a consecutive window of 96 steps; there are 3 actions in total, and the
        # traded position is one of 0-4 bitcoins, chosen at random for the initial action
        env = Env(train_data[idx:idx + 96], train_label[idx:idx + 96],
                  init_act=np.random.randint(5))
        ret = roll_out(env=env, model=model, train_mode=True)
        ret_lst.append(ret)

    # test
    env = Env(test_data, test_label, init_act=0)
    _, r_lst, p_lst, P_lst = roll_out(env=env, model=model, train_mode=False)

    # plot
    plot_result('Bitcoin', r_lst, p_lst)
def main(epochs):
    # create model
    model = Actor()

    # load data
    data_path = os.path.join(script_path, 'BTCUSD-15Min-Data.pkl')
    with open(data_path, 'rb') as handler:
        data_dict = pickle.load(handler)
    data = data_dict['data']
    label = data_dict['label']

    # train-test split: 70% / 30%
    split = int(.7 * len(data))
    train_data = data.ix[:split]
    train_label = label.ix[:split]
    test_data = data.ix[split:]
    test_label = label.ix[split:]

    # train
    ret_lst = []
    for i_ep in range(epochs):
        idx = np.random.randint(len(train_data) - 96)  # sample a consecutive window of 96 steps
        env = Env(train_data[idx:idx + 96], train_label[idx:idx + 96],
                  init_act=np.random.randint(5))
        ret = roll_out(env=env, model=model, train_mode=True)
        ret_lst.append(ret)

    # test
    env = Env(test_data, test_label, init_act=0)
    _, r_lst, p_lst, P_lst = roll_out(env=env, model=model, train_mode=False)

    # plot
    plot_result('Bitcoin', r_lst, p_lst)
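# Note (added, not in the original scripts): DataFrame.ix was removed in pandas 1.0,
# so the slicing above only runs on older pandas. A positional equivalent on modern
# pandas would use .iloc, e.g.:
#     split = int(.7 * len(data))
#     train_data, test_data = data.iloc[:split], data.iloc[split:]
#     train_label, test_label = label.iloc[:split], label.iloc[split:]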
def main(args):
    # Set path to save result
    gym_dir = './' + args['env'] + '_' + args['variation'] + '/gym'

    # Set random seed for reproducibility
    np.random.seed(int(args['seed']))
    tf.set_random_seed(int(args['seed']))

    with tf.Session() as sess:
        # Load environment
        env = gym.make(args['env'])
        env.seed(int(args['seed']))

        # get size of action and state (i.e. output and input for the agent)
        obs = env.reset()
        observation_dim = obs['observation'].shape[0]
        achieved_goal_dim = obs['achieved_goal'].shape[0]
        desired_goal_dim = obs['desired_goal'].shape[0]
        assert achieved_goal_dim == desired_goal_dim

        # state size = observation size + goal size
        state_dim = observation_dim + desired_goal_dim
        action_dim = env.action_space.shape[0]
        action_highbound = env.action_space.high

        # print out parameters
        print('Parameters:')
        print('Observation Size=', observation_dim)
        print('Goal Size=', desired_goal_dim)
        print('State Size =', state_dim)
        print('Action Size =', action_dim)
        print('Action Upper Boundary =', action_highbound)

        # save to monitor if render
        if args['render']:
            env = gym.wrappers.Monitor(env, gym_dir, force=True)
        else:
            env = gym.wrappers.Monitor(env, gym_dir, video_callable=False, force=True)

        # create actor
        actor = Actor(sess, state_dim, action_dim, action_highbound,
                      float(args['actor_lr']), float(args['tau']),
                      int(args['batch_size']), int(args['hidden_size']))

        # create critic
        critic = Critic(sess, state_dim, action_dim,
                        float(args['critic_lr']), float(args['tau']),
                        float(args['gamma']), actor.n_actor_vars,
                        int(args['hidden_size']))

        # noise
        actor_noise = Noise(mu=np.zeros(action_dim))

        # train the network
        if not args['test']:
            train(sess, env, args, actor, critic, actor_noise,
                  desired_goal_dim, achieved_goal_dim, observation_dim)
        else:
            test(sess, env, args, actor, critic,
                 desired_goal_dim, achieved_goal_dim, observation_dim)

        # close gym
        env.close()
        # close session
        sess.close()
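# Added for illustration (not part of the original file): in goal-conditioned setups
# like this, the "state" fed to the agent is typically the observation concatenated
# with the desired goal, which is why state_dim above equals
# observation_dim + desired_goal_dim. Assumes `obs` is the dict returned by env.reset()
# and that numpy is imported as np (it is used above).
def make_state(obs):
    return np.concatenate([obs['observation'], obs['desired_goal']], axis=-1)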
class Trainer:

    def __init__(self, env, seed):
        self.seed = seed
        self.successes = []
        self.epochs = []
        self.env = env
        self.device = torch.device(p.device)

        # create the networks
        self.actor = Actor(self.env.ob_shape, self.env.goal_shape, self.env.action_shape).to(self.device)
        self.critic = Critic(self.env.ob_shape, self.env.goal_shape, self.env.action_shape).to(self.device)

        # build up the target networks
        self.actor_target = Actor(self.env.ob_shape, self.env.goal_shape, self.env.action_shape).to(self.device)
        self.critic_target = Critic(self.env.ob_shape, self.env.goal_shape, self.env.action_shape).to(self.device)

        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)

        # HER replay buffer
        self.buffer = replay_buffer(self.env.ob_shape, self.env.action_shape)

    def start(self):
        for self.epoch in range(p.n_epochs):
            for _ in range(p.n_cycles):
                mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions = [], [], [], [], [], []
                for _ in range(1):
                    # reset the rollouts
                    ep_obs, ep_ag, ep_g, ep_obs_next, ep_ag_next, ep_actions = [], [], [], [], [], []
                    # reset the environment
                    observation = self.env.reset()
                    obs = observation['observation']
                    ag = observation['achieved_goal']
                    g = observation['desired_goal']
                    # start to collect samples
                    for t in range(p.max_episode_steps):
                        with torch.no_grad():
                            obs_norm, g_norm = self.normalize(obs, g)
                            pi = self.actor(obs_norm, g_norm)
                            action = self.add_noise(pi)
                        # feed the actions into the environment
                        observation_new, _, _, info = self.env.step(action)
                        obs_new = observation_new['observation']
                        ag_new = observation_new['achieved_goal']
                        # append rollouts
                        ep_obs.append(obs.copy())
                        ep_ag.append(ag.copy())
                        ep_g.append(g.copy())
                        ep_obs_next.append(obs_new.copy())
                        ep_ag_next.append(ag_new.copy())
                        ep_actions.append(action.copy())
                        # re-assign the observation
                        obs = obs_new
                        ag = ag_new
                    mb_obs.append(ep_obs)
                    mb_ag.append(ep_ag)
                    mb_g.append(ep_g)
                    mb_obs_next.append(ep_obs_next)
                    mb_ag_next.append(ep_ag_next)
                    mb_actions.append(ep_actions)
                # convert them into arrays
                mb_obs = np.array(mb_obs)
                mb_ag = np.array(mb_ag)
                mb_g = np.array(mb_g)
                mb_obs_next = np.array(mb_obs_next)
                mb_ag_next = np.array(mb_ag_next)
                mb_actions = np.array(mb_actions)
                # store the episodes
                self.buffer.store_episode(
                    [mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions])
                self.buffer.update_normalizer(
                    [mb_obs, mb_ag, mb_g, mb_obs_next, mb_ag_next, mb_actions])
                for _ in range(p.update_per_episode):
                    # train the network
                    c_loss, a_loss = self.update_network()
                # soft update
                self.soft_update_target_network()
            # start to do the evaluation
            success_rate = self.eval_agent()
            print('[{}] epoch: {}, seed: {}, eval success rate is: {}'.format(
                self.env.name, self.epoch, self.seed, success_rate))
            self.save_csv(self.epoch, success_rate)
            if len(self.successes) >= 10:
                if sum(self.successes[-10:]) == 10.0:
                    break

    def save_csv(self, epoch, success_rate):
        try:
            os.mkdir("Generated_data")
        except FileExistsError:
            pass
        self.epochs.append(epoch + 1)
        self.successes.append(success_rate)
        di = {}
        di['epochs'] = self.epochs
        di["success_rate"] = self.successes
        frame = pd.DataFrame(di)
        frame.to_csv("Generated_data/{}_{}.csv".format(self.env.name, self.seed))

    def normalize(self, obs, g):
        obs_norm = self.buffer.o_norm.normalize(obs)
        g_norm = self.buffer.g_norm.normalize(g)
        obs_norm = torch.FloatTensor(obs_norm).to(self.device)
        g_norm = torch.FloatTensor(g_norm).to(self.device)
        return obs_norm, g_norm

    # this function will choose an action for the agent and do the exploration
    def add_noise(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add the gaussian noise
        action += p.noise_eps * np.random.randn(*action.shape)
        action = np.clip(action, -1.0, 1.0)
        # random actions...
        random_actions = np.random.uniform(low=-1.0, high=1.0, size=self.env.action_shape)
        # choose whether to use the random actions
        action += np.random.binomial(1, p.random_eps, 1)[0] * (random_actions - action)
        return action

    # soft update
    def soft_update_target_network(self):
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data + p.polyak * target_param.data)
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data + p.polyak * target_param.data)

    # update the network
    def update_network(self):
        # sample the episodes
        transitions = self.buffer.sample()
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self.buffer.preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self.buffer.preproc_og(o_next, g)
        # start to do the update
        obs_norm, g_norm = self.normalize(transitions['obs'], transitions['g'])
        obs_next_norm, g_next_norm = self.normalize(transitions['obs_next'], transitions['g_next'])
        actions_tensor = torch.FloatTensor(transitions['actions']).to(self.device)
        r_tensor = torch.FloatTensor(transitions['r']).to(self.device)
        with torch.no_grad():
            actions_next = self.actor_target(obs_next_norm, g_next_norm)
            q_next_value = self.critic_target(obs_next_norm, g_next_norm, actions_next)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + p.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - p.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
        # the q loss
        real_q_value = self.critic(obs_norm, g_norm, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss
        actions_real = self.actor(obs_norm, g_norm)
        actor_loss = -self.critic(obs_norm, g_norm, actions_real).mean()
        self.a1 = actor_loss
        self.a2 = (actions_real).pow(2).mean()
        self.actions_real = actions_real
        actor_loss += (actions_real).pow(2).mean()
        # start to update the actor network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        # update the critic network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.actor_optim.step()
        self.critic_optim.step()
        return critic_loss.item(), actor_loss.item()

    # do the evaluation
    def eval_agent(self):
        total_success_rate = []
        for _ in range(p.testing_eps):
            total_success_rate.append(0.0)
            observation = self.env.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm)
                    # convert the actions
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, _, _, info = self.env.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                if info["is_success"]:
                    break
            total_success_rate[-1] = info['is_success']
        total_success_rate = round(np.array(total_success_rate).mean(), 2)
        return total_success_rate
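# Added for clarity (not in the original): assuming the usual sparse HER-style reward
# r in {-1, 0}, the discounted return is bounded below by -sum_{t>=0} gamma^t,
# which is why update_network() clamps target_q_value to [-1/(1-gamma), 0].
def her_return_bounds(gamma):
    """Lower/upper bound of the discounted return for per-step rewards in [-1, 0]."""
    clip_return = 1.0 / (1.0 - gamma)
    return -clip_return, 0.0

# e.g. her_return_bounds(0.98) -> approximately (-50.0, 0.0)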
from agent import Actor, Learner
import os
import numpy as np
import importlib
import util

config = get_config()
os.environ['CUDA_VISIBLE_DEVICES'] = config['visible_device']
environment = importlib.import_module('environment.' + config['task_name'])

# np.int was removed from recent NumPy releases; the builtin int is equivalent here
ds = np.zeros([config['imsize'] ** 2, config['A_size'], config['C_size'], 2], dtype=int)
learner = Learner(ds)
tracker = util.Tracker()
env = environment.Env()
actor = Actor(learner.main_net)

preobs = env.reset()
rew_tmp = []
for frame_idx in range(config['max_training_step']):
    option = actor.act(preobs)
    postobs, reward, done, info = env.step(option)
    learner.main_net.add_sample(preobs[1], option, postobs[1])
    learner.buffer.add_tmp(preobs, reward, option)
    preobs = postobs
    rew_tmp.append(reward)
    if done:
        sum_rew = 0
        learner.buffer.popall()
        for r in reversed(rew_tmp):
            sum_rew = config['gamma'] * sum_rew + r
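# Added for illustration (not part of the original script): the loop above accumulates
# the discounted return of the finished episode by iterating the rewards in reverse,
# G <- gamma * G + r. A tiny worked example:
#
#     rewards = [1.0, 0.0, 2.0]    # r_0, r_1, r_2
#     gamma = 0.9
#     G = 0.0
#     for r in reversed(rewards):
#         G = gamma * G + r        # after the loop, G = 1.0 + 0.9*0.0 + 0.81*2.0 = 2.62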
def train(args):
    T_SIZE = 500
    SET_POINT = 50
    t = np.linspace(0, 50, num=T_SIZE)
    SP = np.ones(T_SIZE) * SET_POINT

    env = PIDModel(ku=1.396, tu=3.28, t=t, SP=SP)
    actor = Actor()
    critic = Critic()
    agent = Agent(env,
                  actor_lr=args["ACTOR_LEARNING_RATE"],
                  critic_lr=args["CRITIC_LEARNING_RATE"],
                  actor_model=actor,
                  critic_model=critic,
                  device=args["DEVICE"],
                  gamma=args["GAMMA"])

    stats = {"episode_reward": deque([]), "del_ts": []}

    if args["LOAD_PREVIOUS"]:
        print("Loading previously trained model")
        agent.load()

    for i in range(args["NUM_EPISODES"]):
        print("Starting episode", i)
        state = env.reset()
        total = 0
        agent.start_episode()
        state, _, __ = env.step((0.5, 0.5, 3.5))  # Initial random state
        num_step = 0
        done = False
        while not done:
            action = agent.get_action(state)

            # Exploration strategy
            gauss_noise = np.random.normal(0, args["exploration_stddev"], size=3)
            target_action = action + torch.Tensor(gauss_noise)
            target_action = agent.actor_model.clamp_action(target_action)

            new_state, reward, done = env.step(target_action.detach().numpy())
            transition = Transition(reward=reward,
                                    state=state,
                                    action=action,
                                    target_action=target_action,
                                    next_state=new_state)
            agent.step(transition)

            if num_step % args["PRINT_EVERY"] == 0:
                print("\tStep", num_step, "for episode", i)
                print("\t", action, target_action)
                print("\tReward accumulated:", total)

            assert type(target_action) == torch.Tensor
            assert target_action.requires_grad
            assert action.requires_grad

            total += reward
            state = new_state
            num_step += 1

        # Learn from this episode
        agent.learn()
        if i % 1 == 0:
            agent.save()
        stats["episode_reward"].append(total / num_step)
        stats["del_ts"].extend(agent.get_episode_stats()[1])
        print("Reward is", total, "and average reward is", total / num_step)

    return stats
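# Illustrative only (not part of the original script): train() reads its hyperparameters
# from a plain dict. The keys below are exactly the ones referenced above; the values are
# made-up placeholders for a quick smoke run.
if __name__ == "__main__":
    example_args = {
        "ACTOR_LEARNING_RATE": 1e-4,
        "CRITIC_LEARNING_RATE": 1e-3,
        "DEVICE": "cpu",
        "GAMMA": 0.99,
        "LOAD_PREVIOUS": False,
        "NUM_EPISODES": 10,
        "exploration_stddev": 0.1,
        "PRINT_EVERY": 50,
    }
    stats = train(example_args)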
class Trainer:

    def __init__(self, envs, testing_envs, seed, variance_limit=0.25):
        self.seed = seed
        self.successes = []
        self.testing_envs = testing_envs
        self.envs = envs
        self.variance_limit = variance_limit

        training_envs_per_dof = int(len(self.envs.envs) / 3)
        self.training_env_seq = ([4] * training_envs_per_dof
                                 + [5] * training_envs_per_dof
                                 + [6] * training_envs_per_dof)
        self.testing_env_seq = [4] * 10 + [5] * 10 + [6] * 10
        if p.mode == "retrain":
            self.training_env_seq = self.testing_env_seq

        self.device = torch.device(p.device)

        # create the networks
        self.actor = Actor().to(self.device)
        self.critic = Critic().to(self.device)
        if p.mode == 'retrain':
            self.actor.load_state_dict(torch.load("actor_seed_{}".format(seed)))
            self.critic.load_state_dict(torch.load("critic_seed_{}".format(seed)))

        # build up the target networks
        self.actor_target = Actor().to(self.device)
        self.critic_target = Critic().to(self.device)

        # load the weights into the target networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizers
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=p.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=p.lr)

        # HER replay buffer
        self.buffer = replay_buffer(seed)
        if p.mode == 'retrain':
            self.buffer.load_normalizers()
            print("loading done")

        self.training_data, self.testing_data = {}, {}
        for env in self.envs.envs:
            self.training_data[env.name] = []
        for env in self.testing_envs.envs:
            self.testing_data[env.name] = []

        try:
            os.mkdir("Generated_data")
        except FileExistsError:
            pass

    def start(self):
        if p.mode == "retrain":
            for self.epoch in range(-10, 0):
                training_success_rate, testing_success_rate = self.eval_agent()
                self.log_data(training_success_rate, testing_success_rate)
        else:
            for self.epoch in range(p.n_epochs):
                for _ in range(p.n_cycles):
                    # reset the rollouts
                    ep_obs, ep_ag, ep_g, ep_obs_next, ep_ag_next, ep_actions, ep_seq = [], [], [], [], [], [], []
                    # reset the environments
                    observation = self.envs.reset()
                    obs = observation['observation']
                    ag = observation['achieved_goal']
                    g = observation['desired_goal']
                    # start to collect samples
                    for t in range(p.max_episode_steps):
                        with torch.no_grad():
                            obs_norm, g_norm = self.normalize(obs, g)
                            pi = self.actor(obs_norm, g_norm, self.training_env_seq)
                            action = self.add_noise(pi)
                        # feed the actions into the environments
                        observation_new, info = self.envs.step(action)
                        obs_new = observation_new['observation']
                        ag_new = observation_new['achieved_goal']
                        # append rollouts
                        ep_obs.append(obs.copy())
                        ep_ag.append(ag.copy())
                        ep_g.append(g.copy())
                        ep_obs_next.append(obs_new.copy())
                        ep_ag_next.append(ag_new.copy())
                        ep_actions.append(action.copy())
                        ep_seq.append(self.training_env_seq)
                        # re-assign the observation
                        obs = obs_new
                        ag = ag_new
                    # convert them into arrays of shape (num_envs, timesteps, ...)
                    ep_obs = np.array(ep_obs).swapaxes(0, 1)
                    ep_ag = np.array(ep_ag).swapaxes(0, 1)
                    ep_g = np.array(ep_g).swapaxes(0, 1)
                    ep_obs_next = np.array(ep_obs_next).swapaxes(0, 1)
                    ep_ag_next = np.array(ep_ag_next).swapaxes(0, 1)
                    ep_actions = np.array(ep_actions).swapaxes(0, 1)
                    ep_seq = np.array(ep_seq).swapaxes(0, 1)
                    for i in range(ep_obs.shape[0]):
                        # store each environment's episode separately
                        self.buffer.store_episode([np.expand_dims(ep_obs[i], 0),
                                                   np.expand_dims(ep_ag[i], 0),
                                                   np.expand_dims(ep_g[i], 0),
                                                   np.expand_dims(ep_obs_next[i], 0),
                                                   np.expand_dims(ep_ag_next[i], 0),
                                                   np.expand_dims(ep_actions[i], 0),
                                                   np.expand_dims(ep_seq[i], 0)])
                        self.buffer.update_normalizer([np.expand_dims(ep_obs[i], 0),
                                                       np.expand_dims(ep_ag[i], 0),
                                                       np.expand_dims(ep_g[i], 0),
                                                       np.expand_dims(ep_obs_next[i], 0),
                                                       np.expand_dims(ep_ag_next[i], 0),
                                                       np.expand_dims(ep_actions[i], 0),
                                                       np.expand_dims(ep_seq[i], 0)])
                    for _ in range(p.update_per_episode):
                        # train the network
                        c_loss, a_loss = self.update_network()
                    # soft update
                    self.soft_update_target_network()
                training_success_rate, testing_success_rate = self.eval_agent()
                self.log_data(training_success_rate, testing_success_rate)
                torch.save(self.actor.state_dict(), "actor_seed_{}".format(self.seed))
                torch.save(self.critic.state_dict(), "critic_seed_{}".format(self.seed))
                self.buffer.save_normalizers()

    def log_data(self, training_data, testing_data):
        os.system("clear")
        print("Epoch: {}".format(self.epoch))

        print("Training_data: ")
        end = "\t"
        for i, env in enumerate(self.envs.envs):
            print(env.name, training_data[i], end=end)
            self.training_data[env.name].append(training_data[i])
            end = "\t" if end == "\n" else "\n"
        print(end="\n\n")
        frame = pd.DataFrame(self.training_data)
        if self.variance_limit == 0.25:
            frame.to_csv("Generated_data/" + p.mode + "ing_data_{}.csv".format(self.seed))
        else:
            frame.to_csv("Generated_data/" + p.mode + "ing_data_{}_{}.csv".format(self.variance_limit, self.seed))

        print("Testing_data: ")
        end = "\t"
        for i, env in enumerate(self.testing_envs.envs):
            print(env.name, testing_data[i], end=end)
            self.testing_data[env.name].append(testing_data[i])
            end = "\t" if end == "\n" else "\n"
        print(end="\n\n")
        frame = pd.DataFrame(self.testing_data)
        if self.variance_limit == 0.25:
            frame.to_csv("Generated_data/" + p.mode + "ing_test_data_{}.csv".format(self.seed))
        else:
            frame.to_csv("Generated_data/" + p.mode + "ing_test_data_{}_{}.csv".format(self.variance_limit, self.seed))

    def normalize(self, obs, g):
        obs_norm = self.buffer.o_norm.normalize(obs)
        g_norm = self.buffer.g_norm.normalize(g)
        obs_norm = torch.FloatTensor(obs_norm).to(self.device)
        g_norm = torch.FloatTensor(g_norm).to(self.device)
        return obs_norm, g_norm

    # this function will choose an action for the agent and do the exploration
    def add_noise(self, pi):
        action = pi.cpu().numpy().squeeze()
        # add the gaussian noise
        action += p.noise_eps * np.random.randn(*action.shape)
        action = np.clip(action, -1.0, 1.0)
        # random actions...
        random_actions = np.random.uniform(low=-1.0, high=1.0, size=p.max_dof)
        # choose whether to use the random actions
        action += np.random.binomial(1, p.random_eps, 1)[0] * (random_actions - action)
        return action

    # soft update
    def soft_update_target_network(self):
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data + p.polyak * target_param.data)
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_((1 - p.polyak) * param.data + p.polyak * target_param.data)

    # update the network
    def update_network(self):
        # sample the episodes
        transitions = self.buffer.sample()
        # pre-process the observation and goal
        o, o_next, g = transitions['obs'], transitions['obs_next'], transitions['g']
        transitions['obs'], transitions['g'] = self.buffer.preproc_og(o, g)
        transitions['obs_next'], transitions['g_next'] = self.buffer.preproc_og(o_next, g)
        seq = transitions['seq']
        # start to do the update
        obs_norm, g_norm = self.normalize(transitions['obs'], transitions['g'])
        obs_next_norm, g_next_norm = self.normalize(transitions['obs_next'], transitions['g_next'])
        actions_tensor = torch.FloatTensor(transitions['actions']).to(self.device)
        r_tensor = torch.FloatTensor(transitions['r']).to(self.device)
        with torch.no_grad():
            r_tensor = r_tensor.view(p.batch_size)
            actions_next = self.actor_target(obs_next_norm, g_next_norm, seq)
            q_next_value = self.critic_target(obs_next_norm, g_next_norm, actions_next, seq)
            q_next_value = q_next_value.detach()
            target_q_value = r_tensor + p.gamma * q_next_value
            target_q_value = target_q_value.detach()
            # clip the q value
            clip_return = 1 / (1 - p.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)
        # the q loss
        real_q_value = self.critic(obs_norm, g_norm, actions_tensor, seq)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()
        # the actor loss
        actions_real = self.actor(obs_norm, g_norm, seq)
        actor_loss = -self.critic(obs_norm, g_norm, actions_real, seq).mean()
        self.a1 = actor_loss
        self.a2 = (actions_real).pow(2).mean()
        self.actions_real = actions_real
        actor_loss += (actions_real).pow(2).mean()
        # start to update the actor network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        # update the critic network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.actor_optim.step()
        self.critic_optim.step()
        return critic_loss.item(), actor_loss.item()

    # do the evaluation
    def eval_agent(self):
        training_success_rate = np.array([0.0] * len(self.envs.envs))
        for _ in range(p.testing_eps):
            successes = np.array([0.0] * len(self.envs.envs))
            observation = self.envs.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm, self.training_env_seq)
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, info = self.envs.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                successes = successes + info['is_success']
            successes = np.array([1.0 if i else 0.0 for i in successes])
            training_success_rate = training_success_rate + successes
        training_success_rate = training_success_rate / p.testing_eps

        testing_success_rate = np.array([0.0] * len(self.testing_envs.envs))
        for _ in range(p.testing_eps):
            successes = np.array([0.0] * len(self.testing_envs.envs))
            observation = self.testing_envs.reset()
            obs = observation['observation']
            g = observation['desired_goal']
            for _ in range(p.max_episode_steps):
                with torch.no_grad():
                    obs_norm, g_norm = self.normalize(obs, g)
                    pi = self.actor(obs_norm, g_norm, self.testing_env_seq)
                    actions = pi.detach().cpu().numpy().squeeze()
                observation_new, info = self.testing_envs.step(actions)
                obs = observation_new['observation']
                g = observation_new['desired_goal']
                successes = successes + info['is_success']
            successes = np.array([1.0 if i else 0.0 for i in successes])
            testing_success_rate = testing_success_rate + successes
        testing_success_rate = testing_success_rate / p.testing_eps

        return training_success_rate, testing_success_rate
def main():
    np.random.seed(2)
    torch.manual_seed(2)

    # Hyperparameters
    MAX_EPISODE = 3000
    # renders the environment if the total episode reward is greater than this threshold
    DISPLAY_REWARD_THRESHOLD = 200
    MAX_EP_STEPS = 1000   # maximum time steps in one episode
    RENDER = False        # rendering wastes time
    GAMMA = 0.9           # reward discount in TD error
    LR_A = 0.001          # learning rate for actor
    LR_C = 0.005          # learning rate for critic

    env = gym.make('CartPole-v0')
    env.seed(1)  # reproducible
    env = env.unwrapped
    pprint(env.__dict__)

    N_F = env.observation_space.shape[0]
    N_A = env.action_space.n
    print(f"N_F: {N_F}, N_A: {N_A}")
    # N_F: 4, N_A: 2

    actor = Actor(N_F, N_A, LR_A)
    critic = Critic(N_F, LR_C, GAMMA)

    running_reward = 0
    for episode in range(MAX_EPISODE):
        s = env.reset()
        # s : array of shape (4,)
        t = 0
        track_r = []
        while True:
            if RENDER:
                env.render()

            # actor chooses an action
            a = actor.choose_action(s)
            # a : scalar

            s_, r, done, info = env.step(a)
            # s_.shape = (4,)
            # r : float
            # done means the pole fell

            # give a negative reward on the final step
            if done:
                r = -20

            track_r.append(r)

            td_err = critic.learn(s, r, s_)
            # td_err : torch.Tensor of shape (1, 1)
            actor.learn(s, a, td_err)

            s = s_
            t += 1

            if done or t >= MAX_EP_STEPS:
                ep_rs_sum = sum(track_r)
                if episode == 0:
                    running_reward = ep_rs_sum
                else:
                    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                if running_reward > DISPLAY_REWARD_THRESHOLD:
                    RENDER = True

                print(f"Episode: {episode+1}")
                print(f"\treward: {ep_rs_sum}")
                print(f"\trunning_reward: {running_reward:.1f}")
                print(f"\tdone: {done}")
                break
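# Added for illustration: the body of critic.learn is not shown in this excerpt. Given
# its (s, r, s_) signature, it presumably computes the one-step TD error
# delta = r + GAMMA * V(s_) - V(s), updates V by minimising delta**2, and returns delta
# so the actor can use it as an advantage signal. A minimal numeric sketch of that
# target, with made-up values:
#
#     v_s, v_s_next, r, gamma = 1.0, 1.2, -20, 0.9   # hypothetical values
#     td_err = r + gamma * v_s_next - v_s            # = -19.92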
# (tail of the training loop: log the step loss)
tracker.writer.add_scalar("loss",
                          float(loss.detach().cpu().numpy()),
                          epoch * X_train.shape[0] + i)

model_path = os.path.join(tracker.path, 'model')
torch.save(
    {
        'net': net.state_dict(),
        'ds': ds,
        'optimizer': optimizer.state_dict()
    }, model_path)

# evaluate prediction accuracy on the test set
ac = 0
psum = 0
for i in range(0, X_test.shape[0], config['batch_size']):
    j = i + config['batch_size']
    if j > X_test.shape[0]:
        break
    x = torch.cuda.FloatTensor(X_test[i:j]).permute(0, 3, 1, 2)
    s1 = torch.cuda.LongTensor(S1_test[i:j])
    s2 = torch.cuda.LongTensor(S2_test[i:j])
    Q = net(x, s1, s2)
    label = y_test[i:j].flatten()
    pre = Q.view(-1, config['A_size']).max(1)[1].cpu().numpy()
    ac += np.where(pre == label)[0].shape[0]
    psum += label.shape[0]

print('PA:', ac / psum)

# roll out the trained policy in the environment
agent = Actor(net, eps=False)
env = environment.Env()
sr, mean, std = util.test_game(env, agent, tracker, 0, config['test_final_game'])
print('test sr:%f, reward: mean:%f std:%f' % (sr, mean, std))