def train(self, args):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    sess = tf.get_default_session()
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)
    checkdir = osp.join(logger.get_dir(), 'checkpoints')
    os.makedirs(checkdir, exist_ok=True)
    load_weights = args['load_weights']
    start_nupdates = 0
    if load_weights is not None:
        load_path = osp.join(checkdir, load_weights)
        start_nupdates = int(load_weights)
        print('Loading checkpoint from %s ' % load_weights)
        self.load(load_path)
    while True:
        info = self.agent.step()
        if info['update']:
            info['update']['n_updates'] += start_nupdates
            info['update']['tcount'] += start_nupdates * args['nsteps_per_seg'] * args['envs_per_process']
            logger.logkvs(info['update'])
            logger.dumpkvs()
            if info['update']['n_updates'] % 10 == 0:
                weights_index = info['update']['n_updates']
                savepath = osp.join(checkdir, '%.5i' % weights_index)
                print('Saving to', savepath)
                self.save(savepath)
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
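# Hedged sketch of the resume bookkeeping used in train() above: the checkpoint file
# name doubles as the update count ('%.5i' zero-pads it), so the offsets added to
# n_updates and tcount can be recovered from the name alone. The rollout sizes below
# are illustrative assumptions, not values from the source.
load_weights = '%.5i' % 120                   # -> '00120', saved at update 120
start_nupdates = int(load_weights)            # -> 120
nsteps_per_seg, envs_per_process = 128, 32    # hypothetical per-segment rollout sizes
start_tcount = start_nupdates * nsteps_per_seg * envs_per_process
print(start_nupdates, start_tcount)           # 120 491520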
def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    total_loss = 0
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device).float(), target.to(device).long()
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target, reduction='sum')
        loss.backward()
        optimizer.step()
        # get the index of the max log-probability
        pred = output.max(1, keepdim=True)[1]
        correct += pred.eq(target.view_as(pred)).sum().item()
        total_loss += loss.item()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    total_loss /= float(len(train_loader.dataset))
    acc = correct / float(len(train_loader.dataset))
    logger.logkv('epoch', epoch)
    logger.logkv('train/loss', total_loss)
    logger.logkv('train/acc', acc)
    logger.dumpkvs()
    return total_loss, acc
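# A minimal, self-contained sketch of how the train() above might be driven for one
# epoch. The synthetic data, the tiny linear model, and the SimpleNamespace "args"
# are assumptions made purely for illustration; only the loss/accuracy pattern
# mirrors the function above (sum-reduced cross-entropy, argmax accuracy).
from types import SimpleNamespace

import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cpu')
x = torch.randn(256, 20)                       # 256 fake samples, 20 features
y = torch.randint(0, 4, (256,))                # 4 fake classes
loader = DataLoader(TensorDataset(x, y), batch_size=32, shuffle=True)

model = torch.nn.Linear(20, 4).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
args = SimpleNamespace(log_interval=4)         # hypothetical stand-in for the CLI args

# train(args, model, device, loader, optimizer, epoch=1)  # needs the module-level logger above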
def train(self):
    next_v = 1e6
    v = self.value_fun.get_values()
    itr = 0
    videos = []
    contours = []
    returns = []
    delay_cs = []
    fig = None
    while not self._stop_condition(itr, next_v, v) and itr < self.max_itr:
        log = itr % self.log_itr == 0
        render = (itr % self.render_itr == 0) and self.render
        if log:
            next_pi = self.get_next_policy()
            self.policy.update(next_pi)
            average_return, avg_delay_cost, video = rollout(
                self.env, self.policy, render=render,
                num_rollouts=self.num_rollouts,
                max_path_length=self.max_path_length, iteration=itr)
            if render:
                contour, fig = plot_contour(self.env, self.value_fun, fig=fig, iteration=itr)
                contours += [contour] * len(video)
                videos += video
            returns.append(average_return)
            delay_cs.append(avg_delay_cost)
            logger.logkv('Iteration', itr)
            logger.logkv('Average Returns', average_return)
            logger.logkv('Average Delayed Costs', avg_delay_cost)
            logger.dumpkvs()
        next_v = self.get_next_values()
        self.value_fun.update(next_v)
        itr += 1

    next_pi = self.get_next_policy()
    self.policy.update(next_pi)
    contour, fig = plot_contour(self.env, self.value_fun, save=True, fig=fig, iteration=itr)
    average_return, avg_delay_cost, video = rollout(
        self.env, self.policy, render=True,
        num_rollouts=self.num_rollouts,
        max_path_length=self.max_path_length, iteration=itr)
    self.env.close()
    plot_returns(returns)
    plot_returns(delay_cs, 'delayed_cost')
    videos += video
    if self.render:
        contours += [contour]
    logger.logkv('Iteration', itr)
    logger.logkv('Average Returns', average_return)
    logger.logkv('Average Delayed Costs', avg_delay_cost)
    fps = int(4 / getattr(self.env, 'dt', 0.1))
    if contours and contours[0] is not None:
        clip = mpy.ImageSequenceClip(contours, fps=fps)
        clip.write_videofile('%s/contours_progress.mp4' % logger.get_dir())
    if videos:
        clip = mpy.ImageSequenceClip(videos, fps=fps)
        clip.write_videofile('%s/roll_outs.mp4' % logger.get_dir())
    plt.close()
def train(self):
    obs = self._env.reset()
    episode_rewards = []
    n_episodes = 0
    l_episode_return = deque([], maxlen=10)
    l_discounted_episode_return = deque([], maxlen=10)
    l_tq_squared_error = deque(maxlen=50)
    log_itr = -1
    for itr in range(self._initial_step, self._max_steps):
        act = self.eps_greedy(obs[np.newaxis, :], self.exploration.value(itr))
        next_obs, rew, done, _ = self._env.step(act)
        if self._render:
            self._env.render()
        self._replay_buffer.add(obs, act, rew, next_obs, float(done))
        episode_rewards.append(rew)
        if done:
            obs = self._env.reset()
            episode_return = np.sum(episode_rewards)
            discounted_episode_return = np.sum(
                episode_rewards * self._discount ** np.arange(len(episode_rewards)))
            l_episode_return.append(episode_return)
            l_discounted_episode_return.append(discounted_episode_return)
            episode_rewards = []
            n_episodes += 1
        else:
            obs = next_obs
        if itr % self._target_q_update_freq == 0 and itr > self._learning_start_itr:
            self._update_target_q()
        if itr % self._train_q_freq == 0 and itr > self._learning_start_itr:
            # Sample from replay buffer.
            l_obs, l_act, l_rew, l_obs_prime, l_done = self._replay_buffer.sample(
                self._opt_batch_size)
            # Train Q value function with sampled data.
            td_squared_error = self.train_q(l_obs, l_act, l_rew, l_obs_prime, l_done)
            l_tq_squared_error.append(td_squared_error)
        if (itr + 1) % self._log_freq == 0 and len(l_episode_return) > 5:
            log_itr += 1
            logger.logkv('Iteration', log_itr)
            logger.logkv('Steps', itr)
            logger.logkv('Epsilon', self.exploration.value(itr))
            logger.logkv('Episodes', n_episodes)
            logger.logkv('AverageReturn', np.mean(l_episode_return))
            logger.logkv('AverageDiscountedReturn', np.mean(l_discounted_episode_return))
            logger.logkv('TDError^2', np.mean(l_tq_squared_error))
            logger.dumpkvs()
            self._q.dump(logger.get_dir() + '/weights.pkl')
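# The loop above only assumes that `self.exploration` exposes a value(itr) method
# returning the current epsilon. A minimal linear-annealing schedule with that
# interface might look like the sketch below; the concrete numbers are assumptions,
# not values taken from the source.
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=0.02):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the annealing period completed, clipped to [0, 1].
        frac = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + frac * (self.final_p - self.initial_p)

exploration = LinearSchedule(schedule_timesteps=100000)
print(exploration.value(0), exploration.value(50000), exploration.value(200000))
# 1.0 0.51 0.02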
def test(episodes=20, agent=None, load_path=None, ifrender=False, log=False):
    if log:
        logger.configure(dir="./log/", format_strs="stdout")
    if agent is None:
        agent = DQN(num_state=16, num_action=4)
        if load_path:
            agent.load(load_path)
        else:
            agent.load()
    env = Game2048Env()
    score_list = []
    highest_list = []
    for i in range(episodes):
        state, _, done, info = env.reset()
        state = log2_shaping(state)
        start = time.time()
        while True:
            action = agent.select_action(state, deterministic=True)
            next_state, _, done, info = env.step(action)
            next_state = log2_shaping(next_state)
            state = next_state
            if ifrender:
                env.render()
            if done:
                print(env.Matrix)
                if log:
                    logger.logkv('episode number', i + 1)
                    logger.logkv('episode reward', info['score'])
                    logger.logkv('episode steps', info['steps'])
                    logger.logkv('highest', info['highest'])
                    logger.dumpkvs()
                break
        end = time.time()
        if log:
            print('episode time:{} s\n'.format(end - start))
        score_list.append(info['score'])
        highest_list.append(info['highest'])
    print('mean score:{}, mean highest:{}'.format(np.mean(score_list), np.mean(highest_list)))
    print('max score:{}, max highest:{}'.format(np.max(score_list), np.max(highest_list)))
    result_info = {
        'mean': np.mean(score_list),
        'max': np.max(score_list),
        'list': score_list
    }
    print(highest_list)
    return result_info
def train(self):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    while True:
        info = self.agent.step()
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
def instant_impulse(variant): env_name = variant['env_name'] env = get_env_from_name(env_name) env_params = variant['env_params'] eval_params = variant['eval_params'] policy_params = variant['alg_params'] policy_params.update({ 's_bound': env.observation_space, 'a_bound': env.action_space, }) build_func = get_policy(variant['algorithm_name']) if 'Fetch' in env_name or 'Hand' in env_name: s_dim = env.observation_space.spaces['observation'].shape[0] \ + env.observation_space.spaces['achieved_goal'].shape[0] + \ env.observation_space.spaces['desired_goal'].shape[0] else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] # d_dim = env_params['disturbance dim'] policy = build_func(a_dim, s_dim, policy_params) # disturber = Disturber(d_dim, s_dim, disturber_params) log_path = variant['log_path'] + '/eval/safety_eval' variant['eval_params'].update({'magnitude': 0}) logger.configure(dir=log_path, format_strs=['csv']) for magnitude in eval_params['magnitude_range']: variant['eval_params']['magnitude'] = magnitude diagnostic_dict = evaluation(variant, env, policy) string_to_print = ['magnitude', ':', str(magnitude), '|'] [ string_to_print.extend( [key, ':', str(round(diagnostic_dict[key], 2)), '|']) for key in diagnostic_dict.keys() ] print(''.join(string_to_print)) logger.logkv('magnitude', magnitude) [ logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys() ] logger.dumpkvs()
def train(num_iter, log_schedule):
    game = KuhnPoker()
    strategy_profile = get_initial_strategy_profile(game.root, game.num_players)
    average_strategy_profile = deepcopy(strategy_profile)
    for t in tqdm(range(num_iter)):
        update_pi(game.root, strategy_profile, average_strategy_profile,
                  [1.0 for _ in range(game.num_players + 1)],
                  [1.0 for _ in range(game.num_players + 1)],
                  [1.0 for _ in range(game.num_players + 1)])
        update_node_values(game.root, strategy_profile)
        exploitability = get_exploitability(game, average_strategy_profile)
        update_strategy(strategy_profile, average_strategy_profile, game.information_sets)
        if t % log_schedule(t) == 0:
            logger.logkv("t", t)
            logger.logkv("exploitability", exploitability)
            logger.dumpkvs()
    return average_strategy_profile
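# train() above only requires log_schedule to be a callable mapping the iteration t
# to a positive logging interval; exploitability is dumped whenever
# t % log_schedule(t) == 0. One simple choice (an assumption, not from the source)
# is to log every iteration early on and back off later:
def log_schedule(t):
    # log every iteration for the first 100 iterations, then every 100 iterations
    return 1 if t < 100 else 100

# average_profile = train(num_iter=100000, log_schedule=log_schedule)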
def various_disturbance(variant): env_name = variant['env_name'] env = get_env_from_name(env_name) env_params = variant['env_params'] eval_params = variant['eval_params'] policy_params = variant['alg_params'] disturber_params = variant['disturber_params'] build_func = get_policy(variant['algorithm_name']) if 'Fetch' in env_name or 'Hand' in env_name: s_dim = env.observation_space.spaces['observation'].shape[0] \ + env.observation_space.spaces['achieved_goal'].shape[0] + \ env.observation_space.spaces['desired_goal'].shape[0] else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] d_dim = env_params['disturbance dim'] policy = build_func(a_dim, s_dim, d_dim, policy_params) # disturber = Disturber(d_dim, s_dim, disturber_params) log_path = variant[ 'log_path'] + '/eval/various_disturbance-' + eval_params['form'] variant['eval_params'].update({'period': 0}) logger.configure(dir=log_path, format_strs=['csv']) for period in eval_params['period_list']: variant['eval_params']['period'] = period diagnostic_dict = evaluation(variant, env, policy) frequency = 1. / period string_to_print = ['frequency', ':', str(frequency), '|'] [ string_to_print.extend( [key, ':', str(round(diagnostic_dict[key], 2)), '|']) for key in diagnostic_dict.keys() ] print(''.join(string_to_print)) logger.logkv('frequency', frequency) [ logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys() ] logger.dumpkvs()
def train(self):
    params = self.value_fun._params
    videos = []
    contours = []
    returns = []
    fig = None
    for itr in range(self.max_itr):
        params = self.optimizer.grad_step(self.objective, params)
        self.value_fun.update(params)
        log = itr % self.log_itr == 0 or itr == self.max_itr - 1
        render = (itr % self.render_itr == 0) and self.render
        if log:
            average_return, video = rollout(self.env, self.policy, render=render, iteration=itr)
            if render:
                contour, fig = plot_contour(self.env, self.value_fun, fig=fig, iteration=itr)
                contours += [contour]
                videos += video
            returns.append(average_return)
            logger.logkv('Iteration', itr)
            logger.logkv('Average Returns', average_return)
            logger.dumpkvs()
    plot_returns(returns)
    plot_contour(self.env, self.value_fun, save=True, fig=fig)
    if contours and contours[0] is not None:
        contours = list(upsample(np.array(contours), 10))
        clip = mpy.ImageSequenceClip(contours, fps=10)
        clip.write_videofile('%s/contours_progress.mp4' % logger.get_dir())
    if videos:
        fps = int(10 / getattr(self.env, 'dt', 0.1))
        clip = mpy.ImageSequenceClip(videos, fps=fps)
        clip.write_videofile('%s/learning_progress.mp4' % logger.get_dir())
    plt.close()
def constant_impulse(variant): env_name = variant["env_name"] env = get_env_from_name(env_name) env_params = variant["env_params"] eval_params = variant["eval_params"] policy_params = variant["alg_params"] policy_params["network_structure"] = env_params["network_structure"] build_func = get_policy(variant["algorithm_name"]) if "Fetch" in env_name or "Hand" in env_name: s_dim = (env.observation_space.spaces["observation"].shape[0] + env.observation_space.spaces["achieved_goal"].shape[0] + env.observation_space.spaces["desired_goal"].shape[0]) else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] policy = build_func(a_dim, s_dim, policy_params) # disturber = Disturber(d_dim, s_dim, disturber_params) log_path = variant["log_path"] + "/eval/constant_impulse" variant["eval_params"].update({"magnitude": 0}) logger.configure(dir=log_path, format_strs=["csv"]) for magnitude in eval_params["magnitude_range"]: variant["eval_params"]["magnitude"] = magnitude diagnostic_dict, _ = evaluation(variant, env, policy) string_to_print = ["magnitude", ":", str(magnitude), "|"] [ string_to_print.extend( [key, ":", str(round(diagnostic_dict[key], 2)), "|"]) for key in diagnostic_dict.keys() ] print("".join(string_to_print)) logger.logkv("magnitude", magnitude) [ logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys() ] logger.dumpkvs()
def train(self):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    if self.hps['ckptpath'] is not None:
        self.agent.restore_model(logdir=self.hps['ckptpath'], exp_name=self.hps['exp_name'])
    while True:
        info = self.agent.step()
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
            if info['update']['n_updates'] % 60 == 0:
                self.agent.save_model(
                    logdir=logger.get_dir(),
                    exp_name=self.hps['exp_name'],
                    global_step=info['update']['n_updates'])
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
def trained_disturber(variant): env_name = variant['env_name'] env = get_env_from_name(env_name) env_params = variant['env_params'] eval_params = variant['eval_params'] policy_params = variant['alg_params'] disturber_params = variant['disturber_params'] build_func = get_policy(variant['algorithm_name']) if 'Fetch' in env_name or 'Hand' in env_name: s_dim = env.observation_space.spaces['observation'].shape[0] \ + env.observation_space.spaces['achieved_goal'].shape[0] + \ env.observation_space.spaces['desired_goal'].shape[0] else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] d_dim = env_params['disturbance dim'] policy = build_func(a_dim, s_dim, d_dim, policy_params) disturbance_chanel_list = np.nonzero( disturber_params['disturbance_magnitude'])[0] disturber_params['disturbance_chanel_list'] = disturbance_chanel_list disturber = Disturber(d_dim, s_dim, disturber_params) disturber.restore(eval_params['path']) log_path = variant['log_path'] + '/eval/trained_disturber' variant['eval_params'].update({'magnitude': 0}) logger.configure(dir=log_path, format_strs=['csv']) diagnostic_dict, _ = evaluation(variant, env, policy, disturber) string_to_print = [] [ string_to_print.extend( [key, ':', str(round(diagnostic_dict[key], 2)), '|']) for key in diagnostic_dict.keys() ] print(''.join(string_to_print)) [logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys()] logger.dumpkvs()
def trained_disturber(variant): env_name = variant["env_name"] env = get_env_from_name(env_name) env_params = variant["env_params"] eval_params = variant["eval_params"] policy_params = variant["alg_params"] disturber_params = variant["disturber_params"] build_func = get_policy(variant["algorithm_name"]) if "Fetch" in env_name or "Hand" in env_name: s_dim = (env.observation_space.spaces["observation"].shape[0] + env.observation_space.spaces["achieved_goal"].shape[0] + env.observation_space.spaces["desired_goal"].shape[0]) else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] d_dim = env_params["disturbance dim"] policy = build_func(a_dim, s_dim, d_dim, policy_params) disturbance_chanel_list = np.nonzero( disturber_params["disturbance_magnitude"])[0] disturber_params["disturbance_chanel_list"] = disturbance_chanel_list disturber = Disturber(d_dim, s_dim, disturber_params) disturber.restore(eval_params["path"]) log_path = variant["log_path"] + "/eval/trained_disturber" variant["eval_params"].update({"magnitude": 0}) logger.configure(dir=log_path, format_strs=["csv"]) diagnostic_dict, _ = evaluation(variant, env, policy, disturber) string_to_print = [] [ string_to_print.extend( [key, ":", str(round(diagnostic_dict[key], 2)), "|"]) for key in diagnostic_dict.keys() ] print("".join(string_to_print)) [logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys()] logger.dumpkvs()
def various_disturbance(variant): env_name = variant["env_name"] env = get_env_from_name(env_name) env_params = variant["env_params"] eval_params = variant["eval_params"] policy_params = variant["alg_params"] build_func = get_policy(variant["algorithm_name"]) if "Fetch" in env_name or "Hand" in env_name: s_dim = (env.observation_space.spaces["observation"].shape[0] + env.observation_space.spaces["achieved_goal"].shape[0] + env.observation_space.spaces["desired_goal"].shape[0]) else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] policy = build_func(a_dim, s_dim, policy_params) # disturber = Disturber(d_dim, s_dim, disturber_params) log_path = variant[ "log_path"] + "/eval/various_disturbance-" + eval_params["form"] variant["eval_params"].update({"period": 0}) logger.configure(dir=log_path, format_strs=["csv"]) for period in eval_params["period_list"]: variant["eval_params"]["period"] = period diagnostic_dict, _ = evaluation(variant, env, policy) frequency = 1.0 / period string_to_print = ["frequency", ":", str(frequency), "|"] [ string_to_print.extend( [key, ":", str(round(diagnostic_dict[key], 2)), "|"]) for key in diagnostic_dict.keys() ] print("".join(string_to_print)) logger.logkv("frequency", frequency) [ logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys() ] logger.dumpkvs()
def test(model, device, test_loader, epoch=None, val=True):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device).float(), target.to(device).long()
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    acc = float(correct) / len(test_loader.dataset)
    if val:
        logger.logkv('epoch', epoch)
        logger.logkv('val/loss', test_loss)
        logger.logkv('val/acc', acc)
        logger.dumpkvs()
    return test_loss, acc
def train(self):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
    sess = tf.get_default_session()
    self.save = functools.partial(save_variables, sess=sess)
    while True:
        info = self.agent.step()
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
            if info['update']['n_updates'] % 10 == 0:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.5i' % info['update']['n_updates'])
                print('Saving to', savepath)
                self.save(savepath)
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()
def test_mpi_weighted_mean():
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    with logger.scoped_configure(comm=comm):
        if comm.rank == 0:
            name2valcount = {'a': (10, 2), 'b': (20, 3)}
        elif comm.rank == 1:
            name2valcount = {'a': (19, 1), 'c': (42, 3)}
        else:
            raise NotImplementedError

        d = mpi_util.mpi_weighted_mean(comm, name2valcount)
        correctval = {'a': (10 * 2 + 19) / 3.0, 'b': 20, 'c': 42}
        if comm.rank == 0:
            assert d == correctval, f'{d} != {correctval}'

        for name, (val, count) in name2valcount.items():
            for _ in range(count):
                logger.logkv_mean(name, val)
        d2 = logger.dumpkvs()
        if comm.rank == 0:
            assert d2 == correctval
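# For reference, the weighted mean the test above asserts can be reproduced without
# MPI: each rank contributes (value, count) pairs and mpi_weighted_mean is expected
# to return the count-weighted average per key. This stand-alone sketch only
# illustrates that arithmetic; it is not the library implementation.
from collections import defaultdict

def weighted_mean(rank_dicts):
    sums, counts = defaultdict(float), defaultdict(float)
    for name2valcount in rank_dicts:
        for name, (val, count) in name2valcount.items():
            sums[name] += val * count
            counts[name] += count
    return {name: sums[name] / counts[name] for name in sums}

print(weighted_mean([{'a': (10, 2), 'b': (20, 3)},
                     {'a': (19, 1), 'c': (42, 3)}]))
# {'a': 13.0, 'b': 20.0, 'c': 42.0}  -- matches correctval in the test above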
def train():
    episodes = train_episodes
    logger.configure(dir="./log/", format_strs="stdout,tensorboard,log")
    agent = DQN(num_state=16, num_action=4)
    env = Game2048Env()
    pf_saver = Perfomance_Saver()
    model_saver = Model_Saver(num=10)
    eval_max_score = 0
    for i in range(episodes):
        state, reward, done, info = env.reset()
        state = log2_shaping(state)
        start = time.time()
        loss = None
        while True:
            if agent.buffer.memory_counter <= agent.memory_capacity:
                action = agent.select_action(state, random=True)
            else:
                action = agent.select_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = log2_shaping(next_state)
            reward = log2_shaping(reward, divide=1)
            agent.store_transition(state, action, reward, next_state)
            state = next_state
            if ifrender:
                env.render()
            if agent.buffer.memory_counter % agent.train_interval == 0 and agent.buffer.memory_counter > agent.memory_capacity:
                # only start updating once the replay buffer has been filled
                loss = agent.update()
            if done:
                if i % log_interval == 0:
                    if loss:
                        logger.logkv('loss', loss)
                    logger.logkv('training progress', (i + 1) / episodes)
                    logger.logkv('episode reward', info['score'])
                    logger.logkv('episode steps', info['steps'])
                    logger.logkv('highest', info['highest'])
                    logger.logkv('epsilon', agent.epsilon)
                    logger.dumpkvs()
                    loss = None
                if i % epsilon_decay_interval == 0:  # epsilon decay
                    agent.epsilon_decay(i, episodes)
                break
        end = time.time()
        print('episode time:{} s\n'.format(end - start))

        # eval
        if i % eval_interval == 0 and i:
            eval_info = test(episodes=test_episodes, agent=agent)
            average_score, max_score, score_lis = eval_info['mean'], eval_info['max'], eval_info['list']
            pf_saver.save(score_lis, info=f'episode:{i}')
            if int(average_score) > eval_max_score:
                eval_max_score = int(average_score)
                name = 'dqn_{}.pkl'.format(int(eval_max_score))
                agent.save(name=name)
                model_saver.save("./save/" + name)
            logger.logkv('eval average score', average_score)
            logger.logkv('eval max score', max_score)
            logger.dumpkvs()
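# Both train() and test() above preprocess the 2048 board and the reward with
# log2_shaping before feeding them to the DQN. The helper itself does not appear in
# this section; a plausible implementation (purely an assumption, including the
# default divide=16) maps tile values to their base-2 logarithm scaled by `divide`,
# leaving empty cells at 0:
import numpy as np

def log2_shaping(x, divide=16):
    x = np.asarray(x, dtype=np.float32)
    return np.where(x > 0, np.log2(np.maximum(x, 1)), 0.0) / divide

print(log2_shaping(np.array([[0, 2], [4, 2048]])))
# [[0.     0.0625]
#  [0.125  0.6875]]   (log2 of each tile, scaled by 1/divide)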
def eval(variant): env_name = variant['env_name'] traj = get_traj() env = get_env_from_name(env_name) env_params = variant['env_params'] max_ep_steps = env_params['max_ep_steps'] policy_params = variant['alg_params'] s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] a_upperbound = env.action_space.high a_lowerbound = env.action_space.low policy = CAC(a_dim, s_dim, policy_params) log_path = variant['log_path'] + '/eval/' + str(0) logger.configure(dir=log_path, format_strs=['csv']) policy.restore(variant['log_path'] + '/' + str(0) + '/policy') # Training setting t1 = time.time() PLOT_theta_1 = [] PLOT_ground_theta_1 = [] mst = [] agent_traj = [] ground_traj = [] for i in tqdm(range(50)): s = env.reset() cost = 0 traj_num = 0 # Random start point start_point = 0 + 1000 * i s = traj[start_point, :16] PLOT_state = s s = np.concatenate([[s], [traj[start_point, 17:]]], axis=1)[0] env.state = s for j in range(start_point + 1, start_point + 1 + 1000): if agent_traj == []: agent_traj = s[0:16] else: agent_traj = np.concatenate((agent_traj, s[0:16]), axis=0) if ground_traj == []: ground_traj = traj[j - 1, 0:16] else: ground_traj = np.concatenate((ground_traj, traj[j - 1, 0:16]), axis=0) delta = np.zeros(36) # ###### NOSIE ############## # noise = np.random.normal(0, 0.001, 16) # delta[20:]= noise # ###### BIAS ############## # noise = s[0:16]*0.005 # delta[0:16] = noise a = policy.choose_action(s + delta, True) action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2 # action = traj[j-1,16] X_, r, done, theta = env.step(action) # The new s= current state,next omega, next state s_ = np.concatenate([X_, [traj[j, 17:]]], axis=1)[0] env.state = s_ PLOT_theta_1.append(theta[0]) PLOT_ground_theta_1.append(traj[j, 16]) mst.append(np.linalg.norm(traj[j, 16] - theta[0])) PLOT_state = np.vstack((PLOT_state, X_)) logger.logkv('rewards', r) logger.logkv('timestep', j) logger.dumpkvs() cost = cost + r if j == 1000 - 1 + start_point: done = True s = s_ if done: #print('episode:', i,'trajectory_number:',traj_num,'total_cost:',cost,'steps:',j-start_point) break x = np.linspace(0, np.shape(PLOT_ground_theta_1)[0] - 1, np.shape(PLOT_ground_theta_1)[0]) # plt.plot(x, PLOT_theta_1, color='blue', label='Tracking') # plt.plot(x, PLOT_ground_theta_1, color='black', linestyle='--', label='Ground truth') # plt.show() fig = plt.figure() with h5py.File(variant['log_path'] + '/' + 'CAC_theta.h5', 'w') as hdf: hdf.create_dataset('Data', data=PLOT_theta_1) with h5py.File(variant['log_path'] + '/' + 'Normal_theta_ground.h5', 'w') as hdf: hdf.create_dataset('Data', data=PLOT_ground_theta_1) with h5py.File(variant['log_path'] + '/' + 'CAC_track.h5', 'w') as hdf: hdf.create_dataset('Data', data=agent_traj) with h5py.File(variant['log_path'] + '/' + 'GT_track.h5', 'w') as hdf: hdf.create_dataset('Data', data=ground_traj) plt.plot(x, PLOT_theta_1, color='blue', label='Tracking') plt.plot(x, PLOT_ground_theta_1, color='black', linestyle='--', label='Ground truth') plt.show() return
def train(variant): Min_cost = 1000000 traj = get_traj() # get data env_name = variant['env_name'] # choose your environment env = get_env_from_name(env_name) env_params = variant['env_params'] max_episodes = env_params[ 'max_episodes'] # maximum episodes for RL training max_ep_steps = env_params[ 'max_ep_steps'] # number of maximum steps in each episode max_global_steps = env_params['max_global_steps'] store_last_n_paths = variant['store_last_n_paths'] evaluation_frequency = variant['evaluation_frequency'] policy_params = variant['alg_params'] min_memory_size = policy_params['min_memory_size'] steps_per_cycle = policy_params['steps_per_cycle'] train_per_cycle = policy_params['train_per_cycle'] batch_size = policy_params['batch_size'] lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[ 'lr_c'], policy_params['lr_l'] lr_a_now = lr_a # learning rate for actor lr_c_now = lr_c # learning rate for critic lr_l_now = lr_l # learning rate for critic s_dim = env.observation_space.shape[0] print("s_dim is ", s_dim) a_dim = env.action_space.shape[0] a_upperbound = env.action_space.high a_lowerbound = env.action_space.low policy = CAC(a_dim, s_dim, policy_params) # policy.restore("log/CMAPSS/CAC-new-reward-0.01/0/policy") pool_params = { 's_dim': s_dim, 'a_dim': a_dim, 'd_dim': 1, 'store_last_n_paths': store_last_n_paths, 'memory_capacity': policy_params['memory_capacity'], 'min_memory_size': policy_params['min_memory_size'], 'history_horizon': policy_params['history_horizon'], 'finite_horizon': policy_params['finite_horizon'] } if 'value_horizon' in policy_params.keys(): pool_params.update({'value_horizon': policy_params['value_horizon']}) else: pool_params['value_horizon'] = None pool = Pool(pool_params) # For analyse Render = env_params['eval_render'] # Training setting t1 = time.time() global_step = 0 last_training_paths = deque(maxlen=store_last_n_paths) training_started = False log_path = variant['log_path'] logger.configure(dir=log_path, format_strs=['csv']) logger.logkv('tau', policy_params['tau']) logger.logkv('alpha3', policy_params['alpha3']) logger.logkv('batch_size', policy_params['batch_size']) logger.logkv('target_entropy', policy.target_entropy) for i in range(max_episodes): current_path = { 'rewards': [], 'distance': [], 'kl_divergence': [], 'a_loss': [], 'alpha': [], 'lyapunov_error': [], 'entropy': [], 'beta': [], 'action_distance': [], } if global_step > max_global_steps: break s = env.reset() # Random start point start_point = np.random.randint(0, 500000) s = traj[start_point, :16] # current state, theta,next w, desired state # this is for decision making # 16,1,4,16 s = np.concatenate([[s], [traj[start_point, 17:]]], axis=1)[0] env.state = s for j in range(start_point + 1, start_point + 1 + max_ep_steps): if Render: env.render() delta = np.zeros(36) # ###### NOSIE ############## noise = np.random.normal(0, 0.01, 16) delta[20:] = noise # ########IF Noise env########## # s= s + delta # a = policy.choose_action(s) # ###### BIAS ############## # noise = s[0:16]*0.01 # delta[0:16] = noise a = policy.choose_action(s + delta) action = a_lowerbound + (a + 1.) 
* (a_upperbound - a_lowerbound) / 2 # action = traj[j-1,16] a_upperbound = env.action_space.high a_lowerbound = env.action_space.low # Run in simulator X_, r, done, theta = env.step(action) # The new s= current state,next omega, next state s_ = np.concatenate([X_, [traj[j, 17:]]], axis=1)[0] # s_ = np.concatenate([[s_], [theta]], axis=1)[0] # s_ = np.concatenate([X_,[[theta]], [traj[j, 9:]]], axis=1)[0] env.state = s_ # theta_pre=theta if training_started: global_step += 1 if j == max_ep_steps - 1 + start_point: done = True terminal = 1. if done else 0. if j > start_point + 2: pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_, _s) # policy.store_transition(s, a, disturbance, r,0, terminal, s_) if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0: training_started = True for _ in range(train_per_cycle): batch = pool.sample(batch_size) labda, alpha, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn( lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch) if training_started: current_path['rewards'].append(r) current_path['distance'].append(distance) current_path['kl_divergence'].append(kl) current_path['lyapunov_error'].append(l_loss) current_path['alpha'].append(alpha) current_path['entropy'].append(entropy) current_path['a_loss'].append(a_loss) current_path['beta'].append(beta) current_path['action_distance'].append(action_distance) if training_started and global_step % evaluation_frequency == 0 and global_step > 0: logger.logkv("total_timesteps", global_step) training_diagnotic = evaluate_training_rollouts( last_training_paths) # print(training_diagnotic) if training_diagnotic is not None: eval_diagnotic = training_evaluation(variant, env, policy) [ logger.logkv(key, eval_diagnotic[key]) for key in eval_diagnotic.keys() ] training_diagnotic.pop('return') [ logger.logkv(key, training_diagnotic[key]) for key in training_diagnotic.keys() ] logger.logkv('lr_a', lr_a_now) logger.logkv('lr_c', lr_c_now) logger.logkv('lr_l', lr_l_now) string_to_print = ['time_step:', str(global_step), '|'] [ string_to_print.extend( [key, ':', str(eval_diagnotic[key]), '|']) for key in eval_diagnotic.keys() ] [ string_to_print.extend([ key, ':', str(round(training_diagnotic[key], 2)), '|' ]) for key in training_diagnotic.keys() ] print(''.join(string_to_print)) logger.dumpkvs() if eval_diagnotic['test_return'] / eval_diagnotic[ 'test_average_length'] <= Min_cost: Min_cost = eval_diagnotic['test_return'] / eval_diagnotic[ 'test_average_length'] print("New lowest cost:", Min_cost) policy.save_result(log_path) if training_started and global_step % ( 10 * evaluation_frequency) == 0 and global_step > 0: policy.save_result(log_path) # Status Update _s = s s = s_ # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY if done: if training_started: last_training_paths.appendleft(current_path) frac = 1.0 - (global_step - 1.0) / max_global_steps lr_a_now = lr_a * frac # learning rate for actor lr_c_now = lr_c * frac # learning rate for critic lr_l_now = lr_l * frac # learning rate for critic break policy.save_result(log_path) print('Running time: ', time.time() - t1) return
def train(variant): env_name = variant['env_name'] env = get_env_from_name(env_name) env_params = variant['env_params'] max_episodes = env_params['max_episodes'] max_ep_steps = env_params['max_ep_steps'] max_global_steps = env_params['max_global_steps'] store_last_n_paths = variant['num_of_training_paths'] evaluation_frequency = variant['evaluation_frequency'] policy_params = variant['alg_params'] policy_params['network_structure'] = env_params['network_structure'] min_memory_size = policy_params['min_memory_size'] steps_per_cycle = policy_params['steps_per_cycle'] train_per_cycle = policy_params['train_per_cycle'] batch_size = policy_params['batch_size'] lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[ 'lr_c'], policy_params['lr_l'] lr_a_now = lr_a # learning rate for actor lr_c_now = lr_c # learning rate for critic lr_l_now = lr_l # learning rate for critic if 'Fetch' in env_name or 'Hand' in env_name: s_dim = env.observation_space.spaces['observation'].shape[0]\ + env.observation_space.spaces['achieved_goal'].shape[0]+ \ env.observation_space.spaces['desired_goal'].shape[0] else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] # if disturber_params['process_noise']: # d_dim = disturber_params['noise_dim'] # else: # d_dim = env_params['disturbance dim'] a_upperbound = env.action_space.high a_lowerbound = env.action_space.low policy = LAC(a_dim, s_dim, policy_params) pool_params = { 's_dim': s_dim, 'a_dim': a_dim, 'd_dim': 1, 'store_last_n_paths': store_last_n_paths, 'memory_capacity': policy_params['memory_capacity'], 'min_memory_size': policy_params['min_memory_size'], 'history_horizon': policy_params['history_horizon'], 'finite_horizon': policy_params['finite_horizon'] } if 'value_horizon' in policy_params.keys(): pool_params.update({'value_horizon': policy_params['value_horizon']}) else: pool_params['value_horizon'] = None pool = Pool(pool_params) # For analyse Render = env_params['eval_render'] # Training setting t1 = time.time() global_step = 0 last_training_paths = deque(maxlen=store_last_n_paths) training_started = False log_path = variant['log_path'] logger.configure(dir=log_path, format_strs=['csv']) logger.logkv('tau', policy_params['tau']) logger.logkv('alpha3', policy_params['alpha3']) logger.logkv('batch_size', policy_params['batch_size']) logger.logkv('target_entropy', policy.target_entropy) for i in range(max_episodes): current_path = { 'rewards': [], 'a_loss': [], 'alpha': [], 'lambda': [], 'lyapunov_error': [], 'entropy': [], } if global_step > max_global_steps: break s = env.reset() if 'Fetch' in env_name or 'Hand' in env_name: s = np.concatenate([s[key] for key in s.keys()]) for j in range(max_ep_steps): if Render: env.render() a = policy.choose_action(s) # a = a*0 action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2 # Run in simulator disturbance_input = np.zeros([a_dim + s_dim]) s_, r, done, info = env.step(action) if 'Fetch' in env_name or 'Hand' in env_name: s_ = np.concatenate([s_[key] for key in s_.keys()]) if info['done'] > 0: done = True if training_started: global_step += 1 if j == max_ep_steps - 1: done = True terminal = 1. if done else 0. 
pool.store(s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_) # policy.store_transition(s, a, disturbance, r,0, terminal, s_) if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0: training_started = True for _ in range(train_per_cycle): batch = pool.sample(batch_size) labda, alpha, l_loss, entropy, a_loss = policy.learn( lr_a_now, lr_c_now, lr_l_now, lr_a, batch) if training_started: current_path['rewards'].append(r) current_path['lyapunov_error'].append(l_loss) current_path['alpha'].append(alpha) current_path['lambda'].append(labda) current_path['entropy'].append(entropy) current_path['a_loss'].append(a_loss) if training_started and global_step % evaluation_frequency == 0 and global_step > 0: logger.logkv("total_timesteps", global_step) training_diagnotic = evaluate_training_rollouts( last_training_paths) if training_diagnotic is not None: if variant['num_of_evaluation_paths'] > 0: eval_diagnotic = training_evaluation( variant, env, policy) [ logger.logkv(key, eval_diagnotic[key]) for key in eval_diagnotic.keys() ] training_diagnotic.pop('return') [ logger.logkv(key, training_diagnotic[key]) for key in training_diagnotic.keys() ] logger.logkv('lr_a', lr_a_now) logger.logkv('lr_c', lr_c_now) logger.logkv('lr_l', lr_l_now) string_to_print = ['time_step:', str(global_step), '|'] if variant['num_of_evaluation_paths'] > 0: [ string_to_print.extend( [key, ':', str(eval_diagnotic[key]), '|']) for key in eval_diagnotic.keys() ] [ string_to_print.extend([ key, ':', str(round(training_diagnotic[key], 2)), '|' ]) for key in training_diagnotic.keys() ] print(''.join(string_to_print)) logger.dumpkvs() # 状态更新 s = s_ # OUTPUT TRAINING INFORMATION AND LEARNING RATE DECAY if done: if training_started: last_training_paths.appendleft(current_path) frac = 1.0 - (global_step - 1.0) / max_global_steps lr_a_now = lr_a * frac # learning rate for actor lr_c_now = lr_c * frac # learning rate for critic lr_l_now = lr_l * frac # learning rate for critic break policy.save_result(log_path) print('Running time: ', time.time() - t1) return
def learn(*, network, env, total_timesteps, starting_positions, env_name, win_percentage=0.5, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if load_path is not None: model.load(load_path) current_starting_position = starting_positions.pop() # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, starting_position=current_starting_position) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, starting_position=current_starting_position) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.time() start_changes = [] reached_goal = [] nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 if env_name == "MountainCar-v0": done_obs = obs[masks] # Number of episodes past n_eps = done_obs.shape[0] # Reached goal if pos is > 0.5 n_goal_reached = (done_obs[:, 0] >= 0.5).sum() reached_goal.extend([ done + update * nsteps - nsteps for done in np.where(done_obs[:, 0] >= 0.5)[0] ]) if (n_goal_reached / n_eps) > win_percentage and len(starting_positions) > 0: start_changes.append(update * nsteps) current_starting_position = starting_positions.pop() runner.env.starting_position = current_starting_position if eval_env is not None: eval_runner.env.starting_position = current_starting_position epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.time() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('start_changes', "_".join([str(s) for s in start_changes])) logger.logkv('reached_goal', "_".join([str(goal) for goal in reached_goal])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and ( MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) return model
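# Hedged usage sketch for the learn() function above, following its docstring: a gym
# environment wrapped in DummyVecEnv, an 'mlp' policy, and a curriculum of
# MountainCar starting positions that learn() pops from (last element first) once the
# success rate passes win_percentage. The starting positions and timestep budget are
# illustrative assumptions, and the wrapped environment is assumed to expose the
# `starting_position` attribute that the custom Runner relies on.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

env = DummyVecEnv([lambda: gym.make("MountainCar-v0")])
model = learn(
    network="mlp",
    env=env,
    total_timesteps=1_000_000,
    starting_positions=[-0.5, -0.7, -0.9],   # popped right-to-left as training progresses
    env_name="MountainCar-v0",
    win_percentage=0.5,
    nsteps=2048,
    log_interval=10,
)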
def train(self): # params = self.value_fun._params videos = [] contours = [] returns = [] delay_cs = [] fig = None for itr in range(self.max_itr): itr_starttime = time.time() self.value_fun_update() itr_time = time.time() - itr_starttime log = itr % self.log_itr == 0 or itr == self.max_itr - 1 render = (itr % self.render_itr == 0) and self.render if log: rollout_starttime = time.time() average_return, avg_delay_cost, video = rollout( self.env, self.policy, num_rollouts=self.num_rollouts, render=render, iteration=itr, max_path_length=self.max_path_length) rollout_time = time.time() - rollout_starttime if render: # contour, fig = plot_contour(self.env, self.value_fun, fig=fig, iteration=itr) # contours += [contour] videos += video returns.append(average_return) delay_cs.append(avg_delay_cost) logger.logkv('Iteration', itr) logger.logkv('Average Returns', average_return) logger.logkv('Average Delayed Costs', avg_delay_cost) logger.logkv('Iteration Time', itr_time) logger.logkv('Policy Rollout Time', rollout_time) logger.dumpkvs() plot_returns(returns) plot_returns(delay_cs, 'delayed_cost') # plot_contour(self.env, self.value_fun, save=True, fig=fig) # if contours and contours[0] is not None: # contours = list(upsample(np.array(contours), 10)) # clip = mpy.ImageSequenceClip(contours, fps=10) # clip.write_videofile('%s/contours_progress.mp4' % logger.get_dir()) if videos: fps = int(4 / getattr(self.env, 'dt', 0.1)) clip = mpy.ImageSequenceClip(videos, fps=fps) clip.write_videofile('%s/learning_progress.mp4' % logger.get_dir()) itr = self.max_itr average_return, avg_delay_cost, final_itr_video = rollout( self.env, self.policy, num_rollouts=2, render=True, iteration=itr, last_max_path_length=self.last_max_path_length, last_iteration=True) final_clip = mpy.ImageSequenceClip(final_itr_video, fps=40) final_clip.write_videofile('%s/final_rollout.mp4' % logger.get_dir()) plt.close()
def eval(variant): # num_data_traj = variant['num_data_trajectories'] num_data_traj = 50 env_name = variant['env_name'] data_trajectories = get_data() env = get_env_from_name(env_name) env_params = variant['env_params'] max_ep_steps = env_params['max_ep_steps'] policy_params = variant['alg_params'] s_dim = env.observation_space.shape[0] print("observation_space = ", s_dim) a_dim = env.action_space.shape[0] print("action space = ", a_dim) a_upperbound = env.action_space.high print("upper bound =", a_upperbound) a_lowerbound = env.action_space.low print("lower bound = ", a_lowerbound) policy = CAC(a_dim, s_dim, policy_params) ref_s = env.reference_state log_path = variant['log_path'] + '/eval/' + str(0) logger.configure(dir=log_path, format_strs=['csv']) policy.restore(variant['log_path'] + '/' + str(0) + '/policy') # Training setting t1 = time.time() PLOT_theta_1 = [] PLOT_ground_theta_1 = [] PLOT_theta_2 = [] PLOT_ground_theta_2 = [] state_storage = StateStorage() mst = [] agent_traj = [] ground_traj = [] reward_traj = [] for i in tqdm(range(num_data_traj)): if (i >= 10): break j = i * len(data_trajectories) // num_data_traj print(j) traj = data_trajectories[j] env.reset() cost = 0 # s = traj[0, 1] s = traj[0, -8:] # PLOT_state = np.array([s]) # s = np.array([s, traj[0, 2], traj[0, 4]]) s = np.array(list(s) + [traj[0, 2]] + list(traj[1, -8:])) # print("initial state : ", s) print("action here is : ", [traj[0, 5], traj[0, 6]]) env.state = s env.model.state = traj[0, -8:] # env.state = env.model.state ep_steps = len(traj) for j in range(1, ep_steps): # if j%100 == 0: # env.reset() # s = np.array(list(traj[j-1, -8:]) + [traj[j,2]] + list(traj[j,-8:])) # # s = traj[j-1,-8:] # env.state = s # env.model.state = traj[j-1, -8:] s = env.state # if agent_traj == []: # agent_traj = [s[0]] # else: # agent_traj = np.concatenate((agent_traj, [s[0]]),axis=0) # if ground_traj == []: # ground_traj = [traj[j-1,1]] # else: # ground_traj = np.concatenate((ground_traj, [traj[j-2,4]]),axis=0) # print(traj[j,1], s[2]) delta = np.zeros(s.shape) # ###### NOSIE ############## # noise = np.random.normal(0, 0.001, 16) # delta[20:]= noise # ###### BIAS ############## # noise = s[0:16]*0.005 # delta[0:16] = noise # store_s = s.copy() # store_s[2] = store_s[2] - store_s[0] # a = policy.choose_action(store_s + delta, True) a = policy.choose_action(s / ref_s + delta, True) # a = policy.choose_action(s + delta, True) # print(a) action = a_lowerbound + (a + 1.) 
* (a_upperbound - a_lowerbound) / 2 # print(action) s_, r, done, X_ = env.step(action, traj[j, 2], traj[j, 1]) # _, r, done , X_= env.step(action, True) # print(r) # The new s= current state,next omega, next state # s_ = np.array([X_[1,0], traj[j, 2], traj[j, 4]]) s_ = np.array(list(s_) + [traj[j + 1, 2]] + list(traj[j + 1, -8:])) # s_ = np.array([traj[j,1], traj[j,2], traj[j,4]]) r = modify_reward(r, s, s_, variant['reward_id']) # s_ = np.array([traj[j,1], traj[j,2], traj[j,4]]) if (j % 51 == 0): # print("X predicted ", X_, " and actual: ", traj[j-1, 4]) print("predicted action : ", action, ", reward : ", r) if agent_traj == []: # agent_traj = [s_[0]] agent_traj = [X_[1, 0]] else: # agent_traj = np.concatenate((agent_traj, [s_[0]]),axis=0) agent_traj = np.concatenate((agent_traj, [X_[1, 0]]), axis=0) if ground_traj == []: # ground_traj = [s[2]] ground_traj = [traj[j, 1]] else: # ground_traj = np.concatenate((ground_traj, [s[2]]),axis=0) ground_traj = np.concatenate((ground_traj, [traj[j, 1]]), axis=0) env.state = s_ theta = action PLOT_theta_1.append(theta[0]) PLOT_ground_theta_1.append(traj[j, 5]) PLOT_theta_2.append(theta[1]) PLOT_ground_theta_2.append(traj[j, 6]) mst.append(np.linalg.norm(traj[j, 5] - theta[0])) state_storage.update(predicted_state=s_[:8], original_state=s[-8:]) reward_traj.append(r) # PLOT_state = np.vstack((PLOT_state, np.array([X_[1,0]]))) logger.logkv('rewards', r) logger.logkv('timestep', j) logger.logkv('total-length', ep_steps) logger.logkv('state', s) logger.logkv('predicted-output', X_[1, 0]) logger.logkv('predicted-action', action) logger.logkv('actual-action', [traj[j, 5], traj[j, 6]]) logger.logkv('action-error', np.linalg.norm(traj[j, 5:7] - theta)) # logger.logkv('output-error', np.linalg.norm(s[0] - traj[j-1,1])) logger.dumpkvs() cost = cost + r if j == len(traj) - 2: done = True s = s_ if done: #print('episode:', i,'trajectory_number:',traj_num,'total_cost:',cost,'steps:',j-start_point) break x = np.linspace(0, np.shape(PLOT_ground_theta_1)[0] - 1, np.shape(PLOT_ground_theta_1)[0]) # plt.plot(x, PLOT_theta_1, color='blue', label='Tracking') # plt.plot(x, PLOT_ground_theta_1, color='black', linestyle='--', label='Ground truth') # plt.show() plt.style.use('seaborn') with h5py.File(variant['log_path'] + '/' + 'CAC_theta.h5', 'w') as hdf: hdf.create_dataset('Data', data=PLOT_theta_1) with h5py.File(variant['log_path'] + '/' + 'Normal_theta_ground.h5', 'w') as hdf: hdf.create_dataset('Data', data=PLOT_ground_theta_1) with h5py.File(variant['log_path'] + '/' + 'CAC_track.h5', 'w') as hdf: hdf.create_dataset('Data', data=agent_traj) with h5py.File(variant['log_path'] + '/' + 'GT_track.h5', 'w') as hdf: hdf.create_dataset('Data', data=ground_traj) fig = plt.figure() plt.plot(x, PLOT_theta_1, linestyle='--', color='blue', label='Tracking', marker='o', markersize=1) plt.plot(x, PLOT_ground_theta_1, color='orange', linestyle='--', label='Ground truth', marker='.', markersize=3) plt.ylim(2000, 8000) plt.xlabel('time') plt.ylabel('Qmax') plt.legend(loc="upper right", markerscale=3., scatterpoints=1, fontsize=10) plt.savefig(variant['log_path'] + '/action_tracking_1.jpg') fig = plt.figure() plt.plot(x, PLOT_theta_2, linestyle='--', color='blue', label='Tracking', marker='o', markersize=1) plt.plot(x, PLOT_ground_theta_2, color='orange', linestyle='--', label='Ground Truth', marker='.', markersize=3) plt.ylim(0.10, 0.20) plt.xlabel('time') plt.ylabel('Ro') plt.legend(loc="upper right", markerscale=3., scatterpoints=1, fontsize=10) plt.savefig(variant['log_path'] + 
'/action_tracking_2.jpg') fig = plt.figure() plt.plot(x, agent_traj, linestyle='--', color='blue', label='Tracking', marker='o', markersize=1) plt.plot(x, ground_traj, color='orange', linestyle='--', label='Ground Truth', marker='.', markersize=3) plt.xlabel('time') plt.ylabel('Voltage (V)') plt.legend(loc="upper right", markerscale=3., scatterpoints=1, fontsize=10) plt.savefig(variant['log_path'] + '/output_tracking.jpg') fig = plt.figure() plt.plot(np.array(reward_traj), np.square(agent_traj - ground_traj), linestyle='', marker='.', markersize=3) plt.scatter(np.array(reward_traj), np.square(agent_traj - ground_traj)) plt.xlabel("reward") plt.ylabel("error") plt.legend(loc="upper right", markerscale=3., scatterpoints=1, fontsize=10) plt.savefig(variant['log_path'] + '/reward_vs_error.jpg') state_storage.plot_states(outpath=variant['log_path']) return
def param_variation(variant): env_name = variant['env_name'] env = get_env_from_name(env_name) env_params = variant['env_params'] eval_params = variant['eval_params'] policy_params = variant['alg_params'] policy_params.update({ 's_bound': env.observation_space, 'a_bound': env.action_space, }) disturber_params = variant['disturber_params'] build_func = get_policy(variant['algorithm_name']) s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] d_dim = env_params['disturbance dim'] policy = build_func(a_dim, s_dim, d_dim, policy_params) # disturber = Disturber(d_dim, s_dim, disturber_params) param_variable = eval_params['param_variables'] grid_eval_param = eval_params['grid_eval_param'] length_of_pole, mass_of_pole, mass_of_cart, gravity = env.get_params() log_path = variant['log_path'] + '/eval' if eval_params['grid_eval']: param1 = grid_eval_param[0] param2 = grid_eval_param[1] log_path = log_path + '/' + param1 + '-' + param2 logger.configure(dir=log_path, format_strs=['csv']) logger.logkv('num_of_paths', variant['eval_params']['num_of_paths']) for var1 in param_variable[param1]: if param1 == 'length_of_pole': length_of_pole = var1 elif param1 == 'mass_of_pole': mass_of_pole = var1 elif param1 == 'mass_of_cart': mass_of_cart = var1 elif param1 == 'gravity': gravity = var1 for var2 in param_variable[param2]: if param2 == 'length_of_pole': length_of_pole = var2 elif param2 == 'mass_of_pole': mass_of_pole = var2 elif param2 == 'mass_of_cart': mass_of_cart = var2 elif param2 == 'gravity': gravity = var2 env.set_params(mass_of_pole=mass_of_pole, length=length_of_pole, mass_of_cart=mass_of_cart, gravity=gravity) diagnostic_dict, _ = evaluation(variant, env, policy) string_to_print = [ param1, ':', str(round(var1, 2)), '|', param2, ':', str(round(var2, 2)), '|' ] [ string_to_print.extend( [key, ':', str(round(diagnostic_dict[key], 2)), '|']) for key in diagnostic_dict.keys() ] print(''.join(string_to_print)) logger.logkv(param1, var1) logger.logkv(param2, var2) [ logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys() ] logger.dumpkvs() else: for param in param_variable.keys(): logger.configure(dir=log_path + '/' + param, format_strs=['csv']) logger.logkv('num_of_paths', variant['eval_params']['num_of_paths']) env.reset_params() for var in param_variable[param]: if param == 'length_of_pole': length_of_pole = var elif param == 'mass_of_pole': mass_of_pole = var elif param == 'mass_of_cart': mass_of_cart = var elif param == 'gravity': gravity = var env.set_params(mass_of_pole=mass_of_pole, length=length_of_pole, mass_of_cart=mass_of_cart, gravity=gravity) diagnostic_dict = evaluation(variant, env, policy) string_to_print = [param, ':', str(round(var, 2)), '|'] [ string_to_print.extend( [key, ':', str(round(diagnostic_dict[key], 2)), '|']) for key in diagnostic_dict.keys() ] print(''.join(string_to_print)) logger.logkv(param, var) [ logger.logkv(key, diagnostic_dict[key]) for key in diagnostic_dict.keys() ] logger.dumpkvs()
def train(log_dir): """Performs the agent traning. Args: log_dir (str): The directory in which the final model (policy) and the log data is saved. """ # Create environment env = get_env_from_name(ENV_NAME, ENV_SEED) # Set initial learning rates lr_a, lr_l = ( ALG_PARAMS["lr_a"], ALG_PARAMS["lr_l"], ) lr_a_now = ALG_PARAMS["lr_a"] # learning rate for actor, lambda and alpha lr_l_now = ALG_PARAMS["lr_l"] # learning rate for lyapunov critic # Get observation and action space dimension and limits from the environment s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] a_upperbound = env.action_space.high a_lowerbound = env.action_space.low # Create the Lyapunov Actor Critic agent policy = LAC(a_dim, s_dim, log_dir=log_dir) # Create replay memory buffer pool = Pool( s_dim=s_dim, a_dim=a_dim, store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"], memory_capacity=ALG_PARAMS["memory_capacity"], min_memory_size=ALG_PARAMS["min_memory_size"], ) # Training setting t1 = time.time() global_step = 0 tb_step = 0 last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"]) training_started = False # Create tensorboard variables tb_lr_a = tf.Variable(lr_a, dtype=tf.float32) tb_lr_l = tf.Variable(lr_l, dtype=tf.float32) tb_lr_lag = tf.Variable(lr_a, dtype=tf.float32) tb_ret = tf.Variable(0, dtype=tf.float32) tb_len = tf.Variable(0, dtype=tf.float32) tb_a_loss = tf.Variable(0, dtype=tf.float32) tb_lyapunov_error = tf.Variable(0, dtype=tf.float32) tb_entropy = tf.Variable(0, dtype=tf.float32) # Initialize tensorboard variables and create summaries if USE_TB: policy.sess.run( [ tb_lr_a.initializer, tb_lr_l.initializer, tb_lr_lag.initializer, tb_ret.initializer, tb_len.initializer, tb_a_loss.initializer, tb_lyapunov_error.initializer, tb_entropy.initializer, ] ) # Add tensorboard summaries main_sum = tf.compat.v1.summary.merge( [ tf.compat.v1.summary.scalar("lr_a", tb_lr_a), tf.compat.v1.summary.scalar("lr_l", tb_lr_l), tf.compat.v1.summary.scalar("lr_lag", tb_lr_lag), tf.compat.v1.summary.scalar("alpha", policy.alpha), tf.compat.v1.summary.scalar("lambda", policy.labda), ] ) other_sum = tf.compat.v1.summary.merge( [ tf.compat.v1.summary.scalar("ep_ret", tb_ret), tf.compat.v1.summary.scalar("ep_length", tb_len), tf.compat.v1.summary.scalar("a_loss", tb_a_loss), tf.compat.v1.summary.scalar("lyapunov_error", tb_lyapunov_error), tf.compat.v1.summary.scalar("entropy", tb_entropy), ] ) policy.tb_writer.add_summary( policy.sess.run(main_sum), policy.sess.run(policy.step) ) if WRITE_W_B: policy.tb_writer.add_summary( policy.sess.run(policy.w_b_sum), policy.sess.run(policy.step), ) policy.tb_writer.flush() # Above summaries are known from the start # Setup logger and log hyperparameters logger.configure(dir=log_dir, format_strs=["csv"]) logger.logkv("tau", ALG_PARAMS["tau"]) logger.logkv("alpha3", ALG_PARAMS["alpha3"]) logger.logkv("batch_size", ALG_PARAMS["batch_size"]) logger.logkv("target_entropy", policy.target_entropy) # Training loop for i in range(ENV_PARAMS["max_episodes"]): # Create variable to store information about the current path current_path = { "rewards": [], "a_loss": [], "alpha": [], "lambda": [], "lyapunov_error": [], "entropy": [], } # Stop training if max number of steps has been reached if global_step > ENV_PARAMS["max_global_steps"]: break # Reset environment s = env.reset() # Training Episode loop for j in range(ENV_PARAMS["max_ep_steps"]): # Render environment if requested if ENV_PARAMS["eval_render"]: env.render() # Retrieve (scaled) action based on the 
current policy a = policy.choose_action(s) # a = np.squeeze(np.random.uniform(low=-1.0, high=1.0, size=(1, 2))) # DEBUG action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2 # Perform action in env s_, r, done, _ = env.step(action) # Increment global step count if training_started: global_step += 1 # Stop episode if max_steps has been reached if j == ENV_PARAMS["max_ep_steps"] - 1: done = True terminal = 1.0 if done else 0.0 # Store experience in replay buffer pool.store(s, a, r, terminal, s_) # Increment tensorboard step counter # NOTE: This was done differently from the global_step counter since # otherwise there were inconsistencies in the tb log. if USE_TB: tb_step += 1 # Optimize weights and parameters using STG if ( pool.memory_pointer > ALG_PARAMS["min_memory_size"] and global_step % ALG_PARAMS["steps_per_cycle"] == 0 ): training_started = True # Perform STG a set number of times (train per cycle) for _ in range(ALG_PARAMS["train_per_cycle"]): batch = pool.sample(ALG_PARAMS["batch_size"]) labda, alpha, l_loss, entropy, a_loss = policy.learn( lr_a_now, lr_l_now, lr_a, batch ) # Save path results if training_started: current_path["rewards"].append(r) current_path["lyapunov_error"].append(l_loss) current_path["alpha"].append(alpha) current_path["lambda"].append(labda) current_path["entropy"].append(entropy) current_path["a_loss"].append(a_loss) # Evalute the current performance and log results if ( training_started and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0 and global_step > 0 ): logger.logkv("total_timesteps", global_step) training_diagnostics = evaluate_training_rollouts(last_training_paths) if training_diagnostics is not None: if TRAIN_PARAMS["num_of_evaluation_paths"] > 0: eval_diagnostics = training_evaluation(env, policy) [ logger.logkv(key, eval_diagnostics[key]) for key in eval_diagnostics.keys() ] training_diagnostics.pop("return") [ logger.logkv(key, training_diagnostics[key]) for key in training_diagnostics.keys() ] logger.logkv("lr_a", lr_a_now) logger.logkv("lr_l", lr_l_now) string_to_print = ["time_step:", str(global_step), "|"] if TRAIN_PARAMS["num_of_evaluation_paths"] > 0: [ string_to_print.extend( [key, ":", str(eval_diagnostics[key]), "|"] ) for key in eval_diagnostics.keys() ] [ string_to_print.extend( [key, ":", str(round(training_diagnostics[key], 2)), "|"] ) for key in training_diagnostics.keys() ] print("".join(string_to_print)) logger.dumpkvs() # Update state s = s_ # Decay learning rate if done: # Store paths if training_started: last_training_paths.appendleft(current_path) # Get current model performance for tb if USE_TB: training_diagnostics = evaluate_training_rollouts( last_training_paths ) # Log tb variables if USE_TB: if i % TB_FREQ == 0: # Update and log learning rate tb vars policy.sess.run(policy.step.assign(tb_step)) policy.sess.run(tb_lr_a.assign(lr_a_now)) policy.sess.run(tb_lr_l.assign(lr_l_now)) policy.sess.run(tb_lr_lag.assign(lr_a)) policy.tb_writer.add_summary( policy.sess.run(main_sum), policy.sess.run(policy.step) ) # Update and log other training vars to tensorboard if training_started: # Update and log training vars policy.sess.run( tb_ret.assign(training_diagnostics["return"]) ) policy.sess.run( tb_len.assign(training_diagnostics["length"]) ) policy.sess.run( tb_a_loss.assign(training_diagnostics["a_loss"]) ) policy.sess.run( tb_lyapunov_error.assign( training_diagnostics["lyapunov_error"] ) ) policy.sess.run( tb_entropy.assign(training_diagnostics["entropy"]) ) policy.tb_writer.add_summary( 
                                policy.sess.run(other_sum), policy.sess.run(policy.step)
                            )
                        # Log network weights
                        if WRITE_W_B:
                            policy.tb_writer.add_summary(
                                policy.sess.run(policy.w_b_sum),
                                policy.sess.run(policy.step),
                            )
                        policy.tb_writer.flush()

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda and alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print running time
    policy.save_result(log_dir)
    # policy.tb_writer.close()
    print("Running time: ", time.time() - t1)
    return
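# --- Illustrative sketch (not part of the original code) ---
# Two small standalone helpers mirroring pieces of the training loop above:
# the rescaling of a policy action from [-1, 1] to the environment's action
# bounds, and the linear learning-rate decay driven by the global step count.
# The function names are hypothetical.
import numpy as np


def rescale_action(a, a_lowerbound, a_upperbound):
    """Map an action in [-1, 1] to [a_lowerbound, a_upperbound]."""
    a = np.asarray(a, dtype=float)
    return a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2.0


def decayed_lr(lr_init, global_step, max_global_steps):
    """Linear decay as above: full rate at step 1, ~0 at max_global_steps."""
    frac = 1.0 - (global_step - 1.0) / max_global_steps
    return lr_init * frac


# Example: an action of 0.0 maps to the midpoint of the bounds, and the
# learning rate halves roughly halfway through training.
# rescale_action(0.0, np.array([-2.0]), np.array([2.0]))    # -> array([0.])
# decayed_lr(1e-4, global_step=5e5, max_global_steps=1e6)   # ~5e-5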
def dynamic(variant): env_name = variant['env_name'] env = get_env_from_name(env_name) eval_params = variant['eval_params'] policy_params = variant['alg_params'] build_func = get_policy(variant['algorithm_name']) if 'Fetch' in env_name or 'Hand' in env_name: s_dim = env.observation_space.spaces['observation'].shape[0] \ + env.observation_space.spaces['achieved_goal'].shape[0] + \ env.observation_space.spaces['desired_goal'].shape[0] else: s_dim = env.observation_space.shape[0] a_dim = env.action_space.shape[0] policy = build_func(a_dim, s_dim, policy_params) # disturber = Disturber(d_dim, s_dim, disturber_params) log_path = variant['log_path'] + '/eval/dynamic/' + eval_params[ 'additional_description'] variant['eval_params'].update({'magnitude': 0}) logger.configure(dir=log_path, format_strs=['csv']) _, paths = evaluation(variant, env, policy) max_len = 0 for path in paths['s']: path_length = len(path) if path_length > max_len: max_len = path_length average_path = np.average(np.array(paths['s']), axis=0) std_path = np.std(np.array(paths['s']), axis=0) for i in range(max_len): logger.logkv('average_path', average_path[i]) logger.logkv('std_path', std_path[i]) logger.logkv('reference', paths['reference'][0][i]) logger.dumpkvs() if eval_params['directly_show']: fig = plt.figure(figsize=(9, 6)) ax = fig.add_subplot(111) if eval_params['plot_average']: t = range(max_len) ax.plot(t, average_path, color='red') # if env_name =='cartpole_cost': # ax.fill_between(t, (average_path - std_path)[:, 0], (average_path + std_path)[:, 0], # color='red', alpha=.1) # else: ax.fill_between(t, average_path - std_path, average_path + std_path, color='red', alpha=.1) else: for path in paths['s']: path_length = len(path) t = range(path_length) path = np.array(path) # ax.plot(t, path) ax.plot(t, path, color='red') #MJS # ax.plot(t, path[:, 0], color='red') # ax.plot(t, path[:, 1], color='blue') # ax.plot(t, path[:,0],label='mRNA 1') # ax.plot(t, path[:, 1], label='mRNA 2') # ax.plot(t, path[:, 2], label='mRNA 3') # ax.plot(t, path[:, 3], label='Protein 1') # ax.plot(t, path[:, 4], label='Protein 2') # ax.plot(t, path[:, 5], label='Protein 3') #osscillator complicated # ax.plot(t, path[:, 0],label='mRNA 1') # ax.plot(t, path[:, 1], label='mRNA 2') # ax.plot(t, path[:, 2], label='mRNA 3') # ax.plot(t, path[:, 3], label='mRNA 4') # ax.plot(t, path[:, 4], label='Protein 1') # ax.plot(t, path[:, 5], label='Protein 2') # ax.plot(t, path[:, 6], label='Protein 3') # ax.plot(t, path[:, 7], label='Protein 4') if path_length > max_len: max_len = path_length # MJS # plt.ylim(-1000, 1000) # ax.plot(t, path[:, 0], color='red', label='s 1') # ax.plot(t, path[:, 1], color='blue', label='s 2') # cartpole # ax.plot(t, path, color='red', label='theta') # oscillator # ax.plot(t, path, color='red', label='Protein 1') # ax.plot(t, paths['reference'][0], color='blue', label='Reference') handles, labels = ax.get_legend_handles_labels() ax.legend(handles, labels, fontsize=20, loc=2, fancybox=False, shadow=False) # if 'reference' in paths.keys(): # for path in paths['reference']: # path_length = len(path) # if path_length == max_len: # t = range(path_length) # # ax.plot(t, path, color='brown',linestyle='dashed', label='refernce') # break # else: # continue # # handles, labels = ax.get_legend_handles_labels() # ax.legend(handles, labels, fontsize=20, loc=2, fancybox=False, shadow=False) plt.savefig(env_name + '-' + variant['algorithm_name'] + '-dynamic-state.pdf') plt.show() if 'c' in paths.keys(): fig = plt.figure(figsize=(9, 6)) ax = 
fig.add_subplot(111)
        for path in paths['c']:
            t = range(len(path))
            ax.plot(t, path)
        plt.savefig(env_name + '-' + variant['algorithm_name'] +
                    '-dynamic-cost.pdf')
        plt.show()

    if 'v' in paths.keys():
        fig = plt.figure(figsize=(9, 6))
        ax = fig.add_subplot(111)
        for path in paths['v']:
            t = range(len(path))
            ax.plot(t, path)
        plt.savefig(env_name + '-' + variant['algorithm_name'] +
                    '-dynamic-value.pdf')
        plt.show()
    return
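# --- Illustrative sketch (not part of the original code) ---
# The mean/std band plotted in dynamic() uses np.average over paths['s'],
# which implicitly assumes every rollout has the same length. A standalone
# sketch of one way to handle unequal-length paths: pad with NaN and use the
# nan-aware reductions. It assumes each path is a 1-D sequence of scalars, as
# in the single-state plots above; this is an assumption, not the project's
# implementation.
import numpy as np


def mean_std_over_paths(paths):
    """Per-time-step mean and std across paths of possibly unequal length."""
    max_len = max(len(p) for p in paths)
    padded = np.full((len(paths), max_len), np.nan)
    for i, p in enumerate(paths):
        padded[i, :len(p)] = np.asarray(p, dtype=float)
    return np.nanmean(padded, axis=0), np.nanstd(padded, axis=0)


# Example usage:
# mean, std = mean_std_over_paths([[0.0, 1.0, 2.0], [0.5, 1.5]])
# mean -> array([0.25, 1.25, 2.  ])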
def learn(seed, policy, env, nsteps, total_timesteps, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          nminibatches=4, noptepochs=4, cliprange=0.1, next_n=10,
          nslupdates=10, seq_len=10, ext_coef=1, int_coef=0.1, K=10):
    rng = np.random.RandomState(seed)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    loc_space = 2
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    nbatch_sl_train = nenvs * seq_len // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space,
                               loc_space=loc_space, ac_space=ac_space,
                               nbatch_act=nenvs, nbatch_train=nbatch_train,
                               nbatch_sl_train=nbatch_sl_train, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm, seq_len=seq_len,
                               seed=seed)
    model = make_model()
    replay_buffer = Buffer(max_size=1000, seed=seed)
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam,
                    next_n=next_n, seq_len=seq_len, int_coef=int_coef,
                    ext_coef=ext_coef, replay_buffer=replay_buffer, seed=seed)
    episode_raw_stats = EpisodeStats(nsteps, nenvs)
    episode_stats = EpisodeStats(nsteps, nenvs)
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    sl_acc = 0
    p = 0
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        p = update * nbatch / (total_timesteps * 0.875)
        nbatch_train = nbatch // nminibatches
        tstart = time.time()

        # Collect a batch of rollouts
        obs, locs, goals, raw_rewards, rewards, returns, masks, rnn_masks, \
            actions, values, neglogpacs, states = runner.run(K, p)
        episode_raw_stats.feed(raw_rewards, masks)
        episode_stats.feed(rewards, masks)

        # PPO updates: minibatches are whole environments so the recurrent
        # state in `states` stays aligned with its trajectory
        mblossvals = []
        assert nenvs % nminibatches == 0
        envsperbatch = nenvs // nminibatches
        envinds = np.arange(nenvs)
        flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
        envsperbatch = nbatch_train // nsteps
        for _ in range(noptepochs):
            rng.shuffle(envinds)
            for start in range(0, nenvs, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                mbflatinds = flatinds[mbenvinds].ravel()
                slices = (arr[mbflatinds] for arr in
                          (obs, locs, goals, returns, rnn_masks, actions,
                           values, neglogpacs))
                mbstates = states[mbenvinds]
                mblossvals.append(model.train(lr, cliprange, *slices, mbstates))

        # Supervised-learning updates from the replay buffer of past episodes
        if nslupdates > 0 and sl_acc < 0.75:
            sl_acc, sl_loss = sl_train(model, replay_buffer,
                                       nslupdates=nslupdates, seq_len=seq_len,
                                       nenvs=nenvs, envsperbatch=envsperbatch,
                                       lr=lr)
        elif nslupdates > 0:
            sl_acc, sl_loss = sl_train(model, replay_buffer, nslupdates=1,
                                       seq_len=seq_len, nenvs=nenvs,
                                       envsperbatch=envsperbatch, lr=lr)

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        logger.logkv("serial_timesteps", update * nsteps)
        logger.logkv("nupdates", update)
        logger.logkv("total_timesteps", update * nbatch)
        logger.logkv("fps", fps)
        logger.logkv('episode_raw_reward', episode_raw_stats.mean_reward())
        logger.logkv('imitation_episode_reward',
                     np.mean(runner.recent_imitation_rewards))
        logger.logkv('episode_reward', episode_stats.mean_reward())
        logger.logkv('episode_success_ratio',
                     np.mean(runner.recent_success_ratio))
        logger.logkv('time_elapsed', tnow - tfirststart)
        if nslupdates > 0:
            logger.logkv('sl_loss', sl_loss)
            logger.logkv('sl_acc', sl_acc)
        logger.logkv('replay_buffer_num', replay_buffer.num_episodes())
        logger.logkv('replay_buffer_best', replay_buffer.max_reward())
        if noptepochs > 0:
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
        logger.dumpkvs()
        print(logger.get_dir())

    env.close()
    return model
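# --- Illustrative sketch (not part of the original code) ---
# The PPO update above samples minibatches of *whole environments*: flatinds
# maps (env, step) to positions in the flattened rollout arrays, and each
# minibatch gathers every time step of a few environments together with their
# recurrent states. A standalone numpy sketch of that indexing, with made-up
# sizes.
import numpy as np

nenvs, nsteps, nminibatches = 8, 4, 4
envsperbatch = nenvs // nminibatches

# A flattened batch laid out env-major, as a stand-in for obs/returns/etc.
flat_batch = np.arange(nenvs * nsteps)
flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)

rng = np.random.RandomState(0)
envinds = np.arange(nenvs)
rng.shuffle(envinds)

for start in range(0, nenvs, envsperbatch):
    mbenvinds = envinds[start:start + envsperbatch]  # envs in this minibatch
    mbflatinds = flatinds[mbenvinds].ravel()         # all their time steps
    minibatch = flat_batch[mbflatinds]               # slice the rollout arrays
    # model.train(...) would receive these slices plus states[mbenvinds]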
def train(variant): Min_cost = 1000000 data_trajectories = get_data() # get data (X, W, X_, theta, state) env_name = variant['env_name'] # choose your environment env = get_env_from_name(env_name) num_data_traj = variant['num_data_trajectories'] reward_id = variant['reward_id'] env_params = variant['env_params'] max_episodes = env_params[ 'max_episodes'] # maximum episodes for RL training max_ep_steps = env_params[ 'max_ep_steps'] # number of maximum steps in each episode max_global_steps = env_params['max_global_steps'] store_last_n_paths = variant['store_last_n_paths'] evaluation_frequency = variant['evaluation_frequency'] policy_params = variant['alg_params'] min_memory_size = policy_params['min_memory_size'] steps_per_cycle = policy_params['steps_per_cycle'] train_per_cycle = policy_params['train_per_cycle'] batch_size = policy_params['batch_size'] lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[ 'lr_c'], policy_params['lr_l'] lr_a_now = lr_a # learning rate for actor lr_c_now = lr_c # learning rate for critic lr_l_now = lr_l # learning rate for lyapunov critic s_dim = env.observation_space.shape[ 0] # dimension of state (3 for Battery) a_dim = env.action_space.shape[0] # action space dimension (1 or 2) a_upperbound = env.action_space.high a_lowerbound = env.action_space.low policy = CAC(a_dim, s_dim, policy_params) policy.restore(variant['log_path'] + "/0/policy") pool_params = { 's_dim': s_dim, 'a_dim': a_dim, 'd_dim': 1, 'store_last_n_paths': store_last_n_paths, 'memory_capacity': policy_params['memory_capacity'], 'min_memory_size': policy_params['min_memory_size'], 'history_horizon': policy_params['history_horizon'], 'finite_horizon': policy_params['finite_horizon'] } if 'value_horizon' in policy_params.keys(): pool_params.update({'value_horizon': policy_params['value_horizon']}) else: pool_params['value_horizon'] = None pool = Pool(pool_params) # For analyse Render = env_params['eval_render'] ref_s = env.reference_state # Training setting t1 = time.time() global_step = 0 last_training_paths = deque(maxlen=store_last_n_paths) training_started = False log_path = variant['log_path'] logger.configure(dir=log_path, format_strs=['csv']) logger.logkv('tau', policy_params['tau']) logger.logkv('alpha3', policy_params['alpha3']) logger.logkv('batch_size', policy_params['batch_size']) logger.logkv('target_entropy', policy.target_entropy) for i in range(max_episodes): print("episode # ", i) print("global steps ", global_step) current_path = { 'rewards': [], 'distance': [], 'kl_divergence': [], 'a_loss': [], 'alpha': [], 'lyapunov_error': [], 'entropy': [], 'beta': [], 'action_distance': [], } if global_step > max_global_steps: break s = env.reset() # Random start point # traj_id = np.random.randint(0, len(data_trajectories)) traj_id = np.random.randint(0, num_data_traj) # traj_id = 1 traj = data_trajectories[traj_id] # print(len(traj)) if variant['traj_start'] == "random": start_point = np.random.randint(0, len(traj) - 2) else: start_point = int(variant['traj_start']) # s = traj[start_point, 1] s = traj[start_point, -8:] # current state, theta,next w, desired state # this is for decision making # 16,1,4,16 # s = np.array([s, traj[start_point, 2], traj[start_point, 4]]) # print(i, s) s = np.array( list(s) + [traj[start_point, 2]] + list(traj[start_point + 1, -8:])) # print(s) env.state = s env.model.state = traj[start_point, -8:] # env.state = env.model.state # ep_steps = len(traj) ep_steps = min(start_point + 1 + max_ep_steps, len(traj)) # print("selected traj = ", traj_id, " and 
length = ", len(traj), " starting = ", start_point, " ep_steps = ", ep_steps) for j in range(start_point + 1, ep_steps): if Render: env.render() s = env.state delta = np.zeros(s.shape) # ###### NOSIE ############## # noise = np.random.normal(0, 0.01, 0.01) # delta[2:]= noise # ########IF Noise env########## # s= s + delta # a = policy.choose_action(s) # ###### BIAS ############## # noise = s[0:16]*0.01 # delta[0:16] = noise # store_s = s.copy() # store_s[2] = store_s[2]-store_s[0] # a = policy.choose_action(store_s + delta) # print(s, delta) a = policy.choose_action(s / ref_s + delta) # print("a: ", a) action = a_lowerbound + (a + 1.) * (a_upperbound - a_lowerbound) / 2 # action = traj[j-1,16] # print("a normalize: " , action) a_upperbound = env.action_space.high a_lowerbound = env.action_space.low # Run in simulator s_, r, done, X_ = env.step(action, traj[j, 2], traj[j, 1]) # The new s= current state,next omega, next state s_ = np.array(list(s_) + [traj[j + 1, 2]] + list(traj[j + 1, -8:])) # s_ = np.array([X_[1][0], traj[j, 2], traj[j,4]]) # s_ = np.array([traj[j, 1], traj[j, 2], traj[j,4]]) r = modify_reward(r, s, s_, reward_id) # print(r) if global_step % 100 == 1: print("global step: ", global_step, " true action: ", [traj[j, 5], traj[j, 6]], " predicted action: ", action, " and reward : ", r) # print("new state is : ", s_) # s_ = np.concatenate([[s_], [theta]], axis=1)[0] # s_ = np.concatenate([X_,[[theta]], [traj[j, 9:]]], axis=1)[0] env.state = s_ # store_s_ = s_.copy() # store_s_[2] = store_s_[2] - store_s_[0] # theta_pre=theta if training_started: global_step += 1 if j == ep_steps - 2: done = True terminal = 1. if done else 0. if j > start_point + 2: pool.store(s / ref_s, a, np.zeros([1]), np.zeros([1]), r, terminal, s_ / ref_s, _s / ref_s) # pool.store(store_s, a, np.zeros([1]), np.zeros([1]), r, terminal, store_s_, store__s) # policy.store_transition(s, a, disturbance, r,0, terminal, s_) if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0: training_started = True # print("learning policy") for _ in range(train_per_cycle): batch = pool.sample(batch_size) labda, alpha, beta, l_loss, entropy, a_loss, beta, action_distance, kl, distance = policy.learn( lr_a_now, lr_c_now, lr_l_now, lr_a_now / 10, batch) if global_step % 2000 == 1: print("labda = ", labda, " | alpha = ", alpha, " | beta = ", beta, " | l_loss = ", l_loss, " | entropy = ", entropy, " | a_loss = ", a_loss, " | action_distance = ", action_distance) if training_started: current_path['rewards'].append(r) current_path['distance'].append(distance) current_path['kl_divergence'].append(kl) current_path['lyapunov_error'].append(l_loss) current_path['alpha'].append(alpha) current_path['entropy'].append(entropy) current_path['a_loss'].append(a_loss) current_path['beta'].append(beta) current_path['action_distance'].append(action_distance) if training_started and global_step % evaluation_frequency == 0 and global_step > 0: logger.logkv("total_timesteps", global_step) training_diagnotic = evaluate_training_rollouts( last_training_paths) # print(training_diagnotic) if training_diagnotic is not None: print("doing training evaluation") eval_diagnotic = training_evaluation(variant, env, policy) [ logger.logkv(key, eval_diagnotic[key]) for key in eval_diagnotic.keys() ] training_diagnotic.pop('return') [ logger.logkv(key, training_diagnotic[key]) for key in training_diagnotic.keys() ] logger.logkv('lr_a', lr_a_now) logger.logkv('lr_c', lr_c_now) logger.logkv('lr_l', lr_l_now) string_to_print = ['time_step:', 
                                       str(global_step), '|']
                    [
                        string_to_print.extend(
                            [key, ':', str(eval_diagnotic[key]), '|'])
                        for key in eval_diagnotic.keys()
                    ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))
                    logger.dumpkvs()

                    # Keep the checkpoint with the lowest average evaluation cost
                    if eval_diagnotic['test_return'] / eval_diagnotic[
                            'test_average_length'] <= Min_cost:
                        Min_cost = eval_diagnotic['test_return'] / eval_diagnotic[
                            'test_average_length']
                        print("New lowest cost:", Min_cost)
                        policy.save_result(log_path)
                    else:
                        print("Cost did not improve.")
                        print("avg cost was ",
                              eval_diagnotic['test_return'] /
                              eval_diagnotic['test_average_length'])
                        print("prev best cost is:", Min_cost)
                        # policy.save_result(log_path)

            if training_started and global_step % (
                    10 * evaluation_frequency) == 0 and global_step > 0:
                policy.save_result(log_path)

            # State update
            _s = s
            s = s_
            store__s = _s.copy()
            store__s[2] = store__s[2] - store__s[0]

            # Output training information and learning rate decay
            if done:
                # print("done at ", j)
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    policy.save_result(log_path)
    print('Running time: ', time.time() - t1)
    return
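# --- Illustrative sketch (not part of the original code) ---
# The checkpointing rule above keeps the policy with the lowest average
# evaluation cost, defined as test_return / test_average_length. A standalone
# sketch of that rule; `save_fn` stands in for policy.save_result(log_path)
# and the function name is hypothetical.
def maybe_save_best(eval_diagnostic, min_cost, save_fn):
    """Save when the per-step evaluation cost matches or beats the best so far."""
    avg_cost = (eval_diagnostic['test_return'] /
                eval_diagnostic['test_average_length'])
    if avg_cost <= min_cost:
        save_fn()
        return avg_cost   # new best cost
    return min_cost       # keep the previous best


# Example usage:
# Min_cost = float('inf')
# Min_cost = maybe_save_best({'test_return': 50.0, 'test_average_length': 100.0},
#                            Min_cost, save_fn=lambda: None)   # -> 0.5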