def train(self):
    xs = tf.placeholder(tf.float32, [None, 1])
    ys = tf.placeholder(tf.float32, [None, 1])
    # Build the network on the placeholders so the feed_dicts below actually drive it
    # (the original built it on self.X_input directly and left xs/ys unused).
    NN_out, params = self.net_model(xs)
    self.update_tf_wb(params, 'neulist')
    loss = tf.reduce_mean(
        tf.reduce_sum(tf.square(ys - NN_out), keepdims=True))
    reward = self.get_reward(loss)
    loss = self.choose_loss(loss)
    train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)

    # U.make_session() is used elsewhere in this project to build a tf.ConfigProto,
    # so it is passed as the session config here.
    sess = tf.Session(config=U.make_session())
    sess.run(tf.global_variables_initializer())

    for i in range(20):
        start = time.time()
        sess.run(train_step, feed_dict={xs: self.X_input, ys: self.Y_true})
        end = time.time()
        # Time taken by one training step.
        step_time = end - start
        # Clear out the weights and biases that should not be updated.
        self.update_tf_wb(params, neulist=None)
        # Push the local parameters to the shared buffer.
        self.push_local()
        # Pull the global parameters.
        self.pull_global()
        print('step time:%f' % step_time)
        if i % 50 == 0:  # with only 20 steps this prints the loss at i == 0 only
            print(sess.run(loss,
                           feed_dict={xs: self.X_input, ys: self.Y_true}))
    if 'need saving model':  # a non-empty string is always truthy, so the model is always saved
        self.save_modle()
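# The push_local / pull_global calls above rely on a shared parameter cache that is
# set up by the training script (gl._init() in the main block below). A minimal sketch
# of what such helpers typically do in this A3C-style setup -- an assumption for
# illustration, not this project's actual implementation; gl.set_value is a hypothetical
# accessor of the shared cache module:
def push_local_sketch(sess, local_vars, worker_name):
    # Store the current local parameter values in the shared cache, keyed by worker name.
    gl.set_value(worker_name, sess.run(local_vars))

def pull_global_sketch(sess, local_vars, global_values):
    # Overwrite the local variables with the global parameter values.
    # (Building assign ops on every call is shown for clarity, not efficiency.)
    assign_ops = [tf.assign(var, val) for var, val in zip(local_vars, global_values)]
    sess.run(assign_ops)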
def setup_step_model(self):
    assert issubclass(self.policy, MultiTaskActorCriticPolicy), \
        "Error: the input policy for the A2C model must be an " \
        "instance of MultiTaskActorCriticPolicy."
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf_utils.make_session(graph=self.graph)
        self.step_model = self.policy(self.sess, self.tasks,
                                      self.observation_space_dict,
                                      self.action_space_dict,
                                      self.n_envs_per_task,
                                      n_steps=1, reuse=False)
        self.trainable_variables = tf_utils.find_trainable_variables(
            "model")  # needed for loading the model
        self.step = self.step_model.step
        self.value = self.step_model.value
def setup_model(self):
    """
    Create all the functions and tensorflow graphs necessary to train the model
    """
    assert issubclass(self.policy, MetaLstmActorCriticPolicy), \
        "Error: the input policy for the A2C model must be an " \
        "instance of MetaLstmActorCriticPolicy."
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf_utils.make_session(graph=self.graph)
        # There is no separate step model because stepping uses the same batch (n_batch),
        # so a second model would be redundant.
        policy_model = self.policy(sess=self.sess,
                                   input_length=self.input_length,
                                   output_length=self.output_length,
                                   n_steps=self.n_steps,
                                   window_size=self.window_size,
                                   layers=self.layers,
                                   lstm_units=self.lstm_units)

        with tf.variable_scope("loss", reuse=False):
            self.actions_ph = policy_model.pdtype.sample_placeholder([self.n_steps],
                                                                     name="action_ph")
            self.advs_ph = tf.placeholder(tf.float32, [self.n_steps], name="advs_ph")
            self.rewards_ph = tf.placeholder(tf.float32, [self.n_steps], name="rewards_ph")
            self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

            neglogpac = policy_model.proba_distribution.neglogp(self.actions_ph)
            self.entropy = tf.reduce_mean(policy_model.proba_distribution.entropy())
            self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
            self.vf_loss = mse(tf.squeeze(policy_model.value_fn), self.rewards_ph)
            loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

            self.trainable_variables = tf_utils.find_trainable_variables("model")
            grads = tf.gradients(loss, self.trainable_variables)
            if self.max_grad_norm is not None:
                grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
            grads = list(zip(grads, self.trainable_variables))

        trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph,
                                            decay=self.alpha, epsilon=self.epsilon)
        self.apply_backprop = trainer.apply_gradients(grads)

        self.step = policy_model.step
        self.policy_model = policy_model
        self.value = self.policy_model.value

        tf.global_variables_initializer().run(session=self.sess)
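# `mse` is imported from the project's utilities and is not defined in this file.
# A minimal equivalent consistent with how it is used above (an assumption, not the
# project's actual helper):
def mse_sketch(pred, target):
    # Mean squared error between the predicted values and the target returns.
    return tf.reduce_mean(tf.square(pred - target))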
def main(args):
    ## Define environment
    if args.mesh is not None:
        change_env_to_use_correct_mesh(args.mesh)

    # env = gym.make(args.env)
    # env = NormalizeActionWrapper(env)
    # env = ImageEnv(env,
    #                imsize=64,
    #                normalize=True,
    #                init_camera=init_multiple_cameras,
    #                num_cameras=10,
    #                num_views=4,
    #                depth=True,
    #                cam_angles=True,
    #                reward_type="wrapped_env",
    #                flatten=False)

    # Dictionary of values to plot
    plotters = {'min_return': [],
                'max_return': [],
                'mean_return': [],
                'mean_final_success': []}

    ## Define expert
    expert_policy, env = load_expert.get_policy(args.checkpoint_path)

    ## Define policy network
    policy = XYZ_XYZ_Policy("dagger_xyz_xyz", env)

    ## Define DAGGER loss
    ob = tfu.get_placeholder(name="ob", dtype=tf.float32,
                             shape=[None, policy.obs_dim])
    act = tfu.get_placeholder(name="act", dtype=tf.float32,
                              shape=[None, policy.act_dim])
    loss = tf.reduce_mean(tf.squared_difference(policy.ac, act))
    opt = tf.train.AdamOptimizer().minimize(loss)

    # Start session
    session = tfu.make_session(num_cpu=8)
    session.__enter__()
    session.run(tf.global_variables_initializer())

    # Load expert policy
    pickle_path = os.path.join(args.checkpoint_path, 'checkpoint.pkl')
    with open(pickle_path, 'rb') as f:
        picklable = pickle.load(f)
    expert_policy.set_weights(picklable['policy_weights'])
    expert_policy.set_deterministic(True).__enter__()

    # Collect initial data
    if args.expert_data_path is None:
        data, _ = rollout(env,
                          args.num_rollouts,
                          args.max_path_length,
                          expert_policy)
        # np.save('expert_data_{}.npy'.format(args.env), data)
    else:
        data = np.load(args.expert_data_path, allow_pickle=True).item()
        roll, _ = rollout(env,
                          args.num_rollouts,
                          args.max_path_length,
                          expert_policy)
        exit()  # NOTE: debug leftover in the original; this aborts before the training loop runs

    ## Start training
    for i in tqdm.tqdm(range(args.num_iterations)):
        # print('\nIteration {} :'.format(i+1))

        # Parse dataset for supervised learning
        num_samples = data['state_observation'].shape[0]
        idx = np.arange(num_samples)
        np.random.shuffle(idx)
        for j in range(num_samples // args.mb_size):
            np.random.shuffle(idx)
            obs_train = policy.train_process_observation(data, idx[:args.mb_size])
            act_train = data['actions'][idx[:args.mb_size]]
            session.run(opt, feed_dict={ob: obs_train, act: act_train})

        # Perform rollouts
        roll, plot_data = rollout(env,
                                  args.num_rollouts,
                                  args.max_path_length,
                                  policy,
                                  expert_policy)
        data = append_paths(data, roll)
        for key in plotters.keys():
            plotters[key].append(plot_data[key])

    # Plotting results
    color_list = ["#363737"]
    plt.figure(figsize=(4, 4))
    plt.rcParams["axes.edgecolor"] = "0.15"
    plt.rcParams["axes.linewidth"] = 0.5
    plt.rcParams["font.sans-serif"] = "Helvetica"
    plt.rcParams["font.family"] = "sans-serif"
    plt.rcParams["ytick.labelsize"] = "medium"
    plt.rcParams["xtick.labelsize"] = "medium"
    plt.rcParams["font.size"] = 8.3
    for i, key in enumerate(plotters.keys()):
        ax = plt.subplot(2, 2, i + 1)
        plt.plot(range(args.num_iterations), plotters[key])
        plt.title(key)
    plt.tight_layout()
    plt.savefig('metrics.png', dpi=300)
    plt.close()
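# `append_paths` is imported from elsewhere in this project; its implementation is not
# shown here. A minimal sketch of the behaviour the calls above rely on (concatenating
# two rollout dictionaries key by key) -- an assumption, not the project's actual code:
def append_paths_sketch(old_data, new_data):
    # Concatenate each field of the new rollouts onto the aggregated dataset.
    return {key: np.concatenate([old_data[key], new_data[key]], axis=0)
            for key in old_data.keys()}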
def setup_train_model(self, transfer=False):
    with SetVerbosity(self.verbose):
        assert issubclass(self.policy, MultiTaskActorCriticPolicy), \
            "Error: the input policy for the A2C model must be an " \
            "instance of MultiTaskActorCriticPolicy."
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_utils.make_session(graph=self.graph)
            self.n_batch = self.n_envs_per_task * self.n_steps

            step_model = self.policy(self.sess, self.tasks,
                                     self.observation_space_dict,
                                     self.action_space_dict,
                                     self.n_envs_per_task,
                                     n_steps=1, reuse=False)

            with tf.variable_scope("train_model", reuse=True,
                                   custom_getter=tf_utils.outer_scope_getter("train_model")):
                train_model = self.policy(self.sess, self.tasks,
                                          self.observation_space_dict,
                                          self.action_space_dict,
                                          self.n_envs_per_task,
                                          self.n_steps, reuse=True)

            with tf.variable_scope("loss", reuse=False):
                self.actions_ph = tf.placeholder(dtype=tf.int32, shape=[None],
                                                 name="actions_ph")
                self.advs_ph = tf.placeholder(tf.float32, [None], name="advs_ph")  # advantages
                self.rewards_ph = tf.placeholder(tf.float32, [None], name="rewards_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [],
                                                       name="learning_rate_ph")

                neglogpac = {}
                losses = {}
                for task in self.tasks:
                    neglogpac[task] = train_model.proba_distribution_dict[task].neglogp(
                        self.actions_ph)
                    self.entropy[task] = tf.reduce_mean(
                        train_model.proba_distribution_dict[task].entropy())
                    self.pg_loss[task] = tf.reduce_mean(
                        self.advs_ph * neglogpac[task])  # policy gradient loss
                    self.vf_loss[task] = mse(
                        tf.squeeze(train_model.value_fn_dict[task]), self.rewards_ph)
                    losses[task] = self.pg_loss[task] - self.entropy[task] * self.ent_coef \
                        + self.vf_loss[task] * self.vf_coef

                    tf.summary.scalar(task + '_policy_gradient_loss', self.pg_loss[task])
                    tf.summary.scalar(task + '_value_function_loss', self.vf_loss[task])

            with tf.variable_scope("input_info", reuse=False):
                tf.summary.scalar('learning_rate', tf.reduce_mean(self.learning_rate_ph))

            optimizers = {}
            grads_and_vars = {}
            self.apply_backprop = {}
            for task in self.tasks:
                optimizers[task] = tf.train.RMSPropOptimizer(
                    learning_rate=self.learning_rate_ph,
                    decay=self.alpha, epsilon=self.epsilon)
                grads_and_vars[task] = optimizers[task].compute_gradients(losses[task])
                if self.max_grad_norm is not None:
                    grads = [grad for grad, var in grads_and_vars[task]]
                    variables = [var for grad, var in grads_and_vars[task]]
                    clipped_grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                    grads_and_vars[task] = list(zip(clipped_grads, variables))
                self.apply_backprop[task] = optimizers[task].apply_gradients(
                    grads_and_vars[task])

            self.train_model = train_model
            self.step_model = step_model
            self.step = step_model.step
            self.value = step_model.value
            self.trainable_variables = tf_utils.find_trainable_variables("model")
            tf.global_variables_initializer().run(session=self.sess)
            self.summary = tf.summary.merge_all()

        if not transfer:
            self.sess.graph.finalize()
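# A minimal usage sketch (an assumption, not the original training loop) of how the
# per-task placeholders and apply_backprop ops built above are typically driven for
# one task. The attribute name `obs_ph_dict` for the per-task observation placeholder
# is illustrative and may differ in the actual policy class.
def train_one_task_sketch(model, task, obs, actions, returns, values, learning_rate):
    advantages = returns - values  # standard A2C advantage estimate
    feed = {model.train_model.obs_ph_dict[task]: obs,   # assumed per-task obs placeholder
            model.actions_ph: actions,
            model.advs_ph: advantages,
            model.rewards_ph: returns,
            model.learning_rate_ph: learning_rate}
    _, summary = model.sess.run([model.apply_backprop[task], model.summary], feed)
    return summary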
def test(args):
    ## Define environment
    expert_list = ['mug1', 'mouse', 'mug2', 'headphones', 'ball', 'book', 'eyeglass']

    if args.mesh is not None:
        change_env_to_use_correct_mesh(args.mesh)

    # Dictionary of values to plot
    plotters = {'min_return': [],
                'max_return': [],
                'mean_return': [],
                'mean_final_success': []}

    # Create environment
    _, env = load_expert.get_policy(args.checkpoint_path)

    ## Define policy network
    policy = Tensor_XYZ_Policy("dagger_tensor_xyz", env)

    # Start session
    session = tfu.make_session(num_cpu=40)
    session.__enter__()
    policy.map3D.finalize_graph()

    checkpoint_path = "/home/robertmu/DAGGER_discovery/checkpoints/test7obj"
    # saver = tf.train.import_meta_graph(checkpoint_path + "/minuet.model-0" + ".meta")
    ckpt = tf.train.get_checkpoint_state(checkpoint_path)
    saver = tf.train.Saver()
    if ckpt and ckpt.model_checkpoint_path:
        ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
        print("...found %s " % ckpt.model_checkpoint_path)
        saver.restore(session, os.path.join(checkpoint_path, ckpt_name))
    else:
        print("...ain't no full checkpoint here!")

    # Rollout policy
    for mesh in expert_list:
        print('testing {}'.format(mesh))
        change_env_to_use_correct_mesh(mesh)
        checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh) + '48/checkpoint_1400/'
        _, env = load_expert.get_policy(checkpoint_path)

        _, stats = rollout(env,
                           args.test_num_rollouts,
                           args.max_path_length,
                           policy,
                           mesh=mesh)

        for key, value in stats.items():
            print("{} : {}".format(key, value))

        for key in plotters.keys():
            plotters[key].append(stats[key])

    plott = {'min_return': np.min(plotters['min_return']),
             'max_return': np.max(plotters['max_return']),
             'mean_return': np.mean(plotters['mean_return']),
             'mean_final_success': np.mean(plotters['mean_final_success'])}
    for key, value in plott.items():
        print("{} : {}".format(key, value))

    session.close()
def main(args):
    ## Define environment
    expert_list = ['mug1', 'mouse', 'mug2', 'headphones', 'ball', 'eyeglass',
                   'coffee_mug', 'car3', 'boat2']

    if args.mesh is not None:
        change_env_to_use_correct_mesh(args.mesh)

    # env = gym.make(args.env)
    # env = NormalizeActionWrapper(env)
    # env = ImageEnv(env,
    #                imsize=64,
    #                normalize=True,
    #                init_camera=init_multiple_cameras,
    #                num_cameras=10,
    #                num_views=4,
    #                depth=True,
    #                cam_angles=True,
    #                reward_type="wrapped_env",
    #                flatten=False)

    # Dictionary of values to plot
    plotters = {'min_return': [],
                'max_return': [],
                'mean_return': [],
                'mean_final_success': []}

    name = "test9obj"
    log_dir_ = os.path.join("logs_mujoco_offline", name)
    checkpoint_dir_ = os.path.join("checkpoints", name)
    set_writer = tf.summary.FileWriter(log_dir_ + '/train', None)

    ## Define expert
    expert_policy, env = load_expert.get_policy(args.checkpoint_path)

    ## Define policy network
    policy = Tensor_XYZ_Policy("dagger_tensor_xyz", env)

    ## Define DAGGER loss
    # goal_obs = tfu.get_placeholder(name="goal_obs",
    #                                dtype=tf.float32,
    #                                shape=[None, policy.state_obs_dim + policy.state_desired_dim])
    # crop = tfu.get_placeholder(name="crop",
    #                            dtype=tf.float32,
    #                            shape=[None, 16, 16, 8, 32])
    act = tfu.get_placeholder(name="act",
                              dtype=tf.float32,
                              shape=[None, policy.act_dim])
    min_return = tfu.get_placeholder(dtype=tf.float32, shape=None, name="min_return")
    max_return = tfu.get_placeholder(dtype=tf.float32, shape=None, name="max_return")
    mean_return = tfu.get_placeholder(dtype=tf.float32, shape=None, name="mean_return")
    mean_final_success = tfu.get_placeholder(dtype=tf.float32, shape=None,
                                             name="mean_final_success")

    step = tf.Variable(0, trainable=False)
    # lr    0.002  0.001
    # decay 0.96   0.8
    lr = tf.train.exponential_decay(learning_rate=0.001,
                                    global_step=step,
                                    decay_steps=20000,
                                    decay_rate=0.75,
                                    staircase=True)

    # Exclude the map3D network from gradient computation
    freeze_patterns = []
    freeze_patterns.append("feat")

    loss = tf.reduce_mean(tf.squared_difference(policy.ac, act))
    train_vars = tf.contrib.framework.filter_variables(tf.trainable_variables(),
                                                       exclude_patterns=freeze_patterns)
    opt = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss,
                                                            var_list=train_vars,
                                                            global_step=step)

    # Start session
    session = tfu.make_session(num_cpu=40)
    session.__enter__()

    # Load map3D network
    freeze_list = tf.contrib.framework.filter_variables(tf.trainable_variables(),
                                                        include_patterns=freeze_patterns)
    policy.map3D.finalize_graph()

    # Keep the policy summaries separate from the map_3d summary
    loss_op = tf.summary.scalar('loss', loss)
    with tf.variable_scope("policy_perf"):
        min_return_op = tf.summary.scalar('min_return', min_return)
        max_return_op = tf.summary.scalar('max_return', max_return)
        mean_return_op = tf.summary.scalar('mean_return', mean_return)
        mean_final_success_op = tf.summary.scalar('mean_final_success', mean_final_success)

    saver = tf.train.Saver()

    # Collect initial expert data for every mesh
    init = True
    for mesh in expert_list:
        print('generating {} data'.format(mesh))
        change_env_to_use_correct_mesh(mesh)

        ## Define expert
        checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh) + '48/checkpoint_1350/'
        if mesh == 'mug2':
            checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh) + '48/checkpoint_1200/'
        expert_policy, env = load_expert.get_policy(checkpoint_path)

        # Load expert policy
        pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
        with open(pickle_path, 'rb') as f:
            picklable = pickle.load(f)
        expert_policy.set_weights(picklable['policy_weights'])

        with expert_policy.set_deterministic(True):
            # Collect initial data
            if init is True:
                data, _ = rollout(env,
                                  args.num_rollouts,
                                  args.max_path_length,
                                  expert_policy,
                                  mesh=mesh)
                np.save('expert_data_{}.npy'.format(args.env), data)
                init = False
            else:
                roll, _ = rollout(env,
                                  args.num_rollouts,
                                  args.max_path_length,
                                  expert_policy,
                                  mesh=mesh)
                data = append_paths(data, roll)

    ## Start training
    global_step = 0
    for i in tqdm.tqdm(range(args.num_iterations)):
        plotters = {'min_return': [],
                    'max_return': [],
                    'mean_return': [],
                    'mean_final_success': []}

        # Parse dataset for supervised learning
        num_samples = data['state_observation'].shape[0]
        print('num_samples', num_samples)
        idx = np.arange(num_samples)
        np.random.shuffle(idx)

        for j in range(num_samples // args.mb_size):
            np.random.shuffle(idx)
            feed = policy.train_process_observation(data, idx[:args.mb_size], env)
            act_train = data['actions'][idx[:args.mb_size]]
            feed.update({act: act_train})
            # The evaluated summary was originally assigned back to `loss`, shadowing the
            # loss tensor; it is renamed here for clarity.
            loss_summary, _ = session.run([loss_op, opt], feed_dict=feed)

            log_this = np.mod(global_step, 500) == 0
            if log_this:
                results = session.run(policy.map3D.summary, feed)
                set_writer.add_summary(results, global_step)
            set_writer.add_summary(loss_summary, global_step=global_step)
            global_step = global_step + 1

        # Perform rollouts
        for mesh in expert_list:
            print('generating {} dagger data'.format(mesh))
            change_env_to_use_correct_mesh(mesh)

            ## Define expert
            checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh) + '48/checkpoint_1350/'
            if mesh == 'mug2':
                checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh) + '48/checkpoint_1200/'
            expert_policy, env = load_expert.get_policy(checkpoint_path)

            # Load expert policy
            pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
            with open(pickle_path, 'rb') as f:
                picklable = pickle.load(f)
            expert_policy.set_weights(picklable['policy_weights'])

            with expert_policy.set_deterministic(True):
                # Collect on-policy data relabelled by the expert
                roll, plot_data = rollout(env,
                                          args.num_rollouts,
                                          args.max_path_length,
                                          policy,
                                          expert_policy,
                                          mesh=mesh)
            # import ipdb; ipdb.set_trace()
            data = append_paths(data, roll)
            for key in plotters.keys():
                plotters[key].append(plot_data[key])

        minro, maxro, meanro, meanfo = session.run(
            [min_return_op, max_return_op, mean_return_op, mean_final_success_op],
            feed_dict={min_return: np.min(plotters['min_return']),
                       max_return: np.max(plotters['max_return']),
                       mean_return: np.mean(plotters['mean_return']),
                       mean_final_success: np.mean(plotters['mean_final_success'])})
        set_writer.add_summary(minro, global_step=global_step)
        set_writer.add_summary(maxro, global_step=global_step)
        set_writer.add_summary(meanro, global_step=global_step)
        set_writer.add_summary(meanfo, global_step=global_step)
        # for key in plotters.keys(): plotters[key].append(plot_data[key])

        if (i + 1) % args.checkpoint_freq == 0:
            savemodel(saver, session, checkpoint_dir_, i + 1)

    plotting_data(plotters)
    session.__exit__(None, None, None)  # tf.Session.__exit__ requires the exception triple
    session.close()
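# `savemodel` and `plotting_data` are project helpers that are not shown in this file.
# A minimal sketch of what the `savemodel` call above presumably does (an assumption,
# not the project's actual helper):
def savemodel_sketch(saver, sess, checkpoint_dir, step):
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # Writes e.g. <checkpoint_dir>/model-<step>.{index,meta,data}
    saver.save(sess, os.path.join(checkpoint_dir, 'model'), global_step=step)
    print('Saved checkpoint for step {} to {}'.format(step, checkpoint_dir))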
# Main entry point
if __name__ == "__main__":
    # Initialize global variables and create the cache that stores the global training information
    gl._init()
    # Load the training data
    X, Y = load_data()
    # Parse command-line arguments
    arglist = parse_args()
    # Configure the number of CPU cores to use, or use the GPU
    sess = tf.Session(config=U.make_session())
    with tf.device("/cpu:0"):
        trainers = []
        # Create the workers
        for i in range(Y.shape[0]):
            i_name = 'w_%i' % i
            trainers.append(T.trainer(i_name, arglist, X, Y))
    # Create a thread coordinator
    COORD = tf.train.Coordinator()
    # Launch each worker to start training
    trainer_threads = []
    for trainer in trainers:
        # Bind the current trainer; a bare `lambda: trainer.train()` would capture only the last one.
        job = lambda trainer=trainer: trainer.train()
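        # The snippet is truncated here. A typical continuation for this
        # Coordinator/worker pattern (an assumption based on the variables set up above,
        # not part of the original file; requires `import threading` at the top):
        t = threading.Thread(target=job)
        t.start()
        trainer_threads.append(t)
    # Wait for all worker threads to finish.
    COORD.join(trainer_threads)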
def test(env, num_rollouts, path_length):
    tf.reset_default_graph()
    session1 = tfu.make_session(num_cpu=40)
    session1.__enter__()
    session1.run(tf.global_variables_initializer())

    checkpoint_path = "/home/robertmu/DAGGER_discovery/checkpoints/dagger_tensor_xyz02"
    saver = tf.train.import_meta_graph(checkpoint_path + "/minuet.model-0" + ".meta")
    # print("i am reloading", tf.train.latest_checkpoint(checkpoint_path))
    saver.restore(session1, tf.train.latest_checkpoint(checkpoint_path))

    env_keys = env.observation_space.spaces.keys()
    observation_converter = lambda x: x

    paths = []
    rewards = []
    count_infos = []
    while len(paths) < num_rollouts:
        # Debug breakpoint left in the original code.
        import ipdb
        ipdb.set_trace()

        t = 0
        path = {key: [] for key in env_keys}
        images = []
        infos = []
        observations = []
        actions = []
        terminals = []
        observation = env.reset()
        R = 0

        for t in range(path_length):
            observation = observation_converter(observation)
            ob = observation
            ob = {key: np.repeat(np.expand_dims(ob[key], axis=0), 8, axis=0)
                  for key in ob.keys()}
            puck_z = env._env.env.init_puck_z + \
                env._env.env.sim.model.geom_pos[env._env.env.sim.model.geom_name2id('puckbox')][-1]
            batch_dict = get_inputs(ob, puck_z)
            goal_obs_train = np.hstack([ob['state_desired_goal'], ob['state_observation']])

            # NOTE: `policy`, `goal_obs` and `session` are not defined in this function;
            # they are assumed to be module-level globals set up by the training script.
            feed = {}
            feed.update({policy.rgb_camXs: batch_dict['rgb_camXs']})
            feed.update({policy.xyz_camXs: batch_dict['xyz_camXs']})
            feed.update({policy.pix_T_cams: batch_dict['pix_T_cams']})
            feed.update({policy.origin_T_camRs: batch_dict['origin_T_camRs']})
            feed.update({policy.origin_T_camXs: batch_dict['origin_T_camXs']})
            feed.update({policy.puck_xyz_camRs: batch_dict['puck_xyz_camRs']})
            feed.update({goal_obs: goal_obs_train})

            action = session.run([policy.ac], feed_dict=feed)
            observation, reward, terminal, info = env.step(action)

            for key in env_keys:
                path[key].append(observation[key])
            actions.append(action)
            terminals.append(terminal)
            infos.append(info)
            R += reward
            if terminal:
                break

        assert len(infos) == t + 1
        path = {key: np.stack(path[key], axis=0) for key in env_keys}
        path['actions'] = np.stack(actions, axis=0)
        path['terminals'] = np.stack(terminals, axis=0)

        if isinstance(policy, GaussianPolicy) and len(path['terminals']) >= path_length:
            continue
        elif not isinstance(policy, GaussianPolicy) and len(path['terminals']) == 1:
            continue

        rewards.append(R)
        count_infos.append(infos[-1]['puck_success'])
        paths.append(path)

    return _clean_paths(paths), return_stats(rewards, count_infos)
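# `_clean_paths` and `return_stats` are project helpers not shown in this file. A
# minimal sketch of `return_stats` consistent with the keys consumed elsewhere in these
# scripts ('min_return', 'max_return', 'mean_return', 'mean_final_success') -- an
# assumption, not the actual implementation:
def return_stats_sketch(rewards, count_infos):
    return {'min_return': np.min(rewards),
            'max_return': np.max(rewards),
            'mean_return': np.mean(rewards),
            'mean_final_success': np.mean(count_infos)}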