Example #1
    def train(self):
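        # Build the network on self.X_input, define a mean-squared-error loss and a
        # plain gradient-descent step, then alternate local updates with pushes/pulls
        # of the shared global parameters.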
        xs = tf.placeholder(tf.float32, [None, 1])
        ys = tf.placeholder(tf.float32, [None, 1])
        NN_out, params = self.net_model(self.X_input)
        self.update_tf_wb(params, 'neulist')
        loss = tf.reduce_mean(
            tf.reduce_sum(tf.square(self.Y_true - NN_out), keepdims=True))
        reward = self.get_reward(loss)
        loss = self.choose_loss(loss)
        train_step = tf.train.GradientDescentOptimizer(0.001).minimize(loss)

        sess = tf.Session(config=U.make_session())
        sess.run(tf.global_variables_initializer())

        for i in range(20):
            start = time.time()
            sess.run(train_step, feed_dict={xs: self.X_input, ys: self.Y_true})
            end = time.time()

            # Time a single training step
            step_time = end - start

            # Clear the weights and biases that should not be updated
            self.update_tf_wb(params, neulist=None)

            # Push the local parameters to the shared buffer
            self.push_local()

            # Pull the global parameters
            self.pull_global()

            print('step time:%f' % step_time)

            if i % 50 == 0:
                print(
                    sess.run(loss,
                             feed_dict={
                                 xs: self.X_input,
                                 ys: self.Y_true
                             }))

            # The string literal is always truthy, so this acts as a placeholder
            # for a real "time to save the model" condition.
            if 'need saving model':
                self.save_modle()
Example #2
    def setup_step_model(self):
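        # Build a single-step (n_steps=1) policy used only for acting, and expose its
        # step/value functions along with the model's trainable variables.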
        assert issubclass(self.policy, MultiTaskActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                    "instance of MultiTaskActorCriticPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_utils.make_session(graph=self.graph)

            self.step_model = self.policy(self.sess,
                                          self.tasks,
                                          self.observation_space_dict,
                                          self.action_space_dict,
                                          self.n_envs_per_task,
                                          n_steps=1,
                                          reuse=False)

            self.trainable_variables = tf_utils.find_trainable_variables(
                "model")  # needed for loading the model
            self.step = self.step_model.step
            self.value = self.step_model.value
Example #3
    def setup_model(self):
        """
        Create all the functions and tensorflow graphs necessary to train the model
        """

        assert issubclass(self.policy, MetaLstmActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                         "instance of MetaLstmActorCriticPolicy."

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_utils.make_session(graph=self.graph)

            # There is no separate step model because the step size (n_batch) is the same,
            # so a second model would be redundant.
            policy_model = self.policy(sess=self.sess, input_length=self.input_length, output_length=self.output_length, n_steps=self.n_steps,
                                       window_size=self.window_size, layers=self.layers, lstm_units=self.lstm_units)

            with tf.variable_scope("loss", reuse=False):
                self.actions_ph = policy_model.pdtype.sample_placeholder([self.n_steps], name="action_ph")
                self.advs_ph = tf.placeholder(tf.float32, [self.n_steps], name="advs_ph")
                self.rewards_ph = tf.placeholder(tf.float32, [self.n_steps], name="rewards_ph")
                self.learning_rate_ph = tf.placeholder(tf.float32, [], name="learning_rate_ph")

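                # A2C-style loss: a policy-gradient term weighted by the advantages,
                # an entropy bonus, and a value-function MSE term, combined via
                # ent_coef and vf_coef.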
                neglogpac = policy_model.proba_distribution.neglogp(self.actions_ph)
                self.entropy = tf.reduce_mean(policy_model.proba_distribution.entropy())
                self.pg_loss = tf.reduce_mean(self.advs_ph * neglogpac)
                self.vf_loss = mse(tf.squeeze(policy_model.value_fn), self.rewards_ph)
                loss = self.pg_loss - self.entropy * self.ent_coef + self.vf_loss * self.vf_coef

                self.trainable_variables = tf_utils.find_trainable_variables("model")
                grads = tf.gradients(loss, self.trainable_variables)
                if self.max_grad_norm is not None:
                    grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
                grads = list(zip(grads, self.trainable_variables))

            trainer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_ph, decay=self.alpha,
                                                epsilon=self.epsilon)
            self.apply_backprop = trainer.apply_gradients(grads)
            self.step = policy_model.step
            self.policy_model = policy_model
            self.value = self.policy_model.value
            tf.global_variables_initializer().run(session=self.sess)
Example #4
def main(args):
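    # DAgger-style training: load an expert policy, fit the learner's actions to the
    # expert's with an MSE loss, and repeatedly aggregate fresh rollouts into the dataset.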

    ## Define environment
    if args.mesh is not None: change_env_to_use_correct_mesh(args.mesh)

    # env = gym.make(args.env)
    # env = NormalizeActionWrapper(env)
    # env = ImageEnv(env,
    # 		 imsize=64,
    # 		 normalize=True,
    # 		 init_camera=init_multiple_cameras,
    # 		 num_cameras=10,
    # 		 num_views=4,
    # 		 depth=True,
    # 		 cam_angles=True,
    # 		 reward_type="wrapped_env",
    # 		 flatten=False)

    # Dictionary of values to plot
    plotters = {
        'min_return': [],
        'max_return': [],
        'mean_return': [],
        'mean_final_success': []
    }

    ## Define expert
    expert_policy, env = load_expert.get_policy(args.checkpoint_path)

    ## Define policy network
    policy = XYZ_XYZ_Policy("dagger_xyz_xyz", env)

    ## Define DAGGER loss
    ob = tfu.get_placeholder(name="ob",
                             dtype=tf.float32,
                             shape=[None, policy.obs_dim])
    act = tfu.get_placeholder(name="act",
                              dtype=tf.float32,
                              shape=[None, policy.act_dim])
    loss = tf.reduce_mean(tf.squared_difference(policy.ac, act))
    opt = tf.train.AdamOptimizer().minimize(loss)

    # Start session
    session = tfu.make_session(num_cpu=8)
    session.__enter__()
    session.run(tf.global_variables_initializer())

    # Load expert policy
    pickle_path = os.path.join(args.checkpoint_path, 'checkpoint.pkl')
    with open(pickle_path, 'rb') as f:
        picklable = pickle.load(f)

    expert_policy.set_weights(picklable['policy_weights'])
    expert_policy.set_deterministic(True).__enter__()

    # Collect initial data
    if args.expert_data_path is None:
        data, _ = rollout(env, args.num_rollouts, args.max_path_length,
                          expert_policy)
        # np.save('expert_data_{}.npy'.format(args.env), data)
    else:
        data = np.load(args.expert_data_path, allow_pickle=True).item()
        roll, _ = rollout(env, args.num_rollouts, args.max_path_length,
                          expert_policy)
        data = append_paths(data, roll)
    ## Start training

    # Start for loop
    for i in tqdm.tqdm(range(args.num_iterations)):
        # print('\nIteration {} :'.format(i+1))
        # Parse dataset for supervised learning
        num_samples = data['state_observation'].shape[0]
        idx = np.arange(num_samples)
        np.random.shuffle(idx)
        for j in range(num_samples // args.mb_size):
            np.random.shuffle(idx)
            obs_train = policy.train_process_observation(
                data, idx[:args.mb_size])
            act_train = data['actions'][idx[:args.mb_size]]
            session.run(opt, feed_dict={ob: obs_train, act: act_train})
        # Perform rollouts
        roll, plot_data = rollout(env, args.num_rollouts, args.max_path_length,
                                  policy, expert_policy)
        data = append_paths(data, roll)
        for key in plotters.keys():
            plotters[key].append(plot_data[key])

    # Plotting results
    color_list = ["#363737"]
    plt.figure(figsize=(4, 4))
    plt.rcParams["axes.edgecolor"] = "0.15"
    plt.rcParams["axes.linewidth"] = 0.5
    plt.rcParams["font.sans-serif"] = "Helvetica"
    plt.rcParams["font.family"] = "sans-serif"
    plt.rcParams["ytick.labelsize"] = "medium"
    plt.rcParams["xtick.labelsize"] = "medium"
    plt.rcParams["font.size"] = 8.3
    for i, key in enumerate(plotters.keys()):
        ax = plt.subplot(2, 2, i + 1)
        plt.plot(range(args.num_iterations), plotters[key])
        plt.title(key)
    plt.tight_layout()
    plt.savefig('metrics.png', dpi=300)
    plt.close()
Example #5
    def setup_train_model(self, transfer=False):
        with SetVerbosity(self.verbose):
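            # Build a 1-step acting model and an n_steps training model that share weights,
            # then define a per-task A2C loss and a per-task RMSProp optimizer.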

            assert issubclass(self.policy, MultiTaskActorCriticPolicy), "Error: the input policy for the A2C model must be an " \
                                                                        "instance of MultiTaskActorCriticPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_utils.make_session(graph=self.graph)

                self.n_batch = self.n_envs_per_task * self.n_steps

                step_model = self.policy(self.sess,
                                         self.tasks,
                                         self.observation_space_dict,
                                         self.action_space_dict,
                                         self.n_envs_per_task,
                                         n_steps=1,
                                         reuse=False)

                with tf.variable_scope(
                        "train_model",
                        reuse=True,
                        custom_getter=tf_utils.outer_scope_getter(
                            "train_model")):
                    train_model = self.policy(self.sess,
                                              self.tasks,
                                              self.observation_space_dict,
                                              self.action_space_dict,
                                              self.n_envs_per_task,
                                              self.n_steps,
                                              reuse=True)

                with tf.variable_scope("loss", reuse=False):
                    self.actions_ph = tf.placeholder(dtype=tf.int32,
                                                     shape=[None],
                                                     name="actions_ph")
                    self.advs_ph = tf.placeholder(tf.float32, [None],
                                                  name="advs_ph")  # advantages
                    self.rewards_ph = tf.placeholder(tf.float32, [None],
                                                     name="rewards_ph")
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                    neglogpac = {}
                    losses = {}
                    for task in self.tasks:
                        neglogpac[task] = train_model.proba_distribution_dict[task].neglogp(self.actions_ph)
                        self.entropy[task] = tf.reduce_mean(train_model.proba_distribution_dict[task].entropy())
                        self.pg_loss[task] = tf.reduce_mean(self.advs_ph * neglogpac[task])  # policy gradient loss
                        self.vf_loss[task] = mse(tf.squeeze(train_model.value_fn_dict[task]), self.rewards_ph)
                        losses[task] = self.pg_loss[task] - self.entropy[task] * self.ent_coef \
                            + self.vf_loss[task] * self.vf_coef

                        tf.summary.scalar(task + '_policy_gradient_loss', self.pg_loss[task])
                        tf.summary.scalar(task + '_value_function_loss', self.vf_loss[task])

                with tf.variable_scope("input_info", reuse=False):
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                optimizers = {}
                grads_and_vars = {}
                self.apply_backprop = {}
                for task in self.tasks:
                    optimizers[task] = tf.train.RMSPropOptimizer(
                        learning_rate=self.learning_rate_ph,
                        decay=self.alpha,
                        epsilon=self.epsilon)
                    grads_and_vars[task] = optimizers[task].compute_gradients(
                        losses[task])
                    if self.max_grad_norm is not None:
                        grads = [grad for grad, var in grads_and_vars[task]]
                        vars = [var for grad, var in grads_and_vars[task]]
                        clipped_grads, _ = tf.clip_by_global_norm(
                            grads, self.max_grad_norm)
                        grads_and_vars[task] = list(zip(clipped_grads, vars))
                    self.apply_backprop[task] = optimizers[
                        task].apply_gradients(grads_and_vars[task])

                self.train_model = train_model
                self.step_model = step_model
                self.step = step_model.step
                self.value = step_model.value

                self.trainable_variables = tf_utils.find_trainable_variables(
                    "model")

                tf.global_variables_initializer().run(session=self.sess)

                self.summary = tf.summary.merge_all()

                if not transfer:
                    self.sess.graph.finalize()
Example #6
def test(args):
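	# Restore a trained DAgger policy from a checkpoint and evaluate it on every mesh
	# in expert_list, printing and aggregating the per-mesh return statistics.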

	## Define environment
	expert_list = ['mug1','mouse','mug2','headphones','ball','book','eyeglass']
	if args.mesh is not None: change_env_to_use_correct_mesh(args.mesh)

	# Dictionary of values to plot
	plotters = {'min_return': [],
				'max_return': [],
				'mean_return': [],
				'mean_final_success': []}

	# Create environment
	_, env = load_expert.get_policy(args.checkpoint_path)

	## Define policy network
	policy = Tensor_XYZ_Policy("dagger_tensor_xyz", env)

	# Start session
	session = tfu.make_session(num_cpu=40)
	session.__enter__()

	policy.map3D.finalize_graph()
	checkpoint_path = "/home/robertmu/DAGGER_discovery/checkpoints/test7obj"
	# saver = tf.train.import_meta_graph(checkpoint_path+ "/minuet.model-0"+".meta")
	ckpt = tf.train.get_checkpoint_state(checkpoint_path)
	saver = tf.train.Saver()
	if ckpt and ckpt.model_checkpoint_path:
		ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
		print(("...found %s " % ckpt.model_checkpoint_path))
		saver.restore(session, os.path.join(checkpoint_path, ckpt_name))
	else:
		print("...ain't no full checkpoint here!")

	# Rollout policy
	for mesh in expert_list:
		print('testing {} '.format(mesh))
		change_env_to_use_correct_mesh(mesh)
		checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh)+'48/checkpoint_1400/'
		_, env = load_expert.get_policy(checkpoint_path)	
		
		_, stats = rollout(env,
				args.test_num_rollouts,
				args.max_path_length,
				policy,
				mesh = mesh)


		for key, value in stats.items():
			print("{} : {}".format(key, value))

		for key in plotters.keys(): plotters[key].append(stats[key])

	plott = {'min_return': np.min(plotters['min_return']),
				'max_return': np.max(plotters['max_return']),
				'mean_return': np.mean(plotters['mean_return']),
				'mean_final_success': np.mean(plotters['mean_final_success'])}
	for key, value in plott.items():
		print("{} : {}".format(key, value))

	session.close()
Example #7
def main(args):
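	# Multi-mesh DAgger training: collect expert demonstrations for each mesh, fit the
	# Tensor_XYZ_Policy with an MSE imitation loss (map3D "feat" variables are frozen),
	# then aggregate on-policy rollouts, log to TensorBoard, and checkpoint periodically.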

	## Define environment
	expert_list = ['mug1','mouse','mug2','headphones','ball','eyeglass','coffee_mug','car3','boat2']
	if args.mesh is not None: change_env_to_use_correct_mesh(args.mesh)

	# env = gym.make(args.env)
	# env = NormalizeActionWrapper(env)
	# env = ImageEnv(env,
	# 		 imsize=64,
	# 		 normalize=True,
	# 		 init_camera=init_multiple_cameras,
	# 		 num_cameras=10,
	# 		 num_views=4,
	# 		 depth=True,
	# 		 cam_angles=True,
	# 		 reward_type="wrapped_env",
	# 		 flatten=False)

	# Dictionary of values to plot
	plotters = {'min_return': [],
				'max_return': [],
				'mean_return': [],
				'mean_final_success': []}


	name = "test9obj"
	log_dir_ = os.path.join("logs_mujoco_offline", name)
	checkpoint_dir_ = os.path.join("checkpoints", name)
	set_writer = tf.summary.FileWriter(log_dir_ + '/train', None)

	## Define expert
	expert_policy, env = load_expert.get_policy(args.checkpoint_path)

	## Define policy network
	policy = Tensor_XYZ_Policy("dagger_tensor_xyz", env)

	## Define DAGGER loss
	# goal_obs = tfu.get_placeholder(name="goal_obs",
	# 						dtype=tf.float32,
	# 						shape=[None, policy.state_obs_dim + policy.state_desired_dim])
	# crop = tfu.get_placeholder(name="crop",
	# 						dtype=tf.float32,
	# 						shape=[None, 16, 16, 8, 32])
	act = tfu.get_placeholder(name="act",
							dtype=tf.float32,
							shape=[None, policy.act_dim])
	min_return = tfu.get_placeholder(dtype=tf.float32, shape=None, name="min_return")
	max_return = tfu.get_placeholder(dtype=tf.float32, shape=None, name="max_return")
	mean_return = tfu.get_placeholder(dtype=tf.float32, shape=None, name="mean_return")
	mean_final_success = tfu.get_placeholder(dtype=tf.float32, shape=None, name="mean_final_success")

	step = tf.Variable(0, trainable=False)

	# lr 0.002 0.001
	# decay 0.96 0.8

	lr = tf.train.exponential_decay(learning_rate = 0.001,
									global_step = step,
									decay_steps = 20000,
									decay_rate = 0.75,
									staircase=True)

	# Exclude map3D network from gradient computation
	freeze_patterns = []
	freeze_patterns.append("feat")

	loss = tf.reduce_mean(tf.squared_difference(policy.ac, act))
	train_vars = tf.contrib.framework.filter_variables( tf.trainable_variables(),
														exclude_patterns=freeze_patterns)
	opt = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss,
															var_list=train_vars,
															global_step=step)

	# Start session
	session = tfu.make_session(num_cpu=40)
	session.__enter__()

	# Load map3D network
	freeze_list = tf.contrib.framework.filter_variables(
		tf.trainable_variables(),
		include_patterns=freeze_patterns)

	policy.map3D.finalize_graph()
	# separate from the map_3d summary
	loss_op = tf.summary.scalar('loss', loss)

	with tf.variable_scope("policy_perf"):
		min_return_op = tf.summary.scalar('min_return', min_return)
		max_return_op = tf.summary.scalar('max_return', max_return)
		mean_return_op = tf.summary.scalar('mean_return', mean_return)
		mean_final_success_op = tf.summary.scalar('mean_final_success', mean_final_success)


	saver = tf.train.Saver()
	# Load expert policy
	init = True
	for mesh in expert_list:
		print('generating {} data'.format(mesh))
		change_env_to_use_correct_mesh(mesh)
		## Define expert
		checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh)+'48/checkpoint_1350/'
		if mesh =='mug2':
			checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh)+'48/checkpoint_1200/'
		expert_policy, env = load_expert.get_policy(checkpoint_path)
		# Load expert policy
		pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
		with open(pickle_path, 'rb') as f:
			picklable = pickle.load(f)

		expert_policy.set_weights(picklable['policy_weights'])
		with expert_policy.set_deterministic(True):
		
			# Collect initial data
			if init is True:
				data, _ = rollout(env,
							args.num_rollouts,
							args.max_path_length,
							expert_policy,
							mesh = mesh)
				np.save('expert_data_{}.npy'.format(args.env), data)
				init = False
			else:
				roll, _ = rollout(env,
						args.num_rollouts,
						args.max_path_length,
						expert_policy,
						mesh = mesh)
				data = append_paths(data, roll)

	## Start training

	# Start for loop
	global_step = 0

	for i in tqdm.tqdm(range(args.num_iterations)):
		plotters = {'min_return': [],
				'max_return': [],
				'mean_return': [],
				'mean_final_success': []}
		# Parse dataset for supervised learning
		num_samples = data['state_observation'].shape[0]
		print('num_samples',num_samples)
		idx = np.arange(num_samples)
		np.random.shuffle(idx)
		for j in range(num_samples // args.mb_size):
			np.random.shuffle(idx)
			feed = policy.train_process_observation(data, idx[:args.mb_size] ,env)
			act_train = data['actions'][idx[:args.mb_size]]
			feed.update({act:act_train})
			loss_summary, _ = session.run([loss_op, opt], feed_dict=feed)
			log_this = np.mod(global_step, 500) == 0
			if log_this:
				results = session.run(policy.map3D.summary, feed)
				set_writer.add_summary(results, global_step)
			set_writer.add_summary(loss_summary, global_step=global_step)
			global_step = global_step + 1

		# Perform rollouts
		for mesh in expert_list:
			print('generating {} dagger data'.format(mesh))
			change_env_to_use_correct_mesh(mesh)
			## Define expert
			checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh)+'48/checkpoint_1350/'
			if mesh =='mug2':
				checkpoint_path = '/projects/katefgroup/yunchu/{}'.format(mesh)+'48/checkpoint_1200/'
			expert_policy, env = load_expert.get_policy(checkpoint_path)
			# Load expert policy
			pickle_path = os.path.join(checkpoint_path, 'checkpoint.pkl')
			with open(pickle_path, 'rb') as f:
				picklable = pickle.load(f)

			expert_policy.set_weights(picklable['policy_weights'])

			with expert_policy.set_deterministic(True):
				# Collect rollouts with the current policy and aggregate them into the dataset
				roll, plot_data = rollout(env,
					args.num_rollouts,
					args.max_path_length,
					policy,
					expert_policy,
					mesh = mesh)
				# import ipdb;ipdb.set_trace()
				data = append_paths(data, roll)

				for key in plotters.keys(): plotters[key].append(plot_data[key])


		minro, maxro, meanro, meanfo = session.run(
			[min_return_op, max_return_op, mean_return_op, mean_final_success_op],
			feed_dict={
				min_return: np.min(plotters['min_return']),
				max_return: np.max(plotters['max_return']),
				mean_return: np.mean(plotters['mean_return']),
				mean_final_success: np.mean(plotters['mean_final_success'])})
		set_writer.add_summary(minro,global_step=global_step)
		set_writer.add_summary(maxro,global_step=global_step)
		set_writer.add_summary(meanro,global_step=global_step)
		set_writer.add_summary(meanfo,global_step=global_step)

		# for key in plotters.keys(): plotters[key].append(plot_data[key])

		if (i+1)%args.checkpoint_freq==0:
			savemodel(saver, session, checkpoint_dir_, i+1)

	plotting_data(plotters)
	session.__exit__(None, None, None)
	session.close()
Example #8

# Main function
if __name__ == "__main__":
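    # Build one worker trainer per row of Y and train them concurrently,
    # coordinated by a tf.train.Coordinator.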

    # Initialize global state and create a buffer for the shared training information
    gl._init()

    # Load the training data
    X, Y = load_data()

    # Parse the command-line arguments
    arglist = parse_args()

    # Configure how many CPU cores to use, or use the GPU
    sess = tf.Session(config=U.make_session())
    with tf.device("/cpu:0"):
        trainers = []
        # Create the workers
        for i in range(Y.shape[0]):
            i_name = 'w_%i' % i
            trainers.append(T.trainer(i_name, arglist, X, Y))

    # Set up a thread coordinator
    COORD = tf.train.Coordinator()

    # Launch the workers to start training
    trainer_threads = []

    for trainer in trainers:
        # Bind `trainer` as a default argument so each thread runs its own worker
        job = lambda trainer=trainer: trainer.train()
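        # Start each worker in its own thread and wait for all of them to finish
        # (this completion assumes `threading` is imported at module level)
        t = threading.Thread(target=job)
        t.start()
        trainer_threads.append(t)

    COORD.join(trainer_threads)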
Example #9
def test(env, num_rollouts, path_length):
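    # Restore the latest DAgger checkpoint into a fresh session, then roll the policy
    # out in `env`, recording per-episode returns and puck-success counts.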

    tf.reset_default_graph()
    session1 = tfu.make_session(num_cpu=40)
    session1.__enter__()

    session1.run(tf.global_variables_initializer())

    checkpoint_path = "/home/robertmu/DAGGER_discovery/checkpoints/dagger_tensor_xyz02"
    saver = tf.train.import_meta_graph(checkpoint_path + "/minuet.model-0" +
                                       ".meta")
    #print("i am reloading", tf.train.latest_checkpoint(checkpoint_path))
    saver.restore(session1, tf.train.latest_checkpoint(checkpoint_path))

    env_keys = env.observation_space.spaces.keys()
    observation_converter = lambda x: x

    paths = []
    rewards = []
    count_infos = []
    while len(paths) < num_rollouts:

        t = 0
        path = {key: [] for key in env_keys}
        images = []
        infos = []
        observations = []
        actions = []
        terminals = []
        observation = env.reset()
        R = 0
        for t in range(path_length):
            observation = observation_converter(observation)
            ob = observation
            ob = {
                key: np.repeat(np.expand_dims(ob[key], axis=0), 8, axis=0)
                for key in ob.keys()
            }
            puck_z = (env._env.env.init_puck_z +
                      env._env.env.sim.model.geom_pos[
                          env._env.env.sim.model.geom_name2id('puckbox')][-1])
            batch_dict = get_inputs(ob, puck_z)
            goal_obs_train = np.hstack(
                [ob['state_desired_goal'], ob['state_observation']])

            feed = {}

            feed.update({policy.rgb_camXs: batch_dict['rgb_camXs']})
            feed.update({policy.xyz_camXs: batch_dict['xyz_camXs']})
            feed.update({policy.pix_T_cams: batch_dict['pix_T_cams']})
            feed.update({policy.origin_T_camRs: batch_dict['origin_T_camRs']})
            feed.update({policy.origin_T_camXs: batch_dict['origin_T_camXs']})
            feed.update({policy.puck_xyz_camRs: batch_dict['puck_xyz_camRs']})
            feed.update({goal_obs: goal_obs_train})

            action = session1.run([policy.ac], feed_dict=feed)

            observation, reward, terminal, info = env.step(action)

            for key in env_keys:
                path[key].append(observation[key])
            actions.append(action)
            terminals.append(terminal)

            infos.append(info)
            R += reward

            if terminal:
                break

        assert len(infos) == t + 1

        path = {key: np.stack(path[key], axis=0) for key in env_keys}
        path['actions'] = np.stack(actions, axis=0)
        path['terminals'] = np.stack(terminals, axis=0)
        if isinstance(
                policy,
                GaussianPolicy) and len(path['terminals']) >= path_length:
            continue
        elif not isinstance(policy, GaussianPolicy) and len(
                path['terminals']) == 1:
            continue
        rewards.append(R)
        count_infos.append(infos[-1]['puck_success'])
        paths.append(path)

    return _clean_paths(paths), return_stats(rewards, count_infos)