def test_learning(self):
    """Smoke-test behavior cloning on the pickled Pendulum expert episodes.

    Builds a Gaussian policy for ``self.env``, loads the expert episodes
    from ``data/expert_epis/Pendulum-v0_2epis.pkl``, splits them into
    train/test trajectories and runs a single ``behavior_clone.train``
    call. Nothing is asserted on the result: the test passes when the
    whole pipeline completes without raising.
    """
    ob_space = self.env.observation_space
    ac_space = self.env.action_space

    # Policy network, the policy wrapping it, and its optimizer.
    pol_net = PolNet(ob_space, ac_space, h1=32, h2=32)
    pol = GaussianPol(ob_space, ac_space, pol_net)
    sampler = EpiSampler(self.env, pol, num_parallel=1)
    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)

    def build_traj(epis):
        # Wrap a list of episodes in a registered Traj.
        traj = Traj()
        traj.add_epis(epis)
        traj.register_epis()
        return traj

    # The path is relative, so it resolves against the current working
    # directory at the time the test runs.
    expert_path = os.path.join('data/expert_epis', 'Pendulum-v0_2epis.pkl')
    with open(expert_path, 'rb') as f:
        expert_epis = pickle.load(f)

    train_epis, test_epis = ef.train_test_split(
        expert_epis, train_size=0.7)
    train_traj = build_traj(train_epis)
    # The held-out trajectory is built but not consumed further here.
    test_traj = build_traj(test_epis)

    # The returned dict is not inspected by this test.
    result_dict = behavior_clone.train(train_traj, pol, optim_pol, 256)

    # Explicitly unbind the sampler once training is done.
    # NOTE(review): presumably this lets EpiSampler release its worker
    # resources -- confirm against EpiSampler's implementation.
    del sampler
pol_net, data_parallel=args.data_parallel) elif isinstance(action_space, gym.spaces.MultiDiscrete): pol = MultiCategoricalPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) train_epis, test_epis = ef.train_test_split(expert_epis, train_size=args.train_size) train_traj = Traj() train_traj.add_epis(train_epis) train_traj.register_epis() test_traj = Traj() test_traj.add_epis(test_epis) test_traj.register_epis() expert_rewards = [np.sum(epi['rews']) for epi in expert_epis] expert_mean_rew = np.mean(expert_rewards) logger.log('expert_score={}'.format(expert_mean_rew)) logger.log('num_train_epi={}'.format(train_traj.num_epi)) max_rew = -1e6 for curr_epoch in range(args.epoch): if args.data_parallel: