def test_learning(self):
    """Smoke-test one SVG training step on tiny (32x32) networks.

    Builds policy/target-policy and q-function/target-q-function pairs,
    samples a short trajectory, and runs a single ``svg.train`` update.
    """
    pol_net = PolNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
    pol = GaussianPol(self.env.ob_space, self.env.ac_space, pol_net)
    targ_pol_net = PolNet(self.env.ob_space, self.env.ac_space, 32, 32)
    # Target policy starts as an exact copy of the online policy.
    targ_pol_net.load_state_dict(pol_net.state_dict())
    targ_pol = GaussianPol(
        self.env.ob_space, self.env.ac_space, targ_pol_net)

    qf_net = QNet(self.env.ob_space, self.env.ac_space, h1=32, h2=32)
    qf = DeterministicSAVfunc(self.env.ob_space, self.env.ac_space, qf_net)
    targ_qf_net = QNet(self.env.ob_space, self.env.ac_space, 32, 32)
    # BUG FIX: the target q-network must be initialized from the online
    # q-network; it previously loaded its own freshly-initialized weights
    # (a no-op), leaving the target out of sync with qf_net.
    targ_qf_net.load_state_dict(qf_net.state_dict())
    targ_qf = DeterministicSAVfunc(
        self.env.ob_space, self.env.ac_space, targ_qf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
    optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

    epis = sampler.sample(pol, max_steps=32)
    traj = Traj()
    traj.add_epis(epis)
    traj = ef.add_next_obs(traj)
    traj.register_epis()

    result_dict = svg.train(
        traj,
        pol, targ_pol, qf, targ_qf,
        optim_pol, optim_qf, 1, 32, 0.01, 0.9, 1)
    del sampler
def test_learning(self):
    """Smoke-test DIAYN+SAC: one off-policy update plus one discriminator update.

    The environment observation is the concatenation of the real observation
    and a skill one-hot; 4 appears to be the number of skills here
    (TODO confirm against the test env).
    """
    ob_space = self.env.real_observation_space
    skill_space = self.env.skill_space
    ob_skill_space = self.env.observation_space
    ac_space = self.env.action_space
    # Strip the 4 skill dims to recover the raw observation dimension.
    ob_dim = ob_skill_space.shape[0] - 4
    f_dim = ob_dim

    # Identity feature extractor for the discriminator input.
    def discrim_f(x): return x

    pol_net = PolNet(ob_skill_space, ac_space)
    pol = GaussianPol(ob_skill_space, ac_space, pol_net)
    # Twin q-functions, each with a target copy initialized from the online net.
    qf_net1 = QNet(ob_skill_space, ac_space)
    qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net1)
    targ_qf_net1 = QNet(ob_skill_space, ac_space)
    targ_qf_net1.load_state_dict(qf_net1.state_dict())
    targ_qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net1)
    qf_net2 = QNet(ob_skill_space, ac_space)
    qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net2)
    targ_qf_net2 = QNet(ob_skill_space, ac_space)
    targ_qf_net2.load_state_dict(qf_net2.state_dict())
    targ_qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net2)
    qfs = [qf1, qf2]
    targ_qfs = [targ_qf1, targ_qf2]
    log_alpha = nn.Parameter(torch.ones(()))

    # Unbounded feature space for the discriminator over f_dim features.
    high = np.array([np.finfo(np.float32).max]*f_dim)
    f_space = gym.spaces.Box(-high, high, dtype=np.float32)
    discrim_net = DiaynDiscrimNet(
        f_space, skill_space, h_size=100, discrim_f=discrim_f)
    discrim = DeterministicSVfunc(f_space, discrim_net)

    optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
    optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
    optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
    optim_qfs = [optim_qf1, optim_qf2]
    optim_alpha = torch.optim.Adam([log_alpha], 1e-4)
    optim_discrim = torch.optim.SGD(discrim.parameters(),
                                    lr=0.001, momentum=0.9)

    off_traj = Traj()
    sampler = EpiSampler(self.env, pol, num_parallel=1)
    epis = sampler.sample(pol, max_steps=200)
    on_traj = Traj()
    on_traj.add_epis(epis)
    on_traj = ef.add_next_obs(on_traj)
    # Replace env rewards with DIAYN intrinsic rewards from the discriminator.
    on_traj = ef.compute_diayn_rews(
        on_traj, lambda x: diayn_sac.calc_rewards(x, 4, discrim))
    on_traj.register_epis()
    off_traj.add_traj(on_traj)
    step = on_traj.num_step
    # NOTE(review): rebinding log_alpha here detaches it from optim_alpha,
    # which still references the old Parameter — alpha stays fixed at
    # log(0.1), which matches the trailing True argument to train
    # (presumably a fixed-alpha flag; confirm against diayn_sac.train).
    log_alpha = nn.Parameter(np.log(0.1)*torch.ones(()))  # fix alpha
    result_dict = diayn_sac.train(
        off_traj, pol, qfs, targ_qfs, log_alpha,
        optim_pol, optim_qfs, optim_alpha,
        step, 128, 5e-3, 0.99, 1, discrim, 4, True)
    # One discriminator update over the freshly-collected on-policy data.
    discrim_losses = diayn.train(
        discrim, optim_discrim, on_traj, 32, 100, 4)
    del sampler
def test_learning(self):
    """Run a single short SAC update on tiny networks as a smoke test."""
    ob, ac = self.env.ob_space, self.env.ac_space

    pol_net = PolNet(ob, ac, h1=32, h2=32)
    pol = GaussianPol(ob, ac, pol_net)

    # Twin q-functions; each target net starts as a copy of its online net.
    qfs, targ_qfs, optim_qfs = [], [], []
    for _ in range(2):
        q_net = QNet(ob, ac)
        targ_q_net = QNet(ob, ac)
        targ_q_net.load_state_dict(q_net.state_dict())
        qfs.append(DeterministicSAVfunc(ob, ac, q_net))
        targ_qfs.append(DeterministicSAVfunc(ob, ac, targ_q_net))
        optim_qfs.append(torch.optim.Adam(q_net.parameters(), 3e-4))

    log_alpha = nn.Parameter(torch.zeros(()))
    sampler = EpiSampler(self.env, pol, num_parallel=1)
    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
    optim_alpha = torch.optim.Adam([log_alpha], 3e-4)

    traj = Traj()
    traj.add_epis(sampler.sample(pol, max_steps=32))
    traj = ef.add_next_obs(traj)
    traj.register_epis()

    result_dict = sac.train(
        traj,
        pol, qfs, targ_qfs, log_alpha,
        optim_pol, optim_qfs, optim_alpha,
        2, 32, 0.01, 0.99, 2,
    )
    del sampler
def test_learning(self):
    """Smoke-test one DDPG training step with OU exploration noise."""
    pol_net = PolNet(self.env.observation_space, self.env.action_space,
                     h1=32, h2=32, deterministic=True)
    noise = OUActionNoise(self.env.action_space)
    pol = DeterministicActionNoisePol(self.env.observation_space,
                                      self.env.action_space, pol_net, noise)

    targ_pol_net = PolNet(self.env.observation_space, self.env.action_space,
                          32, 32, deterministic=True)
    # Target policy starts as an exact copy of the online policy.
    targ_pol_net.load_state_dict(pol_net.state_dict())
    targ_noise = OUActionNoise(self.env.action_space)
    targ_pol = DeterministicActionNoisePol(self.env.observation_space,
                                           self.env.action_space,
                                           targ_pol_net, targ_noise)

    qf_net = QNet(self.env.observation_space, self.env.action_space,
                  h1=32, h2=32)
    qf = DeterministicSAVfunc(self.env.observation_space,
                              self.env.action_space, qf_net)
    targ_qf_net = QNet(self.env.observation_space, self.env.action_space,
                       32, 32)
    # BUG FIX: the target q-network must copy the online q-network's
    # weights; it previously loaded its own random weights (a no-op).
    targ_qf_net.load_state_dict(qf_net.state_dict())
    targ_qf = DeterministicSAVfunc(self.env.observation_space,
                                   self.env.action_space, targ_qf_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
    optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

    epis = sampler.sample(pol, max_steps=32)
    traj = Traj()
    traj.add_epis(epis)
    traj = ef.add_next_obs(traj)
    traj.register_epis()

    result_dict = ddpg.train(traj, pol, targ_pol, qf, targ_qf,
                             optim_pol, optim_qf, 1, 32, 0.01, 0.9)
    del sampler
def test_learning(self):
    """Run one short QT-Opt training step (CEM argmax policy) as a smoke test."""
    obs_sp = self.env.observation_space
    act_sp = self.env.action_space

    # Online net, its lagged copy, and two target nets seeded from them.
    qf_net = QNet(obs_sp, act_sp, 32, 32)
    lagged_qf_net = QNet(obs_sp, act_sp, 32, 32)
    lagged_qf_net.load_state_dict(qf_net.state_dict())
    targ_qf1_net = QNet(obs_sp, act_sp, 32, 32)
    targ_qf1_net.load_state_dict(qf_net.state_dict())
    targ_qf2_net = QNet(obs_sp, act_sp, 32, 32)
    targ_qf2_net.load_state_dict(lagged_qf_net.state_dict())

    qf = DeterministicSAVfunc(obs_sp, act_sp, qf_net)
    lagged_qf = DeterministicSAVfunc(obs_sp, act_sp, lagged_qf_net)
    # First target uses cross-entropy-method sampling to maximize over actions.
    targ_qf1 = CEMDeterministicSAVfunc(
        obs_sp, act_sp, targ_qf1_net,
        num_sampling=60, num_best_sampling=6, num_iter=2, multivari=False)
    targ_qf2 = DeterministicSAVfunc(obs_sp, act_sp, targ_qf2_net)
    pol = ArgmaxQfPol(obs_sp, act_sp, targ_qf1, eps=0.2)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    optim_qf = torch.optim.Adam(qf_net.parameters(), 3e-4)

    traj = Traj()
    traj.add_epis(sampler.sample(pol, max_steps=32))
    traj = ef.add_next_obs(traj)
    traj.register_epis()

    result_dict = qtopt.train(
        traj, qf, lagged_qf, targ_qf1, targ_qf2, optim_qf,
        1000, 32, 0.9999, 0.995, 'mse')
    del sampler
def test_learning(self):
    """Run a single GAIL (TRPO-flavored) update against bundled expert episodes."""
    obs_sp = self.env.observation_space
    act_sp = self.env.action_space

    pol_net = PolNet(obs_sp, act_sp, h1=32, h2=32)
    pol = GaussianPol(obs_sp, act_sp, pol_net)
    vf_net = VNet(obs_sp)
    vf = DeterministicSVfunc(obs_sp, vf_net)
    discrim_net = DiscrimNet(obs_sp, act_sp, h1=32, h2=32)
    discrim = DeterministicSAVfunc(obs_sp, act_sp, discrim_net)

    sampler = EpiSampler(self.env, pol, num_parallel=1)
    optim_vf = torch.optim.Adam(vf_net.parameters(), 3e-4)
    optim_discrim = torch.optim.Adam(discrim_net.parameters(), 3e-4)

    # Load the pickled expert demonstrations shipped with the tests.
    with open(os.path.join('data/expert_epis',
                           'Pendulum-v0_2epis.pkl'), 'rb') as f:
        expert_epis = pickle.load(f)
    expert_traj = Traj()
    expert_traj.add_epis(expert_epis)
    expert_traj.register_epis()

    agent_traj = Traj()
    agent_traj.add_epis(sampler.sample(pol, max_steps=32))
    # Discriminator-derived pseudo-rewards feed the usual advantage pipeline.
    agent_traj = ef.compute_pseudo_rews(agent_traj, discrim)
    agent_traj = ef.compute_vs(agent_traj, vf)
    agent_traj = ef.compute_rets(agent_traj, 0.99)
    agent_traj = ef.compute_advs(agent_traj, 0.99, 0.95)
    agent_traj = ef.centerize_advs(agent_traj)
    agent_traj = ef.compute_h_masks(agent_traj)
    agent_traj.register_epis()

    result_dict = gail.train(agent_traj, expert_traj, pol, vf, discrim,
                             optim_vf, optim_discrim, rl_type='trpo',
                             epoch=1, batch_size=32, discrim_batch_size=32,
                             discrim_step=1,
                             pol_ent_beta=1e-3, discrim_ent_beta=1e-5)
    del sampler
def setUpClass(cls):
    """Create a shared env, a sampled Traj, and pickled policies/vfuncs in redis.

    Stores everything the distributed tests read back: the env, a small
    trajectory, and one policy/value-function of each supported kind.
    """
    # BUG FIX: the env was only a local variable while the lines below read
    # cls.env; store it on the class before use.
    env = GymEnv('Pendulum-v0')
    cls.env = env
    random_pol = RandomPol(cls.env.observation_space, cls.env.action_space)
    # BUG FIX: `pol` was undefined here — sample with the random policy.
    sampler = EpiSampler(cls.env, random_pol, num_parallel=1)
    epis = sampler.sample(random_pol, max_steps=32)
    traj = Traj()
    traj.add_epis(epis)
    traj.register_epis()
    cls.num_step = traj.num_step

    make_redis('localhost', '6379')
    cls.r = get_redis()
    cls.r.set('env', env)
    cls.r.set('traj', traj)

    # One policy / value-function of each flavor, pickled into redis.
    pol_net = PolNet(env.observation_space, env.action_space)
    gpol = GaussianPol(env.observation_space, env.action_space, pol_net)
    pol_net = PolNet(env.observation_space, env.action_space,
                     deterministic=True)
    dpol = DeterministicActionNoisePol(
        env.observation_space, env.action_space, pol_net)
    model_net = ModelNet(env.observation_space, env.action_space)
    mpcpol = MPCPol(env.observation_space, env.action_space,
                    model_net, rew_func)
    q_net = QNet(env.observation_space, env.action_space)
    qfunc = DeterministicSAVfunc(
        env.observation_space, env.action_space, q_net)
    aqpol = ArgmaxQfPol(env.observation_space, env.action_space, qfunc)
    v_net = VNet(env.observation_space)
    vfunc = DeterministicSVfunc(env.observation_space, v_net)

    cls.r.set('gpol', cloudpickle.dumps(gpol))
    cls.r.set('dpol', cloudpickle.dumps(dpol))
    cls.r.set('mpcpol', cloudpickle.dumps(mpcpol))
    cls.r.set('qfunc', cloudpickle.dumps(qfunc))
    cls.r.set('aqpol', cloudpickle.dumps(aqpol))
    cls.r.set('vfunc', cloudpickle.dumps(vfunc))

    c2d = C2DEnv(env)
    pol_net = PolNet(c2d.observation_space, c2d.action_space)
    # BUG FIX: the multi-categorical policy must use the discretized C2D
    # spaces its network was built from, not the continuous env spaces.
    mcpol = MultiCategoricalPol(
        c2d.observation_space, c2d.action_space, pol_net)
    cls.r.set('mcpol', cloudpickle.dumps(mcpol))
h1=args.discrim_h1, h2=args.discrim_h2) shaping_vf = DeterministicSVfunc(observation_space, shaping_vf_net, data_parallel=args.data_parallel) optim_discrim = torch.optim.Adam( list(rewf_net.parameters()) + list(shaping_vf_net.parameters()), args.discrim_lr) advf = None elif args.rew_type == 'adv': advf_net = DiscrimNet(observation_space, action_space, h1=args.discrim_h1, h2=args.discrim_h2) advf = DeterministicSAVfunc(observation_space, action_space, advf_net, data_parallel=args.data_parallel) optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr) rewf = None shaping_vf = None else: raise ValueError('Only rew and adv are supported') sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj()
rewf_net = VNet(observation_space, h1=args.discrim_h1, h2=args.discrim_h2) rewf = DeterministicSVfunc(observation_space, rewf_net) shaping_vf_net = VNet(observation_space, h1=args.discrim_h1, h2=args.discrim_h2) shaping_vf = DeterministicSVfunc(observation_space, shaping_vf_net) optim_discrim = torch.optim.Adam( list(rewf_net.parameters()) + list(shaping_vf_net.parameters()), args.discrim_lr) advf = None elif args.rew_type == 'adv': advf_net = DiscrimNet(observation_space, action_space, h1=args.discrim_h1, h2=args.discrim_h2) advf = DeterministicSAVfunc(observation_space, action_space, advf_net) optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr) rewf = None shaping_vf = None else: raise ValueError('Only rew and adv are supported') sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj() expert_traj.add_epis(expert_epis)
def test_learning(self):
    """Smoke-test R2D2-style SAC: LSTM nets, sequence priorities, cached hidden states."""
    pol_net = PolNetLSTM(
        self.env.observation_space, self.env.action_space,
        h_size=32, cell_size=32)
    pol = GaussianPol(self.env.observation_space,
                      self.env.action_space, pol_net, rnn=True)
    # Twin recurrent q-functions, each with a target copy seeded from the
    # online net.
    qf_net1 = QNetLSTM(self.env.observation_space, self.env.action_space,
                       h_size=32, cell_size=32)
    qf1 = DeterministicSAVfunc(
        self.env.observation_space, self.env.action_space, qf_net1,
        rnn=True)
    targ_qf_net1 = QNetLSTM(
        self.env.observation_space, self.env.action_space,
        h_size=32, cell_size=32)
    targ_qf_net1.load_state_dict(qf_net1.state_dict())
    targ_qf1 = DeterministicSAVfunc(
        self.env.observation_space, self.env.action_space, targ_qf_net1,
        rnn=True)
    qf_net2 = QNetLSTM(self.env.observation_space, self.env.action_space,
                       h_size=32, cell_size=32)
    qf2 = DeterministicSAVfunc(
        self.env.observation_space, self.env.action_space, qf_net2,
        rnn=True)
    targ_qf_net2 = QNetLSTM(
        self.env.observation_space, self.env.action_space,
        h_size=32, cell_size=32)
    targ_qf_net2.load_state_dict(qf_net2.state_dict())
    targ_qf2 = DeterministicSAVfunc(
        self.env.observation_space, self.env.action_space, targ_qf_net2,
        rnn=True)
    qfs = [qf1, qf2]
    targ_qfs = [targ_qf1, targ_qf2]
    log_alpha = nn.Parameter(torch.zeros(()))
    sampler = EpiSampler(self.env, pol, num_parallel=1)
    optim_pol = torch.optim.Adam(pol_net.parameters(), 3e-4)
    optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
    optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
    optim_qfs = [optim_qf1, optim_qf2]
    optim_alpha = torch.optim.Adam([log_alpha], 3e-4)
    epis = sampler.sample(pol, max_steps=32)
    traj = Traj()
    traj.add_epis(epis)
    traj = ef.add_next_obs(traj)
    # Seed every transition with the current max priority, then derive
    # per-sequence priorities (sequence length 4) and hidden-state masks.
    max_pri = traj.get_max_pri()
    traj = ef.set_all_pris(traj, max_pri)
    traj = ef.compute_seq_pris(traj, 4)
    traj = ef.compute_h_masks(traj)
    # Precompute and store hidden states for each online/target q-function
    # — presumably for R2D2-style burn-in; confirm against r2d2_sac.train.
    for i in range(len(qfs)):
        traj = ef.compute_hs(
            traj, qfs[i], hs_name='q_hs'+str(i), input_acs=True)
        traj = ef.compute_hs(
            traj, targ_qfs[i], hs_name='targ_q_hs'+str(i), input_acs=True)
    traj.register_epis()
    result_dict = r2d2_sac.train(
        traj,
        pol, qfs, targ_qfs, log_alpha,
        optim_pol, optim_qfs, optim_alpha,
        2, 32, 4, 2, 0.01, 0.99, 2,
    )
    del sampler
env = GymEnv(args.env_name, log_dir=os.path.join(args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) observation_space = env.observation_space action_space = env.action_space qf_net = QNet(observation_space, action_space, args.h1, args.h2) lagged_qf_net = QNet(observation_space, action_space, args.h1, args.h2) lagged_qf_net.load_state_dict(qf_net.state_dict()) targ_qf1_net = QNet(observation_space, action_space, args.h1, args.h2) targ_qf1_net.load_state_dict(qf_net.state_dict()) targ_qf2_net = QNet(observation_space, action_space, args.h1, args.h2) targ_qf2_net.load_state_dict(lagged_qf_net.state_dict()) qf = DeterministicSAVfunc(observation_space, action_space, qf_net) lagged_qf = DeterministicSAVfunc(observation_space, action_space, lagged_qf_net) targ_qf1 = CEMDeterministicSAVfunc(observation_space, action_space, targ_qf1_net, num_sampling=args.num_sampling, num_best_sampling=args.num_best_sampling, num_iter=args.num_iter, multivari=args.multivari, save_memory=args.save_memory) targ_qf2 = DeterministicSAVfunc(observation_space, action_space, targ_qf2_net) pol = ArgmaxQfPol(observation_space, action_space, targ_qf1, eps=args.eps) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed)
pol = GaussianPol(ob_space, ac_space, pol_net, data_parallel=args.data_parallel, parallel_dim=0) vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, data_parallel=args.data_parallel, parallel_dim=0) qf_net = QNet(ob_space, ac_space) qf = DeterministicSAVfunc(ob_space, ac_space, qf_net, data_parallel=args.data_parallel, parallel_dim=0) targ_qf_net = QNet(ob_space, ac_space) targ_qf_net.load_state_dict(qf_net.state_dict()) targ_qf = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net, data_parallel=args.data_parallel, parallel_dim=0) log_alpha = nn.Parameter(torch.zeros((), device=device)) sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr)
data_parallel=args.data_parallel) if args.rew_type == 'rew': rewf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2) rewf = DeterministicSVfunc( ob_space, rewf_net, data_parallel=args.data_parallel) shaping_vf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2) shaping_vf = DeterministicSVfunc( ob_space, shaping_vf_net, data_parallel=args.data_parallel) optim_discrim = torch.optim.Adam( list(rewf_net.parameters()) + list(shaping_vf_net.parameters()), args.discrim_lr) advf = None elif args.rew_type == 'adv': advf_net = DiscrimNet(ob_space, ac_space, h1=args.discrim_h1, h2=args.discrim_h2) advf = DeterministicSAVfunc( ob_space, ac_space, advf_net, data_parallel=args.data_parallel) optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr) rewf = None shaping_vf = None else: raise ValueError('Only rew and adv are supported') sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj() expert_traj.add_epis(expert_epis)
def main():
    """Pygame loop for the 'Sikamaru' pet: click to drop food; the pet moves
    toward it via a hand-written rule (an RL agent is scaffolded but unused).
    """
    pygame.init()  # initialize pygame
    (w, h) = (480, 320)
    screen = pygame.display.set_mode((w, h), FULLSCREEN)  # window size
    pygame.display.set_caption("Sikamaru")  # window bar
    # initialization
    tx = 0  # last clicked x (food target)
    ty = 0  # last clicked y (food target)
    sika = Sikamaru((w / 2, h / 2))
    sleep_count = 5
    eat_mode = 100
    esa = Food()
    wait = True
    seed = 42
    # TODO define RL agent
    '''
    state : 4D (sikaposi, esaposi)
    action : 2D (-20,+20)^2
    SAC simple_net : 30,30
    '''
    np.random.seed(seed)
    torch.manual_seed(seed)
    # 4-D observation (pet + food position), 4 discrete move actions.
    low = np.zeros(4)
    high = w * np.ones(4)
    ob_space = gym.spaces.Box(low=low, high=high)
    ac_space = gym.spaces.Discrete(4)
    # action index -> pixel displacement
    ac_dict = {
        0: np.array([-20, 0]),
        1: np.array([20, 0]),
        2: np.array([0, -20]),
        3: np.array([0, 20])
    }
    # SAC scaffolding: twin q-functions with target copies (not yet trained).
    pol_net = PolNet(ob_space, ac_space)
    pol = CategoricalPol(ob_space, ac_space, pol_net)
    qf_net1 = QNet(ob_space, ac_space)
    qf1 = DeterministicSAVfunc(ob_space, ac_space, qf_net1)
    targ_qf_net1 = QNet(ob_space, ac_space)
    targ_qf_net1.load_state_dict(qf_net1.state_dict())
    targ_qf1 = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net1)
    qf_net2 = QNet(ob_space, ac_space)
    qf2 = DeterministicSAVfunc(ob_space, ac_space, qf_net2)
    targ_qf_net2 = QNet(ob_space, ac_space)
    targ_qf_net2.load_state_dict(qf_net2.state_dict())
    targ_qf2 = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net2)
    qfs = [qf1, qf2]
    targ_qfs = [targ_qf1, targ_qf2]
    log_alpha = nn.Parameter(torch.ones(()))
    optim_pol = torch.optim.Adam(pol_net.parameters(), 1e-4)
    optim_qf1 = torch.optim.Adam(qf_net1.parameters(), 3e-4)
    optim_qf2 = torch.optim.Adam(qf_net2.parameters(), 3e-4)
    optim_qfs = [optim_qf1, optim_qf2]
    optim_alpha = torch.optim.Adam([log_alpha], 1e-4)
    # off_traj = Traj()
    while (True):
        screen.fill((
            0,
            100,
            0,
        ))  # background color
        # my procedure
        ## env
        obs = make_obs((tx, ty), sika.posi, w, h)
        # Policy is queried but its action is currently unused (rule below).
        ac_real, ac, a_i = pol.deterministic_ac_real(
            torch.tensor(obs, dtype=torch.float))
        # ac_real = ac_real.reshape(pol.ac_space.shape)
        a = rule_act((tx, ty), sika.posi)
        # a = ac_dict[int(ac_real)]
        # Clamp the next position to the window.
        nx = sika.posi[0] + a[0]
        nx = max(min(nx, w), 0)
        ny = sika.posi[1] + a[1]
        ny = max(min(ny, h), 0)
        sika.move((nx, ny))
        screen.blit(sika.get_im(), sika.rect)
        if esa.life:
            # RL
            # TOOD:record as epi
            screen.blit(esa.im, esa.rect)  # scr
            rew = esa.life_step(sika)
            if rew > 0:
                sika.bigup()
            if esa.life == 0:
                pass  # TODO add one epi and learn
            # Skip the frame delay while food is on screen.
            wait = False
        if wait:
            pygame.time.wait(500)
        wait = True
        pygame.display.update()  # redraw the screen
        ## event
        for event in pygame.event.get():
            if event.type == MOUSEBUTTONDOWN and event.button == 1:
                # Left click drops food at the cursor.
                tx, ty = event.pos
                esa.set((tx, ty))
            if event.type == KEYDOWN:
                if event.key == K_ESCAPE:
                    sys.exit()
            if event.type == QUIT:  # window close: quit cleanly
                pygame.quit()
                sys.exit()
args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) observation_space = env.observation_space action_space = env.action_space pol_net = PolNet(observation_space, action_space) pol = GaussianPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel, parallel_dim=0) vf_net = VNet(observation_space) vf = DeterministicSVfunc( observation_space, vf_net, data_parallel=args.data_parallel, parallel_dim=0) qf_net = QNet(observation_space, action_space) qf = DeterministicSAVfunc(observation_space, action_space, qf_net, data_parallel=args.data_parallel, parallel_dim=0) targ_qf_net = QNet(observation_space, action_space) targ_qf_net.load_state_dict(qf_net.state_dict()) targ_qf = DeterministicSAVfunc( observation_space, action_space, targ_qf_net, data_parallel=args.data_parallel, parallel_dim=0) log_alpha = nn.Parameter(torch.zeros((), device=device)) sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr) optim_alpha = torch.optim.Adam([log_alpha], args.pol_lr) off_traj = Traj(args.max_steps_off)
log_dir=os.path.join(args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) observation_space = env.observation_space action_space = env.action_space qf_net = QNet(observation_space, action_space, args.h1, args.h2) lagged_qf_net = QNet(observation_space, action_space, args.h1, args.h2) lagged_qf_net.load_state_dict(qf_net.state_dict()) targ_qf1_net = QNet(observation_space, action_space, args.h1, args.h2) targ_qf1_net.load_state_dict(qf_net.state_dict()) targ_qf2_net = QNet(observation_space, action_space, args.h1, args.h2) targ_qf2_net.load_state_dict(lagged_qf_net.state_dict()) qf = DeterministicSAVfunc(observation_space, action_space, qf_net, data_parallel=args.data_parallel) lagged_qf = DeterministicSAVfunc(observation_space, action_space, lagged_qf_net, data_parallel=args.data_parallel) targ_qf1 = CEMDeterministicSAVfunc(observation_space, action_space, targ_qf1_net, num_sampling=args.num_sampling, num_best_sampling=args.num_best_sampling, num_iter=args.num_iter, multivari=args.multivari, data_parallel=args.data_parallel, save_memory=args.save_memory) targ_qf2 = DeterministicSAVfunc(observation_space,
if args.rnn: vf_net = VNetLSTM(ob_space, h_size=256, cell_size=256) else: vf_net = VNet(ob_space) vf = DeterministicSVfunc(ob_space, vf_net, args.rnn, data_parallel=args.data_parallel, parallel_dim=1 if args.rnn else 0) discrim_net = DiscrimNet(ob_space, ac_space, h1=args.discrim_h1, h2=args.discrim_h2) discrim = DeterministicSAVfunc(ob_space, ac_space, discrim_net, data_parallel=args.data_parallel) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) optim_discrim = torch.optim.Adam(discrim_net.parameters(), args.discrim_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj() expert_traj.add_epis(expert_epis) expert_traj.register_epis() expert_rewards = [np.sum(epi['rews']) for epi in expert_epis] expert_mean_rew = np.mean(expert_rewards)
pol = MultiCategoricalPol(observation_space, action_space, pol_net, args.rnn) else: raise ValueError('Only Box, Discrete, and MultiDiscrete are supported') if args.rnn: vf_net = VNetLSTM(observation_space, h_size=256, cell_size=256) else: vf_net = VNet(observation_space) vf = DeterministicSVfunc(observation_space, vf_net, args.rnn) discrim_net = DiscrimNet(observation_space, action_space, h1=args.discrim_h1, h2=args.discrim_h2) discrim = DeterministicSAVfunc(observation_space, action_space, discrim_net) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) optim_discrim = torch.optim.Adam(discrim_net.parameters(), args.discrim_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj() expert_traj.add_epis(expert_epis) expert_traj.register_epis() expert_rewards = [np.sum(epi['rews']) for epi in expert_epis] expert_mean_rew = np.mean(expert_rewards) logger.log('expert_score={}'.format(expert_mean_rew))
if args.irl_type == 'rew': rewf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2) rewf = DeterministicSVfunc(ob_space, rewf_net, args.rnn) shaping_vf_net = VNet(ob_space, h1=args.discrim_h1, h2=args.discrim_h2) shaping_vf = DeterministicSVfunc(ob_space, shaping_vf_net, args.rnn) optim_discrim = torch.optim.Adam( list(rewf_net.parameters()) + list(shaping_vf_net.parameters()), args.discrim_lr) advf = None elif args.rew_type == 'adv': advf_net = DiscrimNet(ob_space, ac_space, h1=args.discrim_h1, h2=args.discrim_h2) advf = DeterministicSAVfunc(ob_space, ac_space, advf_net, args.rnn) optim_discrim = torch.optim.Adam(advf_net.parameters(), args.discrim_lr) rewf = None shaping_vf = None else: raise ValueError('Only rew and adv are supported') sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_vf = torch.optim.Adam(vf_net.parameters(), args.vf_lr) with open(os.path.join(args.expert_dir, args.expert_fname), 'rb') as f: expert_epis = pickle.load(f) expert_traj = Traj() expert_traj.add_epis(expert_epis)
ob_space = env.observation_space ac_space = env.action_space pol_net = PolNet(ob_space, ac_space, args.h1, args.h2, deterministic=True) noise = OUActionNoise(ac_space) pol = DeterministicActionNoisePol( ob_space, ac_space, pol_net, noise, data_parallel=args.data_parallel) targ_pol_net = PolNet(ob_space, ac_space, args.h1, args.h2, deterministic=True) targ_pol_net.load_state_dict(pol_net.state_dict()) targ_noise = OUActionNoise(ac_space) targ_pol = DeterministicActionNoisePol( ob_space, ac_space, targ_pol_net, targ_noise, data_parallel=args.data_parallel) qf_net = QNet(ob_space, ac_space, args.h1, args.h2) qf = DeterministicSAVfunc(ob_space, ac_space, qf_net, data_parallel=args.data_parallel) targ_qf_net = QNet(ob_space, ac_space, args.h1, args.h2) targ_qf_net.load_state_dict(targ_qf_net.state_dict()) targ_qf = DeterministicSAVfunc( ob_space, ac_space, targ_qf_net, data_parallel=args.data_parallel) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr) off_traj = Traj(args.max_steps_off, traj_device='cpu') total_epi = 0 total_step = 0
skill_space = env.skill_space ob_skill_space = env.observation_space action_space = env.action_space ob_dim = ob_skill_space.shape[0] - args.num_skill device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda) device = torch.device(device_name) set_device(device) # policy pol_net = PolNet(ob_skill_space, action_space) pol = GaussianPol(ob_skill_space, action_space, pol_net, data_parallel=args.data_parallel, parallel_dim=0) # q-function qf_net1 = QNet(ob_skill_space, action_space) qf1 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net1, data_parallel=args.data_parallel, parallel_dim=0) targ_qf_net1 = QNet(ob_skill_space, action_space) targ_qf_net1.load_state_dict(qf_net1.state_dict()) targ_qf1 = DeterministicSAVfunc( ob_skill_space, action_space, targ_qf_net1, data_parallel=args.data_parallel, parallel_dim=0) qf_net2 = QNet(ob_skill_space, action_space) qf2 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net2, data_parallel=args.data_parallel, parallel_dim=0) targ_qf_net2 = QNet(ob_skill_space, action_space) targ_qf_net2.load_state_dict(qf_net2.state_dict()) targ_qf2 = DeterministicSAVfunc( ob_skill_space, action_space, targ_qf_net2, data_parallel=args.data_parallel, parallel_dim=0) qfs = [qf1, qf2] targ_qfs = [targ_qf1, targ_qf2] log_alpha = nn.Parameter(torch.ones((), device=device))
observation_space = env.real_observation_space skill_space = env.skill_space ob_skill_space = env.observation_space action_space = env.action_space ob_dim = ob_skill_space.shape[0] - args.num_skill device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda) device = torch.device(device_name) set_device(device) # policy pol_net = PolNet(ob_skill_space, action_space) pol = GaussianPol(ob_skill_space, action_space, pol_net) # q-function qf_net1 = QNet(ob_skill_space, action_space) qf1 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net1) targ_qf_net1 = QNet(ob_skill_space, action_space) targ_qf_net1.load_state_dict(qf_net1.state_dict()) targ_qf1 = DeterministicSAVfunc(ob_skill_space, action_space, targ_qf_net1) qf_net2 = QNet(ob_skill_space, action_space) qf2 = DeterministicSAVfunc(ob_skill_space, action_space, qf_net2) targ_qf_net2 = QNet(ob_skill_space, action_space) targ_qf_net2.load_state_dict(qf_net2.state_dict()) targ_qf2 = DeterministicSAVfunc(ob_skill_space, action_space, targ_qf_net2) qfs = [qf1, qf2] targ_qfs = [targ_qf1, targ_qf2] log_alpha = nn.Parameter(torch.ones((), device=device)) high = np.array([np.finfo(np.float32).max]*f_dim) f_space = gym.spaces.Box(-high, high, dtype=np.float32)
ob_space = env.observation_space ac_space = env.action_space pol_net = PolNetLSTM(ob_space, ac_space) pol = GaussianPol(ob_space, ac_space, pol_net, rnn=True, data_parallel=args.data_parallel, parallel_dim=1) qf_net1 = QNetLSTM(ob_space, ac_space) qf1 = DeterministicSAVfunc(ob_space, ac_space, qf_net1, rnn=True, data_parallel=args.data_parallel, parallel_dim=1) targ_qf_net1 = QNetLSTM(ob_space, ac_space) targ_qf_net1.load_state_dict(qf_net1.state_dict()) targ_qf1 = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net1, rnn=True, data_parallel=args.data_parallel, parallel_dim=1) qf_net2 = QNetLSTM(ob_space, ac_space) qf2 = DeterministicSAVfunc(ob_space, ac_space, qf_net2,
score_file = os.path.join(args.log, 'progress.csv') logger.add_tabular_output(score_file) logger.add_tensorboard_output(args.log) env = GymEnv(args.env_name, log_dir=os.path.join( args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) observation_space = env.observation_space action_space = env.action_space pol_net = PolNetLSTM(observation_space, action_space) pol = GaussianPol(observation_space, action_space, pol_net, rnn=True) qf_net1 = QNetLSTM(observation_space, action_space) qf1 = DeterministicSAVfunc(observation_space, action_space, qf_net1, rnn=True) targ_qf_net1 = QNetLSTM(observation_space, action_space) targ_qf_net1.load_state_dict(qf_net1.state_dict()) targ_qf1 = DeterministicSAVfunc( observation_space, action_space, targ_qf_net1, rnn=True) qf_net2 = QNetLSTM(observation_space, action_space) qf2 = DeterministicSAVfunc(observation_space, action_space, qf_net2, rnn=True) targ_qf_net2 = QNetLSTM(observation_space, action_space) targ_qf_net2.load_state_dict(qf_net2.state_dict()) targ_qf2 = DeterministicSAVfunc( observation_space, action_space, targ_qf_net2, rnn=True) qfs = [qf1, qf2] targ_qfs = [targ_qf1, targ_qf2]
ob_space = env.observation_space ac_space = env.action_space pol_net = PolNet(ob_space, ac_space, args.h1, args.h2, deterministic=True) noise = OUActionNoise(ac_space) pol = DeterministicActionNoisePol(ob_space, ac_space, pol_net, noise) targ_pol_net = PolNet(ob_space, ac_space, args.h1, args.h2, deterministic=True) targ_pol_net.load_state_dict(pol_net.state_dict()) targ_noise = OUActionNoise(ac_space.shape) targ_pol = DeterministicActionNoisePol( ob_space, ac_space, targ_pol_net, targ_noise) qf_net = QNet(ob_space, ac_space, args.h1, args.h2) qf = DeterministicSAVfunc(ob_space, ac_space, qf_net) targ_qf_net = QNet(ob_space, ac_space, args.h1, args.h2) targ_qf_net.load_state_dict(qf_net.state_dict()) targ_qf = DeterministicSAVfunc(ob_space, ac_space, targ_qf_net) sampler = EpiSampler(env, pol, num_parallel=args.num_parallel, seed=args.seed) optim_pol = torch.optim.Adam(pol_net.parameters(), args.pol_lr) optim_qf = torch.optim.Adam(qf_net.parameters(), args.qf_lr) off_traj = Traj(args.max_steps_off) total_epi = 0 total_step = 0 max_rew = -1e6
device = torch.device(device_name) set_device(device) # policy pol_net = PolNet(ob_skill_space, ac_space) pol = GaussianPol(ob_skill_space, ac_space, pol_net, data_parallel=args.data_parallel, parallel_dim=0) # q-function qf_net1 = QNet(ob_skill_space, ac_space) qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net1, data_parallel=args.data_parallel, parallel_dim=0) targ_qf_net1 = QNet(ob_skill_space, ac_space) targ_qf_net1.load_state_dict(qf_net1.state_dict()) targ_qf1 = DeterministicSAVfunc(ob_skill_space, ac_space, targ_qf_net1, data_parallel=args.data_parallel, parallel_dim=0) qf_net2 = QNet(ob_skill_space, ac_space) qf2 = DeterministicSAVfunc(ob_skill_space, ac_space, qf_net2, data_parallel=args.data_parallel, parallel_dim=0)
env = GymEnv(env, log_dir=os.path.join(args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) # 観測と行動の次元 observation_space = env.observation_space action_space = env.action_space print('obs: {0}, act: {1}'.format(observation_space, action_space)) # Q-Network print('Qnet') qf_net = QTOptNet(observation_space, action_space) qf = DeterministicSAVfunc( flattend_observation_space, action_space, qf_net, data_parallel=args.data_parallel) # 決定的行動状態価値関数?q-netの出力の形を少し整える # target Q network theta1 print('target1_net') targ_qf1_net = QTOptNet(observation_space, action_space) targ_qf1_net.load_state_dict(qf_net.state_dict()) # model(重み)をロード(q-netからコピー) targ_qf1 = CEMDeterministicSAVfunc( flattend_observation_space, action_space, targ_qf1_net, num_sampling=args.num_sampling, num_best_sampling=args.num_best_sampling, num_iter=args.num_iter, multivari=args.multivari,
env.env.seed(args.seed) observation_space = env.observation_space action_space = env.action_space pol_net = PolNet(observation_space, action_space) pol = GaussianPol(observation_space, action_space, pol_net, data_parallel=args.data_parallel, parallel_dim=0) qf_net1 = QNet(observation_space, action_space) qf1 = DeterministicSAVfunc(observation_space, action_space, qf_net1, data_parallel=args.data_parallel, parallel_dim=0) targ_qf_net1 = QNet(observation_space, action_space) targ_qf_net1.load_state_dict(qf_net1.state_dict()) targ_qf1 = DeterministicSAVfunc(observation_space, action_space, targ_qf_net1, data_parallel=args.data_parallel, parallel_dim=0) qf_net2 = QNet(observation_space, action_space) qf2 = DeterministicSAVfunc(observation_space, action_space, qf_net2, data_parallel=args.data_parallel,