def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, xml_models, episodes, steps):
    """Compare a pre-trained meta-policy against a no-meta baseline on
    RoboschoolInvertedPendulum-v1 variants.

    Two passes are run: k == 0 loads the meta-policy from ``meta_actor`` /
    ``meta_critic``; k == 1 trains the same agents with no meta-policy.
    For each XML model in ``xml_models``, ``num_samples`` PPO agents are
    trained; per-domain returns are pickled and agent models saved every
    100 episodes.

    Args:
        meta_actor: saved meta-policy actor model (path/handle consumed by
            ``load_model`` — exact type depends on the learning algorithm).
        meta_critic: saved meta-policy critic model.
        save_directory: output directory, created if missing.
        xml_models: list of robot model XML definitions, one per domain.
        episodes: training episodes per pass.
        steps: steps per episode.
    """
    alpha = 1e-3
    beta = 1e-2
    # Probe a single step to discover observation/action dimensionality.
    env = gym.make("RoboschoolInvertedPendulum-v1")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    num_features = obs.shape[0]
    num_actions = env.action_space.low.shape[0]
    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)
    num_samples = 5
    # BUG FIX: was `range(1)`, which made the k == 1 (no-meta baseline)
    # branch unreachable even though the "no_meta_*" output names branch
    # on it — the function never actually compared against the baseline.
    for k in range(2):
        if k == 0:
            meta = MetaPolicy(num_features=num_features,
                              num_actions=num_actions,
                              algo="PPO", alpha=1e-3, beta=1e-3, env=env)
            meta.learning_algorithm.load_model(meta_actor, meta_critic)
        else:
            meta = None  # baseline: agents learn without meta-policy guidance
        agents = []
        for model in xml_models:
            domain_agents = []
            for _ in range(num_samples):
                env = gym.make("RoboschoolInvertedPendulum-v1")
                env.env.model_xml = model
                agent = LinearAgent(env, meta_policy=meta, algo="PPO",
                                    alpha=alpha, beta=beta)
                agent.learning_algorithm.t_length = 16
                agent.learning_algorithm.update_steps = 64
                agent.learning_algorithm.epochs = 8
                domain_agents.append(agent)
            agents.append(domain_agents)
        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsInvertedPendulum._run_episode(
                domain_agents=agents, num_steps=steps, optimize_meta=False)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                # One return (sum of rewards) per sampled agent in domain i.
                sample_returns = [sum(trajectories_by_domain[i][j])
                                  for j in range(len(trajectories_by_domain[i]))]
                domain_samples.append(sample_returns)
            print("Episode %d / %d" % (ep, episodes))
            domain_rewards_by_episode[ep] = domain_samples
            if ep % 100 == 0:
                filename = ("meta_test_" + str(ep) + ".pkl" if k == 0
                            else "no_meta_test_" + str(ep) + ".pkl")
                # `with` closes the handle; the original leaked it via
                # pickle.dump(..., open(...)).
                with open(save_directory + "/" + filename, "wb") as f:
                    pickle.dump(domain_rewards_by_episode, f)
                for ai, a in enumerate(agents):
                    type_ = "meta_" if k == 0 else "no_meta_"
                    type_ += str(ai) + "_"
                    # Save only the first sample agent per domain.
                    a[0].learning_algorithm.save_model(
                        save_directory + "/" + type_, ep)
def experiment_train_meta(save_directory, meta_alpha, meta_beta, xml_models):
    """Train a PPO meta-policy across RoboschoolInvertedPendulum-v1 domains.

    Runs ``meta_train_episodes`` outer trials; each trial rebuilds fresh
    agents (``num_samples`` per XML model) that share the single meta-policy,
    then trains for 250 episodes of 500 steps. The meta-policy model and the
    per-domain returns are checkpointed every 100 episodes.

    Args:
        save_directory: output directory, created if missing.
        meta_alpha: meta-policy actor learning rate.
        meta_beta: meta-policy critic learning rate.
        xml_models: list of robot model XML definitions, one per domain.
    """
    alpha = 1e-3
    beta = 1e-2
    # Probe a single step to discover observation/action dimensionality.
    env = gym.make("RoboschoolInvertedPendulum-v1")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    num_features = obs.shape[0]
    num_actions = env.action_space.low.shape[0]
    meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                      algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)
    meta.learning_algorithm.t_length = 32
    meta.learning_algorithm.update_steps = 64
    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)
    meta_train_episodes = 30
    num_samples = 4
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        # NOTE(review): the original also built a `trial_by_domain` dict of
        # empty lists here but never read or filled it (unlike the Animat /
        # CartPole variants, which feed it to montecarlo_update) — removed
        # as dead code.
        for model in xml_models:
            domain_agents = []
            for _ in range(num_samples):
                env = gym.make("RoboschoolInvertedPendulum-v1")
                env.env.model_xml = model
                agent = LinearAgent(env, meta_policy=meta, algo="PPO",
                                    alpha=alpha, beta=beta)
                agent.learning_algorithm.t_length = 16
                agent.learning_algorithm.update_steps = 64
                agent.learning_algorithm.epochs = 8
                domain_agents.append(agent)
            agents.append(domain_agents)
        print("Done loading...")
        episodes = 250
        steps = 500
        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsInvertedPendulum._run_episode(
                domain_agents=agents, num_steps=steps, r_maxs=None)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                # One return (sum of rewards) per sampled agent in domain i.
                sample_returns = [sum(trajectories_by_domain[i][j])
                                  for j in range(len(trajectories_by_domain[i]))]
                domain_samples.append(sample_returns)
            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples
            if ep % 100 == 0:
                # Global checkpoint index across trials.
                val = (k * episodes) + ep
                meta.learning_algorithm.save_model(save_directory + "/", val)
                # `with` closes the handle; the original leaked it via
                # pickle.dump(..., open(...)).
                with open(save_directory + "/trajectory_iter_" + str(val)
                          + ".pkl", "wb") as f:
                    pickle.dump(domain_rewards_by_episode, f)
def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups, episodes, steps):
    """Compare a pre-trained meta-policy against a no-meta baseline on
    Animat maze environments.

    Two passes are run: k == 0 loads the meta-policy; k == 1 uses none.
    For each maze ``setup``, ``num_samples`` PPO agents are trained on a
    Fourier-basis-wrapped environment. Besides per-domain rewards, the
    function counts "null actions" — actions whose physical effect is
    near zero in both components — and pickles both every 10 episodes.

    Args:
        meta_actor: saved meta-policy actor model (consumed by ``load_model``).
        meta_critic: saved meta-policy critic model.
        save_directory: output directory, created if missing.
        setups: list of maze setup files/descriptors, one per domain.
        episodes: training episodes per pass.
        steps: steps per episode.
    """
    alpha = 1e-4
    beta = 1e-3
    basis_order = 3
    # Probe one step to size the Fourier feature vector and action space.
    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n
    env = EnvWrapper(env, basis_order=basis_order, normalization=1)
    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)
    num_samples = 3
    for k in range(2):
        # BUG FIX: the original set `k = 1` here (debug leftover), which
        # forced the no-meta branch on BOTH passes — the meta-policy was
        # never actually evaluated.
        if k == 0:
            meta = MetaPolicy(num_features=num_features,
                              num_actions=num_actions,
                              algo="PPO", alpha=1e-3, beta=1e-3, env=env)
            meta.learning_algorithm.load_model(meta_actor, meta_critic)
        else:
            meta = None  # baseline: agents learn without meta-policy guidance
        agents = []
        for setup in setups:
            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order,
                                 normalization=1)
                agent = LinearAgent(env, meta_policy=meta, algo="PPO",
                                    alpha=alpha, beta=beta)
                agent.learning_algorithm.t_length = 8
                agent.learning_algorithm.update_steps = 64
                agent.learning_algorithm.epochs = 4
                agent.learning_algorithm.batch_size = 16
                domain_agents.append(agent)
            agents.append(domain_agents)
        domain_rewards_by_episode = {}
        null_action_by_episode = {}
        for ep in range(episodes):
            null_actions = {}
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        action, explore = t['action']
                        a = agents[i][j].env.env.action_space.actions[action]
                        effect = agents[i][j].env.env.animat._action_effect(a)
                        # An action is "null" when it barely moves the animat.
                        if math.fabs(effect[0]) < 0.1 and math.fabs(
                                effect[1]) < 0.1:
                            null_actions[action] = null_actions.get(
                                action, 0) + 1
                        t_rewards.append(t['reward'])
                    sample_rewards.append(sum(t_rewards))
                domain_samples.append(sample_rewards)
            print("Episode %d" % (ep))
            domain_rewards_by_episode[ep] = domain_samples
            null_action_by_episode[ep] = null_actions
            if ep % 10 == 0:
                filename = ("meta_test_" + str(ep) + ".pkl" if k == 0
                            else "no_meta_test_" + str(ep) + ".pkl")
                filename2 = ("null_actions_meta_" + str(ep) + ".pkl"
                             if k == 0
                             else "null_actions_no_meta_" + str(ep) + ".pkl")
                # `with` closes the handles; the original leaked both via
                # pickle.dump(..., open(...)).
                with open(save_directory + "/" + filename, "wb") as f:
                    pickle.dump(domain_rewards_by_episode, f)
                with open(save_directory + "/" + filename2, "wb") as f:
                    pickle.dump(null_action_by_episode, f)
                for ai, a in enumerate(agents):
                    type_ = "meta_" if k == 0 else "no_meta_"
                    type_ += str(ai) + "_"
                    # Save only the first sample agent per domain.
                    a[0].learning_algorithm.save_model(
                        save_directory + "/" + type_, ep)
def experiment_train_meta(save_directory, meta_alpha, meta_beta):
    """Train a meta-policy across recorded Animat maze domains.

    For each outer trial, fresh agents (``num_samples`` per entry of
    ``ExperimentsAnimat.RECORDED_DATA``) are built sharing one meta-policy.
    Rewards are normalized by each domain's ``max_r`` before being
    accumulated into trajectories for a REINFORCE-style meta update.
    Checkpoints every 10 episodes.

    Args:
        save_directory: output directory, created if missing.
        meta_alpha: meta-policy actor learning rate.
        meta_beta: meta-policy critic learning rate.
    """
    alpha = 1e-4
    beta = 1e-3
    # Probe one step to size the feature vector and action space.
    gym_env = AnimatEnv("./CustomEnvironments/maze1.txt")
    gym_env.reset()
    # basis_order 0 means the wrapper passes raw observations through
    # (previously ExperimentsAnimat.RECORDED_DATA[0]['order']).
    basis_order = 0
    (obs, reward, done, info) = gym_env.step(gym_env.action_space.sample())
    num_features = obs.shape[0]
    num_actions = gym_env.action_space.n
    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
    meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                      algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)
    meta.learning_algorithm.t_length = 32
    meta.learning_algorithm.update_steps = 256
    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)
    meta_train_episodes = 30
    num_samples = 3
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsAnimat.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            # `episodes` intentionally keeps the value from the LAST
            # recorded domain; `steps` is a hard-coded override of
            # d['max_steps'] (experiment tuning).
            episodes = d['episodes']
            steps = 600
            basis_order = d['order']
            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order,
                                 normalization=1)
                agent = LinearAgent(env, meta_policy=meta, algo="PPO",
                                    alpha=alpha, beta=beta)
                agent.learning_algorithm.t_length = 32
                agent.learning_algorithm.update_steps = 128
                agent.learning_algorithm.epochs = 4
                agent.learning_algorithm.batch_size = 16
                domain_agents.append(agent)
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")
        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps, r_maxs=r_maxs)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        # Record the raw reward, then normalize in place
                        # before the step feeds the meta update.
                        t_rewards.append(t['reward'])
                        t['reward'] = t['reward'] / r_maxs[i]
                        trial_by_domain[i][j].append(t)
                    sample_rewards.append(sum(t_rewards))
                domain_samples.append(sample_rewards)
            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples
            if ep % 10 == 0:
                # Global checkpoint index across trials.
                val = (k * episodes) + ep
                meta.learning_algorithm.save_model(save_directory + "/", val)
                # `with` closes the handle; the original leaked it via
                # pickle.dump(..., open(...)).
                with open(save_directory + "/trajectory_iter_" + str(val)
                          + ".pkl", "wb") as f:
                    pickle.dump(domain_rewards_by_episode, f)
        # Flatten per-domain, per-sample trajectories for the meta update.
        trajectories = []
        for key in trial_by_domain.keys():
            for traj in trial_by_domain[key]:
                trajectories.append(traj)
        # Only the REINFORCE meta-policy takes a Monte-Carlo batch update
        # here; PPO updates happen inside _run_episode.
        if meta.algo == "REINFORCE":
            print("Updating meta....")
            meta.montecarlo_update(trajectories)
def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups, episodes, steps):
    """Compare a pre-trained meta-policy against a no-meta baseline on
    CartPole-v0 variants.

    Two passes are run: k == 0 loads the meta-policy; k == 1 uses none
    (and disables random exploration on the baseline agents). Each
    ``setup`` dict overrides the CartPole physics (force, pole length,
    masses). Per-step reward lists are pickled every 100 episodes.

    Args:
        meta_actor: saved meta-policy actor model (consumed by ``load_model``).
        meta_critic: saved meta-policy critic model.
        save_directory: output directory, created if missing.
        setups: list of dicts with keys "force", "pole_length",
            "masscart", "masspole".
        episodes: training episodes per pass.
        steps: steps per episode.
    """
    alpha = 0.001
    basis_order = 3
    # Probe one step to size the Fourier feature vector and action space.
    env = gym.make('CartPole-v0')
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n
    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)
    num_samples = 5
    for k in range(2):
        if k == 0:
            meta = MetaPolicy(num_features=num_features,
                              num_actions=num_actions,
                              algo="PPO", alpha=1e-3, beta=1e-3, env=env)
            meta.learning_algorithm.load_model(meta_actor, meta_critic)
        else:
            meta = None  # baseline: agents learn without meta-policy guidance
        agents = []
        for setup in setups:
            domain_agents = []
            for _ in range(num_samples):
                gym_env = gym.make('CartPole-v0')
                gym_env.env.force_mag = setup["force"]
                gym_env.env.length = setup["pole_length"]
                gym_env.env.masscart = setup["masscart"]
                gym_env.env.masspole = setup["masspole"]
                env = EnvWrapper(gym_env, basis_order=basis_order,
                                 normalization=0)
                agent = LinearAgent(env, meta_policy=meta, alpha=alpha,
                                    algo="PPO")
                if meta is None:
                    # Pure-PPO baseline: no epsilon-random actions.
                    agent.random_action_prob = 0.0
                domain_agents.append(agent)
            agents.append(domain_agents)
        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsCartpole._run_episode(
                domain_agents=agents, num_steps=steps, optimize_meta=False)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    # NOTE: unlike the other experiments, this stores the
                    # per-step reward LIST, not its sum.
                    t_rewards = [t['reward']
                                 for t in trajectories_by_domain[i][j]]
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)
            print("Episode %d" % (ep))
            domain_rewards_by_episode[ep] = domain_samples
            if ep % 100 == 0:
                # NOTE(review): "meta_test" lacks the underscore the sibling
                # experiments use; kept as-is since downstream tooling may
                # already expect this name.
                filename = ("meta_test" + str(ep) + ".pkl" if k == 0
                            else "no_meta_test_" + str(ep) + ".pkl")
                # `with` closes the handle; the original leaked it via
                # pickle.dump(..., open(...)).
                with open(save_directory + "/" + filename, "wb") as f:
                    pickle.dump(domain_rewards_by_episode, f)
def experiment_train_meta(save_directory, meta_alpha, meta_beta):
    """Train a meta-policy across recorded CartPole-v0 domains.

    For each of ``meta_train_episodes`` outer trials, fresh agents
    (``num_samples`` per entry of ``ExperimentsCartpole.RECORDED_DATA``)
    are built sharing one meta-policy; rewards are normalized by each
    domain's ``max_r`` and the collected trajectories feed a
    REINFORCE-style meta update. The meta model and reward history are
    saved once per trial.

    Args:
        save_directory: output directory, created if missing.
        meta_alpha: meta-policy actor learning rate.
        meta_beta: meta-policy critic learning rate.
    """
    # Probe one step to size the Fourier feature vector and action space.
    gym_env = gym.make('CartPole-v0')
    gym_env.reset()
    basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']
    (obs, reward, done, info) = gym_env.step(
        gym_env.env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = gym_env.env.action_space.n
    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
    meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                      algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)
    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)
    meta_train_episodes = 500
    num_samples = 3
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            # `episodes`/`steps` intentionally keep the values from the
            # LAST recorded domain.
            episodes = d['episodes']
            steps = d['max_steps']
            basis_order = d['order']
            # Hard-coded override of d['alpha'] (experiment tuning).
            alpha = 0.0001
            domain_agents = []
            for _ in range(num_samples):
                gym_env = gym.make('CartPole-v0')
                gym_env.env.force_mag = setup["force"]
                gym_env.env.length = setup["pole_length"]
                gym_env.env.masscart = setup["masscart"]
                gym_env.env.masspole = setup["masspole"]
                env = EnvWrapper(gym_env, basis_order=basis_order,
                                 normalization=0)
                agent = LinearAgent(env, meta_policy=meta, alpha=alpha,
                                    algo="PPO")
                domain_agents.append(agent)
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")
        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsCartpole._run_episode(
                domain_agents=agents, num_steps=steps, r_maxs=r_maxs)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        # Record the raw reward, then normalize in place
                        # before the step feeds the meta update.
                        t_rewards.append(t['reward'])
                        t['reward'] = t['reward'] / r_maxs[i]
                        trial_by_domain[i][j].append(t)
                    # NOTE: stores the per-step reward LIST, not its sum.
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)
            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples
        # Flatten per-domain, per-sample trajectories for the meta update.
        trajectories = []
        for key in trial_by_domain.keys():
            for traj in trial_by_domain[key]:
                trajectories.append(traj)
        # Only the REINFORCE meta-policy takes a Monte-Carlo batch update
        # here; PPO updates happen inside _run_episode.
        if meta.algo == "REINFORCE":
            print("Updating meta....")
            meta.montecarlo_update(trajectories)
        meta.learning_algorithm.save_model(save_directory + "/", k)
        # `with` closes the handle; the original leaked it via
        # pickle.dump(..., open(...)).
        with open(save_directory + "/trajectory_iter_" + str(k) + ".pkl",
                  "wb") as f:
            pickle.dump(domain_rewards_by_episode, f)