# Common imports assumed by the snippets in this file (CS294-style behavioral
# cloning scripts). Some later snippets additionally assume torch, matplotlib
# (plt), tqdm, and project-local helpers such as load_policy and tf_util.
import os
import pickle

import gym
import numpy as np
import tensorflow as tf

import load_policy
import tf_util


def expert(expert_policy_file, envname, render=False, max_timesteps=None, num_rollouts=20):
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                # if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        # print('returns', returns)
        # print('mean return', np.mean(returns))
        # print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }

        return expert_data
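# Usage sketch for the function above (hypothetical paths; assumes a pickled
# expert policy readable by load_policy and a matching MuJoCo Gym env):
#
#   data = expert('experts/Hopper-v1.pkl', 'Hopper-v1', num_rollouts=5)
#   print(data['observations'].shape)  # (total_steps, obs_dim)
#   print(data['actions'].shape)       # (total_steps, 1, act_dim)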
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    # Here the "policy file" is a Python source file that defines
    # SmallReactivePolicy (Roboschool-style zoo policy). Executing it under a
    # dummy __name__ keeps its `if __name__ == '__main__'` demo from running.
    with open(args.expert_policy_file, 'r') as f:
        globals()['__name__'] = 'foo'
        exec(f.read(), globals())
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        policy = SmallReactivePolicy(env.observation_space, env.action_space)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy.act(obs)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        # built but not persisted in this variant
        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int, default=1000)
    parser.add_argument('--num_rollouts', type=int, default=100)
    parser.add_argument('--store_data', action='store_true')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }

        if args.store_data:
            with open(os.path.join('expert_data', args.envname + '.pkl'), 'wb') as f:
                pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []  # one list of observations per rollout
        actions = []       # one list of actions per rollout
        for i in range(args.num_rollouts):
            print('iter', i)
            this_obs = []
            this_act = []
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                this_obs.append(obs)
                this_act.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)
            observations.append(this_obs)
            actions.append(this_act)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
        print(np.array(observations).shape)
        print(np.array(actions).shape)

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions),
                       'returns': np.array(returns)}

        with open('/Users/joker/imitation_learning/hopper_policy.pickle', 'wb') as handle:
            pickle.dump(expert_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
def get_data(args, init_observations=None, render=True):
    # if init_observations is None ---> generates expert data
    # if init_observations are fed ---> returns expert actions for those states
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    if init_observations is not None:
        print('initial observations: ', init_observations.shape)
    else:
        print('No initial observations: ')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        obs = env.reset()
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            done = False
            totalr = 0.
            steps = 0
            while not done:
                if init_observations is not None:
                    obs = init_observations[steps]
                action = policy_fn(np.array(obs[None, :]))
                # print(action.shape)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                # print(r)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if init_observations is not None:
                    done = False
                    if steps == len(init_observations):
                        break
                else:
                    if steps >= max_steps:
                        break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }
        return expert_data
def run_expert(envname, render, expert_policy_file, max_timesteps, num_rollouts, store=False):
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions),
            'returns': np.array(returns)
        }

        if store:
            with open('expert_data/{}-{}.pkl'.format(envname, num_rollouts), 'wb') as f:
                pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)

        return returns, expert_data
def run_expert_on_observations(observations, expert_policy_file):
    policy_fn = load_policy.load_policy(expert_policy_file)
    with tf.Session():
        tf_util.initialize()
        actions = []
        for obs in observations:
            action = policy_fn(obs[None, :])
            actions.append(action)
    return np.array(actions)
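# Sketch of the DAgger labeling step this helper supports (variable names are
# illustrative, not from the original source): roll out the current student
# policy, then have the expert relabel the states it visited.
#
#   student_obs = np.array(student_rollout_observations)
#   expert_actions = run_expert_on_observations(student_obs, 'experts/Hopper-v1.pkl')
#   # aggregate (student_obs, expert_actions) into the training set, retrain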
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=50,
                        help='Number of expert roll outs')
    parser.add_argument('--verbose', type=int, choices=[0, 1, 2], default=1,
                        help='Verbose')
    args = parser.parse_args()

    env = gym.make(args.envname)
    max_steps = args.max_timesteps or env.spec.timestep_limit

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            # print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action[0])
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                # if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}

        student = Network(env)
        print("Behavior Cloning....")
        student.train(expert_data['observations'], expert_data['actions'],
                      300, 128, args.verbose)
        print("Generating rollouts from new model..")
        generate_rollouts(env, student, max_steps, args.num_rollouts,
                          args.render, args.verbose)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_data_file', type=str)
    parser.add_argument('expert_norm_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_norm_file)
    print('loaded and built')

    data = load_data(args.expert_data_file)
    X, y = data['observations'], data['actions']
    norm_data = load_data(args.expert_norm_file)
    normed_X = norm_obs(X[None, :], norm_data)

    with tf.Session():
        tf_util.initialize()

        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        model = train(normed_X.squeeze(), y.squeeze(), env, norm_data, policy_fn)
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                x_input = norm_obs(obs[None, :], norm_data)
                action = model.predict(x_input)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
def behavior_cloning(env_name=None, expert_policy_file=None, num_rollouts=10,
                     max_timesteps=None, num_epochs=100, save=None):
    tf.reset_default_graph()

    env = gym.make(env_name)
    max_steps = max_timesteps or env.spec.timestep_limit

    print('[BA] Loading and building expert policy')
    expert_policy_fn = load_policy.load_policy(expert_policy_file)

    print('[BA] Gather experience...')
    data = gather_expert_experience(num_rollouts, env, expert_policy_fn, max_steps)
    print('[BA] Expert\'s reward mean: {:4f}({:4f})'.format(
        np.mean(data['returns']), np.std(data['returns'])))

    print('[BA] Building cloning policy')
    policy = Policy(env, data['observations'])

    with tf.Session():
        tf_util.initialize()
        for epoch in tqdm(range(num_epochs)):
            num_samples = data['observations'].shape[0]
            perm = np.random.permutation(num_samples)
            obs_samples = data['observations'][perm]
            action_samples = data['actions'][perm]

            loss = 0.
            for k in range(0, obs_samples.shape[0], BATCH_SIZE):
                loss += policy.update(obs_samples[k:k + BATCH_SIZE],
                                      action_samples[k:k + BATCH_SIZE])

            new_exp = policy.test_run(env, max_steps)
            tqdm.write('[BA] Epoch {:3d}, Loss {:4f}, Reward {:4f}'.format(
                epoch, loss / num_samples, new_exp['reward']))

        if save is not None:
            env = wrappers.Monitor(env, save, force=True)
        results = []
        for _ in tqdm(range(num_rollouts)):
            results.append(policy.test_run(env, max_steps)['reward'])
        print('[BA] Reward mean & std of cloned policy: {:4f}({:4f})'.format(
            np.mean(results), np.std(results)))

    return (np.mean(data['returns']), np.std(data['returns']),
            np.mean(results), np.std(results))
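# Usage sketch (assumes BATCH_SIZE, Policy, gather_expert_experience, tqdm,
# and gym.wrappers are available at module scope, as the function expects;
# the paths are hypothetical):
#
#   exp_mean, exp_std, bc_mean, bc_std = behavior_cloning(
#       env_name='Hopper-v1',
#       expert_policy_file='experts/Hopper-v1.pkl',
#       num_rollouts=10, num_epochs=50,
#       save='results/hopper_bc')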
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_data', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--train_epochs', type=int, default=10)
    args = parser.parse_args()

    with tf.Session():
        tf_util.initialize()

        with open(os.path.join('expert_data', args.envname + '.pkl'), "rb") as file:
            expert_data = pickle.load(file)

        # clone the observations -> actions mapping
        observations, actions = expert_data['observations'], expert_data['actions']
        print('actions', actions.shape)
        model = build_model(num_actions=actions.shape[-1])
        model.fit(observations, actions[:, 0, :], epochs=args.train_epochs)

        # roll out the cloned model
        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                obs = np.expand_dims(obs, 0)
                action = model.predict(obs)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean', np.mean(returns), 'std', np.std(returns))
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                # print(action, action.shape)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                # print("done", steps)
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}

        with open("/home/sumuk/Desktop/sumuk/homework-master/hw1/data.pkl", "wb") as f:
            pickle.dump(expert_data, f)
def expert(envname, dagger_step, num_rollouts, max_timesteps=None, render=False):
    print('generating expert data ...')

    imitator = tf_reset()
    input_ph, output_ph, output_pred = create_model()
    saver = tf.train.Saver()
    saver.restore(imitator, "dagger/%s.ckpt" % envname)

    policy_fn = load_policy.load_policy('experts/' + envname + '.pkl')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                # The expert labels the state, but the imitator's action
                # drives the environment (the DAgger state distribution).
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                imitation = imitator.run(output_pred, feed_dict={input_ph: obs[None, :]})
                obs, r, done, _ = env.step(imitation)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps >= max_steps:
                    break
            returns.append(totalr)

        mean = np.mean(returns)
        std = np.std(returns)
        print("mean %f, std %f" % (mean, std))

        with open('expert_data/' + envname + '_' + str(dagger_step) + '.pkl', 'rb') as f:
            expert_data = pickle.loads(f.read())
        expert_data['observations'] = np.concatenate(
            (expert_data['observations'], np.array(observations)))
        expert_data['actions'] = np.concatenate(
            (expert_data['actions'], np.array(actions)))
        with open('expert_data/' + envname + '_' + str(dagger_step + 1) + '.pkl', 'wb') as f:
            pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)

        return mean, std
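# DAgger bookkeeping implied by the function above (a sketch; filenames follow
# the expert_data/<env>_<step>.pkl convention it uses): each call reads dataset
# k, appends expert labels for the imitator's own rollouts, and writes dataset
# k+1, so the aggregated dataset grows monotonically.
#
#   for k in range(5):  # number of DAgger iterations, illustrative
#       mean, std = expert('Hopper-v1', dagger_step=k, num_rollouts=10)
#       # retrain the imitator on expert_data/Hopper-v1_<k+1>.pkl here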
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('rollout_data', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--num_epochs', type=int, default=50)
    parser.add_argument('--log_dir', type=str)
    args = parser.parse_args()

    print('loading rollout data')
    with open(args.rollout_data, 'rb') as f:
        data = pickle.loads(f.read())
    observation_data = np.array(data['observations'])
    action_data = np.array(data['actions'])

    print('training supervised model')
    import gym
    env = gym.make(args.envname)
    cloning_policy = Model(env)
    cloning_policy.train(observation_data, action_data, args.num_epochs)

    print('running policy')
    with tf.Session():
        tf_util.initialize()
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = cloning_policy.predict(obs)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
def get_expert_data(args):
    """Return expert rollout data, either loaded from disk
    (args.load_expert_data) or freshly generated by running the expert."""
    if args.load_expert_data:
        expert_data = load_expert_data(args.load_expert_data)
    else:
        env, _ = util.get_env(args.env_name)
        with tf.Session():
            tf_util.initialize()
            expert_data = ExpertPolicy(env, args).run_expert()
    return expert_data
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--expert_policy_file', type=str,
                        default="experts/Hopper-v1.pkl")
    parser.add_argument('--envname', type=str, default="Hopper-v1")
    parser.add_argument('--render', action='store_true', default=False)
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=1000,
                        help='Number of expert roll outs')
    parser.add_argument('--out_file', type=str, help='save expert data to file')
    args = vars(parser.parse_args())

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args['expert_policy_file'])
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        env = gym.make(args['envname'])
        max_steps = args['max_timesteps'] or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args['num_rollouts']):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args['render']:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}

        if args['out_file'] is not None:
            with open(args['out_file'], 'wb') as f:
                pickle.dump(expert_data, f)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                # env.step returns:
                #   obs:  object, an environment-specific object representing
                #         the observation of the environment
                #   r:    float, reward obtained from the action
                #   done: boolean, whether it's time to reset the environment
                #   info: dict, diagnostic information (discarded here)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}
def label_observations(observations, policy_fn):
    with tf.Session():
        tf_util.initialize()
        actions = []
        for obs in observations:
            action = policy_fn(obs[None, :])
            actions.append(action)
    return np.array(actions)
def load_model(envname, expert, max_timesteps=None):
    """Build a model from one expert-labeled observation.

    `expert` is a policy function mapping a batched observation to actions;
    both it and `max_timesteps` were free names in the original and are made
    explicit parameters here.
    """
    env = gym.make(envname)
    # Gather expert experiences
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)):
        with tf.device('/gpu:0'):
            tf_util.initialize()
            max_steps = max_timesteps or env.spec.timestep_limit  # retained, unused below
            obs = env.reset()
            exp_action = expert(obs[None, :])[0]
            model = build_model([obs], [exp_action])
    return model
def view_expert(policy, data_path):
    # NOTE: relies on a module-level `args` namespace (envname, max_timesteps,
    # num_rollouts, render) being defined before this is called.
    policy_fn = policy
    their_data_path = data_path

    with tf.Session():
        tf_util.initialize()

        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        steps_numbers = []
        for i in range(args.num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            steps_numbers.append(steps)
            returns.append(totalr)

        observations = np.array(observations)
        actions = np.array(actions)

        global observations_shape, actions_shape
        observations_shape = observations.shape[1]
        actions_shape = actions.shape[2]

        expert_data = {
            'observations': observations,
            'actions': actions,
            'returns': np.array(returns),
            'steps': np.array(steps_numbers)
        }
        # print('expert_data', expert_data)
        with open(their_data_path, 'wb') as f:
            pickle.dump(expert_data, f)
def run_exp_on_ours(env_name, obs, render=False):
    with tf.Session():
        tf_util.initialize()
        actions = []
        policy_fn = load_policy_fn(env_name)
        print("Running expert policy on our observations")
        for ob in tqdm.tqdm(obs):
            action = policy_fn(ob[None, :])
            actions.append(action)
        return actions
def main():
    # Load the expert policy. ['GaussianPolicy', 'nonlin_type']
    expert_policy_file = 'experts/Humanoid-v2.pkl'
    envname = 'Humanoid-v2'
    num_rollouts = 20
    max_timesteps = None
    render = True

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        # get data. ['observations', 'actions']
        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}

        # store the data
        with open(os.path.join('expert_data', envname + '.pkl'), 'wb') as f:
            pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
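# Layout of the pickle written above (a sketch, matching the dict built in
# main; the extra middle axis on actions comes from policy_fn returning a
# (1, act_dim) batch per step):
#
#   with open('expert_data/Humanoid-v2.pkl', 'rb') as f:
#       d = pickle.load(f)
#   d['observations']  # shape (total_steps, obs_dim)
#   d['actions']       # shape (total_steps, 1, act_dim)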
def test_model(behavior_clone, envname, render=True, max_timesteps=1000,
               num_rollouts=20, get_expert_data=False):
    with tf.Session():
        tf_util.initialize()
        behavior_clone.model = load_model(behavior_clone.get_file_name())
        return helper(behavior_clone.predict, envname, render, max_timesteps,
                      num_rollouts, get_expert_data)
def one_rollout(sess, env, file_):
    # NOTE: `args` and `num_rollouts` are assumed to be defined at module
    # scope by the surrounding script.
    max_steps = env.spec.timestep_limit
    print(max_steps)

    policy = GaussianMLPPolicy(env, hidden_sizes=args.policy_size,
                               activation=tf.nn.tanh)
    tf_util.initialize()
    policy_params = joblib.load(file_)
    # print([x.name for x in policy.get_params()])
    policy.set_param_values(sess, policy_params)

    ret = []
    policy_fn = lambda x: policy.act(x, sess, eval=True)[0]
    for i in range(num_rollouts):
        print('iter', i)
        obs = env.reset()
        done = False
        totalr = 0.
        steps = 0
        returns = []
        rewards = []
        observations = []
        actions = []
        while not done:
            action = policy_fn(obs)
            observations.append(obs)
            actions.append(action)
            obs, r, done, _ = env.step(action)
            # print(done)
            rewards.append(r)
            totalr += r
            steps += 1
            env.render()
            if steps % 100 == 0:
                print("%i/%i" % (steps, max_steps))
            if steps >= max_steps:
                break
        returns.append(totalr)
        ret.append({
            'observations': np.array(observations),
            'actions': np.array(actions),
            'rewards': np.array(rewards),
            'mean_return': np.mean(returns),
            'std_return': np.std(returns)
        })

    returns = [ele['mean_return'] for ele in ret]
    print('returns', returns)
    print('mean return', np.mean(returns))
    print('std of return', np.std(returns))
    return ret
def gather_expert_data(expert_policy_file, envname, render, num_rollouts):
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        steps_numbers = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                # if steps % 100 == 0: print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            steps_numbers.append(steps)
            returns.append(totalr)

        # print('returns', returns)
        # print('mean return', np.mean(returns))
        # print('std of return', np.std(returns))

        # Assemble the expert data here
        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions),
            'returns': np.array(returns),
            'steps': np.array(steps_numbers)
        }
        return expert_data, np.mean(returns), np.std(returns)
def generate_expert_data(envname, max_timesteps, expert_policy_file, num_rollouts, save=True):
    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        print('loading and building expert policy')
        policy_fn = load_policy.load_policy(expert_policy_file)
        print('loaded and built')

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                # print("action", action)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        # print('returns', returns)
        # print('mean return', np.mean(returns))
        # print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions),
            'returns': np.array(returns)
        }

        if save:
            # context manager so the file handle is actually closed
            with open("experts/" + envname + '.meta', 'wb') as f:
                pickle.dump(expert_data, f)

        return expert_data
def generate_rollout_data(expert_policy_file, env_name, num_rollouts, render,
                          output_dir=None, save=False, max_timesteps=None):
    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        env = gym.make(env_name)
        max_steps = max_timesteps or env.spec.timestep_limit
        if save:
            expert_results_dir = os.path.join(os.getcwd(), 'results', env_name, 'expert')
            env = wrappers.Monitor(env, expert_results_dir, force=True)

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            print('iter', i)
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions),
                       'mean_return': np.mean(returns),
                       'std_return': np.std(returns)}

        # skip saving when output_dir is unset (None) or the literal string 'None'
        if output_dir and output_dir != 'None':
            output_dir = os.path.join(os.getcwd(), output_dir)
            filename = '{}_data_{}_rollouts.pkl'.format(env_name, num_rollouts)
            with open(os.path.join(output_dir, filename), 'wb') as f:
                pickle.dump(expert_data, f)
def run_expert(expert_policy_file, envname, in_jupyter=False, render=True,
               max_timesteps=None, num_rollouts=20):
    policy_fn = load_policy.load_policy(expert_policy_file)
    print('---------------training ' + envname + '---------------')

    with tf.Session():
        tf_util.initialize()

        env = gym.make(envname)
        max_steps = max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(num_rollouts):
            obs = env.reset()
            done = False
            totalr = 0.
            frames = []
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if render:
                    if in_jupyter:
                        frames.append(env.render(mode='rgb_array'))
                    else:
                        env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            if render and in_jupyter:
                env.render(close=True)
                gym_util.display_frames_as_gif(frames)
            returns.append(totalr)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))

        expert_data = {
            'observations': np.array(observations),
            'actions': np.array(actions)
        }
        save_expert_data(envname, expert_data, returns)
        print('--------------------------------------------\n')
def setup(self):
    """Setup environment and expert. Use as context."""
    import tf_util
    import load_tf_policy
    self.expert = load_tf_policy.load_policy(
        data_path("experts/" + self.envname + "-v1.pkl"))
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config):
        tf_util.initialize()
        yield self
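# Intended use of setup() (a sketch; assumes the owning class wraps it with
# contextlib.contextmanager, which the yield implies):
#
#   with runner.setup() as r:
#       action = r.expert(obs[None, :])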
def labeling(args, observations, policy_fn):
    print("Begin Labeling")
    with tf.Session():
        tf_util.initialize()
        actions = []
        for ob in observations:
            action = policy_fn(ob[None, :])
            actions.append(action)
        assert len(observations) == len(actions)
        new_dataset = tf.data.Dataset.from_tensor_slices(
            (np.array(observations), np.array(actions).squeeze()))
        print('End Labeling')
        return new_dataset
def main(): """ Entry point for the program. """ args = get_args() # Build inference graph # Build training graph with tf.Session() as sess: tf_util.initialize() expert_data = run_expert(args) next_data = gen_input_graph(expert_data) for i in range(10): print(sess.run(next_data)[0].shape) print(sess.run(next_data)[1].shape)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('expert_policy_data', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--num_epochs', type=int, default=50,
                        help='Number of epochs for training')
    args = parser.parse_args()

    print('loading expert policy data for training')
    with open(args.expert_policy_data, 'rb') as handle:
        expert_data = pickle.load(handle)

    # train the network
    torch.manual_seed(25)
    o_expert = expert_data['observations']
    (N, N_step, N_obs) = o_expert.shape
    a_expert = expert_data['actions']
    (N, N_step, _, N_action) = a_expert.shape

    import gym
    env = gym.make(args.envname)
    max_steps = args.max_timesteps or env.spec.timestep_limit

    net = CNN(N_obs, N_action)
    # initialize network parameters
    net.apply(init_weights)

    import torch.optim as optim
    optimizer = optim.Adam(net.parameters(), lr=1e-3, weight_decay=5e-9)
    criterion = nn.MSELoss()

    loss_history = []
    reward_mean_history = []
    reward_std_history = []
    for j in range(args.num_epochs):
        print("epoch %i" % j)
        net.train()
        (N, N_step, N_obs) = o_expert.shape
        (N, N_step, _, N_action) = a_expert.shape
        for k in range(max_steps):
            optimizer.zero_grad()
            index = k
            o = Variable(torch.from_numpy(o_expert[:, index, :]).reshape(N, 1, N_obs))
            o = o.float()
            a_out = net.forward(o)
            a_label = torch.from_numpy(a_expert[:, index, :].reshape(N, N_action, 1))
            loss = criterion(a_out.float(), a_label.float())
            loss.backward()
            optimizer.step()
        print("No DAGGER")
        print(loss / N)
        loss_history.append(loss / N)

        # test the network
        with tf.Session():
            tf_util.initialize()
            import gym
            env = gym.make(args.envname)
            max_steps = args.max_timesteps or env.spec.timestep_limit
            net.eval()
            r_new = []
            for i in range(int(args.num_rollouts) // 4):
                totalr = 0
                obs = env.reset()
                done = False
                steps = 0
                while not done:
                    obs = Variable(torch.Tensor(obs).reshape(1, 1, N_obs))
                    action_new = net.forward(obs).detach().numpy()
                    obs, r, done, _ = env.step(action_new.reshape(N_action))
                    totalr += r
                    steps += 1
                    if steps >= max_steps:
                        break
                r_new.append(totalr)
            u = np.average(np.array(r_new))
            sigma = np.std(np.array(r_new))
            reward_mean_history.append(u)
            reward_std_history.append(sigma)
            print('current reward mean', u)
            print('current reward std', sigma)

    fig0 = plt.figure(0)
    plt.plot(loss_history, '-o')
    plt.xlabel('iteration')
    plt.ylabel('loss')
    fig0.savefig('/Users/joker/imitation_learning/hopper.png')

    reward_mean_history = np.array(reward_mean_history)
    reward_std_history = np.array(reward_std_history)
    # print(reward_mean_history.shape)
    # print(reward_std_history.shape)
    print('mean:', reward_mean_history)
    print('std:', reward_std_history)

    fig1 = plt.figure(1)
    plt.errorbar(np.arange(args.num_epochs), reward_mean_history,
                 reward_std_history, marker="s", mfc='blue', mec='yellow')
    fig1.savefig('/Users/joker/imitation_learning/hopper_reward.png')
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--num_epochs', type=int, default=5,
                        help='Number of epochs for training')
    args = parser.parse_args()

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    with tf.Session():
        tf_util.initialize()

        import gym
        env = gym.make(args.envname)
        max_steps = args.max_timesteps or env.spec.timestep_limit

        returns = []
        observations = []
        actions = []
        for i in range(args.num_rollouts):
            print('iter', i)
            this_obs = []
            this_act = []
            obs = env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                action = policy_fn(obs[None, :])
                this_obs.append(obs)
                this_act.append(action)
                obs, r, done, _ = env.step(action)
                totalr += r
                steps += 1
                if args.render:
                    env.render()
                if steps % 100 == 0:
                    print("%i/%i" % (steps, max_steps))
                if steps >= max_steps:
                    break
            returns.append(totalr)
            observations.append(this_obs)
            actions.append(this_act)

        print('returns', returns)
        print('mean return', np.mean(returns))
        print('std of return', np.std(returns))
        print(np.array(observations).shape)
        print(np.array(actions).shape)

        expert_data = {'observations': np.array(observations),
                       'actions': np.array(actions)}

    # train the network
    o_expert = expert_data['observations']
    (N, N_step, N_obs) = o_expert.shape
    a_expert = expert_data['actions']
    (N, N_step, _, N_action) = a_expert.shape

    net = CNN(N_obs, N_action)
    # initialize network parameters
    net.apply(init_weights)

    import torch.optim as optim
    optimizer = optim.Adam(net.parameters(), lr=1e-3, weight_decay=5e-12)
    criterion = nn.MSELoss()

    loss_history = []
    for j in range(args.num_epochs):
        print("epoch %i" % j)
        (N, N_step, N_obs) = o_expert.shape
        (N, N_step, _, N_action) = a_expert.shape
        for k in range(max_steps):
            optimizer.zero_grad()  # reset gradients each step
            index = k
            o = Variable(torch.from_numpy(o_expert[:, index, :]).reshape(N, 1, N_obs))
            o = o.float()
            a_out = net.forward(o)
            a_label = torch.from_numpy(a_expert[:, index, :].reshape(N, N_action, 1))
            loss = criterion(a_out.float(), a_label.float())
            loss.backward()
            loss_history.append(loss)
            optimizer.step()
        print("before DAGGER")
        print(loss)

        # implement DAgger: run the student, label visited states with the expert
        with tf.Session():
            tf_util.initialize()
            import gym
            env = gym.make(args.envname)
            max_steps = args.max_timesteps or env.spec.timestep_limit

            o_new_expert = []
            a_new_expert = []
            for i in range(int(args.num_rollouts) // 2):
                this_o_new = []
                this_a_new = []
                obs = env.reset()
                done = False
                steps = 0
                while not done:
                    action = policy_fn(obs[None, :])
                    this_o_new.append(obs)
                    this_a_new.append(action)
                    obs = Variable(torch.Tensor(obs).reshape(1, 1, N_obs))
                    action_new = net.forward(obs).detach().numpy()
                    obs, r, done, _ = env.step(action_new.reshape(N_action))
                    steps += 1
                    if steps % 100 == 0:
                        print("%i/%i" % (steps, max_steps))
                    if steps >= max_steps:
                        break
                # if the rollout terminates early, zero-pad both the
                # observation and action lists
                while steps < max_steps:
                    steps += 1
                    this_o_new.append(np.zeros(N_obs))
                    this_a_new.append(np.zeros((1, N_action)))
                o_new_expert.append(this_o_new)
                a_new_expert.append(this_a_new)

            o_new = np.array(o_new_expert)
            a_new = np.array(a_new_expert)
            o_expert = np.concatenate((o_expert, o_new), axis=0)
            a_expert = np.concatenate((a_expert, a_new), axis=0)

    plt.plot(loss_history, '-o')
    plt.xlabel('iteration')
    plt.ylabel('loss')
    plt.savefig('/Users/joker/imitation_learning/humanoid_dagger.png')
    plt.show()
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('expert_policy_file', type=str)
    parser.add_argument('expert_policy_data', type=str)
    parser.add_argument('envname', type=str)
    parser.add_argument('--render', action='store_true')
    parser.add_argument("--max_timesteps", type=int)
    parser.add_argument('--num_rollouts', type=int, default=20,
                        help='Number of expert roll outs')
    parser.add_argument('--num_epochs', type=int, default=50,
                        help='Number of epochs for training')
    args = parser.parse_args()

    print('loading expert policy data for training')
    with open(args.expert_policy_data, 'rb') as handle:
        expert_data = pickle.load(handle)

    # train the network
    torch.manual_seed(25)

    print('loading and building expert policy')
    policy_fn = load_policy.load_policy(args.expert_policy_file)
    print('loaded and built')

    o_expert = torch.Tensor(expert_data['observations'])
    (N, N_obs) = o_expert.size()
    a_expert = torch.Tensor(expert_data['actions'])
    (N, _, N_action) = a_expert.size()
    a_expert = a_expert.view(N, N_action)

    import gym
    env = gym.make(args.envname)
    max_steps = args.max_timesteps or env.spec.timestep_limit

    net = CNN(N_obs, N_action)
    # initialize network parameters
    net.apply(init_weights)

    import torch.optim as optim
    optimizer = optim.Adam(net.parameters(), lr=1e-4, weight_decay=5e-9)
    criterion = nn.MSELoss()

    loss_history = []
    reward_mean_history = []
    reward_std_history = []
    for j in range(args.num_epochs):
        print("epoch %i" % j)
        net.train()
        N = o_expert.shape[0]
        print(N)
        # rebuilt each epoch because DAgger grows the dataset below
        train_set = data_utils.TensorDataset(o_expert, a_expert)
        train_loader = data_utils.DataLoader(dataset=train_set,
                                             batch_size=BATCH_SIZE, shuffle=True)
        epoch_train_loss = 0
        for i, (X_train, y_train) in enumerate(train_loader):
            net.zero_grad()
            y_pred = net.forward(X_train)
            loss = criterion(y_pred, y_train)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item() / N
        print("Before DAGGER")
        print(epoch_train_loss)
        loss_history.append(epoch_train_loss)

        # implement DAgger
        with tf.Session():
            tf_util.initialize()
            import gym
            env = gym.make(args.envname)
            max_steps = args.max_timesteps or env.spec.timestep_limit
            net.eval()

            o_new_expert = []
            a_new_expert = []
            reward_new = []
            for i in range(int(args.num_rollouts) // 4):
                obs = env.reset()
                done = False
                steps = 0
                totalr = 0
                while not done:
                    action = policy_fn(obs[None, :])
                    o_new_expert.append(obs)
                    a_new_expert.append(action)
                    obs = Variable(torch.Tensor(obs).reshape(1, N_obs))
                    action_new = net.forward(obs).detach().numpy()
                    obs, r, done, _ = env.step(action_new.reshape(N_action))
                    totalr += r
                    steps += 1
                    if steps >= max_steps:
                        break
                reward_new.append(totalr)

            o_new = torch.Tensor(np.array(o_new_expert))
            a_new = torch.Tensor(np.array(a_new_expert).reshape(-1, N_action))
            o_expert = torch.cat((o_expert, o_new), 0)
            a_expert = torch.cat((a_expert, a_new), 0)

            reward_new = np.array(reward_new)
            # print(reward_new.shape)
            u = np.average(reward_new)
            sigma = np.std(reward_new)
            print('current reward mean', u)
            print('current reward std', sigma)
            reward_mean_history.append(u)
            reward_std_history.append(sigma)

    fig0 = plt.figure(0)
    plt.plot(loss_history, '-o')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    fig0.savefig('/Users/joker/imitation_learning/hopper_basic_DAGGER.png')

    reward_mean_history = np.array(reward_mean_history)
    reward_std_history = np.array(reward_std_history)
    # print(reward_mean_history.shape)
    # print(reward_std_history.shape)
    print('mean:', reward_mean_history)
    print('std:', reward_std_history)

    fig1 = plt.figure(1)
    plt.errorbar(np.arange(args.num_epochs), reward_mean_history,
                 reward_std_history, marker="s", mfc='blue', mec='yellow')
    fig1.savefig('/Users/joker/imitation_learning/hopper__basic_DAGGERreward.png')