def trainModel(train_data_generator, val_data_generator, model, initial_epoch):
    """
    Model training.

    # Arguments
       train_data_generator: Training data generated batch by batch.
       val_data_generator: Validation data generated batch by batch.
       model: A Model instance.
       initial_epoch: Epoch from which training starts.
    """
    # Configure training process
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=resnet_models.lr_schedule(0, FLAGS.initial_lr)),
                  metrics=['binary_accuracy'],
                  loss_weights=np.ones((15,)).tolist())

    # Save model with the lowest validation loss
    weights_path = os.path.join(FLAGS.experiment_rootdir, 'weights_{epoch:03d}.h5')
    writeBestModel = ModelCheckpoint(filepath=weights_path, monitor='val_loss',
                                     save_best_only=True, save_weights_only=True)

    # Save training and validation losses.
    logz.configure_output_dir(FLAGS.experiment_rootdir)
    saveModelAndLoss = log_utils.MyCallback(filepath=FLAGS.experiment_rootdir)

    # Train model
    steps_per_epoch = int(np.ceil(train_data_generator.samples / FLAGS.batch_size))
    validation_steps = int(np.ceil(val_data_generator.samples / FLAGS.batch_size)) - 1

    lr_scheduler = LearningRateScheduler(resnet_models.lr_schedule)
    lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0,
                                   patience=5, min_lr=0.5e-6)

    strTime = strftime("%Y%b%d_%Hh%Mm%Ss", localtime(time()))
    # tensorboard = TensorBoard(log_dir="logs/{}".format(strTime), histogram_freq=10,
    #                           batch_size=32, write_graph=False, write_grads=True,
    #                           write_images=False, embeddings_freq=0,
    #                           embeddings_layer_names=None, embeddings_metadata=None)
    tensorboard = TensorBoard(log_dir="logs/{}".format(strTime), histogram_freq=0)

    callbacks = [writeBestModel, saveModelAndLoss, lr_reducer, lr_scheduler, tensorboard]

    model.fit_generator(train_data_generator,
                        epochs=FLAGS.epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_data=val_data_generator,
                        validation_steps=validation_steps,
                        initial_epoch=initial_epoch)
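# The compile call and the LearningRateScheduler above both rely on
# resnet_models.lr_schedule(epoch, lr), which is not shown in this section.
# A minimal step-decay sketch in the style of the Keras ResNet example follows;
# it is illustrative only, and the project's own schedule may use different breakpoints.
def lr_schedule(epoch, lr=1e-3):
    """Step decay: shrink the learning rate at fixed epochs (sketch, not the original)."""
    if epoch > 180:
        lr *= 0.5e-3
    elif epoch > 160:
        lr *= 1e-3
    elif epoch > 120:
        lr *= 1e-2
    elif epoch > 80:
        lr *= 1e-1
    return lr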
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(Supervisor.__init__)[0]
    params = {k: locals_[k]
              if k in locals_ and not isinstance(locals_[k], types.FunctionType) and k != "self"
              else None
              for k in args}
    logz.save_params(params)
def setup_logger(logdir, locals_): # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params)
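# The setup_logger variants in this section rely on inspect.getargspec, which was
# deprecated and removed in Python 3.11. A hedged sketch of an equivalent built on
# inspect.signature follows (it assumes the same project-local logz module; target_fn
# is whichever function's arguments should be recorded).
import inspect
import types

def setup_logger_sig(logdir, locals_, target_fn):
    """Record target_fn's arguments found in locals_ via inspect.signature (sketch)."""
    logz.configure_output_dir(logdir)
    args = list(inspect.signature(target_fn).parameters)
    params = {k: locals_[k]
              if k in locals_ and not isinstance(locals_[k], types.FunctionType) and k != "self"
              else None
              for k in args}
    logz.save_params(params)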
def mpc_sampler(vehicle_model,controller,cost_function,save,timeSteps=2000,pred_horizon=15, imag_rollouts_number=100,X_initial=[0,0],X_desired=[0,10]): X=[] u=[] cost=[] X.append(X_initial) cost.append(Abs_Error(X_initial[1],X_desired[1])) if(save): logz.configure_output_dir("/home/hendawy/Desktop/Platoon_Advanced_Mechatronics_Project/RLTrial",0) for t in range(timeSteps): # time_start=time.time() u_t=controller(vehicle_model,pred_horizon,imag_rollouts_number,X[t],cost_function) X_next=vehicle_model(u_t,X[t][0],X[t][1]) cost.append(Abs_Error(X_next[1],X_desired[1])) if(save): logz.log_tabular('Error', Abs_Error(X_next[1],X_desired[1])) logz.dump_tabular() X.append(X_next) u.append([u_t]) # time_end=time.time() # print(time_end-time_start) X.pop() traj = {"states" : np.array(X), "control" : np.array(u), "cost" : np.array(cost), } return traj
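# Hypothetical usage of mpc_sampler: vehicle_model, random_shooting_controller and
# Abs_Error below stand in for the project's own implementations. With save=True the
# function streams one "Error" row per timestep through logz.log_tabular/dump_tabular;
# save=False skips the hard-coded output directory.
traj = mpc_sampler(vehicle_model, random_shooting_controller, Abs_Error,
                   save=False, timeSteps=500, X_initial=[0, 0], X_desired=[0, 10])
print(traj["states"].shape, traj["control"].shape, traj["cost"][-1])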
def trainModel(train_data_generator, val_data_generator, model, initial_epoch):
    """
    Model training.

    # Arguments
       train_data_generator: Training data generated batch by batch.
       val_data_generator: Validation data generated batch by batch.
       model: A Model instance.
       initial_epoch: Epoch from which training starts.
    """
    # Initialize loss weights
    ##model.alpha = tf.Variable(1, trainable=False, name='alpha', dtype=tf.float32)
    ##model.beta = tf.Variable(0, trainable=False, name='beta', dtype=tf.float32)
    model.beta = tf.Variable(1, trainable=False, name='beta', dtype=tf.float32)

    # Initialize number of samples for hard-mining
    ##model.k_mse = tf.Variable(FLAGS.batch_size, trainable=False, name='k_mse', dtype=tf.int32)
    model.k_entropy = tf.Variable(FLAGS.batch_size, trainable=False, name='k_entropy', dtype=tf.int32)

    optimizer = optimizers.Adam(decay=1e-5)

    # Configure training process
    ##model.compile(loss=[utils.hard_mining_mse(model.k_mse),
    ##                    utils.hard_mining_entropy(model.k_entropy)],
    ##              optimizer=optimizer, loss_weights=[model.alpha, model.beta])
    model.compile(loss=utils.hard_mining_entropy(model.k_entropy), optimizer=optimizer)

    # Save model with the lowest validation loss
    weights_path = os.path.join(FLAGS.experiment_rootdir, 'weights_{epoch:03d}.h5')
    writeBestModel = ModelCheckpoint(filepath=weights_path, monitor='val_loss',
                                     save_best_only=True, save_weights_only=True)

    # Save model every 'log_rate' epochs.
    # Save training and validation losses.
    logz.configure_output_dir(FLAGS.experiment_rootdir)
    saveModelAndLoss = log_utils.MyCallback(filepath=FLAGS.experiment_rootdir,
                                            period=FLAGS.log_rate,
                                            batch_size=FLAGS.batch_size)

    # Train model
    steps_per_epoch = int(np.ceil(train_data_generator.samples / FLAGS.batch_size))
    validation_steps = int(np.ceil(val_data_generator.samples / FLAGS.batch_size))

    model.fit_generator(train_data_generator,
                        epochs=FLAGS.epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=[writeBestModel, saveModelAndLoss],
                        validation_data=val_data_generator,
                        validation_steps=validation_steps,
                        initial_epoch=initial_epoch)
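# utils.hard_mining_entropy used above is not shown in this section. As a rough,
# illustrative sketch only (the project's helper may differ, and it passes a tf.Variable
# for k rather than the plain int used here), a top-k "hard mining" binary cross-entropy
# that keeps only the k hardest samples in each batch could look like this:
import tensorflow as tf
from keras import backend as K

def hard_mining_entropy_sketch(k):
    """Loss wrapper keeping the k samples with the largest binary cross-entropy."""
    def loss(y_true, y_pred):
        per_sample = K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
        hardest, _ = tf.nn.top_k(per_sample, k=k)
        return K.mean(hardest)
    return loss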
def __init__(self, env_name='HalfCheetah-v1', policy_params=None, num_workers=32, num_deltas=320, deltas_used=320, delta_std=0.02, logdir=None, rollout_length=1000, step_size=0.01, shift='constant zero', params=None, seed=123): logz.configure_output_dir(logdir) logz.save_params(params) env = minitaur_gym_env.MinitaurBulletEnv() #gym.make(env_name) self.timesteps = 0 self.action_size = env.action_space.shape[0] self.ob_size = env.observation_space.shape[0] self.num_deltas = num_deltas self.deltas_used = deltas_used self.rollout_length = rollout_length self.step_size = step_size self.delta_std = delta_std self.logdir = logdir self.shift = shift self.params = params self.max_past_avg_reward = float('-inf') self.num_episodes_used = float('inf') # create shared table for storing noise print("Creating deltas table.") deltas_id = create_shared_noise.remote() self.deltas = SharedNoiseTable(ray.get(deltas_id), seed = seed + 3) print('Created deltas table.') # initialize workers with different random seeds print('Initializing workers.') self.num_workers = num_workers self.workers = [Worker.remote(seed + 7 * i, env_name=env_name, policy_params=policy_params, deltas=deltas_id, rollout_length=rollout_length, delta_std=delta_std) for i in range(num_workers)] # initialize policy if policy_params['type'] == 'linear': self.policy = LinearPolicy(policy_params) self.w_policy = self.policy.get_weights() else: raise NotImplementedError # initialize optimization algorithm self.optimizer = optimizers.SGD(self.w_policy, self.step_size) print("Initialization of ARS complete.")
def __init__(self, env_name='HalfCheetah-v1', policy_params=None, num_workers=32, num_deltas=320, deltas_used=320, delta_std=0.02, logdir=None, rollout_length=1000, step_size=0.01, shift='constant zero', params=None, seed=123): logz.configure_output_dir(logdir) logz.save_params(params) env = gym.make(env_name) self.timesteps = 0 self.action_size = env.action_space.shape[0] self.ob_size = env.observation_space.shape[0] self.num_deltas = num_deltas self.deltas_used = deltas_used self.rollout_length = rollout_length self.step_size = step_size self.delta_std = delta_std self.logdir = logdir self.shift = shift self.params = params self.max_past_avg_reward = float('-inf') self.num_episodes_used = float('inf') # create shared table for storing noise print("Creating deltas table.") deltas_id = create_shared_noise.remote() self.deltas = SharedNoiseTable(ray.get(deltas_id), seed = seed + 3) print('Created deltas table.') # initialize workers with different random seeds print('Initializing workers.') self.num_workers = num_workers self.workers = [Worker.remote(seed + 7 * i, env_name=env_name, policy_params=policy_params, deltas=deltas_id, rollout_length=rollout_length, delta_std=delta_std) for i in range(num_workers)] # initialize policy if policy_params['type'] == 'linear': self.policy = LinearPolicy(policy_params) self.w_policy = self.policy.get_weights() else: raise NotImplementedError # initialize optimization algorithm self.optimizer = optimizers.SGD(self.w_policy, self.step_size) print("Initialization of ARS complete.")
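# Both ARS constructors above index into a SharedNoiseTable built from create_shared_noise.
# A rough sketch of that table, modeled on the public ARS implementation (the exact code
# in this repo may differ): every worker shares one large read-only noise array and only
# exchanges integer offsets, never the noise itself.
import numpy as np

class SharedNoiseTableSketch:
    def __init__(self, noise, seed=11):
        self.rg = np.random.RandomState(seed)
        self.noise = noise  # one flat float array shared by all workers

    def get(self, i, dim):
        # Slice a perturbation of length dim starting at offset i.
        return self.noise[i:i + dim]

    def sample_index(self, dim):
        # Draw a valid offset for a perturbation of length dim.
        return self.rg.randint(0, len(self.noise) - dim + 1)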
def setup_logger(logdir, locals_): # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters # args = inspect.getargspec(learn)[0] # params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(locals_.get("kwargs"))
def train_mf(self): self.start_worker() self.init_opt() logz.configure_output_dir( "/home/hendawy/Desktop/2DOF_Robotic_Arm_withSphereObstacle/Rr", 1807) for itr in range(self.current_itr, self.n_itr): with logger.prefix('itr #%d | ' % itr): paths = self.sampler.obtain_samples(itr, Constrained=True) samples_data, analysis_data = self.sampler.process_samples( itr, paths) self.log_diagnostics(paths) optimization_data = self.optimize_policy(itr, samples_data) logz.log_tabular('Iteration', analysis_data["Iteration"]) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageDiscountedReturn', analysis_data["AverageDiscountedReturn"]) logz.log_tabular('AverageReturns', analysis_data["AverageReturn"]) logz.log_tabular('violation_cost', np.mean(samples_data["violation_cost"])) logz.log_tabular( 'boundary_violation_cost', np.mean(samples_data["boundary_violation_cost"])) logz.log_tabular('success_rate', samples_data["success_rate"]) logz.log_tabular( 'successful_AverageReturn', np.mean(samples_data["successful_AverageReturn"])) logz.log_tabular('ExplainedVariance', analysis_data["ExplainedVariance"]) logz.log_tabular('NumTrajs', analysis_data["NumTrajs"]) logz.log_tabular('Entropy', analysis_data["Entropy"]) logz.log_tabular('Perplexity', analysis_data["Perplexity"]) logz.log_tabular('StdReturn', analysis_data["StdReturn"]) logz.log_tabular('MaxReturn', analysis_data["MaxReturn"]) logz.log_tabular('MinReturn', analysis_data["MinReturn"]) logz.log_tabular('LossBefore', optimization_data["LossBefore"]) logz.log_tabular('LossAfter', optimization_data["LossAfter"]) logz.log_tabular('MeanKLBefore', optimization_data["MeanKLBefore"]) logz.log_tabular('MeanKL', optimization_data["MeanKL"]) logz.log_tabular('dLoss', optimization_data["dLoss"]) logz.dump_tabular() logger.log("saving snapshot...") params = self.get_itr_snapshot(itr, samples_data) self.current_itr = itr + 1 params["algo"] = self if self.store_paths: params["paths"] = samples_data["paths"] logger.save_itr_params(itr, params) logger.log("saved") logger.dump_tabular(with_prefix=False) if self.plot: self.update_plot() if self.pause_for_plot: input("Plotting evaluation run: Press Enter to " "continue...") self.shutdown_worker()
def setup_logger(logdir, locals_): # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(QLearner)[0] params = {k: str(locals_[k]) if k in locals_ else None for k in args} params['exp_name'] = locals_['q_func'].__name__ + locals_['double_q'] * '_doubleQ' logz.save_params(params)
def setup_logger(logdir, locals_): # Configure output directory for logging seed = np.random.get_state()[1][0] logz.configure_output_dir(logdir + '/%s/' % seed) # Log experimental parameters params = {k: str(locals_[k]) for k in locals_ if '__' not in k} params['seed'] = str(seed) logz.save_params(params)
def main(): PROJECT_ROOT =osp.dirname(osp.realpath(__file__)) logz.configure_output_dir(osp.join(PROJECT_ROOT, "log/"+"_RAM_"+time.strftime("%d-%m-%Y_%H-%M-%S"))) seed = 0 env = get_env('SpaceInvaders-v0', seed) session = get_session() atari_learn(env, session, num_timesteps=40000000)
def train(self, train_db, val_db, test_db): ################################################################## ## LOG ################################################################## logz.configure_output_dir(self.cfg.model_dir) logz.save_config(self.cfg) ################################################################## ## Main loop ################################################################## start = time() min_val_loss = 100000000 for epoch in range(self.epoch, self.cfg.n_epochs): ################################################################## ## Training ################################################################## torch.cuda.empty_cache() train_loss, train_accu = self.train_epoch(train_db, epoch) ################################################################## ## Validation ################################################################## torch.cuda.empty_cache() val_loss, val_accu = self.validate_epoch(val_db, epoch) ################################################################## ## Logging ################################################################## # update optim scheduler current_val_loss = np.mean(val_loss[:,0]) # self.optimizer.update(current_val_loss, epoch) logz.log_tabular("Time", time() - start) logz.log_tabular("Iteration", epoch) logz.log_tabular("AverageLoss", np.mean(train_loss[:, 0])) logz.log_tabular("AveragePredLoss", np.mean(train_loss[:, 1])) logz.log_tabular("AverageEmbedLoss", np.mean(train_loss[:, 2])) logz.log_tabular("AverageAttnLoss", np.mean(train_loss[:, 3])) logz.log_tabular("AverageObjAccu", np.mean(train_accu[:, 0])) logz.log_tabular("AverageCoordAccu", np.mean(train_accu[:, 1])) logz.log_tabular("AverageScaleAccu", np.mean(train_accu[:, 2])) logz.log_tabular("AverageRatioAccu", np.mean(train_accu[:, 3])) logz.log_tabular("ValAverageLoss", np.mean(val_loss[:, 0])) logz.log_tabular("ValAveragePredLoss", np.mean(val_loss[:, 1])) logz.log_tabular("ValAverageEmbedLoss", np.mean(val_loss[:, 2])) logz.log_tabular("ValAverageAttnLoss", np.mean(val_loss[:, 3])) logz.log_tabular("ValAverageObjAccu", np.mean(val_accu[:, 0])) logz.log_tabular("ValAverageCoordAccu", np.mean(val_accu[:, 1])) logz.log_tabular("ValAverageScaleAccu", np.mean(val_accu[:, 2])) logz.log_tabular("ValAverageRatioAccu", np.mean(val_accu[:, 3])) logz.dump_tabular() ################################################################## ## Checkpoint ################################################################## self.save_checkpoint(epoch)
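# Hedged example of inspecting the curves logged by train() afterwards. It assumes this
# logz variant writes a tab-separated log.txt under the configured output directory
# (cfg.model_dir); the file name, separator, and path placeholder below are assumptions.
import pandas as pd

log = pd.read_csv("path/to/model_dir/log.txt", sep="\t")
print(log[["Iteration", "AverageLoss", "ValAverageLoss"]].tail())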
def setup_logger(logdir, locals_): # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] params = {k: locals_[k] if k in locals_ else None for k in args} # print(params.items()) # print(json.dumps(list(params.values()))) logz.save_params(params)
def trainModel(train_data_generator, val_data_generator, model, initial_epoch):
    """
    Model training.

    # Arguments
       train_data_generator: Training data generated batch by batch.
       val_data_generator: Validation data generated batch by batch.
       model: A Model instance.
       initial_epoch: Epoch from which training starts.
    """
    # Configure training process
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=cifar10_resnet_mod.lr_schedule(0)),
                  metrics=['categorical_accuracy'])

    # Save model with the lowest validation loss
    weights_path = os.path.join(FLAGS.experiment_rootdir, 'weights_{epoch:03d}.h5')
    writeBestModel = ModelCheckpoint(filepath=weights_path, monitor='val_loss',
                                     save_best_only=True, save_weights_only=True)

    # Save training and validation losses.
    logz.configure_output_dir(FLAGS.experiment_rootdir)
    saveModelAndLoss = log_utils.MyCallback(filepath=FLAGS.experiment_rootdir)

    # Train model
    steps_per_epoch = int(np.ceil(train_data_generator.samples / FLAGS.batch_size))
    validation_steps = int(np.ceil(val_data_generator.samples / FLAGS.batch_size)) - 1

    lr_scheduler = LearningRateScheduler(cifar10_resnet_mod.lr_schedule)
    lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0,
                                   patience=5, min_lr=0.5e-6)

    # TensorBoard is used to visualize the results across iterations.
    # Up to this point, an iterative process keeps, out of all cases, the model
    # with the best result on the validation data.
    strTime = strftime("%Y%b%d_%Hh%Mm%Ss", localtime(time()))
    tensorboard = TensorBoard(log_dir="logs/{}".format(strTime), histogram_freq=0)

    callbacks = [writeBestModel, saveModelAndLoss, lr_reducer, lr_scheduler, tensorboard]

    model.fit_generator(train_data_generator,
                        epochs=FLAGS.epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_data=val_data_generator,
                        validation_steps=validation_steps,
                        initial_epoch=initial_epoch)
def __init__( self, organism_builder=None, logdir=None, params=None, master_organism=None, sampler_builder=None, ): logz.configure_output_dir(logdir) logz.save_params(params) # env = env_registry.get_env_constructor(params['env_name'])() self.logdir = logdir self.params = params self.max_past_avg_reward = float('-inf') self.num_episodes_used = float('inf') # create shared table for storing noise print("Creating deltas table.") deltas_id = create_shared_noise_serial() self.deltas = SharedNoiseTable(deltas_id, seed=params['seed'] + 3) print('Created deltas table.') ######################################################## self.master_organism = master_organism self.sampler = sampler_builder( num_deltas=params['n_directions'], shift=params['shift'], num_workers=params['n_workers'], seed=params['seed'], env_name=params['env_name'], organism_builder= organism_builder, #lambda: ARS_LinearAgent(agent_args) deltas_id=deltas_id, rollout_length=params['rollout_length'], delta_std=params['delta_std'], ) # maybe we'd need to merge Sampler and Agent # agent holds the parameters, but sampler takes the agent and does the parallel rollouts # so agent should not have the workers at all... # agent should just contain the parameter. # but the sampler would need to take the agent in. # so the sampler is the thing that takes a single agent, and creates a bunch of workers # modeled the agent. self.rl_alg = ARS_RL_Alg( deltas=self.deltas, # noise table num_deltas=params['n_directions'], # N deltas_used=params['deltas_used'] # b )
def run_model(session, predict, loss, train_step, saver, images, labels, X, y, epochs=1, batch_size=64, print_every=100, is_test=False): if not is_test: # Configure output directory for logging logz.configure_output_dir('logs') # Log experimental parameters args = inspect.getargspec(main)[0] # Get the names and default values of a function's parameters. locals_ = locals() # Return a dictionary containing the current scope's local variables params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # have tensorflow compute accuracy correct_prediction = tf.equal(tf.argmax(predict, axis=1), tf.argmax(y, axis=1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) # counter iter_cnt = 0 iters_each_epoch = len(images)//batch_size - 1 for e in range(epochs): # keep track of losses and accuracy correct = 0 losses = [] # make sure we iterate over the dataset once images, labels = shuffle_dataset(images, labels) for i in range(iters_each_epoch): current_iter = i+1 batch_X, batch_y = images[current_iter*batch_size:(current_iter+1)*batch_size], labels[current_iter*batch_size:(current_iter+1)*batch_size] feed_dict = {X: batch_X, y: batch_y} # have tensorflow compute loss and correct predictions # and (if given) perform a training step l, corr, _ = session.run([loss, correct_prediction, train_step],feed_dict=feed_dict) # aggregate performance stats losses.append(l*batch_size) correct += np.sum(corr) # print every now and then if (iter_cnt % print_every) == 0 and not is_test: logz.log_tabular("Iteration", iter_cnt) logz.log_tabular("minibatch_loss", l) logz.log_tabular("minibatch_accuracy", np.sum(corr)/batch_size) logz.dump_tabular() logz.pickle_tf_vars() iter_cnt += 1 if is_test: total_correct = correct/len(images) total_loss = np.sum(losses)/len(images) print('acc:', total_correct) print('los:', total_loss) else: saver.save(session, 'checkpoints/mnist_plus', iter_cnt)
def main(): PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) logz.configure_output_dir( os.path.join(PROJECT_ROOT, "log/" + "_RAM_" + time.strftime("%d-%m-%Y_%H-%M-%S"))) # Run training seed = 0 # Use a seed of zero (you may want to randomize the seed!) env = get_env(seed) session = get_session() atari_learn(env, session, num_timesteps=int(4e7))
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): tf.set_random_seed(seed) np.random.seed(seed) env = gym.make("Pendulum-v0") ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] logz.configure_output_dir(logdir) if vf_type == 'linear': vf = LinearValueFunction(**vf_params) elif vf_type == 'nn': vf = NnValueFunction(ob_dim=ob_dim, **vf_params) YOUR_CODE_HERE sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 total_timesteps = 0 stepsize = initial_stepsize for i in range(n_iter): print("********** Iteration %i ************"%i) YOUR_CODE_HERE if kl > desired_kl * 2: stepsize /= 1.5 print('stepsize -> %s'%stepsize) elif kl < desired_kl / 2: stepsize *= 1.5 print('stepsize -> %s'%stepsize) else: print('stepsize OK') # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
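# The two YOUR_CODE_HERE markers in main_pendulum above are assignment stubs. A hedged,
# TF1-style sketch of the first block (placeholders plus a diagonal-Gaussian policy for
# Pendulum's continuous actions) is shown below; it is one possible solution, not the
# original author's code.
sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
sy_ac_n = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)
sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

sy_h1 = tf.nn.relu(tf.layers.dense(sy_ob_no, 32, name="h1"))
sy_mean_na = tf.layers.dense(sy_h1, ac_dim, name="mean")
sy_logstd_a = tf.get_variable("logstd", shape=[ac_dim], initializer=tf.zeros_initializer())
sy_sampled_ac = sy_mean_na + tf.exp(sy_logstd_a) * tf.random_normal(tf.shape(sy_mean_na))
# Diagonal-Gaussian log-likelihood of the actions that were actually taken.
sy_logprob_n = -0.5 * tf.reduce_sum(
    ((sy_ac_n - sy_mean_na) / tf.exp(sy_logstd_a)) ** 2
    + 2.0 * sy_logstd_a + np.log(2.0 * np.pi), axis=1)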
def learn(*args, **kwargs): alg = QLearner(*args, **kwargs) logz.configure_output_dir(alg.logdir) if alg.start_time is None: alg.start_time = time.time() while not alg.stopping_criterion_met(): alg.step_env() # at this point, the environment should have been advanced one step (and # reset if done was true), and self.last_obs should point to the new latest # observation alg.update_model() alg.log_progress()
def __init__( self, env=None, discrete=True, ob_dim=0, ac_dim=0, gamma=1.0, max_path_length=None, learning_rate=5e-3, logdir=None, normalize_advantages=True, nn_baseline=False, # network arguments n_layers=1, size=32, gae_lambda=-1.0, model_tag='vanilla', #ppo parameter clip_ratio=0.2, ): #params self.nn_baseline = nn_baseline self.learning_rate = learning_rate self.gamma = gamma self.normalize_advantages = normalize_advantages self.n_layers = n_layers self.size = size self.gae_lambda = gae_lambda self.model_tag = model_tag self.clip_ratio = clip_ratio # Configure output directory for logging logz.configure_output_dir(logdir) self.log_dir = logdir # Log experimental parameters # args = inspect.getfullargspec(__init__)[0] # locals_ = locals() # params = {k: locals_[k] if k in locals_ else None for k in args} # logz.save_params(params) # Make the gym environment self.env = env self.ob_dim = ob_dim self.ac_dim = ac_dim # Is this env continuous, or discrete? self.discrete = discrete # Maximum length for episodes self.max_path_length = max_path_length self.setup_placeholders() self.setup_tf_operations() self.setup_loss() if self.nn_baseline: self.setup_baseline()
def setup_logger(logdir, params): # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters # args = inspect.getargspec(learn)[0] check_params = params.copy() log_params = params.copy() for param in check_params.keys(): try: json.dumps(check_params[param]) except: del log_params[param] logz.save_params(log_params)
def main(): # Get Atari games. # Change the index to select a different game. PROJECT_ROOT = osp.dirname(osp.realpath(__file__)) logz.configure_output_dir( osp.join(PROJECT_ROOT, "log/" + "_RAM_" + time.strftime("%d-%m-%Y_%H-%M-%S"))) # Run training seed = 0 # Use a seed of zero (you may want to randomize the seed!) env = get_env('SpaceInvaders-v0', seed) session = get_session() atari_learn(env, session, num_timesteps=40000000)
def main(): # Get Atari games. benchmark = gym.benchmark_spec('Atari40M') # Change the index to select a different game. task = benchmark.tasks[3] PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__)) logz.configure_output_dir(os.path.join(PROJECT_ROOT, "log/"+"_RAM_"+time.strftime("%d-%m-%Y_%H-%M-%S"))) # Run training seed = 0 # Use a seed of zero (you may want to randomize the seed!) env = get_env(task, seed) session = get_session() atari_learn(env, session, num_timesteps=task.max_timesteps)
def mpc_platoon_sampler(vehicle_model,platoon_model,controller,cost_function,save,timeSteps=3000,pred_horizon=15, imag_rollouts_number=400,X_initial_1=[0,0],X_initial_2=[0,0],X_desired_1=[0,10],X_desired_2=[2,0]): X_1=[] X_2=[] X_v2=[] u_1=[] u_2=[] cost_1=[] cost_2=[] X_1.append(X_initial_1) X_v2.append(X_initial_2) X_2.append([X_1[0][0]-X_v2[0][0],X_1[0][1]-X_v2[0][1]]) cost_1.append(Abs_Error(X_initial_1[1],X_desired_1[1])) cost_2.append(Abs_Error(X_2[0][0],X_desired_2[0])) if(save): logz.configure_output_dir("/home/hendawy/Desktop/Platoon_Advanced_Mechatronics_Project/RLTrial",11) for t in range(timeSteps): # time_start=time.time() u1_t=controller(vehicle_model,pred_horizon,imag_rollouts_number,X_1[t],cost_function,X_desired_1,'Leader') u2_t=controller(vehicle_model,pred_horizon,imag_rollouts_number,X_v2[t],cost_function,X_desired_2,'Follower',X_1[t]) X_next_1,X_next_2=platoon_model(u1_t,u2_t,X_1[t][0],X_1[t][1],X_v2[t][0],X_v2[t][1]) # print('Vehicle 1',X_1[t],X_next_1,u1_t) # print('Vehicle 2',X_v2[t],X_next_2,u2_t) cost_1.append(Abs_Error(X_next_1[1],X_desired_1[1])) cost_2.append(Abs_Error(X_next_2[0],X_desired_2[0])) if(save): logz.log_tabular('Error_v1', Abs_Error(X_next_1[1],X_desired_1[1])) logz.log_tabular('Error_v2', Abs_Error(X_next_2[0],X_desired_2[0])) logz.dump_tabular() X_1.append(X_next_1) X_2.append(X_next_2) u_1.append([u1_t]) u_2.append([u2_t]) X_v2.append([-X_next_2[0]+X_next_1[0],-X_next_2[1]+X_next_1[1]]) # time_end=time.time() # print(time_end-time_start) X_1.pop() X_2.pop() traj = {"states_v1" : np.array(X_1), "states_v2" : np.array(X_v2), "states_f1" : np.array(X_2), "control_v1" : np.array(u_1), "control_v2" : np.array(u_2), "cost_v1" : np.array(cost_1), "cost_v2" : np.array(cost_2), } return traj
def train_model(train_generator, val_generator, model, initial_epoch): model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr_schedule(0)), metrics=['accuracy']) # Save model with the lowest validation loss weights_path = os.path.join(FLAGS.experiment_rootdir, 'weights_{epoch:03d}.h5') write_best_model = ModelCheckpoint(filepath=weights_path, monitor='val_loss', save_best_only=True, save_weights_only=True) # Save training and validation losses. logz.configure_output_dir(FLAGS.experiment_rootdir) save_model_and_loss = log_utils.MyCallback( filepath=FLAGS.experiment_rootdir) # Train model lr_scheduler = LearningRateScheduler(lr_schedule, verbose=FLAGS.verbose) lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1), cooldown=0, patience=5, verbose=FLAGS.verbose, min_lr=0.5e-6) # earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=FLAGS.verbose) str_time = strftime("%Y%b%d_%Hh%Mm%Ss", localtime(time())) tensorboard = TensorBoard(log_dir="logs/{}".format(str_time), histogram_freq=0) callbacks = [ write_best_model, save_model_and_loss, lr_reducer, lr_scheduler, tensorboard ] model.fit_generator(train_generator, validation_data=val_generator, epochs=FLAGS.epochs, verbose=FLAGS.verbose, callbacks=callbacks, initial_epoch=initial_epoch, use_multiprocessing=True)
def main(): # Get Atari games. task = gym.make('LunarLander-v2') file_dir = osp.dirname(osp.abspath(__file__)) unique_name = datetime.datetime.now(dateutil.tz.tzlocal()).strftime( '%Y_%m_%d_%H_%M_%S_%f_%Z') + '__' + str(uuid.uuid4()) result_dir = osp.join(file_dir, unique_name) logz.configure_output_dir(result_dir) logz.save_params(dict(exp_name=unique_name, )) # Run training seed = 1 print('random seed = %d' % seed) env = get_env(task, seed, result_dir) session = get_session() atari_learn(env, session, num_timesteps=5e5, result_dir=result_dir)
def trainModel(train_data_generator, val_data_generator, model, initial_epoch): """ Model training. # Arguments train_data_generator: Training data generated batch by batch. val_data_generator: Validation data generated batch by batch. model: A Model instance. initial_epoch: Epoch from which training starts. """ # Configure training process optimizer = keras.optimizers.Adam(lr=FLAGS.initial_lr, decay=1e-6) model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['binary_accuracy'], loss_weights=np.ones((21, )).tolist()) # Save model with the lowest validation loss weights_path = os.path.join(FLAGS.experiment_rootdir, 'weights_{epoch:03d}.h5') writeBestModel = ModelCheckpoint(filepath=weights_path, monitor='val_loss', save_best_only=True, save_weights_only=True) tensorboard = TensorBoard(log_dir="logs/{}".format(time())) # Save training and validation losses. logz.configure_output_dir(FLAGS.experiment_rootdir) saveModelAndLoss = log_utils.MyCallback(filepath=FLAGS.experiment_rootdir) # Train model steps_per_epoch = int( np.ceil(train_data_generator.samples / FLAGS.batch_size)) validation_steps = int( np.ceil(val_data_generator.samples / FLAGS.batch_size)) - 1 model.fit_generator( train_data_generator, epochs=FLAGS.epochs, steps_per_epoch=steps_per_epoch, callbacks=[writeBestModel, saveModelAndLoss, tensorboard], validation_data=val_data_generator, validation_steps=validation_steps, initial_epoch=initial_epoch)
def __init__(self, env=None,
             discrete=True,
             ob_shape=(),
             ac_dim=0,
             gamma=1.0,
             actor_lr=1e-4,
             critic_lr=1e-3,
             logdir=None,
             normalize_returns=True,
             # network arguments
             n_layers=1,
             size=32,
             gae_lambda=-1.0,
             tau=0.001  # parameter update rate
             ):
    self.gamma = gamma
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.normalize_returns = normalize_returns
    self.n_layers = n_layers
    self.size = size
    self.gae_lambda = gae_lambda
    self.tau = tau

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    # args = inspect.getfullargspec(train_DDPG)[0]
    # locals_ = locals()
    # params = {k: locals_[k] if k in locals_ else None for k in args}
    # logz.save_params(params)

    # Make the gym environment
    self.env = env
    # Is this env continuous, or discrete?
    self.discrete = discrete
    self.ac_dim = ac_dim
    self.ob_dim = ob_shape[0]  # observation_shape in CartPole is a tuple such as (2,)
    self.memory = Memory(limit=int(1e6), action_shape=ac_dim,
                         observation_shape=ob_shape)
    self.setup_placeholders()
    self.setup_network()
def get_env(env_name, exp_name, seed): env = gym.make(env_name) set_global_seeds(seed) env.seed(seed) # Set Up Logger logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") logdir = osp.join('data', logdir) logdir = osp.join(logdir, '%d'%seed) logz.configure_output_dir(logdir) hyperparams = {'exp_name': exp_name, 'env_name': env_name} logz.save_hyperparams(hyperparams) expt_dir = '/tmp/hw3_vid_dir/' env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True, video_callable=False) return env
def get_env(env_name, exp_name, seed): env = gym.make(env_name) set_global_seeds(seed) env.seed(seed) # Set Up Logger logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime( "%d-%m-%Y_%H-%M-%S") logdir = osp.join('data', logdir) logdir = osp.join(logdir, '%d' % seed) logz.configure_output_dir(logdir) hyperparams = {'exp_name': exp_name, 'env_name': env_name} logz.save_hyperparams(hyperparams) expt_dir = '/tmp/hw3_vid_dir2/' env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) env = wrap_deepmind(env) # observation = env.reset() # print('observation shape', observation.shape) return env
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds torch.manual_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #todo: create Agent #todo: initilize Agent: #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = actor.run(ob) print("need to type-check action here:(two lines)") print(ac) print(ac.size()) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break #One episode finishes; perform update here finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None, ) path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) 
logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, network_activation='tanh' ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds torch.manual_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #activation function for the network if network_activation=='relu': activation=torch.nn.functional.relu elif network_activation=='leaky_relu': activation=torch.nn.functional.leaky_relu else: activation=torch.nn.functional.tanh #todo: create policy actor=build_mlp(ob_dim, ac_dim, "actor",\ n_layers=n_layers, size=size, activation=activation, discrete=discrete) actor_loss=reinforce_loss actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate) #todo: initilize Agent: #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: critic=build_mlp(ob_dim,1,"nn_baseline",\ n_layers=n_layers,size=size, discrete=discrete) critic_loss=nn.MSELoss() critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate) #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards, log_probs = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) ob = torch.from_numpy(ob).float().unsqueeze(0) obs.append(ob) ac, log_prob = actor.run(ob) acs.append(ac) log_probs.append(log_prob) #format the action from policy if discrete: ac = int(ac) else: ac = ac.squeeze(0).numpy() ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > 
max_path_length: break path = {"observation" : torch.cat(obs, 0), "reward" : torch.Tensor(rewards), "action" : torch.cat(acs, 0), "log_prob" : torch.cat(log_probs, 0)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch ob_no = torch.cat([path["observation"] for path in paths], 0) ac_na = torch.cat([path["action"] for path in paths], 0) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# q_n = [] for path in paths: rewards = path['reward'] num_steps = pathlength(path) R=[] if reward_to_go: for t in range(num_steps): R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1)) q_n.append(torch.cat(R)) else: q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1)) q_n = torch.cat(q_n, 0) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = critic(ob_no) q_n_std = q_n.std() q_n_mean = q_n.mean() b_n_scaled = b_n * q_n_std + q_n_mean adv_n = (q_n - b_n_scaled).detach() else: adv_n = q_n #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
# YOUR_CODE_HERE adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item()) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item()) critic_optimizer.zero_grad() c_loss = critic_loss(b_n, target) c_loss.backward() critic_optimizer.step() #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE log_probs = torch.cat([path["log_prob"] for path in paths], 0) actor_optimizer.zero_grad() loss = actor_loss(log_probs, adv_n, len(paths)) print(loss) loss.backward() actor_optimizer.step() # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
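# actor_loss in the PyTorch train_PG above is bound to reinforce_loss, which is not shown
# in this section. A hedged sketch consistent with the call
# actor_loss(log_probs, adv_n, len(paths)) follows; the repo's own definition may differ.
import torch

def reinforce_loss(log_probs, adv_n, num_paths):
    # Policy-gradient surrogate: -sum_t log pi(a_t|s_t) * A_t, averaged over trajectories.
    return -(log_probs.view(-1) * adv_n.view(-1)).sum() / num_paths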
def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None): env = gym.make("CartPole-v0") ob_dim = env.observation_space.shape[0] num_actions = env.action_space.n logz.configure_output_dir(logdir) vf = LinearValueFunction() # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in these function sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer # we use a small initialization for the last layer, so the initial policy has maximal entropy sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) sy_n = tf.shape(sy_ob_no)[0] sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) sy_oldp_na = tf.exp(sy_oldlogp_na) sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) sy_p_na = tf.exp(sy_logp_na) sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) # <<<<<<<<<<<<< sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) # use single thread. 
on such a small problem, multithreading gives you a slowdown # this way, we can better use multiple cores for different experiments sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 total_timesteps = 0 for i in range(n_iter): print("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) while True: if animate_this_episode: env.render() obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) if done: break path = {"observation" : np.array(obs), "terminated" : terminated, "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict(path["observation"]) adv_t = return_t - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) vtarg_n = np.concatenate(vtargs) vpred_n = np.concatenate(vpreds) vf.fit(ob_no, vtarg_n) # Policy update _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
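# main_cartpole above calls a discount() helper that is not defined in this section. A
# common implementation uses scipy's lfilter trick for discounted cumulative sums; this
# is an assumed stand-in, not necessarily the original module's version.
import numpy as np
import scipy.signal

def discount(x, gamma):
    """Discounted cumulative sums: y[t] = x[t] + gamma*x[t+1] + gamma**2*x[t+2] + ..."""
    return scipy.signal.lfilter([1.0], [1.0, -gamma], np.asarray(x)[::-1], axis=0)[::-1]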
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct, logdir, debug, gpu): alpha = { 'Ant-v2': 0.1, 'HalfCheetah-v2': 0.2, 'Hopper-v2': 0.2, 'Humanoid-v2': 0.05, 'Walker2d-v2': 0.2, }.get(env_name, 0.2) algorithm_params = { 'alpha': alpha, 'batch_size': 256, 'discount': 0.99, 'learning_rate': 1e-3, 'reparameterize': reparametrize, 'tau': 0.01, 'epoch_length': 1000, 'n_epochs': 500, 'two_qf': two_qf, } sampler_params = { 'max_episode_length': 1000, 'prefill_steps': 1000, } replay_pool_params = { 'max_size': 1e6, } value_function_params = { 'hidden_layer_sizes': (128, 128), } q_function_params = { 'hidden_layer_sizes': (128, 128), } policy_params = { 'hidden_layer_sizes': (128, 128), } logz.configure_output_dir(logdir) params = { 'exp_name': exp_name, 'env_name': env_name, 'algorithm_params': algorithm_params, 'sampler_params': sampler_params, 'replay_pool_params': replay_pool_params, 'value_function_params': value_function_params, 'q_function_params': q_function_params, 'policy_params': policy_params } logz.save_params(params) env = gym.envs.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) sampler = utils.SimpleSampler(**sampler_params) replay_pool = utils.SimpleReplayPool( observation_shape=env.observation_space.shape, action_shape=env.action_space.shape, **replay_pool_params) q_function = nn.QFunction(name='q_function', **q_function_params) if algorithm_params.get('two_qf', False): q_function2 = nn.QFunction(name='q_function2', **q_function_params) else: q_function2 = None value_function = nn.ValueFunction(name='value_function', **value_function_params) target_value_function = nn.ValueFunction(name='target_value_function', **value_function_params) policy = nn.GaussianPolicy( action_dim=env.action_space.shape[0], reparameterize=algorithm_params['reparameterize'], old_funct=old_funct, **policy_params) sampler.initialize(env, policy, replay_pool) algorithm = SAC(**algorithm_params) gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, gpu_options=gpu_options) with tf.Session(config=tf_config) as sess: if debug: sess = tf_debug.LocalCLIDebugWrapperSession(sess) algorithm.build(env=env, policy=policy, q_function=q_function, q_function2=q_function2, value_function=value_function, target_value_function=target_value_function) for epoch in algorithm.train(sampler, session=sess, n_epochs=algorithm_params.get( 'n_epochs', 1000)): logz.log_tabular('Iteration', epoch) for k, v in algorithm.get_statistics().items(): logz.log_tabular(k, v) for k, v in replay_pool.get_statistics().items(): logz.log_tabular(k, v) for k, v in sampler.get_statistics().items(): logz.log_tabular(k, v) logz.dump_tabular()
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = TODO #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. 
# # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = TODO sy_sampled_ac = TODO # Hint: Use the tf.multinomial op sy_logprob_n = TODO else: # YOUR_CODE_HERE sy_mean = TODO sy_logstd = TODO # logstd should just be a trainable variable, not a network output. sy_sampled_ac = TODO sy_logprob_n = TODO # Hint: Use the log probability under a multivariate gaussian. #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = TODO # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze(build_mlp( sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. # YOUR_CODE_HERE baseline_update_op = TODO #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you 
defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE q_n = TODO #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = TODO adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE pass #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE pass #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. 
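        # Hedged sketch of the policy update step described in the comment
        # above; the actual implementation belongs at the YOUR_CODE_HERE
        # marker below. It assumes the placeholders sy_ob_no / sy_ac_na /
        # sy_adv_n and the ops `loss` / `update_op` were filled in earlier
        # in this file. Evaluating the loss before and after the update is
        # optional but handy for debugging.
        #
        #     feed = {sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n}
        #     loss_before = sess.run(loss, feed_dict=feed)
        #     sess.run(update_op, feed_dict=feed)
        #     loss_after = sess.run(loss, feed_dict=feed)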
# YOUR_CODE_HERE # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
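# --------------------------------------------------------------------------
# Hedged, self-contained sketch of the two Q-value estimators discussed in
# the "Computing Q-values" block of train_PG above (trajectory-based vs.
# reward-to-go). It is one possible way to build `q_n`, not necessarily the
# reference solution expected by the YOUR_CODE_HERE marker.
# --------------------------------------------------------------------------
import numpy as np


def discounted_return(rewards, gamma):
    # Case 1 (reward_to_go=False): Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'},
    # repeated for every timestep of the trajectory.
    total = sum((gamma ** t) * r for t, r in enumerate(rewards))
    return np.full(len(rewards), total)


def discounted_reward_to_go(rewards, gamma):
    # Case 2 (reward_to_go=True): Q_t = sum_{t'=t}^T gamma^(t'-t) r_{t'},
    # computed with a single backward pass over the trajectory.
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

# q_n would then be the concatenation over paths, e.g.:
#   q_n = np.concatenate([
#       discounted_reward_to_go(p["reward"], gamma) if reward_to_go
#       else discounted_return(p["reward"], gamma)
#       for p in paths])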