def trainModel(train_data_generator, val_data_generator, model, initial_epoch):
    """
    Model training.

    # Arguments
       train_data_generator: Training data generated batch by batch.
       val_data_generator: Validation data generated batch by batch.
       model: A Model instance.
       initial_epoch: Epoch from which training starts.
    """
    # Configure training process
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=resnet_models.lr_schedule(0, FLAGS.initial_lr)),
        metrics=['binary_accuracy'],
        loss_weights=np.ones((15, )).tolist())

    # Save model with the lowest validation loss
    weights_path = os.path.join(FLAGS.experiment_rootdir,
                                'weights_{epoch:03d}.h5')
    writeBestModel = ModelCheckpoint(filepath=weights_path,
                                     monitor='val_loss',
                                     save_best_only=True,
                                     save_weights_only=True)

    # Save training and validation losses.
    logz.configure_output_dir(FLAGS.experiment_rootdir)
    saveModelAndLoss = log_utils.MyCallback(filepath=FLAGS.experiment_rootdir)

    # Train model
    steps_per_epoch = int(
        np.ceil(train_data_generator.samples / FLAGS.batch_size))
    validation_steps = int(
        np.ceil(val_data_generator.samples / FLAGS.batch_size)) - 1

    lr_scheduler = LearningRateScheduler(resnet_models.lr_schedule)
    lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1),
                                   cooldown=0,
                                   patience=5,
                                   min_lr=0.5e-6)
    strTime = strftime("%Y%b%d_%Hh%Mm%Ss", localtime(time()))
    tensorboard = TensorBoard(log_dir="logs/{}".format(strTime),
                              histogram_freq=0)
    callbacks = [
        writeBestModel, saveModelAndLoss, lr_reducer, lr_scheduler, tensorboard
    ]
    model.fit_generator(train_data_generator,
                        epochs=FLAGS.epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_data=val_data_generator,
                        validation_steps=validation_steps,
                        initial_epoch=initial_epoch)
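All of these examples follow the same small logz API: configure_output_dir picks where logs are written, save_params records the experiment configuration, and log_tabular/dump_tabular emit one row of diagnostics per iteration. A minimal self-contained sketch of that pattern (the directory, parameters, and metric below are placeholders, not taken from any example):

import numpy as np
import logz

logz.configure_output_dir('/tmp/logz_demo')  # placeholder output directory
logz.save_params({'lr': 1e-3, 'epochs': 3})  # placeholder experiment params

for itr in range(3):
    returns = np.random.randn(10)  # stand-in for real per-iteration metrics
    logz.log_tabular('Iteration', itr)
    logz.log_tabular('AverageReturn', np.mean(returns))
    logz.log_tabular('StdReturn', np.std(returns))
    logz.dump_tabular()  # flushes the row of key/value pairs to disk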
Example #2
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(Supervisor.__init__)[0]
    params = {k: locals_[k] if k in locals_
              and not isinstance(locals_[k], types.FunctionType)
              and k != "self" else None for k in args}
    logz.save_params(params)
Example #3
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
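Note: inspect.getargspec, used in several of these setup_logger variants, was removed in Python 3.11. An equivalent sketch with inspect.signature (train_PG and logz as in the example above):

import inspect

def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # inspect.signature replaces the removed inspect.getargspec
    args = list(inspect.signature(train_PG).parameters)
    params = {k: locals_.get(k) for k in args}
    logz.save_params(params)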
Example #4
def mpc_sampler(vehicle_model,controller,cost_function,save,timeSteps=2000,pred_horizon=15, imag_rollouts_number=100,X_initial=[0,0],X_desired=[0,10]):
    X=[]
    u=[]
    cost=[]
    X.append(X_initial)
    cost.append(Abs_Error(X_initial[1],X_desired[1]))
    if(save):
        logz.configure_output_dir("/home/hendawy/Desktop/Platoon_Advanced_Mechatronics_Project/RLTrial",0)
    for t in range(timeSteps):
        # time_start=time.time()
        u_t=controller(vehicle_model,pred_horizon,imag_rollouts_number,X[t],cost_function)
        X_next=vehicle_model(u_t,X[t][0],X[t][1])
        cost.append(Abs_Error(X_next[1],X_desired[1]))
        if(save):
            logz.log_tabular('Error', Abs_Error(X_next[1],X_desired[1]))
            logz.dump_tabular()
        X.append(X_next)
        u.append([u_t])
        # time_end=time.time()
        # print(time_end-time_start)
    X.pop()
    traj = {"states" : np.array(X), 
            "control" : np.array(u),
            "cost" : np.array(cost),
            }
    return traj
Example #5
def trainModel(train_data_generator, val_data_generator, model, initial_epoch):
    """
    Model training.

    # Arguments
       train_data_generator: Training data generated batch by batch.
       val_data_generator: Validation data generated batch by batch.
       model: A Model instance.
       initial_epoch: Epoch from which training starts.
    """

    # Initialize loss weights
    ##model.alpha = tf.Variable(1, trainable=False, name='alpha', dtype=tf.float32)
    ##model.beta = tf.Variable(0, trainable=False, name='beta', dtype=tf.float32)
    model.beta = tf.Variable(1, trainable=False, name='beta', dtype=tf.float32)

    # Initialize number of samples for hard-mining
    ##model.k_mse = tf.Variable(FLAGS.batch_size, trainable=False, name='k_mse', dtype=tf.int32)
    model.k_entropy = tf.Variable(FLAGS.batch_size,
                                  trainable=False,
                                  name='k_entropy',
                                  dtype=tf.int32)

    optimizer = optimizers.Adam(decay=1e-5)

    # Configure training process
    ##model.compile(loss=[utils.hard_mining_mse(model.k_mse),
    ##                    utils.hard_mining_entropy(model.k_entropy)],
    ##                    optimizer=optimizer, loss_weights=[model.alpha, model.beta])
    model.compile(loss=utils.hard_mining_entropy(model.k_entropy),
                  optimizer=optimizer)

    # Save model with the lowest validation loss
    weights_path = os.path.join(FLAGS.experiment_rootdir,
                                'weights_{epoch:03d}.h5')
    writeBestModel = ModelCheckpoint(filepath=weights_path,
                                     monitor='val_loss',
                                     save_best_only=True,
                                     save_weights_only=True)

    # Save model every 'log_rate' epochs.
    # Save training and validation losses.
    logz.configure_output_dir(FLAGS.experiment_rootdir)
    saveModelAndLoss = log_utils.MyCallback(filepath=FLAGS.experiment_rootdir,
                                            period=FLAGS.log_rate,
                                            batch_size=FLAGS.batch_size)

    # Train model
    steps_per_epoch = int(
        np.ceil(train_data_generator.samples / FLAGS.batch_size))
    validation_steps = int(
        np.ceil(val_data_generator.samples / FLAGS.batch_size))

    model.fit_generator(train_data_generator,
                        epochs=FLAGS.epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=[writeBestModel, saveModelAndLoss],
                        validation_data=val_data_generator,
                        validation_steps=validation_steps,
                        initial_epoch=initial_epoch)
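utils.hard_mining_entropy is not shown in this listing; a plausible sketch of such a loss wrapper, assuming it keeps only the k highest-loss samples of each batch (an assumption, not the project's actual implementation):

import tensorflow as tf

def hard_mining_entropy(k):
    # Keras-style loss factory: backpropagate only through the k hardest
    # (highest cross-entropy) samples in the batch.
    def loss(y_true, y_pred):
        per_sample = tf.keras.losses.binary_crossentropy(y_true, y_pred)
        k_eff = tf.minimum(k, tf.shape(per_sample)[0])
        hardest, _ = tf.nn.top_k(per_sample, k=k_eff)
        return tf.reduce_mean(hardest)
    return loss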
Example #6
    def __init__(self, env_name='HalfCheetah-v1',
                 policy_params=None,
                 num_workers=32, 
                 num_deltas=320, 
                 deltas_used=320,
                 delta_std=0.02, 
                 logdir=None, 
                 rollout_length=1000,
                 step_size=0.01,
                 shift='constant zero',
                 params=None,
                 seed=123):

        logz.configure_output_dir(logdir)
        logz.save_params(params)
        
        env = minitaur_gym_env.MinitaurBulletEnv() #gym.make(env_name)
        
        self.timesteps = 0
        self.action_size = env.action_space.shape[0]
        self.ob_size = env.observation_space.shape[0]
        self.num_deltas = num_deltas
        self.deltas_used = deltas_used
        self.rollout_length = rollout_length
        self.step_size = step_size
        self.delta_std = delta_std
        self.logdir = logdir
        self.shift = shift
        self.params = params
        self.max_past_avg_reward = float('-inf')
        self.num_episodes_used = float('inf')

        
        # create shared table for storing noise
        print("Creating deltas table.")
        deltas_id = create_shared_noise.remote()

        self.deltas = SharedNoiseTable(ray.get(deltas_id), seed = seed + 3)
        print('Created deltas table.')

        # initialize workers with different random seeds
        print('Initializing workers.') 
        self.num_workers = num_workers
        self.workers = [Worker.remote(seed + 7 * i,
                                      env_name=env_name,
                                      policy_params=policy_params,
                                      deltas=deltas_id,
                                      rollout_length=rollout_length,
                                      delta_std=delta_std) for i in range(num_workers)]

        # initialize policy 
        if policy_params['type'] == 'linear':
            self.policy = LinearPolicy(policy_params)
            self.w_policy = self.policy.get_weights()
        else:
            raise NotImplementedError
            
        # initialize optimization algorithm
        self.optimizer = optimizers.SGD(self.w_policy, self.step_size)        
        print("Initialization of ARS complete.")
Example #7
File: ars.py Project: zhan0903/ARS
    def __init__(self, env_name='HalfCheetah-v1',
                 policy_params=None,
                 num_workers=32, 
                 num_deltas=320, 
                 deltas_used=320,
                 delta_std=0.02, 
                 logdir=None, 
                 rollout_length=1000,
                 step_size=0.01,
                 shift='constant zero',
                 params=None,
                 seed=123):

        logz.configure_output_dir(logdir)
        logz.save_params(params)
        
        env = gym.make(env_name)
        
        self.timesteps = 0
        self.action_size = env.action_space.shape[0]
        self.ob_size = env.observation_space.shape[0]
        self.num_deltas = num_deltas
        self.deltas_used = deltas_used
        self.rollout_length = rollout_length
        self.step_size = step_size
        self.delta_std = delta_std
        self.logdir = logdir
        self.shift = shift
        self.params = params
        self.max_past_avg_reward = float('-inf')
        self.num_episodes_used = float('inf')

        
        # create shared table for storing noise
        print("Creating deltas table.")
        deltas_id = create_shared_noise.remote()
        self.deltas = SharedNoiseTable(ray.get(deltas_id), seed = seed + 3)
        print('Created deltas table.')

        # initialize workers with different random seeds
        print('Initializing workers.') 
        self.num_workers = num_workers
        self.workers = [Worker.remote(seed + 7 * i,
                                      env_name=env_name,
                                      policy_params=policy_params,
                                      deltas=deltas_id,
                                      rollout_length=rollout_length,
                                      delta_std=delta_std) for i in range(num_workers)]


        # initialize policy 
        if policy_params['type'] == 'linear':
            self.policy = LinearPolicy(policy_params)
            self.w_policy = self.policy.get_weights()
        else:
            raise NotImplementedError
            
        # initialize optimization algorithm
        self.optimizer = optimizers.SGD(self.w_policy, self.step_size)        
        print("Initialization of ARS complete.")
Example #8
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # args = inspect.getargspec(learn)[0]
    # params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(locals_.get("kwargs"))
Example #9
    def train_mf(self):
        self.start_worker()
        self.init_opt()
        logz.configure_output_dir(
            "/home/hendawy/Desktop/2DOF_Robotic_Arm_withSphereObstacle/Rr",
            1807)
        for itr in range(self.current_itr, self.n_itr):
            with logger.prefix('itr #%d | ' % itr):
                paths = self.sampler.obtain_samples(itr, Constrained=True)
                samples_data, analysis_data = self.sampler.process_samples(
                    itr, paths)
                self.log_diagnostics(paths)
                optimization_data = self.optimize_policy(itr, samples_data)
                logz.log_tabular('Iteration', analysis_data["Iteration"])
                # In terms of true environment reward of your rolled out trajectory using the MPC controller
                logz.log_tabular('AverageDiscountedReturn',
                                 analysis_data["AverageDiscountedReturn"])
                logz.log_tabular('AverageReturns',
                                 analysis_data["AverageReturn"])
                logz.log_tabular('violation_cost',
                                 np.mean(samples_data["violation_cost"]))
                logz.log_tabular(
                    'boundary_violation_cost',
                    np.mean(samples_data["boundary_violation_cost"]))
                logz.log_tabular('success_rate', samples_data["success_rate"])
                logz.log_tabular(
                    'successful_AverageReturn',
                    np.mean(samples_data["successful_AverageReturn"]))
                logz.log_tabular('ExplainedVariance',
                                 analysis_data["ExplainedVariance"])
                logz.log_tabular('NumTrajs', analysis_data["NumTrajs"])
                logz.log_tabular('Entropy', analysis_data["Entropy"])
                logz.log_tabular('Perplexity', analysis_data["Perplexity"])
                logz.log_tabular('StdReturn', analysis_data["StdReturn"])
                logz.log_tabular('MaxReturn', analysis_data["MaxReturn"])
                logz.log_tabular('MinReturn', analysis_data["MinReturn"])
                logz.log_tabular('LossBefore', optimization_data["LossBefore"])
                logz.log_tabular('LossAfter', optimization_data["LossAfter"])
                logz.log_tabular('MeanKLBefore',
                                 optimization_data["MeanKLBefore"])
                logz.log_tabular('MeanKL', optimization_data["MeanKL"])
                logz.log_tabular('dLoss', optimization_data["dLoss"])
                logz.dump_tabular()
                logger.log("saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                self.current_itr = itr + 1
                params["algo"] = self
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        self.shutdown_worker()
Example #10
def setup_logger(logdir, locals_):
  # Configure output directory for logging
  logz.configure_output_dir(logdir)
  # Log experimental parameters
  args = inspect.getargspec(QLearner)[0]
  params = {k: str(locals_[k]) if k in locals_ else None for k in args}
  params['exp_name'] = locals_['q_func'].__name__ + locals_['double_q'] * '_doubleQ'
  logz.save_params(params)
Example #11
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    seed = np.random.get_state()[1][0]
    logz.configure_output_dir(logdir + '/%s/' % seed)
    # Log experimental parameters
    params = {k: str(locals_[k]) for k in locals_ if '__' not in k}
    params['seed'] = str(seed)
    logz.save_params(params)
Example #12
def main():

    PROJECT_ROOT = osp.dirname(osp.realpath(__file__))
    logz.configure_output_dir(
        osp.join(PROJECT_ROOT,
                 "log/" + "_RAM_" + time.strftime("%d-%m-%Y_%H-%M-%S")))
    seed = 0
    env = get_env('SpaceInvaders-v0', seed)
    session = get_session()
    atari_learn(env, session, num_timesteps=40000000)
Example #13
    def train(self, train_db, val_db, test_db):
        ##################################################################
        ## LOG
        ##################################################################
        logz.configure_output_dir(self.cfg.model_dir)
        logz.save_config(self.cfg)

        ##################################################################
        ## Main loop
        ##################################################################
        start = time()
        min_val_loss = 100000000
        for epoch in range(self.epoch, self.cfg.n_epochs):
            ##################################################################
            ## Training
            ##################################################################
            torch.cuda.empty_cache()
            train_loss, train_accu = self.train_epoch(train_db, epoch)

            ##################################################################
            ## Validation
            ##################################################################
            torch.cuda.empty_cache()
            val_loss, val_accu = self.validate_epoch(val_db, epoch)

            ##################################################################
            ## Logging
            ##################################################################

            # update optim scheduler
            current_val_loss = np.mean(val_loss[:,0])
            # self.optimizer.update(current_val_loss, epoch)
            logz.log_tabular("Time", time() - start)
            logz.log_tabular("Iteration", epoch)
            logz.log_tabular("AverageLoss",         np.mean(train_loss[:, 0]))
            logz.log_tabular("AveragePredLoss",     np.mean(train_loss[:, 1]))
            logz.log_tabular("AverageEmbedLoss",    np.mean(train_loss[:, 2]))
            logz.log_tabular("AverageAttnLoss",     np.mean(train_loss[:, 3]))
            logz.log_tabular("AverageObjAccu",      np.mean(train_accu[:, 0]))
            logz.log_tabular("AverageCoordAccu",    np.mean(train_accu[:, 1]))
            logz.log_tabular("AverageScaleAccu",    np.mean(train_accu[:, 2]))
            logz.log_tabular("AverageRatioAccu",    np.mean(train_accu[:, 3]))

            logz.log_tabular("ValAverageLoss",      np.mean(val_loss[:, 0]))
            logz.log_tabular("ValAveragePredLoss",  np.mean(val_loss[:, 1]))
            logz.log_tabular("ValAverageEmbedLoss", np.mean(val_loss[:, 2]))
            logz.log_tabular("ValAverageAttnLoss",  np.mean(val_loss[:, 3]))
            logz.log_tabular("ValAverageObjAccu",   np.mean(val_accu[:, 0]))
            logz.log_tabular("ValAverageCoordAccu", np.mean(val_accu[:, 1]))
            logz.log_tabular("ValAverageScaleAccu", np.mean(val_accu[:, 2]))
            logz.log_tabular("ValAverageRatioAccu", np.mean(val_accu[:, 3]))
            logz.dump_tabular()

            ##################################################################
            ## Checkpoint
            ##################################################################
            self.save_checkpoint(epoch)
Example #14
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    # print(params.items())
    # print(json.dumps(list(params.values())))
    logz.save_params(params)
Example #15
def trainModel(train_data_generator, val_data_generator, model, initial_epoch):
    """
    Model training.
    # Arguments
       train_data_generator: Training data generated batch by batch.
       val_data_generator: Validation data generated batch by batch.
       model: A Model instance.
       initial_epoch: Epoch from which training starts.
    """
    # Configure training process
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=cifar10_resnet_mod.lr_schedule(0)),
                  metrics=['categorical_accuracy'])

    # Save model with the lowest validation loss
    weights_path = os.path.join(FLAGS.experiment_rootdir,
                                'weights_{epoch:03d}.h5')
    writeBestModel = ModelCheckpoint(filepath=weights_path,
                                     monitor='val_loss',
                                     save_best_only=True,
                                     save_weights_only=True)

    # Save training and validation losses.
    logz.configure_output_dir(FLAGS.experiment_rootdir)
    saveModelAndLoss = log_utils.MyCallback(filepath=FLAGS.experiment_rootdir)

    # Train model
    steps_per_epoch = int(
        np.ceil(train_data_generator.samples / FLAGS.batch_size))
    validation_steps = int(
        np.ceil(val_data_generator.samples / FLAGS.batch_size)) - 1

    lr_scheduler = LearningRateScheduler(cifar10_resnet_mod.lr_schedule)

    lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1),
                                   cooldown=0,
                                   patience=5,
                                   min_lr=0.5e-6)

    # TensorBoard is used to visualize the results of the iterations.
    # Up to this point an iterative process runs that keeps, across all
    # cases, the model with the best result on the validation data.
    strTime = strftime("%Y%b%d_%Hh%Mm%Ss", localtime(time()))
    tensorboard = TensorBoard(log_dir="logs/{}".format(strTime),
                              histogram_freq=0)
    callbacks = [
        writeBestModel, saveModelAndLoss, lr_reducer, lr_scheduler, tensorboard
    ]

    model.fit_generator(train_data_generator,
                        epochs=FLAGS.epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_data=val_data_generator,
                        validation_steps=validation_steps,
                        initial_epoch=initial_epoch)
Example #16
    def __init__(
        self,
        organism_builder=None,
        logdir=None,
        params=None,
        master_organism=None,
        sampler_builder=None,
    ):

        logz.configure_output_dir(logdir)
        logz.save_params(params)

        # env = env_registry.get_env_constructor(params['env_name'])()

        self.logdir = logdir
        self.params = params
        self.max_past_avg_reward = float('-inf')
        self.num_episodes_used = float('inf')

        # create shared table for storing noise
        print("Creating deltas table.")
        deltas_id = create_shared_noise_serial()
        self.deltas = SharedNoiseTable(deltas_id, seed=params['seed'] + 3)
        print('Created deltas table.')

        ########################################################

        self.master_organism = master_organism

        self.sampler = sampler_builder(
            num_deltas=params['n_directions'],
            shift=params['shift'],
            num_workers=params['n_workers'],
            seed=params['seed'],
            env_name=params['env_name'],
            organism_builder=
            organism_builder,  #lambda: ARS_LinearAgent(agent_args)
            deltas_id=deltas_id,
            rollout_length=params['rollout_length'],
            delta_std=params['delta_std'],
        )

        # maybe we'd need to merge Sampler and Agent
        # agent holds the parameters, but sampler takes the agent and does the parallel rollouts
        # so agent should not have the workers at all...
        # agent should just contain the parameter.
        # but the sampler would need to take the agent in.
        # so the sampler is the thing that takes a single agent, and creates a bunch of workers
        # modeled the agent.

        self.rl_alg = ARS_RL_Alg(
            deltas=self.deltas,  # noise table
            num_deltas=params['n_directions'],  # N
            deltas_used=params['deltas_used']  # b
        )
Example #17
def run_model(session, predict, loss, train_step, saver, images, labels, X, y,
              epochs=1, batch_size=64, print_every=100, is_test=False):
    if not is_test:
        # Configure output directory for logging
        logz.configure_output_dir('logs')

        # Log experimental parameters
        args = inspect.getargspec(main)[0] # Get the names and default values of a function's parameters.
        locals_ = locals() # Return a dictionary containing the current scope's local variables
        params = {k: locals_[k] if k in locals_ else None for k in args}
        logz.save_params(params)

    # have tensorflow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict, axis=1), tf.argmax(y, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    # counter
    iter_cnt = 0
    iters_each_epoch = len(images) // batch_size
    for e in range(epochs):
        # keep track of losses and accuracy
        correct = 0
        losses = []
        # make sure we iterate over the dataset once
        images, labels = shuffle_dataset(images, labels)
        for i in range(iters_each_epoch):
            # Slice the current minibatch, starting from the first sample
            batch_X = images[i * batch_size:(i + 1) * batch_size]
            batch_y = labels[i * batch_size:(i + 1) * batch_size]
            feed_dict = {X: batch_X, y: batch_y}
            
            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            l, corr, _ = session.run([loss, correct_prediction, train_step],feed_dict=feed_dict)

            # aggregate performance stats
            losses.append(l*batch_size)
            correct += np.sum(corr)
            
            # print every now and then
            if (iter_cnt % print_every) == 0 and not is_test:
                logz.log_tabular("Iteration", iter_cnt)
                logz.log_tabular("minibatch_loss", l)
                logz.log_tabular("minibatch_accuracy", np.sum(corr)/batch_size)
                logz.dump_tabular()
                logz.pickle_tf_vars()

            iter_cnt += 1
        if is_test:
            total_correct = correct/len(images)
            total_loss = np.sum(losses)/len(images)
            print('acc:', total_correct)
            print('los:', total_loss)
        else:
            saver.save(session, 'checkpoints/mnist_plus', iter_cnt)
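shuffle_dataset is used above but not defined in this listing; a minimal sketch of what such a helper typically does, assuming NumPy arrays (not necessarily the project's version):

import numpy as np

def shuffle_dataset(images, labels):
    # One shared random permutation keeps image/label pairs aligned
    idx = np.random.permutation(len(images))
    return images[idx], labels[idx]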
Example #18
def main():
    PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
    logz.configure_output_dir(
        os.path.join(PROJECT_ROOT,
                     "log/" + "_RAM_" + time.strftime("%d-%m-%Y_%H-%M-%S")))

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(seed)
    session = get_session()
    atari_learn(env, session, num_timesteps=int(4e7))
Example #19
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)


    YOUR_CODE_HERE


    sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)

        YOUR_CODE_HERE

        if kl > desired_kl * 2: 
            stepsize /= 1.5
            print('stepsize -> %s'%stepsize)
        elif kl < desired_kl / 2: 
            stepsize *= 1.5
            print('stepsize -> %s'%stepsize)
        else:
            print('stepsize OK')


        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
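explained_variance_1d above measures how well the value function predicts the empirical returns; the usual definition is sketched below (consistent with its use here, though not guaranteed to match the homework helper verbatim):

import numpy as np

def explained_variance_1d(ypred, y):
    # 1 - Var[y - ypred] / Var[y]: 1.0 is a perfect fit, 0 is no better
    # than predicting a constant, and negative values are worse than that.
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary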
Example #20
def learn(*args, **kwargs):
    alg = QLearner(*args, **kwargs)
    logz.configure_output_dir(alg.logdir)
    if alg.start_time is None:
        alg.start_time = time.time()
    while not alg.stopping_criterion_met():
        alg.step_env()
        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and self.last_obs should point to the new latest
        # observation
        alg.update_model()
        alg.log_progress()
Example #21
    def __init__(
        self,
        env=None,
        discrete=True,
        ob_dim=0,
        ac_dim=0,
        gamma=1.0,
        max_path_length=None,
        learning_rate=5e-3,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        # network arguments
        n_layers=1,
        size=32,
        gae_lambda=-1.0,
        model_tag='vanilla',
        #ppo parameter
        clip_ratio=0.2,
    ):
        #params
        self.nn_baseline = nn_baseline
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.normalize_advantages = normalize_advantages
        self.n_layers = n_layers
        self.size = size
        self.gae_lambda = gae_lambda
        self.model_tag = model_tag
        self.clip_ratio = clip_ratio
        # Configure output directory for logging
        logz.configure_output_dir(logdir)
        self.log_dir = logdir
        # Log experimental parameters
        # args = inspect.getfullargspec(__init__)[0]
        # locals_ = locals()
        # params = {k: locals_[k] if k in locals_ else None for k in args}
        # logz.save_params(params)

        # Make the gym environment
        self.env = env
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim
        # Is this env continuous, or discrete?
        self.discrete = discrete
        # Maximum length for episodes
        self.max_path_length = max_path_length
        self.setup_placeholders()
        self.setup_tf_operations()
        self.setup_loss()
        if self.nn_baseline:
            self.setup_baseline()
Example #22
def setup_logger(logdir, params):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # args = inspect.getargspec(learn)[0]
    check_params = params.copy()
    log_params = params.copy()
    for param in check_params.keys():
        try:
            json.dumps(check_params[param])
        except:
            del log_params[param]
    logz.save_params(log_params)
Example #23
def main():
    # Get Atari games.

    # Change the index to select a different game.
    PROJECT_ROOT = osp.dirname(osp.realpath(__file__))
    logz.configure_output_dir(
        osp.join(PROJECT_ROOT,
                 "log/" + "_RAM_" + time.strftime("%d-%m-%Y_%H-%M-%S")))

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env('SpaceInvaders-v0', seed)
    session = get_session()
    atari_learn(env, session, num_timesteps=40000000)
Example #24
def main():
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]
    PROJECT_ROOT = os.path.dirname(os.path.realpath(__file__))
    logz.configure_output_dir(os.path.join(PROJECT_ROOT, "log/"+"_RAM_"+time.strftime("%d-%m-%Y_%H-%M-%S")))

    # Run training
    seed = 0 # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed)
    session = get_session()
    atari_learn(env, session, num_timesteps=task.max_timesteps)
Example #25
def mpc_platoon_sampler(vehicle_model,platoon_model,controller,cost_function,save,timeSteps=3000,pred_horizon=15, imag_rollouts_number=400,X_initial_1=[0,0],X_initial_2=[0,0],X_desired_1=[0,10],X_desired_2=[2,0]):
    X_1=[]
    X_2=[]
    X_v2=[]
    u_1=[]
    u_2=[]
    cost_1=[]
    cost_2=[]
    X_1.append(X_initial_1)
    X_v2.append(X_initial_2)
    X_2.append([X_1[0][0]-X_v2[0][0],X_1[0][1]-X_v2[0][1]])
    cost_1.append(Abs_Error(X_initial_1[1],X_desired_1[1]))
    cost_2.append(Abs_Error(X_2[0][0],X_desired_2[0]))
    if(save):
        logz.configure_output_dir("/home/hendawy/Desktop/Platoon_Advanced_Mechatronics_Project/RLTrial",11)
    for t in range(timeSteps):
        # time_start=time.time()
        u1_t=controller(vehicle_model,pred_horizon,imag_rollouts_number,X_1[t],cost_function,X_desired_1,'Leader')
        u2_t=controller(vehicle_model,pred_horizon,imag_rollouts_number,X_v2[t],cost_function,X_desired_2,'Follower',X_1[t])
        X_next_1,X_next_2=platoon_model(u1_t,u2_t,X_1[t][0],X_1[t][1],X_v2[t][0],X_v2[t][1])
        # print('Vehicle 1',X_1[t],X_next_1,u1_t)
        # print('Vehicle 2',X_v2[t],X_next_2,u2_t)
        cost_1.append(Abs_Error(X_next_1[1],X_desired_1[1]))
        cost_2.append(Abs_Error(X_next_2[0],X_desired_2[0]))
        if(save):
            logz.log_tabular('Error_v1', Abs_Error(X_next_1[1],X_desired_1[1]))
            logz.log_tabular('Error_v2', Abs_Error(X_next_2[0],X_desired_2[0]))
            logz.dump_tabular()
        X_1.append(X_next_1)
        X_2.append(X_next_2)
        u_1.append([u1_t])
        u_2.append([u2_t])
        X_v2.append([-X_next_2[0]+X_next_1[0],-X_next_2[1]+X_next_1[1]])
        # time_end=time.time()
        # print(time_end-time_start)
    X_1.pop()
    X_2.pop()
    traj = {"states_v1" : np.array(X_1),
            "states_v2" : np.array(X_v2),
            "states_f1" : np.array(X_2),
            "control_v1" : np.array(u_1),
            "control_v2" : np.array(u_2),
            "cost_v1" : np.array(cost_1),
            "cost_v2" : np.array(cost_2),
            }
    return traj
Example #26
def train_model(train_generator, val_generator, model, initial_epoch):

    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=lr_schedule(0)),
                  metrics=['accuracy'])

    # Save model with the lowest validation loss
    weights_path = os.path.join(FLAGS.experiment_rootdir,
                                'weights_{epoch:03d}.h5')
    write_best_model = ModelCheckpoint(filepath=weights_path,
                                       monitor='val_loss',
                                       save_best_only=True,
                                       save_weights_only=True)

    # Save training and validation losses.
    logz.configure_output_dir(FLAGS.experiment_rootdir)
    save_model_and_loss = log_utils.MyCallback(
        filepath=FLAGS.experiment_rootdir)

    # Train model
    lr_scheduler = LearningRateScheduler(lr_schedule, verbose=FLAGS.verbose)

    lr_reducer = ReduceLROnPlateau(factor=np.sqrt(0.1),
                                   cooldown=0,
                                   patience=5,
                                   verbose=FLAGS.verbose,
                                   min_lr=0.5e-6)
    # earlystopping = EarlyStopping(monitor='val_loss', patience=3, verbose=FLAGS.verbose)

    str_time = strftime("%Y%b%d_%Hh%Mm%Ss", localtime(time()))
    tensorboard = TensorBoard(log_dir="logs/{}".format(str_time),
                              histogram_freq=0)

    callbacks = [
        write_best_model, save_model_and_loss, lr_reducer, lr_scheduler,
        tensorboard
    ]

    model.fit_generator(train_generator,
                        validation_data=val_generator,
                        epochs=FLAGS.epochs,
                        verbose=FLAGS.verbose,
                        callbacks=callbacks,
                        initial_epoch=initial_epoch,
                        use_multiprocessing=True)
Example #27
def main():
    # Get Atari games.
    task = gym.make('LunarLander-v2')

    file_dir = osp.dirname(osp.abspath(__file__))
    unique_name = datetime.datetime.now(dateutil.tz.tzlocal()).strftime(
        '%Y_%m_%d_%H_%M_%S_%f_%Z') + '__' + str(uuid.uuid4())
    result_dir = osp.join(file_dir, unique_name)

    logz.configure_output_dir(result_dir)
    logz.save_params(dict(exp_name=unique_name, ))

    # Run training
    seed = 1
    print('random seed = %d' % seed)
    env = get_env(task, seed, result_dir)
    session = get_session()
    atari_learn(env, session, num_timesteps=5e5, result_dir=result_dir)
Example #28
def trainModel(train_data_generator, val_data_generator, model, initial_epoch):
    """
    Model training.
    # Arguments
       train_data_generator: Training data generated batch by batch.
       val_data_generator: Validation data generated batch by batch.
       model: A Model instance.
       initial_epoch: Epoch from which training starts.
    """

    # Configure training process
    optimizer = keras.optimizers.Adam(lr=FLAGS.initial_lr, decay=1e-6)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['binary_accuracy'],
                  loss_weights=np.ones((21, )).tolist())

    # Save model with the lowest validation loss
    weights_path = os.path.join(FLAGS.experiment_rootdir,
                                'weights_{epoch:03d}.h5')
    writeBestModel = ModelCheckpoint(filepath=weights_path,
                                     monitor='val_loss',
                                     save_best_only=True,
                                     save_weights_only=True)
    tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

    # Save training and validation losses.
    logz.configure_output_dir(FLAGS.experiment_rootdir)
    saveModelAndLoss = log_utils.MyCallback(filepath=FLAGS.experiment_rootdir)

    # Train model
    steps_per_epoch = int(
        np.ceil(train_data_generator.samples / FLAGS.batch_size))
    validation_steps = int(
        np.ceil(val_data_generator.samples / FLAGS.batch_size)) - 1

    model.fit_generator(
        train_data_generator,
        epochs=FLAGS.epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=[writeBestModel, saveModelAndLoss, tensorboard],
        validation_data=val_data_generator,
        validation_steps=validation_steps,
        initial_epoch=initial_epoch)
Example #29
    def __init__(self,
                 env=None,
                 discrete=True,
                 ob_shape=(),
                 ac_dim=0,
                 gamma=1.0,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 logdir=None,
                 normalize_returns=True,
                 # network arguments
                 n_layers=1,
                 size=32,
                 gae_lambda=-1.0,
                 tau=0.001 #parameter update rate
                ):
        self.gamma = gamma
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.normalize_returns = normalize_returns
        self.n_layers = n_layers
        self.size = size
        self.gae_lambda = gae_lambda
        self.tau = tau

        # Configure output directory for logging
        logz.configure_output_dir(logdir)
        # Log experimental parameters
        # args = inspect.getfullargspec(train_DDPG)[0]
        # locals_ = locals()
        # params = {k: locals_[k] if k in locals_ else None for k in args}
        # logz.save_params(params)

        # Make the gym environment
        self.env = env
        # Is this env continuous, or discrete?
        self.discrete = discrete
        self.ac_dim = ac_dim
        self.ob_dim = ob_shape[0]
        #observation_shape in cartpole is (2,), a tuple
        self.memory = Memory(limit=int(1e6), action_shape=ac_dim, observation_shape=ob_shape)
        self.setup_placeholders()
        self.setup_network()
Example #30
def get_env(env_name, exp_name, seed):
    env = gym.make(env_name)

    set_global_seeds(seed)
    env.seed(seed)

    # Set Up Logger
    logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = osp.join('data', logdir)
    logdir = osp.join(logdir, '%d'%seed)
    logz.configure_output_dir(logdir)
    hyperparams = {'exp_name': exp_name, 'env_name': env_name}
    logz.save_hyperparams(hyperparams)

    expt_dir = '/tmp/hw3_vid_dir/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True, video_callable=False)
    

    return env
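set_global_seeds is another helper not shown here; a typical sketch for the TF1-era code in these examples (an assumption about its contents):

import random

import numpy as np
import tensorflow as tf

def set_global_seeds(seed):
    # Seed every RNG the training code might touch so runs are repeatable
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)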
Example #31
def get_env(env_name, exp_name, seed):
    env = gym.make(env_name)

    set_global_seeds(seed)
    env.seed(seed)

    # Set Up Logger
    logdir = 'dqn_' + exp_name + '_' + env_name + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    logdir = osp.join('data', logdir)
    logdir = osp.join(logdir, '%d' % seed)
    logz.configure_output_dir(logdir)
    hyperparams = {'exp_name': exp_name, 'env_name': env_name}
    logz.save_hyperparams(hyperparams)

    expt_dir = '/tmp/hw3_vid_dir2/'
    env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
    env = wrap_deepmind(env)
    # observation = env.reset()
    # print('observation shape', observation.shape)

    return env
Example #32
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #todo: create Agent
    
    #todo: initilize Agent:

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = actor.run(ob)
                print("need to type-check action here:(two lines)")
                print(ac)
                print(ac.size())
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            #One episode finishes; perform update here
            finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None, )
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch



        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #33
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             network_activation='tanh'
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    
    # activation function for the network
    if network_activation == 'relu':
        activation = torch.nn.functional.relu
    elif network_activation == 'leaky_relu':
        activation = torch.nn.functional.leaky_relu
    else:
        activation = torch.tanh  # torch.nn.functional.tanh is deprecated
    #todo: create policy
    actor=build_mlp(ob_dim, ac_dim, "actor",\
                             n_layers=n_layers, size=size, activation=activation, discrete=discrete)
    actor_loss=reinforce_loss
    actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate)
    
    #todo: initilize Agent:
    
    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#
    if nn_baseline:
        critic=build_mlp(ob_dim,1,"nn_baseline",\
                                    n_layers=n_layers,size=size, discrete=discrete)
        critic_loss=nn.MSELoss()
        critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate)
        

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards, log_probs = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                ob = torch.from_numpy(ob).float().unsqueeze(0)
                obs.append(ob)
                ac, log_prob = actor.run(ob)
                acs.append(ac)
                log_probs.append(log_prob)
                #format the action from policy
                if discrete:
                    ac = int(ac)
                else:
                    ac = ac.squeeze(0).numpy()
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : torch.cat(obs, 0),
                    "reward" : torch.Tensor(rewards),
                    "action" : torch.cat(acs, 0),
                    "log_prob" : torch.cat(log_probs, 0)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        ob_no = torch.cat([path["observation"] for path in paths], 0)
        ac_na = torch.cat([path["action"] for path in paths], 0)
                                   
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#
        q_n = []
        for path in paths:
            rewards = path['reward']
            num_steps = pathlength(path)
            R=[]
            if reward_to_go:
                for t in range(num_steps):
                    R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1))
                q_n.append(torch.cat(R))
            else:
                q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1))
        q_n = torch.cat(q_n, 0)
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#
        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = critic(ob_no)
            q_n_std = q_n.std()
            q_n_mean = q_n.mean()
            b_n_scaled = b_n * q_n_std + q_n_mean
            adv_n = (q_n - b_n_scaled).detach()
        else:
            adv_n = q_n
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item())
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            # Hint #bl2: fit the critic to mean-zero, unit-std targets.
            target = ((q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item())).detach()
            critic_optimizer.zero_grad()
            c_loss = critic_loss(b_n, target)
            c_loss.backward()
            critic_optimizer.step()
            
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
        log_probs = torch.cat([path["log_prob"] for path in paths], 0)
        actor_optimizer.zero_grad()
        loss = actor_loss(log_probs, adv_n, len(paths))
        print("Actor loss:", loss.item())
        loss.backward()
        actor_optimizer.step()

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
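
# The 'actor_loss' and 'critic_loss' helpers used above are not shown in this
# example. Hypothetical reconstructions, consistent with the PG estimator
# described in the comments (a sketch, not the verbatim originals):
import torch
import torch.nn.functional as F

def actor_loss(log_probs, adv_n, num_paths):
    # Average over trajectories of sum_t -log pi(a_t|s_t) * A_t.
    return -(log_probs.view(-1) * adv_n.view(-1)).sum() / num_paths

def critic_loss(b_n, target):
    # Mean-squared error between baseline predictions and normalized targets.
    return F.mse_loss(b_n.view(-1), target.view(-1))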
Example #34
def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None):
    env = gym.make("CartPole-v0")
    ob_dim = env.observation_space.shape[0]
    num_actions = env.action_space.n
    logz.configure_output_dir(logdir)
    vf = LinearValueFunction()

    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in this function.
    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations
    sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate
    sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer
    sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer
    # we use a small initialization for the last layer, so the initial policy has maximal entropy
    sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic)
    sy_logp_na = tf.nn.log_softmax(sy_logits_na) # log-probability of actions
    sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient)
    sy_n = tf.shape(sy_ob_no)[0]
    sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation
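    # 'fancy_slice_2d' comes from the course utilities (not shown here); a
    # possible implementation gathers element [i, inds1[i]] from each row by
    # flattening:
    #   def fancy_slice_2d(X, inds0, inds1):
    #       ncols = tf.shape(X)[1]
    #       return tf.gather(tf.reshape(X, [-1]), inds0 * ncols + inds1)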

    # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>>
    sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na)
    sy_oldp_na = tf.exp(sy_oldlogp_na) 
    sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n)
    sy_p_na = tf.exp(sy_logp_na)
    sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n)
    # <<<<<<<<<<<<<

    sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 
    # use single thread. on such a small problem, multithreading gives you a slowdown
    # this way, we can better use multiple cores for different experiments
    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101

    total_timesteps = 0

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    terminated = True
                    break
            path = {"observation" : np.array(obs), "terminated" : terminated,
                    "reward" : np.array(rewards), "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize})
        kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na})

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
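
# The helpers 'pathlength', 'discount', and 'explained_variance_1d' used above
# come from the course utilities and are not shown in this example. Minimal
# sketches under standard assumptions (not the verbatim originals):
import numpy as np
import scipy.signal

def pathlength(path):
    # Number of timesteps in a rollout.
    return len(path["reward"])

def discount(x, gamma):
    # Discounted reward-to-go: out[t] = sum_{t' >= t} gamma^(t'-t) * x[t'].
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1])[::-1]

def explained_variance_1d(ypred, y):
    # 1 - Var[y - ypred] / Var[y]; values near 1 indicate a good value fit.
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary
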
Example #35
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct,
              logdir, debug, gpu):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)
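    # 'alpha' is SAC's entropy-temperature coefficient, weighting the policy
    # entropy bonus against reward; note the higher-dimensional environments
    # (Ant, Humanoid) use smaller values here.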

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparametrize,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': two_qf,
    }
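    # 'tau' sets the soft target-network update used by SAC:
    #   target_params <- tau * params + (1 - tau) * target_params
    # applied after each gradient step.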
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        old_funct=old_funct,
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)
    with tf.Session(config=tf_config) as sess:

        if debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)

        for epoch in algorithm.train(sampler,
                                     session=sess,
                                     n_epochs=algorithm_params.get(
                                         'n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
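
# A minimal invocation sketch (hypothetical argument values; the CLI wrapper
# that parses them is not shown in this example):
#
#   train_SAC(env_name='HalfCheetah-v2', exp_name='sac_demo', seed=0,
#             reparametrize=True, two_qf=False, old_funct=False,
#             logdir='data/sac_demo', debug=False, gpu='0')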
Example #36
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage estimates fed in at update time


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        # One possible implementation, following the hints above:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=[1]) # Hint: Use the tf.multinomial op
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na,
                                                                       logits=sy_logits_na)

    else:
        # YOUR_CODE_HERE
        # One possible implementation, following the hints above:
        sy_mean = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        sy_logstd = tf.get_variable("logstd", shape=[ac_dim], initializer=tf.zeros_initializer()) # logstd should just be a trainable variable, not a network output.
        # Reparameterization trick: mu + sigma * z with z ~ N(0, I)
        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean))
        # Log probability under a multivariate Gaussian with diagonal covariance
        # (additive constants dropped; the -sum(logstd) term still matters for the gradient).
        sy_logprob_n = -0.5 * tf.reduce_sum(tf.square((sy_ac_na - sy_mean) / tf.exp(sy_logstd)), axis=1) \
                       - tf.reduce_sum(sy_logstd)



    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n) # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        # One possible implementation (the 'sy_target_n' name is ours): regress
        # the baseline onto (normalized) return targets with an MSE loss.
        sy_target_n = tf.placeholder(shape=[None], name="baseline_target", dtype=tf.float32)
        baseline_loss = tf.reduce_mean(tf.square(baseline_prediction - sy_target_n))
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        # One possible implementation, mirroring the PyTorch version earlier.
        def path_q(rewards):
            if reward_to_go:
                # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
                return np.array([(gamma ** np.arange(len(rewards) - t) * rewards[t:]).sum()
                                 for t in range(len(rewards))])
            # Q_t = Ret(tau), the full discounted return, at every timestep
            return np.full(len(rewards), (gamma ** np.arange(len(rewards)) * rewards).sum())
        q_n = np.concatenate([path_q(path["reward"]) for path in paths])

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            # Hint #bl1: the baseline is fit to normalized targets, so rescale
            # its output to the statistics of the current Q batch.
            b_n = b_n * q_n.std() + q_n.mean()
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            # Hint #bl2: fit to mean-zero, unit-std targets (matches the rescale above).
            target_n = (q_n - q_n.mean()) / (q_n.std() + 1e-8)
            sess.run(baseline_update_op, feed_dict={sy_ob_no: ob_no, sy_target_n: target_n})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
        # One possible implementation: run the update and track the surrogate
        # loss before and after for debugging.
        feed = {sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n}
        loss_before = sess.run(loss, feed_dict=feed)
        sess.run(update_op, feed_dict=feed)
        loss_after = sess.run(loss, feed_dict=feed)
        print("Surrogate loss: %.5f -> %.5f" % (loss_before, loss_after))

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
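
# A minimal invocation sketch (hypothetical argument values; the CLI wrapper
# that parses them is not shown in this example):
#
#   train_PG(exp_name='pg_demo', env_name='CartPole-v0', n_iter=100,
#            gamma=0.99, reward_to_go=True, nn_baseline=True,
#            normalize_advantages=True, logdir='data/pg_demo', seed=0)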