# Long training run: 10000 VPG iterations with a fixed 2048-step batch.
# Assumed imports: parallel_sampler and LinearFeatureBaseline ship with rllab;
# QMDPPolicy and VPG_t are project-local, so import them from wherever they
# live in this repo. `env` is assumed to be constructed earlier in the script.
import tensorflow as tf

from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.sampler import parallel_sampler

parallel_sampler.initialize(n_parallel=1)
parallel_sampler.set_seed(0)

# QMDP policy parameterized by the wrapped environment's model parameters.
policy = QMDPPolicy(
    env_spec=env.spec,
    name="QMDP",
    qmdp_param=env._wrapped_env.params,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)

with tf.Session() as sess:
    # writer = tf.summary.FileWriter(logdir=log_dir)
    algo = VPG_t(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,  # originally 2 * env._wrapped_env.params['traj_limit']
        max_path_length=env._wrapped_env.params['traj_limit'],
        n_itr=10000,
        discount=0.95,
        step_size=0.01,
        record_rewards=True,
        transfer=False,
    )
    algo.train(sess)
    # tf.summary.merge_all()
    # print(sess.graph)
    # writer.add_graph(sess.graph)
    # writer.close()
name="QMDP", qmdp_param=env._wrapped_env.params) baseline = LinearFeatureBaseline(env_spec=env.spec) with tf.Session() as sess: writer = tf.summary.FileWriter(logdir=log_dir, ) algo = VPG_t( env=env, policy=policy, baseline=baseline, batch_size=2 * env._wrapped_env.params['traj_limit'], max_path_length=env._wrapped_env.params['traj_limit'], n_itr=1, discount=0.95, step_size=0.01, record_rewards=True, transfer=False, # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) # optimizer = PenaltyLbfgsOptimizer() # Uncomment both lines (this and the plot parameter below) to enable plotting # plot=True, ) algo.train(sess) # tf.summary.merge_all() # print(sess.graph) writer.add_graph(sess.graph) writer.close()