import tensorflow as tf

from rllab.sampler import parallel_sampler
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline

# NOTE: QMDPPolicy and VPG_t are project-local modules; the import paths below
# are assumptions and may need adjusting to this repository's layout. `env`
# (and `log_dir`, if the summary writer is enabled) are assumed to be defined
# earlier in the script.
from policies.qmdp_policy import QMDPPolicy
from algos.vpg_t import VPG_t

# Run the sampler in a single worker process with a fixed seed for reproducibility.
parallel_sampler.initialize(n_parallel=1)
parallel_sampler.set_seed(0)

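# QMDP policy parameterised by the wrapped environment's POMDP model parameters.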
policy = QMDPPolicy(env_spec=env.spec,
                    name="QMDP",
                    qmdp_param=env._wrapped_env.params)

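# Linear feature baseline: variance reduction for the policy-gradient estimator.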
baseline = LinearFeatureBaseline(env_spec=env.spec)

with tf.Session() as sess:

    # Optional TensorBoard graph logging; uncomment together with the
    # writer.add_graph / writer.close calls after training.
    # writer = tf.summary.FileWriter(logdir=log_dir)

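    # VPG_t appears to be a vanilla-policy-gradient variant with support for
    # transferring weights between runs; transfer=False trains from scratch.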
    algo = VPG_t(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,  # alternatively: 2 * env._wrapped_env.params['traj_limit']
        max_path_length=env._wrapped_env.params['traj_limit'],
        n_itr=10000,
        discount=0.95,
        step_size=0.01,
        record_rewards=True,
        transfer=False,
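        # Optional alternative optimizers:
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)),
        # optimizer=PenaltyLbfgsOptimizer(),
        # Uncomment to enable plotting:
        # plot=True,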
    )

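    # Run the training loop in the current session.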
    algo.train(sess)
    # tf.summary.merge_all()
    # writer.add_graph(sess.graph)
    # writer.close()
                    name="QMDP",
                    qmdp_param=env._wrapped_env.params)

baseline = LinearFeatureBaseline(env_spec=env.spec)

with tf.Session() as sess:

    writer = tf.summary.FileWriter(logdir=log_dir, )

    algo = VPG_t(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2 * env._wrapped_env.params['traj_limit'],
        max_path_length=env._wrapped_env.params['traj_limit'],
        n_itr=1,
        discount=0.95,
        step_size=0.01,
        record_rewards=True,
        transfer=False,
        # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
        # optimizer = PenaltyLbfgsOptimizer()
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )

    algo.train(sess)
    # tf.summary.merge_all()
    # print(sess.graph)
    writer.add_graph(sess.graph)
    writer.close()