from Algo import parallel_sampler

# Configure the sampler (n_parallel=1, seed 0) before any policy or
# algorithm objects are constructed.
parallel_sampler.initialize(n_parallel=1)
parallel_sampler.set_seed(0)

# Both the policy and the baseline are built from the environment spec; the
# policy additionally receives the wrapped environment's parameter set.
policy = QMDPPolicy(
    env_spec=env.spec,
    name="QMDP",
    qmdp_param=env._wrapped_env.params,
)
baseline = LinearFeatureBaseline(env_spec=env.spec)

with tf.Session() as sess:
    # All VPG_t keyword arguments are collected here, in the order they are
    # passed, so the run configuration can be read in one place.
    _vpg_kwargs = dict(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=2048,
        # Path length cap is read from the wrapped environment's params.
        max_path_length=env._wrapped_env.params['traj_limit'],
        n_itr=20000,
        discount=0.95,
        step_size=0.01,
        record_rewards=True,
        transfer=False,
        # NOTE(review): presumably the directory holding the training
        # environments; confirm against VPG_t.
        env_path=log_dir + '/TrainEnv',
        env_num=500,
        env_keep_itr=10,
    )
    algo = VPG_t(**_vpg_kwargs)

    # Training runs inside the session opened above.
    algo.train(sess)