def record_video(env_id, model, video_length=300, prefix='', video_folder='videos/', lstm=False):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)
    obs = eval_env.reset()
    state = None
    for _ in range(video_length):
        # Recurrent policies expect a batch of n_envs observations, so tile
        # the single evaluation observation before predicting
        action, state = model.predict(np.tile(obs, (model.n_envs, 1)),
                                      state=state, deterministic=False)
        action = action[[0]] if lstm else action[0]
        obs, _, _, _ = eval_env.step(action)
    # Close the video recorder
    eval_env.close()
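# Hedged usage sketch for record_video() above: load a trained agent and
# record one clip. The checkpoint name "ppo2_cartpole" is illustrative, not
# from the source; assumes stable-baselines (v2) is installed.
from stable_baselines import PPO2

if __name__ == '__main__':
    model = PPO2.load("ppo2_cartpole")  # hypothetical checkpoint
    record_video("CartPole-v1", model, video_length=300, prefix="ppo2-cartpole")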
def main():
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   training=False,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff)
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    model = PPO2.load(save_path, env=env)

    n_episodes = 10
    for i in range(n_episodes):
        total_reward = 0
        obs = env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                print("Total Reward: ", total_reward)
                break
    env.close()
def main():
    alg_input = input("Select algorithm (PPO2 or A2C only): ").lower()
    # Keep asking until a valid algorithm is chosen; a single retry would
    # leave `model` undefined below if the second answer is also invalid
    while alg_input not in ("ppo2", "a2c"):
        print("Not an option (PPO2 or A2C only)!")
        alg_input = input("Select algorithm (PPO2 or A2C only): ").lower()
    model_input = "trained_agents\\" + input("Select model to test (input filename, e.g. a2c_wf_2): ")
    env = gym.make("WARFLEET-v0")
    env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run
    log_dir = "./logs/"
    done = False
    stage_reward = 0
    turns = 0
    if alg_input == "ppo2":
        model = PPO2.load(model_input, env=env, tensorboard_log=log_dir)
    else:
        model = A2C.load(model_input, env=env, tensorboard_log=log_dir)
    obs = env.reset()
    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        stage_reward += reward
        turns = turns + 1
        # env.render()
    print("Reward: {} /42".format(stage_reward))
    print("Turns: {}".format(turns))
    env.close()
def learn(algorithm, environment_name, total_timesteps=1000, n_steps=128,
          gamma=0.99, nminibatches=4, verbose=0):
    global environment
    environment = gym.make('gym_threshold:' + environment_name)
    dummy_vec_environment = DummyVecEnv([lambda: environment])

    if algorithm == "PPO2":
        model = PPO2(MlpPolicy, dummy_vec_environment, verbose=verbose,
                     n_steps=n_steps, gamma=gamma, nminibatches=nminibatches,
                     cliprange_vf=-1, tensorboard_log="tensorboard")
    else:
        raise AttributeError('No algorithm with name: {}'.format(algorithm))

    model.learn(total_timesteps=total_timesteps,
                tb_log_name="algorithm: {}, n_steps: {}, nminibatches: {}, gamma: {} run".format(
                    algorithm, n_steps, nminibatches, gamma),
                callback=tensorboard_callback)
    dummy_vec_environment.close()
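# Hedged usage sketch for learn() above, reusing one of the gym_threshold
# environment names that appears elsewhere in this collection; the timestep
# count is illustrative.
if __name__ == '__main__':
    learn(algorithm="PPO2",
          environment_name="semi-fixed-end-not-adapted-maintain-v0",
          total_timesteps=10000)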
def test_df(self, model, df, ohlc_df, train_len):
    train_df = df.iloc[:train_len].copy()
    test_df = df.iloc[train_len:].copy()
    train_ohlc = ohlc_df.iloc[:train_len].copy()
    test_ohlc = ohlc_df.iloc[train_len:].copy()

    # check test for train data
    test_env = DummyVecEnv([lambda: TradingEnv(train_df.drop('close', axis=1), train_ohlc, 1440)])
    obs = test_env.reset()
    done = False
    ac_data = None
    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, ac_data = test_env.step(action)
        # test_env.render()
    test_env.close()

    print('pl=', ac_data[0]['ac'].total_pl,
          'num trade=', ac_data[0]['ac'].num_trade,
          'win_rate=', ac_data[0]['ac'].win_rate,
          'fee ratio=', round(ac_data[0]['ac'].total_fee / ac_data[0]['ac'].total_pl, 4)
          if ac_data[0]['ac'].num_trade > 0 else 0)
    print('num market order=', ac_data[0]['ac'].num_market_order)

    # Put figsize/dpi on subplots() directly; calling plt.figure() afterwards
    # would create a second, unused figure
    fig, ax1 = plt.subplots(figsize=(30, 30), dpi=200)
    ax1.plot(np.array(ac_data[0]['ac'].performance_total_pl_log).reshape(-1, 1),
             color='red', linewidth=3.0, label='pl')
    ax1.legend(loc="best", edgecolor="red")
    ax2 = ax1.twinx()
    ax2.plot(np.array(train_ohlc['close'].iloc[1440: len(ac_data[0]['ac'].performance_total_pl_log) + 1440]).reshape(-1, 1),
             label='close')
    h1, l1 = ax1.get_legend_handles_labels()
    h2, l2 = ax2.get_legend_handles_labels()
    ax2.legend(h1 + h2, l1 + l2, loc="best", frameon=True, edgecolor="blue")
    plt.show()
    return ac_data
def main(): env_id = "CartPole-v1" num_cpu = 8 # Number of processes to use training_steps = int(1e4) agent_types = [m.__name__ for m in g_stable_agents] for agent_type in agent_types: model_name = "{0}-{1}".format(agent_type, env_id) print(model_name) policy = common_policies.MlpPolicy if agent_type == "DQN": policy = deepq.MlpPolicy # Create the vectorized environment env = DummyVecEnv([make_env(env_id, 0)]) """ if agent_type in ["DQN","PPO1","TRPO"]: env = DummyVecEnv([make_env(env_id, 0)]) else: env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)]) """ agent = create_agent(agent_type) model = train(env, agent, policy, training_steps) model.save(os.path.join('./models', model_name)) del model #To make sure model is saved for e in env.envs: e.close() del e env.close() del env
def main(): env = gym.make("WARFLEET-v0") # The algorithms require a vectorized environment to run env = DummyVecEnv([lambda: env]) log_dir = "./logs/" model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=log_dir, cliprange=0.1, gamma=0.99, ent_coef=0.001, vf_coef=0.2) #model.learn(total_timesteps=10000000) #model.save("PPO2_wf_2") done = False stage_reward = 0 input("Training is finished, press to play a game: ") model = PPO2.load("trained_agents/PPO2_wf_2", env=env, tensorboard_log=log_dir) obs = env.reset() while not done: action, _states = model.predict(obs) obs, reward, done, info = env.step(action) stage_reward += reward # env.render() env.close()
def train():
    def callback(_locals, _globals):
        # Save model
        _locals['self'].save(MODEL_NAME)

    envs = [create_env_headless for _ in range(ENV_COUNT)]
    vec_envs = SubprocVecEnv(envs)
    model = PPO2('CnnPolicy', vec_envs, verbose=1, ent_coef=0.0001, n_steps=256)
    if not os.path.isfile(MODEL_NAME):
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Run again to train")
    else:
        model.learn(total_timesteps=TIMESTEPS, callback=callback)
        model.save(MODEL_NAME)
        vec_envs.close()
        print("Training Done")

        # Evaluation
        print("Evaluation")
        vec_env = create_env_headless()
        vec_env = DummyVecEnv([lambda: vec_env])
        model = PPO2.load(MODEL_NAME)
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        print(evaluate_policy(model, vec_env, n_eval_episodes=100))
        vec_env.close()
def train():
    if not os.path.isdir("log/"):
        os.mkdir("log")

    if ENV_COUNT == 1:
        envs = create_env_headless()
        env_id = str(time.time())[-6:]
        envs = Monitor(envs, "log/" + MODEL_NAME + "-" + env_id, allow_early_resets=False)
        vec_envs = DummyVecEnv([lambda: envs])
    else:
        vec_envs = []

        def make_env():
            env_id = str(time.time())[-6:]
            env = create_env_headless()
            return Monitor(env, "log/" + MODEL_NAME + "-" + env_id, allow_early_resets=False)

        for _ in range(ENV_COUNT):
            vec_envs.append(make_env)
        vec_envs = SubprocVecEnv(vec_envs)

    model = PPO2('CnnPolicy', vec_envs, verbose=1, ent_coef=0.0001, n_steps=256)
    model.learn(total_timesteps=TIMESTEPS)
    model.save(MODEL_NAME)
    vec_envs.close()
    print("Learning Done!")
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(env_id) + './OURS-LOADED/noent_klcoeffanneal_samesgdsteps' + str(sgd_steps) + '_longer_wgae0.95_exp1_2_' + str(seed)
        # log_path = './experiments/' + str(env_id) + './TRPO-3x/TRPOR-oldsampling/noent_klcoeff' + str(sgd_steps) + '_sgdstep_steps5_' + str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)

        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)  # , norm_reward=False, norm_obs=False)

        model = TRPO(MlpPolicy, env, timesteps_per_batch=2048, max_kl=0.01,
                     cg_iters=10, cg_damping=0.1, entcoeff=0.0, gamma=0.99,
                     lam=0.95, vf_iters=5, vf_stepsize=1e-3, verbose=1,
                     seed=seed, sgd_steps=sgd_steps, klcoeff=klcoeff,
                     method="multistep-SGD")
        # learn() expects an integer timestep count
        model.learn(total_timesteps=int(10e6))  # num_timesteps, seed=seed
        env.close()
def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
    """
    :param env_id: (str)
    :param model: (RL model)
    :param video_length: (int)
    :param prefix: (str)
    :param video_folder: (str)
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record 500 steps
    # (wrap eval_env here, not the undefined name `env`)
    eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=video_length,
                                name_prefix=prefix)
    obs = eval_env.reset()
    for _ in range(video_length):
        action, _ = model.predict(obs)
        obs, _, _, _ = eval_env.step(action)
    # Close the video recorder
    eval_env.close()
def run(learning_steps=4300, verbose=0, n_steps=128, nminibatches=4, gamma=0.99,
        learning_rate=2.5e-4, ent_coef=0.01, tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make('gym_threshold:extended-state-semi-fixed-end-not-adapted-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = PPO2(MlpLstmPolicy, env, verbose=verbose, n_steps=n_steps,
                 nminibatches=nminibatches, gamma=gamma, ent_coef=ent_coef,
                 learning_rate=learning_rate, tensorboard_log=tensorboard_log)

    # splitext drops the extension reliably; rstrip(".py") would strip any
    # trailing 'p', 'y', or '.' characters from the name
    model.learn(total_timesteps=learning_steps,
                tb_log_name=os.path.splitext(os.path.basename(__file__))[0],
                callback=tensorboard_callback)
    env.close()
def main(): env_id = "CartPole-v1" num_cpu = 1 # Number of processes to use evaluation_steps_per_episode = 500 evaluation_episodes = 10 render = True agent_types = [m.__name__ for m in g_stable_agents] for agent_type in agent_types: model_name = "{0}-{1}".format(agent_type, env_id) print(model_name) policy = common_policies.MlpPolicy if agent_type == "DQN": policy = deepq.MlpPolicy # Create the vectorized environment env = DummyVecEnv([make_env(env_id=env_id, rank=0, seed=0)]) agent = create_agent(agent_type) model = agent.load(os.path.join('./models', model_name)) evaluate(env=env, model=model, num_episodes=evaluation_episodes, num_steps=evaluation_steps_per_episode, render=render) for e in env.envs: e.close() del e env.close() del env
def run_test(config):
    """Stable baselines test

    Mandatory configuration settings:
    - 'continuous' agent
    - camera_settings enabled
    - stable_baselines enabled
    """
    env = None
    try:
        # Create Environment
        env = make_env(config)
        env = DummyVecEnv([lambda: env])

        # Initialize DDPG and start learning
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
        model = DDPG(CnnPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, random_exploration=0.8)
        model.learn(total_timesteps=10000)
    finally:
        if env:
            env.close()
        else:
            clear_carla(config.host, config.port)
        print("-----Carla Environment is closed-----")
def learn(env_name, save_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO2(CnnPolicy, env, verbose=1)
    model.learn(total_timesteps=total_timesteps)
    model.save(save_file)
    del model
    env.close()
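# Hedged usage sketch pairing learn() above with the play() helper defined
# later in this collection; the env and file names are illustrative, not
# from the source (CnnPolicy suggests an image-observation env).
if __name__ == '__main__':
    learn(env_name="BreakoutNoFrameskip-v4", save_file="ppo2_breakout",
          total_timesteps=100000)
    play(env_name="BreakoutNoFrameskip-v4", load_file="ppo2_breakout",
         total_timesteps=1000)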
def run_model(env_name: str, graphs: List[nx.DiGraph],
              demands: List[List[List[Tuple[np.ndarray, float]]]],
              model_path: str, replay_steps: int = 10,
              env_kwargs: Dict = {}, parallelism: int = 4,
              policy_name: str = None):
    oblivious_routings = [routing_baselines.shortest_path_routing(graph)
                          for graph in graphs]

    # make env
    env = lambda: gym.make(env_name, dm_sequence=demands, graphs=graphs,
                           oblivious_routings=oblivious_routings, **env_kwargs)
    if policy_name == 'lstm':
        envs = DummyVecEnv([env] * parallelism)
    else:
        envs = DummyVecEnv([env])

    # load
    model = PPO2.load(model_path)

    # execute
    obs = envs.reset()
    state = None
    utilisations = []
    opt_utilisations = []
    oblivious_utilisations = []
    if env_name == 'ddr-iterative-v0':
        replay_steps = replay_steps * envs.envs[0].graphs[envs.envs[0].graph_index].number_of_edges()
        for i in range(replay_steps - 1):
            action, state = model.predict(obs, state=state, deterministic=True)
            obs, reward, done, info = envs.step(action)
            print(reward)
            print(info)
            if info[0]['iter_idx'] == 0:
                utilisations.append(info[0]['utilisation'])
                opt_utilisations.append(info[0]['opt_utilisation'])
                oblivious_utilisations.append(info[0]['oblivious_utilisation'])
    else:
        for i in range(replay_steps - 1):
            action, state = model.predict(obs, state=state, deterministic=True)
            obs, reward, done, info = envs.step(action)
            print(reward)
            print(action)
            print(info)
            utilisations.append(info[0]['utilisation'])
            opt_utilisations.append(info[0]['opt_utilisation'])
            oblivious_utilisations.append(info[0]['oblivious_utilisation'])

    print("Mean reward: ", np.mean(np.divide(utilisations, opt_utilisations)))
    print("Mean oblivious reward: ",
          np.mean(np.divide(oblivious_utilisations, opt_utilisations)))
    envs.close()
    return utilisations, opt_utilisations, oblivious_utilisations
def train():
    # DQN does not support parallelization through SubprocVecEnv
    env = DummyVecEnv([lambda: DemoEnv()])
    model = DQN(MlpPolicy, env, verbose=1, policy_kwargs={'layers': [4]})
    model.learn(total_timesteps=int(2e5))
    model.save("deepq_DemoEnv")
    env.close()
    del model
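# A quick sanity-check loop for the DQN agent saved above; a hedged sketch,
# not part of the original source. Assumes DemoEnv is importable here.
from stable_baselines import DQN
from stable_baselines.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: DemoEnv()])
model = DQN.load("deepq_DemoEnv", env=env)
obs = env.reset()
for _ in range(1000):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
env.close()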
def train(env_id, num_timesteps, seed, lam, sgd_steps, klcoeff, log):
    """
    Train TRPO model for the mujoco environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    with tf_util.single_threaded_session():
        rank = MPI.COMM_WORLD.Get_rank()
        log_path = './experiments/' + str(env_id) + './SAC-M/nips_test19/m' + str(sgd_steps) + '_c' + str(0.5) + '_e' + str(klcoeff) + '_' + str(seed)
        # log_path = './experiments/' + str(env_id) + './TRPO-3x/TRPOR-oldsampling/noent_klcoeff' + str(sgd_steps) + '_sgdstep_steps5_' + str(seed)
        if not log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
        # env = make_mujoco_env(env_id, workerseed)

        def make_env():
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True)
            env_out.seed(seed)
            return env_out

        env = DummyVecEnv([make_env])
        env = VecNormalize(env, norm_reward=False, norm_obs=False)
        # env = VecNormalize(env)

        model = MDPO(MlpPolicy, env, gamma=0.99, verbose=1, seed=seed,
                     buffer_size=1000000, ent_coef=1.0, gradient_steps=sgd_steps,
                     lam=klcoeff, train_freq=1, tsallis_q=1, reparameterize=True,
                     klconst=0.5)
        model.learn(total_timesteps=int(num_timesteps))  # num_timesteps, seed=seed
        env.close()
def record_video(model, env_id=None, eval_env=None, max_video_length=500,
                 video_prefix='', video_folder='videos/', break_early=False,
                 is_recurrent=False):
    """
    :param model: (RL model)
    :param env_id: (str)
    :param eval_env: (VecEnv) passing an environment directly overrides env_id
    :param max_video_length: (int)
    :param video_prefix: (str)
    :param video_folder: (str)
    """
    if eval_env is None:
        eval_env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start the video at step=0 and record 500 steps
    eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                                record_video_trigger=lambda step: step == 0,
                                video_length=max_video_length,
                                name_prefix=video_prefix)

    # According to the docs, recurrent policies must have "state" start as
    # None; initialize it unconditionally so predict() below never sees an
    # undefined name for non-recurrent models
    state = None
    # When using VecEnv, done is a vector
    is_single_env = (eval_env.num_envs == 1)
    doneVec = [False for _ in range(model.n_envs)]
    obs = eval_env.reset()
    for _ in range(max_video_length):
        # We need to pass the previous state and a mask for recurrent policies
        # to reset the lstm state when a new episode begins
        action, state = model.predict(obs, state=state, mask=doneVec)
        # Only allow recurrent models to continually update their state
        if not is_recurrent:
            state = None
        obs, _, done, _ = eval_env.step(action)
        if is_single_env:
            doneVec[0] = copy.deepcopy(done[0])
        else:
            doneVec = copy.deepcopy(done)
    # Close the video recorder
    eval_env.close()
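# Hedged usage sketch for the recorder above with a recurrent policy; the
# checkpoint name is illustrative, not from the source.
from stable_baselines import PPO2

model = PPO2.load("ppo2_lstm_agent")  # hypothetical recurrent checkpoint
record_video(model, env_id="CartPole-v1", max_video_length=500,
             video_prefix="ppo2-lstm", is_recurrent=True)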
def evaluate():
    vec_env = create_env_headless()
    vec_env = DummyVecEnv([lambda: vec_env])
    model = PPO2.load(MODEL_NAME)
    print("After Training evaluation")
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    print(evaluate_policy(model, vec_env, n_eval_episodes=1000))
    vec_env.close()
def run(learning_steps=4300, verbose=0, gamma=0.99, learning_rate=5e-4,
        tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make('gym_threshold:semi-fixed-end-not-adapted-maintain-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = DQN(MlpPolicy, env, prioritized_replay=True, verbose=verbose,
                learning_rate=learning_rate, gamma=gamma,
                tensorboard_log=tensorboard_log)

    model.learn(total_timesteps=learning_steps,
                tb_log_name=os.path.splitext(os.path.basename(__file__))[0],
                callback=tensorboard_callback)
    env.close()
def run(learning_steps=4300):
    global inner_env
    inner_env = gym.make('gym_threshold:threshold-intra_process-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = PPO2(MlpPolicy, env, verbose=1, n_steps=128, nminibatches=4,
                 tensorboard_log="tensorboard")
    model.learn(total_timesteps=learning_steps,
                tb_log_name=os.path.splitext(os.path.basename(__file__))[0],
                callback=tensorboard_callback)
    env.close()
def play(env_name, load_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO2.load(load_file, verbose=1)
    obs = env.reset()
    for i in range(total_timesteps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render()  # dummy
        if done:
            print(info[0]['episode'])
    del model
    env.close()
def play():
    vec_env = create_env()
    vec_env = DummyVecEnv([lambda: vec_env])
    model = PPO2.load(MODEL_NAME)
    obs = vec_env.reset()
    game_count = 0
    while game_count < 1000:
        action = model.predict(obs)[0]
        obs, reward, done, info = vec_env.step(action)
        if done:  # DummyVecEnv auto-resets, so just count the finished game
            game_count += 1
    vec_env.close()
def main():
    reward_averages = []
    reward_std = []
    training_times = []
    total_env = 0
    for num_envs in NUM_ENVS:
        total_env += num_envs
        print(f'process:{num_envs}')
        if num_envs == 1:
            train_env = DummyVecEnv([lambda: gym.make(ENV_ID)])
        else:
            train_env = SubprocVecEnv([make_env(ENV_ID, i + total_env) for i in range(num_envs)],
                                      start_method='spawn')
        eval_env = DummyVecEnv([lambda: gym.make(ENV_ID)])
        rewards = []
        times = []
        for experiment in range(NUM_EXPERIMENTS):
            train_env.reset()
            model = PPO2('MlpPolicy', train_env, verbose=0)
            start = time.time()
            model.learn(total_timesteps=NUM_STEPS)
            times.append(time.time() - start)
            mean_reward = evaluate(model, eval_env, num_episodes=NUM_EPISODES)
            rewards.append(mean_reward)
        train_env.close()
        eval_env.close()
        reward_averages.append(np.mean(rewards))  # mean reward
        reward_std.append(np.std(rewards))        # standard deviation
        training_times.append(np.mean(times))     # training speed

    # Number of processes vs. mean reward
    plt.errorbar(NUM_ENVS, reward_averages, yerr=reward_std, capsize=2)
    plt.xlabel('number of envs')
    plt.ylabel('mean reward')
    plt.savefig('./data/process_mean.png')
    plt.show()

    # Number of processes vs. steps per second
    training_steps_per_second = [NUM_STEPS / t for t in training_times]
    plt.bar(range(len(NUM_ENVS)), training_steps_per_second)
    plt.xticks(range(len(NUM_ENVS)), NUM_ENVS)
    plt.xlabel('number of envs')
    plt.ylabel('steps per second')
    plt.savefig('./data/process_step.png')
    plt.show()
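# A minimal sketch of the `evaluate` helper this benchmark assumes: mean
# episodic reward over a fixed number of episodes on a single-env VecEnv.
# Hypothetical reconstruction; the original helper may differ.
import numpy as np

def evaluate(model, env, num_episodes=100):
    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_reward = 0.0
        done = False
        obs = env.reset()
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            episode_reward += reward[0]  # VecEnv returns length-1 vectors
        all_episode_rewards.append(episode_reward)
    return np.mean(all_episode_rewards)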
def run(env_string, policy=MlpPolicy, learning_steps=4300, verbose=0,
        n_steps=128, nminibatches=4, gamma=0.99, learning_rate=2.5e-4,
        ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, cliprange=0.2,
        cliprange_vf=None, lam=0.95, policy_kwargs=None,
        tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make(env_string)
    env = DummyVecEnv([lambda: inner_env])

    model = PPO2(policy=policy, env=env, verbose=verbose, n_steps=n_steps,
                 nminibatches=nminibatches, gamma=gamma, ent_coef=ent_coef,
                 learning_rate=learning_rate, vf_coef=vf_coef,
                 max_grad_norm=max_grad_norm, cliprange=cliprange,
                 cliprange_vf=cliprange_vf, lam=lam, policy_kwargs=policy_kwargs,
                 tensorboard_log=tensorboard_log)

    model.learn(total_timesteps=learning_steps,
                tb_log_name=os.path.splitext(os.path.basename(__file__))[0],
                callback=tensorboard_callback)
    env.close()
def main():
    cmd_parser = cmd_parse()
    options = cmd_parser.parse_args()

    ## Get the Stock Ticker data ##
    # print("The Stock ticker used here is ", options.ticker)
    file = Path("./data/" + options.ticker + ".csv")
    if file.is_file():
        df = pd.read_csv('./data/' + options.ticker + '.csv')
        df = df.sort_values('Date')
        print("Loading ticker data from: " + "./data/" + options.ticker + ".csv")
    else:
        print("Data file for ticker does not exist. Please download data first to ./data/"
              + options.ticker + ".csv")
        return  # without data there is nothing to train on

    training_logs_path = options.output_file + "_training_logs.csv"
    eval_logs_path = options.output_file + "_eval_logs"

    ## Get the training set size ##
    print("The options.training_set_size is ", options.training_set_size)
    ## Get the number of look back days ##
    print("The options.look-back-days here is: ", options.look_back_days)
    ## Get the model we are using to train the agent ##
    print("The model to train the agent here is: ", options.model)

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: StockTradingEnv(df, options.look_back_days,
                                               options.training_set_size,
                                               eval_logs_path)])

    if options.model == "PPO2":
        model = PPO2(MlpPolicy, env, verbose=1)

    model.learn(total_timesteps=options.training_set_size)
    np.savetxt(training_logs_path, model.training_rewards, delimiter=",")

    obs = env.reset()
    for i in range(options.training_set_size, len(df['Date'])):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render(title=options.ticker)
    env.close()
def train_drf(self, df, ohlc_df, train_len):
    train_df = df.iloc[:train_len].copy()
    test_df = df.iloc[train_len:].copy()
    train_ohlc = ohlc_df.iloc[:train_len].copy()
    test_ohlc = ohlc_df.iloc[train_len:].copy()

    env = DummyVecEnv([lambda: TradingEnv(train_df.drop('close', axis=1), train_ohlc, -1)])
    # env = SubprocVecEnv([make_env(train_provider, i) for i in range(4)])

    # model = PPO2(MlpLnLstmPolicy, env, verbose=1, nminibatches=1, tensorboard_log=log_dir)  # MlpLnLstmPolicy, CnnLnLstmPolicy
    model = PPO2(MlpLnLstmPolicy, env, verbose=1, nminibatches=1,
                 tensorboard_log='./Model')  # MlpLnLstmPolicy, CnnLnLstmPolicy
    # %tensorboard --logdir log_dir
    # tb = TensorBoardColab(startup_waiting_time=1)
    # tb = SummaryWriter('./Graph')

    model.learn(total_timesteps=10000)
    env.close()
    model.save('./Model/rf_ppo2')
    return model
def main():
    train_env = SubprocVecEnv([make_env(ENV_ID, i) for i in range(NUM_ENV)])
    model = PPO2('MlpPolicy', train_env, verbose=1)
    model.learn(total_timesteps=10000)

    test_env = DummyVecEnv([lambda: gym.make(ENV_ID)])
    state = test_env.reset()
    for i in range(200):
        test_env.render()
        action, _ = model.predict(state)
        state, rewards, done, info = test_env.step(action)
        # Episode finished
        if done:
            break
    # Close the environment
    test_env.close()
def play():
    vec_env = create_env()
    vec_env = DummyVecEnv([lambda: vec_env])
    model = PPO2.load(MODEL_NAME)
    model.set_env(vec_env)
    game_count = 0
    wins = 0
    obs = vec_env.reset()
    while game_count < 100:
        action = model.predict(obs)[0]
        obs, reward, done, info = vec_env.step(action)
        if done:
            game_count += 1
            if reward == 1:
                wins += 1
            vec_env.reset()
    print(wins / game_count)
    vec_env.close()