Example No. 1
def run(config: Dict[str, Any], logdir: pathlib.Path):
    env = make_env(config)

    if config["mode"] == "evaluate":
        print("Start evaluation.")
        model = PPO.load(logdir / "model.zip")
    elif config["mode"] == "train" and args.logdir:  # `args` is assumed to be a module-level argparse namespace
        print("Start training from existing model.")
        model = PPO.load(logdir / "model.zip")
        model.set_env(env)
        model.learn(total_timesteps=config["train_steps"])
    else:
        print("Start training.")
        model = PPO(
            "CnnPolicy",
            env,
            verbose=1,
            tensorboard_log=logdir / "tensorboard",
            use_sde=True,
        )
        model.learn(total_timesteps=config["train_steps"])

    mean_reward, std_reward = evaluate_policy(
        model, env, n_eval_episodes=config["eval_eps"], deterministic=True)
    print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    if config["mode"] == "train":
        model.save(logdir / "model")

    env.close()
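
A hedged invocation sketch for the run() helper above. The config keys come from the snippet itself; the directory name and step counts are placeholders, and make_env / args still belong to the original module:

import pathlib
from typing import Any, Dict

# Hypothetical driver for run(); values are illustrative, not the project's defaults.
config: Dict[str, Any] = {
    "mode": "train",       # or "evaluate"
    "train_steps": 100_000,
    "eval_eps": 10,
}
run(config, pathlib.Path("runs/ppo_experiment"))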
Example No. 2
 def load_new_opp(self, idx, opp_fp, opp_elo):
     if idx < len(self.opponents):
         self.opponents[idx] = (PPO.load(opp_fp), opp_elo, opp_fp)
         self.curr_opp = idx
     else:
         self.opponents.append((PPO.load(opp_fp), opp_elo, opp_fp))
         self.curr_opp = len(self.opponents) - 1
Example No. 3
def train():
    """Trains a PPO2 policy."""

    env_args = env_parser.parse_known_args()[0]
    policy_args = policy_parser.parse_known_args()[0]
    opt_args = opt_parser.parse_known_args()[0]

    os.makedirs(opt_args.save_path, exist_ok=True)

    # create environment
    # train_env = GFootballEnv(env_args) # for evaluation
    train_env = DummyVecEnv([
        make_env(env_args, opt_args.save_path, rank=i)
        for i in range(opt_args.num_envs)
    ])
    eval_env = GFootballEnv(env_args)  # for evaluation
    check_env(env=eval_env, warn=True)

    # define rl policy/value network
    policy = getattr(sys.modules[__name__], policy_args.policy)

    # initialize ppo
    tb_dir = os.path.join(opt_args.save_path, "tensorboard")
    os.makedirs(tb_dir, exist_ok=True)
    verbose = 1
    ppo = PPO(policy,
              train_env,
              learning_rate=opt_args.lr,
              n_steps=opt_args.n_steps,
              n_epochs=opt_args.n_epochs,
              gamma=opt_args.gamma,
              gae_lambda=0.95,
              clip_range=opt_args.clip_range,
              clip_range_vf=None,
              ent_coef=opt_args.ent_coef,
              vf_coef=opt_args.vf_coef,
              max_grad_norm=opt_args.max_grad_norm,
              tensorboard_log=tb_dir,
              verbose=verbose,
              seed=opt_args.seed)

    # load initial checkpoint (PPO.load is a classmethod that returns a new model,
    # so the original in-place call was a no-op; set_parameters loads the saved
    # weights into the model configured above)
    if opt_args.load_path:
        ppo.set_parameters(os.path.join(opt_args.load_path, "ppo_gfootball.pt"))

    # start training ppo
    eval_dir = os.path.join(opt_args.save_path, "eval")
    os.makedirs(eval_dir, exist_ok=True)
    ppo.learn(opt_args.num_timesteps,
              log_interval=1,
              eval_env=eval_env,
              eval_freq=opt_args.save_interval,
              n_eval_episodes=10,
              eval_log_path=eval_dir)

    # save final checkpoint
    ppo.save(os.path.join(opt_args.save_path, "ppo_gfootball"))
Example No. 4
def main(args):
    wandb.init(project=args.project_name, name=args.run_name)
    n_envs = len(os.sched_getaffinity(0))
    factory = EnvFactory(args.env)

    render_env = factory.make_env()  # for rendering

    callback = CallbackList([])

    # Wrap the environment around parallel processing friendly wrapper, unless debug is on
    if args.debug:
        envs = DummyVecEnv([factory.make_env for _ in range(n_envs)])
    else:
        envs = SubprocVecEnv([factory.make_env for _ in range(n_envs)])

    if args.stats_path is None:
        envs = VecNormalize(envs,
                            norm_obs=True,
                            clip_obs=np.inf,
                            norm_reward=False,
                            clip_reward=np.inf)
    else:
        envs = VecNormalize.load(args.stats_path, envs)
    eval_callback = WAndBEvalCallback(render_env, args.eval_every, envs)
    callback.callbacks.append(eval_callback)

    print("Do random explorations to build running averages")
    envs.reset()
    for _ in tqdm(range(1000)):
        random_action = np.stack(
            [envs.action_space.sample() for _ in range(n_envs)])
        envs.step(random_action)
    envs.training = False  # freeze the running averages (what a terrible variable name...)

    # We use PPO by default, but it should be easy to swap out for other algorithms.
    if args.pretrained_path is not None:
        pretrained_path = args.pretrained_path
        learner = PPO.load(pretrained_path, envs, device=args.device)
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)
    else:
        policy_kwargs = dict(
            activation_fn=nn.ReLU,
            net_arch=[dict(vf=args.value_dims, pi=args.policy_dims)],
            log_std_init=args.log_std_init,
            squash_output=False)

        learner = PPO(MlpPolicy,
                      envs,
                      n_steps=args.n_steps,
                      verbose=1,
                      policy_kwargs=policy_kwargs,
                      device=args.device,
                      target_kl=2e-2)
        if args.device == 'cpu':
            torch.cuda.empty_cache()
        learner.learn(total_timesteps=args.total_timesteps, callback=callback)

    render_env.close()
    envs.close()
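
Example No. 4 freezes the VecNormalize running averages but never writes them to disk. A minimal hedged sketch of persisting both the learner and the statistics, which would go just before the close() calls; the file names are assumptions, not part of the original script:

# Hypothetical follow-up: save the policy and the normalization statistics so a
# later run can restore them with PPO.load(...) and VecNormalize.load(...).
learner.save("ppo_model.zip")
envs.save("vec_normalize.pkl")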
Example No. 5
def pybullet_example():
    # PyBullet: Normalizing input features

    import pybullet_envs

    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    # Automatically normalize the input features and reward.
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

    model = PPO("MlpPolicy", env)
    model.learn(total_timesteps=2000)

    # Don't forget to save the VecNormalize statistics when saving the agent.
    log_dir = "/tmp/"
    model.save(log_dir + "ppo_halfcheetah")
    stats_path = os.path.join(log_dir, "vec_normalize.pkl")
    env.save(stats_path)

    # To demonstrate loading.
    del model, env

    # Load the saved statistics.
    env = DummyVecEnv([lambda: gym.make("HalfCheetahBulletEnv-v0")])
    env = VecNormalize.load(stats_path, env)
    # Do not update them at test time.
    env.training = False
    # reward normalization is not needed at test time.
    env.norm_reward = False

    # Load the agent.
    model = PPO.load(log_dir + "ppo_halfcheetah", env=env)
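
A short hedged continuation of Example No. 5 that sanity-checks the reloaded pair; evaluate_policy is the standard stable_baselines3 helper, and the episode count is arbitrary:

from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate the reloaded agent against the frozen normalization statistics.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print(f"Reloaded agent: {mean_reward:.2f} +/- {std_reward:.2f}")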
Example No. 6
def main():

    env = Pinokio2()
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]))
    else:
        model = PPO(MlpPolicy, env, verbose=1)

    while True:
        #model.learn(total_timesteps=10000)
        model.learn(total_timesteps=100000)

        model.save(save_file)

        obs = env.reset()
        for i in range(10):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
Example No. 7
def simulate_and_save(args):
    config = gym_rl_mpc.SCENARIOS[args.env]['config'].copy()
    if args.psf:
        config['use_psf'] = True
        print("Using PSF corrected actions")
    config['wind_mean'] = args.wind_mean

    if not hasattr(args, 'save_sim_data'):
        args.save_sim_data = True

    env = gym.make(args.env, env_config=config)
    env_id = env.unwrapped.spec.id

    agent_path = args.agent
    agent = PPO.load(agent_path)
    sim_df = simulate_episode(env=env, agent=agent, max_time=args.time)
    if args.save_sim_data:
        agent_path_list = agent_path.split("\\")
        simdata_dir = os.path.join("logs", agent_path_list[-4],
                                   agent_path_list[-3], "sim_data")
        os.makedirs(simdata_dir, exist_ok=True)

        # Save file to logs\env_id\<EXPERIMENT_ID>\sim_data\<agent_file_name>_simdata.csv
        i = 0
        while os.path.exists(
                os.path.join(
                    simdata_dir, env_id + "_" + agent_path_list[-1][0:-4] +
                    f"_simdata_{i}.csv")):
            i += 1
        sim_df.to_csv(
            os.path.join(
                simdata_dir, env_id + "_" + agent_path_list[-1][0:-4] +
                f"_simdata_{i}.csv"))

    return sim_df, env
Example No. 8
def main():

    tensorboard_log = "./log"

    env = Pinokio3()
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file, env=DummyVecEnv([lambda: env]), tensorboard_log=tensorboard_log)
    else:
        policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=net_arch)
        # pass policy_kwargs through; it was built above but never handed to PPO
        model = PPO(MlpPolicy, DummyVecEnv([lambda: env]), verbose=1,
                    policy_kwargs=policy_kwargs, tensorboard_log=tensorboard_log)

    #https://stable-baselines3.readthedocs.io/en/master/guide/callbacks.html
    checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./checkpoints/',
                                         name_prefix='pinokio3')


    while True:
        model.learn(total_timesteps=15000000, callback=checkpoint_callback, tb_log_name=tb_log_name)

        model.save(save_file)
        print("saved")

        obs = env.reset()
        for i in range(20):
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            print("action {} -> reward {}".format(env.decode_action(action), reward))
            env.render()
            if done:
                print("resetting because " + str(done))
                env.reset()
Example No. 9
def create_stable_baselines3_agent(agent_path, agent_type):
    """
    Load and return a stable-baselines3 agent.
    The agent has a function `get_action` that takes in
    an observation and returns an appropriate action.

    `agent_type` is the algorithm name (only PPO-SB3 supported)
    """
    from stable_baselines3 import PPO
    import torch

    agent = None
    if agent_type == "SB3-PPO":
        if "bc_models" in agent_path:
            # Only stores policy parameters.
            # Force on CPU (codebase-level heuristic that everything runs on CPU)
            agent = torch.load(agent_path, map_location="cpu")
            agent.get_action = lambda obs: agent.predict(obs)[0]
        else:
            # GAIL: Stores the whole agent
            agent = PPO.load(agent_path)
            agent.get_action = lambda obs: agent.predict(obs)[0]
    else:
        raise RuntimeError("Unknown agent type for SB3: {}".format(agent_type))
    return agent
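
A hedged usage sketch for the loader above; the checkpoint path and the observation variable are placeholders, not artifacts from the original project:

# Hypothetical call: load a full SB3 PPO agent and query it for one action.
agent = create_stable_baselines3_agent("checkpoints/ppo_agent.zip", "SB3-PPO")
action = agent.get_action(observation)  # `observation` must match the agent's observation space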
Example No. 10
def main():

    tensorboard_log = "./log"

    env = Pinokio5()
    # Optional: PPO2 requires a vectorized environment to run
    # the env is now wrapped automatically when passing it to the constructor
    # env = DummyVecEnv([lambda: env])

    if os.path.exists(save_file):
        model = PPO.load(save_file,
                         env=DummyVecEnv([lambda: env]),
                         tensorboard_log=tensorboard_log)
    else:
        model = PPO(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log)

    try:
        while True:
            #model.learn(total_timesteps=10000)
            model.learn(total_timesteps=8000000, tb_log_name=tb_log_name)

            model.save(save_file)

            obs = env.reset()
            for i in range(100):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                env.render()
                if done:
                    print("resetting because " + str(done))
                    env.reset()
    except KeyboardInterrupt:
        print("Saving before exiting...")
        model.save(save_file)
        print("k bye")
Example No. 11
def create_video(env,
                 savepoint="random",
                 out_filename="video.mp4",
                 video_size=(1230, 900)):
    if savepoint not in ["random", "argmax", None]:
        model = PPO.load(savepoint)

    # Video settings
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(out_filename, fourcc, 10, video_size)

    observation = env.reset()
    progress = tqdm.tqdm(total=env.horizon)
    done = False
    while not done:
        progress.update(env.action_frequency)
        # Get Action
        if savepoint == "random":
            action = env.action_space.sample()
        elif savepoint == "argmax":
            action = np.argmax(observation)
        else:
            action, _ = model.predict(observation)

        observation, reward, done, info = env.step(action)
        img = env.render("rgb_array")

        # interpolation must be passed by keyword; the third positional argument of cv2.resize is dst
        resized_img = cv2.resize(img, video_size, interpolation=cv2.INTER_NEAREST)
        out.write(np.asarray(resized_img * 255, dtype=np.uint8))  # assumes render() returns floats in [0, 1]

    out.release()
    progress.close()
    return out
Example No. 12
def test(seed, model_filename, vec_filename, train, test, body_info=0, render=False):
    print("Testing:")
    print(f" Seed {seed}, model {model_filename} vec {vec_filename}")
    print(f" Train on {train}, test on {test}, w/ bodyinfo {body_info}")
    eval_env = utils.make_env(render=render, robot_body=test, body_info=body_info)
    eval_env = DummyVecEnv([eval_env])
    eval_env = VecNormalize.load(vec_filename, eval_env)
    eval_env.norm_reward = False

    eval_env.seed(seed)
    model = PPO.load(model_filename)

    obs = eval_env.reset()
    if render:
        eval_env.env_method("set_view")
    distance_x = 0
    # print(obs)
    total_reward = 0
    for step in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        if done:
            break
        else:  # the last observation will be after reset, so skip the last
            distance_x = eval_env.envs[0].robot.body_xyz[0]
        total_reward += reward[0]
        if render:
            time.sleep(0.01)

    eval_env.close()
    print(f"train {train}, test {test}, body_info {body_info}, step {step}, total_reward {total_reward}, distance_x {distance_x}")
    return total_reward, distance_x
Example No. 13
def train_policy_ppo(path='policy_ppo', org_path='prob_ppo'):
    """
    学習済み方策をつかった環境を相手にトレーニングを行う
    引数:
        path        学習済みモデルファイルパス
        org_path    学習元となる方策がロードする学習済みモデルファイルパス
    """
    print(f'train ppo with prob_player path={path}, org_path={org_path}')
    # 学習済みモデルファイルのロード
    model = PPO.load(org_path)

    # じゃんけん環境の構築
    env = RockPaperScissorsEnv(AIPlayer(model))
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # モデルのセット
    model.set_env(env)

    # トレーニング実行
    elapsed = time.time()
    model.learn(total_timesteps=1000000)
    print(f'elapse time: {time.time() - elapsed}sec')

    # 学習済みモデルの保存
    model.save(path)

    # じゃんけん環境のクローズ
    env.close()
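
A hedged sketch of how train_policy_ppo could be chained for iterated self-play; the file names are placeholders, and RockPaperScissorsEnv / AIPlayer remain the original project's classes:

# Hypothetical iteration: each round trains against the previous round's model.
train_policy_ppo(path='policy_ppo_gen1', org_path='prob_ppo')
train_policy_ppo(path='policy_ppo_gen2', org_path='policy_ppo_gen1')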
Example No. 14
def trained_agent(episodes=256,
                  continuous=True,
                  load=None,
                  save_name="test",
                  ent_coef=0.00001,
                  total_timesteps=25000,
                  learning_rate=lr()):
    env = gym.make("bilboquet-v0", continuous=continuous, amplitude=10)
    env.reset((300, 300))

    if load is None:
        model = PPO('MlpPolicy',
                    env,
                    verbose=1,
                    ent_coef=ent_coef,
                    learning_rate=learning_rate,
                    tensorboard_log=f"./ppo_bilboquet_tensorboard/")
        model.learn(total_timesteps=total_timesteps, tb_log_name=save_name)
        model.save(save_name + '.zip')
        print('DONE')
        obs = env.reset()
    else:
        model = PPO.load(load)
        obs = env.reset()

    for i in range(episodes):
        action, _states = model.predict(obs, deterministic=True)
        # print(action)
        obs, reward, done, info = env.step(action)
        # print(reward)
        env.render()
        if done:
            obs = env.reset()
Example No. 15
def main(args):
    expert = None
    expert_state_dim = 0
    if args.policy_path is not None:
        policy_path = args.policy_path
        expert = PPO.load(policy_path)
        expert_state_dim = expert.observation_space.shape[0]

    factory = EnvFactory(args.env)
    env = DummyVecEnv([factory.make_env])
    if args.stats_path is not None:
        env = VecNormalize.load(args.stats_path, env)
        env.training = False
    else:
        env = VecNormalize(env, training=False)

    obs = env.reset()
    env.render()
    total_reward = 0
    while True:
        if expert is None:
            action = env.action_space.sample()
            action = np.zeros_like(action)
        else:
            good_obs = obs[:, :expert_state_dim]
            action, _ = expert.predict(good_obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        env.render()
        reward = env.get_original_reward()
        total_reward += reward[0]
        if done:
            print("Total reward: {:.3f}".format(total_reward))
            obs = env.reset()
            total_reward = 0
Example No. 16
def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]
    gym_config = SimulationParameters(time_step=TIME_STEP)
    robot_class = QuadrupedRobot
    robot_params = MiniCheetahParams(
        on_rack=False,
        enable_self_collision=True,
        motor_control_mode=MotorControlMode.HYBRID_COMPUTED_POS_TROT)
    task = TestTask(train_or_test=TEST_OR_TRAIN)

    env = LocomotionGymEnv(gym_config, robot_class, robot_params, task)

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    if not (os.path.exists(policy_save_dir)):
        os.makedirs(policy_save_dir)

    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    if TEST_OR_TRAIN == "train":
        model = PPO('MlpPolicy', env, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        model = PPO.load(POLICY_SAVE_PATH)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
Example No. 17
def main():
    # multiprocess environment
    # n_cpu = 8
    # env = SubprocVecEnv([lambda: gym.make('DYROSTocabi-v1') for i in range(n_cpu)])
    # env = VecNormalize(env, norm_obs=True, clip_obs=2.0, norm_reward=False, training=True)

    n_cpu = 1
    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize(env,
                       norm_obs=True,
                       clip_obs=2.0,
                       norm_reward=False,
                       training=True)

    model = PPO('MlpPolicy',
                env,
                verbose=1,
                n_steps=int(4096 / n_cpu),
                wandb_use=False)  # not a standard SB3 PPO argument; presumably a project-specific fork extension
    model.learn(total_timesteps=40000000)
    file_name = "ppo2_DYROSTocabi_" + str(datetime.datetime.now())
    model.save(file_name)
    env.save(file_name + "_env.pkl")

    model.policy.to("cpu")
    for name, param in model.policy.state_dict().items():
        weight_file_name = "./result/" + name + ".txt"
        np.savetxt(weight_file_name, param.data)

    np.savetxt("./result/obs_mean.txt", env.obs_rms.mean)
    np.savetxt("./result/obs_variance.txt", env.obs_rms.var)

    del model  # remove to demonstrate saving and loading
    del env

    # file_name = "ppo2_DYROSTocabi_2021-01-08 07:18:00.267089"

    env = gym.make('DYROSTocabi-v1')
    env = DummyVecEnv([lambda: env])
    env = VecNormalize.load(file_name + "_env.pkl", env)
    env.training = False

    model = PPO.load(file_name, env=env, wandb_use=False)

    # Enjoy the trained agent
    obs = np.copy(env.reset())
    epi_reward = 0

    while True:
        action, _states = model.predict(obs, deterministic=True)

        obs, rewards, dones, info = env.step(action)
        env.render()
        epi_reward += rewards

        if dones:
            print("Episode Reward: ", epi_reward)
            epi_reward = 0
Example No. 18
def train_stable_baselines(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines."""
    from stable_baselines3.common.vec_env import DummyVecEnv

    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    start_time = timeit.default_timer()
    # print experiment.json information
    print("=========================================")
    print('Beginning training.')
    print('Algorithm :', flags.algorithm)
    model = run_model_stablebaseline(flow_params, flags.num_cpus,
                                     flags.rollout_size, flags.num_steps,
                                     flags.algorithm, flags.exp_config)

    stop_time = timeit.default_timer()
    run_time = stop_time - start_time
    print("Training is Finished")
    print("total runtime: ", run_time)
    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params,
                  outfile,
                  cls=FlowParamsEncoder,
                  sort_keys=True,
                  indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    if flags.exp_config.lower() == "ppo":
        from stable_baselines3 import PPO
        model = PPO.load(save_path)
    elif flags.exp_config.lower() == "ddpg":
        from stable_baselines3 import DDPG
        model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()
    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
Example No. 19
 def __init__(self, algorithm: str, checkpoint_path: str):
     if algorithm == 'ppo':
         policy = PPO.load(checkpoint_path)
     elif algorithm == 'sac':
         policy = SAC.load(checkpoint_path)
     else:
         raise NotImplementedError
     self._model = policy
Example No. 20
    def __init__(self):
        config.loads('config.json')
        self.asset = 10000
        self.backtest = BackTest()

        # data = Market.kline('sh600519', '1d')
        # print(data)
        ltdxhq = LTdxHq()
        # df = ltdxhq.get_k_data_daily('603636', start='2021-09-01') # 000032 300142 603636 600519
        df = ltdxhq.get_k_data_1min('000032', start='2021-08-31') # 000032 300142 603636 600519
        df = StockDataFrame(df)
        ltdxhq.close()
        # print(df.head())

        self.kline = []
        self.buy_signal = []
        self.sell_signal = []

        # 2005-08-11 15:00 
        # open            46.01
        # close           47.37
        # high            47.40
        # low             46.01
        # vol        1359360.00
        # amount    63589532.00
        data = []
        for index, row in df.iterrows():
            data.append([index[:10], row.open, row.high, row.low, row.close, row.vol,])

        self.model = PPO.load('ppo_stock')

        for current_step in range(240, df.shape[0]):
            obs = np.array([
                df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['open'].values / MAX_SHARE_PRICE,
                df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['high'].values / MAX_SHARE_PRICE,
                df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['low'].values / MAX_SHARE_PRICE,
                df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step]['close'].values / MAX_SHARE_PRICE,
                df.iloc[current_step - NEXT_OBSERVATION_SIZE: current_step ]['vol'].values / MAX_NUM_SHARES,
                # df['close'].pct_change().fillna(0)[current_step: current_step + NEXT_OBSERVATION_SIZE],

                df['macd'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
                df['macdh'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
                df['macds'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
                df['kdjk'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
                df['kdjd'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
                df['kdjj'][current_step - NEXT_OBSERVATION_SIZE: current_step].values,
                df['rsi_6'][current_step - NEXT_OBSERVATION_SIZE: current_step].fillna(0).values,
                df['rsi_12'][current_step - NEXT_OBSERVATION_SIZE: current_step].fillna(0).values,
            ])

            # df.index.values[current_step][:10]
            self.kline.append([df.index.get_level_values(level=1)[current_step], df.iloc[current_step].open, df.iloc[current_step].high, df.iloc[current_step].low, df.iloc[current_step].close, df.iloc[current_step].vol])

            self.backtest.initialize(self.kline, data)
            self.begin(obs)
        # print(self.buy_signal)
        # print(self.sell_signal)
        plot_asset()
Example No. 21
 def load(self, name: str, env, replace_parameters=None):
     self.log_dir = "ppo_cnn/" + str(datetime.datetime.now()).replace(
         ":", "-")
     os.makedirs(self.log_dir, exist_ok=True)
     monitor_env = Monitor(env, self.log_dir, allow_early_resets=True)
     vec_env = DummyVecEnv([lambda: monitor_env])
     self.model = PPO.load(name,
                           env=vec_env,
                           custom_objects=replace_parameters)
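
The replace_parameters argument above is passed straight through to PPO.load as custom_objects. A hedged standalone equivalent, with illustrative override values and a placeholder checkpoint name:

from stable_baselines3 import PPO

# Override stored objects at load time, e.g. when schedules saved by an older
# stable-baselines3 version fail to deserialize.
model = PPO.load("ppo_cnn_model.zip",
                 custom_objects={"lr_schedule": lambda _: 3e-4,
                                 "clip_range": lambda _: 0.2})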
Example No. 22
def main():
  # Create the callback: check every 1000 steps
  log_dir = 'log'
  callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
  num_cpu = 16
  model_stats_path = os.path.join(log_dir, "sac_" + env_name)
  env_stats_path = os.path.join(log_dir, 'sac_LR001.pkl')
  tb_log = 'tb_log'
  videoName = '5M_timesteps_sac'
  tb_log_name = videoName

  if(StartFresh):
    # env = make_vec_env(env_name, n_envs=4)
    # env = DummyVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.)
    env.reset()
    policy_kwargs = {
        'net_arch': [128, 64, 32],
    }
    model = PPO('MlpPolicy',
                env,
                learning_rate=0.001,
                n_steps=500,
                # batch_size=0,
                # n_epochs=1,
                gamma=0.9,
                policy_kwargs=policy_kwargs,
                verbose=1,
                tensorboard_log=tb_log,
                device="auto")
  else:
    env = SubprocVecEnv([make_env(env_name, i, log_dir=log_dir) for i in range(num_cpu)])
    env = VecNormalize.load(env_stats_path, env)
    env.reset()

    model = PPO.load(model_stats_path, tensorboard_log=tb_log)
    model.set_env(env)

  if(DoTraining):
    eval_env = make_vec_env(env_name, n_envs=1)
    eval_env = VecNormalize(eval_env, norm_obs=True, norm_reward=True, clip_obs=10.)
    eval_env.reset()
    # model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=tb_log)
    model.learn(total_timesteps=25000000, tb_log_name=tb_log_name, reset_num_timesteps=False)  # optionally pass callback=callback here

    # Don't forget to save the VecNormalize statistics when saving the agent
    model.save(model_stats_path)
    env.save(env_stats_path)
    
  if(DoVideo):
    # mean_reward, std_reward = evaluate_policy(model, eval_env)
    # print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
    record_video(env_name, model, video_length=2000, prefix='ppo_'+ env_name + videoName)
Example No. 23
def run_model(args):
    env = TankEnv(args.game_path,
                  opp_fp_and_elo=[(args.opp, 1000)],
                  game_port=args.base_port,
                  my_port=args.my_port,
                  image_based=args.image_based,
                  level_path=args.level_path,
                  rand_opp=args.rand_opp,
                  p=args.env_p,
                  opp_p=args.opp_env_p)
    model = None
    if args.p1:
        model = PPO.load(args.p1)
    elif args.p1same:
        model = PPO.load(args.opp)

    score = [0, 0, 0]
    print("Score: [Player1 Wins, Player2 Wins, Ties]")

    obs = env.reset()
    if args.image_based and (args.ai_view or args.rev_ai_view):
        fig = plt.gcf()
        fig.show()
        fig.canvas.draw()
    while True:
        if args.image_based and (args.ai_view or args.rev_ai_view):
            if not args.rev_ai_view:
                plt.imshow(obs, origin="lower")
            else:
                plt.imshow(env.opp_state, origin="lower")
            fig.canvas.draw()
        if model:
            action, _ = model.predict(obs)
        elif args.rand_p1:
            action = np.random.rand(5) * 2 - 1
        else:
            action = np.zeros(5, dtype=np.float32)
        obs, reward, done, info = env.step(action)
        if done:
            score[info["winner"]] += 1
            print("Score:", score)
            obs = env.reset()
Example No. 24
    def __init__(self):
        _path = pathlib.Path(__file__).parent.resolve()
        custom_objects = {
            "lr_schedule": 0.00001,
            "clip_range": .02,
            "n_envs": 1,
            "device": "cpu"
        }

        sys.path.append(_path)
        self.actor = PPO.load(str(_path)+'/monkey_mdl.zip', custom_objects=custom_objects)
Example No. 25
def load_model(run_name: str, model_file: str) -> Tuple[VecEnv, PPO]:
    run_dir = get_run_dir(run_name)
    cfg = load(run_dir)['preprocess']
    env = make_env(seed=123,
                   n_envs=1,
                   run_dir=run_dir,
                   frame_skip=cfg['frame_skip'],
                   frame_stack=cfg['frame_stack'],
                   is_eval=True)
    model = PPO.load(os.path.join(run_dir, model_file))
    return env, model
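
A hedged usage sketch for load_model above; the run name, checkpoint file, and episode count are placeholders:

from stable_baselines3.common.evaluation import evaluate_policy

# Hypothetical call: restore a run's evaluation environment and checkpoint, then score it.
env, model = load_model("my_run", "best_model.zip")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print(f"{mean_reward:.2f} +/- {std_reward:.2f}")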
Example No. 26
 def change_model(self):
     path = self.save_path
     try:
         files = [join(path, f)
                  for f in listdir(path) if isfile(join(path, f))]
         # files = sorted(files, key=getmtime, reverse=True)
         # model_name = files[random.randrange(min(len(files), 5))]
         model_name = max(files, key=getmtime)
         self.past_models[self.change_index] = PPO.load(model_name)
         self.change_index = (self.change_index + 1) % len(self.past_models)
     except Exception as e:
         print(e)
Example No. 27
    def __init__(self):
        _path = pathlib.Path(__file__).parent.resolve()
        custom_objects = {
            "lr_schedule": 0.000001,
            "clip_range": .02,
            "n_envs": 1,
        }

        self.actor = PPO.load(str(_path) + '/example_mdl',
                              device='cpu',
                              custom_objects=custom_objects)
        self.parser = DiscreteAction()
Example No. 28
def test(MODEL_TEST):
    log_dir = "model_save/" + MODEL_PATH + "/" + MODEL_PATH + MODEL_TEST

    env = ENV(util='test', par=PARAM, dt=DT)
    env.render = True
    env = Monitor(env, log_dir)

    if PARAM['algo']=='td3':
        model = TD3.load(log_dir)
    elif PARAM['algo']=='ddpg':
        model = DDPG.load(log_dir)
    elif PARAM['algo']=='ppo':
        model = PPO.load(log_dir)

    # plot_results(f"model_save/")
    trade_dt = pd.DataFrame([])     # trade_dt: trade data for every stock
    result_dt = pd.DataFrame([])    # result_dt: one-year test results for every stock
    for i in range(TEST_STOCK_NUM):
        state = env.reset()
        stock_bh_id = 'stock_bh_'+str(i)            # buy-and-hold benchmark for this stock
        stock_port_id = 'stock_port_'+str(i)        # portfolio value for this stock
        stock_action_id = 'stock_action_' + str(i)  # actions taken for this stock
        flow_L_id = 'stock_flow_' + str(i)          # cash-flow record for this stock
        stock_bh_dt, stock_port_dt, action_policy_dt, flow_L_dt = [], [], [], []
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",day,"reward:", reward,"now profit:",env.profit)   # 测试每一步的交易policy
            stock_bh_dt.append(env.buy_hold)
            stock_port_dt.append(env.Portfolio_unit)
            action_policy_dt.append(action[0][0])  # 用于记录policy
            flow_L_dt.append(env.flow)
            day+=1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit*100, env.buy_hold*100, env.sp, env.mdd*100, env.romad))
                # After the episode, record: stock ID, profit (%), buy & hold (%), Sharpe ratio, max drawdown (%), RoMaD
                result=pd.DataFrame([[i,env.profit*100,env.buy_hold*100,env.sp,env.mdd*100,env.romad]])
                break

        trade_dt_stock = pd.DataFrame({stock_port_id: stock_port_dt,
                                       stock_bh_id: stock_bh_dt,
                                       stock_action_id: action_policy_dt,
                                       flow_L_id: flow_L_dt})  # trade data for this single stock

        trade_dt = pd.concat([trade_dt, trade_dt_stock], axis=1)    # merge trade data of all stocks (adds columns)
        result_dt = pd.concat([result_dt,result],axis=0)            # merge result rows of all stocks (adds rows)

    result_dt.columns = ['stock_id','profit(100%)','buy_hold(100%)','sp','mdd(100%)','romad']
    trade_dt.to_csv('out_dt/trade_'+MODEL_PATH+'.csv',index=False)
    result_dt.to_csv('out_dt/result_'+MODEL_PATH+'.csv',index=False)
Example No. 29
def ai_eval():
    env = Snake_Env(server=False)
    model = PPO.load("./positivereward", env=env)
    obs = env.reset()
    for i in range(1000):
        action, _state = model.predict(obs, deterministic=True)
        #action = env.action_space.sample()
        #print(action)
        obs, reward, done, info = env.step(action)
        env.render()
        if done:
            env.reset()
Example No. 30
def play(env_name, load_file, total_timesteps):
    env = DummyVecEnv([lambda: gym.make(env_name)])
    model = PPO.load(load_file, verbose=1)
    obs = env.reset()
    for i in range(total_timesteps):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render() # dummy
        if done:
            print(info[0]['episode'])
    del model
    env.close()