Example #1
from stable_baselines import PPO2
from train_func import train_until_localoptimum
from env_getter import make_env


def init_localoptima_selfplayer(seed, reward_type):
    # CNN-based self-play environment that also records scores.
    env = make_env(use_cnn=True,
                   reward_type=reward_type,
                   self_play=True,
                   save_score=True)

    def make_model(env):
        policy_kwargs = dict(
            net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])])
        return PPO2("CnnPolicy",
                    env,
                    verbose=1,
                    seed=seed,
                    self_play=True,
                    policy_kwargs=policy_kwargs)

    train_until_localoptimum(make_model, seed, env, reward_type)
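
A minimal invocation of this helper might look like the sketch below; the seed and reward-type values are placeholders rather than settings taken from the original project.

# Hypothetical entry point; argument values are placeholders.
if __name__ == "__main__":
    init_localoptima_selfplayer(seed=0, reward_type=0)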
Example #2

from stable_baselines import PPO2
from stable_baselines.common.policies import FeedForwardPolicy, register_policy
from train_func import do_train_multi
from env_getter import make_env, make_env_hex
import tensorflow as tf
from stable_baselines.a2c.utils import conv, linear, conv_to_fc, batch_to_seq, seq_to_batch, lstm
import numpy as np

policy_kwargs = dict(net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])])
env = make_env(use_cnn=True, reward_type=0, self_play=True)
model1 = PPO2("CnnPolicy",
              env,
              verbose=1,
              self_play=True,
              tensorboard_log="D:\\4-System\\tensor",
              policy_kwargs=policy_kwargs)
model2 = PPO2("CnnPolicy",
              env,
              verbose=1,
              self_play=True,
              tensorboard_log="D:\\4-System\\tensor",
              policy_kwargs=policy_kwargs)
model3 = PPO2("CnnPolicy",
              env,
              verbose=1,
              self_play=True,
              tensorboard_log="D:\\4-System\\tensor",
              policy_kwargs=policy_kwargs)
model4 = PPO2("CnnPolicy",
              env,
              verbose=1,
              self_play=True,
              tensorboard_log="D:\\4-System\\tensor",
              policy_kwargs=policy_kwargs)
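
The original snippet breaks off after the fourth model; presumably the models are then trained together with do_train_multi, which is imported above and used the same way in the next example. The run name in this sketch is an assumption.

# Hypothetical training call; the run name "fourself" is a placeholder.
do_train_multi([model1, model2, model3, model4], "fourself", env)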
Example #3

from stable_baselines import PPO2
from stable_baselines.common.policies import FeedForwardPolicy, register_policy
from train_func import train_until_localoptimum, do_train_multi
from env_getter import make_env, make_env_hex
import tensorflow as tf
from stable_baselines.a2c.utils import conv, linear, conv_to_fc, batch_to_seq, seq_to_batch, lstm
import numpy as np
import sys

env = make_env(use_cnn=False, reward_type=0, self_play=True, save_score=False)


def make_model(env):
    policy_kwargs = dict(
        net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])])
    return PPO2("MlpPolicy",
                env,
                verbose=1,
                self_play=True,
                policy_kwargs=policy_kwargs)


# Build five identical self-play PPO2 agents for do_train_multi.
models = [make_model(env) for _ in range(5)]
do_train_multi(models, "multiself", env)
Example #4
from stable_baselines import A2C
from train_func import do_train
from env_getter import make_env

env = make_env()
# alpha and epsilon here are the RMSProp decay and epsilon terms of
# stable-baselines' A2C; the remaining arguments set the usual A2C knobs.
model = A2C('MlpPolicy',
            env,
            verbose=1,
            n_steps=71,
            epsilon=2.36561309730191e-05,
            alpha=0.930340721692521,
            learning_rate=0.00223553342110284,
            max_grad_norm=0.50523672966627,
            ent_coef=0.00105748404479239,
            vf_coef=0.0492764482019269,
            gamma=0.953839228968649)

do_train(model, "a2cbox", env)
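
If the trained agent needs to be reused afterwards, stable-baselines models can be saved and reloaded; the file name below is a placeholder and this step is not part of the original example.

# Hypothetical follow-up: persist and reload the trained agent.
model.save("a2cbox_model")
model = A2C.load("a2cbox_model", env=env)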