from stable_baselines import PPO2
from train_func import train_until_localoptimum
from env_getter import make_env


def init_localoptima_selfplayer(seed, reward_type):
    # CNN-based self-play environment that logs scores to disk.
    env = make_env(use_cnn=True, reward_type=reward_type, self_play=True,
                   save_score=True)

    def make_model(env):
        # Three 128-unit layers each for the policy and value networks.
        policy_kwargs = dict(
            net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])])
        return PPO2("CnnPolicy", env, verbose=1, seed=seed, self_play=True,
                    policy_kwargs=policy_kwargs)

    # Training loop is driven by train_until_localoptimum (see train_func).
    train_until_localoptimum(make_model, seed, env, reward_type)
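# --- Usage sketch (not from the repo): sweep a few seeds for one reward
# --- type. The seed values and reward_type=0 are placeholder assumptions,
# --- chosen only because reward_type=0 appears in the scripts below.
for seed in (0, 1, 2):
    init_localoptima_selfplayer(seed, reward_type=0)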
from stable_baselines import PPO2
from train_func import do_train_multi
from env_getter import make_env

# Shared architecture: three 128-unit layers for both policy and value nets.
policy_kwargs = dict(net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])])

env = make_env(use_cnn=True, reward_type=0, self_play=True)

# Four independently initialized CNN policies sharing one self-play env.
model1 = PPO2("CnnPolicy", env, verbose=1, self_play=True,
              tensorboard_log="D:\\4-System\\tensor",
              policy_kwargs=policy_kwargs)
model2 = PPO2("CnnPolicy", env, verbose=1, self_play=True,
              tensorboard_log="D:\\4-System\\tensor",
              policy_kwargs=policy_kwargs)
model3 = PPO2("CnnPolicy", env, verbose=1, self_play=True,
              tensorboard_log="D:\\4-System\\tensor",
              policy_kwargs=policy_kwargs)
model4 = PPO2("CnnPolicy", env, verbose=1, self_play=True,
              tensorboard_log="D:\\4-System\\tensor",
              policy_kwargs=policy_kwargs)
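# The original file is truncated after model4. Given the do_train_multi
# import and the parallel multi-self-play script below, it presumably ended
# with a call like this; the run name "multiself_cnn" is a placeholder,
# not from the original file:
do_train_multi([model1, model2, model3, model4], "multiself_cnn", env)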
from stable_baselines import PPO2
from train_func import do_train_multi
from env_getter import make_env

# MLP-based self-play environment; scores are not saved to disk.
env = make_env(use_cnn=False, reward_type=0, self_play=True, save_score=False)


def make_model(env):
    # Three 128-unit layers each for the policy and value networks.
    policy_kwargs = dict(
        net_arch=[dict(pi=[128, 128, 128], vf=[128, 128, 128])])
    return PPO2("MlpPolicy", env, verbose=1, self_play=True,
                policy_kwargs=policy_kwargs)


# Five independently initialized models trained together in self-play.
models = [make_model(env) for _ in range(5)]
do_train_multi(models, "multiself", env)
from stable_baselines import A2C
from train_func import do_train
from env_getter import make_env

env = make_env()

# A2C with externally chosen hyperparameters: rollout length, RMSProp
# epsilon/alpha, learning rate, gradient clipping, entropy/value-loss
# coefficients, and discount factor.
model = A2C('MlpPolicy', env, verbose=1,
            n_steps=71,
            epsilon=2.36561309730191e-05,
            alpha=0.930340721692521,
            learning_rate=0.00223553342110284,
            max_grad_norm=0.50523672966627,
            ent_coef=0.00105748404479239,
            vf_coef=0.0492764482019269,
            gamma=0.953839228968649)

do_train(model, "a2cbox", env)
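# --- Evaluation sketch (assumption, standalone): rolls out the trained
# --- policy via the standard stable_baselines API. Loading from the name
# --- "a2cbox" assumes do_train saves the model under its run name, which
# --- this script does not confirm.
from stable_baselines import A2C
from env_getter import make_env

model = A2C.load("a2cbox")
env = make_env()
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        # If make_env returns a VecEnv, resets on done are handled internally.
        obs = env.reset()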