def __init__(self, agent_filepath=""): Player.__init__(self) # Create the experience memory database if not os.path.exists(REPLAY_MEMORY_FILENAME): self.replay_memory = ReplayMemory() else: self.replay_memory = cPickle.load(open(REPLAY_MEMORY_FILENAME, 'r')) # Initialize the convolutional neural network self.network = MinecraftNet(agent_filepath) self.ae_network = FeatureNet() # Probability of selecting non-random action self.epsilon = STARTING_EPSILON # The total number of frames this agent has been trained on # through all the minibatch training self.frames_trained = 0 # Load old epsilon and frames learned values self.load() self.cnn_action_map = self.initActionMap() # The current and previous sequences of game frames and actions self.current_seq = None self.previous_seq = None self.previous_action = None # Event logging self.log = LogFile("run.log", True)
def __init__(self, gamma, memory, s, a, tau, learningRate=1e-3,
             criticpath=None, actorpath=None):
    self.gamma = gamma
    self.memory = ReplayMemory(memory)
    self.actor = Actor(state=s, actions=a)
    self.critic = Critic(state=s, actions=a)
    if criticpath is not None:
        self.critic.load_state_dict(torch.load(criticpath))
    if actorpath is not None:
        self.actor.load_state_dict(torch.load(actorpath))
    self.targetActor = Actor(state=s, actions=a)
    self.targetActor.load_state_dict(self.actor.state_dict())
    self.targetCritic = Critic(state=s, actions=a)
    self.targetCritic.load_state_dict(self.critic.state_dict())
    self.tau = tau
    self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
    self.criticOptimizer = optim.Adam(self.critic.parameters(), learningRate)
    # more a dimensionality thing
    self.state = s
    self.action = a
    self.OUarray = np.zeros((1000, self.action), dtype="f")
    self.step = 0

def __init__(self, path, model_path, target_model_path, actor_index):
    self.path = path
    self.model_path = model_path
    self.target_model_path = target_model_path
    self.actor_index = actor_index
    self.lr = 1e-3
    self.gamma = 0.95
    self.epsilon = 0.3
    self.batch_size = 32
    self.initial_exploration = 500
    self.N_STEP = 3
    self.step_reward = 0
    self.qf = DuelingQFunc()
    self.target_qf = DuelingQFunc()
    # model.state_dict() returns the model's learned parameters
    self.target_qf.load_state_dict(self.qf.state_dict())
    self.optimizer = optim.Adam(self.qf.parameters(), lr=self.lr)
    self.criterion = nn.MSELoss()
    self.env = gym.make('CartPole-v0')
    self.obs_size = self.env.observation_space.shape[0]
    self.action_size = self.env.action_space.n
    self.obs_queue = queue.Queue()
    self.reward_queue = queue.Queue()
    self.action_queue = queue.Queue()
    self.total_step = 0
    self.ten_step = 0
    self.temporal_memory = ReplayMemory()

def __init__(self, gamma, memory_size, target_update_counter, batch_size,
             num_of_states, num_of_actions, ewc_importance=28, si_importance=30):
    self.num_of_states = num_of_states
    self.num_of_actions = num_of_actions
    self.eval_model = Net(self.num_of_states, HIDDEN_SIZE, self.num_of_actions)
    self.target_model = Net(self.num_of_states, HIDDEN_SIZE, self.num_of_actions)
    self.optimizer = torch.optim.Adam(self.eval_model.parameters(), lr=0.001)
    # self.optimizer = torch.optim.SGD(self.eval_model.parameters(), lr=0.01)
    self.loss_func = nn.MSELoss()
    # self.loss_func = nn.SmoothL1Loss()
    self.memory_size = memory_size
    self.memory = ReplayMemory(memory_size)
    self.old_memory = []
    self.learned_tasks = 0
    self.target_update_counter = target_update_counter
    self.learn_step_counter = 0
    self.batch_size = batch_size
    self.gamma = gamma
    self.epsilon = EPSILON_MAX
    self.ewc_importance = ewc_importance
    self.si_importance = si_importance

def __init__(self, state_num, action_num, device, CONFIG, action_list):
    self.action_list = action_list
    self.memory = ReplayMemory(CONFIG.MEMORY_CAPACITY)

    # == ENV PARAM ==
    self.state_num = state_num
    self.action_num = action_num

    # == PARAM ==
    self.EPSILON = CONFIG.EPSILON
    self.EPS_START = CONFIG.EPSILON
    self.EPS_END = CONFIG.EPSILON_END
    self.EPS_DECAY = CONFIG.MAX_EP_STEPS
    self.LR_C = CONFIG.LR_C
    self.LR_C_START = CONFIG.LR_C
    self.LR_C_END = CONFIG.LR_C_END
    self.LR_C_DECAY = CONFIG.MAX_EP_STEPS * CONFIG.MAX_EPISODES / 2
    self.BATCH_SIZE = CONFIG.BATCH_SIZE
    self.GAMMA = CONFIG.GAMMA
    self.MAX_MODEL = CONFIG.MAX_MODEL

    # == Target Network Update ==
    self.TAU = CONFIG.TAU
    self.HARD_UPDATE = CONFIG.HARD_UPDATE
    self.SOFT_UPDATE = CONFIG.SOFT_UPDATE

    # == DQN ==
    self.double = CONFIG.DOUBLE
    self.device = device
    self.build_network()

def testPushInxes(self):
    RepMem = ReplayMemory()
    for i in range(0, 200000):
        RepMem.push(1, 1, 1, 1)
        self.assertEqual(RepMem.indx, (i + 1) % RepMem.size)
    self.assertTrue(RepMem.isFull())

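# The test above pins down only the buffer's interface: push(), an `indx` write
# cursor, a fixed `size`, and isFull(). The class below is a minimal
# circular-buffer sketch consistent with that interface, not the project's own
# ReplayMemory; the name ReplayMemorySketch and the default capacity of 100000
# are assumptions for illustration.
class ReplayMemorySketch:
    def __init__(self, size=100000):
        self.size = size
        self.buffer = [None] * size
        self.indx = 0          # next write position
        self.full = False      # set once the buffer has wrapped around

    def push(self, state, action, reward, next_state):
        # Overwrite the oldest entry once the buffer has filled up.
        self.buffer[self.indx] = (state, action, reward, next_state)
        self.indx = (self.indx + 1) % self.size
        if self.indx == 0:
            self.full = True

    def isFull(self):
        return self.full
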
def __init__(self, memory_cap, batch_size, resolution, action_count, session, lr,
             gamma, epsilon_min, epsilon_decay_steps, epsilon_max, trace_length,
             hidden_size):
    self.model = Network(session=session, action_count=action_count,
                         resolution=resolution, lr=lr, batch_size=batch_size,
                         trace_length=trace_length, hidden_size=hidden_size,
                         scope='main')
    self.target_model = Network(session=session, action_count=action_count,
                                resolution=resolution, lr=lr, batch_size=batch_size,
                                trace_length=trace_length, hidden_size=hidden_size,
                                scope='target')
    self.memory = ReplayMemory(memory_cap=memory_cap, batch_size=batch_size,
                               resolution=resolution, trace_length=trace_length)
    self.batch_size = batch_size
    self.resolution = resolution
    self.action_count = action_count
    self.gamma = gamma
    self.epsilon_min = epsilon_min
    self.epsilon_decay_steps = epsilon_decay_steps
    self.epsilon_max = epsilon_max
    self.hidden_size = hidden_size
    self.trace_length = trace_length
    self.epsilon = epsilon_max
    self.epsilon_decrease = (epsilon_max - epsilon_min) / epsilon_decay_steps
    self.min_buffer_size = batch_size * trace_length
    self.state_in = (np.zeros([1, self.hidden_size]),
                     np.zeros([1, self.hidden_size]))

def __init__(self):
    AbstractPlayer.__init__(self)
    self.movementStrategy = EpsilonStrategy()
    self.replayMemory = ReplayMemory(MEMORY_CAPACITY)
    self.episode = 0
    networkOptions = [
        keras.layers.Dense(state_size, input_dim=state_size, activation='relu'),
        keras.layers.Dense(100, activation='relu',
                           kernel_initializer=keras.initializers.he_normal()),
        keras.layers.Dense(100, activation='relu',
                           kernel_initializer=keras.initializers.he_normal()),
        keras.layers.Dense(NUM_ACTIONS)
    ]
    self.policyNetwork = keras.Sequential(networkOptions)
    self.targetNetwork = keras.Sequential(networkOptions)
    self.policyNetwork.compile(
        optimizer=keras.optimizers.Adam(learning_rate=ALPHA),
        loss=keras.losses.mean_squared_error)
    print(self.policyNetwork.summary())
    try:
        self.policyNetwork.load_weights("./network/zelda-ddqn.h5")
        self.movementStrategy.epsilon = 0.01
        print('Model loaded')
    except:
        print('Model file not found')

def __init__(self, training):
    # Create the environment
    self.environment = Environment()
    # Training or testing
    self.training = training
    # Set the initial training epsilon
    self.epsilon = 0.10
    # Get the number of actions for storing memories, Q-values, etc.
    total_actions = self.environment.total_actions()
    if self.training:
        # Training: set a learning rate
        self.learning_rate = 1e-2
        # Training: set up the replay memory
        self.replay_memory = ReplayMemory(size=1000, num_actions=total_actions)
    else:
        # Testing: these are not needed
        self.learning_rate = None
        self.replay_memory = None
    # Create the neural network
    self.neural_network = NeuralNetwork(num_actions=total_actions,
                                        replay_memory=self.replay_memory)
    # This stores the rewards for each episode
    self.rewards = []

def __init__(self, input_shape, n_actions, optimizer='RMSprop', lr=1e-4, gamma=0.99,
             C=10000, batch_size=32, min_eps=0.1, max_eps=1, cutoff=1e6,
             second_cutoff=2.5e6, final_eps=0.01, device='GPU', clip=10):
    super().__init__(input_shape, n_actions, gamma, batch_size, min_eps, max_eps,
                     cutoff, device)
    self.memory = ReplayMemory(input_shape)
    self.policy_network = DuelingDDQN(input_shape, n_actions, self.device)
    self.target_network = DuelingDDQN(input_shape, n_actions, self.device)
    self.optimizer = getattr(torch.optim, optimizer)(
        self.policy_network.parameters(), lr=lr)
    self.criterion = torch.nn.MSELoss()
    self.second_cutoff = second_cutoff
    self.final_eps = final_eps
    self.C = C
    self.C_counter = 0
    self.clip = clip

def __init__(self, num_action, state_size, goal_size, max_eps=0.2, min_eps=0.02,
             eps_decay=0.95, gamma=0.98, lr=0.001, batch_size=128,
             buffer_size=1000000, PER=False, init_nn=None):
    self.PER = PER
    # Init hyperparameters
    self.epsilon = max_eps
    self.min_eps = min_eps
    self.eps_decay = eps_decay
    self.gamma = gamma
    self.num_action = num_action
    self.batch_size = batch_size
    self.memory = ReplayMemory(buffer_size, with_priorities=PER)
    self.state_size = state_size
    self.goal_size = goal_size
    # Initialize neural nets
    self.policy_net = NeuralNet(state_size, num_action, goal_size, lr)
    if init_nn is not None:
        self.policy_net = init_nn
    self.target_net = NeuralNet(state_size, num_action, goal_size, lr)
    self.target_net.set_weights(self.policy_net.get_weights())

def __init__(self, env_name, state_dim, action_dim):
    self.name = 'DriverAgent'  # name for uploading results
    self.env_name = env_name
    # Randomly initialize the actor and critic networks along with their target networks
    self.state_dim = state_dim
    self.action_dim = action_dim
    # TensorFlow session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config=config)
    # Actor & critic networks
    self.actor = ActorNetwork(self.sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    self.critic = CriticNetwork(self.sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    # Replay memory
    self.memory = ReplayMemory(MEMORY_SIZE)
    # Loss value
    self.loss = 0
    # Loading networks; modify as you want
    self.saver = tf.train.Saver()
    if not os.path.exists(ckp_dir):
        print("Could not find old network weights")
    else:
        self.saver.restore(self.sess, os.path.join(ckp_dir, ckp_name))
        print("Successfully loaded:", ckp_name)

def reset_training(self):
    self.learn_step_counter = 0
    self.epsilon = EPSILON_MAX
    # Save a batch of old memories
    transitions = self.memory.sample(self.batch_size)
    self.old_memory = self.old_memory + transitions
    self.memory = ReplayMemory(self.memory_size)

def __init__(self):
    AbstractPlayer.__init__(self)
    self.movementStrategy = EpsilonStrategy()
    self.replayMemory = ReplayMemory(MEMORY_CAPACITY)
    self.episode = 0
    self.policyNetwork = self._build_compile_model()
    self.targetNetwork = self._build_compile_model()
    if self.episode == 0 and os.path.exists("./celdas/network/zelda.index"):
        self.policyNetwork.load_weights("./celdas/network/zelda")
    print(self.policyNetwork.summary())

def __init__(self, actor, critic, memory, s, a, tau, epsilon=0.5):
    self.memory = ReplayMemory(memory)
    self.targetActor = copy.deepcopy(actor)
    self.targetCritic = copy.deepcopy(critic)
    self.tau = tau
    self.epsilon = epsilon
    # more a dimensionality thing
    self.state = s
    self.action = a
    self.OUarray = np.zeros((1000, self.action), dtype="f")
    self.step = 0

def __init__(self, predictor_func, grid, time_limit=None, player=None, name=""):
    super().__init__(grid, time_limit, player)
    self.name = name
    self.predictor_func = predictor_func  # TODO: needed?
    # The size of the state
    self.state_rows = grid[0] + grid[1]
    self.state_cols = self.state_rows
    self.state_depth = 1
    self.exploration = INITIAL_EXPLORATION
    self.final_exploration = FINAL_EXPLORATION
    self.expl_update = \
        (self.exploration - self.final_exploration) / EXPLORATION_STEPS
    self.gamma = DISCOUNT_GAMMA
    self.alpha = ALPHA
    self.batch_size = BATCH_SIZE
    self.max_gradient = GRADIENT_CLIPPING_NORM
    self.reg_param = REGULARIZATION_FACTOR
    if PRIORITY_REPLAY_BUFFER:
        self.replay_mem = \
            PriorityReplayMemory(REPLAY_BUFFER_SIZE, PRIORITY_ALPHA)
        self.beta = PRIORITY_BETA_INIT
        self.beta_update = (1.0 - PRIORITY_BETA_INIT) / PRIORITY_BETA_ITERS
    else:
        self.replay_mem = ReplayMemory(REPLAY_BUFFER_SIZE)
    self.update = True
    self.null_state = self.to_state(Board(grid[0], grid[1]))
    self.last_time = time.time()
    self.reset_summary()
    self.graph = tf.Graph()
    self.session = tf.Session(graph=self.graph)
    with self.graph.as_default():
        self.create_graph()
        self.session.run(tf.global_variables_initializer())
        self.session.run(self.target_set_op)
        self.load()

def __init__(self, env, sess, LEARNING_RATE_ACTOR, LEARNING_RATE_CRITIC, NET_SIZE,
             MEMORY_LEN, REWARD_DISCOUNT, BATCH_SIZE, TAU, EXPLORATION_STEPS,
             VERBOSE, LOG_DIR_TF):
    self.env = env
    self.sess = sess
    self.observation_space = self.env.observation_space.shape[0]
    self.action_space = self.env.action_space.shape[0]
    self.REWARD_DISCOUNT = REWARD_DISCOUNT
    self.TAU = TAU
    self.BATCH_SIZE = BATCH_SIZE
    self.noise_state = np.zeros(self.action_space)
    self.EXPLORATION_STEPS = EXPLORATION_STEPS
    self.VERBOSE = VERBOSE
    self.LOG_DIR_TF = LOG_DIR_TF
    # Check if the action space is symmetric
    if all(env.action_space.high == abs(env.action_space.low)):
        action_scale = env.action_space.high
    else:
        raise ActionSpaceNotSymmetricException
    self.actor = Actor(self.sess, self.observation_space, self.action_space,
                       LEARNING_RATE_ACTOR, NET_SIZE, TAU, action_scale)
    self.critic = Critic(self.sess, self.observation_space, self.action_space,
                         LEARNING_RATE_CRITIC, NET_SIZE, TAU)
    actor_network_variables = self.actor.network.get_variables()
    critic_q_net_variables = self.critic.q_net.get_variables()
    self.actor_target_update = self.actor.target_network.update_variables(
        actor_network_variables)
    self.critic_target_update = self.critic.target_q_net.update_variables(
        critic_q_net_variables)
    self.reward_pl = tf.placeholder(tf.float32, [None, 1], name='Reward_PL')
    self.done_pl = tf.placeholder(tf.bool, [None, 1], name='Done_PL')
    self.labels = tf.where(
        self.done_pl, self.reward_pl,
        self.reward_pl + tf.multiply(self.REWARD_DISCOUNT,
                                     self.critic.target_prediction))
    # self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE)
    self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE,
                                      self.observation_space, self.action_space)
    self.log_reward_pl = tf.placeholder(tf.float32, name='Reward_log_pl')
    self.reward_f = tf.add(0.0, self.log_reward_pl)
    tf.summary.scalar('reward', self.reward_f)
    init = tf.global_variables_initializer()
    self.sess.run(init)
    self.sess.run(self.actor.network.copy_to(self.actor.target_network))
    self.sess.run(self.critic.q_net.copy_to(self.critic.target_q_net))
    self.writer = tf.summary.FileWriter(self.LOG_DIR_TF, self.sess.graph)
    self.merged = tf.summary.merge_all()

def __init__(self, state_dim, batch_size, action_dim, H, gamma, BATCH_SIZE):
    # The network
    self.action_dim = action_dim
    self.model = torch.nn.Sequential(
        torch.nn.Linear(state_dim, H),
        torch.nn.ReLU(),
        torch.nn.Linear(H, action_dim),
        torch.nn.ReLU(),
    )
    self.loss_fn = torch.nn.MSELoss(size_average=False)
    self.gamma = gamma
    self.memory = ReplayMemory(capacity=2000)
    self.BATCH_SIZE = BATCH_SIZE

def __init__(self, env, act_dim, state_dim, goal_dim, act_range,
             buffer_size=int(1e6), gamma=0.98, lr=0.001, tau=0.95):
    """ Initialization """
    # Environment and A2C parameters
    self.act_dim = act_dim
    self.act_range = act_range
    self.env_dim = state_dim + goal_dim
    self.gamma = gamma
    self.lr = lr
    self.tau = tau
    self.env = env
    # Create actor and critic networks and copy their weights into the targets
    self.actor_network = Actor(self.env_dim, act_dim, act_range)
    self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
    self.actor_target_network.load_state_dict(self.actor_network.state_dict())
    self.critic_network = Critic(self.env_dim, act_dim, act_range)
    self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
    self.critic_target_network.load_state_dict(self.critic_network.state_dict())
    sync_networks(self.actor_network)
    sync_networks(self.critic_network)
    # Optimizers
    self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=lr)
    self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=lr)
    # Replay buffer
    # self.buffer = MemoryBuffer(buffer_size)
    self.buffer = ReplayMemory(buffer_size)
    # Normalizers, clipped to [-5, 5]
    self.goal_normalizer = Normalizer(goal_dim, default_clip_range=5)
    self.state_normalizer = Normalizer(state_dim, default_clip_range=5)

def train(sess, env, args, actors, critics, noise):
    load_models(actors, critics)
    summary_ops, summary_vars = build_summaries()
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)
    for actor in actors:
        actor.update_target()
    for critic in critics:
        critic.update_target()
    replayMemory = ReplayMemory(int(args['buffer_size']), int(args['random_seed']))
    for ep in range(int(args['max_episodes'])):
        if ep % 100 == 0:
            save_models(actors, critics)
        episode_reward = learn(actors, args, critics, env, ep, noise, replayMemory,
                               sess, summary_ops, summary_vars, writer)
        print('|Reward: {} | Episode: {:d}'.format(episode_reward, ep))

def main(config: Config):
    print(config)
    # Let's run it!
    for i in range(config.num_experiments):
        experiment_seed = config.seed + i * config.num_episodes
        memory = ReplayMemory(config.replay_memory_size)
        # Seed the algorithm (for reproducibility).
        random.seed(experiment_seed)
        torch.manual_seed(experiment_seed)
        env.seed(experiment_seed)
        q_model = QNetwork(config.device, config.num_hidden_q_model)
        curiousity_model = StatePredictor(2, 3, config.num_hidden_curiosity_model,
                                          config.device)
        for run in range(20, 29):
            episode_durations, episode_loss = run_episodes(
                train, q_model, curiousity_model, memory, env, experiment_seed,
                config, experiment_number=run)
            # print(run, episode_durations, episode_loss)
        print("Finished experiment {}/{}".format(i + 1, config.num_experiments))

def __init__(self):
    self.movementStrategy = EpsilonStrategy()
    self.replayMemory = ReplayMemory(MEMORY_CAPACITY)
    self.episode = 0
    self.policyNetwork = self._build_compile_model()
    self.targetNetwork = self._build_compile_model()
    if os.path.exists("./network/zelda-ddqn.h5"):
        print('Loading network')
        self.policyNetwork.load_weights("./network/zelda-ddqn.h5")
    print(self.policyNetwork.summary())
    self.exploreNext = False
    self.steps = 0
    self.averageLoss = 0
    self.averageReward = 0
    self.losses = []
    self.rewards = []

def __init__(self, U, random, eps_max_action, **kwargs):
    self.U = U
    if isinstance(self.U, int):
        self.U = list(range(self.U))
    self.envAction2index = {}
    for idx, u in enumerate(self.U):
        self.envAction2index[u] = idx
    self.Q = {}
    self.replay_memory = ReplayMemory(capacity=100, random=random)
    self.steps = 0
    self.random = random
    self.eps_max_action = eps_max_action
    self.sizes = kwargs.get("sizes", None)
    self.step_start = kwargs.get("step_start", 1e2)
    self.max_abs_delta_Q = 0

def __init__(self, batch_size=4, gamma=.999, eps_start=.95, eps_end=.05,
             eps_decay=200, target_update=10, memory_size=5000):
    self.batch_size = batch_size
    self.gamma = gamma
    self.eps_start = eps_start
    self.eps_end = eps_end
    self.eps_decay = eps_decay
    self.target_update = target_update
    self.steps = 0
    self.threshold = eps_start
    self.policy = DQN()
    self.target = DQN()
    self.optimizer = optim.RMSprop(self.policy.parameters())
    self.memory = ReplayMemory(memory_size)
    self.training_history = []

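# The constructor above only wires the pieces together; the update step is not
# shown. The function below is a hedged sketch (not this project's code) of the
# standard DQN optimization such an agent could run each step. It assumes
# memory.sample(batch_size) returns (state, action, reward, next_state, done)
# tuples and that the policy/target networks map a state batch to per-action
# Q-values; the function name and arguments are hypothetical.
import torch
import torch.nn.functional as F

def dqn_update_sketch(memory, policy, target, optimizer, batch_size, gamma):
    batch = memory.sample(batch_size)  # assumed interface
    states, actions, rewards, next_states, dones = zip(*batch)
    states = torch.stack([torch.as_tensor(s, dtype=torch.float32) for s in states])
    next_states = torch.stack([torch.as_tensor(s, dtype=torch.float32)
                               for s in next_states])
    actions = torch.as_tensor(actions, dtype=torch.int64)
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    dones = torch.as_tensor(dones, dtype=torch.float32)
    # Q(s, a) for the actions actually taken
    q = policy(states).gather(1, actions.unsqueeze(1)).squeeze(1)
    # Bootstrapped one-step target from the frozen target network
    with torch.no_grad():
        q_next = target(next_states).max(1)[0]
        td_target = rewards + gamma * q_next * (1.0 - dones)
    loss = F.smooth_l1_loss(q, td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
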
def __init__(self, path, model_path, target_model_path):
    self.path = path
    self.model_path = model_path
    self.target_model_path = target_model_path
    self.lr = 1e-3
    self.gamma = 0.95
    self.epsilon = 0.3
    self.batch_size = 32
    self.N_STEP = 3
    self.qf = DuelingQFunc()
    self.target_qf = DuelingQFunc()
    # model.state_dict() returns the model's learned parameters
    self.target_qf.load_state_dict(self.qf.state_dict())
    self.optimizer = optim.Adam(self.qf.parameters(), lr=self.lr)
    self.criterion = nn.MSELoss()
    self.memory = ReplayMemory()
    self.total_step = 0

def __init__(self):
    # with tf.device('/CPU:0'):
    self.agent = DDPG_Agent([96, 96, 9], 3, regularizer_coeff=regularizer_coeff)
    self.cp_managers = []
    for opt, model, name in [
            [self.agent.actor_optimizer, self.agent.actor, "actor"],
            [self.agent.critic_optimizer, self.agent.critic, "critic"]]:
        checkpoint = tf.train.Checkpoint(optimizer=opt, model=model)
        cp_manager = tf.train.CheckpointManager(
            checkpoint, os.path.join(LOG_DIR, name), 3,
            keep_checkpoint_every_n_hours=4)
        checkpoint.restore(cp_manager.latest_checkpoint)
        self.cp_managers.append(cp_manager)
    self.memory = ReplayMemory(BATCH_SIZE, 30000, 300000, num_frames=9,
                               gray_scale=True, normalize=True)
    # self.memory = ReplayMemory(BATCH_SIZE, 1000, 300000, gray_scale=False, normalize=True)
    self.env = gym.make("CarRacing-v0", verbose=0)
    self.train_writer = tf.summary.create_file_writer(
        os.path.join(LOG_DIR, "train"))
    self.test_writer = tf.summary.create_file_writer(
        os.path.join(LOG_DIR, "test"))
    self.episode_queue = Queue()
    self.parameter_queues = []
    self.do_render = False
    self.render_freq = 10
    self.train_freq = 2
    self.max_iteration = 800
    self.epsilon_max_step = 100000
    self.parameter_send_freq = 1000

def __init__(self, state_dim, action_dim, device, CONFIG):
    # == ENV PARAM ==
    self.state_dim = state_dim
    self.action_dim = action_dim

    # == PARAM ==
    self.LR_C = CONFIG.LR_C
    self.LR_C_START = CONFIG.LR_C
    self.LR_C_END = CONFIG.LR_C_END
    self.LR_C_DECAY = CONFIG.MAX_EP_STEPS * CONFIG.MAX_EPISODES / 2
    self.LR_A = CONFIG.LR_A
    self.LR_A_START = CONFIG.LR_A
    self.LR_A_END = CONFIG.LR_A_END
    self.LR_A_DECAY = CONFIG.MAX_EP_STEPS * CONFIG.MAX_EPISODES / 2
    self.BATCH_SIZE = CONFIG.BATCH_SIZE
    self.GAMMA = CONFIG.GAMMA
    self.MAX_MODEL = CONFIG.MAX_MODEL
    self.SIGMA = CONFIG.SIGMA

    # == CRITIC TARGET UPDATE PARAM ==
    self.double = CONFIG.DOUBLE
    self.TAU = CONFIG.TAU
    self.HARD_UPDATE = CONFIG.HARD_UPDATE
    self.SOFT_UPDATE = CONFIG.SOFT_UPDATE

    # == MODEL PARAM ==
    self.device = device

    # == MEMORY & MODEL ==
    self.memory = ReplayMemory(CONFIG.MEMORY_CAPACITY)
    self.build_network()
    self.random_process = OrnsteinUhlenbeck(action_dim, sigma=self.SIGMA,
                                            annealLen=CONFIG.MAX_EP_STEPS * 2, dt=1)
    self.train = True

def __init__(self, num_states, num_actions):
    self.num_states = num_states
    self.num_actions = num_actions
    self.freq_update_target = 5  # set frequency of updating target
    self.count_replay = 0
    self.memory = ReplayMemory(10000)  # set capacity
    # Construct a neural network
    self.model = models.Sequential()
    self.model.add(layers.Dense(input_shape=(num_states,), units=128,
                                activation='relu'))
    self.model.add(layers.Dense(512, activation='relu'))
    self.model.add(layers.Dense(num_actions))
    self.model.summary()
    # Set how to train the model
    self.model.compile(loss='mse', optimizer=optimizers.SGD())
    self._target_model = models.clone_model(self.model)

def __init__(self, allies, opponents, world_size, n_games, train_batch_size,
             replay_mem_limit, training_rate=10, update_rate=500,
             sim_moves_limit=30, exploration_steps=200000,
             exploration_range=(0.1, 1.0), viz=None, viz_execution=None,
             train_saving=None):
    self.allies = allies
    self.opponents = opponents
    self.world_size = world_size
    self.moves_limit = sim_moves_limit
    self.training_rate = training_rate
    self.policy_dist_rate = update_rate
    self.exploration_steps = exploration_steps
    self.exploration_range = exploration_range
    self.exploration_step_value = \
        (exploration_range[1] - exploration_range[0]) / exploration_steps
    self.experience_replay = ReplayMemory(batch_size=train_batch_size,
                                          table_size=replay_mem_limit)
    self.training_batch_size = train_batch_size
    self.n_games = n_games
    self.replay_mem_limit = replay_mem_limit
    self.environment = Environment(n_rows=world_size[0], n_cols=world_size[1],
                                   n_agents=allies, n_opponents=opponents)
    self.metrics = {"reward": list(), "loss": list()}
    self.viz = viz
    self.viz_execution = viz_execution
    self.train_saving = train_saving

def train():
    # Create the environment
    game = FlappyBird()
    env_1 = PLE(game, fps=30, display_screen=False)
    env_2 = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env_1.getGameState())
    act_dim = len(env_1.getActionSet())
    print('action set:', env_1.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer

    # Build the agent with the PARL framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm, obs_dim=obs_dim, act_dim=act_dim, e_greed=0.3,
                  e_greed_decrement=1e-6)

    # Load a saved model if one exists
    save_path = './flappybird.ckpt'
    if os.path.exists(save_path):
        agent.restore(save_path)

    # Pre-fill the replay memory so the earliest training batches have enough sample diversity
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env_1, agent, rpm)

    max_episode = 2000

    # Start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; evaluation is not counted
        # train
        for i in range(0, 100):
            total_reward, steps = run_episode(env_1, agent, rpm)
            episode += 1
        # test
        eval_reward, steps = evaluate(env_2, agent)
        logger.info('[episode:{}], e_greed:{:.6f}, steps:{}, test_reward:{}'.format(
            episode, agent.e_greed, steps, eval_reward))
        # Save a checkpoint
        ckpt = './models/episode_{}.ckpt'.format(episode)
        agent.save(ckpt)

    # Training finished; save the final model
    save_path = './flappybird.ckpt'
    agent.save(save_path)

class CNNPlayer(Player):
    def __init__(self, agent_filepath=""):
        Player.__init__(self)
        # Create the experience memory database
        if not os.path.exists(REPLAY_MEMORY_FILENAME):
            self.replay_memory = ReplayMemory()
        else:
            self.replay_memory = cPickle.load(open(REPLAY_MEMORY_FILENAME, 'r'))
        # Initialize the convolutional neural network
        self.network = MinecraftNet(agent_filepath)
        self.ae_network = FeatureNet()
        # Probability of selecting non-random action
        self.epsilon = STARTING_EPSILON
        # The total number of frames this agent has been trained on
        # through all the minibatch training
        self.frames_trained = 0
        # Load old epsilon and frames learned values
        self.load()
        self.cnn_action_map = self.initActionMap()
        # The current and previous sequences of game frames and actions
        self.current_seq = None
        self.previous_seq = None
        self.previous_action = None
        # Event logging
        self.log = LogFile("run.log", True)
        # self.log.logMessage("INITIAL NETWORK PARAMS: %s" % str(self.network.solver.net.params['ip1'][0].data[...]))

    # Create a map of all the CNN's legal actions
    # We will be able to pick the best move from this list based on the CNN's output
    def initActionMap(self):
        actions = []
        # Populate with all 18 legal actions
        # (break_block, updown_rot, leftright_rot, forwardback, leftright)
        for break_block in (False, True):
            for leftright_rot in (0.0, AGENT_ROTATION_SPEED, -AGENT_ROTATION_SPEED):
                for forwardback in (0, 1, -1):
                    actions.append(Action.Action(break_block, updown_rot=0.0,
                                                 leftright_rot=leftright_rot,
                                                 forwardback=forwardback,
                                                 leftright=0))
        return actions

    def getActionMapIndex(self, action):
        for i in range(len(self.cnn_action_map)):
            if action == self.cnn_action_map[i]:
                return i
        self.log.logError("ACTION %s NOT FOUND IN ACTION MAP" % str(action))
        sys.exit(1)

    def sequenceForward(self, seq):
        cnn_input = seq.toCNNInput()
        output = self.network.forward(cnn_input)
        return output

    def pickBestAction(self, seq):
        cnn_outputs = self.sequenceForward(seq)
        self.log.logMessage("REINFORCEMENT NET OUTPUT: " + str(cnn_outputs))
        max_output_index = 0
        max_output = cnn_outputs[0]
        for i in range(len(cnn_outputs)):
            if cnn_outputs[i] > max_output:
                max_output = cnn_outputs[i]
                max_output_index = i
        self.log.logMessage("BEST ACTION CHOSEN: %s" %
                            str(self.cnn_action_map[max_output_index]))
        return self.cnn_action_map[max_output_index]

    def pickRandomAction(self):
        return random.choice(self.cnn_action_map)

    def load(self):
        if os.path.exists(CNNPLAYER_SAVE_FILENAME):
            f = open(CNNPLAYER_SAVE_FILENAME, 'r')
            tokens = f.read().split()
            self.epsilon, self.frames_trained = float(tokens[0]), int(tokens[1])
            f.close()

    def save(self):
        # Save the replay memory as a pickled file
        o = open(REPLAY_MEMORY_FILENAME, 'w')
        cPickle.dump(self.replay_memory, o)
        o.close()
        o = open(CNNPLAYER_SAVE_FILENAME, 'w')
        o.write("%.8f %d" % (self.epsilon, self.frames_trained))
        o.close()
        # Log the last network weights
        # self.log.logMessage("FINAL NETWORK PARAMS: %s" % str(self.network.solver.net.params['ip1'][0].data[...]))

    # Train the agent's CNN on a minibatch of Experiences
    def trainMinibatch(self):
        self.log.logMessage("TRAINING MINIBATCH")
        self.frames_trained += TRAINING_BATCH_SIZE
        experiences = self.replay_memory.get_random(TRAINING_BATCH_SIZE)
        inputs = []
        labels = []
        for experience in experiences:
            cnn_outputs = self.sequenceForward(experience.curr_seq)
            # best_action = self.pickBestAction(experience.curr_seq)
            target_vector = []
            for act in cnn_outputs:
                # act = cnn_outputs[act_id]
                act_target = experience.curr_reward + GAMMA * act
                target_vector.append(act_target)
            # target = experience.curr_reward + GAMMA * best_action_output
            inputs.append(experience.prev_seq)
            labels.append(target_vector)
            # dataset.append((experience.prev_seq, target))
        # Do gradient descent to minimize (target - network.forward(experience.prev_seq)) ^ 2
        # print("INPUTS:", inputs)
        # print("LABELS:", labels)
        # self.network.set_input_data(inputs, labels)
        self.network.set_train_input_data(inputs, labels)
        self.network.train(BATCH_TRAINING_ITERATIONS)  # train for a single iteration

    # Receive the agent's reward from its previous Action along with
    # a Frame screenshot of the current game state
    def getDecision(self, current_frame):
        self.log.logMessage("DECISION #%d in GAME FRAME #%d" %
                            (self.actions_performed, self.game.world_counter))
        self.log.logMessage("TRAINED ON %d FRAMES" % (self.frames_trained))
        features = self.ae_network.encodeNumpyArray(current_frame.pixels)
        # self.log.logMessage("Current frame yields features: %s" % str(features))
        if self.previous_reward != 0:
            self.log.logMessage("GOT REWARD: %d" % self.previous_reward)
            self.total_score += self.previous_reward
        # First frame of game
        if self.actions_performed == 0:
            self.actions_performed += 1
            self.previous_seq = Sequence(features)
            # print("FRAME SEQUENCE: {0}".format(self.previous_seq))
            curr_action = self.pickRandomAction()
            self.previous_seq = self.previous_seq.createNewSequence(curr_action)
            self.previous_action = curr_action
            # print("FIRST SEQUENCE: {0}".format(self.previous_seq))
            return
        # Should I make a random move?
        r = random.random()
        # Add on the current frame to the current sequence
        self.current_seq = self.previous_seq.createNewSequence(features)
        if r > self.epsilon or self.actions_performed < 4:  # not self.current_seq.isFull():
            curr_action = self.pickRandomAction()
        else:
            # Run the CNN and pick the max output action
            curr_action = self.pickBestAction(self.current_seq)
        # Finally, add the chosen action to the current sequence
        self.current_seq = self.current_seq.createNewSequence(curr_action)
        # Actually perform the action in the game
        self.performAction(curr_action)
        new_experience = Experience(self.previous_seq, self.previous_action,
                                    self.previous_reward, self.current_seq)
        self.replay_memory.store(new_experience)
        self.previous_seq = self.current_seq
        if (self.game.world_counter > STARTING_FRAMES and
                self.game.world_counter % BATCH_TRAINING_FREQUENCY == 0):
            self.trainMinibatch()
        # Remember the chosen Action since it will be required for the next iteration
        self.previous_action = curr_action
        if self.epsilon < MAX_EPSILON:
            self.epsilon *= EPSILON_UPDATE
            self.log.logMessage("UPDATED EPSILON: %.5f" % self.epsilon)