Example #1
	def __init__(self, sess, *, inpDim = 180, nGames = 8, nSteps = 15, nMiniBatches = 4, nOptEpochs = 5, lam = 0.95, gamma = 0.995, ent_coef = 0.01, vf_coef = 0.5, max_grad_norm = 0.5, minLearningRate = 0.000001, learningRate, clipRange, saveEvery = 10):
		"""
		nGames:		number of separate games played (in parallel)
		nSteps:		each game is rolled out for nSteps time steps, after which
					advantage estimates are made and training occurs for K epochs
		M = mini-batch size
		"""

		#network/model for training
		self.trainingNetwork = PPONetwork(sess, inpDim, 60, "trainNet")
		self.trainingModel = PPOModel(sess, self.trainingNetwork, inpDim, 60, ent_coef, vf_coef, max_grad_norm)

		#player networks which make the decisions - allowing for experimenting later on with playing against older versions of the network (so the decisions they make are not trained on).
		self.playerNetworks = {}

		#for now each player uses the same (up to date) network to make its decisions.
		self.playerNetworks[1] = self.playerNetworks[2] = self.playerNetworks[3] = self.playerNetworks[4] = self.trainingNetwork
		self.trainOnPlayer = [True, True, True, True]# Lena should lose

		tf.global_variables_initializer().run(session=sess)

		#environment
		self.vectorizedGame = vectorizedBig2Games(nGames)

		#params
		self.nGames = nGames
		self.inpDim = inpDim
		self.nSteps = nSteps
		self.nMiniBatches = nMiniBatches
		self.nOptEpochs = nOptEpochs
		self.lam = lam
		self.gamma = gamma
		self.learningRate = learningRate
		self.minLearningRate = minLearningRate
		self.clipRange = clipRange
		self.saveEvery = saveEvery

		self.rewardNormalization = 1.0 #was 5.0 before!!! --- TODO? --- divide rewards by this number (so reward ranges from -1.0 to 3.0)

		#test networks - keep network saved periodically and run test games against current network
		self.testNetworks = {}

		# final 4 observations need to be carried over (for value estimation and propagating rewards back)
		self.prevObs = []
		self.prevGos = []
		self.prevAvailAcs = []
		self.prevRewards = []
		self.prevActions = []
		self.prevValues = []
		self.prevDones = []
		self.prevNeglogpacs = []

		#episode/training information
		self.totTrainingSteps = 0
		self.epInfos = []
		self.gamesDone = 0
		self.losses = []
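For orientation, the sizes these defaults imply once the run/train methods shown in the later examples are in play: each update gathers nSteps transitions from each of the nGames parallel games (the last four steps are carried over to the next rollout), and the flattened batch is split into nMiniBatches minibatches for nOptEpochs optimization epochs. A quick arithmetic sketch using the defaults of this constructor:

# Defaults from the constructor above: nGames=8, nSteps=15, nMiniBatches=4, nOptEpochs=5
nGames, nSteps, nMiniBatches, nOptEpochs = 8, 15, 4, 5

batch_size = nGames * nSteps                 # 120 transitions per update in steady state
                                             # (the very first rollout is 4 steps shorter)
minibatch_size = batch_size // nMiniBatches  # 30 transitions per gradient step
gradient_steps = nOptEpochs * nMiniBatches   # 20 gradient steps per update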
Example #2
 def __init__(self, sess, *, inpDim = 412, nGames = 8, nSteps = 20, nMiniBatches = 4, nOptEpochs = 5, lam = 0.95, gamma = 0.995, ent_coef = 0.01, vf_coef = 0.5, max_grad_norm = 0.5, minLearningRate = 0.000001, learningRate, clipRange, saveEvery = 500):
     
     #network/model for training
     self.trainingNetwork = PPONetwork(sess, inpDim, 1695, "trainNet")
     self.trainingModel = PPOModel(sess, self.trainingNetwork, inpDim, 1695, ent_coef, vf_coef, max_grad_norm)
     
     #player networks which choose decisions - allowing for later on experimenting with playing against older versions of the network (so decisions they make are not trained on).
     self.playerNetworks = {}
     
     #for now each player uses the same (up to date) network to make its decisions.
     self.playerNetworks[1] = self.playerNetworks[2] = self.playerNetworks[3] = self.playerNetworks[4] = self.trainingNetwork
     self.trainOnPlayer = [True, True, True, True]
     
     tf.global_variables_initializer().run(session=sess)
     
     #environment
     self.vectorizedGame = vectorizedBig2Games(nGames)
     
     #params
     self.nGames = nGames
     self.inpDim = inpDim
     self.nSteps = nSteps
     self.nMiniBatches = nMiniBatches
     self.nOptEpochs = nOptEpochs
     self.lam = lam
     self.gamma = gamma
     self.learningRate = learningRate
     self.minLearningRate = minLearningRate
     self.clipRange = clipRange
     self.saveEvery = saveEvery
     
     self.rewardNormalization = 5.0 #divide rewards by this number (so reward ranges from -1.0 to 3.0)
     
     #test networks - keep network saved periodically and run test games against current network
     self.testNetworks = {}
     
     # final 4 observations need to be carried over (for value estimation and propagating rewards back)
     self.prevObs = []
     self.prevGos = []
     self.prevAvailAcs = []
     self.prevRewards = []
     self.prevActions = []
     self.prevValues = []
     self.prevDones = []
     self.prevNeglogpacs = []
     
     #episode/training information
     self.totTrainingSteps = 0
     self.epInfos = []
     self.gamesDone = 0
     self.losses = []
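Because of the bare * in the signature, learningRate and clipRange are required keyword-only arguments with no defaults. A minimal usage sketch, assuming these constructors belong to the big2PPOSimulation class shown in the later examples, a TensorFlow 1.x environment (to match the tf.global_variables_initializer() call above), and purely illustrative hyperparameter values:

import tensorflow as tf

with tf.Session() as sess:
    # learningRate and clipRange must be passed explicitly by keyword;
    # the values below are illustrative, not taken from the original project.
    sim = big2PPOSimulation(sess, learningRate=0.00025, clipRange=0.2)
    sim.train(nTotalSteps=1000000)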
Example #3
class big2PPOSimulation(object):
    def __init__(self,
                 sess,
                 *,
                 inpDim=412,
                 nGames=8,
                 nSteps=20,
                 nMiniBatches=4,
                 nOptEpochs=5,
                 lam=0.95,
                 gamma=0.995,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 minLearningRate=0.000001,
                 learningRate,
                 clipRange,
                 saveEvery=500):

        #network/model for training
        self.trainingNetwork = PPONetwork(sess, inpDim, 1695, "trainNet")
        self.trainingModel = PPOModel(sess, self.trainingNetwork, inpDim, 1695,
                                      ent_coef, vf_coef, max_grad_norm)

        #player networks which make the decisions - allowing for experimenting later on with playing against older versions of the network (so the decisions they make are not trained on).
        self.playerNetworks = {}

        #for now each player uses the same (up to date) network to make its decisions.
        self.playerNetworks[1] = self.playerNetworks[2] = self.playerNetworks[
            3] = self.playerNetworks[4] = self.trainingNetwork
        self.trainOnPlayer = [True, True, True, True]

        tf.global_variables_initializer().run(session=sess)

        #environment
        self.vectorizedGame = vectorizedBig2Games(nGames)

        #params
        self.nGames = nGames
        self.inpDim = inpDim
        self.nSteps = nSteps
        self.nMiniBatches = nMiniBatches
        self.nOptEpochs = nOptEpochs
        self.lam = lam
        self.gamma = gamma
        self.learningRate = learningRate
        self.minLearningRate = minLearningRate
        self.clipRange = clipRange
        self.saveEvery = saveEvery

        self.rewardNormalization = 5.0  #divide rewards by this number (so reward ranges from -1.0 to 3.0)

        #test networks - keep network saved periodically and run test games against current network
        self.testNetworks = {}

        # final 4 observations need to be carried over (for value estimation and propagating rewards back)
        self.prevObs = []
        self.prevGos = []
        self.prevAvailAcs = []
        self.prevRewards = []
        self.prevActions = []
        self.prevValues = []
        self.prevDones = []
        self.prevNeglogpacs = []

        #episode/training information
        self.totTrainingSteps = 0
        self.epInfos = []
        self.gamesDone = 0
        self.losses = []

    def run(self):
        #run vectorized games for nSteps and generate mini batch to train on.
        mb_obs, mb_pGos, mb_actions, mb_values, mb_neglogpacs, mb_rewards, mb_dones, mb_availAcs = [], [], [], [], [], [], [], []
        for i in range(len(self.prevObs)):
            mb_obs.append(self.prevObs[i])
            mb_pGos.append(self.prevGos[i])
            mb_actions.append(self.prevActions[i])
            mb_values.append(self.prevValues[i])
            mb_neglogpacs.append(self.prevNeglogpacs[i])
            mb_rewards.append(self.prevRewards[i])
            mb_dones.append(self.prevDones[i])
            mb_availAcs.append(self.prevAvailAcs[i])
        if len(self.prevObs) == 4:
            endLength = self.nSteps
        else:
            endLength = self.nSteps - 4
        for _ in range(self.nSteps):
            currGos, currStates, currAvailAcs = self.vectorizedGame.getCurrStates(
            )
            currStates = np.squeeze(currStates)
            currAvailAcs = np.squeeze(currAvailAcs)
            currGos = np.squeeze(currGos)
            actions, values, neglogpacs = self.trainingNetwork.step(
                currStates, currAvailAcs)
            rewards, dones, infos = self.vectorizedGame.step(actions)
            mb_obs.append(currStates.copy())
            mb_pGos.append(currGos)
            mb_availAcs.append(currAvailAcs.copy())
            mb_actions.append(actions)
            mb_values.append(values)
            mb_neglogpacs.append(neglogpacs)
            mb_dones.append(list(dones))
            #now back assign rewards if state is terminal
            toAppendRewards = np.zeros((self.nGames, ))
            mb_rewards.append(toAppendRewards)
            for i in range(self.nGames):
                if dones[i] == True:
                    reward = rewards[i]
                    mb_rewards[-1][i] = reward[mb_pGos[-1][i] -
                                               1] / self.rewardNormalization
                    mb_rewards[-2][i] = reward[mb_pGos[-2][i] -
                                               1] / self.rewardNormalization
                    mb_rewards[-3][i] = reward[mb_pGos[-3][i] -
                                               1] / self.rewardNormalization
                    mb_rewards[-4][i] = reward[mb_pGos[-4][i] -
                                               1] / self.rewardNormalization
                    mb_dones[-2][i] = True
                    mb_dones[-3][i] = True
                    mb_dones[-4][i] = True
                    self.epInfos.append(infos[i])
                    self.gamesDone += 1
                    print("Game %d finished. Lasted %d turns" %
                          (self.gamesDone, infos[i]['numTurns']))
        self.prevObs = mb_obs[endLength:]
        self.prevGos = mb_pGos[endLength:]
        self.prevRewards = mb_rewards[endLength:]
        self.prevActions = mb_actions[endLength:]
        self.prevValues = mb_values[endLength:]
        self.prevDones = mb_dones[endLength:]
        self.prevNeglogpacs = mb_neglogpacs[endLength:]
        self.prevAvailAcs = mb_availAcs[endLength:]
        mb_obs = np.asarray(mb_obs, dtype=np.float32)[:endLength]
        mb_availAcs = np.asarray(mb_availAcs, dtype=np.float32)[:endLength]
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)[:endLength]
        mb_actions = np.asarray(mb_actions, dtype=np.float32)[:endLength]
        mb_values = np.asarray(mb_values, dtype=np.float32)
        mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)[:endLength]
        mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool was removed in newer NumPy
        #discount/bootstrap value function with generalized advantage estimation:
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        for k in range(4):
            lastgaelam = 0
            for t in reversed(range(k, endLength, 4)):
                nextNonTerminal = 1.0 - mb_dones[t]
                nextValues = mb_values[t + 4]
                delta = mb_rewards[
                    t] + self.gamma * nextValues * nextNonTerminal - mb_values[
                        t]
                mb_advs[
                    t] = lastgaelam = delta + self.gamma * self.lam * nextNonTerminal * lastgaelam

        mb_values = mb_values[:endLength]
        #mb_dones = mb_dones[:endLength]
        mb_returns = mb_advs + mb_values

        return map(sf01, (mb_obs, mb_availAcs, mb_returns, mb_actions,
                          mb_values, mb_neglogpacs))

    def train(self, nTotalSteps):

        nUpdates = nTotalSteps // (self.nGames * self.nSteps)

        for update in range(nUpdates):

            alpha = 1.0 - update / nUpdates
            lrnow = self.learningRate * alpha
            if lrnow < self.minLearningRate:
                lrnow = self.minLearningRate
            cliprangenow = self.clipRange * alpha

            states, availAcs, returns, actions, values, neglogpacs = self.run()

            batchSize = states.shape[0]
            self.totTrainingSteps += batchSize

            nTrainingBatch = batchSize // self.nMiniBatches

            currParams = self.trainingNetwork.getParams()

            mb_lossvals = []
            inds = np.arange(batchSize)
            for _ in range(self.nOptEpochs):
                np.random.shuffle(inds)
                for start in range(0, batchSize, nTrainingBatch):
                    end = start + nTrainingBatch
                    mb_inds = inds[start:end]
                    mb_lossvals.append(
                        self.trainingModel.train(
                            lrnow, cliprangenow, states[mb_inds],
                            availAcs[mb_inds], returns[mb_inds],
                            actions[mb_inds], values[mb_inds],
                            neglogpacs[mb_inds]))
            lossvals = np.mean(mb_lossvals, axis=0)
            self.losses.append(lossvals)

            newParams = self.trainingNetwork.getParams()
            needToReset = 0
            for param in newParams:
                if np.sum(np.isnan(param)) > 0:
                    needToReset = 1

            if needToReset == 1:
                self.trainingNetwork.loadParams(currParams)

            if update % self.saveEvery == 0:
                name = "modelParameters" + str(update)
                self.trainingNetwork.saveParams(name)
                joblib.dump(self.losses, "losses.pkl")
                joblib.dump(self.epInfos, "epInfos.pkl")
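run() hands its arrays through a helper sf01 that is not shown in this listing. In baselines-style PPO code this helper swaps and flattens the first two axes (steps and games) into a single batch axis; a minimal sketch under that assumption:

import numpy as np

def sf01(arr):
    # Swap and then flatten axes 0 and 1: (nSteps, nGames, ...) -> (nSteps * nGames, ...),
    # the flat batch layout that the minibatch indexing in train() expects.
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])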
Example #4
    def __init__(self,
                 sess,
                 *,
                 games_per_batch=5,
                 training_steps_per_game=5,
                 lam=0.95,
                 gamma=0.995,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 min_learning_rate=0.000001,
                 learning_rate,
                 clip_range,
                 save_every=100):
        """
        Constructor.
        Sets up the parameters of the training networks and the algorithm.
        :param sess: Tensorflow session in which to run.
        :param games_per_batch: Number of games in each batch.
        :param training_steps_per_game: Number of training steps performed in each game.
        :param lam: lambda parameter for generalized advantage estimation (GAE).
        :param gamma: discount factor.
        :param ent_coef: entropy coefficient in the PPO loss.
        :param vf_coef: value-function coefficient in the PPO loss.
        :param max_grad_norm: maximum gradient norm used for gradient clipping.
        :param min_learning_rate: minimal learning rate.
        :param learning_rate: learning rate of the players.
        :param clip_range: PPO clipping range, limiting how far the policy can move in a single update.
        :param save_every: how often (in updates) the model parameters are saved.
        """
        # network/model for training
        output_dim = PPOPlayer.output_dim
        input_dim = PPOPlayer.input_dim
        self.trainingNetwork = PPONetwork(sess, input_dim, output_dim,
                                          "trainNet")
        self.trainingModel = PPOModel(sess, self.trainingNetwork, input_dim,
                                      output_dim, ent_coef, vf_coef,
                                      max_grad_norm)

        # player networks which make the decisions -
        # allowing for experimenting later on with playing against older versions of the network (so the decisions they make are not trained on).
        self.playerNetworks = dict()

        # for now each player uses the same (up to date) network to make its decisions.
        self.playerNetworks[1] = self.playerNetworks[2] = self.playerNetworks[
            3] = self.playerNetworks[4] = self.trainingNetwork
        self.trainOnPlayer = [True, True, True, True]

        tf.compat.v1.global_variables_initializer().run(session=sess)

        self.players = [
            PPOPlayer(DurakEnv.HAND_SIZE, "PPO 0", self.playerNetworks[1]),
            PPOPlayer(DurakEnv.HAND_SIZE, "PPO 1", self.playerNetworks[2]),
            PPOPlayer(DurakEnv.HAND_SIZE, "PPO 2", self.playerNetworks[3]),
            PPOPlayer(DurakEnv.HAND_SIZE, "PPO 3", self.playerNetworks[4])
        ]

        # environment
        game = DurakEnv(self.players, False)
        self.state = game.reset()
        self.vectorizedGame = game

        # params
        self.games_per_batch = games_per_batch
        self.training_steps_per_game = training_steps_per_game
        self.inpDim = input_dim
        self.lam = lam
        self.gamma = gamma
        self.learningRate = learning_rate
        self.minLearningRate = min_learning_rate
        self.clipRange = clip_range
        self.saveEvery = save_every

        self.rewardNormalization = 5.0  # divide rewards by this number (so reward ranges from -1.0 to 3.0)

        # test networks - keep network saved periodically and run test games against current network
        self.testNetworks = {}

        # final 4 observations need to be carried over (for value estimation and propagating rewards back)
        self.prevObs = []
        self.prevGos = []
        self.prevAvailAcs = []
        self.prevRewards = []
        self.prevActions = []
        self.prevValues = []
        self.prevDones = []
        self.prevNeglogpacs = []

        # episode/training information
        self.totTrainingSteps = 0
        self.gamesDone = 0
        self.losses = []

        logging.info("finished PPO Trainers init")
Example #5
class PPOTrainer(object):
    def __init__(self,
                 sess,
                 *,
                 games_per_batch=5,
                 training_steps_per_game=5,
                 lam=0.95,
                 gamma=0.995,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 min_learning_rate=0.000001,
                 learning_rate,
                 clip_range,
                 save_every=100):
        """
        Constructor.
        Sets up the parameters of the training networks and the algorithm.
        :param sess: Tensorflow session in which to run.
        :param games_per_batch: Number of games in each batch.
        :param training_steps_per_game: Number of training steps performed in each game.
        :param lam: lambda parameter for generalized advantage estimation (GAE).
        :param gamma: discount factor.
        :param ent_coef: entropy coefficient in the PPO loss.
        :param vf_coef: value-function coefficient in the PPO loss.
        :param max_grad_norm: maximum gradient norm used for gradient clipping.
        :param min_learning_rate: minimal learning rate.
        :param learning_rate: learning rate of the players.
        :param clip_range: PPO clipping range, limiting how far the policy can move in a single update.
        :param save_every: how often (in updates) the model parameters are saved.
        """
        # network/model for training
        output_dim = PPOPlayer.output_dim
        input_dim = PPOPlayer.input_dim
        self.trainingNetwork = PPONetwork(sess, input_dim, output_dim,
                                          "trainNet")
        self.trainingModel = PPOModel(sess, self.trainingNetwork, input_dim,
                                      output_dim, ent_coef, vf_coef,
                                      max_grad_norm)

        # player networks which make the decisions -
        # allowing for experimenting later on with playing against older versions of the network (so the decisions they make are not trained on).
        self.playerNetworks = dict()

        # for now each player uses the same (up to date) network to make its decisions.
        self.playerNetworks[1] = self.playerNetworks[2] = self.playerNetworks[
            3] = self.playerNetworks[4] = self.trainingNetwork
        self.trainOnPlayer = [True, True, True, True]

        tf.compat.v1.global_variables_initializer().run(session=sess)

        self.players = [
            PPOPlayer(DurakEnv.HAND_SIZE, "PPO 0", self.playerNetworks[1]),
            PPOPlayer(DurakEnv.HAND_SIZE, "PPO 1", self.playerNetworks[2]),
            PPOPlayer(DurakEnv.HAND_SIZE, "PPO 2", self.playerNetworks[3]),
            PPOPlayer(DurakEnv.HAND_SIZE, "PPO 3", self.playerNetworks[4])
        ]

        # environment
        game = DurakEnv(self.players, False)
        self.state = game.reset()
        self.vectorizedGame = game

        # params
        self.games_per_batch = games_per_batch
        self.training_steps_per_game = training_steps_per_game
        self.inpDim = input_dim
        self.lam = lam
        self.gamma = gamma
        self.learningRate = learning_rate
        self.minLearningRate = min_learning_rate
        self.clipRange = clip_range
        self.saveEvery = save_every

        self.rewardNormalization = 5.0  # divide rewards by this number (so reward ranges from -1.0 to 3.0)

        # test networks - keep network saved periodically and run test games against current network
        self.testNetworks = {}

        # final 4 observations need to be carried over (for value estimation and propagating rewards back)
        self.prevObs = []
        self.prevGos = []
        self.prevAvailAcs = []
        self.prevRewards = []
        self.prevActions = []
        self.prevValues = []
        self.prevDones = []
        self.prevNeglogpacs = []

        # episode/training information
        self.totTrainingSteps = 0
        self.gamesDone = 0
        self.losses = []

        logging.info("finished PPO Trainers init")

    def run(
        self
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray,
               np.ndarray]:
        """
        Runs a game for a number of steps and collects information of the game.
        :return: information collected from the game.
        """
        # run vectorized games for nSteps and generate mini batch to train on.
        mb_obs, mb_pGos, mb_actions, mb_values, mb_neglogpacs, mb_rewards, mb_dones, mb_availAcs = [], [], [], [], [], [], [], []
        done = False
        game = self.vectorizedGame
        state = game.reset()
        steps = 0
        while not done and steps < 500:
            turn_player = game.get_turn_player()
            available_actions = game.get_available_actions()
            action = turn_player.get_action(state, game.to_attack())
            value, neglogpac = turn_player.get_val_neglogpac()
            new_state, reward, done = game.step(action)  # update the game

            # add to list
            mb_obs.append(turn_player.last_converted_state.flatten())
            mb_pGos.append(turn_player)
            mb_actions.append(Deck.get_index_from_card(action))
            mb_values.append(value[0])
            mb_neglogpacs.append(neglogpac[0])
            mb_rewards.append(reward)
            mb_dones.append(done)
            mb_availAcs.append(
                turn_player.last_converted_available_cards.flatten())

            # update current state
            state = new_state
            steps += 1

        # add dones to last plays
        for i in range(1, len(game.players) + 1):
            mb_dones[-i] = True

        # convert to numpy and finish game
        self.gamesDone += 1
        self.vectorizedGame = game
        self.state = state
        mb_obs = np.asarray(tuple(mb_obs), dtype=np.float64)
        mb_availAcs = np.asarray(tuple(mb_availAcs), dtype=np.float64)
        mb_rewards = np.asarray(tuple(mb_rewards), dtype=np.float64)
        mb_actions = np.asarray(tuple(mb_actions), dtype=np.int64)
        mb_values = np.asarray(tuple(mb_values), dtype=np.float64)
        mb_neglogpacs = np.asarray(tuple(mb_neglogpacs), dtype=np.float64)
        mb_dones = np.asarray(tuple(mb_dones), dtype=bool)  # np.bool was removed in newer NumPy

        # discount/bootstrap value function with generalized advantage estimation:
        mb_returns = np.zeros_like(mb_rewards)
        mb_advs = np.zeros_like(mb_rewards)
        for k in range(4):
            lastgaelam = 0
            for t in reversed(range(k,
                                    len(mb_rewards) - 4, len(game.players))):
                nextNonTerminal = 1.0 - mb_dones[t]
                nextValues = mb_values[t + len(game.players)]
                delta = mb_rewards[
                    t] + self.gamma * nextValues * nextNonTerminal - mb_values[
                        t]
                mb_advs[
                    t] = lastgaelam = delta + self.gamma * self.lam * nextNonTerminal * lastgaelam

        mb_returns = mb_advs + mb_values

        return mb_obs, mb_availAcs, mb_returns, mb_actions, mb_values, mb_neglogpacs

    def get_batch(
        self
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray,
               np.ndarray]:
        """
        :return: Information regarding a batch of games for training.
        """
        states, availAcs, returns, actions, values, neglogpacs = [], [], [], [], [], []
        for _ in range(self.games_per_batch):
            st, av, re, ac, va, ne = self.run()
            states.append(st)
            availAcs.append(av)
            returns.append(re)
            actions.append(ac)
            values.append(va)
            neglogpacs.append(ne)

        # convert to numpy
        states = np.asarray(tuple(states), dtype=object)
        availAcs = np.asarray(tuple(availAcs), dtype=object)
        returns = np.asarray(tuple(returns), dtype=object)
        actions = np.asarray(tuple(actions), dtype=object)
        values = np.asarray(tuple(values), dtype=object)
        neglogpacs = np.asarray(tuple(neglogpacs), dtype=object)

        logging.info("Finished getting a batch")

        return states, availAcs, returns, actions, values, neglogpacs

    def train(self, total_num_games) -> None:
        """
        Trains the PPO players against themselves.
        :param total_num_games: Number of training games.
        """

        nUpdates = total_num_games // self.games_per_batch

        for update in range(1, nUpdates + 1):

            alpha = 1.0 - update / nUpdates
            lrnow = self.learningRate * alpha
            if lrnow < self.minLearningRate:
                lrnow = self.minLearningRate
            cliprangenow = self.clipRange * alpha

            states, availAcs, returns, actions, values, neglogpacs = self.get_batch(
            )

            curr_params = self.trainingNetwork.getParams()
            mb_lossvals = []

            # train on the games in a shuffled order (np.random.randint with an exclusive
            # upper bound would skip the last game and sample with replacement)
            for game_idx in np.random.permutation(self.games_per_batch):
                steps = 0
                while steps + self.training_steps_per_game < states[
                        game_idx].shape[
                            0]:  # less than the amount of steps (actions) in the game
                    mb_inds = np.arange(steps,
                                        steps + self.training_steps_per_game)
                    mb_lossvals.append(
                        self.trainingModel.train(
                            lrnow, cliprangenow, states[game_idx][mb_inds],
                            availAcs[game_idx][mb_inds],
                            returns[game_idx][mb_inds],
                            actions[game_idx][mb_inds],
                            values[game_idx][mb_inds],
                            neglogpacs[game_idx][mb_inds]))
                    if steps == states[game_idx].shape[0] - 1:
                        break
                    if steps + self.training_steps_per_game < states[
                            game_idx].shape[0]:
                        # basic step
                        steps += self.training_steps_per_game
                    else:
                        # go over last indices, which are less than self.training_steps_per_game
                        steps = states[game_idx].shape[
                            0] - self.training_steps_per_game - 1

            logging.info("Finished training in update num: %s" % update *
                         self.games_per_batch)

            lossvals = np.mean(mb_lossvals, axis=0)
            self.losses.append(lossvals)

            new_params = self.trainingNetwork.getParams()
            for param in new_params:
                if np.sum(np.isnan(param)) > 0:
                    # remove changes in network
                    self.trainingNetwork.loadParams(curr_params)
                    logging.warning(
                        "Had to reset the params in update num: %s" % update)
                    break

            if update % self.saveEvery == 0:
                name = "PPOParams/model" + str(update * self.games_per_batch)
                self.trainingNetwork.saveParams(name)
                joblib.dump(self.losses, "losses.pkl")

            print("finished " + str(update * self.games_per_batch))
Example #6
class big2PPOSimulation(object):

	def __init__(self, sess, *, inpDim = 180, nGames = 8, nSteps = 15, nMiniBatches = 4, nOptEpochs = 5, lam = 0.95, gamma = 0.995, ent_coef = 0.01, vf_coef = 0.5, max_grad_norm = 0.5, minLearningRate = 0.000001, learningRate, clipRange, saveEvery = 10):
		"""
		nGames:		number of separate games played (in parallel)
		nSteps:		each game is rolled out for nSteps time steps, after which
					advantage estimates are made and training occurs for K epochs
		M = mini-batch size
		"""

		#network/model for training
		self.trainingNetwork = PPONetwork(sess, inpDim, 60, "trainNet")
		self.trainingModel = PPOModel(sess, self.trainingNetwork, inpDim, 60, ent_coef, vf_coef, max_grad_norm)

		#player networks which make the decisions - allowing for experimenting later on with playing against older versions of the network (so the decisions they make are not trained on).
		self.playerNetworks = {}

		#for now each player uses the same (up to date) network to make its decisions.
		self.playerNetworks[1] = self.playerNetworks[2] = self.playerNetworks[3] = self.playerNetworks[4] = self.trainingNetwork
		self.trainOnPlayer = [True, True, True, True]# Lena should lose

		tf.global_variables_initializer().run(session=sess)

		#environment
		self.vectorizedGame = vectorizedBig2Games(nGames)

		#params
		self.nGames = nGames
		self.inpDim = inpDim
		self.nSteps = nSteps
		self.nMiniBatches = nMiniBatches
		self.nOptEpochs = nOptEpochs
		self.lam = lam
		self.gamma = gamma
		self.learningRate = learningRate
		self.minLearningRate = minLearningRate
		self.clipRange = clipRange
		self.saveEvery = saveEvery

		self.rewardNormalization = 1.0 #was 5.0 before!!! --- TODO? --- divide rewards by this number (so reward ranges from -1.0 to 3.0)

		#test networks - keep network saved periodically and run test games against current network
		self.testNetworks = {}

		# final 4 observations need to be carried over (for value estimation and propagating rewards back)
		self.prevObs = []
		self.prevGos = []
		self.prevAvailAcs = []
		self.prevRewards = []
		self.prevActions = []
		self.prevValues = []
		self.prevDones = []
		self.prevNeglogpacs = []

		#episode/training information
		self.totTrainingSteps = 0
		self.epInfos = []
		self.gamesDone = 0
		self.losses = []

	def run(self):
		#run vectorized games for nSteps and generate mini batch to train on.
		mb_obs, mb_pGos, mb_actions, mb_values, mb_neglogpacs, mb_rewards, mb_dones, mb_availAcs = [], [], [], [], [], [], [], []
		for i in range(len(self.prevObs)):
			mb_obs.append(self.prevObs[i])
			mb_pGos.append(self.prevGos[i])
			mb_actions.append(self.prevActions[i])
			mb_values.append(self.prevValues[i])
			mb_neglogpacs.append(self.prevNeglogpacs[i])
			mb_rewards.append(self.prevRewards[i])
			mb_dones.append(self.prevDones[i])
			mb_availAcs.append(self.prevAvailAcs[i])
		if len(self.prevObs) == 4:
			endLength = self.nSteps
		else:
			endLength = self.nSteps-4

		# steps: indicate every move of one player!
		for step in range(self.nSteps):#8 steps.
			# for 2 parallel games:
			# currGos      = [0 0] -> array of active player
			# currStates   = [[[0...1]] [[0...1]]] -> avail. Cards
			# currAvailAcs = [[[inf...0]] [[inf...0]]] -> curr AvailAcs
			# actions      = [6 10]  -> idx of action to play?
			# values       = [0.19484621 0.23115599]
			# neglogpacs   = [2.5639274 2.6337652]
			# rewards      =  (array([0., 0., 0., 0.]), array([ 0., -6.,  0.,  0.]))
			# dones        =  (False, False)
			# self.nGames  =  2
			# mb_rewards   -> len(mb_rewards) 5....12, 5....12,...
			# mb_pGos is the active player.

			currGos, currStates, currAvailAcs = self.vectorizedGame.getCurrStates()
			currStates = np.squeeze(currStates)
			currAvailAcs = np.squeeze(currAvailAcs)
			currGos = np.squeeze(currGos)
			actions, values, neglogpacs = self.trainingNetwork.step(currStates, currAvailAcs)
			rewards, dones, infos = self.vectorizedGame.step(actions)
			print("Step:", i, "in run:", mb_pGos)
			infos = [t for t in infos if t]# drop empty entries
			mb_obs.append(currStates.copy())
			mb_pGos.append(currGos)
			mb_availAcs.append(currAvailAcs.copy())
			mb_actions.append(actions)
			mb_values.append(values)
			mb_neglogpacs.append(neglogpacs)
			mb_dones.append(list(dones))
			#now back assign rewards if state is terminal

			# assign rewards correctly:
			# every 4 steps, since a full round (one move per player) is then complete.

			toAppendRewards = np.zeros((self.nGames,))
			#mb_rewards.append(list(rewards))
			mb_rewards.append(toAppendRewards)#toAppendRewards
			for i in range(self.nGames):
				try:
					reward = rewards[i]
					mb_rewards[-1][i] = reward[mb_pGos[-1][i]-1] / self.rewardNormalization
					print("rewards:", mb_rewards[-1][i], "curr_play", mb_pGos[-1][i]-1, "zuordn", reward[mb_pGos[-1][i]-1])
					mb_rewards[-2][i] = reward[mb_pGos[-2][i]-1] / self.rewardNormalization
					mb_rewards[-3][i] = reward[mb_pGos[-3][i]-1] / self.rewardNormalization
					mb_rewards[-4][i] = reward[mb_pGos[-4][i]-1] / self.rewardNormalization
				except Exception as e:
					print("excpetion not possible len issues")
					print(e)
				if dones[i] == True:
					#rewards were already back-assigned above; only mark the other players' final moves as done

					mb_dones[-2][i] = True
					mb_dones[-3][i] = True
					mb_dones[-4][i] = True
					self.gamesDone += 1
					#print("Games Done:", self.gamesDone, "Rewards:", reward)
			if len(infos)>0:
				self.epInfos.append(infos)# appends too much?
		self.prevObs = mb_obs[endLength:]
		self.prevGos = mb_pGos[endLength:]
		self.prevRewards = mb_rewards[endLength:]
		self.prevActions = mb_actions[endLength:]
		self.prevValues = mb_values[endLength:]
		self.prevDones = mb_dones[endLength:]
		self.prevNeglogpacs = mb_neglogpacs[endLength:]
		self.prevAvailAcs = mb_availAcs[endLength:]
		mb_obs      = np.asarray(mb_obs, dtype=np.float32)[:endLength]
		mb_availAcs = np.asarray(mb_availAcs, dtype=np.float32)[:endLength]
		mb_rewards  = np.asarray(mb_rewards, dtype=np.float32)[:endLength]

		# mb_rewards = [ [0 0], ...], len(mb_rewards) = 8 = nSteps
		mb_actions  = np.asarray(mb_actions, dtype=np.float32)[:endLength]
		mb_values   = np.asarray(mb_values, dtype=np.float32)
		mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)[:endLength]
		mb_dones = np.asarray(mb_dones, dtype=bool)  # np.bool was removed in newer NumPy
		#discount/bootstrap value function with generalized advantage estimation:
		mb_returns = np.zeros_like(mb_rewards)
		mb_advs = np.zeros_like(mb_rewards)
		for k in range(4):
			lastgaelam = 0
			for t in reversed(range(k, endLength, 4)):
				# t=0, 1, 2, 3, 4, 0, 5, 1, 6, 2, 7, 3
				nextNonTerminal = 1.0 - mb_dones[t]
				nextValues = mb_values[t+4]
				delta = mb_rewards[t] + self.gamma * nextValues * nextNonTerminal - mb_values[t]
				mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextNonTerminal * lastgaelam

		mb_values = mb_values[:endLength]
		#mb_dones = mb_dones[:endLength]
		mb_returns = mb_advs + mb_values

		return map(sf01, (mb_obs, mb_availAcs, mb_returns, mb_actions, mb_values, mb_neglogpacs))

	def train(self, nTotalSteps):

		nUpdates = nTotalSteps // (self.nGames * self.nSteps)#62500000

		for update in range(nUpdates):

			alpha = 1.0 - update/nUpdates
			lrnow = self.learningRate * alpha
			if lrnow < self.minLearningRate:
				lrnow = self.minLearningRate
			cliprangenow = self.clipRange * alpha

			#1. run -> go into step see in run function
			states, availAcs, returns, actions, values, neglogpacs = self.run()

			batchSize = states.shape[0]
			self.totTrainingSteps += batchSize

			nTrainingBatch = batchSize // self.nMiniBatches

			currParams = self.trainingNetwork.getParams()

			mb_lossvals = []
			inds = np.arange(batchSize)
			for _ in range(self.nOptEpochs):
				np.random.shuffle(inds)
				for start in range(0, batchSize, nTrainingBatch):
					end = start + nTrainingBatch
					mb_inds = inds[start:end]
					loss_   = self.trainingModel.train(lrnow, cliprangenow, states[mb_inds], availAcs[mb_inds], returns[mb_inds], actions[mb_inds], values[mb_inds], neglogpacs[mb_inds])
					mb_lossvals.append(loss_)
					print("Loss:", loss_, "start", start, "bS", batchSize, "nB", nTrainingBatch)
			lossvals = np.mean(mb_lossvals, axis=0)
			self.losses.append(lossvals)

			newParams = self.trainingNetwork.getParams()
			needToReset = 0
			for param in newParams:
				if np.sum(np.isnan(param)) > 0:
					print("I need to reset! as nan values are contained!!!")
					halo
					needToReset = 1

			if needToReset == 1:
				self.trainingNetwork.loadParams(currParams)

			print(update, self.saveEvery, update % self.saveEvery)
			if update % self.saveEvery == 0:
				name = "modelParameters" + str(update)
				self.trainingNetwork.saveParams(name)
				print("Losses:", self.losses, "\n\n")
				joblib.dump(self.losses,  "losses.pkl")
				joblib.dump(self.epInfos, "epInfos.pkl")
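Both big2PPOSimulation variants run the GAE loop with a stride of 4 because the four players alternate moves, so a given player's next decision point sits four time steps later in the interleaved rollout. The same recursion for a single player's already-extracted sub-trajectory, as a standalone sketch using the gamma/lam convention of these listings (where dones[t] marks the player's own terminal move):

import numpy as np

def gae(rewards, values, dones, gamma=0.995, lam=0.95):
    # Generalized advantage estimation over one player's consecutive decision points.
    # rewards, dones: length-T sequences for a single player.
    # values: length T + 1, so that values[t + 1] exists for the final step
    # (the listings get this extra value from the four carried-over steps).
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)
    T = len(rewards)
    advs = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        next_non_terminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * values[t + 1] * next_non_terminal - values[t]
        advs[t] = lastgaelam = delta + gamma * lam * next_non_terminal * lastgaelam
    returns = advs + values[:T]
    return advs, returns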