def bagAndTrain(self, tree, dataMatrix, classVectors):
    """Train ``tree`` on a bagged subset of rows and report which rows were used.

    The bag size is ``helpers.HyperParams.FRAC_PER_BAG`` times the number of
    rows in ``dataMatrix``, truncated to an integer. Row positions are drawn
    by ``helpers.bag`` from ``0 .. rows - 1``.

    Args:
        tree: Object exposing ``train(data, labels)``.
        dataMatrix: Row-indexable data supporting ``.shape`` and ``.iloc``
            (presumably a pandas DataFrame -- confirm against callers).
        classVectors: Labels aligned row-for-row with ``dataMatrix``; must
            support ``.iloc``.

    Returns:
        A boolean numpy array with one entry per row of ``dataMatrix``;
        ``True`` marks rows whose position was returned by ``helpers.bag``.
    """
    totalRows = dataMatrix.shape[0]
    bagSize = int(helpers.HyperParams.FRAC_PER_BAG * totalRows)
    chosenRows = helpers.bag(np.arange(totalRows), bagSize)

    tree.train(dataMatrix.iloc[chosenRows], classVectors.iloc[chosenRows])

    # Turn the list of drawn positions into a per-row membership mask.
    # NOTE(review): if helpers.bag samples with replacement, duplicates
    # collapse to a single True here -- confirm that is what callers expect.
    inBag = np.zeros(totalRows, dtype=bool)
    inBag[chosenRows] = True
    return inBag
def trainBoosted(self, dataMatrix, classVectors):
    """Train every tree in ``self.trees`` sequentially with boosting.

    A mislabel set is built once via ``self.getMislabelSet`` and a sampling
    distribution over its entries is initialised from a vector of ones passed
    through ``helpers.toProbDistribution``. For each tree, entries of the
    mislabel set are sampled under the current distribution, the tree is
    trained on the rows named by column 0 of the sampled entries, and the
    pseudo-loss is converted to ``beta = loss / (1 - loss)``. Each ``beta`` is
    appended to ``self.boostedBeta`` and used to update the distribution.

    NOTE(review): the pseudo-loss / beta scheme looks like AdaBoost.M2 --
    confirm against ``calculatePseudoLoss`` and ``updateDistribution``.

    Args:
        dataMatrix: Row-indexable data supporting ``.shape`` and ``.iloc``
            (presumably a pandas DataFrame -- confirm against callers).
        classVectors: Labels aligned row-for-row with ``dataMatrix``; must
            support ``.iloc``.

    Returns:
        None. Results are stored on the trees and in ``self.boostedBeta``.
    """
    allRows = np.arange(dataMatrix.shape[0])
    mislabelSet = self.getMislabelSet(allRows, classVectors)

    entryCount = mislabelSet.shape[0]
    distribution = helpers.toProbDistribution(np.ones(entryCount))
    numToBag = int(entryCount)

    for tree in self.trees:
        sampled = helpers.bag(
            np.arange(mislabelSet.shape[0]), numToBag, distribution
        )
        # Column 0 of each mislabel entry is used as a row position.
        trainRows = mislabelSet[sampled][:, 0]
        tree.train(dataMatrix.iloc[trainRows], classVectors.iloc[trainRows])

        loss = self.calculatePseudoLoss(
            tree, dataMatrix, mislabelSet, distribution
        )
        # NOTE(review): a loss of exactly 1 divides by zero here -- confirm
        # calculatePseudoLoss cannot return 1.
        beta = loss / (1 - loss)
        self.boostedBeta.append(beta)
        distribution = self.updateDistribution(
            tree, dataMatrix, mislabelSet, distribution, beta
        )