def run(self, originalDataset: Dataset, runInfo: str):
    Logger.Info('Initializing evaluators')
    filterEvaluator = MLFilterEvaluator(originalDataset)

    preRankerEvaluator = None
    if bool(Properties.usePreRanker):
        preRankerEvaluator = FilterPreRankerEvaluator(originalDataset)

    if Properties.wrapperApproach == 'AucWrapperEvaluator':
        wrapperEvaluator = AucWrapperEvaluator()
    else:
        Logger.Error('Missing wrapper approach')
        raise Exception('Missing wrapper approach')

    experimentStartDate = Date()
    Logger.Info(f"Experiment Start Date/Time: {experimentStartDate} for dataset {originalDataset.name}")

    # The first step is to evaluate the initial attributes, so we get a reference point for how well we did
    wrapperEvaluator.EvaluationAndWriteResultsToFile(originalDataset, "", 0, runInfo, True, 0, -1, -1)

    # Now we create a replica of the original dataset, to which we can add columns
    dataset = originalDataset.replicateDataset()

    # Get the training-set sub-folds, used to evaluate the various candidate attributes
    originalDatasetTrainingFolds = originalDataset.GenerateTrainingSetSubFolds()
    subFoldTrainingDatasets = dataset.GenerateTrainingSetSubFolds()

    # We now apply the wrapper on the training sub-folds in order to get the baseline score.
    # This is the score a candidate attribute needs to "beat"
    date = Date()
    currentScore = wrapperEvaluator.produceAverageScore(subFoldTrainingDatasets, None, None, None, None)
    Logger.Info(f"Initial score: {currentScore} : {date}")

    # The probabilities assigned to each instance using the ORIGINAL dataset (training folds only)
    Logger.Info(f"Producing initial classification results: {date}")
    currentClassificationProbs = wrapperEvaluator.produceClassificationResults(originalDatasetTrainingFolds)
    date = Date()
    Logger.Info(f" .....done {date}")

    # Apply the unary operators (discretizers, normalizers) to all the original features. The attributes
    # generated here differ from those generated at later stages because they are included in the dataset
    # that is used to generate attributes in the iterative search phase
    Logger.Info(f"Starting to apply unary operators: {date}")
    oam = OperatorsAssignmentsManager()
    candidateAttributes = oam.applyUnaryOperators(dataset, None, filterEvaluator, subFoldTrainingDatasets,
                                                  currentClassificationProbs)
    date = Date()
    Logger.Info(f" .....done {date}")

    # Now we add the new attributes to the dataset (they are added even though they may not be included in
    # the final dataset, because they are essential to the full generation of additional features)
    Logger.Info(f"Starting to generate and add columns to dataset: {date}")
    oam.GenerateAndAddColumnToDataset(dataset, candidateAttributes)
    date = Date()
    Logger.Info(f" .....done {date}")

    # The initial dataset has been populated with the discretized/normalized features. Time to begin the search
    iterationsCounter = 1
    columnsAddedInThePreviousIteration = None
    self.performIterativeSearch(originalDataset, runInfo, preRankerEvaluator, filterEvaluator,
                                wrapperEvaluator, dataset, originalDatasetTrainingFolds,
                                subFoldTrainingDatasets, currentClassificationProbs, oam,
                                candidateAttributes, iterationsCounter, columnsAddedInThePreviousIteration)
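# A minimal sketch of how run() might be driven, kept as a comment so it does not
# execute inside the class body. The class name ExperimentRunner, the Loader helper,
# the ARFF path, and the property values below are assumptions for illustration,
# not part of this file:
#
#     Properties.usePreRanker = True
#     Properties.wrapperApproach = 'AucWrapperEvaluator'
#     dataset = Loader().readArff('data/german_credit.arff')   # hypothetical loader
#     ExperimentRunner().run(dataset, runInfo='baseline_run')  # hypothetical class name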
def processInitialEvaluationInformation(self, dataset: Dataset, classifier: str):
    # We now need to test all fold combinations (the original train/test allocation is disregarded, which is
    # not a problem for the offline training). The caller MUST submit a dataset object containing only the
    # training folds
    for fold in dataset.getFolds():
        fold.setIsTestFold(False)

    wrapperName = 'AucWrapperEvaluator'
    if wrapperName == 'AucWrapperEvaluator':
        wrapperEvaluator = AucWrapperEvaluator()
    else:
        raise Exception('Unidentified wrapper')

    leaveOneFoldOutDatasets = dataset.GenerateTrainingSetSubFolds()
    classificationResults = wrapperEvaluator.produceClassificationResults(leaveOneFoldOutDatasets)

    # Collect the per-fold evaluation metrics
    aucVals = []
    logLossVals = []
    recallPrecisionValues = []  # list of dicts mapping recall value -> precision
    for classificationResult in classificationResults:
        aucVals.append(classificationResult.getAuc())
        logLossVals.append(classificationResult.getLogLoss())
        recallPrecisionValues.append(classificationResult.getRecallPrecisionValues())

    self.numOfFoldsInEvaluation = len(dataset.getFolds())

    # Aggregate the AUC statistics across the leave-one-fold-out evaluations
    aucVals = np.asarray(aucVals, dtype=np.float32)
    self.maxAUC = aucVals.max()
    self.minAUC = aucVals.min()
    self.avgAUC = np.average(aucVals)
    self.stdevAUC = aucVals.std()

    # Aggregate the log-loss statistics
    logLossVals = np.asarray(logLossVals, dtype=np.float32)
    self.maxLogLoss = logLossVals.max()
    self.minLogLoss = logLossVals.min()
    self.avgLogLoss = np.average(logLossVals)
    self.stdevLogLoss = logLossVals.std()

    # For each fixed recall value, compute the max/min/avg/stdev of the precision across folds
    self.maxPrecisionAtFixedRecallValues = {}
    self.minPrecisionAtFixedRecallValues = {}
    self.avgPrecisionAtFixedRecallValues = {}
    self.stdevPrecisionAtFixedRecallValues = {}
    for recallVal in recallPrecisionValues[0].keys():
        maxVal = -1  # sentinels outside the [0, 1] precision range
        minVal = 2
        valuesList = []
        for precisionRecallVals in recallPrecisionValues:
            maxVal = max(precisionRecallVals[recallVal], maxVal)
            minVal = min(precisionRecallVals[recallVal], minVal)
            valuesList.append(precisionRecallVals[recallVal])
        # now the assignments
        self.maxPrecisionAtFixedRecallValues[recallVal] = maxVal
        self.minPrecisionAtFixedRecallValues[recallVal] = minVal
        self.avgPrecisionAtFixedRecallValues[recallVal] = np.average(valuesList)
        self.stdevPrecisionAtFixedRecallValues[recallVal] = np.std(valuesList)
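# A small worked illustration (hypothetical values) of the per-recall aggregation above.
# Given recall -> precision dicts produced by three folds:
#
#     recallPrecisionValues = [{0.5: 0.90, 1.0: 0.40},
#                              {0.5: 0.80, 1.0: 0.35},
#                              {0.5: 0.85, 1.0: 0.45}]
#
# the loop yields, for recallVal == 0.5: max = 0.90, min = 0.80,
# avg = np.average([0.90, 0.80, 0.85]) = 0.85, stdev = np.std([0.90, 0.80, 0.85]).
# Note that np.std defaults to the population standard deviation (ddof=0), i.e.
# sqrt(sum((a - avg)**2) / n), which matches the commented-out Java stream
# computation this method appears to have been ported from.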