class Person(object):
    """Container for a person's name, birth date and death date.

    Attributes are plain values: `name` holds either a raw value or a
    `Name` instance, `birth_date`/`death_date` either raw values or `Date`
    instances (both project-defined types — TODO confirm their signatures).
    """

    def __init__(self, name=None, birth_date=None, death_date=None):
        self.name = name
        self.birth_date = birth_date
        self.death_date = death_date

    def set_name(self, first_name, last_name):
        """Replace the current name with a Name built from its parts."""
        self.name = Name(first_name, last_name)

    def set_birth_date(self, birth_day, birth_month, birth_year):
        """Replace the current birth date.

        BUG FIX: the original passed the undefined name `birth_date`
        instead of the `birth_day` parameter, raising NameError on every
        call.
        """
        self.birth_date = Date(birth_day, birth_month, birth_year)

    def set_death_date(self, death_day, death_month, death_year):
        """Replace the current death date."""
        self.death_date = Date(death_day, death_month, death_year)

    def get_name(self):
        return self.name

    def get_birth_date(self):
        return self.birth_date

    def get_death_date(self):
        return self.death_date

    def __str__(self):
        # str() of each field so None renders as "None", matching the original.
        return "{}, {}, {}".format(self.name.__str__(),
                                   self.birth_date.__str__(),
                                   self.death_date.__str__())
def generateTrainingSetDatasetAttributesWithoutValues(self, dataset):
    """Generate and persist the meta-feature training set for `dataset`.

    For every classifier listed in Properties.classifiersForMLAttributesGeneration,
    this computes dataset-level meta-features, enumerates candidate attribute
    operator assignments (unary + non-unary), evaluates each candidate's
    meta-features, and periodically flushes partial results to ARFF via
    savePartArffCandidateAttributes.

    :param dataset: project Dataset object; must support replicateDataset()
                    and generateSet() — TODO confirm against Dataset class.
    """
    Logger.Info("Generating dataset attributes for dataset: " + dataset.name)
    startDate = Date()

    # Structure: {classifier: {OperatorAssignment: {meta-feature type: {index: AttributeInfo}}}}
    candidateAttributesList = {}
    classifiers = Properties.classifiersForMLAttributesGeneration.split(',')

    # Obtaining the attributes for the dataset itself is straightforward.
    dba = DatasetBasedAttributes()
    for classifier in classifiers:
        candidateAttributesList[classifier] = {}
        originalAuc = self.getOriginalAuc(dataset, classifier)

        # Generate the dataset-level attributes.
        datasetAttributes = dba.getDatasetBasedFeatures(dataset, classifier)

        # Candidate attribute generation requires a few preliminary steps:
        # 1) Replicate the dataset, create the discretized features and add
        #    them to the replica. Unary operators are evaluated like all
        #    other operator assignments (i.e. attribute generation).
        unaryOperators = OperatorsAssignmentsManager.getUnaryOperatorsList()
        unaryOperatorAssignments = OperatorsAssignmentsManager.getOperatorAssignments(
            dataset, None, unaryOperators, int(Properties.maxNumOfAttsInOperatorSource))
        replicatedDataset = self.generateDatasetReplicaWithDiscretizedAttributes(
            dataset, unaryOperatorAssignments)

        # 2) Obtain all other (non-unary) operator assignments. IMPORTANT:
        #    this is applied on the REPLICATED dataset so we can take
        #    advantage of the discretized features.
        nonUnaryOperators = OperatorsAssignmentsManager.getNonUnaryOperatorsList()
        nonUnaryOperatorAssignments = OperatorsAssignmentsManager.getOperatorAssignments(
            replicatedDataset, None, nonUnaryOperators, int(Properties.maxNumOfAttsInOperatorSource))

        # 3) Generate each candidate attribute and its meta-features.
        # BUG FIX: original called the Java method `.addAll(...)` on a
        # Python list (the same object is measured with len() below).
        nonUnaryOperatorAssignments.extend(unaryOperatorAssignments)

        position = [0]
        # TODO: was a Java parallelStream; temporarily single-threaded.
        for oa in nonUnaryOperatorAssignments:
            try:
                datasetReplica = dataset.replicateDataset()

                # Meta-features that are "parent dependent" and do not
                # require generating the values of the new attribute.
                oaba = OperatorAssignmentBasedAttributes()
                candidateAttributeValuesFreeMetaFeatures = \
                    oaba.getOperatorAssignmentBasedMetaFeatures(dataset, oa)

                evaluationInfo = self.runClassifier(
                    classifier,
                    datasetReplica.generateSet(True),
                    datasetReplica.generateSet(False))
                evaluationResults1 = evaluationInfo.getEvaluationStats()

                # BUG FIX: the per-assignment dict was never created (the
                # Java `put(oa, new HashMap<>())` was commented out), so the
                # assignments below raised KeyError.
                candidateAttributesList[classifier][oa] = {}
                candidateAttributesList[classifier][oa][MLAttributeManager.DATASET_BASED] = datasetAttributes

                # Add the identifier of the classifier that was used.
                classifierAttribute = AttributeInfo(
                    "Classifier", Operator.outputType.Discrete,
                    self.getClassifierIndex(classifier), 3)
                candidateAttributeValuesFreeMetaFeatures[len(candidateAttributeValuesFreeMetaFeatures)] = classifierAttribute
                candidateAttributesList[classifier][oa][MLAttributeManager.OA_BASED] = candidateAttributeValuesFreeMetaFeatures

                # Append the class attribute at the next free index.
                # BUG FIX: original called the Java `.size()` on this dict.
                oaBasedFeatures = candidateAttributesList[classifier][oa][MLAttributeManager.OA_BASED]
                oaBasedFeatures[len(oaBasedFeatures)] = self.createClassAttribute(
                    originalAuc, datasetReplica, evaluationResults1)

                if (len(candidateAttributesList[classifier]) % 1000) == 0:
                    # BUG FIX: original concatenated ints to str (TypeError)
                    # and called the Java `.size()` on a Python list.
                    processed = (position[0] * MLAttributeManager.ITERATION) + len(candidateAttributesList[classifier])
                    Logger.Info(str(Date()) + ": Finished processing "
                                + str(processed) + '/'
                                + str(len(nonUnaryOperatorAssignments))
                                + ' elements for background model')

                # Flush a partial ARFF every ITERATION candidates to bound memory.
                if (len(candidateAttributesList[classifier]) % MLAttributeManager.ITERATION) == 0:
                    self.savePartArffCandidateAttributes(candidateAttributesList, classifier, dataset, position[0])
                    position[0] += 1
                    candidateAttributesList[classifier].clear()
            except Exception as ex:
                # Best-effort per-candidate: log and continue with the next assignment.
                Logger.Error("Error in ML features generation : " + oa.getName() + " : " + str(ex))

        # Flush whatever remains for this classifier.
        self.savePartArffCandidateAttributes(candidateAttributesList, classifier, dataset, position[0])

    finishDate = Date()
    # NOTE(review): if Date is datetime, `.seconds` ignores whole days — confirm.
    diffInMillies = finishDate - startDate
    Logger.Info("Getting candidate attributes for dataset " + dataset.name
                + " took " + diffInMillies.seconds.__str__() + " seconds")
def EvaluationAndWriteResultsToFile(self, dataset: Dataset, addedAttribute: str,
                                    iteration: int, runInfo: str, newFile: bool,
                                    evaluatedAttsCounter: int,
                                    filterEvaluatorScore: float,
                                    wrapperEvaluationScore: float):
    """Evaluate the current dataset and append one CSV row of metrics.

    Runs the configured classifier, derives TPR/FPR, precision/recall,
    F1-at-recall, LogLoss and AUC, then writes (or creates with a header,
    when `newFile` is True) a CSV line to
    Properties.resultsFilePath + dataset.name + runInfo + ".csv".
    I/O failures are logged, not raised.
    """
    evaluation = self.runClassifier(Properties.classifier,
                                    dataset.generateSet(True),
                                    dataset.generateSet(False))

    # We calculate the TPR/FPR rates ourselves because we want all the values.
    tprFprValues = self.calculateTprFprRate(evaluation, dataset)

    # The TPR/FPR values enable us to calculate the precision/recall values.
    recallPrecisionValues = self.calculateRecallPrecisionValues(
        dataset, tprFprValues, Properties.precisionRecallIntervals)

    # F-Measure at the configured recall points.
    fMeasureValuesPerRecall = {}
    for recallVal in Properties.FMeausrePoints:
        precision = recallPrecisionValues[recallVal]
        F1Measure = (2 * precision * recallVal) / (precision + recallVal)
        # BUG FIX: original read `fMeasureValuesPerRecall[recallVal], = F1Measure`
        # — the stray comma tuple-unpacks a float and raises TypeError.
        fMeasureValuesPerRecall[recallVal] = F1Measure

    # Now we can write everything to file.
    sb = ''
    if newFile:
        # Header row; column order must mirror the data row below.
        sb += "Iteration,Added_Attribute,LogLoss,AUC,"
        for recallVal in fMeasureValuesPerRecall.keys():
            sb += f"F1_Measure_At_Recall_{recallVal},"
        for recallVal in recallPrecisionValues.keys():
            sb += f"Precision_At_Recall_Val_{recallVal},"
        # BUG FIX: the original omitted the comma here, fusing the last two
        # header columns and misaligning the header with the data row.
        sb += "Chosen_Attribute_Filter_Score,Chosen_Attribute_Wrapper_Score,Num_Of_Evaluated_Attributes_In_Iteration,"
        sb += "Iteration_Completion_time"
        sb += os.linesep

    sb += str(iteration) + ","
    sb += f'"{addedAttribute}",'
    # The LogLoss.
    sb += str(self.CalculateLogLoss(evaluation, dataset)) + ","
    # The AUC, computed on the minority-class score column.
    sb += str(roc_auc_score(
        evaluation.actualPred,
        evaluation.scoreDistPerInstance[:, dataset.getMinorityClassIndex()])) + ','
    # The F1 measures.
    for recallVal in fMeasureValuesPerRecall.keys():
        sb += str(fMeasureValuesPerRecall[recallVal]) + ","
    # Recall/Precision values.
    for recallVal in recallPrecisionValues.keys():
        sb += str(recallPrecisionValues[recallVal]) + ","
    sb += str(filterEvaluatorScore) + ","
    sb += str(wrapperEvaluationScore) + ","
    sb += str(evaluatedAttsCounter) + ","
    date = Date()
    sb += date.__str__()

    try:
        filename = Properties.resultsFilePath + dataset.name + runInfo + ".csv"
        # `with` guarantees the handle is closed even if write() fails
        # (the original leaked the handle on exception).
        mode = "w" if newFile else "a"
        with open(filename, mode) as fw:
            fw.write(sb + "\n")
    except Exception as ex:
        # BUG FIX: original concatenated the exception object to a str (TypeError).
        Logger.Error("IOException: " + str(ex))