Example #1
0
class Person(object):
    """Holds a person's name and birth/death dates.

    Attributes:
        name: a Name object (or None until set_name is called).
        birth_date: a Date object (or None until set_birth_date is called).
        death_date: a Date object (or None until set_death_date is called).
    """

    def __init__( self, name = None, birth_date = None, death_date = None ):
        self.name = name
        self.birth_date = birth_date
        self.death_date = death_date

    def set_name( self, first_name, last_name ):
        """Builds and stores a Name from its first/last components."""
        self.name = Name( first_name, last_name )

    def set_birth_date( self, birth_day, birth_month, birth_year ):
        """Builds and stores the birth Date.

        BUGFIX: the original passed the undefined name `birth_date` as the
        first argument (NameError at call time); it must be `birth_day`,
        mirroring set_death_date below.
        """
        self.birth_date = Date( birth_day, birth_month, birth_year )

    def set_death_date( self, death_day, death_month, death_year ):
        """Builds and stores the death Date."""
        self.death_date = Date( death_day, death_month, death_year )

    def get_name( self ):
        return self.name

    def get_birth_date( self ):
        return self.birth_date

    def get_death_date( self ):
        return self.death_date

    def __str__( self ):
        # str.format() already applies str() to each argument, so the
        # explicit .__str__() calls of the original were redundant.
        return "{}, {}, {}".format( self.name, self.birth_date, self.death_date )
Example #2
0
    def generateTrainingSetDatasetAttributesWithoutValues(self, dataset):
        """Generates the background-model training set's meta-features for `dataset`.

        For each configured classifier, evaluates every candidate operator
        assignment (unary + non-unary) on a replica of the dataset and collects
        its meta-features, periodically flushing partial results to ARFF files
        via savePartArffCandidateAttributes.

        :param dataset: the project Dataset object to process.
        """
        Logger.Info("Generating dataset attributes for dataset: " + dataset.name)

        startDate = Date()
        # The structure: Classifier -> candidate feature (operator assignment,
        # to be exact) -> meta-feature type -> a map of feature indices and values:
        # { classifier: { OperatorAssignment: { meta-feature type: {index: AttributeInfo} } } }
        candidateAttributesList = {}

        classifiers = Properties.classifiersForMLAttributesGeneration.split(',')

        # obtaining the attributes for the dataset itself is straightforward
        dba = DatasetBasedAttributes()
        for classifier in classifiers:
            candidateAttributesList[classifier] = {}
            originalAuc = self.getOriginalAuc(dataset, classifier)

            # Generate the dataset attributes
            datasetAttributes = dba.getDatasetBasedFeatures(dataset, classifier)

            # Candidate attribute generation/evaluation requires a few preliminary steps:
            # 1) Replicate the dataset and create the discretized features and add them to the dataset
            unaryOperators = OperatorsAssignmentsManager.getUnaryOperatorsList()

            # The unary operators need to be evaluated like all other operator assignments (i.e. attributes generation)
            unaryOperatorAssignments = OperatorsAssignmentsManager.getOperatorAssignments(dataset, None, unaryOperators, int(Properties.maxNumOfAttsInOperatorSource))
            replicatedDataset = self.generateDatasetReplicaWithDiscretizedAttributes(dataset, unaryOperatorAssignments)

            # 2) Obtain all other operator assignments (non-unary). IMPORTANT: this is applied on the
            # REPLICATED dataset so we can take advantage of the discretized features
            nonUnaryOperators = OperatorsAssignmentsManager.getNonUnaryOperatorsList()
            nonUnaryOperatorAssignments = OperatorsAssignmentsManager.getOperatorAssignments(replicatedDataset, None, nonUnaryOperators, int(Properties.maxNumOfAttsInOperatorSource))

            # 3) Generate the candidate attributes and evaluate them.
            # BUGFIX: Python lists have no addAll(); extend() is the equivalent of Java's List.addAll().
            nonUnaryOperatorAssignments.extend(unaryOperatorAssignments)

            # Index of the next partial ARFF chunk (list-wrapped so a future
            # parallel version can share/mutate it, as in the Java original).
            position = [0]

            # TODO: keep it parallel; temporarily changed to a single-threaded loop
            for oa in nonUnaryOperatorAssignments:
                try:
                    datasetReplica = dataset.replicateDataset()

                    # Here we generate all the meta-features that are "parent dependent" and
                    # do not require us to generate the values of the new attribute
                    oaba = OperatorAssignmentBasedAttributes()
                    candidateAttributeValuesFreeMetaFeatures = oaba.getOperatorAssignmentBasedMetaFeatures(dataset, oa)

                    evaluationInfo = self.runClassifier(classifier, datasetReplica.generateSet(True), datasetReplica.generateSet(False))
                    evaluationResults1 = evaluationInfo.getEvaluationStats()

                    # BUGFIX: the per-assignment inner dict was never created (the Java
                    # original's `put(oa, new HashMap<>())` was left commented out),
                    # so every indexing below raised KeyError and was swallowed by
                    # the broad except at the bottom of the loop.
                    candidateAttributesList[classifier][oa] = {}
                    candidateAttributesList[classifier][oa][MLAttributeManager.DATASET_BASED] = datasetAttributes

                    # Add the identifier of the classifier that was used
                    classifierAttribute = AttributeInfo("Classifier", Operator.outputType.Discrete, self.getClassifierIndex(classifier), 3)
                    candidateAttributeValuesFreeMetaFeatures[len(candidateAttributeValuesFreeMetaFeatures)] = classifierAttribute
                    candidateAttributesList[classifier][oa][MLAttributeManager.OA_BASED] = candidateAttributeValuesFreeMetaFeatures

                    # Append the class attribute at the next free index.
                    # BUGFIX: Python dicts have no .size(); use len().
                    oaBasedFeatures = candidateAttributesList[classifier][oa][MLAttributeManager.OA_BASED]
                    oaBasedFeatures[len(oaBasedFeatures)] = self.createClassAttribute(originalAuc, datasetReplica, evaluationResults1)

                    if (len(candidateAttributesList[classifier]) % 1000) == 0:
                        date = Date()
                        # BUGFIX: the original concatenated int + str (TypeError) and
                        # called .size() on a Python list; convert explicitly instead.
                        processedCount = (position[0] * MLAttributeManager.ITERATION) + len(candidateAttributesList[classifier])
                        Logger.Info(str(date) + ": Finished processing " + str(processedCount) + '/' + str(len(nonUnaryOperatorAssignments)) + ' elements for background model')

                    # Periodically flush a chunk to disk and reset the in-memory map.
                    if (len(candidateAttributesList[classifier]) % MLAttributeManager.ITERATION) == 0:
                        self.savePartArffCandidateAttributes(candidateAttributesList, classifier, dataset, position[0])
                        position[0] += 1
                        candidateAttributesList[classifier].clear()
                except Exception as ex:
                    Logger.Error("Error in ML features generation : " + oa.getName() + "  :  " + str(ex))

            # Flush whatever remains after the loop.
            self.savePartArffCandidateAttributes(candidateAttributesList, classifier, dataset, position[0])

        finishDate = Date()
        diffInMillies = finishDate - startDate
        Logger.Info("Getting candidate attributes for dataset " + dataset.name + " took " + str(diffInMillies.seconds) + " seconds")
Example #3
0
    def EvaluationAndWriteResultsToFile(self, dataset: Dataset,
                                        addedAttribute: str, iteration: int,
                                        runInfo: str, newFile: bool,
                                        evaluatedAttsCounter: int,
                                        filterEvaluatorScore: float,
                                        wrapperEvaluationScore: float) -> None:
        """Evaluates the classifier on `dataset` and appends one CSV row of metrics.

        Writes LogLoss, AUC, F1-at-recall, precision-at-recall, the chosen
        attribute's filter/wrapper scores and a completion timestamp to
        "<resultsFilePath><dataset.name><runInfo>.csv". When `newFile` is True,
        the file is (re)created with a header row first.

        :param dataset: the project Dataset being evaluated.
        :param addedAttribute: name of the attribute added this iteration.
        :param iteration: current search iteration number.
        :param runInfo: suffix used to build the results file name.
        :param newFile: True to truncate and write the CSV header.
        :param evaluatedAttsCounter: number of attributes evaluated this iteration.
        :param filterEvaluatorScore: filter score of the chosen attribute.
        :param wrapperEvaluationScore: wrapper score of the chosen attribute.
        """
        evaluation = self.runClassifier(Properties.classifier,
                                        dataset.generateSet(True),
                                        dataset.generateSet(False))

        # We calculate the TPR/FPR rate ourselves because we want all the values
        tprFprValues = self.calculateTprFprRate(evaluation, dataset)

        # The TPR/FPR values enable us to calculate the precision/recall values.
        recallPrecisionValues = self.calculateRecallPrecisionValues(
            dataset, tprFprValues, Properties.precisionRecallIntervals)

        # Next, we calculate the F-Measure at the selected recall points.
        fMeasureValuesPerRecall = {}
        for recallVal in Properties.FMeausrePoints:
            precision = recallPrecisionValues[recallVal]
            # BUGFIX: the original wrote `fMeasureValuesPerRecall[recallVal], = F1Measure`,
            # which tries to tuple-unpack a float and raises TypeError on the first pass.
            fMeasureValuesPerRecall[recallVal] = \
                (2 * precision * recallVal) / (precision + recallVal)

        # now we can write everything to file
        sb = ''

        # If it's a new file, we need to create a header for the file
        if newFile:
            sb += "Iteration,Added_Attribute,LogLoss,AUC,"
            for recallVal in fMeasureValuesPerRecall.keys():
                sb += f"F1_Measure_At_Recall_{recallVal},"

            for recallVal in recallPrecisionValues.keys():
                sb += f"Precision_At_Recall_Val_{recallVal},"

            # BUGFIX: a comma was missing here, fusing the last two header
            # columns into one and misaligning the header with the data rows.
            sb += "Chosen_Attribute_Filter_Score,Chosen_Attribute_Wrapper_Score,Num_Of_Evaluated_Attributes_In_Iteration,"
            sb += "Iteration_Completion_time"
            sb += os.linesep

        sb += str(iteration) + ","
        sb += f'"{addedAttribute}",'

        # The LogLoss
        sb += str(self.CalculateLogLoss(evaluation, dataset)) + ","

        # The AUC, computed from the score distribution of the minority class
        sb += str(
            roc_auc_score(
                evaluation.actualPred, evaluation.
                scoreDistPerInstance[:,
                                     dataset.getMinorityClassIndex()])) + ','

        # The F1 measure at each recall point
        for recallVal in fMeasureValuesPerRecall.keys():
            sb += str(fMeasureValuesPerRecall[recallVal]) + ","

        # Recall/Precision values
        for recallVal in recallPrecisionValues.keys():
            sb += str(recallPrecisionValues[recallVal]) + ","

        sb += str(filterEvaluatorScore) + ","
        sb += str(wrapperEvaluationScore) + ","
        sb += str(evaluatedAttsCounter) + ","

        # Completion timestamp for this iteration
        sb += str(Date())

        try:
            filename = Properties.resultsFilePath + dataset.name + runInfo + ".csv"
            # "w" truncates for a fresh run, "a" appends subsequent iterations;
            # `with` guarantees the handle is closed even if write() fails.
            with open(filename, "w" if newFile else "a") as fw:
                fw.write(sb + "\n")
        except Exception as ex:
            # BUGFIX: the original concatenated str + Exception, which itself
            # raised TypeError inside the handler; stringify explicitly.
            Logger.Error("IOException: " + str(ex))