Ejemplo n.º 1
0
 def runExperiment(self, classifier: Classifier, parameter: Parameter, experimentPerformance: ExperimentPerformance,
                   crossValidation: CrossValidation):
     """
     Trains and tests the classifier once per fold and records the test result of every fold.

     PARAMETERS
     ----------
     classifier : Classifier
         Classifier that is trained and then tested on each fold.
     parameter : Parameter
         Parameter object handed to the classifier's train method.
     experimentPerformance : ExperimentPerformance
         Accumulator to which the test result of each fold is added.
     crossValidation : CrossValidation
         Provider of the train and test folds; queried with fold indices 0 .. K - 1.
     """
     for foldIndex in range(self.K):
         trainingFold = InstanceList(crossValidation.getTrainFold(foldIndex))
         heldOutFold = InstanceList(crossValidation.getTestFold(foldIndex))
         classifier.train(trainingFold, parameter)
         foldResult = classifier.test(heldOutFold)
         experimentPerformance.add(foldResult)
Ejemplo n.º 2
0
    def nearestNeighbors(self, instance: Instance) -> InstanceList:
        """
        Finds the stored instances that are closest to the given instance.

        Every stored instance is paired with its distance to the given instance in a KnnInstance. When the given
        instance is a CompositeInstance, only stored instances whose class label is among its possible class labels
        are considered. The candidates are then sorted with the comparator returned by makeComparator, and the
        leading k of them (or all of them, if fewer than k exist) are returned.

        PARAMETERS
        ----------
        instance : Instance
            Instance whose nearest neighbors are searched.

        RETURNS
        -------
        InstanceList
            At most k instances, taken from the front of the sorted candidate list.
        """
        isComposite = isinstance(instance, CompositeInstance)
        allowedLabels = instance.getPossibleClassLabels() if isComposite else []
        candidates = []
        for index in range(self.__data.size()):
            current = self.__data.get(index)
            # For composite instances, skip data whose label is not one of the possible labels.
            if isComposite and current.getClassLabel() not in allowedLabels:
                continue
            distance = self.__distanceMetric.distance(current, instance)
            candidates.append(KnnInstance(current, distance))
        candidates.sort(key=cmp_to_key(self.makeComparator()))
        neighbors = InstanceList()
        for rank in range(min(self.__k, len(candidates))):
            neighbors.add(candidates[rank].instance)
        return neighbors
Ejemplo n.º 3
0
 def train(self, trainSet: InstanceList, parameters: KMeansParameter):
     """
     Builds a KMeansModel from the class distribution of the training set, the average instance of each class
     partition, and the distance metric supplied by the parameters.

     PARAMETERS
     ----------
     trainSet : InstanceList
         Training data given to the algorithm.
     parameters : KMeansParameter
         Parameter object whose getDistanceMetric result is stored in the model.
     """
     priors = trainSet.classDistribution()
     means = InstanceList()
     partition = Partition(trainSet)
     for groupIndex in range(partition.size()):
         group = partition.get(groupIndex)
         means.add(group.average())
     metric = parameters.getDistanceMetric()
     self.model = KMeansModel(priors, means, metric)
Ejemplo n.º 4
0
    def __init__(self,
                 definition: DataDefinition = None,
                 separator: str = None,
                 fileName: str = None):
        """
        Creates a DataSet with the given DataDefinition, either empty or loaded through InstanceList.

        When no separator is given the data set starts with an empty InstanceList; otherwise the definition,
        separator and file name are all forwarded to the InstanceList constructor.

        PARAMETERS
        ----------
        definition : DataDefinition
            Data definition of the data set.
        separator : str
            Separator forwarded to InstanceList; None selects the empty data set.
        fileName : str
            File name forwarded to InstanceList; only used when a separator is given.
        """
        self.__definition = definition
        if separator is not None:
            self.__instances = InstanceList(definition, separator, fileName)
        else:
            self.__instances = InstanceList()
Ejemplo n.º 5
0
    def execute(self, experiment: Experiment) -> Performance:
        """
        Runs a single train/test round on the first fold of a stratified K-fold cross-validation.

        The cross-validation is built from the class instances of the experiment's data set, the fold count K of
        this object and the seed of the experiment's parameter. Only fold 0 is used.

        PARAMETERS
        ----------
        experiment : Experiment
            Experiment supplying the data set, the parameter and the classifier.

        RETURNS
        -------
        Performance
            The value returned by the classifier's singleRun on the first train/test fold.
        """
        classInstances = experiment.getDataSet().getClassInstances()
        seed = experiment.getParameter().getSeed()
        folds = StratifiedKFoldCrossValidation(classInstances, self.__K, seed)
        trainingFold = InstanceList(folds.getTrainFold(0))
        heldOutFold = InstanceList(folds.getTestFold(0))
        classifier = experiment.getClassifier()
        return classifier.singleRun(experiment.getParameter(), trainingFold, heldOutFold)
Ejemplo n.º 6
0
    def initWithFile(self, fileName: str):
        """
        Fills the data set (definition and instances) from a comma separated file.

        The attribute types are inferred from the first line only: a field that parses as a float becomes a
        CONTINUOUS attribute, any other field a DISCRETE one. The last field of every line is the class label; if
        it contains ";" it is split on ";" and a CompositeInstance is created from the first label and the full
        label list. Lines after the first whose field count does not match the definition are skipped.

        NOTE(review): lines are split without stripping, so the last field (the class label) keeps the trailing
        newline of its line - confirm that callers expect this.

        PARAMETERS
        ----------
        fileName : str
            Path of the UTF-8 encoded file to read.

        RAISES
        ------
        ValueError
            If a field of a later line cannot be converted to float although the first line marked that column
            as continuous.
        """
        self.__instances = InstanceList()
        self.__definition = DataDefinition()
        # The context manager closes the file even when reading fails.
        with open(fileName, 'r', encoding='utf8') as inputFile:
            lines = inputFile.readlines()
        for lineIndex, line in enumerate(lines):
            attributes = line.split(",")
            attributeCount = len(attributes) - 1
            if lineIndex == 0:
                # The first line decides the type of every attribute column.
                for j in range(attributeCount):
                    try:
                        float(attributes[j])
                    except ValueError:
                        self.__definition.addAttribute(AttributeType.DISCRETE)
                    else:
                        self.__definition.addAttribute(
                            AttributeType.CONTINUOUS)
            elif len(attributes) != self.__definition.attributeCount() + 1:
                # Malformed line: wrong number of fields.
                continue
            classField = attributes[-1]
            if ";" not in classField:
                instance = Instance(classField)
            else:
                labels = classField.split(";")
                instance = CompositeInstance(labels[0], None, labels)
            for j in range(attributeCount):
                attributeType = self.__definition.getAttributeType(j)
                if attributeType is AttributeType.CONTINUOUS:
                    instance.addAttribute(
                        ContinuousAttribute(float(attributes[j])))
                elif attributeType is AttributeType.DISCRETE:
                    instance.addAttribute(DiscreteAttribute(attributes[j]))
            # Keep only instances that received a value for every defined attribute.
            if instance.attributeSize() == self.__definition.attributeCount():
                self.__instances.add(instance)
    def train(self, trainSet: InstanceList, parameters: RandomForestParameter):
        """
        Trains a random forest: one decision tree per bootstrap sample of the training set.

        For every index below the ensemble size, a bootstrap of the training set is requested with that index as
        argument, and a DecisionTree is grown from a DecisionNode built on the bootstrap sample. The resulting
        trees are stored in a TreeEnsembleModel.

        PARAMETERS
        ----------
        trainSet : InstanceList
            Training data given to the algorithm.
        parameters : RandomForestParameter
            Parameters of the random forest; getEnsembleSize gives the number of trees, and the object itself is
            passed on to every DecisionNode.
        """
        trees = [
            DecisionTree(
                DecisionNode(
                    InstanceList(trainSet.bootstrap(treeIndex).getSample()),
                    None, parameters, False))
            for treeIndex in range(parameters.getEnsembleSize())
        ]
        self.model = TreeEnsembleModel(trees)
Ejemplo n.º 8
0
    def train(self, trainSet: InstanceList, parameters: BaggingParameter):
        """
        Trains a bagged ensemble of decision trees, each grown on its own bootstrap sample of the training set.

        The tree index is passed to trainSet.bootstrap (presumably as the random seed - confirm against
        InstanceList.bootstrap), and the sample of that bootstrap becomes the data of the tree's root DecisionNode.
        The learned trees are stored in a TreeEnsembleModel.

        PARAMETERS
        ----------
        trainSet : InstanceList
            Training data given to the algorithm.
        parameters : BaggingParameter
            Parameters of the bagging trees algorithm; getEnsembleSize gives the number of trees to learn.
        """
        treeCount = parameters.getEnsembleSize()
        trees = []
        treeIndex = 0
        while treeIndex < treeCount:
            sample = trainSet.bootstrap(treeIndex).getSample()
            root = DecisionNode(InstanceList(sample))
            trees.append(DecisionTree(root))
            treeIndex += 1
        self.model = TreeEnsembleModel(trees)
    def execute(self, experiment: Experiment) -> ExperimentPerformance:
        """
        Runs the bootstrap experiment: train on a fresh bootstrap sample, test on the whole data set, repeat.

        Each run draws a Bootstrap from the instances of the experiment's data set, seeded with the run index plus
        the seed of the experiment's parameter. The classifier is trained on that sample and then tested on the
        complete instance list of the data set.

        PARAMETERS
        ----------
        experiment : Experiment
            Experiment supplying the data set, the parameter and the classifier.

        RETURNS
        -------
        ExperimentPerformance
            The test results of all bootstrap runs, in run order.
        """
        performances = ExperimentPerformance()
        for run in range(self.__numberOfBootstraps):
            instances = experiment.getDataSet().getInstances()
            runSeed = run + experiment.getParameter().getSeed()
            sample = InstanceList(Bootstrap(instances, runSeed).getSample())
            experiment.getClassifier().train(sample, experiment.getParameter())
            wholeData = experiment.getDataSet().getInstanceList()
            performances.add(experiment.getClassifier().test(wholeData))
        return performances
 def runExperiment(self, classifier: Classifier, parameter: Parameter, crossValidation: CrossValidation):
     """
     Performs one train/test round of the classifier on the first fold of the given cross-validation.

     PARAMETERS
     ----------
     classifier : Classifier
         Classifier whose singleRun method is invoked.
     parameter : Parameter
         Parameter object forwarded to singleRun.
     crossValidation : CrossValidation
         Provider of the folds; only fold 0 is used.

     RETURNS
     -------
     The value returned by classifier.singleRun for the first train/test fold.
     """
     firstTrainFold = InstanceList(crossValidation.getTrainFold(0))
     firstTestFold = InstanceList(crossValidation.getTestFold(0))
     outcome = classifier.singleRun(parameter, firstTrainFold, firstTestFold)
     return outcome
Ejemplo n.º 11
0
 def __init__(self, instanceList: InstanceList = None, ratio=None, seed=None, stratified: bool = None):
     """
     Divides the instances of an instance list into groups. The kind of partition depends on the argument types:

     * instanceList is None: the partition stays empty.
     * ratio is None: one InstanceListOfSameClass per distinct class label; every instance goes to the group of
       its class label.
     * ratio is a float: two groups. With stratified set, a shuffled pass assigns an instance to the first group
       while the number of instances seen for its class is below size * ratio * class probability, otherwise to
       the second. Without stratified, the instance list is shuffled and the instances whose position is below
       size * ratio go to the first group, the rest to the second.
     * ratio is an int: ratio is an attribute index and seed selects the split:
         - seed is None: one group per value in the attribute's value list;
         - seed is an int: first group holds instances whose attribute getIndex() equals seed, second the rest;
         - seed is a float: first group holds instances whose attribute getValue() is below seed, second the rest.

     PARAMETERS
     ----------
     instanceList : InstanceList
         Instances to be partitioned.
     ratio
         Float ratio between 0 and 1 of the first group (0.2 puts about 20 percent of the instances in the first
         group and the rest in the second), or an int attribute index.
     seed
         Random seed for the ratio based partitions; attribute value (int) or split value (float) for the
         attribute based partitions.
     stratified : bool
         Whether the ratio based partition keeps the class distribution in both groups.
     """
     self.__multilist = []
     if instanceList is None:
         return
     if ratio is None:
         classLabels = instanceList.getDistinctClassLabels()
         for classLabel in classLabels:
             self.add(InstanceListOfSameClass(classLabel))
         for instance in instanceList.getInstances():
             self.get(classLabels.index(instance.getClassLabel())).add(instance)
     elif isinstance(ratio, float):
         self.add(InstanceList())
         self.add(InstanceList())
         if stratified:
             distribution = instanceList.classDistribution()
             counts = [0] * len(distribution)
             randomArray = list(range(instanceList.size()))
             random.seed(seed)
             random.shuffle(randomArray)
             for index in randomArray:
                 instance = instanceList.get(index)
                 classLabel = instance.getClassLabel()
                 classIndex = distribution.getIndex(classLabel)
                 # Per-class quota of the first group.
                 quota = instanceList.size() * ratio * distribution.getProbability(classLabel)
                 if counts[classIndex] < quota:
                     self.get(0).add(instance)
                 else:
                     self.get(1).add(instance)
                 counts[classIndex] = counts[classIndex] + 1
         else:
             instanceList.shuffle(seed)
             # Walk over every instance of the list. The loop must be bounded by the instance count, not by
             # self.size(), which is the number of groups in this partition (two at this point).
             for i in range(instanceList.size()):
                 instance = instanceList.get(i)
                 if i < instanceList.size() * ratio:
                     self.get(0).add(instance)
                 else:
                     self.get(1).add(instance)
     elif isinstance(ratio, int):
         attributeIndex = ratio
         if seed is None:
             valueList = instanceList.getAttributeValueList(attributeIndex)
             for _ in valueList:
                 self.add(InstanceList())
             for instance in instanceList.getInstances():
                 self.get(valueList.index(instance.getAttribute(attributeIndex).getValue())).add(instance)
         elif isinstance(seed, int):
             attributeValue = seed
             self.add(InstanceList())
             self.add(InstanceList())
             for instance in instanceList.getInstances():
                 if instance.getAttribute(attributeIndex).getIndex() == attributeValue:
                     self.get(0).add(instance)
                 else:
                     self.get(1).add(instance)
         elif isinstance(seed, float):
             splitValue = seed
             self.add(InstanceList())
             self.add(InstanceList())
             for instance in instanceList.getInstances():
                 if instance.getAttribute(attributeIndex).getValue() < splitValue:
                     self.get(0).add(instance)
                 else:
                     self.get(1).add(instance)