def filter(self, data):
        """
        Filters the dataset(s). When providing a list, this can be used to create compatible train/test sets,
        since the filter only gets initialized with the first dataset and all subsequent datasets get transformed
        using the same setup.

        NB: inputformat(Instances) must have been called beforehand.

        :param data: the Instances to filter
        :type data: Instances or list of Instances
        :return: the filtered Instances object(s)
        :rtype: Instances or list of Instances
        """
        if isinstance(data, list):
            result = []
            for d in data:
                result.append(
                    Instances(
                        javabridge.static_call(
                            "Lweka/filters/Filter;", "useFilter",
                            "(Lweka/core/Instances;Lweka/filters/Filter;)Lweka/core/Instances;",
                            d.jobject, self.jobject)))
            return result
        else:
            return Instances(
                javabridge.static_call(
                    "Lweka/filters/Filter;", "useFilter",
                    "(Lweka/core/Instances;Lweka/filters/Filter;)Lweka/core/Instances;",
                    data.jobject, self.jobject))
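For context, a minimal usage sketch of the list variant (assuming python-weka-wrapper3 with a running JVM; the ARFF file names and the Standardize filter are illustrative):

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter

jvm.start()
loader = Loader(classname="weka.core.converters.ArffLoader")
train = loader.load_file("train.arff")   # illustrative path
test = loader.load_file("test.arff")     # illustrative path
fltr = Filter(classname="weka.filters.unsupervised.attribute.Standardize")
fltr.inputformat(train)                  # initialize the filter with the training data only
train_f, test_f = fltr.filter([train, test])  # the test set is transformed with the same setup
jvm.stop()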
def split_data(data, test_size):
    """
    Randomly splits the dataset into train and test sets.

    :param data: the full dataset
    :type data: Instances
    :param test_size: the fraction (0-1) of instances to place in the test set
    :type test_size: float
    :return: the train and test sets
    :rtype: tuple of Instances
    """
    # create placeholder for train split
    data_train = Instances.copy_instances(data)
    # remove all instances from the placeholder
    for i in reversed(range(len(data_train))):
        data_train.delete(i)

    # create placeholder for test split
    data_test = Instances.copy_instances(data)
    # remove all instances from the placeholder
    for i in reversed(range(len(data_test))):
        data_test.delete(i)

    # create list of indices
    indices = list(range(len(data)))
    # shuffle indices
    random.shuffle(indices)
    # calculate number of indices in the test split
    num_test = int(round(len(indices) * test_size, 0))

    # get indices for the test split
    test_ids = indices[:num_test]
    # fill test split with instances
    for idx in test_ids:
        data_test.add_instance(data.get_instance(idx))

    # get indices for the train split
    train_ids = indices[num_test:]
    # fill train split with instances
    for idx in train_ids:
        data_train.add_instance(data.get_instance(idx))

    return data_train, data_test
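A hedged usage sketch of split_data; it relies on the random module being imported and a dataset already loaded (the 0.3 test fraction is illustrative):

import random  # required by split_data for shuffling

# data = loader.load_file("some.arff")  # assumed to be loaded beforehand
train, test = split_data(data, 0.3)
print(train.num_instances, test.num_instances)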
Example #3
    def load_file(self, dfile, incremental=False):
        """
        Loads the specified file and returns the Instances object.
        In case of incremental loading, only the structure.

        :param dfile: the file to load
        :type dfile: str
        :param incremental: whether to load the dataset incrementally
        :type incremental: bool
        :return: the full dataset or the header (if incremental)
        :rtype: Instances
        """
        self.enforce_type(self.jobject,
                          "weka.core.converters.FileSourcedConverter")
        self.incremental = incremental
        if not javabridge.is_instance_of(dfile, "Ljava/io/File;"):
            dfile = javabridge.make_instance(
                "Ljava/io/File;", "(Ljava/lang/String;)V",
                javabridge.get_env().new_string_utf(str(dfile)))
        javabridge.call(self.jobject, "reset", "()V")
        javabridge.call(self.jobject, "setFile", "(Ljava/io/File;)V", dfile)
        if incremental:
            self.structure = Instances(
                javabridge.call(self.jobject, "getStructure",
                                "()Lweka/core/Instances;"))
            return self.structure
        else:
            return Instances(
                javabridge.call(self.jobject, "getDataSet",
                                "()Lweka/core/Instances;"))
def splitTrainSet(data, num_labeled_data=10):
    """
    Splits the dataset into a labeled and an unlabeled subset.

    :param data: the dataset to split
    :type data: Instances
    :param num_labeled_data: the percentage (0-100) of instances to keep labeled
    :type num_labeled_data: int
    :return: the labeled and unlabeled datasets
    :rtype: tuple of Instances
    """
    total = data.num_instances
    labeled_amount = int(num_labeled_data * total / 100)
    unlabeled_amount = total - labeled_amount

    rand = Random(1)
    data.randomize(rand)

    labeled_dataset = Instances.create_instances(data.relationname, data.attributes(), labeled_amount)
    unlabeled_dataset = Instances.create_instances(data.relationname, data.attributes(), unlabeled_amount)

    for i in range(labeled_amount):
        labeled_dataset.add_instance(data.get_instance(i))
    labeled_dataset.randomize(rand)

    for i in range(unlabeled_amount):
        unlabeled_dataset.add_instance(data.get_instance(labeled_amount + i))

    labeled_dataset.class_is_last()
    unlabeled_dataset.class_is_last()

    return labeled_dataset, unlabeled_dataset
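A possible call, assuming a loaded dataset whose class attribute is last (10 is the percentage of instances kept labeled, mirroring the default):

labeled, unlabeled = splitTrainSet(data, 10)
print(labeled.num_instances, unlabeled.num_instances)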
Example #5
    def train_internal(self):
        best_weights_arr = []
        #create an empty F with source as template
        F = Instances.template_instances(self.source[0])
        withF = False
        print("Find weight for each source data set")
        for source in self.source:
            bestWeight, bestError = self.process_source(source, F, withF)
            best_weights_arr.append(bestWeight)

        #sort the data based on the weights
        self.source = [
            source for _, source in sorted(zip(best_weights_arr, self.source),
                                           reverse=True,
                                           key=operator.itemgetter(0))
        ]

        print("Train for final stage")
        withF = True
        while len(self.source) > 0:
            weight, _ = self.process_source(self.source[0], F, withF)
            for inst in self.source[0]:
                inst.weight = weight
            F = Instances.append_instances(F, self.source[0])
            F.class_is_last()
            self.source.pop(0)

        return F
Example #6
    def load_file(self, dfile, incremental=False):
        """
        Loads the specified file and returns the Instances object.
        In case of incremental loading, only the structure.

        :param dfile: the file to load
        :type dfile: str
        :param incremental: whether to load the dataset incrementally
        :type incremental: bool
        :return: the full dataset or the header (if incremental)
        :rtype: Instances
        :raises Exception: if the file does not exist
        """
        self.enforce_type(self.jobject,
                          "weka.core.converters.FileSourcedConverter")
        self.incremental = incremental
        if not javabridge.is_instance_of(dfile, "Ljava/io/File;"):
            dfile = javabridge.make_instance(
                "Ljava/io/File;", "(Ljava/lang/String;)V",
                javabridge.get_env().new_string_utf(str(dfile)))
        javabridge.call(self.jobject, "reset", "()V")
        # check whether file exists, otherwise previously set file gets loaded again
        sfile = javabridge.to_string(dfile)
        if not os.path.exists(sfile):
            raise Exception("Dataset file does not exist: " + str(sfile))
        javabridge.call(self.jobject, "setFile", "(Ljava/io/File;)V", dfile)
        if incremental:
            self.structure = Instances(
                javabridge.call(self.jobject, "getStructure",
                                "()Lweka/core/Instances;"))
            return self.structure
        else:
            return Instances(
                javabridge.call(self.jobject, "getDataSet",
                                "()Lweka/core/Instances;"))
Example #7
    def load_url(self, url, incremental=False):
        """
        Loads the specified URL and returns the Instances object.
        In case of incremental loading, only the structure.

        :param url: the URL to load the data from
        :type url: str
        :param incremental: whether to load the dataset incrementally
        :type incremental: bool
        :return: the full dataset or the header (if incremental)
        :rtype: Instances
        """
        self.enforce_type(self.jobject,
                          "weka.core.converters.URLSourcedLoader")
        self.incremental = incremental
        javabridge.call(self.jobject, "reset", "()V")
        javabridge.call(self.jobject, "setURL", "(Ljava/lang/String;)V",
                        str(url))
        if incremental:
            self.structure = Instances(
                javabridge.call(self.jobject, "getStructure",
                                "()Lweka/core/Instances;"))
            return self.structure
        else:
            return Instances(
                javabridge.call(self.jobject, "getDataSet",
                                "()Lweka/core/Instances;"))
def testNB(training_data, testing_data):

    train_data = Instances.copy_instances(training_data)
    test_data = Instances.copy_instances(testing_data)

    evaluation = Evaluation(train_data)
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(
        train_data)  # build classifier on the training data
    evaluation.test_model(classifier,
                          test_data)  # test and evaluate model on the test set
    print("")
    print("")
    print(
        evaluation.summary(
            "--------------Naive Bayes Evaluation--------------"))
    print("Accuracy: " + str(evaluation.percent_correct))
    print("")
    print("Label\tPrecision\t\tRecall\t\t\tF-Measure")
    print("<=50K\t" + str(evaluation.precision(0)) + "\t" +
          str(evaluation.recall(0)) + "\t" + str(evaluation.f_measure(0)))
    print(">50K\t" + str(evaluation.precision(1)) + "\t" +
          str(evaluation.recall(1)) + "\t" + str(evaluation.f_measure(1)))
    print("Mean\t" + str(((evaluation.precision(1)) +
                          (evaluation.precision(0))) / 2) + "\t" +
          str(((evaluation.recall(1)) + (evaluation.recall(0))) / 2) + "\t" +
          str(((evaluation.f_measure(1)) + (evaluation.f_measure(0))) / 2))
Example #9
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: model (using serialization module)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using serialization module)")
    serialization.write_all(
        outfile,
        [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i + 1) + ":")
        if javabridge.get_env().is_instance_of(
                obj,
                javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(
                obj,
                javabridge.get_env().find_class(
                    "weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)

    # save and read object
    helper.print_title("I/O: just model (using Classifier class)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    classifier.serialize(outfile)
    model, _ = Classifier.deserialize(outfile)
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using Classifier class)")
    classifier.serialize(outfile, header=iris_data)
    model, header = Classifier.deserialize(outfile)
    print(model)
    if header is not None:
        print(header)
Example #10
    def load_file(self, dfile, incremental=False, class_index=None):
        """
        Loads the specified file and returns the Instances object.
        In case of incremental loading, only the structure.

        :param dfile: the file to load
        :type dfile: str
        :param incremental: whether to load the dataset incrementally
        :type incremental: bool
        :param class_index: the class index string to use ('first', 'second', 'third', 'last-2', 'last-1', 'last' or 1-based index)
        :type class_index: str
        :return: the full dataset or the header (if incremental)
        :rtype: Instances
        :raises Exception: if the file does not exist
        """
        self.enforce_type(self.jobject,
                          "weka.core.converters.FileSourcedConverter")
        self.incremental = incremental
        if not javabridge.is_instance_of(dfile, "Ljava/io/File;"):
            dfile = javabridge.make_instance(
                "Ljava/io/File;", "(Ljava/lang/String;)V",
                javabridge.get_env().new_string_utf(str(dfile)))
        javabridge.call(self.jobject, "reset", "()V")
        # check whether file exists, otherwise previously set file gets loaded again
        sfile = javabridge.to_string(dfile)
        if not os.path.exists(sfile):
            raise Exception("Dataset file does not exist: " + str(sfile))
        javabridge.call(self.jobject, "setFile", "(Ljava/io/File;)V", dfile)
        if incremental:
            self.structure = Instances(
                javabridge.call(self.jobject, "getStructure",
                                "()Lweka/core/Instances;"))
            result = self.structure
        else:
            result = Instances(
                javabridge.call(self.jobject, "getDataSet",
                                "()Lweka/core/Instances;"))
        if class_index is not None:
            if class_index == 'first':
                result.class_index = 0
            elif class_index == 'second':
                result.class_index = 1
            elif class_index == 'third':
                result.class_index = 2
            elif class_index == 'last-2':
                result.class_index = result.num_attributes - 3
            elif class_index == 'last-1':
                result.class_index = result.num_attributes - 2
            elif class_index == 'last':
                result.class_index = result.num_attributes - 1
            else:
                result.class_index = int(class_index)
        return result
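A usage sketch of the class_index convenience (the file name is illustrative):

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("iris.arff", class_index="last")  # load and set the class attribute in one call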
def LabeledUnlabeldata(data, unlabeled, tree, y, cal_method=None):
    """
    Self-training: iteratively moves unlabeled instances whose predicted class
    probability reaches the confidence threshold y into the labeled set,
    retraining the tree after every pass that added instances.
    """
    data1 = Instances.copy_instances(data)
    labeling = Instances.copy_instances(unlabeled)
    tree.build_classifier(data1)

    i = s = l = 0

    while i < labeling.num_instances:
        ##### probability calculation #####
        # dist = tree.distribution_for_instance(labeling.get_instance(i))
        dist = calculate_probability_distribution(tree, labeling, i,
                                                  cal_method)

        if max(dist) >= y:
            # scan the remaining unlabeled instances and move every confident one
            j = i
            while j < labeling.num_instances:
                clsLabel = tree.classify_instance(labeling.get_instance(j))

                ##### probability calculation #####
                dist = calculate_probability_distribution(
                    tree, labeling, j, cal_method)

                if max(dist) >= y:
                    inst = labeling.get_instance(j)
                    inst.set_value(inst.class_index, clsLabel)
                    data1.add_instance(inst)
                    labeling.delete(j)
                    l += 1
                    j -= 1

                j += 1

        if l != 0:
            # retrain on the enlarged labeled set and restart the scan
            tree.build_classifier(data1)
            i = -1
            s += l
            l = 0
        i += 1

    data1.compactify()
    return data1
Example #12
    def folds(self, nfolds=10, seed=None):
        """
        Get (training,testing) datasets for cross-validation.

        Arguments:

            nfolds (int, optional): Number of folds. Default value is
              10.
            seed (int, optional): Seed value for shuffling
              dataset. Default value is random int 0 <= x <= 10000.

        Returns:

            list of (Instances,Instances) tuples: Each list element is
              a pair of (training,testing) datasets, respectively.
        """
        seed = seed or randint(0, 10000)
        rnd = WekaRandom(seed)

        fold_size = labmath.ceil(self.instances.num_instances / nfolds)

        # Shuffle the dataset.
        instances = WekaInstances.copy_instances(self.instances)
        instances.randomize(rnd)

        folds = []
        for i in range(nfolds):
            offset = i * fold_size
            testing_end = min(offset + fold_size, instances.num_instances)

            # Calculate dataset indices for testing and training data.
            testing_range = (offset, testing_end - offset)
            left_range = (0, offset)
            right_range = (testing_end, instances.num_instances - testing_end)

            # If there's nothing to test, move on.
            if testing_range[1] < 1: continue

            # Create testing and training folds.
            testing = WekaInstances.copy_instances(instances, *testing_range)
            left = WekaInstances.copy_instances(instances, *left_range)
            right = WekaInstances.copy_instances(instances, *right_range)
            training = WekaInstances.append_instances(left, right)

            # Add fold to collection.
            folds.append((training, testing))

        return folds
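A sketch of consuming the generated folds; ds stands for a hypothetical object exposing this method:

from weka.classifiers import Classifier, Evaluation

for train, test in ds.folds(nfolds=5, seed=42):
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.percent_correct)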
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: single object")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: single object")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i+1) + ":")
        if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)
Example #14
def load_model(filename):
    """ Load the model from cache.
    Args:
        filename(str): The target file name (without extension) to load. Example: LMT
    Returns:
        The classifier and data object if the target caching is saved, otherwise None.
    """

    # Path to the cached model (example: caches/model/LMT.cache)
    path = os.path.join(os.path.join('caches', 'model'), filename + '.cache')

    print("Path to the cached model to load:", path)

    if os.path.isfile(path):
        cached_model, cached_data_used_for_training = serialization.read_all(
            path)
        print("Loading cached classifier")
        trained_classifier = Classifier(jobject=cached_model)
        print("Loading cached data")
        training_data = Instances(jobject=cached_data_used_for_training)
        localizer_log.msg("Loaded model: {filename}".format(filename=filename))
        return [trained_classifier, training_data]

    localizer_log.msg("Failed to load cache of 'model'.")
    return None
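A hedged usage sketch of load_model, using the LMT file name from the docstring:

cached = load_model("LMT")
if cached is not None:
    trained_classifier, training_data = cached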
Example #15
    def df_to_instances(self):
        '''
        Transforms the pandas data frame (self.df) into a Weka dataset,
        using self.relation as the relation name and self.attr_label as the
        nominal label attribute (with values '0' and '1').

        :return: the generated dataset
        :rtype: Instances
        '''

        atts = []
        for col in self.df.columns:
            if col != self.attr_label:
                att = Attribute.create_numeric(col)
            else:
                att = Attribute.create_nominal(col, ['0', '1'])
            atts.append(att)
        nrow = len(self.df)
        result = Instances.create_instances(self.relation, atts, nrow)
        # data
        for i in range(nrow):
            inst = Instance.create_instance(
                self.df.iloc[i].astype('float64').to_numpy().copy(order='C'))
            result.add_instance(inst)

        return result
def create_subsample(data, percent, seed=1):
    """
    Generates a subsample of the dataset.
    :param data: the data to create the subsample from
    :type data: Instances
    :param percent: the percentage (0-100)
    :type percent: float
    :param seed: the seed value to use
    :type seed: int
    :return: the subsample, or the original data if percent is outside (0, 100)
    :rtype: Instances
    """
    if percent <= 0 or percent >= 100:
        return data
    data = Instances.copy_instances(data)
    data.randomize(Random(seed))
    data = Instances.copy_instances(data, 0, int(round(data.num_instances * percent / 100.0)))
    return data
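For instance, a 10% random subsample with a fixed seed (assuming a loaded dataset):

subsample = create_subsample(data, 10.0, seed=42)
print(subsample.num_instances)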
def main():

    try:
        jvm.start()

        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")

        data.class_is_last()  # set class attribute

        # randomize data
        folds = k  # k: the number of folds, assumed to be defined elsewhere
        seed = 1
        rnd = Random(seed)
        rand_data = Instances.copy_instances(data)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(folds)

        NaiveBayes(rand_data, folds, seed, data)
        DecisionTree(rand_data, folds, seed, data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
Example #18
    def predBtn_clicked(self):

        gender = self.gender_entry.get()
        age = int(self.age_entry.get())
        height = int(self.height_entry.get())
        weight = int(self.weight_entry.get())
        sociability = self.sociability_entry.get()
        stability = self.stability_entry.get()
        # create the model
        objects = serialization.read_all("J48.model")

        cls = Classifier(jobject=objects[0])
        data = Instances(jobject=objects[1])
        # create the test set to be classified
        gender_values = ["Man", "Woman"]
        sociability_values = ["Introvert", "Extrovert"]
        stability_values = ["Stable", "Unstable"]

        values = [
            gender_values.index(gender), age, height, weight,
            self.BMI(weight, height),
            stability_values.index(stability),
            sociability_values.index(sociability),
            Instance.missing_value()
        ]

        inst = Instance.create_instance(values)
        inst.dataset = data
        # classification
        prediction = int(cls.classify_instance(inst))
        self.controller.show_frame("Result").show(prediction)
        self.clear()
Example #19
def save_model(model, data, filename):
    """Save the model to the target caching file.

    The caches should be defined in the config file. See README and
    config.sample for reference.

    Args:
        model(obj): The model to be saved. Should be a
            weka.classifiers.Classifier object.
        data(obj): The training set to be cached.
        filename(str): The target file name (without extension) to save.

    Returns:
        True if the target caching is saved, otherwise False.
    """

    folder = os.path.join('caches', 'model')
    path = os.path.join(folder, filename + '.cache')
    build_if_not_exist(folder)
    serialization.write_all(path, [model, Instances.template_instances(data)])
    localizer_log.msg(
        "Saved cache of {target_name}.".format(target_name='model'))
    return True
def create_dataset_header():
    """
    Creates the dataset header.
    :return: the header
    :rtype: Instances
    """
    att_msg = Attribute.create_string("Message")
    att_cls = Attribute.create_nominal("Class", ["miss", "hit"])
    result = Instances.create_instances("MessageClassificationProblem", [att_msg, att_cls], 0)
    return result
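A short sketch of filling the header with an unlabeled row, following the add_string_value pattern used in the other examples here:

from weka.core.dataset import Instance

data = create_dataset_header()
values = [data.attribute(0).add_string_value("first message"), Instance.missing_value()]
inst = Instance.create_instance(values)
data.add_instance(inst)
print(data)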
 def get_outputformat(self):
     """
     Returns the output format.
     :return: the output format
     :rtype: Instances
     """
     inst = javabridge.call(self.jobject, "getOutputFormat", "()Lweka/core/Instances;")
     if inst is None:
         return None
     else:
         return Instances(inst)
 def dataset_format(self):
     """
     Returns the dataset format.
     :return: the format
     :rtype: Instances
     """
     data = javabridge.call(self.jobject, "getDatasetFormat", "()Lweka/core/Instances;")
     if data is None:
         return None
     else:
         return Instances(data)
 def generate_examples(self):
     """
     Returns complete dataset.
     :return: the generated dataset
     :rtype: Instances
     """
     data = javabridge.call(self.jobject, "generateExamples", "()Lweka/core/Instances;")
     if data is None:
         return None
     else:
         return Instances(data)
 def outputformat(self):
     """
     Returns the output format.
     :return: the output format
     :rtype: Instances
     """
     inst = self.__outputformat()
     if inst is None:
         return None
     else:
         return Instances(inst)
 def load(self):
     """
     Loads the text files from the specified directory and returns the Instances object.
     In case of incremental loading, only the structure.
     :return: the full dataset or the header (if incremental)
     :rtype: Instances
     """
     javabridge.call(self.jobject, "reset", "()V")
     return Instances(
         javabridge.call(self.jobject, "getDataSet",
                         "()Lweka/core/Instances;"))
 def instances(self):
     """
     Returns the data used in the analysis.
     :return: the data in use
     :rtype: Instances
     """
     inst = javabridge.call(self.jobject, "getInstances",
                            "()Lweka/core/Instances;")
     if inst is None:
         return None
     else:
         return Instances(inst)
 def filter(self, data):
     """
     Filters the dataset.
     :param data: the Instances to filter
     :type data: Instances
     :return: the filtered Instances object
     :rtype: Instances
     """
     return Instances(javabridge.static_call(
         "Lweka/filters/Filter;", "useFilter",
         "(Lweka/core/Instances;Lweka/filters/Filter;)Lweka/core/Instances;",
         data.jobject, self.jobject))
def DecisionTree(rnd_data, folds, seed, data):

    data_size = rnd_data.num_instances
    fold_size = math.floor(data_size / folds)

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = test_start + fold_size
        if (data_size - test_end) / fold_size < 1:
            # let the last fold absorb the leftover instances
            this_fold = data_size - test_start
            test_end = data_size
        test = Instances.copy_instances(rand_data, test_start,
                                        this_fold)  # generate validation fold
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end,
                                             data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end,
                                               data_size - test_end)
            train = Instances.append_instances(
                train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls,
                              test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(
        evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
Example #29
def LabeledUnlabeldata(data, unlabeled, tree, y, cal_method=None):

    data1 = Instances.copy_instances(data)
    labeling = Instances.copy_instances(unlabeled)
    tree.build_classifier(data1)
    update = False
    it = 0
    labeling_num_instances = labeling.num_instances
    while labeling.num_instances > 3 and it < labeling_num_instances:
        it += 1
        update = False
        removed_index = set()
        print("labeling.num_instances ===>>  ", labeling.num_instances)

        for i, xi in enumerate(labeling):
            clsLabel = tree.classify_instance(xi)
            dist = calculate_probability_distribution(tree, labeling, i, cal_method)
            for dp in dist:
                if dp >= y:
                    update = True
                    xi.set_value(xi.class_index, clsLabel)
                    data1.add_instance(xi)
                    removed_index.add(i)
                    break  # label each instance at most once

        print("removed_index ==================>>", len(removed_index))
        # delete the newly labeled instances; subtract i to account for the
        # index shift caused by earlier deletions
        removed_index_list = sorted(removed_index)
        for i, ii in enumerate(removed_index_list):
            labeling.delete(ii - i)
        print("labeling ==================>>", labeling.num_instances)

        if update:
            tree.build_classifier(data1)

    data1.compactify()
    return data1
 def __init__(self, model=None, header=None):
     """
     Initializes the container.
     :param model: the model to store (eg Classifier or Clusterer)
     :type model: object
     :param header: the header instances
     :type header: Instances
     """
     super(ModelContainer, self).__init__()
     self.set("Model", model)
     if header is not None:
         header = Instances.template_instances(header)
     self.set("Header", header)
     self._allowed = ["Model", "Header"]
def generate_thresholdcurve_data(evaluation, class_index):
    """
    Generates the threshold curve data from the evaluation object's predictions.
    :param evaluation: the evaluation to obtain the predictions from
    :type evaluation: Evaluation
    :param class_index: the 0-based index of the class-label to create the plot for
    :type class_index: int
    :return: the generated threshold curve data
    :rtype: Instances
    """
    jtc = JavaObject.new_instance("weka.classifiers.evaluation.ThresholdCurve")
    pred = javabridge.call(evaluation.jobject, "predictions", "()Ljava/util/ArrayList;")
    result = Instances(
        javabridge.call(jtc, "getCurve", "(Ljava/util/ArrayList;I)Lweka/core/Instances;", pred, class_index))
    return result
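A usage sketch, assuming a cross-validated Evaluation object built as in the other examples (dataset and classifier are illustrative):

from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
curve = generate_thresholdcurve_data(evl, 0)  # one row per threshold
print(curve)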
Example #33
def create_dataset(tweets):
    # class_values: the list of nominal class labels, assumed to be defined elsewhere
    text_att = Attribute.create_string('TEXT')
    nom_att = Attribute.create_nominal('CLASS', class_values)
    dataset = Instances.create_instances("tweets", [text_att, nom_att],
                                         len(tweets))

    for tweet in tweets:
        values = []
        values.append(dataset.attribute(0).add_string_value(tweet))
        values.append(Instance.missing_value())
        inst = Instance.create_instance(values)
        dataset.add_instance(inst)

    dataset.class_is_last()

    return dataset
Example #34
	def addNominals(self, dataset):
		# Add the nominal values for all columns, in case a column has none
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not (a.is_nominal)) or (a.name in ignoreAttributes) :
				atts.append(a)
			else:
				newValues = list(a.values)
				pvalue = 'DefaultNominal'
				if(pvalue not in newValues):
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset
def main():
    """
    Creates a dataset from scratch using random data and outputs it.
    """

    atts = []
    for i in range(5):
        atts.append(Attribute.create_numeric("x" + str(i)))

    data = Instances.create_instances("data", atts, 10)

    for n in range(10):
        values = []
        for i in range(5):
            values.append(n*100 + i)
        inst = Instance.create_instance(values)
        data.add_instance(inst)

    print(data)
def ndarray_to_instances(array, relation, att_template="Att-#", att_list=None):
    """
    Converts the numpy matrix into an Instances object and returns it.

    :param array: the numpy ndarray to convert
    :type array: numpy.ndarray
    :param relation: the name of the dataset
    :type relation: str
    :param att_template: the prefix to use for the attribute names, "#" is the 1-based index,
                         "!" is the 0-based index, "@" the relation name
    :type att_template: str
    :param att_list: the list of attribute names to use
    :type att_list: list
    :return: the generated instances object
    :rtype: Instances
    """
    if len(numpy.shape(array)) != 2:
        raise Exception("Number of array dimensions must be 2!")
    rows, cols = numpy.shape(array)

    # header
    atts = []
    if att_list is not None:
        if len(att_list) != cols:
            raise Exception(
                "Number of columns and provided attribute names differ: " + str(cols) + " != " + str(len(att_list)))
        for name in att_list:
            att = Attribute.create_numeric(name)
            atts.append(att)
    else:
        for i in range(cols):
            name = att_template.replace("#", str(i+1)).replace("!", str(i)).replace("@", relation)
            att = Attribute.create_numeric(name)
            atts.append(att)
    result = Instances.create_instances(relation, atts, rows)

    # data
    for i in range(rows):
        inst = Instance.create_instance(array[i])
        result.add_instance(inst)

    return result
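A quick sketch of converting a random matrix (relation name and attribute template are illustrative):

import numpy

array = numpy.random.randn(4, 3)
data = ndarray_to_instances(array, "random-data", att_template="@-#")
print(data)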
Example #37
    def do_execute(self):
        """
        The actual execution of the actor.

        :return: None if successful, otherwise error message
        :rtype: str
        """
        result = None

        inst = None
        data = self.input.payload
        if isinstance(data, Instance):
            inst = data
            data = inst.dataset

        append = True
        if self._header is None or (self._header.equal_headers(data) is not None):
            self._header = Instances.template_instances(data, 0)
            outstr = str(data)
            append = False
        elif inst is not None:
            outstr = str(inst)
        else:
            outstr = str(data)

        f = None
        try:
            if append:
                f = open(str(self.resolve_option("output")), "a")
            else:
                f = open(str(self.resolve_option("output")), "w")
            f.write(outstr)
            f.write("\n")
        except Exception:
            result = self.full_name + "\n" + traceback.format_exc()
        finally:
            if f is not None:
                f.close()
        return result
Example #38
	def addPatientNominals(self, patient, dataset):
		# Add the nominal values for the patient to the master header, in case they aren't already there
		# Loop and add patient's nominal values in case they aren't in masterDataset
		# newDataset will be the new master header
		# Waiting on prediction patient to be defined
		# Should be like {sex_cd: "m", ...}
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not (a.is_nominal)) or (a.name in ignoreAttributes) :
				atts.append(a)
			else:
				newValues = list(a.values)
				pvalue = patient[a.name]
				if(pvalue not in newValues):
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset
Example #39
 def copy(self, from_row=None, num_rows=None):
     return WekaInstances.copy_instances(self.instances,
                                         from_row=from_row,
                                         num_rows=num_rows)
Example #40
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# define classifiers
classifiers = ["weka.classifiers.rules.OneR", "weka.classifiers.trees.J48"]

# cross-validate original dataset
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("%s (original): %0.0f%%" % (classifier, evl.percent_correct))

# replace 'outlook' in first 4 'no' instances with 'missing'
modified = Instances.copy_instances(data)
count = 0
for i in range(modified.num_instances):
    if modified.get_instance(i).get_string_value(modified.class_index) == "no":
        count += 1
        modified.get_instance(i).set_missing(0)
        if count == 4:
            break

# cross-validate modified dataset
for classifier in classifiers:
    cls = Classifier(classname=classifier)
    evl = Evaluation(modified)
    evl.crossvalidate_model(cls, modified, 10, Random(1))
    print("%s (modified): %0.0f%%" % (classifier, evl.percent_correct))
Example #41
import os
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i+1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50,
        wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
Example #43
	def saveClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
		serialization.write_all(path+filename, [self.classifier, Instances.template_instances(self.data)])
Example #44
# Discretize
print("Discretize numeric attributes (supervised)")
fltr = Filter(classname="weka.filters.supervised.attribute.Discretize")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# PCA
print("Principal components analysis")
fltr = Filter(classname="weka.filters.unsupervised.attribute.PrincipalComponents")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# load anneal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "anneal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# RemoveUseless
print("RemoveUseless")
fltr = Filter(classname="weka.filters.unsupervised.attribute.RemoveUseless")
fltr.inputformat(data)
filtered = fltr.filter(data)
print("Original header (#att=" + str(data.num_attributes) + "):\n" + str(Instances.template_instances(data)))
print("Filtered header (#att=" + str(filtered.num_attributes) + "):\n" + str(Instances.template_instances(filtered)))

jvm.stop()
Example #45
    with open(data_dir + os.sep + infile, "r", newline="") as csvfile:
        print(infile)
        outfile = os.path.splitext(infile)[0] + ".arff"
        reader = csv.reader(csvfile)
        data = None
        ref_present = True
        for index, row in enumerate(reader):
            if index == 0:
                atts = []
                ref_present = ("Reference value" in row) or ("Reference Value" in row)
                for idx, col in enumerate(row):
                    col = col.lower()
                    atts.append(Attribute.create_numeric(col))
                    if not ref_present and (idx == 0):
                        atts.append(Attribute.create_numeric("reference value"))
                data = Instances.create_instances("irdc", atts, 0)
            else:
                values = []
                for idx, col in enumerate(row):
                    values.append(float(col))
                    if not ref_present and (idx == 0):
                        values.append(float('NaN'))
                inst = Instance.create_instance(values)
                data.add_instance(inst)

        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(data, data_dir + os.sep + outfile)

# train/test/predict
print("Train/test/predict...")
def plot_learning_curve(classifiers, train, test=None, increments=100, metric="percent_correct",
                        title="Learning curve", label_template="[#] @ $", key_loc="lower right",
                        outfile=None, wait=True):
    """
    Plots learning curves for the given classifiers, built on increasingly large subsets of the training data.

    :param classifiers: list of Classifier template objects
    :type classifiers: list of Classifier
    :param train: dataset to use for building the classifier; also used for evaluating it if test is None
    :type train: Instances
    :param test: optional dataset to use for the testing the built classifiers
    :type test: Instances
    :param increments: the increments (>= 1: # of instances, <1: percentage of dataset)
    :type increments: float
    :param metric: the name of the numeric metric to plot (Evaluation.<metric>)
    :type metric: str
    :param title: the title for the plot
    :type title: str
    :param label_template: the template for the label in the plot
                           (#: 1-based index, @: full classname, !: simple classname, $: options)
    :type label_template: str
    :param key_loc: the location string for the key
    :type key_loc: str
    :param outfile: the output file, ignored if None
    :type outfile: str
    :param wait: whether to wait for the user to close the plot
    :type wait: bool
    """

    if not plot.matplotlib_available:
        logger.error("Matplotlib is not installed, plotting unavailable!")
        return
    if not train.has_class():
        logger.error("Training set has no class attribute set!")
        return
    if (test is not None) and (train.equal_headers(test) is not None):
        logger.error("Training and test set are not compatible: " + train.equal_headers(test))
        return

    if increments >= 1:
        inc = increments
    else:
        inc = round(train.num_instances * increments)

    steps = []
    cls = []
    evls = {}
    for classifier in classifiers:
        cl = Classifier.make_copy(classifier)
        cls.append(cl)
        evls[cl] = []
    if test is None:
        tst = train
    else:
        tst = test

    for i in range(train.num_instances):
        if (i > 0) and (i % inc == 0):
            steps.append(i+1)
        for cl in cls:
            # train
            if cl.is_updateable:
                if i == 0:
                    tr = Instances.copy_instances(train, 0, 1)
                    cl.build_classifier(tr)
                else:
                    cl.update_classifier(train.get_instance(i))
            else:
                if (i > 0) and (i % inc == 0):
                    tr = Instances.copy_instances(train, 0, i + 1)
                    cl.build_classifier(tr)
            # evaluate
            if (i > 0) and (i % inc == 0):
                evl = Evaluation(tst)
                evl.test_model(cl, tst)
                evls[cl].append(getattr(evl, metric))

    fig, ax = plt.subplots()
    ax.set_xlabel("# of instances")
    ax.set_ylabel(metric)
    ax.set_title(title)
    fig.canvas.set_window_title(title)
    ax.grid(True)
    i = 0
    for cl in cls:
        evl = evls[cl]
        i += 1
        plot_label = label_template.\
            replace("#", str(i)).\
            replace("@", cl.classname).\
            replace("!", cl.classname[cl.classname.rfind(".") + 1:]).\
            replace("$", join_options(cl.config))
        ax.plot(steps, evl, label=plot_label)
    plt.draw()
    plt.legend(loc=key_loc, shadow=True)
    if outfile is not None:
        plt.savefig(outfile)
    if wait:
        plt.show()
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directory avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
    print("")
    print(predicted_data)