def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: single object")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: single object")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i+1) + ":")
        if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)
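    # Hedged convenience sketch (not part of the original example): the javabridge
    # type checks above can be wrapped in a small helper. It only uses the imports
    # already present in this example (javabridge, serialization, Instances,
    # Classifier); the function name read_model_and_header is illustrative.
    def read_model_and_header(model_file):
        """Reads back [model, header] as written by serialization.write_all()."""
        env = javabridge.get_env()
        model, header = None, None
        for obj in serialization.read_all(model_file):
            if env.is_instance_of(obj, env.find_class("weka/core/Instances")):
                header = Instances(jobject=obj)
            elif env.is_instance_of(obj, env.find_class("weka/classifiers/Classifier")):
                model = Classifier(jobject=obj)
        return model, header

    model2, header2 = read_model_and_header(outfile)
    print(model2)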
Example #2
    def train_internal(self):
        best_weights_arr = []
        #create an empty F with source as template
        F = Instances.template_instances(self.source[0])
        withF = False
        print("Find weight for each source data set")
        for source in self.source:
            bestWeight, bestError = self.process_source(source, F, withF)
            best_weights_arr.append(bestWeight)

        #sort the data based on the weights
        self.source = [
            source for _, source in sorted(zip(best_weights_arr, self.source),
                                           reverse=True,
                                           key=operator.itemgetter(0))
        ]

        print("Train for final stage")
        withF = True
        while len(self.source) > 0:  # could also be capped at self.max_source_dataset
            weight, _ = self.process_source(self.source[0], F, withF)
            for inst in self.source[0]:
                inst.weight = weight
            F = Instances.append_instances(F, self.source[0])
            F.class_is_last()
            self.source.pop(0)

        return F
Example #3
def save_model(model, data, filename):
    """Save the model to the target caching file.

    The caches should be defined in the config file. See README and
    config.sample for reference.

    Args:
        model(obj): The model to be saved. Should be a
            weka.classifier.Classifier object.
        data(obj): The training set to be cached.
        target(str): The target option in '[cached]' section in the config
            file.
        filename(str): The target file to save.

    Returns:
        True if the target caching is saved, otherwise False.
    """

    folder = os.path.join('caches', 'model')
    path = os.path.join(folder, filename + '.cache')
    build_if_not_exist(folder)
    serialization.write_all(path, [model, Instances.template_instances(data)])
    localizer_log.msg(
        "Saved cache of {target_name}.".format(target_name='model'))
    return True
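# Hedged counterpart sketch (an assumption, not part of the original module):
# a loader for the cache written by save_model(). It assumes the file holds
# [classifier, header] in that order and that Classifier and Instances are
# importable here; the name load_model is illustrative.
def load_model(filename):
    """Read back a cached model and its dataset header, or (None, None)."""
    path = os.path.join('caches', 'model', filename + '.cache')
    if not os.path.exists(path):
        return None, None
    objects = serialization.read_all(path)
    return Classifier(jobject=objects[0]), Instances(jobject=objects[1])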
Example #4
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: model (using serialization module)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using serialization module)")
    serialization.write_all(
        outfile,
        [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i + 1) + ":")
        if javabridge.get_env().is_instance_of(
                obj,
                javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(
                obj,
                javabridge.get_env().find_class(
                    "weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)

    # save and read object
    helper.print_title("I/O: just model (using Classifier class)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    classifier.serialize(outfile)
    model, _ = Classifier.deserialize(outfile)
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using Classifier class)")
    classifier.serialize(outfile, header=iris_data)
    model, header = Classifier.deserialize(outfile)
    print(model)
    if header is not None:
        print(header)
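    # Hedged follow-up sketch: with the header restored alongside the model, a new
    # row could be classified (the feature values, Instance.create_instance and the
    # dataset assignment below are illustrative, not part of the original example):
    # if header is not None:
    #     inst = Instance.create_instance([5.1, 3.5, 1.4, 0.2, Instance.missing_value()])
    #     inst.dataset = header
    #     print(model.classify_instance(inst))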
    def __init__(self, model=None, header=None):
        """
        Initializes the container.

        :param model: the model to store (e.g. Classifier or Clusterer)
        :type model: object
        :param header: the header instances
        :type header: Instances
        """
        super(ModelContainer, self).__init__()
        self.set("Model", model)
        if header is not None:
            header = Instances.template_instances(header)
        self.set("Header", header)
        self._allowed = ["Model", "Header"]
    def serialize(self, ser_file, header=None):
        """
        Serializes the clusterer to the specified file.

        :param ser_file: the file to save the model to
        :type ser_file: str
        :param header: the (optional) dataset header to store alongside; recommended
        :type header: Instances
        """

        if (header is not None) and header.num_instances > 0:
            header = Instances.template_instances(header)

        if header is not None:
            serialization_write_all(ser_file, [self, header])
        else:
            serialization_write(ser_file, self)
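    # Hedged usage sketch for the method above (the clusterer, training data and
    # file path are illustrative; read_all is the serialization helper used in the
    # other examples on this page):
    # clusterer.serialize("/tmp/kmeans.model", header=data)
    # objects = serialization.read_all("/tmp/kmeans.model")  # [clusterer, header]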
Example #8
    def do_execute(self):
        """
        The actual execution of the actor.

        :return: None if successful, otherwise error message
        :rtype: str
        """
        result = None

        data = self.input.payload
        inst = None
        if isinstance(self.input.payload, Instance):
            inst = self.input.payload
            data = inst.dataset
        elif isinstance(self.input.payload, Instances):
            data = self.input.payload

        append = True
        if self._header is None or (self._header.equal_headers(data)
                                    is not None):
            self._header = Instances.template_instances(data, 0)
            outstr = str(data)
            append = False
        elif inst is not None:
            outstr = str(inst)
        else:
            outstr = str(data)

        f = None
        try:
            if append:
                f = open(str(self.resolve_option("output")), "a")
            else:
                f = open(str(self.resolve_option("output")), "w")
            f.write(outstr)
            f.write("\n")
        except Exception:
            result = self.full_name + "\n" + traceback.format_exc()
        finally:
            if f is not None:
                f.close()
        return result
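    # Side note (hedged): the append/overwrite decision above relies on
    # Instances.equal_headers, which returns None when two headers match and an
    # explanatory message otherwise, e.g.:
    # msg = self._header.equal_headers(data)
    # print("headers match" if msg is None else msg)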
Example #10
    def fit(self, data, targets):
        """
        Trains the estimator.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: the optional class attribute column, array-like of shape (n_samples,)
        :type targets: ndarray
        :return: itself
        :rtype: WekaTransformer
        """
        if targets is None:
            check_array(data)
        else:
            check_X_y(data, targets)
        d = to_instances(data,
                         y=targets,
                         num_nominal_labels=self._num_nominal_input_labels,
                         num_class_labels=self._num_nominal_output_labels)
        self.header_ = Instances.template_instances(d)
        self._filter.inputformat(d)
        self._filter.filter(d)
        return self
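    # Hedged usage sketch for the estimator above (assumes fit() belongs to a
    # sklearn-style wrapper named WekaTransformer around a Weka filter; the
    # constructor argument and variable names are assumptions):
    # tr = WekaTransformer(classname="weka.filters.unsupervised.attribute.Standardize")
    # tr.fit(X, y)       # X: (n_samples, n_features) ndarray, y: (n_samples,)
    # X_new = tr.transform(X)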
Example #11
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# PCA
print("Principal components analysis")
fltr = Filter(
    classname="weka.filters.unsupervised.attribute.PrincipalComponents")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# load anneal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "anneal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# RemoveUseless
print("RemoveUseless")
fltr = Filter(classname="weka.filters.unsupervised.attribute.RemoveUseless")
fltr.inputformat(data)
filtered = fltr.filter(data)
print("Original header (#att=" + str(data.num_attributes) + "):\n" +
      str(Instances.template_instances(data)))
print("Filtered header (#att=" + str(filtered.num_attributes) + "):\n" +
      str(Instances.template_instances(filtered)))

jvm.stop()
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" +
          str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" +
          str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n" + str(
        iris_data.attribute_stats(iris_data.num_attributes -
                                  1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld",
                                         [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [
        2.71828,
        date_att.parse_date("2014-08-09"),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(
        values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(
        x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y,
                                                 "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(
        x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)],
            sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(iris_data,
                     iris_data.attribute_by_name("petalwidth").index,
                     iris_data.attribute_by_name("petallength").index,
                     percent=50,
                     wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data,
                  atts=range(iris_data.num_attributes - 1),
                  percent=50,
                  title="Line plot iris",
                  wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
Example #13
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()
def createTrainedModel():
    from weka.core.converters import Loader
    folderList = os.listdir(outputModel)
    i = 0
    classi = ""
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(os.path.join(outputModel, "genderTrain.arff"))
    data.class_is_last()
    from weka.classifiers import Classifier
    classi = "weka.classifiers.bayes.NaiveBayes"
    cls = Classifier(classname=classi)
    from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-1.7976931348623157E308", "-1"])
    #evaluator = ASEvaluation(classname="weka.attributeSelection.ChiSquaredAttributeEval")
    #attsel = AttributeSelection()
    #attsel.search(search)
    #attsel.evaluator(evaluator)
    #attsel.select_attributes(data)
    cls.build_classifier(data)
    import weka.core.serialization as serialization
    from weka.core.dataset import Instances
    serialization.write_all(
        os.path.join(outputModel, "GenderModel" + ".model"),
        [cls, Instances.template_instances(data)])
    from weka.classifiers import Evaluation
    from weka.core.classes import Random
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print "Gender model predictions"
    print cls
    #print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())

    data = loader.load_file(os.path.join(outputModel, "ageTrain.arff"))
    data.class_is_last()
    classi = "weka.classifiers.bayes.NaiveBayes"
    cls = Classifier(classname=classi)
    from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-1.7976931348623157E308", "-1"])
    #evaluator = ASEvaluation(classname="weka.attributeSelection.ChiSquaredAttributeEval")
    #attsel = AttributeSelection()
    #attsel.search(search)
    #attsel.evaluator(evaluator)
    #attsel.select_attributes(data)
    #classi = "weka.classifiers.trees.J48"
    #classi = "weka.classifiers.functions.Logistic"
    #classi = "weka.classifiers.trees.RandomForest"
    #classi = "weka.classifiers.bayes.NaiveBayes"
    #classi = "weka.classifiers.functions.SMOreg"
    cls.build_classifier(data)
    print "Age model predictions"
    print cls
    import weka.core.serialization as serialization
    from weka.core.dataset import Instances
    serialization.write_all(os.path.join(outputModel, "AgeModel" + ".model"),
                            [cls, Instances.template_instances(data)])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))

    #print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    os._exit(0)
Example #15
def classification(data, train, test, num_clases):
    baseClassifiers_list = [
        "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.functions.MultilayerPerceptron",
        "weka.classifiers.functions.SMO", "weka.classifiers.lazy.IBk",
        "weka.classifiers.lazy.KStar", "weka.classifiers.meta.AdaBoostM1",
        "weka.classifiers.meta.Bagging", "weka.classifiers.meta.LogitBoost",
        "weka.classifiers.trees.J48", "weka.classifiers.trees.DecisionStump",
        "weka.classifiers.trees.LMT", "weka.classifiers.trees.RandomForest",
        "weka.classifiers.trees.REPTree", "weka.classifiers.rules.PART",
        "weka.classifiers.rules.JRip", "weka.classifiers.functions.Logistic",
        "weka.classifiers.meta.ClassificationViaRegression",
        "weka.classifiers.bayes.BayesNet"
    ]
    results_train = pd.DataFrame()
    results_test = pd.DataFrame()

    cost_matrix_list = [
        "[]", "[0]", "[0.0 1.0; 1.0 0.0]",
        "[0.0 1.0 2.0; 1.0 0.0 1.0; 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0; 1.0 0.0 1.0 2.0; 2.0 1.0 0.0 1.0; 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0; 1.0 0.0 1.0 2.0 3.0; 2.0 1.0 0.0 1.0 2.0; 3.0 2.0 1.0 0.0 1.0; 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0; 1.0 0.0 1.0 2.0 3.0 4.0; 2.0 1.0 0.0 1.0 2.0 3.0; 3.0 2.0 1.0 0.0 1.0 2.0; 4.0 3.0 2.0 1.0 0.0 1.0; 5.0 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0 6.0; 1.0 0.0 1.0 2.0 3.0 4.0 5.0; 2.0 1.0 0.0 1.0 2.0 3.0 4.0; 3.0 2.0 1.0 0.0 1.0 2.0 3.0; 4.0 3.0 2.0 1.0 0.0 1.0 2.0; 5.0 4.0 3.0 2.0 1.0 0.0 1.0; 6.0 5.0 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0; 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0; 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0; 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0; 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0; 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0; 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0; 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0; 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0; 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0; 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0; 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0; 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0; 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0; 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0; 8.0 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0; 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0; 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0; 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0; 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0; 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0; 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0; 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0; 8.0 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0; 9.0 8.0 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0]"
    ]

    real_train = []  # the real label of the dataset
    for i in range(train.num_instances):
        real_train.append(
            train.get_instance(i).values[(train.num_attributes - 1)])
    results_train['real'] = real_train

    real_test = []  # the real label of the dataset
    for i in range(test.num_instances):
        real_test.append(
            test.get_instance(i).values[(test.num_attributes - 1)])
    results_test['real'] = real_test

    num = 0
    for clas in baseClassifiers_list:
        column = "p" + np.str(num)

        #classifier
        classifier = SingleClassifierEnhancer(
            classname="weka.classifiers.meta.CostSensitiveClassifier",
            options=[
                "-cost-matrix", cost_matrix_list[num_clases], "-M", "-S", "1"
            ])
        base = Classifier(classname=clas)
        classifier.classifier = base

        predicted_data_train = None
        predicted_data_test = None

        evaluation = Evaluation(data)
        classifier.build_classifier(train)
        #evaluation.test_model(classifier, train)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification"])

        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        #addcls.filter(train)  # trains the classifier
        pred_train = addcls.filter(train)

        pred_test = addcls.filter(test)

        if predicted_data_train is None:
            predicted_data_train = Instances.template_instances(pred_train, 0)
        for n in range(pred_train.num_instances):
            predicted_data_train.add_instance(pred_train.get_instance(n))

        if predicted_data_test is None:
            predicted_data_test = Instances.template_instances(pred_test, 0)
        for n in range(pred_test.num_instances):
            predicted_data_test.add_instance(pred_test.get_instance(n))

        preds_train = []  # labels predicted by the classifier trained in this iteration
        preds_test = []

        for i in range(predicted_data_train.num_instances):
            preds_train.append(
                predicted_data_train.get_instance(i).values[(
                    predicted_data_train.num_attributes - 1)])

        for i in range(predicted_data_test.num_instances):
            preds_test.append(
                predicted_data_test.get_instance(i).values[(
                    predicted_data_test.num_attributes - 1)])

        results_train[column] = preds_train
        results_test[column] = preds_test
        num = num + 1
    return results_train, results_test
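# Hedged usage sketch for classification() above (assumes `data`, `train` and
# `test` are weka Instances with the class as last attribute; passing 3 picks
# the 3x3 cost matrix from cost_matrix_list; the names are illustrative):
# results_train, results_test = classification(data, train, test, 3)
# print(results_train.head())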
Example #16
	def saveClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
		serialization.write_all(path+filename, [self.classifier, Instances.template_instances(self.data)])
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n" + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n" + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + "yes" if msg is None else msg)
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances("helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i+1) / 10.0)], sparse_data.num_attributes, classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50,
        wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1), percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
Example #18
    test = rand_data.test_cv(folds, i)

    # build and evaluate classifier
    cls = Classifier.make_copy(classifier)
    cls.build_classifier(train)
    evaluation.test_model(cls, test)

    # add predictions
    addcls = Filter(
        classname="weka.filters.supervised.attribute.AddClassification",
        options=["-classification", "-distribution", "-error"])
    # setting the java object directly avoids issues with correct quoting in the option array
    addcls.set_property("classifier", Classifier.make_copy(classifier))
    addcls.inputformat(train)
    addcls.filter(train)  # trains the classifier
    pred = addcls.filter(test)
    if predicted_data is None:
        predicted_data = Instances.template_instances(pred, 0)
    for n in range(pred.num_instances):
        predicted_data.add_instance(pred.get_instance(n))

print("")
print("=== Setup ===")
print("Classifier: " + classifier.to_commandline())
print("Dataset: " + data.relationname)
print("Folds: " + str(folds))
print("Seed: " + str(seed))
print("")
print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
print("")
print(predicted_data)
Example #19
# Discretize
print("Discretize numeric attributes (supervised)")
fltr = Filter(classname="weka.filters.supervised.attribute.Discretize")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# PCA
print("Principal components analysis")
fltr = Filter(classname="weka.filters.unsupervised.attribute.PrincipalComponents")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# load anneal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "anneal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# RemoveUseless
print("RemoveUseless")
fltr = Filter(classname="weka.filters.unsupervised.attribute.RemoveUseless")
fltr.inputformat(data)
filtered = fltr.filter(data)
print("Original header (#att=" + str(data.num_attributes) + "):\n" + str(Instances.template_instances(data)))
print("Filtered header (#att=" + str(filtered.num_attributes) + "):\n" + str(Instances.template_instances(filtered)))

jvm.stop()
Example #20
import os
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()
Example #21
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directly avoids issues with correct quoting in the option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
    print("")
    print(predicted_data)
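    # Hedged follow-up sketch: the collected predictions could be written out as
    # ARFF (assumes the Saver and tempfile imports used elsewhere on this page;
    # the output path is illustrative):
    # saver = Saver(classname="weka.core.converters.ArffSaver")
    # saver.save_file(predicted_data, tempfile.gettempdir() + os.sep + "vote_predictions.arff")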