def train_internal(self):
    best_weights_arr = []
    # create an empty F with source as template
    F = Instances.template_instances(self.source[0])
    withF = False
    print("Find weight for each source data set")
    for source in self.source:
        bestWeight, bestError = self.process_source(source, F, withF)
        best_weights_arr.append(bestWeight)
    # sort the source datasets based on their weights, best first
    self.source = [
        source for _, source in sorted(
            zip(best_weights_arr, self.source),
            reverse=True, key=operator.itemgetter(0))
    ]
    print("Train for final stage")
    withF = True
    while len(self.source) > 0:  # self.max_source_dataset):
        weight, _ = self.process_source(self.source[0], F, withF)
        for inst in self.source[0]:
            inst.weight = weight
        F = Instances.append_instances(F, self.source[0])
        F.class_is_last()
        self.source.pop(0)
    return F
def save_model(model, data, filename):
    """Save the model to the target caching file.

    The caches should be defined in the config file. See README and
    config.sample for reference.

    Args:
        model (obj): The model to be saved. Should be a
            weka.classifiers.Classifier object.
        data (obj): The training set whose header is cached alongside the model.
        filename (str): The target file to save.

    Returns:
        True if the model cache was saved.
    """
    folder = os.path.join('caches', 'model')
    path = os.path.join(folder, filename + '.cache')
    build_if_not_exist(folder)
    serialization.write_all(path, [model, Instances.template_instances(data)])
    localizer_log.msg(
        "Saved cache of {target_name}.".format(target_name='model'))
    return True
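For symmetry, a minimal read-back sketch under the same 'caches/model' layout; load_model and its missing-file handling are illustrative additions, not part of the original module:

import os
import weka.core.serialization as serialization
from weka.classifiers import Classifier
from weka.core.dataset import Instances

def load_model(filename):
    """Load a cached classifier and the dataset header stored next to it."""
    path = os.path.join('caches', 'model', filename + '.cache')
    if not os.path.exists(path):
        return None, None
    objects = serialization.read_all(path)
    # save_model above wrote [classifier, header], so unwrap in that order
    return Classifier(jobject=objects[0]), Instances(jobject=objects[1])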
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: model (using serialization module)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using serialization module)")
    serialization.write_all(
        outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i + 1) + ":")
        if javabridge.get_env().is_instance_of(
                obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(
                obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)

    # save and read object
    helper.print_title("I/O: just model (using Classifier class)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    classifier.serialize(outfile)
    model, _ = Classifier.deserialize(outfile)
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using Classifier class)")
    classifier.serialize(outfile, header=iris_data)
    model, header = Classifier.deserialize(outfile)
    print(model)
    if header is not None:
        print(header)
def __init__(self, model=None, header=None):
    """
    Initializes the container.

    :param model: the model to store (eg Classifier or Clusterer)
    :type model: object
    :param header: the header instances
    :type header: Instances
    """
    super(ModelContainer, self).__init__()
    self.set("Model", model)
    if header is not None:
        header = Instances.template_instances(header)
    self.set("Header", header)
    self._allowed = ["Model", "Header"]
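A brief usage sketch, assuming this is the ModelContainer from the python-weka-wrapper flow package and that classifier/iris_data are built as in the other examples here:

from weka.flow.container import ModelContainer

cont = ModelContainer(model=classifier, header=iris_data)
print(cont.get("Model"))   # the stored classifier
print(cont.get("Header"))  # data-less copy created via Instances.template_instances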
def serialize(self, ser_file, header=None):
    """
    Serializes the clusterer to the specified file.

    :param ser_file: the file to save the model to
    :type ser_file: str
    :param header: the (optional) dataset header to store alongside; recommended
    :type header: Instances
    """
    if (header is not None) and header.num_instances > 0:
        header = Instances.template_instances(header)
    if header is not None:
        serialization_write_all(ser_file, [self, header])
    else:
        serialization_write(ser_file, self)
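A hedged sketch of the matching read path, using the same serialization helpers (serialization_read_all from weka.core.classes); deserialize_clusterer is an illustrative name, not the library method:

from weka.core.classes import serialization_read_all
from weka.core.dataset import Instances
from weka.clusterers import Clusterer

def deserialize_clusterer(ser_file):
    # read back everything written by serialize() above; the header is optional
    objs = serialization_read_all(ser_file)
    clusterer = Clusterer(jobject=objs[0])
    header = Instances(jobject=objs[1]) if len(objs) > 1 else None
    return clusterer, header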
def do_execute(self):
    """
    The actual execution of the actor.

    :return: None if successful, otherwise error message
    :rtype: str
    """
    result = None
    data = self.input.payload
    if isinstance(self.input.payload, Instance):
        inst = self.input.payload
        data = inst.dataset
    elif isinstance(self.input.payload, Instances):
        data = self.input.payload
        inst = None
    append = True
    if self._header is None or (self._header.equal_headers(data) is not None):
        self._header = Instances.template_instances(data, 0)
        outstr = str(data)
        append = False
    elif inst is not None:
        outstr = str(inst)
    else:
        outstr = str(data)
    f = None
    try:
        if append:
            f = open(str(self.resolve_option("output")), "a")
        else:
            f = open(str(self.resolve_option("output")), "w")
        f.write(outstr)
        f.write("\n")
    except Exception:
        result = self.full_name + "\n" + traceback.format_exc()
    finally:
        if f is not None:
            f.close()
    return result
def fit(self, data, targets):
    """
    Trains the estimator.

    :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
    :type data: ndarray
    :param targets: the optional class attribute column, array-like of shape (n_samples,)
    :type targets: ndarray
    :return: itself
    :rtype: WekaTransformer
    """
    if targets is None:
        check_array(data)
    else:
        check_X_y(data, targets)
    d = to_instances(data, y=targets,
                     num_nominal_labels=self._num_nominal_input_labels,
                     num_class_labels=self._num_nominal_output_labels)
    self.header_ = Instances.template_instances(d)
    self._filter.inputformat(d)
    self._filter.filter(d)
    return self
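A usage sketch, assuming this fit belongs to the sklearn-style WekaTransformer wrapper (as the :rtype: suggests) and that the wrapper is constructed from a Weka filter classname; the import path and constructor arguments are assumptions:

import numpy as np
from sklweka.preprocessing import WekaTransformer  # assumed location of the wrapper

X = np.random.randn(20, 4)
tf = WekaTransformer(classname="weka.filters.unsupervised.attribute.Standardize")
tf.fit(X, None)          # stores the header template in tf.header_
X_new = tf.transform(X)  # applies the trained Weka filter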
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    helper.print_title("Iris dataset (incrementally output)")
    for i in iris_data:
        print(i)
    helper.print_title("Iris summary")
    print(Instances.summary(iris_data))
    helper.print_title("Iris attributes")
    for a in iris_data.attributes():
        print(a)
    helper.print_title("Instance at #0")
    print(iris_data.get_instance(0))
    print(iris_data.get_instance(0).values)
    print("Attribute stats (first):\n" + str(iris_data.attribute_stats(0)))
    print("total count (first attribute):\n"
          + str(iris_data.attribute_stats(0).total_count))
    print("numeric stats (first attribute):\n"
          + str(iris_data.attribute_stats(0).numeric_stats))
    print("nominal counts (last attribute):\n"
          + str(iris_data.attribute_stats(iris_data.num_attributes - 1).nominal_counts))
    helper.print_title("Instance values at #0")
    for v in iris_data.get_instance(0):
        print(v)

    # append datasets
    helper.print_title("append datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data2 = Instances.copy_instances(iris_data, 2, 2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    # parenthesize the conditional expression, otherwise "+" binds first
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.append_instances(data1, data2)
    print("Combined:\n" + str(combined))

    # merge datasets
    helper.print_title("merge datasets")
    data1 = Instances.copy_instances(iris_data, 0, 2)
    data1.class_index = -1
    data1.delete_attribute(1)
    data1.delete_first_attribute()
    data2 = Instances.copy_instances(iris_data, 0, 2)
    data2.class_index = -1
    data2.delete_attribute(4)
    data2.delete_attribute(3)
    data2.delete_attribute(2)
    print("Dataset #1:\n" + str(data1))
    print("Dataset #2:\n" + str(data2))
    msg = data1.equal_headers(data2)
    print("#1 == #2 ? " + ("yes" if msg is None else msg))
    combined = Instances.merge_instances(data2, data1)
    print("Combined:\n" + str(combined))

    # load dataset incrementally
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset incrementally: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file, incremental=True)
    iris_data.class_is_last()
    helper.print_title("Iris dataset")
    print(iris_data)
    for inst in loader:
        print(inst)

    # create attributes
    helper.print_title("Creating attributes")
    num_att = Attribute.create_numeric("num")
    print("numeric: " + str(num_att))
    date_att = Attribute.create_date("dat", "yyyy-MM-dd")
    print("date: " + str(date_att))
    nom_att = Attribute.create_nominal("nom", ["label1", "label2"])
    print("nominal: " + str(nom_att))

    # create dataset
    helper.print_title("Create dataset")
    dataset = Instances.create_instances(
        "helloworld", [num_att, date_att, nom_att], 0)
    print(str(dataset))

    # create an instance
    helper.print_title("Create and add instance")
    values = [3.1415926, date_att.parse_date("2014-04-10"), 1.0]
    inst = Instance.create_instance(values)
    print("Instance #1:\n" + str(inst))
    dataset.add_instance(inst)
    values = [2.71828, date_att.parse_date("2014-08-09"), Instance.missing_value()]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    print("Instance #2:\n" + str(inst))
    inst.set_value(0, 4.0)
    print("Instance #2 (updated):\n" + str(inst))
    print("Dataset:\n" + str(dataset))
    dataset.delete_with_missing(2)
    print("Dataset (after delete of missing):\n" + str(dataset))
    values = [(1, date_att.parse_date("2014-07-11"))]
    inst = Instance.create_sparse_instance(
        values, 3, classname="weka.core.SparseInstance")
    print("sparse Instance:\n" + str(inst))
    dataset.add_instance(inst)
    print("dataset with mixed dense/sparse instance objects:\n" + str(dataset))

    # create dataset (lists)
    helper.print_title("Create dataset from lists")
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    y = [randint(0, 1) for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, y, "generated from lists")
    print(dataset2)
    x = [[randint(1, 10) for _ in range(5)] for _ in range(10)]
    dataset2 = ds.create_instances_from_lists(x, name="generated from lists (no y)")
    print(dataset2)

    # create dataset (matrices)
    helper.print_title("Create dataset from matrices")
    x = np.random.randn(10, 5)
    y = np.random.randn(10)
    dataset3 = ds.create_instances_from_matrices(x, y, "generated from matrices")
    print(dataset3)
    x = np.random.randn(10, 5)
    dataset3 = ds.create_instances_from_matrices(x, name="generated from matrices (no y)")
    print(dataset3)

    # create more sparse instances
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    helper.print_title("Create sparse instances using template dataset")
    sparse_data = Instances.template_instances(diabetes_data)
    for i in range(diabetes_data.num_attributes - 1):
        inst = Instance.create_sparse_instance(
            [(i, float(i + 1) / 10.0)], sparse_data.num_attributes,
            classname="weka.core.SparseInstance")
        sparse_data.add_instance(inst)
    print("sparse dataset:\n" + str(sparse_data))

    # simple scatterplot of iris dataset: petalwidth x petallength
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.scatter_plot(
        iris_data, iris_data.attribute_by_name("petalwidth").index,
        iris_data.attribute_by_name("petallength").index,
        percent=50, wait=False)

    # line plot of iris dataset (without class attribute)
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.line_plot(iris_data, atts=range(iris_data.num_attributes - 1),
                  percent=50, title="Line plot iris", wait=False)

    # matrix plot of iris dataset
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    pld.matrix_plot(iris_data, percent=50, title="Matrix plot iris", wait=True)
def createTrainedModel():
    from weka.core.converters import Loader
    from weka.classifiers import Classifier, Evaluation
    from weka.attribute_selection import ASSearch
    from weka.core.classes import Random
    from weka.core.dataset import Instances
    import weka.core.serialization as serialization

    loader = Loader(classname="weka.core.converters.ArffLoader")

    # gender model
    data = loader.load_file(os.path.join(outputModel, "genderTrain.arff"))
    data.class_is_last()
    classi = "weka.classifiers.bayes.NaiveBayes"
    cls = Classifier(classname=classi)
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-1.7976931348623157E308", "-1"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.ChiSquaredAttributeEval")
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)
    cls.build_classifier(data)
    serialization.write_all(
        os.path.join(outputModel, "GenderModel" + ".model"),
        [cls, Instances.template_instances(data)])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("Gender model predictions")
    print(cls)
    # print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())

    # age model
    data = loader.load_file(os.path.join(outputModel, "ageTrain.arff"))
    data.class_is_last()
    # other classifiers that were tried:
    # classi = "weka.classifiers.trees.J48"
    # classi = "weka.classifiers.functions.Logistic"
    # classi = "weka.classifiers.trees.RandomForest"
    # classi = "weka.classifiers.functions.SMOreg"
    classi = "weka.classifiers.bayes.NaiveBayes"
    cls = Classifier(classname=classi)
    cls.build_classifier(data)
    print("Age model predictions")
    print(cls)
    serialization.write_all(
        os.path.join(outputModel, "AgeModel" + ".model"),
        [cls, Instances.template_instances(data)])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    # print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    os._exit(0)
def classification(data, train, test, num_clases):
    baseClassifiers_list = [
        "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.functions.MultilayerPerceptron",
        "weka.classifiers.functions.SMO",
        "weka.classifiers.lazy.IBk",
        "weka.classifiers.lazy.KStar",
        "weka.classifiers.meta.AdaBoostM1",
        "weka.classifiers.meta.Bagging",
        "weka.classifiers.meta.LogitBoost",
        "weka.classifiers.trees.J48",
        "weka.classifiers.trees.DecisionStump",
        "weka.classifiers.trees.LMT",
        "weka.classifiers.trees.RandomForest",
        "weka.classifiers.trees.REPTree",
        "weka.classifiers.rules.PART",
        "weka.classifiers.rules.JRip",
        "weka.classifiers.functions.Logistic",
        "weka.classifiers.meta.ClassificationViaRegression",
        "weka.classifiers.bayes.BayesNet"
    ]
    results_train = pd.DataFrame()
    results_test = pd.DataFrame()

    # symmetric misclassification cost matrices, indexed by number of classes
    cost_matrix_list = [
        "[]",
        "[0]",
        "[0.0 1.0; 1.0 0.0]",
        "[0.0 1.0 2.0; 1.0 0.0 1.0; 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0; 1.0 0.0 1.0 2.0; 2.0 1.0 0.0 1.0; 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0; 1.0 0.0 1.0 2.0 3.0; 2.0 1.0 0.0 1.0 2.0; 3.0 2.0 1.0 0.0 1.0; 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0; 1.0 0.0 1.0 2.0 3.0 4.0; 2.0 1.0 0.0 1.0 2.0 3.0; 3.0 2.0 1.0 0.0 1.0 2.0; 4.0 3.0 2.0 1.0 0.0 1.0; 5.0 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0 6.0; 1.0 0.0 1.0 2.0 3.0 4.0 5.0; 2.0 1.0 0.0 1.0 2.0 3.0 4.0; 3.0 2.0 1.0 0.0 1.0 2.0 3.0; 4.0 3.0 2.0 1.0 0.0 1.0 2.0; 5.0 4.0 3.0 2.0 1.0 0.0 1.0; 6.0 5.0 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0; 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0; 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0; 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0; 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0; 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0; 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0; 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0; 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0; 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0; 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0; 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0; 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0; 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0; 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0; 8.0 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0]",
        "[0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0; 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0; 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0; 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0; 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0 5.0; 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0 4.0; 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0 3.0; 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0 2.0; 8.0 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0 1.0; 9.0 8.0 7.0 6.0 5.0 4.0 3.0 2.0 1.0 0.0]"
    ]

    real_train = []  # the real labels of the training set
    for i in range(train.num_instances):
        real_train.append(train.get_instance(i).values[train.num_attributes - 1])
    results_train['real'] = real_train

    real_test = []  # the real labels of the test set
    for i in range(test.num_instances):
        real_test.append(test.get_instance(i).values[test.num_attributes - 1])
    results_test['real'] = real_test

    num = 0
    for clas in baseClassifiers_list:
        column = "p" + str(num)
        # cost-sensitive wrapper around the base classifier
        classifier = SingleClassifierEnhancer(
            classname="weka.classifiers.meta.CostSensitiveClassifier",
            options=["-cost-matrix", cost_matrix_list[num_clases], "-M", "-S", "1"])
        base = Classifier(classname=clas)
        classifier.classifier = base
        predicted_data_train = None
        predicted_data_test = None
        evaluation = Evaluation(data)
        classifier.build_classifier(train)
        # evaluation.test_model(classifier, train)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification"])
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        # addcls.filter(train)  # trains the classifier
        pred_train = addcls.filter(train)
        pred_test = addcls.filter(test)

        if predicted_data_train is None:
            predicted_data_train = Instances.template_instances(pred_train, 0)
        for n in range(pred_train.num_instances):
            predicted_data_train.add_instance(pred_train.get_instance(n))
        if predicted_data_test is None:
            predicted_data_test = Instances.template_instances(pred_test, 0)
        for n in range(pred_test.num_instances):
            predicted_data_test.add_instance(pred_test.get_instance(n))

        preds_train = []  # labels predicted by the classifier trained in this iteration
        preds_test = []
        for i in range(predicted_data_train.num_instances):
            preds_train.append(predicted_data_train.get_instance(i).values[
                predicted_data_train.num_attributes - 1])
        for i in range(predicted_data_test.num_instances):
            preds_test.append(predicted_data_test.get_instance(i).values[
                predicted_data_test.num_attributes - 1])

        results_train[column] = preds_train
        results_test[column] = preds_test
        num = num + 1
    return results_train, results_test
def saveClassifier(self, filename,
                   path='/home/sbiastoch/Schreibtisch/classifiers/'):
    serialization.write_all(
        path + filename,
        [self.classifier, Instances.template_instances(self.data)])
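A possible inverse under the same default path; loadClassifier is not part of the original class, just an illustrative counterpart:

def loadClassifier(self, filename,
                   path='/home/sbiastoch/Schreibtisch/classifiers/'):
    # read_all returns the objects in the order they were written above:
    # the classifier first, then the dataset header
    objects = serialization.read_all(path + filename)
    self.classifier = Classifier(jobject=objects[0])
    self.data = Instances(jobject=objects[1])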
# Discretize
print("Discretize numeric attributes (supervised)")
fltr = Filter(classname="weka.filters.supervised.attribute.Discretize")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# PCA
print("Principal components analysis")
fltr = Filter(classname="weka.filters.unsupervised.attribute.PrincipalComponents")
fltr.inputformat(data)
filtered = fltr.filter(data)
print(filtered)

# load anneal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "anneal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# RemoveUseless
print("RemoveUseless")
fltr = Filter(classname="weka.filters.unsupervised.attribute.RemoveUseless")
fltr.inputformat(data)
filtered = fltr.filter(data)
print("Original header (#att=" + str(data.num_attributes) + "):\n"
      + str(Instances.template_instances(data)))
print("Filtered header (#att=" + str(filtered.num_attributes) + "):\n"
      + str(Instances.template_instances(filtered)))

jvm.stop()
import os

import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove",
              options=["-R", "3"])
fltr.inputformat(data)  # "set_inputformat" is the deprecated name of this method
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter:
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directly avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
    print("")
    print(predicted_data)
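To keep the collected predictions around, the dataset can be written out with a Saver, as in the weather.nominal example above; a minimal sketch where the temp-file output path is just a placeholder:

import os
import tempfile
from weka.core.converters import Saver

saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(predicted_data,
                tempfile.gettempdir() + os.sep + "vote_predictions.arff")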