def filterUnusedFeatureFromList(self, data, unusedFunctionList):
    filteredData = data
    for attribute in unusedFunctionList:
        remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                        options=["-E", "^" + attribute + ".*$"])
        remove.set_inputformat(filteredData)
        filteredData = remove.filter(filteredData)
    return filteredData
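# A hedged usage sketch (not from the original source): driving the same RemoveByName loop
# standalone with python-weka-wrapper3, whose Filter API uses inputformat() rather than the
# older set_inputformat() seen in the snippet above. The ARFF path and attribute name
# prefixes are hypothetical placeholders.
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter

jvm.start()
data = Loader(classname="weka.core.converters.ArffLoader").load_file("data/example.arff")  # hypothetical path
for attribute in ["unusedFeatureA", "unusedFeatureB"]:  # hypothetical name prefixes
    remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                    options=["-E", "^" + attribute + ".*$"])
    remove.inputformat(data)
    data = remove.filter(data)
jvm.stop()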
def attributeSelector(self, data, selectNum):
    attributeSelector = Filter(
        classname="weka.filters.supervised.attribute.AttributeSelection",
        options=["-S", "weka.attributeSelection.Ranker -T -1.7976931348623157E308 -N " + str(selectNum),
                 "-E", "weka.attributeSelection.InfoGainAttributeEval"])
    attributeSelector.set_inputformat(data)
    data = attributeSelector.filter(data)
    return data
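# A minimal standalone sketch (my assumptions, pww3 API): the AttributeSelection meta-filter
# above ranks attributes by information gain and keeps the top selectNum of them; -T set to
# -1.7976931348623157E308 (the most negative double) disables the threshold so only -N limits
# the selection. The dataset path is hypothetical; the class attribute must be set first.
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter

jvm.start()
data = Loader(classname="weka.core.converters.ArffLoader").load_file("data/example.arff")  # hypothetical
data.class_is_last()
selector = Filter(
    classname="weka.filters.supervised.attribute.AttributeSelection",
    options=["-S", "weka.attributeSelection.Ranker -T -1.7976931348623157E308 -N 10",
             "-E", "weka.attributeSelection.InfoGainAttributeEval"])
selector.inputformat(data)
reduced = selector.filter(data)
print(reduced.num_attributes)  # 10 selected attributes plus the class
jvm.stop()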
def getSetDataBySetIndex(self, data, index):
    # cut the feature set out
    featureTable = FeatureTable()
    startIndexList = featureTable.getEachSetStartIndex()
    start = startIndexList[index]
    end = startIndexList[index + 1] - 1
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-V", "-R", str(start) + "-" + str(end) + ",last"])
    remove.set_inputformat(data)
    filteredData = remove.filter(data)
    return filteredData
def remove_correct_classified(self, invert=False):
    options = [
        '-W', self.classifier.to_commandline(),
        '-C', str(self.class_index),  # class index
        # '-F', '0',    # folds
        # '-T', '0.1',  # threshold for numeric classes
        '-I', '0',      # max iterations
        '-V' if not invert else ''    # invert selection
    ]
    classname = "weka.filters.unsupervised.instance.RemoveMisclassified"
    remove = Filter(classname=classname, options=options)
    remove.inputformat(self.data)
    self.data = remove.filter(self.data)
def eliminateUnusedFeature(self, trainData, testData=None):
    trainData.set_class_index(trainData.num_attributes() - 1)  # set class attribute
    filteredTrainData = trainData
    filteredTestData = testData
    attribute_index = 0
    while attribute_index < filteredTrainData.num_attributes() - 1:
        sampleCoverage = 0
        # check the value of the current feature in each instance
        for instance_index in range(0, filteredTrainData.num_instances()):
            instance = filteredTrainData.get_instance(instance_index)
            value = instance.get_value(attribute_index)
            if value > 0:
                sampleCoverage += 1
        if sampleCoverage == 0:
            # the Remove filter's -R indices start from 1
            remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                            options=["-R", str(attribute_index + 1)])
            remove.set_inputformat(filteredTrainData)
            filteredTrainData = remove.filter(filteredTrainData)
            if filteredTestData:
                remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                                options=["-R", str(attribute_index + 1)])
                remove.set_inputformat(filteredTestData)
                filteredTestData = remove.filter(filteredTestData)
        else:
            attribute_index += 1
    return [filteredTrainData, filteredTestData]
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, data, inst_no=True)

    # using a filtered clusterer
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    fclusterer = FilteredClusterer()
    fclusterer.clusterer = clusterer
    fclusterer.filter = remove
    fclusterer.build_clusterer(data)
    print(fclusterer)

    # load a dataset incrementally and build the clusterer incrementally
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    clusterer = Clusterer("weka.clusterers.Cobweb")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(iris_inc)
    iris_filtered = remove.outputformat()
    clusterer.build_clusterer(iris_filtered)
    for inst in loader:
        remove.input(inst)
        inst_filtered = remove.output()
        clusterer.update_clusterer(inst_filtered)
    clusterer.update_finished()
    print(clusterer.to_commandline())
    print(clusterer)
    print(clusterer.graph)
    plg.plot_dot_graph(clusterer.graph)
def Bag_J48graft(data, rnm):
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48graft",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.Bagging",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV",
                                   options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)

    with open(rnm + '_Bag_J48graft_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    with open(rnm + '_Bag_J48graft_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_Bag_j48graft_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evaluation.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evaluation.class_details(), file=f2)
    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Bag_J48graft_ROC.png', wait=False)
    value_Bag_J48graft = str(evaluation.percent_correct)
    return value_Bag_J48graft
def J48(data, rnm):
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.J48",
                               options=["-C", "0.25", "-M", "2"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV",
                                   options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    fc.build_classifier(data)

    with open(rnm + '_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    with open(rnm + '_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', pred_output.buffer_content(), file=f1)
    with open(rnm + '_J48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', evl.summary(), file=f2)
        print('\n\n\n', file=f2)
        print(evl.class_details(), file=f2)
    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_J48_ROC.png', wait=False)
    value_J48 = str(evl.percent_correct)
    return value_J48
def load_classifier(lang, tag):
    classifier = {}
    if lang == LANG_ID and tag == "nnp":
        objects = serialization.read_all(ID_MODEL_NNP)
    elif lang == LANG_ID and tag == "nn":
        objects = serialization.read_all(ID_MODEL_NN)
    elif lang == LANG_ID and tag == "cdp":
        objects = serialization.read_all(ID_MODEL_CDP)
    elif lang == LANG_EN and tag == "nnp":
        objects = serialization.read_all(EN_MODEL_NNP)
    elif lang == LANG_EN and tag == "jj":
        objects = serialization.read_all(EN_MODEL_JJ)
    elif lang == LANG_EN and tag == "nn":
        objects = serialization.read_all(EN_MODEL_NN)
    elif lang == LANG_EN and tag == "vbp":
        objects = serialization.read_all(EN_MODEL_VBP)
    elif lang == LANG_EN and tag == "cd":
        objects = serialization.read_all(EN_MODEL_CD)
    elif lang == LANG_EN and tag == "vb":
        objects = serialization.read_all(EN_MODEL_VB)
    classifier['classifier'] = Classifier(jobject=objects[0])
    classifier['filter'] = Filter(jobject=objects[1])
    return classifier
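# A refactoring sketch (my suggestion, not the source): the if/elif ladder above can be
# collapsed into a table lookup. The LANG_*/model-path constants, the serialization module,
# and the Classifier/Filter imports are assumed to come from the surrounding module.
def load_classifier_table(lang, tag):
    MODEL_PATHS = {
        (LANG_ID, "nnp"): ID_MODEL_NNP,
        (LANG_ID, "nn"): ID_MODEL_NN,
        (LANG_ID, "cdp"): ID_MODEL_CDP,
        (LANG_EN, "nnp"): EN_MODEL_NNP,
        (LANG_EN, "jj"): EN_MODEL_JJ,
        (LANG_EN, "nn"): EN_MODEL_NN,
        (LANG_EN, "vbp"): EN_MODEL_VBP,
        (LANG_EN, "cd"): EN_MODEL_CD,
        (LANG_EN, "vb"): EN_MODEL_VB,
    }
    objects = serialization.read_all(MODEL_PATHS[(lang, tag)])
    return {"classifier": Classifier(jobject=objects[0]),
            "filter": Filter(jobject=objects[1])}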
def filter(self):
    """
    Returns the filter.

    :return: the filter
    :rtype: Filter
    """
    return Filter(jobject=javabridge.call(self.jobject, "getFilter", "()Lweka/filters/Filter;"))
def __init__(self, index=0, inference="ExactInference", ghostAgents=None):
    BustersAgent.__init__(self, index, inference, ghostAgents)
    self.previousDistances = [0, 0, 0, 0]
    jvm.start(max_heap_size="512m")
    self.loader = Loader(classname="weka.core.converters.ArffLoader")
    self.data = self.loader.load_file("data/game_toCluster.arff")
    self.data.delete_last_attribute()
    self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                               options=["-N", "10", "-S", "4", "-I", "500"])
    self.clusterer.build_clusterer(self.data)
    self.inst = ""
    self.data = self.loader.load_file("data/game_toCluster.arff")
    addCluster = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
                        options=["-W", "weka.clusterers.SimpleKMeans -N 10 -S 4 -I 500", "-I", "last"])
    addCluster.inputformat(self.data)
    filtered = addCluster.filter(self.data)
    self.f = open('data/addCluster.arff', 'w+')
    self.f.write(str(filtered))
    self.f.flush()  # make sure the ARFF is on disk before re-reading it
    self.clustered_data = self.classifyData('data/addCluster.arff')
def filterOutUnnecessaryAPIAndEvaluateOurApproach(self, ourApproachFile, apiFile, indexInTable, methodName, databaseTable, csvFilePath):
    outputStr = methodName + ","
    resultList = []
    # Get the whole feature set of our approach
    filteredData = self.load_Arff(ourApproachFile)
    # Get the selected API features; the unselected APIs come back in a list
    filterOutList = self.attributeSelectionBasedOnRankingInDatabase(apiFile, indexInTable, databaseTable, "")[1]
    # Remove the unselected APIs
    for functionName in filterOutList:
        functionName = functionName.split("(")[0] + "\\(\\)"
        functionName = functionName.replace('$', '\\$')
        remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                        options=["-E", "^" + functionName + ".*$"])
        remove.set_inputformat(filteredData)
        filteredData = remove.filter(filteredData)
    featureNum = filteredData.num_attributes() - 1
    print("featureNum: " + str(featureNum))
    if csvFilePath != "":
        self.writeTenScaledTitleManual(featureNum, csvFilePath)
    for attributeStr in filteredData.attributes():
        print(attributeStr)
    # Run the ten-scaled generation and evaluation
    step = 10
    while step < featureNum:
        roundData = self.attributeSelector(filteredData, step)
        classifier = self.algorithmPicker(roundData, indexInTable)
        evaluation = self.evaluation(classifier, roundData)
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        step += 10
    classifier = self.algorithmPicker(filteredData, indexInTable)
    evaluation = self.evaluation(classifier, filteredData)
    resultList.append("{:.2f}".format(evaluation.percent_correct()))
    # Write out to the CSV file
    for item in resultList:
        outputStr += item + ","
    outputStr = outputStr[0:-1] + "\n"
    self.writeToPath(csvFilePath, outputStr)
def filter_data(self, data):
    print("Filtering Data..\n")
    flter = Filter(classname="weka.filters.supervised.attribute.AttributeSelection")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                          options=["-P", "1", "-E", "1"])
    assearch = ASSearch(classname="weka.attributeSelection.BestFirst",
                        options=["-D", "1", "-N", "5"])
    flter.set_property("evaluator", aseval.jobject)
    flter.set_property("search", assearch.jobject)
    flter.inputformat(data)
    filtered = flter.filter(data)
    return filtered
def discretize(data, index, file):
    discretizer = Filter(classname='weka.filters.supervised.attribute.Discretize',
                         options=["-R", str(index), "-precision", "6"])
    discretizer.inputformat(data)
    newData = discretizer.filter(data)
    discretizer.serialize(file)
    return newData
def remove(data, indices, file):
    cmdIndex = ','.join(indices)
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", cmdIndex])
    remove.inputformat(data)
    newData = remove.filter(data)
    remove.serialize(file)
    return newData
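# A sketch of batch filtering (assumed file paths): when the same Remove filter must be
# applied to a train/test pair, call inputformat() once on the training data and then filter
# both sets, so the output headers stay identical and compatible.
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.filters import Filter

jvm.start()
loader = Loader(classname="weka.core.converters.ArffLoader")
train = loader.load_file("data/train.arff")  # hypothetical
test = loader.load_file("data/test.arff")    # hypothetical
rm = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "1,3"])
rm.inputformat(train)        # initialize the header from the training data only
train_f = rm.filter(train)
test_f = rm.filter(test)     # reuse the initialized filter on the test set
jvm.stop()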
def use_filter(data):
    """
    Uses the AttributeSelection filter for attribute selection.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n2. Filter")
    flter = Filter(classname="weka.filters.supervised.attribute.AttributeSelection")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    flter.set_property("evaluator", aseval.jobject)
    flter.set_property("search", assearch.jobject)
    flter.inputformat(data)
    filtered = flter.filter(data)
    print(str(filtered))
def stringToNominal(data, indices, file):
    cmdIndex = ','.join(indices)
    stn = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal",
                 options=["-R", cmdIndex])
    stn.inputformat(data)
    newData = stn.filter(data)
    stn.serialize(file)
    return newData
def get_rule_covering_inst(classifier, data, inst_idx):
    """
    Finds the rule in a learned JRIP model that covers an instance.

    :param classifier: trained JRIP model
    :param data: weka dataset
    :param inst_idx: instance ID to find the corresponding rule of
    """
    merge_filter = Filter(classname="weka.filters.supervised.attribute.ClassOrder",
                          options=["-C", "0"])
    merge_filter.inputformat(data)
    ordered_data = merge_filter.filter(data)
    rset = classifier.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        if r.covers(data.get_instance(inst_idx).jobject):
            print("Instance is covered by current rule:",
                  str(r.toString(ordered_data.class_attribute.jobject)))
            break
def runBayes(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", bound])
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    remove.inputformat(data)
    filtered = remove.filter(data)
    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1))
    print(evl.percent_correct)
    # print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
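# An alternative sketch (my suggestion, not the author's code): bundling the Remove filter
# with NaiveBayes in a FilteredClassifier re-applies the filter inside every cross-validation
# fold instead of once up front as runBayes does. For an unsupervised filter like Remove the
# result is the same; the distinction matters mainly for supervised filters. The CSV path and
# column range are hypothetical.
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation, FilteredClassifier
from weka.filters import Filter

jvm.start()
data = Loader(classname="weka.core.converters.CSVLoader").load_file("data/example.csv")  # hypothetical
data.class_is_first()
fc = FilteredClassifier()
fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "2-5"])  # hypothetical range
fc.classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
evl = Evaluation(data)
evl.crossvalidate_model(fc, data, 10, Random(1))
print(evl.percent_correct)
jvm.stop()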
def weka_bayesnet(filearffpath='data/datatobayes.arff'):
    """Simple calling of the Bayesian network from Python."""
    # Preparing the data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(filearffpath)
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    remove.inputformat(data)
    filtered = data  # remove.filter(data)

    # Classifier test
    from weka.classifiers import Classifier, Evaluation
    from weka.core.classes import Random
    filtered.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet", options=['-D'])
    evaluation = Evaluation(filtered)
    evaluation.crossvalidate_model(classifier, filtered, 10, Random(42))
    return evaluation.area_under_roc(class_index=0)  # ROC AUC, no std over the k folds
def make_partition(data, attributes, part='normal'):
    if part == 'normal':
        value = 'last'
    elif part == 'anomalous':
        value = 'first'
    keep_normal = Filter(classname='weka.filters.unsupervised.instance.RemoveWithValues',
                         options=['-C', 'last', '-L', value])
    keep_normal.inputformat(data)
    data_normal = keep_normal.filter(data)
    remove = Filter(classname='weka.filters.unsupervised.attribute.Remove',
                    options=['-R', 'last'])
    remove.inputformat(data)
    data_normal = remove.filter(data_normal)
    N = data_normal.num_instances
    return data_normal, N
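# A usage sketch under stated assumptions: make_partition expects the last attribute to be a
# binary nominal class; which label ends up in which partition depends on the label order in
# the ARFF header, so check the counts on your own data. The path is hypothetical and the
# 'attributes' argument is unused by the function above.
import weka.core.jvm as jvm
from weka.core.converters import Loader

jvm.start()
data = Loader(classname="weka.core.converters.ArffLoader").load_file("data/traffic.arff")  # hypothetical
normal, n_normal = make_partition(data, None, part='normal')
anomalous, n_anomalous = make_partition(data, None, part='anomalous')
print(n_normal, n_anomalous)
jvm.stop()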
def load(path, db):
    nominals = [
        49,   # dev_double_fp_config
        50,   # dev_endian_little
        51,   # dev_execution_capabilities
        52,   # dev_extensions
        54,   # dev_global_mem_cache_type
        57,   # dev_host_unified_memory
        63,   # dev_image_support
        65,   # dev_local_mem_type
        96,   # dev_queue_properties
        97,   # dev_single_fp_config
        98,   # dev_type
        100,  # dev_vendor_id
    ]
    nominal_indices = ",".join([str(index) for index in nominals])
    force_nominal = ["-N", nominal_indices]

    # Load data from CSV.
    dataset = Dataset.load_csv(path, options=force_nominal)
    dataset.__class__ = Dataset

    # Set class index and database connection.
    dataset.class_index = -1
    dataset.db = db

    # Create string->nominal type attribute filter, ignoring the first
    # attribute (scenario ID), since we're not classifying with it.
    string_to_nominal = WekaFilter(classname="weka.filters.unsupervised.attribute.StringToNominal",
                                   options=["-R", "2-last"])
    string_to_nominal.inputformat(dataset.instances)

    # Create filtered dataset, and swap data around.
    filtered = string_to_nominal.filter(dataset.instances)
    dataset.instances = filtered

    return dataset
def run(dataset_path):
    start = time.time()

    ### load a dataset ###
    train_data = model.load_dataset_weka(dataset_path)

    to_nominal_class_filter = Filter(classname="weka.filters.unsupervised.attribute.NumericToNominal",
                                     options=["-R", "last"])
    to_nominal_class_filter.inputformat(train_data)

    ### Naive Bayes: choose the classifier you want ###
    classifier = Classifier("weka.classifiers.bayes.NaiveBayesMultinomial")
    # classifier = Classifier("weka.classifiers.bayes.NaiveBayes")
    # classifier.build_classifier(train_data)

    evaluation = Evaluation(to_nominal_class_filter.filter(train_data))
    evaluation.crossvalidate_model(classifier, to_nominal_class_filter.filter(train_data), 10, Random(42))
    # print(evaluation.summary())
    # print(evaluation.class_details())
    # print(evaluation.matrix())

    print(time.time() - start)
def get_weka_breast_cancer(self):
    split_ratio = 0.2
    loader = Loader(classname="weka.core.converters.CSVLoader")
    loader.options = ['-F', ',']
    dataset = loader.load_file(os.path.join(DATASET_DIR, 'uci-20070111-breast-cancer.csv'))
    dataset.class_is_last()
    remove = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage",
                    options=["-P", str(split_ratio * 100)])
    remove.inputformat(dataset)
    train_set = remove.filter(dataset)
    remove = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage",
                    options=["-P", str(split_ratio * 100), "-V"])
    remove.inputformat(dataset)
    test_set = remove.filter(dataset)
    labels = dataset.class_attribute.values
    return train_set, test_set, labels
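# A generalizing sketch (not from the source): the paired RemovePercentage calls above are a
# reusable pattern, so a small helper can produce any percentage split; 'dataset' is any
# loaded Instances object with the class attribute already set.
from weka.filters import Filter

def percentage_split(dataset, test_ratio):
    """Return (train_set, test_set) carved with RemovePercentage; test_ratio in (0, 1)."""
    remove = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage",
                    options=["-P", str(test_ratio * 100)])          # drop the test slice
    remove.inputformat(dataset)
    train_set = remove.filter(dataset)
    keep = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage",
                  options=["-P", str(test_ratio * 100), "-V"])      # -V inverts: keep the slice
    keep.inputformat(dataset)
    test_set = keep.filter(dataset)
    return train_set, test_set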
def _pre_process_to_classification(self, dataset):
    filter_data = Filter(classname='weka.filters.unsupervised.attribute.MathExpression',
                         options=['-unset-class-temporarily', '-E', "ifelse ( A>0, 1, 0 )",
                                  '-V', '-R', 'last'])
    filter_data.set_inputformat(dataset)
    filtered = filter_data.filter(dataset)

    discretize_data = Filter(classname='weka.filters.unsupervised.attribute.NumericToNominal',
                             options=['-R', 'last'])
    discretize_data.set_inputformat(filtered)
    discretized = discretize_data.filter(filtered)
    return discretized
def set_params(self, **params):
    """
    Sets the options for the classifier, expects 'classname' and 'options'.

    :param params: the parameter dictionary
    :type params: dict
    """
    if len(params) == 0:
        return
    if "classname" not in params:
        raise Exception("Cannot find 'classname' in parameters!")
    if "options" not in params:
        raise Exception("Cannot find 'options' in parameters!")
    self._classname = params["classname"]
    self._options = params["options"]
    self._filter = Filter(classname=self._classname, options=self._options)
    self._num_nominal_input_labels = None
    if "num_nominal_input_labels" in params:
        self._num_nominal_input_labels = params["num_nominal_input_labels"]
    self._num_nominal_output_labels = None
    if "num_nominal_output_labels" in params:
        self._num_nominal_output_labels = params["num_nominal_output_labels"]
def main():
    jvm.start()
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("train_sorted.arff")
    numofStores = 1115
    for storeNum in range(0, numofStores):
        tempData = data
        removeUpper = Filter(classname="weka.filters.unsupervised.instance.RemoveWithValues",
                             options=["-S", str(storeNum + 2) + ".0", "-C", "first",
                                      "-L", "first-last", "-V"])
        removeUpper.inputformat(data)
        tempData = removeUpper.filter(data)
        removeLower = Filter(classname="weka.filters.unsupervised.instance.RemoveWithValues",
                             options=["-S", str(storeNum + 1) + ".0", "-C", "first",
                                      "-L", "first-last"])
        removeLower.inputformat(tempData)
        tempData = removeLower.filter(tempData)
        # remove the storeID attribute
        tempData.delete_first_attribute()
        saver = Saver(classname="weka.core.converters.ArffSaver")
        saver.save_file(tempData, "stores/store" + str(storeNum + 1) + ".arff")
        print('Saved Store' + str(storeNum + 1))
    jvm.stop()
def obtainBayesNet(file):
    # The path of the arff extension file must be put.
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    # In the case of this specific data set, the first two attributes were removed since they
    # represent the name and ranking, which are unique values that would affect the classification.
    # Depending on the data set, certain attributes must be removed.
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    # It is specified that the class value is the last attribute.
    data.class_is_last()
    # Define the classifier to be used.
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))
    # The ROC-AUC is extracted from the string that is received from Weka.
    info = evaluation.class_details()
    roc_area = float(info[406:411])
    return roc_area
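# A safer variant (my suggestion): instead of slicing class_details() at fixed string
# offsets, Evaluation exposes the ROC AUC directly. folderPathOfArffFiles and kFold are
# module-level globals assumed from the surrounding source.
from weka.core import converters
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter

def obtainBayesNetAUC(file):
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))
    return evaluation.area_under_roc(0)  # AUC of the first class label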
def load(path, db):
    nominals = [
        49,  # dev_global_mem_cache_type
        52,  # dev_host_unified_memory
        54,  # dev_local_mem_type
        56,  # dev_type
        57,  # dev_vendor
    ]
    nominal_indices = ",".join([str(index) for index in nominals])
    force_nominal = ["-N", nominal_indices]

    # Load data from CSV.
    dataset = Dataset.load_csv(path, options=force_nominal)
    dataset.__class__ = Dataset

    # Set class index and database connection.
    dataset.class_index = -1
    dataset.db = db

    # Create string->nominal type attribute filter, ignoring the first
    # attribute (scenario ID), since we're not classifying with it.
    string_to_nominal = WekaFilter(classname="weka.filters.unsupervised.attribute.StringToNominal",
                                   options=["-R", "2-last"])
    string_to_nominal.inputformat(dataset.instances)

    # Create filtered dataset, and swap data around.
    filtered = string_to_nominal.filter(dataset.instances)

    # Create nominal->binary type attribute filter, ignoring the
    # first attribute (scenario ID), since we're not classifying with it.
    n2b = WekaFilter(classname="weka.filters.unsupervised.attribute.NominalToBinary",
                     options=["-R", "2-last"])
    n2b.inputformat(filtered)
    dataset.instances = n2b.filter(filtered)

    return dataset
jvm.start()

# load ionosphere
fname = data_dir + os.sep + "ionosphere.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

for equal in ["", "-F"]:
    print("\nEqual frequency binning? " + str(equal == "-F") + "\n")
    for bins in [0, 40, 10, 5, 2]:
        if bins > 0:
            fltr = Filter(classname="weka.filters.unsupervised.attribute.Discretize",
                          options=["-B", str(bins), equal])
            fltr.inputformat(data)
            filtered = fltr.filter(data)
        else:
            filtered = data
        cls = Classifier(classname="weka.classifiers.trees.J48")
        # cross-validate
        evl = Evaluation(filtered)
        evl.crossvalidate_model(cls, filtered, 10, Random(1))
        # build classifier on full dataset
        cls.build_classifier(filtered)
        # get size of tree from model strings
        lines = str(cls).split("\n")
        nodes = "N/A"
        for line in lines:
def PreprocessData(Data, option):
    IDs = []
    if option['idFlag']:
        # the last attribute is the id
        attributeremove = AttributeRemove()
        attributeremove.setInvertSelection(Boolean(True))  # keep only the last attribute, which is the ID
        attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
        attributeremove.setInputFormat(Data)
        IDs = Filter.useFilter(Data, attributeremove)
        attributeremove = AttributeRemove()
        attributeremove.setInvertSelection(Boolean(False))  # remove IDs from dataset
        attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
        attributeremove.setInputFormat(Data)
        Data = Filter.useFilter(Data, attributeremove)

    # set the class index - the index of the dependent variable
    Data.setClassIndex(Data.numAttributes() - 1)

    # removal of classes
    if option['rmClassFlag']:
        # instances with the specified class label must be removed
        ClassLabel = option['rmClass']
        removewithvalues = RemoveWithValues()
        removewithvalues.setAttributeIndex(String('last'))
        removewithvalues.setNominalIndices(String(str(ClassLabel)))
        removewithvalues.setInputFormat(Data)
        newData = Filter.useFilter(Data, removewithvalues)
    else:
        newData = Data

    if option['weightFlag']:
        # instances should be weighted inversely to the number of samples per class
        if Data.numClasses() == 2:
            # weight instances with the reciprocal of the class sample counts
            numInstancesC1 = 0
            numInstancesC2 = 0
            # get the numerical value of the class attribute for the first class, since we don't know it
            classLabel = newData.instance(1).classAttribute()
            c1 = newData.instance(1).value(classLabel)
            # find the number of instances per class
            for cnt in range(0, newData.numInstances()):
                if newData.instance(cnt).value(classLabel) == c1:
                    numInstancesC1 = numInstancesC1 + 1
                else:
                    numInstancesC2 = numInstancesC2 + 1
            # calculate weights
            weightC1 = numInstancesC2 / (numInstancesC2 + numInstancesC1 + 0.0)
            weightC2 = numInstancesC1 / (numInstancesC2 + numInstancesC1 + 0.0)
            # assign weights to the instances of each class
            for cnt in range(0, newData.numInstances()):
                if newData.instance(cnt).value(classLabel) == c1:
                    newData.instance(cnt).setWeight(weightC1)
                else:
                    newData.instance(cnt).setWeight(weightC2)
        elif Data.numClasses() > 2:
            # if there are more than two classes ...
            numClasses = Data.numClasses()
            stats = Data.attributeStats(Data.classIndex())
            AttributeStats = stats.nominalCounts
            classLabels = Data.instance(1).classAttribute()
            # assign weights to the instances of each class
            cnt = 0
            sumWeights = 0.0
            numInstancesPerClass = {}
            weightPerClass = {}
            mapClassLabels = {}
            for e in classLabels.enumerateValues():
                numInst = AttributeStats[cnt] + 0.0
                w = 1.0 / numInst
                mapClassLabels.update({e: cnt})
                weightPerClass.update({cnt: w})
                numInstancesPerClass.update({cnt: numInst})
                sumWeights = sumWeights + w
                cnt = cnt + 1
            # normalize weights
            for k in weightPerClass.keys():
                weightPerClass[k] = weightPerClass[k] / sumWeights
            for cnt in range(0, newData.numInstances()):
                w = weightPerClass[newData.instance(cnt).value(classLabels)]
                newData.instance(cnt).setWeight(w)
    return newData, IDs
def attributeSelectionBasedOnRankingInDatabase(self, trainingData, indexInTable, databaseTable, csvFilePath, testingData=None):
    featureNum = trainingData.num_attributes() - 1
    outputStr = ""
    outputStr += databaseTable + ","

    # select from the database vector difference
    featureList3 = []
    wholefeatureList = []
    dbmgr = permissionMappingManager(databasePath)
    for row in dbmgr.query("select * from " + databaseTable):
        featureList3.append(row[0])
        wholefeatureList.append(row[0])

    bestRemainFilterList = []
    resultList = []
    digit = len(featureList3) % 10
    bestAccuracy = 0
    bestTrainingData = None
    bestTestingData = None
    bestEvaluation = None

    classifier = self.algorithmPicker(trainingData, indexInTable)
    evaluation = self.evaluation(classifier, trainingData, testingData)
    if evaluation.percent_correct() >= bestAccuracy:
        bestAccuracy = evaluation.percent_correct()
        bestTrainingData = trainingData
        bestTestingData = testingData
        bestRemainFilterList = list(featureList3)
        bestEvaluation = evaluation
    print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct())
          + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
    resultList.append("{:.2f}".format(evaluation.percent_correct()))

    if digit > 0:
        for i in range(0, digit):
            functionName = featureList3.pop().split("(")[0] + "\\(\\)"
            functionName = functionName.replace('$', '\\$')
            remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                            options=["-E", "^" + functionName + ".*$"])
            remove.set_inputformat(trainingData)
            trainingData = remove.filter(trainingData)
            if testingData:
                remove.set_inputformat(testingData)
                testingData = remove.filter(testingData)
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct())
              + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))

    while trainingData.num_attributes() - 1 > 10:
        for i in range(0, 10):
            functionName = featureList3.pop().split("(")[0] + "\\(\\)"
            functionName = functionName.replace('$', '\\$')
            remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                            options=["-E", "^" + functionName + ".*$"])
            remove.set_inputformat(trainingData)
            trainingData = remove.filter(trainingData)
            if testingData:
                remove.set_inputformat(testingData)
                testingData = remove.filter(testingData)
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct())
              + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))

    resultList.reverse()
    filteredFeatureList = []
    for item in wholefeatureList:
        if item not in bestRemainFilterList:
            filteredFeatureList.append(item)
    for item in resultList:
        outputStr += item + ","
    outputStr = outputStr[0:-1] + "\n"
    print(outputStr)
    self.writeToPath(csvFilePath, outputStr)
    accuracyStr = "{:.2f}".format(bestAccuracy)
    return [bestEvaluation, bestTrainingData, bestTestingData, resultList]
from weka.filters import Filter

toBeRemoved = []
for attribute in range(0, dataTrain.attributes().data.class_index):
    if dataTrain.attribute_stats(attribute).missing_count == dataTrain.attributes().data.num_instances \
            and dataTest.attribute_stats(attribute).missing_count == dataTest.attributes().data.num_instances:
        sys.exit("Fold has full missing column")
    if (dataTrain.attribute_stats(attribute).missing_count / dataTrain.attributes().data.num_instances) > 0.5 \
            and (dataTest.attribute_stats(attribute).missing_count / dataTest.attributes().data.num_instances) > 0.5:
        toBeRemoved.append(str(attribute + 1))  # Remove expects 1-based indices

Remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                options=['-R', ','.join(toBeRemoved)])
Remove.inputformat(dataTrain)
dataTrain = Remove.filter(dataTrain)
Remove.inputformat(dataTest)
dataTest = Remove.filter(dataTest)

# ReplaceMV = Filter(classname="weka.filters.unsupervised.attribute.ReplaceMissingValues")
# ReplaceMV.inputformat(dataTrain)
# dataTrain = ReplaceMV.filter(dataTrain)
# ReplaceMV.inputformat(dataTest)
# dataTest = ReplaceMV.filter(dataTest)

FS = Filter(classname="weka.filters.supervised.attribute.AttributeSelection",
            options=['-E', 'weka.attributeSelection.CfsSubsetEval -P 1 -E 1',
                     '-S', "weka.attributeSelection.GreedyStepwise -T -1.7976931348623157E308 -N -1 -num-slots 1"])
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# output header
print(Instances.template_instances(data))

# remove attribute no 3
print("\nRemove attribute no 3")
fltr = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
fltr.set_inputformat(data)
filtered = fltr.filter(data)

# output header
print(Instances.template_instances(filtered))

# save modified dataset
saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(filtered, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()
    usage()
    return 1

options = {'idFlag': True, 'weightFlag': False, 'rmClassFlag': False, 'rmClass': 0}

# read the first dataset
fn = inputList[0]
fid = FileReader(fn)
Data = Instances(fid)
Data, IDs = PreprocessData(Data, options)

# remove class label
attributeremove = AttributeRemove()
attributeremove.setInvertSelection(Boolean(False))  # remove class labels from dataset
attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
attributeremove.setInputFormat(Data)
newData = Filter.useFilter(Data, attributeremove)

# loop over input arff files
cnt = Data.numAttributes()
for fnCnt in range(1, len(inputList)):
    fn = inputList[fnCnt]
    fid = FileReader(fn)
    Data = Instances(fid)
    Data, IDs = PreprocessData(Data, options)
    # keep only the last attribute, which is the class label
    attributeremove = AttributeRemove()
    attributeremove.setInvertSelection(Boolean(True))
    attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
    attributeremove.setInputFormat(Data)
    labels = Filter.useFilter(Data, attributeremove)
    # remove class labels from dataset
    attributeremove = AttributeRemove()
    attributeremove.setInvertSelection(Boolean(False))
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.filters import Filter

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# simulate the 10 train/test pairs of cross-validation
evl = Evaluation(data)
for i in range(1, 11):
    # create train set
    remove = Filter(classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
                    options=["-N", "10", "-F", str(i), "-S", "1", "-V"])
    remove.inputformat(data)
    train = remove.filter(data)
    # create test set
    remove = Filter(classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
                    options=["-N", "10", "-F", str(i), "-S", "1"])
    remove.inputformat(data)
    test = remove.filter(data)
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    evl.test_model(cls, test)
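# For comparison (continuing the script above, reusing `data`): Evaluation.crossvalidate_model
# performs a stratified 10-fold cross-validation in a single call; results can differ slightly
# from the manual StratifiedRemoveFolds loop because the randomization differs.
evl2 = Evaluation(data)
evl2.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"), data, 10, Random(1))
print("manual folds: %0.1f%%  crossvalidate_model: %0.1f%%" % (evl.percent_correct, evl2.percent_correct))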
# load glass
fname = data_dir + os.sep + "glass.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate J48
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("All attributes: %0.0f%%" % evl.percent_correct)

# remove attributes (1) and cross-validate J48
atts = "RI|Mg|Type"
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
             options=["-E", "(" + atts + ")", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
print(atts + ": %0.0f%%" % evl.percent_correct)

# remove attributes (2) and cross-validate J48
atts = "RI|Na|Mg|Ca|Ba|Type"
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
             options=["-E", "(" + atts + ")", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
        if (len_email > 0) and (len_content > 0):
            writer.writerow(row)

# close csvfile
csvfile.close()

# start JVM
jvm.start()

# load CSV file
loader = Loader(classname="weka.core.converters.CSVLoader",
                options=["-E", '"', "-F", ","])
data = loader.load_file(csvfilename)

# convert class to nominal
wfilter = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal",
                 options=["-R", "last"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# convert content to string
wfilter = Filter(classname="weka.filters.unsupervised.attribute.NominalToString",
                 options=["-C", "first"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# set class attribute
data.set_class_index(data.num_attributes() - 1)

# generate baseline
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(zeror, data, 10, Random(1))
def createTwoDatasets(self, wholeDataPath, trainingDataPercentage, trainingPath, testingPath, shuffleSeed=43):
    wholeData = self.load_Arff(wholeDataPath)
    randomize = Filter(classname="weka.filters.unsupervised.instance.Randomize",
                       options=["-S", str(shuffleSeed)])
    randomize.set_inputformat(wholeData)
    wholeData = randomize.filter(wholeData)

    removePercentage = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage",
                              options=["-P", str(trainingDataPercentage), "-V"])
    removePercentage.set_inputformat(wholeData)
    trainingData = removePercentage.filter(wholeData)
    print("instances:" + str(trainingData.num_instances()))

    removePercentage = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage",
                              options=["-P", str(trainingDataPercentage)])
    removePercentage.set_inputformat(wholeData)
    testingData = removePercentage.filter(wholeData)
    print("instances:" + str(testingData.num_instances()))

    self.save_Arff(trainingData, trainingPath)
    self.save_Arff(testingData, testingPath)
def remove_attributes(self, *attributes):
    indices = [self.attribute_index(x) for x in attributes]
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", ','.join(str(x + 1) for x in indices)])
    remove.inputformat(self.instances)
    self.instances = remove.filter(self.instances)
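# A standalone sketch of the same idea (hypothetical helper, pww3 API): resolve attribute
# names to the 1-based indices the Remove filter expects, without the wrapper class above.
# attribute_by_name is assumed to be the pww3 Instances lookup returning an Attribute with a
# 0-based index property.
from weka.filters import Filter

def remove_attributes_by_name(instances, *names):
    indices = [instances.attribute_by_name(n).index for n in names]  # 0-based indices
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", ",".join(str(i + 1) for i in indices)])  # Remove is 1-based
    remove.inputformat(instances)
    return remove.filter(instances)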
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the
    # "confidenceFactor" property of the J48 classifier itself. However, being of type
    # float rather than double, we need to convert it to the correct type first using
    # the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({"NB": evaluation, "RF": evaluation2}, title="ROC diabetes", class_index=0, wait=False)
    plot_cls.plot_prcs({"NB": evaluation, "RF": evaluation2}, title="PRC diabetes", class_index=0, wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {"LR": evaluation.predictions, "SMOreg": evaluation2.predictions}, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# plot
pld.scatter_plot(
    data, data.get_attribute_by_name("petalwidth").get_index(),
    data.get_attribute_by_name("petallength").get_index(),
    wait=False)

# add classifier errors to dataset
addcls = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"])
addcls.set_inputformat(data)
filtered = addcls.filter(data)
print(filtered)

# build J48
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
evl = Evaluation(data)
evl.test_model(cls, data)

# plot classifier errors
plc.plot_classifier_errors(evl.predictions(), wait=True)

jvm.stop()
def get_nodes(s):
    """Extracts the reported tree size from J48's string representation."""
    lines = s.split("\n")
    for line in lines:
        if line.find("Size of the tree :") > -1:
            result = line.replace("Size of the tree :", "").strip()
            return result

# load ionosphere
fname = data_dir + os.sep + "ionosphere.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# 1. cheating with default filter: discretize once on the full dataset
fltr = Filter(classname="weka.filters.supervised.attribute.Discretize", options=[])
fltr.inputformat(data)
filtered = fltr.filter(data)
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1))
cls.build_classifier(filtered)
print("cheating (default): accuracy=%0.1f nodes=%s" % (evl.percent_correct, get_nodes(str(cls))))

# 2. using FilteredClassifier with default filter
cls = FilteredClassifier()
cls.classifier = Classifier(classname="weka.classifiers.trees.J48")
cls.filter = Filter(classname="weka.filters.supervised.attribute.Discretize", options=[])
evl = Evaluation(data)
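# Plausible continuation (not in the original snippet), mirroring step 1:
# cross-validate the FilteredClassifier on the raw data, so discretization is
# re-learned inside each training fold, then report accuracy and tree size.
evl.crossvalidate_model(cls, data, 10, Random(1))
cls.build_classifier(data)
print("honest (default): accuracy=%0.1f nodes=%s" % (evl.percent_correct, get_nodes(str(cls))))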
def runner(self, cdat, heap_size=16384, seed=None, verbose=True):
    self.set_status(Pipeline.RUNNING)
    self.logs.append('Initializing Pipeline')
    para = self.config

    self.logs.append('Reading Pipeline Configuration')

    head = ''
    name = get_rand_uuid_str()

    self.logs.append('Reading Input File')
    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.RUNNING
        if stage.code == 'dat.fle':
            head = os.path.abspath(stage.value.path)
            name, _ = os.path.splitext(stage.value.name)

    self.logs.append('Parsing to ARFF')
    path = os.path.join(head, '{name}.arff'.format(name=name))
    # workaround for an unexplained bug: Config.schema is used instead
    # cdat.toARFF(path, express_config=para.Preprocess.schema, verbose=verbose)

    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Saved ARFF at {path}'.format(path=path))
    self.logs.append('Splitting to Training and Testing Sets')

    JVM.start(max_heap_size='{size}m'.format(size=heap_size))

    load = Loader(classname='weka.core.converters.ArffLoader')
    # data = load.load_file(path)
    # save = Saver(classname='weka.core.converters.ArffSaver')
    data = load.load_file(os.path.join(head, 'iris.arff'))  # for debugging purposes only
    data.class_is_last()                                    # for debugging purposes only
    # data.class_index = cdat.iclss

    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.RUNNING

    self.logs.append('Splitting Training Set')
    # TODO - check whether this seed is worth it
    seed = assign_if_none(seed, random.randint(0, 1000))
    opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
    # "-V" inverts the fold selection, yielding the training portion
    wobj = Filter(classname='weka.filters.supervised.instance.StratifiedRemoveFolds', options=opts + ['-V'])
    wobj.inputformat(data)
    tran = wobj.filter(data)

    self.logs.append('Splitting Testing Set')
    wobj.options = opts
    test = wobj.filter(data)

    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Performing Feature Selection')
    feat = []
    for comb in para.FEATURE_SELECTION:
        if comb.USE:
            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.RUNNING

            # note: the original passed "options" as an unused keyword to str.format,
            # silently dropping them; they belong on ASSearch/ASEvaluation instead
            srch = ASSearch(
                classname='weka.attributeSelection.{classname}'.format(classname=comb.Search.NAME),
                options=assign_if_none(comb.Search.OPTIONS, []))
            ewal = ASEvaluation(
                classname='weka.attributeSelection.{classname}'.format(classname=comb.Evaluator.NAME),
                options=assign_if_none(comb.Evaluator.OPTIONS, []))

            attr = AttributeSelection()
            attr.search(srch)
            attr.evaluator(ewal)
            attr.select_attributes(tran)

            meta = addict.Dict()
            meta.search = comb.Search.NAME
            meta.evaluator = comb.Evaluator.NAME
            meta.features = [tran.attribute(index).name for index in attr.selected_attributes]

            feat.append(meta)

            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

    models = []
    for model in para.MODEL:
        if model.USE:
            summary = addict.Dict()

            self.logs.append('Modelling {model}'.format(model=model.LABEL))

            summary.label = model.LABEL
            summary.name = model.NAME
            summary.options = assign_if_none(model.OPTIONS, [])

            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.RUNNING

            for i, instance in enumerate(data):
                iclass = list(range(instance.num_classes))

            options = assign_if_none(model.OPTIONS, [])
            classifier = Classifier(classname='weka.classifiers.{classname}'.format(classname=model.NAME),
                                    options=options)
            classifier.build_classifier(tran)
            serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                name=name,
                classname=model.NAME
            )), classifier)

            self.logs.append('Testing model {model}'.format(model=model.LABEL))

            evaluation = Evaluation(tran)
            evaluation.test_model(classifier, test)

            summary.summary = evaluation.summary()

            frame = pd.DataFrame(data=evaluation.confusion_matrix)
            axes = sns.heatmap(frame, cbar=False, annot=True)
            b64str = get_b64_plot(axes)
            summary.confusion_matrix = addict.Dict({
                'value': evaluation.confusion_matrix.tolist(),
                'plot': b64str
            })

            self.logs.append('Plotting Learning Curve for {model}'.format(model=model.LABEL))
            buffer = io.BytesIO()
            plot_classifier_errors(evaluation.predictions, tran, test, outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.learning_curve = b64str

            buffer = io.BytesIO()
            plot_roc(evaluation, class_index=iclass, outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.roc_curve = b64str

            buffer = io.BytesIO()
            plot_prc(evaluation, class_index=iclass, outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.prc_curve = b64str

            if classifier.graph:
                summary.graph = classifier.graph

            for i, instance in enumerate(test):
                prediction = classifier.classify_instance(instance)

            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.COMPLETE

            models.append(summary)

    self.gist.models = models

    JVM.stop()

    JSON.write(os.path.join(head, '{name}.cgist'.format(name=name)), self.gist)

    self.logs.append('Pipeline Complete')
    self.set_status(Pipeline.COMPLETE)
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris)

    # remove class attribute
    helper.print_info("Removing class attribute")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(data)
    filtered = remove.filter(data)

    # use MultiFilter
    helper.print_info("Use MultiFilter")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
    std = Filter(classname="weka.filters.unsupervised.attribute.Standardize")
    multi = MultiFilter()
    multi.filters = [remove, std]
    multi.inputformat(data)
    filtered_multi = multi.filter(data)

    # output datasets
    helper.print_title("Input")
    print(data)
    helper.print_title("Output")
    print(filtered)
    helper.print_title("Output (MultiFilter)")
    print(filtered_multi)

    # load text dataset
    text = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(text)
    data.class_is_last()

    # apply StringToWordVector
    stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = stemmer
    s2wv.stopwords = stopwords
    s2wv.tokenizer = tokenizer
    s2wv.inputformat(data)
    filtered = s2wv.filter(data)
    helper.print_title("Input (StringToWordVector)")
    print(data)
    helper.print_title("Output (StringToWordVector)")
    print(filtered)

    # partial classname
    helper.print_title("Creating filter from partial classname")
    clsname = ".Standardize"
    f = Filter(classname=clsname)
    print(clsname + " --> " + f.classname)

    # source code
    helper.print_info("Generate source code")
    bolts = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + bolts)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(bolts)
    replace = Filter(classname="weka.filters.unsupervised.attribute.ReplaceMissingValues")
    replace.inputformat(data)
    replace.filter(data)
    print(replace.to_source("MyReplaceMissingValues", data))
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.clusterers import Clusterer, ClusterEvaluation
from weka.filters import Filter
import weka.plot.clusterers as plc

jvm.start()

# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.set_inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
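# Plausible continuation (not part of the original snippet): AddCluster appends a
# "cluster" attribute holding each instance's assignment from the wrapped clusterer.
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
flt.set_inputformat(filtered)
with_cluster = flt.filter(filtered)
print(with_cluster)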
def run_classifier(path, prot, sel, cols, prot_vals, beta):
    DIs = dict()
    jvm.start()

    for i in range(len(cols) - 1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)

        # remove selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", str(sel[2] + 1)])
        remove.inputformat(data)
        data = remove.filter(data)

        # if running for only one attribute, remove all others (except protected)
        if i > 0:
            for j in range(1, prot[2] + 1):
                if i != j:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                                    options=["-R", ("1" if i > j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)

        # set prot attribute as Class attribute
        data.class_is_last()

        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)

        # count the number of each combination; classify_instance returns the
        # predicted class index, so non-zero counts as "predicted"
        pos_and_pred = 0.0
        pos_and_not_pred = 0.0
        neg_and_pred = 0.0
        neg_and_not_pred = 0.0
        for ind, inst in enumerate(data):
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1

        # calculate DI via the balanced error rate (BER)
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) +
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))

        if i == 0:  # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i - 1]] = DI

    jvm.stop()

    return DIs
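# Quick sanity check of the DI formula above with illustrative numbers
# (not part of the original code): BER = 0.3, beta = 0.5.
BER = 0.3
beta = 0.5
DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))
print(DI)  # 1 - 0.4 / 0.9, roughly 0.5556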
)  # ,options=["-method", "2"])

evaluator = ASEvaluation(
    classname='weka.attributeSelection.ClassifierAttributeEval',
    options=['-B', 'weka.classifiers.bayes.NaiveBayes'])
Eval = AttributeSelection(
    classname='weka.attributeSelection.ClassifierAttributeEval',
    options=[
        '-B', 'weka.classifiers.bayes.NaiveBayes', '--',
        "-S 'weka.attributeSelection.RerankingSearch -method 2'"
    ])

from weka.filters import Filter

NominalToBinary = Filter(
    classname="weka.filters.unsupervised.attribute.NominalToBinary",
    options=["-R", "5,7,8"])
NumericToNominal = Filter(
    classname="weka.filters.unsupervised.attribute.NumericToNominal")
ReplaceMV = Filter(
    classname="weka.filters.unsupervised.attribute.ReplaceMissingValues")

ReplaceMV.inputformat(dataTrain)
dataTrain = ReplaceMV.filter(dataTrain)
ReplaceMV.inputformat(dataTest)
dataTest = ReplaceMV.filter(dataTest)

from weka.classifiers import Classifier

# mapper = Classifier(classname="weka.classifiers.misc.InputMappedClassifier", options=["-W", "weka.classifiers.functions.SMO", "--", "-K", "weka.classifiers.functions.supportVector.PolyKernel -E 2.0"])
mapper = Classifier(
    classname="weka.classifiers.misc.InputMappedClassifier",
def getTenScaledResultsRankedByInfo(self, trainingData, indexInTable, csvFilePath, testingData=None):
    dbmgr = permissionMappingManager(databasePath)
    featureNum = trainingData.num_attributes() - 1

    # collect the (regex-escaped) names of all attributes
    attributeList = []
    for item in trainingData.attributes():
        functionName = str(item).split(" ")[1]
        functionName = functionName.split("(")[0] + "\\(\\)"
        functionName = functionName.replace('$', '\\$')
        attributeList.append(functionName)

    outputStr = ""
    outputStr += "InformationGain" + ","
    resultList = []
    bestAccuracy = 0
    bestTrainData = 0
    bestTestData = 0

    csvFile = open(csvFilePath, "a")
    csvFile.write(self.algorithmTable[indexInTable] + ",")

    step = 10
    while step < featureNum:
        # pick top features
        filteredTrainData = self.attributeSelector(trainingData, step)

        # check top feature information
        APIList = []
        for item in filteredTrainData.attributes():
            functionName = str(item).split(" ")[1]
            APIList.append(functionName)

        numberOfInstance = self.getNumOfInstance(trainingData)

        # get those features that were not picked
        filteredList = []
        for item in filteredTrainData.attributes():
            functionName = str(item).split(" ")[1]
            functionName = functionName.split("(")[0] + "\\(\\)"
            functionName = functionName.replace('$', '\\$')
            filteredList.append(functionName)
        items = self.getItemsNotInTheList(attributeList, filteredList)

        # re-process the training data and keep the testing data synchronized
        filteredTrainData = trainingData
        filterTestingData = testingData
        for attribute in items:
            remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName",
                            options=["-E", "^" + attribute + ".*$"])
            remove.set_inputformat(filteredTrainData)
            filteredTrainData = remove.filter(filteredTrainData)
            if filterTestingData:
                remove.set_inputformat(filterTestingData)
                filterTestingData = remove.filter(filterTestingData)

        # build classifier and evaluate it
        classifier = self.algorithmPicker(filteredTrainData, indexInTable)
        evaluation = self.evaluation(classifier, filteredTrainData, filterTestingData)
        resultList.append("{:.2f}".format(evaluation.percent_correct()))

        # save best data and accuracy
        if evaluation.percent_correct() > bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainData = filteredTrainData
            if testingData:
                bestTestData = filterTestingData

        step += 10

    # finally, evaluate with the full feature set
    classifier = self.algorithmPicker(trainingData, indexInTable)
    evaluation = self.evaluation(classifier, trainingData, testingData)
    resultList.append("{:.2f}".format(evaluation.percent_correct()))

    # save best data and accuracy
    if evaluation.percent_correct() > bestAccuracy:
        bestAccuracy = evaluation.percent_correct()
        bestTrainData = filteredTrainData
        if testingData:
            bestTestData = filterTestingData

    for item in resultList:
        outputStr += item + ","
    outputStr = outputStr[0:-1] + "\n"
    self.writeToPath(csvFilePath, outputStr)

    return [bestAccuracy, bestTrainData, bestTestData, resultList]
try:
    from weka.core.converters import Loader
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(path + '/' + str(window) + 'd_' + str(begin) + 'to' + str(ntp - 1) + '.csv')
    data.class_is_last()

    for fold in range(1, 11):
        from weka.filters import Filter
        # fold "fold" becomes the test set ...
        StratifiedCV = Filter(
            classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
            options=['-S', '42', '-N', '10', '-F', str(fold)])
        StratifiedCV.inputformat(data)
        dataTest = StratifiedCV.filter(data)

        # ... and the remaining folds the training set ("-V" inverts the selection)
        StratifiedCV = Filter(
            classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
            options=['-S', '42', '-V', '-N', '10', '-F', str(fold)])
        StratifiedCV.inputformat(data)
        dataTrain = StratifiedCV.filter(data)
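# Minimal sketch (not in the original snippet) of using each fold's split inside
# the loop above; J48 is an assumed stand-in for whatever classifier follows.
from weka.classifiers import Classifier, Evaluation
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(dataTrain)
evl = Evaluation(dataTrain)
evl.test_model(cls, dataTest)
print("fold %d: %.2f%% correct" % (fold, evl.percent_correct))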
def testing():
    logging.disable("weka")
    print("CLASSIFICATION PROCESS\n----------------------")
    jvm.start()

    pruning = 0
    while pruning < 2:
        persen_train = 0
        while persen_train < 4:
            fitur_hapus = 15
            while fitur_hapus >= 0:
                list_akurasi = []
                list_recall = []
                list_presisi = []
                list_fmeasure = []
                list_roc = []
                count = 0

                # build the output file name from the current settings
                nama = "hasilTest/"
                if pruning == 0:
                    nama += "unpruning"
                else:
                    nama += "pruning"
                if persen_train == 0:
                    nama += "40"
                elif persen_train == 1:
                    nama += "50"
                elif persen_train == 2:
                    nama += "60"
                else:
                    nama += "70"
                if fitur_hapus > 0:
                    nama += "removeF" + str(fitur_hapus) + ".txt"
                else:
                    nama += "normal.txt"
                f = open(nama, "w")

                # write the report header
                if pruning == 0:
                    nama = "unpruning"
                    print("Without pruning")
                    f.write("C4.5 decision tree results without pruning (unpruned)\n")
                else:
                    nama = "pruning"
                    print("With pruning")
                    f.write("C4.5 decision tree results with pruning\n")
                if persen_train == 0:
                    nama += "40"
                    f.write("With a 40% training set\n")
                elif persen_train == 1:
                    nama += "50"
                    f.write("With a 50% training set\n")
                elif persen_train == 2:
                    nama += "60"
                    f.write("With a 60% training set\n")
                else:
                    nama += "70"
                    f.write("With a 70% training set\n")
                if fitur_hapus > 0:
                    f.write("Using remove on feature " + str(fitur_hapus) + "\n\n")
                else:
                    f.write("\n")
                f.write("No.   Accuracy   Recall   Precision   F-Measure   ROC\n")

                if persen_train == 0:
                    print("40% training data")
                elif persen_train == 1:
                    print("50% training data")
                elif persen_train == 2:
                    print("60% training data")
                else:
                    print("70% training data")
                print("Removed feature:", fitur_hapus)
                print("\nNo.\tAccuracy\tRecall\tPrecision\tF-Measure\tROC")

                while count < 100:
                    loader = Loader(classname="weka.core.converters.ArffLoader")
                    data = loader.load_file("hasil.arff")
                    data.class_is_last()
                    if fitur_hapus > 0:
                        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                                        options=["-R", str(fitur_hapus)])
                        remove.inputformat(data)
                        data_baru = remove.filter(data)
                        data_baru.class_is_last()
                    else:
                        data_baru = loader.load_file("hasil.arff")
                        data_baru.class_is_last()

                    # shuffle with a time-based seed, so every run differs
                    filter = Filter(classname="weka.filters.unsupervised.instance.Randomize",
                                    options=["-S", str(int(time.time()))])
                    filter.inputformat(data_baru)
                    data_random = filter.filter(data_baru)
                    data_random.class_is_last()

                    if pruning == 0:
                        classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-U"])
                    else:
                        classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25"])

                    evaluation = Evaluation(data_random)
                    if persen_train == 0:
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage=40)
                    elif persen_train == 1:
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage=50)
                    elif persen_train == 2:
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage=60)
                    else:
                        evaluation.evaluate_train_test_split(classifier, data_random, percentage=70)

                    f.write(str(count + 1) + ". " +
                            str(evaluation.weighted_true_positive_rate) + "   " +
                            str(evaluation.weighted_recall) + "   " +
                            str(evaluation.weighted_precision) + "   " +
                            str(evaluation.weighted_f_measure) + "   " +
                            str(evaluation.weighted_area_under_roc) + "\n")
                    print(count + 1, evaluation.weighted_true_positive_rate, evaluation.weighted_recall,
                          evaluation.weighted_precision, evaluation.weighted_f_measure,
                          evaluation.weighted_area_under_roc)

                    list_akurasi.append(evaluation.weighted_true_positive_rate)
                    list_recall.append(evaluation.weighted_recall)
                    list_presisi.append(evaluation.weighted_precision)
                    list_fmeasure.append(evaluation.weighted_f_measure)
                    list_roc.append(evaluation.weighted_area_under_roc)
                    count += 1
                    time.sleep(1)

                list_akurasi.sort()
                list_recall.sort()
                list_presisi.sort()
                list_fmeasure.sort()
                list_roc.sort()

                f.write("\n")
                f.write("Average\n")
                f.write("Accuracy:" + str(sum(list_akurasi) / 100.0) + "\n")
                f.write("Recall:" + str(sum(list_recall) / 100.0) + "\n")
                f.write("Precision:" + str(sum(list_presisi) / 100.0) + "\n")
                f.write("F-Measure:" + str(sum(list_fmeasure) / 100.0) + "\n")
                f.write("ROC:" + str(sum(list_roc) / 100.0) + "\n")
                f.write("\n")
                f.write("Max\n")
                f.write("Accuracy:" + str(list_akurasi[-1]) + "\n")
                f.write("Recall:" + str(list_recall[-1]) + "\n")
                f.write("Precision:" + str(list_presisi[-1]) + "\n")
                f.write("F-Measure:" + str(list_fmeasure[-1]) + "\n")
                f.write("ROC:" + str(list_roc[-1]) + "\n")
                f.write("\n")
                f.write("Min\n")
                f.write("Accuracy:" + str(list_akurasi[0]) + "\n")
                f.write("Recall:" + str(list_recall[0]) + "\n")
                f.write("Precision:" + str(list_presisi[0]) + "\n")
                f.write("F-Measure:" + str(list_fmeasure[0]) + "\n")
                f.write("ROC:" + str(list_roc[0]) + "\n")
                f.write("\n")

                print("")
                print("Average")
                print("Accuracy:", sum(list_akurasi) / 100.0)
                print("Recall:", sum(list_recall) / 100.0)
                print("Precision:", sum(list_presisi) / 100.0)
                print("F-Measure:", sum(list_fmeasure) / 100.0)
                print("ROC:", sum(list_roc) / 100.0)
                print("")
                print("Max")
                print("Accuracy:", list_akurasi[-1])
                print("Recall:", list_recall[-1])
                print("Precision:", list_presisi[-1])
                print("F-Measure:", list_fmeasure[-1])
                print("ROC:", list_roc[-1])
                print("")
                print("Min")
                print("Accuracy:", list_akurasi[0])
                print("Recall:", list_recall[0])
                print("Precision:", list_presisi[0])
                print("F-Measure:", list_fmeasure[0])
                print("ROC:", list_roc[0])
                print("")

                f.close()
                fitur_hapus -= 1
            persen_train += 1
        pruning += 1
    jvm.stop()
print(group)
train = data_dir + os.sep + group + "_Cal.arff"
test = data_dir + os.sep + group + "_Test.arff"
pred = data_dir + os.sep + group + "_Val.arff"
loader = Loader(classname="weka.core.converters.ArffLoader")

print(train)
train_data = loader.load_file(train)
train_data.class_index = train_data.attribute_by_name("reference value").index

print(test)
test_data = loader.load_file(test)
test_data.class_index = test_data.attribute_by_name("reference value").index

print(pred)
pred_data = loader.load_file(pred)
pred_data.class_index = pred_data.attribute_by_name("reference value").index

cls = FilteredClassifier()
cls.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
cls.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "first"])
cls.build_classifier(train_data)

evl = Evaluation(train_data)
evl.test_model(cls, test_data)
print(evl.summary())

jvm.stop()
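# Sketch (not in the original snippet): pred_data is loaded above but never used;
# before the jvm.stop() call, the validation set could be scored like this.
for index, inst in enumerate(pred_data):
    print("%d: predicted=%.3f" % (index + 1, cls.classify_instance(inst)))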
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter:
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directly avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier).jobject)
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
    print("")
    print(predicted_data)