def main():
    """
    Just runs some example code.
    """
    # --- numeric dataset -------------------------------------------------
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)

    # strip the class attribute with a single Remove filter
    helper.print_info("Removing class attribute")
    rem = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "last"])
    rem.inputformat(dataset)
    no_class = rem.filter(dataset)

    # chain Remove + Standardize through a MultiFilter
    helper.print_info("Use MultiFilter")
    rem = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    standardize = Filter(
        classname="weka.filters.unsupervised.attribute.Standardize")
    chain = MultiFilter()
    chain.filters = [rem, standardize]
    chain.inputformat(dataset)
    chained = chain.filter(dataset)

    # show the raw and both filtered datasets
    helper.print_title("Input")
    print(dataset)
    helper.print_title("Output")
    print(no_class)
    helper.print_title("Output (MultiFilter)")
    print(chained)

    # --- text dataset ----------------------------------------------------
    text_file = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(text_file)
    dataset.class_is_last()

    # StringToWordVector configured with explicit stemmer/stopwords/tokenizer
    lovins = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    rainbow = Stopwords(classname="weka.core.stopwords.Rainbow")
    word_tok = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = lovins
    s2wv.stopwords = rainbow
    s2wv.tokenizer = word_tok
    s2wv.inputformat(dataset)
    vectorized = s2wv.filter(dataset)

    helper.print_title("Input (StringToWordVector)")
    print(dataset)
    helper.print_title("Output (StringToWordVector)")
    print(vectorized)
def predictWithWeka(csvFilenameWithInputToPredict, modelFilename):
    """
    Classify every instance of a CSV file with a previously trained Weka model.

    NOTE: to predict data whose class is unknown, a dummy class column can be
    used; simply ignore the 'actual' and 'error' entries of the returned
    results.

    NOTE: the file named by csvFilenameWithInputToPredict must contain
    instances of both classes (spam and healthy).

    :param csvFilenameWithInputToPredict: name of the CSV file with the
        instances to predict
    :param modelFilename: name of the model file generated by weka,
        compatible with the input CSV file
    :return: list of dicts with the keys
        index, actual, predicted, error and distribution
    """
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    # deserialize the trained classifier from disk
    classifier = Classifier(jobject=serialization.read(modelFilename))

    dataset = csv_loader.load_file(csvFilenameWithInputToPredict)
    dataset.class_is_last()

    # preprocessing pipeline: drop the first attribute, turn attributes
    # 8 and 11 into nominals, then Normalize (scale 1.0, translation 0.0)
    drop_first = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    to_nominal = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "8,11"])
    scale = Filter(
        classname="weka.filters.unsupervised.attribute.Normalize",
        options=["-S", "1.0", "-T", "0.0"])
    pipeline = MultiFilter()
    pipeline.filters = [drop_first, to_nominal, scale]
    pipeline.inputformat(dataset)
    test = pipeline.filter(dataset)

    results = []
    for index, inst in enumerate(test):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        # a prediction is an error when it disagrees with the stored class
        mismatch = pred != inst.get_value(inst.class_index)
        results.append({
            "index": index + 1,
            "actual": inst.get_string_value(inst.class_index),
            "predicted": inst.class_attribute.value(int(pred)),
            "error": "yes" if mismatch else "no",
            "distribution": str(dist.tolist()),
        })
    return results
def main():
    """
    Just runs some example code.
    """
    # --- numeric dataset -------------------------------------------------
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)

    # strip the class attribute with a single Remove filter
    helper.print_info("Removing class attribute")
    rem = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "last"])
    rem.inputformat(dataset)
    no_class = rem.filter(dataset)

    # chain Remove + Standardize through a MultiFilter
    helper.print_info("Use MultiFilter")
    rem = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    standardize = Filter(
        classname="weka.filters.unsupervised.attribute.Standardize")
    chain = MultiFilter()
    chain.filters = [rem, standardize]
    chain.inputformat(dataset)
    chained = chain.filter(dataset)

    # show the raw and both filtered datasets
    helper.print_title("Input")
    print(dataset)
    helper.print_title("Output")
    print(no_class)
    helper.print_title("Output (MultiFilter)")
    print(chained)

    # --- text dataset ----------------------------------------------------
    text_file = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(text_file)
    dataset.class_is_last()

    # StringToWordVector configured with explicit stemmer/stopwords/tokenizer
    lovins = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    rainbow = Stopwords(classname="weka.core.stopwords.Rainbow")
    word_tok = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = lovins
    s2wv.stopwords = rainbow
    s2wv.tokenizer = word_tok
    s2wv.inputformat(dataset)
    vectorized = s2wv.filter(dataset)

    helper.print_title("Input (StringToWordVector)")
    print(dataset)
    helper.print_title("Output (StringToWordVector)")
    print(vectorized)

    # --- partial classname -----------------------------------------------
    # instantiating with a partial classname resolves to the full one
    helper.print_title("Creating filter from partial classname")
    clsname = ".Standardize"
    flt = Filter(classname=clsname)
    print(clsname + " --> " + flt.classname)

    # --- source code generation ------------------------------------------
    helper.print_info("Generate source code")
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(labor_file)
    replace = Filter(
        classname="weka.filters.unsupervised.attribute.ReplaceMissingValues")
    replace.inputformat(dataset)
    replace.filter(dataset)
    # emit the trained filter as source code under the given class name
    print(replace.to_source("MyReplaceMissingValues", dataset))