def perform_KMeans(data, classes, k):
    """
    Cluster ``data`` with Weka's SimpleKMeans and score the result.

    :param data: dataset handed to ``build_clusterer``
    :param classes: reference labels forwarded to ``cluster_purity``
    :param k: number of clusters, passed to Weka as the ``-N`` option
    :return: the value computed by ``cluster_purity`` for the built model
    """
    kmeans_options = ["-N", str(k)]
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=kmeans_options)
    kmeans.build_clusterer(data)
    return cluster_purity(kmeans, data, classes)
def perform_HC(data, classes, k, link):
    """
    Cluster ``data`` with Weka's HierarchicalClusterer and score the result.

    :param data: dataset handed to ``build_clusterer``
    :param classes: reference labels forwarded to ``cluster_purity``
    :param k: number of clusters, passed to Weka as the ``-N`` option
    :param link: link type, passed unchanged to Weka as the ``-L`` option
    :return: the value computed by ``cluster_purity`` for the built model
    """
    hc_options = ["-N", str(k), "-L", link]
    hierarchical = Clusterer(classname="weka.clusterers.HierarchicalClusterer", options=hc_options)
    hierarchical.build_clusterer(data)
    return cluster_purity(hierarchical, data, classes)
def main():
    """
    Run a classes-to-clusters evaluation of SimpleKMeans on the iris dataset.
    """
    # locate and load the dataset; the last attribute is flagged as the class
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    full = arff_loader.load_file(iris_file)
    full.class_is_last()

    # train on a copy that no longer carries the class attribute
    unlabeled = Instances.copy_instances(full)
    unlabeled.no_class()
    unlabeled.delete_last_attribute()

    # build the clusterer
    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    kmeans.build_clusterer(unlabeled)
    print("done")

    # evaluate against the dataset that still has its class attribute
    evl = ClusterEvaluation()
    evl.set_model(kmeans)
    evl.test_model(full)
    helper.print_title("Cluster results")
    print(evl.cluster_results)
    helper.print_title("Classes to clusters")
    print(evl.classes_to_clusters)
def simpleKMeansTrain(self, dataf, options, mname, temp=True):
    '''
    Train a Weka SimpleKMeans clusterer, print every instance's cluster
    assignment and save the model.

    Any exception raised along the way is caught and its traceback printed;
    nothing is re-raised.

    :param dataf: -> data to be clustered (forwarded to self.loadData)
    :param options: -> SimpleKMeans options
        N -> number of clusters
        A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
        l -> maximum number of iterations default 500
        num-slots -> number of execution slots, 1 means no parallelism
        S -> Random number seed (default 10)
        example => ["-N", "10", "-S", "10"]
    :param mname: -> model name forwarded to self.saveModel
    :param temp: -> forwarded to self.loadData
    :return: None
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        # Honour the caller's ``temp`` flag instead of always forcing True.
        data = self.loadData(dataf, temp)
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
        clusterer.build_clusterer(data)
        print(clusterer)
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
        self.saveModel(clusterer, 'skm', mname)
    except Exception:
        # Best effort: report the failure and carry on.
        print(traceback.format_exc())
def emTrain(self, dataf, options, mname, temp=True):
    '''
    Train a Weka EM clusterer on the given data, print it and save the model.

    Exceptions are caught and their traceback printed; the JVM is stopped
    in every case.

    :param dataf: -> data to be clustered (forwarded to self.loadData)
    :param options: -> EM options
        I -> number of iterations
        N -> number of clusters
        M -> Minimum standard deviation for normal density (default=1.0E-6)
        num-slots -> number of execution slots, 1 means no parallelism
        S -> random seed (default=100)
        example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                    "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
    :param mname: -> model name forwarded to self.saveModel
    :param temp: -> forwarded to self.loadData
    :return: None
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        em_model = Clusterer(classname="weka.clusterers.EM", options=options)
        # load first, then train, exactly in this order
        training_set = self.loadData(dataf, temp)
        em_model.build_clusterer(training_set)
        print(em_model)
        self.saveModel(em_model, 'em', mname)
    except Exception:
        failure_report = traceback.format_exc()
        print(failure_report)
    finally:
        jvm.stop()
def simpleKMeansTrain(self, dataf, options, mname, temp=True):
    '''
    Train a Weka SimpleKMeans clusterer, print every instance's cluster
    assignment and save the model.

    Exceptions are caught and their traceback printed; the JVM is stopped
    in every case.

    :param dataf: -> data to be clustered (forwarded to self.loadData)
    :param options: -> SimpleKMeans options
        N -> number of clusters
        A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
        l -> maximum number of iterations default 500
        num-slots -> number of execution slots, 1 means no parallelism
        S -> Random number seed (default 10)
        example => ["-N", "10", "-S", "10"]
    :param mname: -> model name forwarded to self.saveModel
    :param temp: -> forwarded to self.loadData
    :return: None
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        # Honour the caller's ``temp`` flag (as emTrain/dbscanTrain do)
        # instead of always forcing True.
        data = self.loadData(dataf, temp)
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
        clusterer.build_clusterer(data)
        print(clusterer)
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
        self.saveModel(clusterer, 'skm', mname)
    except Exception:
        # Best effort: report the failure and carry on.
        print(traceback.format_exc())
    finally:
        jvm.stop()
def main():
    """
    Train SimpleKMeans on iris and print the cluster of every instance.
    """
    # locate and load the dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)

    # drop the last attribute (the class) before clustering
    dataset.delete_last_attribute()

    # build the clusterer and show the model
    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    kmeans.build_clusterer(dataset)
    print(kmeans)

    # report cluster index and membership distribution, numbering rows from 1
    helper.print_info("Clustering data")
    for row_number, inst in enumerate(dataset, 1):
        assigned = kmeans.cluster_instance(inst)
        membership = kmeans.distribution_for_instance(inst)
        print(str(row_number) + ": cluster=" + str(assigned) + ", distribution=" + str(membership))
def main():
    """
    Run a classes-to-clusters evaluation of SimpleKMeans on the iris dataset.
    """
    # locate and load the dataset; the last attribute is flagged as the class
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    full = arff_loader.load_file(iris_file)
    full.class_is_last()

    # train on a copy that no longer carries the class attribute
    unlabeled = Instances.copy_instances(full)
    unlabeled.no_class()
    unlabeled.delete_last_attribute()

    # build the clusterer
    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    kmeans.build_clusterer(unlabeled)
    print("done")

    # evaluate against the dataset that still has its class attribute
    evl = ClusterEvaluation()
    evl.set_model(kmeans)
    evl.test_model(full)
    helper.print_title("Cluster results")
    print(evl.cluster_results)
    helper.print_title("Classes to clusters")
    print(evl.classes_to_clusters)
def perform_DBScan(data, classes, e, min_points):
    """
    Cluster ``data`` with Weka's DBSCAN and score the result.

    :param data: dataset handed to ``build_clusterer``
    :param classes: reference labels forwarded to ``cluster_purity``
    :param e: value passed to Weka as the ``-E`` option
    :param min_points: value passed to Weka as the ``-M`` option
    :return: the value computed by ``cluster_purity`` for the built model
    """
    dbscan_options = ["-E", str(e), "-M", str(min_points)]
    dbscan = Clusterer(classname="weka.clusterers.DBSCAN", options=dbscan_options)
    dbscan.build_clusterer(data)
    return cluster_purity(dbscan, data, classes)
def main():
    """
    Demonstrate batch, filtered and incremental clustering on the iris dataset.
    """
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"

    # --- plain SimpleKMeans on data without the last (class) attribute ---
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)
    dataset.delete_last_attribute()

    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    kmeans.build_clusterer(dataset)
    print(kmeans)

    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(kmeans)
    evaluation.test_model(dataset)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, dataset, inst_no=True)

    # --- FilteredClusterer: the Remove filter drops the last attribute ---
    helper.print_title("Filtered clusterer")
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)
    base_clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    drop_last = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    filtered_clusterer = FilteredClusterer()
    filtered_clusterer.clusterer = base_clusterer
    filtered_clusterer.filter = drop_last
    filtered_clusterer.build_clusterer(dataset)
    print(filtered_clusterer)

    # --- Cobweb, fed one filtered instance at a time ---
    helper.print_title("Incremental clusterer")
    stream_loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = stream_loader.load_file(iris_file, incremental=True)
    cobweb = Clusterer("weka.clusterers.Cobweb")
    drop_last = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    drop_last.inputformat(iris_inc)
    cobweb.build_clusterer(drop_last.outputformat())
    for inst in stream_loader:
        drop_last.input(inst)
        cobweb.update_clusterer(drop_last.output())
    cobweb.update_finished()

    print(cobweb.to_commandline())
    print(cobweb)
    print(cobweb.graph)
    plg.plot_dot_graph(cobweb.graph)
def main():
    """
    Demonstrate batch, filtered and incremental clustering on the iris dataset.
    """
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"

    # --- plain SimpleKMeans on data without the last (class) attribute ---
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)
    dataset.delete_last_attribute()

    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    kmeans.build_clusterer(dataset)
    print(kmeans)

    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(kmeans)
    evaluation.test_model(dataset)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, dataset, inst_no=True)

    # --- FilteredClusterer: the Remove filter drops the last attribute ---
    helper.print_title("Filtered clusterer")
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)
    base_clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    drop_last = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    filtered_clusterer = FilteredClusterer()
    filtered_clusterer.clusterer = base_clusterer
    filtered_clusterer.filter = drop_last
    filtered_clusterer.build_clusterer(dataset)
    print(filtered_clusterer)

    # --- Cobweb, fed one filtered instance at a time ---
    helper.print_title("Incremental clusterer")
    stream_loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = stream_loader.load_file(iris_file, incremental=True)
    cobweb = Clusterer("weka.clusterers.Cobweb")
    drop_last = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    drop_last.inputformat(iris_inc)
    cobweb.build_clusterer(drop_last.outputformat())
    for inst in stream_loader:
        drop_last.input(inst)
        cobweb.update_clusterer(drop_last.output())
    cobweb.update_finished()

    print(cobweb.to_commandline())
    print(cobweb)
    print(cobweb.graph)
    plg.plot_dot_graph(cobweb.graph)
def run_cluster_simplek(self, output_directory, exc_class=False, num_clusters=7):
    """
    Build a SimpleKMeans clusterer on the training data, evaluate it and
    save the collected report.

    :param output_directory: forwarded to ``self.save_results``
    :param exc_class: when true, evaluate on the class-less copy and tag the
        result name with ``_NO_Class``; otherwise evaluate on
        ``self.training_data``
    :param num_clusters: passed to Weka as the ``-N`` option
    """
    # class-less copy of the training data, first attribute removed
    unlabeled = Instances.copy_instances(self.training_data)
    unlabeled.no_class()
    unlabeled.delete_first_attribute()

    # build the clusterer and record the model plus build time
    print("\nBuilding Clusterer on training data.")
    started = time.time()
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(num_clusters)])
    kmeans.build_clusterer(unlabeled)

    # each print_both call receives the text and the report so far; its
    # return value becomes the new report
    report = self.print_both(str(kmeans), "")
    report = self.print_both("Clusterer Built in " + str(time.time() - started) + " secs.\n", report)

    # evaluate the clusterer
    report = self.print_both("\nClustering data.", report)
    started = time.time()
    evl = ClusterEvaluation()
    evl.set_model(kmeans)
    if exc_class:
        # no class attribute
        name_tag = "_NO_Class"
        evl.test_model(unlabeled)
    else:
        # classes to clusters
        name_tag = ""
        evl.test_model(self.training_data)

    # NOTE(review): reporting is assumed to run for both branches above
    report = self.print_both("\nCluster results\n", report)
    report = self.print_both(str(evl.cluster_results), report)
    report = self.print_both("\nClasses to clusters\n", report)
    report = self.print_both(str(evl.classes_to_clusters), report)
    report = self.print_both("\nClustered data in " + str(time.time() - started) + " secs.\n", report)

    # save results
    self.save_results("SimpleKM" + name_tag + "_", report, output_directory)
def run_clustering_task7_manual(self, output_directory, clusterer_name, num_clusters, seed=10):
    """
    Build the named Weka clusterer on the training data, run a
    classes-to-clusters evaluation and save the collected report.

    :param output_directory: forwarded to ``self.save_results``
    :param clusterer_name: Weka class name of the clusterer to build
    :param num_clusters: passed to Weka as the ``-N`` option
    :param seed: passed to Weka as the ``-S`` option
    """
    # class-less copy of the training data, first attribute removed
    unlabeled = Instances.copy_instances(self.training_data)
    unlabeled.no_class()
    unlabeled.delete_first_attribute()
    short_name = clusterer_name.replace("weka.clusterers.", "")

    # build the clusterer and record the model plus build time
    print("\nBuilding " + short_name + " Clusterer on training data.")
    started = time.time()
    model = Clusterer(
        classname=clusterer_name,
        options=["-N", str(num_clusters), "-S", str(seed)])
    model.build_clusterer(unlabeled)

    # each print_both call receives the text and the report so far; its
    # return value becomes the new report
    report = self.print_both(str(model), "")
    report = self.print_both("Clusterer Built in " + str(time.time() - started) + " secs.\n", report)

    # evaluate against the training data that still has its class
    report = self.print_both("\nClustering data.", report)
    started = time.time()
    evl = ClusterEvaluation()
    evl.set_model(model)
    evl.test_model(self.training_data)
    report = self.print_both("\nCluster results\n", report)
    report = self.print_both(str(evl.cluster_results), report)
    report = self.print_both("\nClasses to clusters\n", report)
    report = self.print_both(str(evl.classes_to_clusters), report)
    report = self.print_both("\nClustered data in " + str(time.time() - started) + " secs.\n", report)

    # save results
    result_name = short_name + "_" + "N" + str(num_clusters) + "_S" + str(seed)
    self.save_results(result_name, report, output_directory)
def run_SKMeans_137(self):
    """
    Cluster ``self.data_loaded`` with SimpleKMeans (137 clusters) and write
    three report files into ``<self.output_dir>/SKMeans``.

    The files are named ``<input stem>.<today's date>`` followed by
    ``.cl_eval.txt`` (evaluation summary), ``.cl_descr.txt`` (cluster index
    and membership distribution per instance) and ``.cl_assign.txt``
    (cluster assignment per row).

    :return: an empty tuple
    """
    # construct output paths
    output_prefix = os.path.split(self.input_path)[-1].split(".")[0]
    print(output_prefix)
    write_date = output_prefix + "." + str(datetime.now().date())
    SKMeans_dir = os.path.join(self.output_dir, "SKMeans")
    eval_path = os.path.join(SKMeans_dir, write_date + ".cl_eval.txt")
    clust_desc_path = os.path.join(SKMeans_dir, write_date + ".cl_descr.txt")
    clust_assign_path = os.path.join(SKMeans_dir, write_date + ".cl_assign.txt")

    # create output dir if it doesn't already exist
    if not os.path.exists(SKMeans_dir):
        os.makedirs(SKMeans_dir)

    # build clusters; note this is the loaded dataset itself, not a copy
    data_clone = self.data_loaded
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "137"])
    clusterer.build_clusterer(data_clone)

    # cluster evaluation
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data_clone)

    with open(eval_path, 'w') as outfile:
        outfile.write("number of clusters: \t" + str(evaluation.num_clusters) + "\n")
        # Report the actual log likelihood; this line previously repeated
        # the number of clusters.
        outfile.write("log likelihood: \t" + str(evaluation.log_likelihood) + "\n")
        outfile.write("cluster assignments: \t" + str(evaluation.cluster_assignments) + "\n")
        outfile.write("***********************\n")
        outfile.write("SKmeans Cluster Evaluation Results\n")  # header
        outfile.write(str(evaluation.cluster_results) + "\n")

    # description of clusters, one line per instance
    with open(clust_desc_path, 'w') as outfile:
        outfile.write("cluster_num,distribution\n")  # header
        for inst in data_clone:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            outfile.write(str(cl) + "," + str(dist) + "\n")

    # cluster assignment by row
    with open(clust_assign_path, 'w') as outfile:
        outfile.write("row_num,SKMeans\n")  # header
        for i, assignment in enumerate(evaluation.cluster_assignments):
            outfile.write(str(i) + "," + str(assignment) + "\n")

    # Kept for backward compatibility: the original returned an empty tuple.
    return ()
def create_cluster_model(arff_file, n=10, loader_type="csv", model="kmeans.model"):
    """
    Train a SimpleKMeans model on a data file and serialize it to disk.

    :param arff_file: path of the data file to load
    :param n: number of clusters, passed to Weka as the ``-N`` option
    :param loader_type: ``"csv"`` selects the CSVLoader; any other value
        selects the ArffLoader
    :param model: path the trained clusterer is written to
    """
    check_jvm()
    if loader_type == "csv":
        loader = converters.Loader(classname="weka.core.converters.CSVLoader")
    else:
        # Was misspelled ``conventers``, which raised NameError on this branch.
        loader = converters.Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arff_file)
    clusterer = Clusterer(
        classname="weka.clusterers.SimpleKMeans", options=["-N", str(n)])
    clusterer.build_clusterer(data)
    serialization.write(model, clusterer)
def train_data(self):
    """
    Load ``self.datasetName`` as ARFF, drop its last attribute and train a
    two-cluster SimpleKMeans model on it.

    :return: the trained clusterer
    :raises Exception: any error from loading or training is re-raised
        after its traceback has been printed
    """
    try:
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data_train = loader.load_file(self.datasetName)
        data_train.delete_last_attribute()
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "2"])
        clusterer.build_clusterer(data_train)
        return clusterer
    except Exception:
        # Print before re-raising: the print used to sit after the raise and
        # could never run. A bare ``raise`` keeps the original traceback.
        print(traceback.format_exc())
        raise
def command():
    """
    Handle the clustering form: run SimpleKMeans on the CSV that matches the
    selected column pair, write the results to ``filename.txt`` and render
    ``output.html``.

    Reads ``clusternum``, ``firstcol`` and ``secondcol`` from
    ``request.form``.

    :return: the rendered ``output.html`` template
    :raises ValueError: if the column pair has no associated CSV file
    """
    jvm.start()
    import weka.core.converters as converters
    from weka.clusterers import Clusterer

    clusters = request.form['clusternum']
    a1 = request.form['firstcol']
    a2 = request.form['secondcol']

    # Each supported column pair maps to a pre-built CSV file.
    csv_by_columns = {
        ('B', 'C'): "Data.csv",
        ('B', 'D'): "Data1.csv",
        ('C', 'D'): "Data2.csv",
        ('C', 'E'): "Data3.csv",
        ('D', 'E'): "Data4.csv",
    }
    csv_name = csv_by_columns.get((a1, a2))
    if csv_name is None:
        # Previously this fell through and failed later with an unbound
        # ``data`` variable; fail with an explicit message instead.
        raise ValueError("unsupported column combination: %r, %r" % (a1, a2))
    data = converters.load_any_file(csv_name)
    print(data)

    # The context manager guarantees the file is closed; the old
    # ``f.close()`` sat after the ``return`` and never ran.
    with open("filename.txt", "w") as f:
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                              options=["-N", "{}".format(clusters)])
        clusterer.build_clusterer(data)
        print(clusterer)
        f.write(str(clusterer))
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            line = "cluster=" + str(cl) + ", distribution=" + str(dist)
            print(line)
            # NOTE(review): no newline is written between entries, as in
            # the original output format.
            f.write(line)
    return render_template("output.html")
def run_clusterer(file):
    """
    Cluster one ARFF file with SimpleKMeans and write the evaluation into a
    ``cluster_results_optimum`` directory next to it.

    Files whose name does not end in ``.arff`` are reported and skipped.
    Two clusters are used, or seven when the file name starts with
    ``fer2018_``.

    :param file: path object of the input file (``parts`` and ``parents``
        are read from it)
    """
    filename = file.parts[-1]
    parent_dir = file.parents[0]
    print("Running Clusterer on %s" % filename)
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # strip the '.arff' suffix
    filename_base = filename[:-5]

    # full dataset, class is the first attribute
    full = load_Arff_file(file)
    full.class_is_first()
    # second load whose result is not used afterwards; kept as-is
    load_Arff_file(file)

    # class-less copy used for training
    unlabeled = Instances.copy_instances(full)
    unlabeled.no_class()
    unlabeled.delete_first_attribute()

    results_dir = parent_dir / "cluster_results_optimum"
    results_dir.mkdir(parents=True, exist_ok=True)

    # choose the number of clusters
    if filename_base.startswith("fer2018_"):
        print("Changing number of clusters to 7")
        n = "7"
    else:
        n = "2"

    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-S", "10", "-N", n])
    kmeans.build_clusterer(unlabeled)

    # evaluate against the dataset that still has its class attribute
    evaluation = ClusterEvaluation()
    evaluation.set_model(kmeans)
    evaluation.test_model(full)

    output_results = results_dir / (str(filename_base) + "_cl_res.txt")
    output_cluster(evaluation, output_results)
def dbscanTrain(self, dataf, options, mname, temp=True):
    '''
    Train a Weka DBSCAN clusterer on the given data, print it and save the
    model.

    Any exception raised along the way is caught and its traceback printed;
    nothing is re-raised.

    :param dataf: -> data to be clustered (forwarded to self.loadData)
    :param options: -> dbscan options
        E -> epsilon (default = 0.9)
        M -> minPoints (default = 6)
        D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
        I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
        example => ["-E", "0.9", "-M", "6", "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
                    "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
    :param mname: -> model name forwarded to self.saveModel
    :param temp: -> forwarded to self.loadData
    :return: None
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
        clusterDBSCAN.build_clusterer(data)
        # print() call and "except ... :" form work on Python 2 and 3 alike
        print(clusterDBSCAN)
        self.saveModel(clusterDBSCAN, 'dbscan', mname)
    except Exception:
        # Best effort: report the failure and carry on.
        print(traceback.format_exc())
def emTrain(self, dataf, options, mname, temp=True):
    '''
    Train a Weka EM clusterer on the given data, print it and save the model.

    Any exception raised along the way is caught and its traceback printed;
    nothing is re-raised.

    :param dataf: -> data to be clustered (forwarded to self.loadData)
    :param options: -> EM options
        I -> number of iterations
        N -> number of clusters
        M -> Minimum standard deviation for normal density (default=1.0E-6)
        num-slots -> number of execution slots, 1 means no parallelism
        S -> random seed (default=100)
        example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                    "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
    :param mname: -> model name forwarded to self.saveModel
    :param temp: -> forwarded to self.loadData
    :return: None
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        clusterEM = Clusterer(classname="weka.clusterers.EM", options=options)
        clusterEM.build_clusterer(data)
        # print() call and "except ... :" form work on Python 2 and 3 alike
        print(clusterEM)
        self.saveModel(clusterEM, 'em', mname)
    except Exception:
        # Best effort: report the failure and carry on.
        print(traceback.format_exc())
def dbscanTrain(self, dataf, options, mname, temp=True):
    '''
    Train a Weka DBSCAN clusterer on the given data, print it and save the
    model.

    Exceptions are caught and their traceback printed; the JVM is stopped
    in every case.

    :param dataf: -> data to be clustered (forwarded to self.loadData)
    :param options: -> dbscan options
        E -> epsilon (default = 0.9)
        M -> minPoints (default = 6)
        D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
        I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
        example => ["-E", "0.9", "-M", "6", "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
                    "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
    :param mname: -> model name forwarded to self.saveModel
    :param temp: -> forwarded to self.loadData
    :return: None
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        training_set = self.loadData(dataf, temp)
        dbscan_model = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
        dbscan_model.build_clusterer(training_set)
        print(dbscan_model)
        self.saveModel(dbscan_model, 'dbscan', mname)
    except Exception:
        failure_report = traceback.format_exc()
        print(failure_report)
    finally:
        jvm.stop()
# load iris fname = data_dir + os.sep + "iris.arff" print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) # remove class attribute flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"]) flt.inputformat(data) filtered = flt.filter(data) # build KMeans print("\n--> SimpleKMeans\n") cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"]) cl.build_clusterer(filtered) evl = ClusterEvaluation() evl.set_model(cl) evl.test_model(filtered) print(evl.cluster_results) plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True) # use AddCluster filter print("\n--> AddCluster filter\n") flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 3"]) flt.inputformat(filtered) addcl = flt.filter(filtered) print(addcl) # classes-to-clusters evaluation
# load iris fname = data_dir + os.sep + "iris.arff" print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) # remove class attribute flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"]) flt.set_inputformat(data) filtered = flt.filter(data) # build KMeans print("\n--> SimpleKMeans\n") cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"]) cl.build_clusterer(filtered) evl = ClusterEvaluation() evl.set_model(cl) evl.test_model(filtered) print(evl.get_cluster_results()) plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True) # use AddCluster filter print("\n--> AddCluster filter\n") flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 3"]) flt.set_inputformat(filtered) addcl = flt.filter(filtered) print(addcl) # classes-to-clusters evaluation
dataDir = os.path.join(os.path.dirname(os.path.abspath('')), 'data') modelDir = os.path.join(os.path.dirname(os.path.abspath('')), 'models') dformat = DataFormatter(dataDir) dformat.dict2arff(os.path.join(dataDir, 'System.csv'), os.path.join(dataDir, 'System.arff')) #Arff_file = os.path.join(dataDir, 'System.arff') jvm.start(packages=True) data = converters.load_any_file(os.path.join(dataDir, 'System.arff')) clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "10"]) clusterer.build_clusterer(data) # print clusterer # cluster the data # for inst in data: # cl = clusterer.cluster_instance(inst) # 0-based cluster index # dist = clusterer.distribution_for_instance(inst) # cluster membership distribution # print("cluster=" + str(cl) + ", distribution=" + str(dist)) # print inst # serialization.write(os.path.join(modelDir, 'SKM.model'), clusterer) clusterEM = Clusterer(classname="weka.clusterers.EM", options=[ "-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6", "-ll-iter", "1.0E-6", "-M",
# Load the dataset named by ``fname``.
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# Build SimpleKMeans once per seed; -1 is a marker meaning "do not set the
# -S option", i.e. keep the clusterer's own default seed.
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.set_options(["-S", str(seed)])
    cl.build_clusterer(data)
    # Evaluate on the training data and print the results.
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.get_cluster_results())

# Build XMeans on the output of a RemoveType filter configured with
# "-T numeric -V".
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType", options=["-T", "numeric", "-V"])
flt.set_inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
# Load the dataset named by ``fname``.
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# Build SimpleKMeans once per seed; -1 is a marker meaning "do not set the
# -S option", i.e. keep the clusterer's own default seed.
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.options = ["-S", str(seed)]
    cl.build_clusterer(data)
    # Evaluate on the training data and print the results.
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.cluster_results)

# Build XMeans on the output of a RemoveType filter configured with
# "-T numeric -V".
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType", options=["-T", "numeric", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
##saver.save_file(data, "data_with_class_type.arff")

# Delete the attributes that are not required. The index stays at 2 because
# every deletion shifts the remaining attributes down.
data.delete_attribute(2)
data.delete_attribute(2)
# Uncomment to save the file that has serviceId as class and forkV/ForkW as attributes
###saver.save_file(data, "data_with_class_serviceID.arff")
data.delete_attribute(2)
#saver.save_file(data,"data.arff")

num_clusters = "6"  # number of clusters for k-means

# Perform the clustering.
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", num_clusters])
clusterer.build_clusterer(data)
for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
    #print("cluster=" + str(cl) + ", distribution=" + str(dist))

# Collect and print the evaluation of the clustered instances.
evaluation = ClusterEvaluation()
evaluation.set_model(clusterer)
evaluation.test_model(data)
# print() call form works on Python 2 and 3; the bare print statement was
# a syntax error on Python 3.
print(evaluation.cluster_results)
#print("# clusters: " + str(evaluation.num_clusters))
#print("log likelihood: " + str(evaluation.log_likelihood))
#print("cluster assignments:\n" + str(evaluation.cluster_assignments))
#plc.plot_cluster_assignments(evaluation, data,[],True)
class ClusterAgent (BustersAgent):
    """
    Busters agent that chooses its move from a SimpleKMeans cluster model.

    On start-up it counts, per cluster, how often each direction class occurs
    in the agent's training instances and records the majority class of
    every cluster. Each turn it writes the current state as a one-row ARFF
    file, asks the clusterer for the cluster of that row and plays the
    majority direction of that cluster.
    """

    def registerInitialState(self, gameState):
        """
        Initialise the agent: read the class/cluster counts, start the JVM,
        build the clusterer and compute the per-cluster majority class.

        :param gameState: initial game state (its layout feeds the Distancer)
        """
        BustersAgent.registerInitialState(self, gameState)
        self.distancer = Distancer(gameState.data.layout, False)
        # Whether the distance attribute is used (True for v1 and v2, False for v3)
        self.dis = True

        # Used to compute the class values for the policies.
        self.clusters = 8
        self.classes = 4
        self.classCounts = [[0 for i in range(self.classes)] for j in range(self.clusters)]
        self.classIndex = 2
        self.clusterIndex = 3
        self.readInstances()

        # Storage for the training instances.
        self.numInstances = 52
        self.numAttributes = 4
        self.ins = [" " for i in range(self.numInstances)]

        # The Weka wrapper needs the Java virtual machine.
        jvm.start()

        # Build the model.
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file("/home/dot/Escritorio/Universidad/Machine Learning/practica 2/Outputs/agent_header.arff")
        self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(self.clusters)])
        self.clusterer.build_clusterer(data)
        print(self.clusterer)

        # Apply the policy.
        self.politicaMax()

    def readInstances(self):
        """
        Read the header-less agent instances from ``<cwd>/Outputs/agent.arff``
        and count, for every cluster, how often each class value occurs.
        """
        # Agent file (instances without header).
        path = os.getcwd() + "/Outputs/agent.arff"
        # Class values run from North to West (0 - 3); anything that is not
        # East/South/West counts as 0.
        class_values = {"East": 1, "South": 2, "West": 3}
        with open(path, 'r') as f:
            for line in f:
                # Attribute values of this instance (strings).
                values = line.split(",")
                classValue = class_values.get(values[self.classIndex], 0)
                # The cluster number is parsed from the last two characters
                # of the cluster field and is 1-based in the file.
                cluster = values[self.clusterIndex]
                self.classCounts[int(cluster[-2:]) - 1][classValue] += 1

    def politicaMax(self):
        """
        Compute the majority class of every cluster into ``self.max``.

        Ties go to the lowest class index; a cluster with no counts gets 0.
        """
        self.max = [0 for i in range(self.clusters)]
        for i in range(self.clusters):
            temp_max = 0
            class_index = 0
            for j in range(self.classes):
                if self.classCounts[i][j] > temp_max:
                    temp_max = self.classCounts[i][j]
                    class_index = j
            self.max[i] = class_index

    def chooseAction(self, gameState):
        """
        Choose a direction for the current state.

        The state is written to ``<cwd>/Outputs/newInstance.arff``, loaded
        back through Weka and clustered; the move is the majority direction
        of the assigned cluster.

        :param gameState: current game state
        :return: one of the ``Directions`` constants (``STOP`` if no
            instance was loaded)
        """
        path = os.getcwd() + "/Outputs/newInstance.arff"

        parts = ["@RELATION pacman\n"]
        if self.dis:
            parts.append("@ATTRIBUTE dis NUMERIC\n")
        parts.append("@ATTRIBUTE relPos {-1,0,1,2,3,4,5,6,7,8}\n\n")
        parts.append("@DATA\n")

        # Pacman position (x, y).
        pos_pac = gameState.data.agentStates[0].getPosition()
        num_agents = gameState.getNumAgents()

        # Distances to the ghosts.
        if self.dis:
            for i in range(1, num_agents):
                # Real (maze) distance to ghost i.
                pos_ghost = gameState.data.agentStates[i].getPosition()
                distance = self.distancer.getDistance(pos_pac, pos_ghost)
                # Normalisation: (distance - min)/(max - min): min = 1, max = 21
                distance = (distance - 1) / (21 - 1)
                # A distance above 1000 is meant to flag an eaten ghost.
                # NOTE(review): this test runs on the normalised value, so
                # it only triggers for raw distances above 20001 - confirm
                # whether the raw distance was intended.
                if distance > 1000:
                    parts.append("-1,")
                else:
                    parts.append(str(distance) + ",")

        # Positions of the ghosts relative to pacman.
        # {NORTH = 1, NORTH_EAST = 2, EAST = 3, SOUTH_EAST = 4, SOUTH = 5,
        #  SOUTH_WEST = 6, WEST = 7, NORTH_WEST = 8}
        for i in range(1, num_agents):
            pos_ghost = gameState.data.agentStates[i].getPosition()
            if pos_ghost[1] < 3:
                parts.append("-1,")
                continue
            # A ghost on pacman's own position is reported as 0. The
            # ``continue`` stops the code below from also appending "5,"
            # for the same ghost, which it did before.
            if pos_ghost == pos_pac:
                parts.append("0,")
                continue
            if pos_ghost[0] > pos_pac[0]:
                if pos_ghost[1] > pos_pac[1]:
                    parts.append("2,")
                elif pos_ghost[1] < pos_pac[1]:
                    parts.append("4,")
                else:
                    parts.append("3,")
            elif pos_ghost[0] < pos_pac[0]:
                if pos_ghost[1] > pos_pac[1]:
                    parts.append("8,")
                elif pos_ghost[1] < pos_pac[1]:
                    parts.append("6,")
                else:
                    parts.append("7,")
            else:
                if pos_ghost[1] > pos_pac[1]:
                    parts.append("1,")
                else:
                    parts.append("5,")
        parts.append("\n")

        with open(path, 'w') as f:
            f.write("".join(parts))

        # Load the file that was just written. The previous code read a
        # hard-coded absolute path, which is a different file unless the
        # working directory happens to be that directory.
        loader = Loader(classname="weka.core.converters.ArffLoader")
        newData = loader.load_file(path)

        moves = {
            0: Directions.NORTH,
            1: Directions.EAST,
            2: Directions.SOUTH,
            3: Directions.WEST,
        }
        direction = Directions.STOP
        for inst in newData:
            cl = self.clusterer.cluster_instance(inst)
            # Majority class of the assigned cluster; an unknown value keeps
            # the direction chosen so far.
            direction = moves.get(self.max[cl], direction)
        return direction
class WekaCluster(BaseEstimator, OptionHandler, ClusterMixin):
    """
    Wraps a Weka cluster within the scikit-learn framework.
    """

    def __init__(self, jobject=None, cluster=None, classname=None, options=None,
                 nominal_input_vars=None, num_nominal_input_labels=None):
        """
        Initializes the estimator. Can be either instantiated via the following priority of parameters:
        1. JB_Object representing a Java Clusterer object
        2. Clusterer pww3 wrapper
        3. classname/options

        :param jobject: the JB_Object representing a Weka cluster to use
        :type jobject: JB_Object
        :param cluster: the cluster wrapper to use
        :type cluster: Clusterer
        :param classname: the classname of the Weka cluster to instantiate
        :type classname: str
        :param options: the command-line options of the Weka cluster to instantiate
        :type options: list
        :param nominal_input_vars: the input variables to convert to nominal ones, passed on to
                                   to_nominal_attributes before training/predicting
        :param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
        :type num_nominal_input_labels: dict
        """
        if jobject is not None:
            _jobject = jobject
        elif cluster is not None:
            _jobject = cluster.jobject
        elif classname is not None:
            if options is None:
                options = []
            cluster = Clusterer(classname=classname, options=options)
            _jobject = cluster.jobject
        else:
            raise Exception("At least Java classname must be provided!")

        if not is_instance_of(_jobject, "weka.clusterers.Clusterer"):
            raise Exception(
                "Java object does not implement weka.clusterers.Clusterer!")

        super(WekaCluster, self).__init__(_jobject)
        self._cluster = Clusterer(jobject=_jobject)
        self.header_ = None
        # the following references are required for get_params/set_params
        self._classname = classname
        self._options = options
        self._nominal_input_vars = nominal_input_vars
        self._num_nominal_input_labels = num_nominal_input_labels

    @property
    def cluster(self):
        """
        Returns the underlying cluster object, if any.

        :return: the cluster object
        :rtype: Clusterer
        """
        return self._cluster

    @property
    def header(self):
        """
        Returns the underlying dataset header, if any.

        :return: the dataset structure
        :rtype: Instances
        """
        return self.header_

    def fit(self, data, targets=None):
        """
        Trains the cluster.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: ignored
        :type targets: ndarray
        :return: the cluster
        :rtype: WekaCluster
        """
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        d = to_instances(data, num_nominal_labels=self._num_nominal_input_labels)
        self._cluster.build_clusterer(d)
        # keep an empty copy of the training structure for building instances in predict
        self.header_ = d.template_instances(d, 0)
        return self

    def predict(self, data, targets=None):
        """
        Predicts cluster labels.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: ignored
        :type targets: ndarray
        :return: the cluster labels (of type int)
        :rtype: ndarray
        :raises NotFittedError: if the cluster has not been trained yet
        """
        check_is_fitted(self)
        # header_ always exists as an attribute (initialized to None in the
        # constructor), so the generic check above cannot tell an untrained
        # model apart; test the value explicitly
        if self.header_ is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This WekaCluster instance is not fitted yet. Call 'fit' before using this estimator.")
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        result = []
        for d in data:
            inst = to_instance(self.header_, d)
            result.append(int(self._cluster.cluster_instance(inst)))
        return np.array(result)

    def fit_predict(self, data, targets=None):
        """
        Trains the cluster and returns the cluster labels.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: ignored
        :type targets: ndarray
        :return: the cluster labels (of type int)
        :rtype: ndarray
        """
        self.fit(data)
        return self.predict(data)

    def get_params(self, deep=True):
        """
        Returns the parameters for this cluster, basically classname and options list.
        The nominal settings are only included when they have been set.

        :param deep: ignored
        :type deep: bool
        :return: the dictionary with options
        :rtype: dict
        """
        result = dict()
        result["classname"] = self._classname
        result["options"] = self._options
        if self._nominal_input_vars is not None:
            result["nominal_input_vars"] = self._nominal_input_vars
        if self._num_nominal_input_labels is not None:
            result["num_nominal_input_labels"] = self._num_nominal_input_labels
        return result

    def set_params(self, **params):
        """
        Sets the options for the cluster, expects 'classname' and 'options'.
        Replaces the underlying cluster with a fresh, untrained one; the
        nominal settings are reset unless supplied again.

        :param params: the parameter dictionary
        :type params: dict
        """
        if len(params) == 0:
            return
        if "classname" not in params:
            raise Exception("Cannot find 'classname' in parameters!")
        if "options" not in params:
            raise Exception("Cannot find 'options' in parameters!")
        self._classname = params["classname"]
        self._options = params["options"]
        self._cluster = Clusterer(classname=self._classname, options=self._options)
        # the new cluster has not been built, so the old header no longer applies
        self.header_ = None
        self._nominal_input_vars = None
        if "nominal_input_vars" in params:
            self._nominal_input_vars = params["nominal_input_vars"]
        self._num_nominal_input_labels = None
        if "num_nominal_input_labels" in params:
            self._num_nominal_input_labels = params["num_nominal_input_labels"]

    def __str__(self):
        """
        For printing the model.

        :return: the model representation, if any
        :rtype: str
        """
        if self._cluster is None:
            # classname is None when built from a jobject/wrapper
            return str(self._classname) + ": No model built yet"
        else:
            return str(self._cluster)

    def __copy__(self):
        """
        Creates a deep copy of itself.

        :return: the copy
        :rtype: WekaCluster
        """
        # copy the cluster currently in use: set_params replaces self._cluster
        result = WekaCluster(jobject=deepcopy(self._cluster.jobject))
        result._classname = self._classname
        # options are None when the estimator was built from a jobject/wrapper
        result._options = None if self._options is None else self._options[:]
        result._nominal_input_vars = self._nominal_input_vars
        result._num_nominal_input_labels = self._num_nominal_input_labels
        result.header_ = self.header_
        return result

    def __repr__(self, N_CHAR_MAX=700):
        """
        Returns a valid Python string using its classname and options.

        :param N_CHAR_MAX: ignored
        :type N_CHAR_MAX: int
        :return: the representation
        :rtype: str
        """
        # string values get quoted, everything else is inserted as is
        if isinstance(self._nominal_input_vars, str):
            template = "WekaCluster(classname='%s', options=%s, nominal_input_vars='%s')"
        else:
            template = "WekaCluster(classname='%s', options=%s, nominal_input_vars=%s)"
        return template % (
            self._cluster.classname,
            str(self._cluster.options),
            str(self._nominal_input_vars))
eca.drop('APROVADO', axis=1, inplace=True)
eca.to_csv('temp.csv', index=False)

from weka.clusterers import Clusterer
import weka.core.jvm as jvm
import weka.core.serialization as serialization

jvm.start()

# NOTE(review): `eca` is exported to temp.csv above but is then handed to
# build_clusterer() directly; presumably the exported file should be loaded
# into Weka Instances first - confirm against the part of the script that
# precedes this chunk.

# run the technique for 1 to 9 clusters
for i in range(1, 10):
    print('**************Numero de clusters: ' + str(i))
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(i)])
    clusterer.build_clusterer(eca)
    print(clusterer)

# final model with 4 clusters, stored on disk
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "4"])
clusterer.build_clusterer(eca)
print(clusterer)
serialization.write("model/kmeans_eca_reprovacao.model", clusterer)

# read model (disabled; kept as comments so that it cannot swallow the code
# that follows, as the unterminated string block did)
# objects = serialization.read_all("cluster.model")
# clusterer = Clusterer(jobject=objects[0])
# data_aluno = loader.load_file("aluno_temp.csv")
# for instancia in data_aluno:
#     resultado = clusterer.cluster_instance(instancia)
class ClusteredAgent(BustersAgent):
    """
    An agent that chooses its move from k-means clusters of recorded games.

    The recorded instances are clustered once at start-up. Every turn the
    current state is assigned to a cluster and the move of the most similar
    recorded instance of that cluster is played.
    """

    def __init__(self, index = 0, inference = "ExactInference", ghostAgents = None):
        """
        Starts the JVM, builds the clusterer and groups the recorded
        instances by the cluster they were assigned to.
        """
        BustersAgent.__init__(self, index, inference, ghostAgents)
        self.previousDistances = [0,0,0,0]
        jvm.start(max_heap_size="512m")
        self.loader = Loader(classname="weka.core.converters.ArffLoader")

        # cluster the recorded instances without their last attribute
        self.data = self.loader.load_file("data/game_toCluster.arff")
        self.data.delete_last_attribute()
        self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "4", "-I", "500"])
        self.clusterer.build_clusterer(self.data)
        self.inst = ""

        # label every recorded instance with its cluster
        self.data = self.loader.load_file("data/game_toCluster.arff")
        addCluster = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 10 -S 4 -I 500", "-I", "last"])
        addCluster.inputformat(self.data)
        filtered = addCluster.filter(self.data)

        # the file has to be closed (flushed) before it is read back below
        self.f = open('data/addCluster.arff', 'w+')
        try:
            self.f.write(str(filtered))
        finally:
            self.f.close()
        self.clustered_data = self.classifyData('data/addCluster.arff')

    def classifyData(self, filename):
        """
        Groups the data rows of an ARFF file by their cluster label.

        :param filename: the ARFF file whose last column is cluster1..cluster10
        :return: a list of 10 lists with the raw lines of each cluster
        """
        num_clusters = 10  # matches the -N option used for clustering
        index_by_name = dict(("cluster" + str(n + 1), n) for n in range(num_clusters))
        self.data_clust = [[] for _ in range(num_clusters)]
        with open(filename, "r") as f:
            for line in f:
                # skip header lines and blank lines
                if "@" in line or line == "\n":
                    continue
                cluster_name = line.split(",")[-1].strip()
                cluster_index = index_by_name.get(cluster_name)
                if cluster_index is not None:
                    self.data_clust[cluster_index].append(line)
        return self.data_clust

    def registerInitialState(self, gameState):
        "Delegates the initialization to BustersAgent."
        BustersAgent.registerInitialState(self, gameState)

    def getInstance(self, gameState):
        """
        Writes the current state as a one-row ARFF file and clusters it.

        Stores the loaded instance in ``self.inst`` and updates
        ``self.previousDistances`` for the next turn.

        :param gameState: the current game state
        :return: the index of the cluster the state was assigned to
        """
        headers = "".join([
            "@relation prueba\n\n",
            "@attribute score NUMERIC\n",
            "@attribute ghosts-living NUMERIC\n",
            "@attribute distance-ghost1 NUMERIC \n",
            "@attribute distance-ghost2 NUMERIC \n",
            "@attribute distance-ghost3 NUMERIC \n",
            "@attribute distance-ghost4 NUMERIC \n",
            "@attribute prev-distance-ghost1 NUMERIC \n",
            "@attribute prev-distance-ghost2 NUMERIC \n",
            "@attribute prev-distance-ghost3 NUMERIC \n",
            "@attribute prev-distance-ghost4 NUMERIC \n",
            "@attribute posX NUMERIC\n",
            "@attribute posY NUMERIC\n",
            "@attribute direction {North, South, East, West, Stop}\n",
            "@attribute wall-east {True, False}\n",
            "@attribute wall-south {True, False}\n",
            "@attribute wall-west {True, False}\n",
            "@attribute wall-north {True, False}\n",
            "@data\n\n\n",
        ])

        fields = [str(gameState.data.score)]

        # NOTE(review): this counts every entry of livingGhosts[1:], not only
        # the living ones; left unchanged because the clustered data was
        # presumably produced the same way - confirm.
        livingGhosts = len(gameState.livingGhosts[1:])
        fields.append(str(livingGhosts))

        # distances to the ghosts in the current turn
        # NOTE(review): the indices start at 0 although the count comes from
        # livingGhosts[1:]; looks like an off-by-one, left unchanged - confirm.
        current = []
        for i in range(livingGhosts):
            if gameState.livingGhosts[i] is False:
                current.append(0)
            else:
                current.append(self.distancer.getDistance(gameState.getPacmanPosition(), gameState.getGhostPosition(i)))
        fields.extend(str(d) for d in current)

        # distances to the ghosts in the previous turn
        fields.extend(str(d) for d in self.previousDistances)

        # store the distances of this turn for the next one
        for i, d in enumerate(current):
            self.previousDistances[i] = d

        pacman = gameState.data.agentStates[0]
        x, y = gameState.getPacmanPosition()[0], gameState.getPacmanPosition()[1]
        fields.extend([
            str(pacman.getPosition()[0]),
            str(pacman.getPosition()[1]),
            str(pacman.getDirection()),
            str(gameState.hasWall(x - 1, y)),
            str(gameState.hasWall(x, y - 1)),
            str(gameState.hasWall(x + 1, y)),
            str(gameState.hasWall(x, y + 1)),
            # NOTE(review): with four ghosts this trailing value is one more
            # than the number of declared attributes - confirm.
            "?",
        ])

        with open('data/instances.arff', 'w+') as out:
            out.write(headers)
            out.write(",".join(fields))

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file("data/instances.arff")
        data.class_is_last()   # set class attribute
        for inst in data:
            pred = self.clusterer.cluster_instance(inst)
            self.inst = inst
        return pred

    def closeMove(self, move, option):
        """
        Returns an alternative to the given move.

        :param move: the move that cannot be played
        :param option: 0 or 1 for one of the two perpendicular moves, anything
                       else for the opposite move
        :return: the alternative move, Directions.SOUTH for an unknown move
        """
        # move -> (option 0, option 1, opposite)
        alternatives = {
            Directions.NORTH: (Directions.EAST, Directions.WEST, Directions.SOUTH),
            Directions.SOUTH: (Directions.EAST, Directions.WEST, Directions.NORTH),
            Directions.EAST: (Directions.NORTH, Directions.SOUTH, Directions.WEST),
            Directions.WEST: (Directions.NORTH, Directions.SOUTH, Directions.EAST),
        }
        if move not in alternatives:
            return Directions.SOUTH
        first, second, opposite = alternatives[move]
        if option == 0:
            return first
        if option == 1:
            return second
        return opposite

    def chooseAction(self, gameState):
        """
        Plays the move of the most similar recorded instance, working around
        obstacles when that move is illegal.
        """
        start = self.startMeasuring(gameState)
        move = self.getMove(ClusteredAgent.getInstance(self, gameState))
        end = self.endMeasuring()
        self.f_stats.write(str(end - start) + "\n")

        legal = gameState.getLegalActions(0)
        if move in legal:
            return move
        # When an illegal action was chosen, try to go around the obstacle,
        # trying the two perpendicular moves in random order
        rand = random.randint(0,1)
        for option in (rand, (rand+1)%2):
            closemove = self.closeMove(move, option)
            if closemove in legal:
                return closemove
        # When this is not possible, we can only backtrack
        return self.closeMove(move, 2)

    def getMove(self, clusterNum):
        """
        Returns the move of the recorded instance closest to the current one.

        :param clusterNum: the index of the cluster to search
        :return: the second-to-last field of the closest instance, or
                 Directions.STOP when the cluster holds no instances
        """
        candidates = self.clustered_data[clusterNum]
        if not candidates:
            # nothing to imitate in an empty cluster
            return Directions.STOP
        # min() keeps the first instance on ties, like index(min(...)) did
        closest = min(candidates, key=self.getSimilarity)
        # return the movement
        return closest.split(",")[-2]

    def similarityFunc(self, attrs):
        """
        Folds the attributes of an instance into a single weighted score.

        :param attrs: the attribute values of the instance as strings
        :return: the score
        """
        # ghosts-living
        a = float(attrs[1]) * 0.2
        # distance-ghosts
        dist = 0
        for i in attrs[2:6]:
            dist += float(i)
        a += dist * 0.2
        # posX and posY
        a += float(int(attrs[10]) + int(attrs[11])) * 0.2
        # direction
        a += float(move_to_num[attrs[12]]) * 0.2
        # walls: compare the text, bool() of any non-empty string is True
        wall = sum(1 for flag in attrs[13:17] if flag.strip() == "True")
        a += wall * 0.2
        return a

    def getSimilarity(self, instance):
        """
        Returns the score difference between a recorded instance and the
        current one; smaller means more similar.

        :param instance: the raw line of the recorded instance
        :return: the absolute difference of the two scores
        """
        attrs_known_inst = instance.split(",")
        attrs_new_inst = str(self.inst).split(",")
        a = self.similarityFunc(attrs_known_inst)
        b = self.similarityFunc(attrs_new_inst)
        return abs(a - b)