def perform_KMeans(data, classes, k):
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(k)])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity
def perform_HC(data, classes, k, link):
    clusterer = Clusterer(classname="weka.clusterers.HierarchicalClusterer", options=["-N", str(k), "-L", link])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity
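# A minimal usage sketch for the purity helpers above, assuming cluster_purity()
# is defined elsewhere in this code base. The ARFF path and the way the class
# labels are obtained are placeholders; load_class_labels() is a hypothetical helper.
from weka.core.converters import Loader

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("iris_no_class.arff")      # data without a class attribute
classes = load_class_labels("iris_labels.txt")     # hypothetical helper returning the class labels

print("k-means purity:", perform_KMeans(data, classes, k=3))
print("complete-link HC purity:", perform_HC(data, classes, k=3, link="COMPLETE"))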
def assign_cluster(file_location, file_out="clustered.csv", model="kmeans.model", last_filename=False):
    data = read_csv_file(file_location)
    check_jvm()
    # load clusters
    obj = serialization.read(model)
    clusterer = Clusterer(jobject=obj)
    # create file with cluster group
    with open(file_out, 'w') as output:
        for index, attrs in enumerate(data):
            tmp = []
            if last_filename:
                inst = Instance.create_instance(attrs[:-2])
            else:
                inst = Instance.create_instance(attrs[1:])
            pred = clusterer.cluster_instance(inst)
            dist = clusterer.distribution_for_instance(inst)
            if last_filename:
                tmp.append(attrs[-1])
                tmp.append(pred)
                tmp.extend(attrs[:-2])
            else:
                tmp.append(attrs[0])
                tmp.append(pred)
                tmp.extend(attrs[1:])
            print(str(index + 1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
            output.write('%s\n' % (','.join(map(str, tmp))))
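# Hypothetical call to assign_cluster(): writes one row per input record with the
# predicted cluster index inserted after the identifier column. Assumes a JVM is
# already running, that read_csv_file() is defined elsewhere, and that
# "features.csv" / "kmeans.model" are placeholder paths.
assign_cluster("features.csv", file_out="clustered.csv", model="kmeans.model")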
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    full = loader.load_file(iris_file)
    full.class_is_last()

    # remove class attribute
    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print("done")

    # classes to clusters
    evl = ClusterEvaluation()
    evl.set_model(clusterer)
    evl.test_model(full)
    helper.print_title("Cluster results")
    print(evl.cluster_results)
    helper.print_title("Classes to clusters")
    print(evl.classes_to_clusters)
def simpleKMeansTrain(self, dataf, options, mname, temp=True):
    '''
    :param data: -> data to be clustered
    :param options: -> SimpleKMeans options
                      N -> number of clusters
                      A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
                      l -> maximum number of iterations default 500
                      num-slots -> number of execution slots, 1 means no parallelism
                      S -> Random number seed (default 10)
                      example => ["-N", "10", "-S", "10"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp=True)
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
        clusterer.build_clusterer(data)
        print(clusterer)
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
        self.saveModel(clusterer, 'skm', mname)
    except Exception as e:
        print(traceback.format_exc())
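# Hypothetical invocation of simpleKMeansTrain() using the example options from
# its docstring; `engine` stands in for whatever object these training methods
# are defined on, and "system_metrics.csv" / "skm_model" are placeholder names.
engine.simpleKMeansTrain("system_metrics.csv", ["-N", "10", "-S", "10"], "skm_model")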
def emTrain(self, dataf, options, mname, temp=True):
    '''
    :param data: -> data to be clustered
    :param options: -> EM options
                      I -> number of iterations
                      N -> number of clusters
                      M -> Minimum standard deviation for normal density (default=1.0E-6)
                      num-slots -> number of execution slots, 1 means no parallelism
                      S -> random seed (default=100)
                      example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                                  "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        clusterEM = Clusterer(classname="weka.clusterers.EM", options=options)
        clusterEM.build_clusterer(data)
        print(clusterEM)
        self.saveModel(clusterEM, 'em', mname)
    except Exception as e:
        print(traceback.format_exc())
    finally:
        jvm.stop()
def perform_DBScan(data, classes, e, min_points):
    clusterer = Clusterer(classname="weka.clusterers.DBSCAN", options=["-E", str(e), "-M", str(min_points)])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity
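# Companion sketch for perform_DBScan(). Note that weka.clusterers.DBSCAN ships in
# an optional Weka package (optics_dbScan), so this assumes the package is installed
# and the JVM was started with package support; `data` and `classes` are the same
# placeholders as in the sketch after perform_HC() above.
print("DBSCAN purity:", perform_DBScan(data, classes, e=0.9, min_points=6))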
def main():
    if not is_installed("CLOPE"):
        print("CLOPE is not installed, installing now")
        install_package("CLOPE")
        print("please restart")
        return
    cls = Clusterer(classname="weka.clusterers.CLOPE")
    print("CLOPE is installed:", cls.to_commandline())
def run_cluster_simplek(self, output_directory, exc_class=False, num_clusters=7):
    data = Instances.copy_instances(self.training_data)
    data.no_class()
    data.delete_first_attribute()

    # build a clusterer and output model
    print("\nBuilding Clusterer on training data.")
    buildTimeStart = time.time()
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "" + str(num_clusters)])
    clusterer.build_clusterer(data)

    resultsString = ""
    resultsString = self.print_both(str(clusterer), resultsString)
    buildTimeString = "Clusterer Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Evaluate Clusterer
    resultsString = self.print_both("\nClustering data.", resultsString)
    buildTimeStart = time.time()
    clsexc = ""
    if exc_class:
        # no class attribute
        clsexc = "_NO_Class"
        evl = ClusterEvaluation()
        evl.set_model(clusterer)
        evl.test_model(data)
    else:
        # classes to clusters
        evl = ClusterEvaluation()
        evl.set_model(clusterer)
        evl.test_model(self.training_data)

    resultsString = self.print_both("\nCluster results\n", resultsString)
    resultsString = self.print_both(str(evl.cluster_results), resultsString)
    resultsString = self.print_both("\nClasses to clusters\n", resultsString)
    resultsString = self.print_both(str(evl.classes_to_clusters), resultsString)
    buildTimeString = "\nClustered data in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Save Results and Cleanup
    self.save_results("SimpleKM" + clsexc + "_", resultsString, output_directory)
def run_clustering_task7_manual(self, output_directory, clusterer_name, num_clusters, seed=10):
    data = Instances.copy_instances(self.training_data)
    data.no_class()
    data.delete_first_attribute()

    clusterer_name_short = clusterer_name.replace("weka.clusterers.", "")

    # build a clusterer and output model
    print("\nBuilding " + clusterer_name_short + " Clusterer on training data.")
    buildTimeStart = time.time()
    clusterer = Clusterer(classname=clusterer_name,
                          options=["-N", "" + str(num_clusters), "-S", "" + str(seed)])
    clusterer.build_clusterer(data)

    resultsString = ""
    resultsString = self.print_both(str(clusterer), resultsString)
    buildTimeString = "Clusterer Built in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Evaluate Clusterer
    resultsString = self.print_both("\nClustering data.", resultsString)
    buildTimeStart = time.time()
    evl = ClusterEvaluation()
    evl.set_model(clusterer)
    evl.test_model(self.training_data)
    resultsString = self.print_both("\nCluster results\n", resultsString)
    resultsString = self.print_both(str(evl.cluster_results), resultsString)
    resultsString = self.print_both("\nClasses to clusters\n", resultsString)
    resultsString = self.print_both(str(evl.classes_to_clusters), resultsString)
    buildTimeString = "\nClustered data in " + str(time.time() - buildTimeStart) + " secs.\n"
    resultsString = self.print_both(buildTimeString, resultsString)

    # Save Results and Cleanup
    self.save_results(clusterer_name_short + "_" + "N" + str(num_clusters) + "_S" + str(seed),
                      resultsString, output_directory)
def create_cluster_model(arff_file, n=10, loader_type="csv", model="kmeans.model"):
    """
    create cluster model
    """
    check_jvm()
    if loader_type == "csv":
        loader = converters.Loader(classname="weka.core.converters.CSVLoader")
    else:
        loader = converters.Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arff_file)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(n)])
    clusterer.build_clusterer(data)
    serialization.write(model, clusterer)
def run_clusterer(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]
    print("Running Clusterer on %s" % filename)
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    full = load_Arff_file(file)
    full.class_is_first()

    full_withoutclass = load_Arff_file(file)
    #data.delete_first_attribute()

    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_first_attribute()

    dir = dir / "cluster_results_optimum"
    dir.mkdir(parents=True, exist_ok=True)

    # Init clusterer
    #"-N", "-1",
    n = "2"
    if filename_base.startswith("fer2018_"):
        print("Changing number of clusters to 7")
        n = "7"

    #clusterer = Clusterer(classname="weka.clusterers.EM", options=["-S", "10", "-N", n])
    #clusterer = Clusterer(classname="weka.clusterers.FarthestFirst", options=["-S", "10", "-N", n])
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-S", "10", "-N", n])
    clusterer.build_clusterer(data)

    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(full)

    str1 = str(filename_base) + "_cl_res.txt"
    output_results = dir / str1
    output_cluster(evaluation, output_results)
def train_data(self):
    try:
        #helper.print_info("Loading dataset: " + self.datasetName)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data_train = loader.load_file(self.datasetName)
        data_train.delete_last_attribute()
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "2"])
        clusterer.build_clusterer(data_train)
        return clusterer
    except Exception as e:
        print(traceback.format_exc())
        raise e
def __init__(self, jobject=None, cluster=None, classname=None, options=None,
             nominal_input_vars=None, num_nominal_input_labels=None):
    """
    Initializes the estimator. Can be either instantiated via the following priority of parameters:
    1. JB_Object representing a Java Clusterer object
    2. Clusterer pww3 wrapper
    3. classname/options

    :param jobject: the JB_Object representing a Weka cluster to use
    :type jobject: JB_Object
    :param cluster: the cluster wrapper to use
    :type cluster: Clusterer
    :param classname: the classname of the Weka cluster to instantiate
    :type classname: str
    :param options: the command-line options of the Weka cluster to instantiate
    :type options: list
    :param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
    :type num_nominal_input_labels: dict
    """
    if jobject is not None:
        _jobject = jobject
    elif cluster is not None:
        _jobject = cluster.jobject
    elif classname is not None:
        if options is None:
            options = []
        cluster = Clusterer(classname=classname, options=options)
        _jobject = cluster.jobject
    else:
        raise Exception("At least Java classname must be provided!")

    if not is_instance_of(_jobject, "weka.clusterers.Clusterer"):
        raise Exception("Java object does not implement weka.clusterers.Clusterer!")

    super(WekaCluster, self).__init__(_jobject)
    self._cluster = Clusterer(jobject=_jobject)
    self.header_ = None

    # the following references are required for get_params/set_params
    self._classname = classname
    self._options = options
    self._nominal_input_vars = nominal_input_vars
    self._num_nominal_input_labels = num_nominal_input_labels
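# Sketch of the three construction paths the WekaCluster initializer accepts, in its
# documented priority order. Only the constructor shown above is assumed; anything
# else about the surrounding scikit-learn-style wrapper is out of scope here.
from weka.clusterers import Clusterer

kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])

wc1 = WekaCluster(jobject=kmeans.jobject)                     # 1. raw JB_Object
wc2 = WekaCluster(cluster=kmeans)                             # 2. Clusterer wrapper
wc3 = WekaCluster(classname="weka.clusterers.SimpleKMeans",
                  options=["-N", "3"])                        # 3. classname/options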
def query_instance(attributes, model="kmeans.model"):
    """
    get the cluster for defined attributes
    :params attributes: array or list
    :returns: cluster id
    """
    check_jvm()
    # create instance
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load cluster and get the cluster_id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)
    return cluster_id
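# Hypothetical round trip using the two helpers above: train and serialize a
# SimpleKMeans model from a CSV file, then query the cluster for a single feature
# vector. "measurements.csv" and the attribute values are placeholders; the JVM is
# assumed to have been started elsewhere (check_jvm() only verifies it is running).
create_cluster_model("measurements.csv", n=5, loader_type="csv", model="kmeans.model")
cluster_id = query_instance([5.1, 3.5, 1.4, 0.2], model="kmeans.model")
print("assigned cluster:", cluster_id)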
def __init__(self, index=0, inference="ExactInference", ghostAgents=None):
    BustersAgent.__init__(self, index, inference, ghostAgents)
    self.previousDistances = [0, 0, 0, 0]
    jvm.start(max_heap_size="512m")
    self.loader = Loader(classname="weka.core.converters.ArffLoader")
    self.data = self.loader.load_file("data/game_toCluster.arff")
    self.data.delete_last_attribute()
    self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "4", "-I", "500"])
    self.clusterer.build_clusterer(self.data)
    self.inst = ""
    self.data = self.loader.load_file("data/game_toCluster.arff")
    addCluster = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
                        options=["-W", "weka.clusterers.SimpleKMeans -N 10 -S 4 -I 500", "-I", "last"])
    addCluster.inputformat(self.data)
    filtered = addCluster.filter(self.data)
    self.f = open('data/addCluster.arff', 'w+')
    self.f.write(str(filtered))
    self.clustered_data = self.classifyData('data/addCluster.arff')
def predicaoCluster(matricula, curso, tipo_predicao):
    dados = retornarDadosCurso(curso)

    # select the student's features
    aluno = dados.loc[dados['MATRICULA'] == matricula][:]
    aluno.drop('MATRICULA', axis=1, inplace=True)
    aluno.drop('APROVADO', axis=1, inplace=True)
    aluno.drop('COD_DISCIPLINA', axis=1, inplace=True)
    aluno.drop('SIT_MATRICULA', axis=1, inplace=True)
    aluno = aluno.head(1)
    aluno.to_csv('aluno_temp.csv', index=False)

    from weka.clusterers import Clusterer
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    import weka.core.serialization as serialization

    jvm.start()
    if curso == 'si':
        if tipo_predicao == 'reprovacao':
            model = serialization.read_all("model/kmeans_si_reprovacao.model")
        elif tipo_predicao == 'evasao':
            model = serialization.read_all("model/kmeans_si_evasao.model")
    elif curso == 'eca':
        if tipo_predicao == 'reprovacao':
            model = serialization.read_all("model/kmeans_eca_reprovacao.model")
        elif tipo_predicao == 'evasao':
            model = serialization.read_all("model/kmeans_eca_evasao.model")

    cluster = Clusterer(jobject=model[0])
    loader = Loader(classname="weka.core.converters.CSVLoader")
    dado_aluno = loader.load_file("aluno_temp.csv")
    for aluno in dado_aluno:
        cluster_aluno_pertence = cluster.cluster_instance(aluno)
    #jvm.stop()
    caracteristica = retornarCaracteristicaCluster(curso, tipo_predicao, cluster_aluno_pertence)
    return caracteristica
def simpleKMeansTrain(self, dataf, options, mname, temp=True):
    '''
    :param data: -> data to be clustered
    :param options: -> SimpleKMeans options
                      N -> number of clusters
                      A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
                      l -> maximum number of iterations default 500
                      num-slots -> number of execution slots, 1 means no parallelism
                      S -> Random number seed (default 10)
                      example => ["-N", "10", "-S", "10"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp=True)
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
        clusterer.build_clusterer(data)
        print(clusterer)
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
        self.saveModel(clusterer, 'skm', mname)
    except Exception as e:
        print(traceback.format_exc())
    finally:
        jvm.stop()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    # cluster data
    helper.print_info("Clustering data")
    for index, inst in enumerate(data):
        cl = clusterer.cluster_instance(inst)
        dist = clusterer.distribution_for_instance(inst)
        print(str(index + 1) + ": cluster=" + str(cl) + ", distribution=" + str(dist))
def dbscanTrain(self, dataf, options, mname, temp=True):
    '''
    :param data: -> data to be clustered
    :param options: -> dbscan options
                      E -> epsilon (default = 0.9)
                      M -> minPoints (default = 6)
                      D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
                      I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
                      example => ["-E", "0.9", "-M", "6",
                                  "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
                                  "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
        clusterDBSCAN.build_clusterer(data)
        print(clusterDBSCAN)
        self.saveModel(clusterDBSCAN, 'dbscan', mname)
        # cluster the data
    except Exception as e:
        print(traceback.format_exc())
def registerInitialState(self, gameState):
    BustersAgent.registerInitialState(self, gameState)
    self.distancer = Distancer(gameState.data.layout, False)

    # To compute the class values in the policies.
    self.clusters = 8
    self.classes = 4
    self.classCounts = [[0 for i in range(self.classes)] for j in range(self.clusters)]
    self.classIndex = 2
    self.clusterIndex = 3
    self.readInstances()

    # Used to store the training instances.
    self.numInstances = 52
    self.numAttributes = 4
    #self.instances = [[" " for i in range(self.numAttributes)] for j in range(self.numInstances)]
    self.ins = [" " for i in range(self.numInstances)]

    # To use the library we need to start the Java virtual machine (JVM).
    jvm.start()

    # Create the model
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(
        "/home/dot/Escritorio/Universidad/Machine Learning/practica 2/Outputs/agent_header.arff"
    )
    self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(self.clusters)])
    self.clusterer.build_clusterer(data)
    print(self.clusterer)

    # Apply the policy
    self.politicaMax()
def set_params(self, **params):
    """
    Sets the options for the cluster, expects 'classname' and 'options'.

    :param params: the parameter dictionary
    :type params: dict
    """
    if len(params) == 0:
        return
    if "classname" not in params:
        raise Exception("Cannot find 'classname' in parameters!")
    if "options" not in params:
        raise Exception("Cannot find 'options' in parameters!")
    self._classname = params["classname"]
    self._options = params["options"]
    self._cluster = Clusterer(classname=self._classname, options=self._options)
    self._nominal_input_vars = None
    if "nominal_input_vars" in params:
        self._nominal_input_vars = params["nominal_input_vars"]
    self._num_nominal_input_labels = None
    if "num_nominal_input_labels" in params:
        self._num_nominal_input_labels = params["num_nominal_input_labels"]
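# Hypothetical use of set_params(): rebuilds the underlying Clusterer from a
# classname/options pair, mirroring scikit-learn's parameter API. Only the keys
# handled by the method above are used; `wc` stands for an existing WekaCluster.
wc.set_params(classname="weka.clusterers.EM",
              options=["-N", "4", "-I", "100"])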
def emTrain(self, dataf, options, mname, temp=True):
    '''
    :param data: -> data to be clustered
    :param options: -> EM options
                      I -> number of iterations
                      N -> number of clusters
                      M -> Minimum standard deviation for normal density (default=1.0E-6)
                      num-slots -> number of execution slots, 1 means no parallelism
                      S -> random seed (default=100)
                      example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                                  "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        clusterEM = Clusterer(classname="weka.clusterers.EM", options=options)
        clusterEM.build_clusterer(data)
        print(clusterEM)
        self.saveModel(clusterEM, 'em', mname)
    except Exception as e:
        print(traceback.format_exc())
def dbscanTrain(self, dataf, options, mname, temp=True):
    '''
    :param data: -> data to be clustered
    :param options: -> dbscan options
                      E -> epsilon (default = 0.9)
                      M -> minPoints (default = 6)
                      D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
                      I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
                      example => ["-E", "0.9", "-M", "6",
                                  "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
                                  "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp)
        clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
        clusterDBSCAN.build_clusterer(data)
        print(clusterDBSCAN)
        self.saveModel(clusterDBSCAN, 'dbscan', mname)
        # cluster the data
    except Exception as e:
        print(traceback.format_exc())
    finally:
        jvm.stop()
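# Hypothetical invocation of dbscanTrain() with the example options from its
# docstring; as above, `engine` is a placeholder for the object these methods
# live on and "system_metrics.csv" / "dbscan_model" are made-up names.
engine.dbscanTrain("system_metrics.csv",
                   ["-E", "0.9", "-M", "6",
                    "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
                    "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"],
                   "dbscan_model")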
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    helper.print_title("build and save clusterer")
    iris = helper.get_data_dir() + os.sep + "iris_no_class.arff"

    flow = Flow(name="build and save clusterer")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    flow.actors.append(loaddataset)

    train = Train()
    train.config["setup"] = Clusterer(classname="weka.clusterers.SimpleKMeans")
    flow.actors.append(train)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    flow.actors.append(pick)

    console = Console()
    pick.actors.append(console)

    writer = ModelWriter()
    writer.config["output"] = str(tempfile.gettempdir()) + os.sep + "simplekmeans.model"
    flow.actors.append(writer)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def run_SKMeans_137(self):
    # construct output paths
    output_prefix = os.path.split(self.input_path)[-1].split(".")[0]
    print(output_prefix)
    write_date = output_prefix + "." + str(datetime.now().date())
    SKMeans_dir = os.path.join(self.output_dir, "SKMeans")
    eval_path = os.path.join(SKMeans_dir, write_date + ".cl_eval.txt")
    clust_desc_path = os.path.join(SKMeans_dir, write_date + ".cl_descr.txt")
    clust_assign_path = os.path.join(SKMeans_dir, write_date + ".cl_assign.txt")

    # create output dir if it doesn't already exist
    if not os.path.exists(SKMeans_dir):
        os.makedirs(SKMeans_dir)

    # clone data and build clusters
    # data_clone = copy.deepcopy(self.data_loaded)
    data_clone = self.data_loaded
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "137"])
    clusterer.build_clusterer(data_clone)

    # cluster evaluation
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data_clone)
    with open(eval_path, 'w') as outfile:
        outfile.write("number of clusters: \t" + str(evaluation.num_clusters) + "\n")
        outfile.write("log likelihood: \t" + str(evaluation.log_likelihood) + "\n")
        outfile.write("cluster assignments: \t" + str(evaluation.cluster_assignments) + "\n")
        outfile.write("***********************\n")
        outfile.write("\t".join(["SKmeans Cluster Evaluation Results\n"]))  # header
        outfile.write(str(evaluation.cluster_results) + "\n")

    # cluster Instance objects: description of clusters
    with open(clust_desc_path, 'w') as outfile:
        outfile.write(",".join(["cluster_num", "distribution\n"]))  # header
        for inst in data_clone:  # data
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            outfile.write(",".join([str(cl), str(dist)]))
            outfile.write("\n")

    # cluster assignment by row
    with open(clust_assign_path, 'w') as outfile:
        outfile.write(",".join(["row_num", "SKMeans\n"]))  # header
        for i, inst in enumerate(evaluation.cluster_assignments):  # data
            outfile.write(",".join([str(i), str(inst)]))
            outfile.write("\n")

    return ()
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    helper.print_title("Cross-validate clusterer")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="cross-validate clusterer")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    flow.actors.append(loaddataset)

    flter = Filter()
    flter.name = "Remove class"
    flter.config["filter"] = filters.Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    flow.actors.append(flter)

    cv = CrossValidate()
    cv.config["setup"] = Clusterer(classname="weka.clusterers.EM")
    flow.actors.append(cv)

    console = Console()
    console.config["prefix"] = "Loglikelihood: "
    flow.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def command():
    jvm.start()
    import weka.core.converters as converters
    clusters = request.form['clusternum']
    a1 = request.form['firstcol']
    a2 = request.form['secondcol']
    # print clusters
    # print a1
    # print a2
    if a1 == 'B' and a2 == 'C':
        data = converters.load_any_file("Data.csv")
    elif a1 == 'B' and a2 == 'D':
        data = converters.load_any_file("Data1.csv")
    elif a1 == 'C' and a2 == 'D':
        data = converters.load_any_file("Data2.csv")
    elif a1 == 'C' and a2 == 'E':
        data = converters.load_any_file("Data3.csv")
    elif a1 == 'D' and a2 == 'E':
        data = converters.load_any_file("Data4.csv")
    #data.class_is_last()
    print(data)

    # from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    # search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "5"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "2", "-E", "1"])
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)

    from weka.clusterers import Clusterer
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "{}".format(clusters)])
    clusterer.build_clusterer(data)
    print(clusterer)

    with open("filename.txt", "w") as f:
        f.write(str(clusterer))
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
            f.write("cluster=" + str(cl) + ", distribution=" + str(dist))
    return render_template("output.html")
def loadClusterModel(self, method, mname):
    finalname = "%s_%s.model" % (method, mname)
    cluster = Clusterer(jobject=serialization.read(os.path.join(self.modelDir, finalname)))
    logger.info('[%s] : [INFO] Loaded clusterer model %s',
                datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), finalname)
    return cluster
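# Hypothetical use of loadClusterModel(): reload a previously saved SimpleKMeans
# model and cluster a single instance. Assumes a running JVM, that the model file
# "skm_mymodel.model" exists in self.modelDir, and that `inst` is a weka Instance
# whose attributes match the training data.
cluster = engine.loadClusterModel('skm', 'mymodel')
print("cluster index:", cluster.cluster_instance(inst))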
##data.delete_last_attribute()
##saver.save_file(data, "data_with_class_type.arff")

### Deletes the not required attributes
data.delete_attribute(2)
data.delete_attribute(2)
##### Uncomment to save the file which has serviceId as class, forkV and ForkW as attributes
###saver.save_file(data, "data_with_class_serviceID.arff")
data.delete_attribute(2)
#saver.save_file(data,"data.arff")

num_clusters = "6"  # Number of clusters for k means

## Performing clustering
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", num_clusters])
clusterer.build_clusterer(data)
for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
    #print("cluster=" + str(cl) + ", distribution=" + str(dist))

######### Getting the data about the clustered instances
evaluation = ClusterEvaluation()
evaluation.set_model(clusterer)
evaluation.test_model(data)
print(evaluation.cluster_results)
#print("# clusters: " + str(evaluation.num_clusters))
#print("log likelihood: " + str(evaluation.log_likelihood))
#print("cluster assignments:\n" + str(evaluation.cluster_assignments))
eca.drop('N_FALTAS', axis=1, inplace=True)
eca.drop('COD_DISCIPLINA', axis=1, inplace=True)
eca.drop('APROVADO', axis=1, inplace=True)
eca.to_csv('temp.csv', index=False)

from weka.clusterers import Clusterer
import weka.core.jvm as jvm
import weka.core.serialization as serialization

jvm.start()

# run the technique varying from 1 to 9 clusters
for i in range(1, 10):
    print('************** Number of clusters: ' + str(i))
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(i)])
    clusterer.build_clusterer(eca)
    print(clusterer)

clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "4"])
clusterer.build_clusterer(eca)
print(clusterer)
serialization.write("model/kmeans_eca_reprovacao.model", clusterer)

# read model
'''objects = serialization.read_all("cluster.model")
clusterer = Clusterer(jobject=objects[0])
data_aluno = loader.load_file("aluno_temp.csv")
for instancia in data_aluno:
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    count = 50
    helper.print_title("build clusterer incrementally")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="build clusterer incrementally")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    initcounter = InitStorageValue()
    initcounter.config["storage_name"] = "counter"
    initcounter.config["value"] = 0
    flow.actors.append(initcounter)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    flow.actors.append(loaddataset)

    remove = Filter(name="remove class attribute")
    remove.config["setup"] = filters.Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    flow.actors.append(remove)

    inccounter = UpdateStorageValue()
    inccounter.config["storage_name"] = "counter"
    inccounter.config["expression"] = "{X} + 1"
    flow.actors.append(inccounter)

    train = Train()
    train.config["setup"] = Clusterer(classname="weka.clusterers.Cobweb")
    flow.actors.append(train)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    pick.config["switch"] = True
    flow.actors.append(pick)

    tee = Tee(name="output model every " + str(count) + " instances")
    tee.config["condition"] = "@{counter} % " + str(count) + " == 0"
    flow.actors.append(tee)

    trigger = Trigger(name="output # of instances")
    tee.actors.append(trigger)

    getcounter = GetStorageValue()
    getcounter.config["storage_name"] = "counter"
    trigger.actors.append(getcounter)

    console = Console()
    console.config["prefix"] = "# of instances: "
    trigger.actors.append(console)

    console = Console(name="output model")
    tee.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """
    # setup the flow
    helper.print_title("cluster data")
    iris = helper.get_data_dir() + os.sep + "iris_no_class.arff"
    clsfile = str(tempfile.gettempdir()) + os.sep + "simplekmeans.model"

    flow = Flow(name="cluster data")

    start = Start()
    flow.actors.append(start)

    build_save = Trigger()
    build_save.name = "build and save clusterer"
    flow.actors.append(build_save)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    build_save.actors.append(filesupplier)

    loaddataset = LoadDataset()
    build_save.actors.append(loaddataset)

    ssv = SetStorageValue()
    ssv.config["storage_name"] = "data"
    build_save.actors.append(ssv)

    train = Train()
    train.config["setup"] = Clusterer(classname="weka.clusterers.SimpleKMeans")
    build_save.actors.append(train)

    ssv = SetStorageValue()
    ssv.config["storage_name"] = "model"
    build_save.actors.append(ssv)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    build_save.actors.append(pick)

    console = Console()
    console.config["prefix"] = "built: "
    pick.actors.append(console)

    writer = ModelWriter()
    writer.config["output"] = clsfile
    build_save.actors.append(writer)

    pred_serialized = Trigger()
    pred_serialized.name = "make predictions (serialized model)"
    flow.actors.append(pred_serialized)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    pred_serialized.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    pred_serialized.actors.append(loaddataset)

    predict = Predict()
    predict.config["model"] = clsfile
    pred_serialized.actors.append(predict)

    console = Console()
    console.config["prefix"] = "serialized: "
    pred_serialized.actors.append(console)

    pred_storage = Trigger()
    pred_storage.name = "make predictions (model from storage)"
    flow.actors.append(pred_storage)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    pred_storage.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    pred_storage.actors.append(loaddataset)

    predict = Predict()
    predict.config["storage_name"] = "model"
    pred_storage.actors.append(predict)

    console = Console()
    console.config["prefix"] = "storage: "
    pred_storage.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
jvm.start()

# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.set_inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
flt.set_inputformat(filtered)
addcl = flt.filter(filtered)
print(addcl)
import weka.core.packages as packages

dataDir = os.path.join(os.path.dirname(os.path.abspath('')), 'data')
modelDir = os.path.join(os.path.dirname(os.path.abspath('')), 'models')

dformat = DataFormatter(dataDir)
dformat.dict2arff(os.path.join(dataDir, 'System.csv'), os.path.join(dataDir, 'System.arff'))

#Arff_file = os.path.join(dataDir, 'System.arff')

jvm.start(packages=True)

data = converters.load_any_file(os.path.join(dataDir, 'System.arff'))

clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "10"])
clusterer.build_clusterer(data)
# print clusterer

# cluster the data
# for inst in data:
#     cl = clusterer.cluster_instance(inst)  # 0-based cluster index
#     dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
#     print("cluster=" + str(cl) + ", distribution=" + str(dist))
#     print inst

# serialization.write(os.path.join(modelDir, 'SKM.model'), clusterer)

clusterEM = Clusterer(classname="weka.clusterers.EM",
                      options=["-I", "1000", "-N", "6", "-X", "10", "-max", "-1",
# load weather.numeric
fname = data_dir + os.sep + "weather.numeric.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.options = ["-S", str(seed)]
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.cluster_results)

# build XMeans
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType", options=["-T", "numeric", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
# load weather.numeric
fname = data_dir + os.sep + "weather.numeric.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.set_options(["-S", str(seed)])
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.get_cluster_results())

# build XMeans
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType", options=["-T", "numeric", "-V"])
flt.set_inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
# load weather.numeric
fname = data_dir + os.sep + "weather.numeric.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.options = ["-S", str(seed)]
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.cluster_results)

# build XMeans
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType", options=["-T", "numeric", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.cluster_results)
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
flt.inputformat(filtered)
addcl = flt.filter(filtered)
print(addcl)
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, data, inst_no=True)

    # using a filtered clusterer
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    fclusterer = FilteredClusterer()
    fclusterer.clusterer = clusterer
    fclusterer.filter = remove
    fclusterer.build_clusterer(data)
    print(fclusterer)

    # load a dataset incrementally and build clusterer incrementally
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    clusterer = Clusterer("weka.clusterers.Cobweb")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(iris_inc)
    iris_filtered = remove.outputformat()
    clusterer.build_clusterer(iris_filtered)
    for inst in loader:
        remove.input(inst)
        inst_filtered = remove.output()
        clusterer.update_clusterer(inst_filtered)
    clusterer.update_finished()
    print(clusterer.to_commandline())
    print(clusterer)
    print(clusterer.graph)
    plg.plot_dot_graph(clusterer.graph)