def perform_KMeans(data, classes, k):
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", str(k)])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity


def perform_HC(data, classes, k, link):
    clusterer = Clusterer(classname="weka.clusterers.HierarchicalClusterer",
                          options=["-N", str(k), "-L", link])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity
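
cluster_purity is a project helper that these snippets call but do not show. A minimal sketch of what it plausibly computes (majority-class purity of the induced clustering) follows; the name matches the calls above, but the exact semantics are an assumption:

from collections import Counter, defaultdict

def cluster_purity(clusterer, data, classes):
    # hypothetical sketch: group the class labels by assigned cluster,
    # then count instances that match their cluster's majority class
    groups = defaultdict(list)
    for inst, label in zip(data, classes):
        groups[clusterer.cluster_instance(inst)].append(label)
    majority = sum(Counter(labels).most_common(1)[0][1] for labels in groups.values())
    return majority / len(classes)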
Example #3
def assign_cluster(file_location, file_out="clustered.csv", model="kmeans.model", last_filename=False):
    data = read_csv_file(file_location)
    check_jvm()
    # load clusterer
    obj = serialization.read(model)
    clusterer = Clusterer(jobject=obj)

    # create file with cluster group
    with open(file_out, 'w') as output:
        for index, attrs in enumerate(data):
            tmp = []
            if last_filename:
                inst = Instance.create_instance(attrs[:-2])
            else:
                inst = Instance.create_instance(attrs[1:])

            pred = clusterer.cluster_instance(inst)
            dist = clusterer.distribution_for_instance(inst)

            if last_filename:
                tmp.append(attrs[-1])
                tmp.append(pred)
                tmp.extend(attrs[:-2])
            else:
                tmp.append(attrs[0])
                tmp.append(pred)
                tmp.extend(attrs[1:])

            print(str(index + 1) + ": label index=" +
                  str(pred) + ", class distribution=" + str(dist))
            output.write('%s\n' % ','.join(map(str, tmp)))
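
A hedged usage sketch of the helper above (read_csv_file is the project's own CSV reader; file names are illustrative):

assign_cluster("new_points.csv", file_out="clustered.csv",
               model="kmeans.model", last_filename=False)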
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    full = loader.load_file(iris_file)
    full.class_is_last()

    # remove class attribute
    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print("done")

    # classes to clusters
    evl = ClusterEvaluation()
    evl.set_model(clusterer)
    evl.test_model(full)
    helper.print_title("Cluster results")
    print(evl.cluster_results)
    helper.print_title("Classes to clusters")
    print(evl.classes_to_clusters)
Example #5
 def simpleKMeansTrain(self, dataf, options, mname, temp=True):
     '''
     :param data: -> data to be clustered
     :param options: -> SimpleKMeans options
                   N -> number of clusters
                   A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
                   l -> maximum number of iterations default 500
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> Random number seed (default 10)
           example => ["-N", "10", "-S", "10"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp)
         clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
         clusterer.build_clusterer(data)
         print(clusterer)
         # cluster the data
         for inst in data:
             cl = clusterer.cluster_instance(inst)  # 0-based cluster index
             dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
             print("cluster=" + str(cl) + ", distribution=" + str(dist))
         self.saveModel(clusterer, 'skm', mname)
     except Exception as e:
         print(traceback.format_exc())
Example #6
 def emTrain(self, dataf, options, mname, temp=True):
     '''
     :param data: -> data to be clustered
     :param options: -> EM options
                   I -> number of iterations
                   N -> number of clusters
                   M -> Minimum standard deviation for normal density (default=1.0E-6)
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> random seed (default=100)
             example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                                    "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp)
         clusterEM = Clusterer(classname="weka.clusterers.EM", options=options)
         clusterEM.build_clusterer(data)
         print(clusterEM)
         self.saveModel(clusterEM, 'em', mname)
     except Exception as e:
         print((traceback.format_exc()))
     finally:
         jvm.stop()

def perform_DBScan(data, classes, e, min_points):
    clusterer = Clusterer(classname="weka.clusterers.DBSCAN",
                          options=["-E", str(e), "-M",
                                   str(min_points)])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity
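
Note that weka.clusterers.DBSCAN is not part of core Weka but comes from a separate package, so building the clusterer above requires that package to be installed. A hedged guard, mirroring the CLOPE check in the next example (the package name is an assumption):

from weka.core.packages import is_installed, install_package

if not is_installed("optics_dbScan"):
    install_package("optics_dbScan")  # assumed package name; restart the JVM afterwards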
def main():
    if not is_installed("CLOPE"):
        print("CLOPE is not installed, installing now")
        install_package("CLOPE")
        print("please restart")
        return

    cls = Clusterer(classname="weka.clusterers.CLOPE")
    print("CLOPE is installed:", cls.to_commandline())
Example #10
    def run_cluster_simplek(self,
                            output_directory,
                            exc_class=False,
                            num_clusters=7):
        data = Instances.copy_instances(self.training_data)
        data.no_class()
        data.delete_first_attribute()

        # build a clusterer and output model
        print("\nBuilding Clusterer on training data.")
        buildTimeStart = time.time()
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                              options=["-N", str(num_clusters)])
        clusterer.build_clusterer(data)

        resultsString = ""
        resultsString = self.print_both(str(clusterer), resultsString)

        buildTimeString = "Clusterer Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Clusterer
        resultsString = self.print_both("\nClustering data.", resultsString)

        buildTimeStart = time.time()

        clsexc = ""
        if exc_class:
            # no class attribute
            clsexc = "_NO_Class"
            evl = ClusterEvaluation()
            evl.set_model(clusterer)
            evl.test_model(data)
        else:
            # classes to clusters
            evl = ClusterEvaluation()
            evl.set_model(clusterer)
            evl.test_model(self.training_data)

        resultsString = self.print_both("\nCluster results\n", resultsString)
        resultsString = self.print_both(str(evl.cluster_results),
                                        resultsString)

        resultsString = self.print_both("\nClasses to clusters\n",
                                        resultsString)
        resultsString = self.print_both(str(evl.classes_to_clusters),
                                        resultsString)

        buildTimeString = "\nClustered data in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("SimpleKM" + clsexc + "_", resultsString,
                          output_directory)
Example #11
    def run_clustering_task7_manual(self,
                                    output_directory,
                                    clusterer_name,
                                    num_clusters,
                                    seed=10):
        data = Instances.copy_instances(self.training_data)
        data.no_class()
        data.delete_first_attribute()

        clusterer_name_short = clusterer_name.replace("weka.clusterers.", "")
        # build a clusterer and output model
        print("\nBuilding " + clusterer_name_short +
              " Clusterer on training data.")
        buildTimeStart = time.time()
        clusterer = Clusterer(
            classname=clusterer_name,
            options=["-N", str(num_clusters), "-S", str(seed)])
        clusterer.build_clusterer(data)

        resultsString = ""
        resultsString = self.print_both(str(clusterer), resultsString)

        buildTimeString = "Clusterer Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Clusterer
        resultsString = self.print_both("\nClustering data.", resultsString)

        buildTimeStart = time.time()

        evl = ClusterEvaluation()
        evl.set_model(clusterer)
        evl.test_model(self.training_data)

        resultsString = self.print_both("\nCluster results\n", resultsString)
        resultsString = self.print_both(str(evl.cluster_results),
                                        resultsString)

        resultsString = self.print_both("\nClasses to clusters\n",
                                        resultsString)
        resultsString = self.print_both(str(evl.classes_to_clusters),
                                        resultsString)

        buildTimeString = "\nClustered data in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results(
            clusterer_name_short + "_" + "N" + str(num_clusters) + "_S" +
            str(seed), resultsString, output_directory)
Example #12
def create_cluster_model(arff_file, n=10, loader_type="csv", model="kmeans.model"):
    """ create cluster model """
    check_jvm()
    if loader_type == "csv":
        loader = converters.Loader(classname="weka.core.converters.CSVLoader")
    else:
        loader = converters.Loader(classname="weka.core.converters.ArffLoader")

    data = loader.load_file(arff_file)
    clusterer = Clusterer(
        classname="weka.clusterers.SimpleKMeans", options=["-N", str(n)])
    clusterer.build_clusterer(data)
    serialization.write(model, clusterer)
def run_clusterer(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running Clusterer on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    full = load_Arff_file(file)
    full.class_is_first()

    full_withoutclass = load_Arff_file(file)
    #data.delete_first_attribute()

    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_first_attribute()

    dir = dir / "cluster_results_optimum"
    dir.mkdir(parents=True, exist_ok=True)
    # Init clusterer (alternatives kept for reference)
    n = "2"

    if filename_base.startswith("fer2018_"):
        print("Changing number of clusters to 7")
        n = "7"

    # clusterer = Clusterer(classname="weka.clusterers.EM", options=["-S", "10", "-N", n])
    # clusterer = Clusterer(classname="weka.clusterers.FarthestFirst", options=["-S", "10", "-N", n])
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-S", "10", "-N", n])
    clusterer.build_clusterer(data)

    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(full)

    str1 = str(filename_base) + "_cl_res.txt"

    output_results = dir / str1
    output_cluster(evaluation, output_results)
Example #14
    def train_data(self):
        try:
            #helper.print_info("Loading dataset: " + self.datasetName)
            loader = Loader(classname="weka.core.converters.ArffLoader")
            data_train = loader.load_file(self.datasetName)
            data_train.delete_last_attribute()
            clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                                  options=["-N", "2"])
            clusterer.build_clusterer(data_train)
            return clusterer

        except Exception as e:
            print(traceback.format_exc())
            raise
Example #15
    def __init__(self,
                 jobject=None,
                 cluster=None,
                 classname=None,
                 options=None,
                 nominal_input_vars=None,
                 num_nominal_input_labels=None):
        """
        Initializes the estimator. Can be either instantiated via the following priority of parameters:
        1. JB_Object representing a Java Clusterer object
        2. Clusterer pww3 wrapper
        3. classname/options

        :param jobject: the JB_Object representing a Weka cluster to use
        :type jobject: JB_Object
        :param cluster: the cluster wrapper to use
        :type cluster: Clusterer
        :param classname: the classname of the Weka cluster to instantiate
        :type classname: str
        :param options: the command-line options of the Weka cluster to instantiate
        :type options: list
        :param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
        :type num_nominal_input_labels: dict
        """
        if jobject is not None:
            _jobject = jobject
        elif cluster is not None:
            _jobject = cluster.jobject
        elif classname is not None:
            if options is None:
                options = []
            cluster = Clusterer(classname=classname, options=options)
            _jobject = cluster.jobject
        else:
            raise Exception("At least Java classname must be provided!")

        if not is_instance_of(_jobject, "weka.clusterers.Clusterer"):
            raise Exception(
                "Java object does not implement weka.clusterers.Clusterer!")

        super(WekaCluster, self).__init__(_jobject)
        self._cluster = Clusterer(jobject=_jobject)
        self.header_ = None
        # the following references are required for get_params/set_params
        self._classname = classname
        self._options = options
        self._nominal_input_vars = nominal_input_vars
        self._num_nominal_input_labels = num_nominal_input_labels
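
A hedged sketch of the three instantiation routes described in the docstring above, assuming this wrapper is importable as WekaCluster alongside pww3's Clusterer and serialization modules:

# 3. classname/options
wc = WekaCluster(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
# 2. an existing pww3 Clusterer wrapper
wc = WekaCluster(cluster=Clusterer(classname="weka.clusterers.EM"))
# 1. a JB_Object, e.g. a previously serialized model (path is illustrative)
wc = WekaCluster(jobject=serialization.read("kmeans.model"))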
Example #16
def query_instance(attributes, model="kmeans.model"):
    """
        get the cluster for defined attributes
        :params attributes: array or list
        :returns: cluster id
    """
    check_jvm()
    # create instance
    inst = Instance.create_instance(attributes)
    # load model
    obj = serialization.read(model)
    # load cluster and get the cluster_id
    cluster = Clusterer(jobject=obj)
    cluster_id = cluster.cluster_instance(inst)

    return cluster_id
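
Together with create_cluster_model and assign_cluster above, this completes a small train/score workflow. A hedged end-to-end sketch (file names and attribute values are illustrative):

create_cluster_model("train.csv", n=5, loader_type="csv", model="kmeans.model")
cluster_id = query_instance([5.1, 3.5, 1.4, 0.2], model="kmeans.model")
print("assigned cluster:", cluster_id)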
Example #17
 def __init__(self, index=0, inference="ExactInference", ghostAgents=None):
     BustersAgent.__init__(self, index, inference, ghostAgents)
     self.previousDistances = [0, 0, 0, 0]
     jvm.start(max_heap_size="512m")
     self.loader = Loader(classname="weka.core.converters.ArffLoader")
     self.data = self.loader.load_file("data/game_toCluster.arff")
     self.data.delete_last_attribute()
     self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "4", "-I", "500"])
     self.clusterer.build_clusterer(self.data)
     self.inst = ""
     self.data = self.loader.load_file("data/game_toCluster.arff")
     addCluster = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 10 -S 4 -I 500", "-I", "last"])
     addCluster.inputformat(self.data)
     filtered = addCluster.filter(self.data)
     self.f = open('data/addCluster.arff', 'w+')
     self.f.write(str(filtered))
     self.clustered_data = self.classifyData('data/addCluster.arff')
Example #18
def predicaoCluster(matricula, curso, tipo_predicao):

    dados = retornarDadosCurso(curso)
    # select the student's features
    aluno = dados.loc[dados['MATRICULA'] == matricula][:]
    aluno.drop('MATRICULA', axis=1, inplace=True)
    aluno.drop('APROVADO', axis=1, inplace=True)
    aluno.drop('COD_DISCIPLINA', axis=1, inplace=True)
    aluno.drop('SIT_MATRICULA', axis=1, inplace=True)
    aluno = aluno.head(1)

    aluno.to_csv('aluno_temp.csv', index=False)

    from weka.clusterers import Clusterer
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    import weka.core.serialization as serialization

    jvm.start()

    if curso == 'si':
        if tipo_predicao == 'reprovacao':
            model = serialization.read_all("model/kmeans_si_reprovacao.model")
        elif tipo_predicao == 'evasao':
            model = serialization.read_all("model/kmeans_si_evasao.model")
    elif curso == 'eca':
        if tipo_predicao == 'reprovacao':
            model = serialization.read_all("model/kmeans_eca_reprovacao.model")
        elif tipo_predicao == 'evasao':
            model = serialization.read_all("model/kmeans_eca_evasao.model")
    cluster = Clusterer(jobject=model[0])

    loader = Loader(classname="weka.core.converters.CSVLoader")
    dado_aluno = loader.load_file("aluno_temp.csv")
    for aluno in dado_aluno:
        cluster_aluno_pertence = cluster.cluster_instance(aluno)

    #jvm.stop()

    caracteristica = retornarCaracteristicaCluster(curso, tipo_predicao,
                                                   cluster_aluno_pertence)

    return caracteristica
Example #19
 def simpleKMeansTrain(self, dataf, options, mname, temp=True):
     '''
     :param data: -> data to be clustered
     :param options: -> SimpleKMeans options
                   N -> number of clusters
                   A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
                   l -> maximum number of iterations default 500
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> Random number seed (default 10)
           example => ["-N", "10", "-S", "10"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp)
         clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
         clusterer.build_clusterer(data)
         print(clusterer)
         # cluster the data
         for inst in data:
             cl = clusterer.cluster_instance(inst)  # 0-based cluster index
             dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
             print(("cluster=" + str(cl) + ", distribution=" + str(dist)))
         self.saveModel(clusterer, 'skm', mname)
     except Exception as e:
         print(traceback.format_exc())
     finally:
         jvm.stop()
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    # cluster data
    helper.print_info("Clustering data")
    for index, inst in enumerate(data):
        cl = clusterer.cluster_instance(inst)
        dist = clusterer.distribution_for_instance(inst)
        print(str(index+1) + ": cluster=" + str(cl) + ", distribution=" + str(dist))
Example #21
    def dbscanTrain(self, dataf, options, mname, temp=True):
        '''
        :param data: -> data to be clustered
        :param options: -> dbscan options
                      E -> epsilon (default = 0.9)
                      M -> minPoints (default = 6)
                      D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
                      I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
                example => ["-E",  "0.9",  "-M", "6", "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase", "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
        :return:
        '''

        try:
            jvm.start(max_heap_size=self.wHeap)
            data = self.loadData(dataf, temp)
            clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
            clusterDBSCAN.build_clusterer(data)
            print(clusterDBSCAN)
            self.saveModel(clusterDBSCAN, 'dbscan', mname)
            # cluster the data
        except Exception as e:
            print(traceback.format_exc())
    def registerInitialState(self, gameState):
        BustersAgent.registerInitialState(self, gameState)
        self.distancer = Distancer(gameState.data.layout, False)

        # To compute the class values used by the policies.
        self.clusters = 8
        self.classes = 4
        self.classCounts = [[0 for i in range(self.classes)]
                            for j in range(self.clusters)]

        self.classIndex = 2
        self.clusterIndex = 3

        self.readInstances()

        # This will be used to store the training instances.
        self.numInstances = 52
        self.numAttributes = 4
        #self.instances = [[" " for i in range(self.numAttributes)] for j in range(self.numInstances)]
        self.ins = [" " for i in range(self.numInstances)]

        # To use the library we must start the Java virtual machine (JVM)
        jvm.start()

        # Create the model
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(
            "/home/dot/Escritorio/Universidad/Machine Learning/practica 2/Outputs/agent_header.arff"
        )

        self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                                   options=["-N", str(self.clusters)])
        self.clusterer.build_clusterer(data)

        print(self.clusterer)

        # Apply the policy
        self.politicaMax()
Example #23
    def set_params(self, **params):
        """
        Sets the options for the cluster, expects 'classname' and 'options'.

        :param params: the parameter dictionary
        :type params: dict
        """
        if len(params) == 0:
            return
        if "classname" not in params:
            raise Exception("Cannot find 'classname' in parameters!")
        if "options" not in params:
            raise Exception("Cannot find 'options' in parameters!")
        self._classname = params["classname"]
        self._options = params["options"]
        self._cluster = Clusterer(classname=self._classname,
                                  options=self._options)
        self._nominal_input_vars = None
        if "nominal_input_vars" in params:
            self._nominal_input_vars = params["nominal_input_vars"]
        self._num_nominal_input_labels = None
        if "num_nominal_input_labels" in params:
            self._num_nominal_input_labels = params["num_nominal_input_labels"]
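
set_params mirrors the scikit-learn estimator contract and expects both keys at once. A hedged usage sketch, assuming wc is an instance of the surrounding wrapper class:

wc.set_params(classname="weka.clusterers.SimpleKMeans",
              options=["-N", "5", "-S", "42"])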
Example #24
 def emTrain(self, dataf, options, mname, temp=True):
     '''
     :param data: -> data to be clustered
     :param options: -> EM options
                   I -> number of iterations
                   N -> number of clusters
                   M -> Minimum standard deviation for normal density (default=1.0E-6)
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> random seed (default=100)
             example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                                    "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp)
         clusterEM = Clusterer(classname="weka.clusterers.EM", options=options)
         clusterEM.build_clusterer(data)
         print(clusterEM)
         self.saveModel(clusterEM, 'em', mname)
     except Exception as e:
         print(traceback.format_exc())
Example #25
    def dbscanTrain(self, dataf, options, mname, temp=True):
        '''
        :param data: -> data to be clustered
        :param options: -> dbscan options
                      E -> epsilon (default = 0.9)
                      M -> minPoints (default = 6)
                      D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
                      I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
                example => ["-E",  "0.9",  "-M", "6", "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase", "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
        :return:
        '''

        try:
            jvm.start(max_heap_size=self.wHeap)
            data = self.loadData(dataf, temp)
            clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
            clusterDBSCAN.build_clusterer(data)
            print(clusterDBSCAN)
            self.saveModel(clusterDBSCAN, 'dbscan', mname)
            # cluster the data
        except Exception as e:
            print(traceback.format_exc())
        finally:
            jvm.stop()
Example #26
def main():
    """
    Just runs some example code.
    """

    # setup the flow
    helper.print_title("build and save clusterer")
    iris = helper.get_data_dir() + os.sep + "iris_no_class.arff"

    flow = Flow(name="build and save clusterer")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    flow.actors.append(loaddataset)

    train = Train()
    train.config["setup"] = Clusterer(classname="weka.clusterers.SimpleKMeans")
    flow.actors.append(train)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    flow.actors.append(pick)

    console = Console()
    pick.actors.append(console)

    writer = ModelWriter()
    writer.config["output"] = str(
        tempfile.gettempdir()) + os.sep + "simplekmeans.model"
    flow.actors.append(writer)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
Example #27
    def run_SKMeans_137(self):

        # construct output paths
        output_prefix = os.path.split(self.input_path)[-1].split(".")[0]
        print(output_prefix)
        write_date = output_prefix + "." + str(datetime.now().date())
        SKMeans_dir = os.path.join(self.output_dir, "SKMeans")
        eval_path = os.path.join(SKMeans_dir, write_date + ".cl_eval.txt")
        clust_desc_path = os.path.join(SKMeans_dir, write_date + ".cl_descr.txt")
        clust_assign_path = os.path.join(SKMeans_dir, write_date + ".cl_assign.txt")

        # create output dir if it doesn't already exist
        if not os.path.exists(SKMeans_dir):
            os.makedirs(SKMeans_dir)

        # clone data and build clusters
        # data_clone = copy.deepcopy(self.data_loaded)
        data_clone = self.data_loaded
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "137"])
        clusterer.build_clusterer(data_clone)

        # cluster evaluation
        evaluation = ClusterEvaluation()
        evaluation.set_model(clusterer)
        evaluation.test_model(data_clone)
        with open(eval_path, 'w') as outfile:
            outfile.write("number of clusters: \t" + str(evaluation.num_clusters) + "\n")
            outfile.write("log likelihood: \t" + str(evaluation.log_likelihood) + "\n")
            outfile.write("cluster assignments: \t" + str(evaluation.cluster_assignments) + "\n")
            outfile.write("***********************\n")
            outfile.write("SKMeans Cluster Evaluation Results\n")  # header
            outfile.write(str(evaluation.cluster_results) + "\n")

        # per-instance description of clusters
        with open(clust_desc_path, 'w') as outfile:
            outfile.write(",".join(["cluster_num", "distribution\n"]))  # header
            for inst in data_clone:
                cl = clusterer.cluster_instance(inst)  # 0-based cluster index
                dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
                outfile.write(",".join([str(cl), str(dist)]))
                outfile.write("\n")

        # cluster assignment by row
        with open(clust_assign_path, 'w') as outfile:
            outfile.write(",".join(["row_num", "SKMeans\n"]))  # header
            for i, assignment in enumerate(evaluation.cluster_assignments):
                outfile.write(",".join([str(i), str(assignment)]))
                outfile.write("\n")

        return
        
def main():
    """
    Just runs some example code.
    """

    # setup the flow
    helper.print_title("Cross-validate clusterer")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="cross-validate clusterer")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    loaddataset = LoadDataset()
    flow.actors.append(loaddataset)

    flter = Filter()
    flter.name = "Remove class"
    flter.config["filter"] = filters.Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "last"])
    flow.actors.append(flter)

    cv = CrossValidate()
    cv.config["setup"] = Clusterer(classname="weka.clusterers.EM")
    flow.actors.append(cv)

    console = Console()
    console.config["prefix"] = "Loglikelihood: "
    flow.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
Example #29
def command():
    jvm.start()

    import weka.core.converters as converters
    clusters = request.form['clusternum']
    a1 = request.form['firstcol']
    a2 = request.form['secondcol']
    # print clusters
    # print a1
    # print a2
    if (a1 == 'B' and a2 == 'C'):
        data = converters.load_any_file("Data.csv")
    elif (a1 == 'B' and a2 == 'D'):
        data = converters.load_any_file("Data1.csv")
    elif (a1 == 'C' and a2 == 'D'):
        data = converters.load_any_file("Data2.csv")
    elif (a1 == 'C' and a2 == 'E'):
        data = converters.load_any_file("Data3.csv")
    elif (a1 == 'D' and a2 == 'E'):
        data = converters.load_any_file("Data4.csv")

    #data.class_is_last()

    print(data)

    # from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    # search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "5"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "2", "-E", "1"])
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)
    f = open("filename.txt", "w")
    from weka.clusterers import Clusterer
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", "{}".format(clusters)])
    clusterer.build_clusterer(data)

    print(clusterer)
    f.write(str(clusterer))
    # cluster the data
    for inst in data:
        cl = clusterer.cluster_instance(inst)  # 0-based cluster index
        dist = clusterer.distribution_for_instance(
            inst)  # cluster membership distribution
        print("cluster=" + str(cl) + ", distribution=" + str(dist))
        f.write("cluster=" + str(cl) + ", distribution=" + str(dist))

    return render_template("output.html")
    f.close()
Example #30
 def loadClusterModel(self, method, mname):
     finalname = "%s_%s.model" % (method, mname)
     cluster = Clusterer(jobject=serialization.read(os.path.join(self.modelDir, finalname)))
     logger.info('[%s] : [INFO] Loaded clusterer model %s ',
                 datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), finalname)
     return cluster
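
A hedged usage sketch, assuming engine is an instance of the surrounding class with modelDir set and a model saved earlier via its saveModel counterpart:

skm = engine.loadClusterModel("skm", "experiment1")  # loads skm_experiment1.model from modelDir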
##data.delete_last_attribute()
##saver.save_file(data, "data_with_class_type.arff")


### Delete the attributes that are not required
### (indices shift after each delete, so calling delete_attribute(2) repeatedly removes consecutive columns)
data.delete_attribute(2)
data.delete_attribute(2)
#####Uncomment to save the file with has serviceId as class, forkV and ForkW as attributes
###saver.save_file(data, "data_with_class_serviceID.arff")
data.delete_attribute(2)

#saver.save_file(data,"data.arff")
num_clusters = "6"   #Number of clusters for k mean

##Performing clustering
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", num_clusters])
clusterer.build_clusterer(data)

for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)   # cluster membership distribution
    #print("cluster=" + str(cl) + ", distribution=" + str(dist))

#########Getting the data about the clustered instances
evaluation = ClusterEvaluation()
evaluation.set_model(clusterer)
evaluation.test_model(data)
print(evaluation.cluster_results)
#print("# clusters: " + str(evaluation.num_clusters))
#print("log likelihood: " + str(evaluation.log_likelihood))
#print("cluster assignments:\n" + str(evaluation.cluster_assignments))
Example #32
eca.drop('N_FALTAS', axis=1, inplace=True)
eca.drop('COD_DISCIPLINA', axis=1, inplace=True)
eca.drop('APROVADO', axis=1, inplace=True)

eca.to_csv('temp.csv', index=False)

from weka.clusterers import Clusterer
import weka.core.jvm as jvm
import weka.core.serialization as serialization

jvm.start()

# run the clustering while varying the number of clusters from 1 to 9
for i in range(1, 10):
    print('************** Number of clusters: ' + str(i))
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", str(i)])
    clusterer.build_clusterer(eca)
    print(clusterer)

clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                      options=["-N", "4"])
clusterer.build_clusterer(eca)
print(clusterer)
serialization.write("model/kmeans_eca_reprovacao.model", clusterer)

# read the model back
'''
objects = serialization.read_all("cluster.model")
clusterer = Clusterer(jobject=objects[0])

data_aluno = loader.load_file("aluno_temp.csv")
for instancia in data_aluno:
    cluster_aluno = clusterer.cluster_instance(instancia)
'''
Example #33
def main():
    """
    Just runs some example code.
    """

    # setup the flow
    count = 50
    helper.print_title("build clusterer incrementally")
    iris = helper.get_data_dir() + os.sep + "iris.arff"

    flow = Flow(name="build clusterer incrementally")

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    flow.actors.append(filesupplier)

    initcounter = InitStorageValue()
    initcounter.config["storage_name"] = "counter"
    initcounter.config["value"] = 0
    flow.actors.append(initcounter)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    flow.actors.append(loaddataset)

    remove = Filter(name="remove class attribute")
    remove.config["setup"] = filters.Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "last"])
    flow.actors.append(remove)

    inccounter = UpdateStorageValue()
    inccounter.config["storage_name"] = "counter"
    inccounter.config["expression"] = "{X} + 1"
    flow.actors.append(inccounter)

    train = Train()
    train.config["setup"] = Clusterer(classname="weka.clusterers.Cobweb")
    flow.actors.append(train)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    pick.config["switch"] = True
    flow.actors.append(pick)

    tee = Tee(name="output model every " + str(count) + " instances")
    tee.config["condition"] = "@{counter} % " + str(count) + " == 0"
    flow.actors.append(tee)

    trigger = Trigger(name="output # of instances")
    tee.actors.append(trigger)

    getcounter = GetStorageValue()
    getcounter.config["storage_name"] = "counter"
    trigger.actors.append(getcounter)

    console = Console()
    console.config["prefix"] = "# of instances: "
    trigger.actors.append(console)

    console = Console(name="output model")
    tee.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
def main():
    """
    Just runs some example code.
    """

    # setup the flow
    helper.print_title("cluster data")
    iris = helper.get_data_dir() + os.sep + "iris_no_class.arff"
    clsfile = str(tempfile.gettempdir()) + os.sep + "simplekmeans.model"

    flow = Flow(name="cluster data")

    start = Start()
    flow.actors.append(start)

    build_save = Trigger()
    build_save.name = "build and save clusterer"
    flow.actors.append(build_save)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    build_save.actors.append(filesupplier)

    loaddataset = LoadDataset()
    build_save.actors.append(loaddataset)

    ssv = SetStorageValue()
    ssv.config["storage_name"] = "data"
    build_save.actors.append(ssv)

    train = Train()
    train.config["setup"] = Clusterer(classname="weka.clusterers.SimpleKMeans")
    build_save.actors.append(train)

    ssv = SetStorageValue()
    ssv.config["storage_name"] = "model"
    build_save.actors.append(ssv)

    pick = ContainerValuePicker()
    pick.config["value"] = "Model"
    build_save.actors.append(pick)

    console = Console()
    console.config["prefix"] = "built: "
    pick.actors.append(console)

    writer = ModelWriter()
    writer.config["output"] = clsfile
    build_save.actors.append(writer)

    pred_serialized = Trigger()
    pred_serialized.name = "make predictions (serialized model)"
    flow.actors.append(pred_serialized)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    pred_serialized.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    pred_serialized.actors.append(loaddataset)

    predict = Predict()
    predict.config["model"] = clsfile
    pred_serialized.actors.append(predict)

    console = Console()
    console.config["prefix"] = "serialized: "
    pred_serialized.actors.append(console)

    pred_storage = Trigger()
    pred_storage.name = "make predictions (model from storage)"
    flow.actors.append(pred_storage)

    filesupplier = FileSupplier()
    filesupplier.config["files"] = [iris]
    pred_storage.actors.append(filesupplier)

    loaddataset = LoadDataset()
    loaddataset.config["incremental"] = True
    pred_storage.actors.append(loaddataset)

    predict = Predict()
    predict.config["storage_name"] = "model"
    pred_storage.actors.append(predict)

    console = Console()
    console.config["prefix"] = "storage: "
    pred_storage.actors.append(console)

    # run the flow
    msg = flow.setup()
    if msg is None:
        print("\n" + flow.tree + "\n")
        msg = flow.execute()
        if msg is not None:
            print("Error executing flow:\n" + msg)
    else:
        print("Error setting up flow:\n" + msg)
    flow.wrapup()
    flow.cleanup()
Example #35
jvm.start()

# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.set_inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
flt.set_inputformat(filtered)
addcl = flt.filter(filtered)
print(addcl)
Example #36
import weka.core.packages as packages

dataDir = os.path.join(os.path.dirname(os.path.abspath('')), 'data')
modelDir = os.path.join(os.path.dirname(os.path.abspath('')), 'models')

dformat = DataFormatter(dataDir)

dformat.dict2arff(os.path.join(dataDir, 'System.csv'),
                  os.path.join(dataDir, 'System.arff'))

#Arff_file = os.path.join(dataDir, 'System.arff')

jvm.start(packages=True)

data = converters.load_any_file(os.path.join(dataDir, 'System.arff'))
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                      options=["-N", "10", "-S", "10"])
clusterer.build_clusterer(data)

# print clusterer
# cluster the data
# for inst in data:
#     cl = clusterer.cluster_instance(inst)  # 0-based cluster index
#     dist = clusterer.distribution_for_instance(inst)   # cluster membership distribution
#     print("cluster=" + str(cl) + ", distribution=" + str(dist))
#     print inst

# serialization.write(os.path.join(modelDir, 'SKM.model'), clusterer)

clusterEM = Clusterer(classname="weka.clusterers.EM",
                      options=["-I", "1000", "-N", "6", "-X", "10", "-max", "-1",
                               "-ll-cv", "1.0E-6", "-ll-iter", "1.0E-6", "-M", "1.0E-6",
                               "-num-slots", "1", "-S", "100"])
clusterEM.build_clusterer(data)
Example #38
# load weather.numeric
fname = data_dir + os.sep + "weather.numeric.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.set_options(["-S", str(seed)])
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.get_cluster_results())

# build XMeans
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType", options=["-T", "numeric", "-V"])
flt.set_inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
Example #39
# load weather.numeric
fname = data_dir + os.sep + "weather.numeric.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.options = ["-S", str(seed)]
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.cluster_results)

# build XMeans
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType", options=["-T", "numeric", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
Example #40
# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove",
             options=["-R", "last"])
flt.inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.cluster_results)
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
flt.inputformat(filtered)
addcl = flt.filter(filtered)
print(addcl)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)
    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, data, inst_no=True)

    # using a filtered clusterer
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    fclusterer = FilteredClusterer()
    fclusterer.clusterer = clusterer
    fclusterer.filter = remove
    fclusterer.build_clusterer(data)
    print(fclusterer)

    # load a dataset incrementally and build clusterer incrementally
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    clusterer = Clusterer("weka.clusterers.Cobweb")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(iris_inc)
    iris_filtered = remove.outputformat()
    clusterer.build_clusterer(iris_filtered)
    for inst in loader:
        remove.input(inst)
        inst_filtered = remove.output()
        clusterer.update_clusterer(inst_filtered)
    clusterer.update_finished()
    print(clusterer.to_commandline())
    print(clusterer)
    print(clusterer.graph)
    plg.plot_dot_graph(clusterer.graph)