Example #1
def simpleKMeansTrain(self, dataf, options, mname, temp=True):
    '''
    :param dataf: -> data to be clustered
    :param options: -> SimpleKMeans options
                 -N -> number of clusters
                 -A -> distance function to use (default: "weka.core.EuclideanDistance -R first-last")
                 -I -> maximum number of iterations (default 500)
         -num-slots -> number of execution slots; 1 means no parallelism
                 -S -> random number seed (default 10)
          example => ["-N", "10", "-S", "10"]
    :param mname: -> name under which the trained model is saved
    :param temp: -> passed through to loadData()
    :return:
    '''
    try:
        jvm.start(max_heap_size=self.wHeap)
        data = self.loadData(dataf, temp=temp)
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
        clusterer.build_clusterer(data)
        print(clusterer)
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
        self.saveModel(clusterer, 'skm', mname)
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()
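
For reference, the same train-and-save flow can be written as a standalone script without the surrounding class, using only python-weka-wrapper calls. A minimal sketch; the input file and model path are illustrative assumptions, not part of the original snippet:

import weka.core.jvm as jvm
import weka.core.serialization as serialization
from weka.core.converters import Loader
from weka.clusterers import Clusterer

jvm.start(max_heap_size="512m")
try:
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("iris.arff")  # illustrative input file
    data.delete_last_attribute()          # drop the class attribute before clustering
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans",
                       options=["-N", "10", "-S", "10"])
    kmeans.build_clusterer(data)
    serialization.write("skm.model", kmeans)  # persist the trained model (illustrative path)
finally:
    jvm.stop()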
Example #2
import os
from weka.core.converters import Loader
from weka.clusterers import Clusterer
# `helper` is the utility module shipped with the python-weka-wrapper examples
import wekaexamples.helper as helper

def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    # cluster data
    helper.print_info("Clustering data")
    for index, inst in enumerate(data):
        cl = clusterer.cluster_instance(inst)
        dist = clusterer.distribution_for_instance(inst)
        print(str(index+1) + ": cluster=" + str(cl) + ", distribution=" + str(dist))
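
A natural next step, shown in the library's own clustering examples, is to score the fitted model with ClusterEvaluation. A minimal sketch that reuses `clusterer` and `data` from above (self-evaluation on the training data, so the numbers are optimistic):

from weka.clusterers import ClusterEvaluation

# evaluate the trained clusterer; here on the training data itself
evaluation = ClusterEvaluation()
evaluation.set_model(clusterer)
evaluation.test_model(data)
print("# clusters: " + str(evaluation.num_clusters))
print("cluster assignments:\n" + str(evaluation.cluster_assignments))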
Example #3
import weka.core.serialization as serialization
from weka.clusterers import Clusterer
from weka.core.dataset import Instance

def assign_cluster(file_location, file_out="clustered.csv", model="kmeans.model", last_filename=False):
    # read_csv_file and check_jvm are this script's own helpers
    data = read_csv_file(file_location)
    check_jvm()
    # load the serialized clusterer
    obj = serialization.read(model)
    clusterer = Clusterer(jobject=obj)

    # create file with cluster group
    with open(file_out, 'w') as output:
        for index, attrs in enumerate(data):
            tmp = []
            if last_filename:
                inst = Instance.create_instance(attrs[:-2])
            else:
                inst = Instance.create_instance(attrs[1:])

            pred = clusterer.cluster_instance(inst)
            dist = clusterer.distribution_for_instance(inst)

            if last_filename:
                tmp.append(attrs[-1])
                tmp.append(pred)
                tmp.extend(attrs[:-2])
            else:
                tmp.append(attrs[0])
                tmp.append(pred)
                tmp.extend(attrs[1:])

            print(str(index + 1) + ": cluster=" +
                  str(pred) + ", distribution=" + str(dist))
            output.write('%s\n' % ','.join(map(str, tmp)))
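
Assuming a k-means model was serialized beforehand (as in the other examples on this page), the function above could be driven like this; the CSV and model file names are illustrative:

import weka.core.jvm as jvm

jvm.start()
try:
    # illustrative file names; the model must have been trained on
    # compatible attributes
    assign_cluster("measurements.csv", file_out="clustered.csv",
                   model="kmeans.model", last_filename=False)
finally:
    jvm.stop()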
Example #5
    def run_SKMeans_137(self):

        # construct output paths
        output_prefix = os.path.split(self.input_path)[-1].split(".")[0]
        print(output_prefix)
        write_date = output_prefix + "." + str(datetime.now().date())
        SKMeans_dir = os.path.join(self.output_dir, "SKMeans")
        eval_path = os.path.join(SKMeans_dir, write_date + ".cl_eval.txt")
        clust_desc_path = os.path.join(SKMeans_dir, write_date + ".cl_descr.txt")
        clust_assign_path = os.path.join(SKMeans_dir, write_date + ".cl_assign.txt")

        # create the output dir if it doesn't already exist
        if not os.path.exists(SKMeans_dir):
            os.makedirs(SKMeans_dir)

        # clone data and build clusters
        # data_clone = copy.deepcopy(self.data_loaded)
        data_clone = self.data_loaded
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "137"])
        clusterer.build_clusterer(data_clone)

        # cluster evaluation
        evaluation = ClusterEvaluation()
        evaluation.set_model(clusterer)
        evaluation.test_model(data_clone)
        with open(eval_path, 'w') as outfile:
            outfile.write("number of clusters: \t" + str(evaluation.num_clusters) + "\n")
            outfile.write("log likelihood: \t" + str(evaluation.log_likelihood) + "\n")
            outfile.write("cluster assignments: \t" + str(evaluation.cluster_assignments) + "\n")
            outfile.write("***********************\n")
            outfile.write("SKMeans Cluster Evaluation Results\n")  # header
            outfile.write(str(evaluation.cluster_results) + "\n")

        # per-instance description of clusters
        with open(clust_desc_path, 'w') as outfile:
            outfile.write("cluster_num,distribution\n")  # header
            for inst in data_clone:
                cl = clusterer.cluster_instance(inst)  # 0-based cluster index
                dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
                outfile.write(",".join([str(cl), str(dist)]))
                outfile.write("\n")

        # cluster assignment by row
        with open(clust_assign_path, 'w') as outfile:
            outfile.write("row_num,SKMeans\n")  # header
            for i, assignment in enumerate(evaluation.cluster_assignments):
                outfile.write(",".join([str(i), str(assignment)]))
                outfile.write("\n")

        return
        
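If matplotlib is installed, the evaluation computed in run_SKMeans_137 can also be rendered graphically with python-weka-wrapper's plotting helper. A minimal sketch; the output file name is illustrative:

import weka.plot.clusterers as plc

# scatter plot of the data coloured by cluster assignment; requires matplotlib
plc.plot_cluster_assignments(evaluation, data_clone, atts=[], inst_no=True,
                             outfile="cluster_assignments.png", wait=False)
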
Example #6
import weka.core.jvm as jvm
from flask import request, render_template

def command():
    jvm.start()

    import weka.core.converters as converters
    clusters = request.form['clusternum']
    a1 = request.form['firstcol']
    a2 = request.form['secondcol']
    # print(clusters)
    # print(a1)
    # print(a2)
    if a1 == 'B' and a2 == 'C':
        data = converters.load_any_file("Data.csv")
    elif a1 == 'B' and a2 == 'D':
        data = converters.load_any_file("Data1.csv")
    elif a1 == 'C' and a2 == 'D':
        data = converters.load_any_file("Data2.csv")
    elif a1 == 'C' and a2 == 'E':
        data = converters.load_any_file("Data3.csv")
    elif a1 == 'D' and a2 == 'E':
        data = converters.load_any_file("Data4.csv")
    else:
        # without this branch, `data` would be undefined for other inputs
        raise ValueError("unsupported column combination: %s, %s" % (a1, a2))

    #data.class_is_last()

    print(data)

    # from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    # search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "5"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "2", "-E", "1"])
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)
    from weka.clusterers import Clusterer
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", "{}".format(clusters)])
    clusterer.build_clusterer(data)

    print(clusterer)
    with open("filename.txt", "w") as f:
        f.write(str(clusterer))
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
            f.write("cluster=" + str(cl) + ", distribution=" + str(dist))

    return render_template("output.html")
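
One caveat with this handler: python-weka-wrapper's JVM can only be started once per Python process (later jvm.start() calls just log a warning), and jvm.stop() would kill it for every subsequent request. A sketch of one way to manage the lifecycle instead, starting the JVM once at application startup with atexit as an assumed shutdown hook:

import atexit
import weka.core.jvm as jvm

jvm.start()                # start the JVM once when the app process boots
atexit.register(jvm.stop)  # shut it down when the process exits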
Example #7
#     cl2 = clusterEM.cluster_instance(inst)
#     dist2 = clusterEM.distribution_for_instance(inst)
#     print("cluster=" + str(cl2) + ", distribution=" + str(dist2))
#     print(inst)
#
clusterDBSCAN = Clusterer(
    classname="weka.clusterers.DBSCAN",
    options=[
        "-E", "0.9", "-M", "6", "-I",
        "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
        "-D",
        "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"
    ])
clusterDBSCAN.build_clusterer(data)

serialization.write(os.path.join(modelDir, "dbscan.model"), clusterDBSCAN)
cluster = Clusterer(
    jobject=serialization.read(os.path.join(modelDir, "dbscan.model")))
# print(clusterDBSCAN)
# print(clusterDBSCAN.number_of_clusters)
for inst in data:
    cl3 = cluster.cluster_instance(inst)
    dist3 = cluster.distribution_for_instance(inst)
    print(("cluster=" + str(cl3) + ", distribution=" + str(dist3)))

# for inst in data:
#     cl3 = clusterDBSCAN.cluster_instance(inst)
#     dist3 = clusterDBSCAN.distribution_for_instance(inst)
#     print ("cluster=" + str(cl3) + ", distribution=" + str(dist3))
jvm.stop()
data.delete_attribute(2)
data.delete_attribute(2)
#####Uncomment to save the file with has serviceId as class, forkV and ForkW as attributes
###saver.save_file(data, "data_with_class_serviceID.arff")
data.delete_attribute(2)

#saver.save_file(data,"data.arff")
num_clusters = "6"   # number of clusters for k-means

##Performing clustering
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", num_clusters])
clusterer.build_clusterer(data)

for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)   # cluster membership distribution
    #print("cluster=" + str(cl) + ", distribution=" + str(dist))

#########Getting the data about the clustered instances
evaluation = ClusterEvaluation()
evaluation.set_model(clusterer)
evaluation.test_model(data)
print(evaluation.cluster_results)
#print("# clusters: " + str(evaluation.num_clusters))
#print("log likelihood: " + str(evaluation.log_likelihood))
#print("cluster assignments:\n" + str(evaluation.cluster_assignments))
#plc.plot_cluster_assignments(evaluation, data,[],True)

####Using WEKA files to get the required results by calling them through this script

#########Calling the WEKA GUI to display the clusters
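
The GUI call itself is not shown above; as a lightweight text alternative, the cluster sizes can be summarized straight from the assignments computed earlier. A minimal sketch reusing `evaluation` from above:

from collections import Counter

# tally how many instances fell into each cluster (0-based indices)
sizes = Counter(int(a) for a in evaluation.cluster_assignments)
for cluster_idx in sorted(sizes):
    print("cluster %d: %d instances" % (cluster_idx, sizes[cluster_idx]))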