class ML_Export_TS:
    def __init__(self, use_scikit=True):
        self.dc = DataClass()
        self.data = []
        self.node_data = []
        self.assignments_series = []
        self.min_final = None
        self.max_final = None
        self.files = [
            f for f in listdir("data/sensors")
            if isfile(join("data/sensors", f))
        ]
        print(self.files)
        self.n_nodes = len(self.files)
        self.n_series_disp = 10
        # self.i_run = int(self.n_nodes/2)
        self.i_run2 = 1
        # self.use_previous_cluster_data = False
        self.centroids = None
        self.final_centroids = None
        self.final_clusters = None
        self.clusters = []
        self.node_centroids = []
        self.partial_sample_index = 0
        self.use_scikit = use_scikit

    def set_lib(self, use_scikit):
        self.use_scikit = use_scikit

    def init(self):
        self.final_centroids = None
        self.centroids = None
        self.read_data()

    def read_data(self):
        """
        read data from files
        each file has the data for a measurement node over a time frame of n days, for every hour
        :return:
        """
        self.data = []
        self.node_data = []
        for i, f in enumerate(self.files[0:self.n_nodes]):
            # print(str(i) + ". reading: " + f)
            fdata = self.dc.read_data(join("data/sensors/", f))
            data = copy.copy(fdata)
            self.data.append(data)
            node = Constants.NODE_MODEL
            node["id"] = i
            self.node_data.append(copy.deepcopy(node))

    def export_ts(self, node):
        self.dc.write_data("data/output/" + str(node) + ".csv",
                           self.data[node]["series"], 2)
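
# Usage sketch (illustrative, not part of the original module): export the time
# series of node 0 to CSV. Assumes DataClass and Constants are importable as the
# class above expects and that the data/sensors files are present.
if __name__ == "__main__":
    exporter = ML_Export_TS()
    exporter.init()        # load every file from data/sensors
    exporter.export_ts(0)  # write data/output/0.csv for node 0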
def MakePlan(Parameters):
    STARTTIME = time.time()
    # makedb()
    # print "dbCreated: ", time.time()-STARTTIME
    Data = DataClass(Parameters)
    # print time.time()-STARTTIME
    emptyRoute = np.empty((Data.DAYS, 2), dtype=int)
    for i in range(Parameters.days):
        for j in range(2):
            emptyRoute[i, j] = Data.n + 2 * i + j
    rmvd = [[]] * Data.DAYS
    Plan = PlanVariables(emptyRoute, Data)
    heuristicResponse = Heuristic(Plan, rmvd, Data, Parameters.timeMultiplier)
    newPlan = heuristicResponse[0]
    bestPlan = newPlan
    bestObjective = heuristicResponse[1]
    iterations = 0
    # print time.time()-STARTTIME
    while iterations < MAXITERATIONS and (time.time() - STARTTIME) < 25:
        metaheu = 1
        while metaheu <= 3:
            if metaheu == 1:
                meta = MetaH1(newPlan.route)
            if metaheu == 2:
                meta = MetaH2(newPlan.route)
            if metaheu == 3:
                meta = MetaH3(newPlan.route)
            Plan = PlanVariables(meta[0], Data)
            rmvd = meta[1]
            heuristicResponse = Heuristic(Plan, rmvd, Data, Parameters.timeMultiplier)
            newPlan = heuristicResponse[0]
            newObjective = heuristicResponse[1]
            if newObjective > bestObjective:
                bestObjective = newObjective
                bestPlan = newPlan
                metaheu = 1
                iterations = 0
            else:
                metaheu = metaheu + 1
                iterations = iterations + 1
        # print time.time()-STARTTIME
    # print bestObjective
    # print bestPlan.route
    # return (no_of_locations,names,latitudes,longitudes,start,end,free)
    return [bestPlan, Data]
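
# Usage sketch (illustrative, not part of the original source). MakePlan expects
# the project's Parameters object (it reads Parameters.days and
# Parameters.timeMultiplier and forwards the object to DataClass) and relies on
# PlanVariables, Heuristic, MetaH1-MetaH3 and MAXITERATIONS defined in this module.
#
#   best_plan, data = MakePlan(parameters)  # `parameters` built by the caller
#   print(best_plan.route)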
def __init__(self, use_scikit=True):
    self.dc = DataClass()
    self.data = []
    self.node_data = []
    self.assignments_series = []
    self.min_final = None
    self.max_final = None
    self.files = []
    self.n_nodes = 0
    self.n_series_disp = 10
    # self.i_run = int(self.n_nodes/2)
    self.i_run2 = 1
    # self.use_previous_cluster_data = False
    self.centroids = None
    self.final_centroids = None
    self.final_clusters = None
    self.clusters = []
    self.node_centroids = []
    self.partial_sample_index = 0
    self.use_scikit = use_scikit
def __init__(self, parent=None):
    super(mainWindow, self).__init__(parent)

    ## ---------------------- TAB 1 --------------------------------------
    self.timer = QTimer()
    self.timer.timeout.connect(self.tick)
    self.timer.start(1000)

    self.RunPlots = False
    self.NoOfPlots = 8
    self.PlotNo = {}
    self.unitNo = {}
    self.comboColor2 = {}
    self.comboStyle2 = {}
    self.sensorData = {}
    self.checkboxShow = {}
    self.comboCOMport = 1
    self.comboBaudRate = 9600
    self.filled = {}
    self.time_var = 0
    self.data1_arr = []
    self.plotColor = ['k', 'k', 'b', 'r', 'y', 'g', 'b', 'r', 'y', 'g']
    self.plotStyle = ['-', '-', '-', '-', '-', '-', '-', '-', '-', '-']
    self.usePlot = ['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']

    # Tab 1
    self.tab1 = QtGui.QWidget()
    self.addTab(self.tab1, "Incoming Data")
    self.figure = plt.figure(figsize=(30, 15))
    self.resize(1200, 700)
    self.canvas = FigureCanvas(self.figure)

    # Labels
    l1 = QLabel()
    l2 = QLabel()
    l3 = QLabel()
    l4 = QLabel()
    l5 = QLabel()
    l6 = QLabel()
    l7 = QLabel()
    l1.setText("Show")
    l2.setText("Unit No")
    l3.setText("Data")
    l4.setText("Plot No")
    l5.setText("Offset")
    l6.setText("Plot Color")
    l7.setText("Plot Style")
    l1.setAlignment(Qt.AlignLeft)
    l2.setAlignment(Qt.AlignLeft)
    l3.setAlignment(Qt.AlignLeft)
    l4.setAlignment(Qt.AlignLeft)
    l5.setAlignment(Qt.AlignLeft)
    l6.setAlignment(Qt.AlignLeft)
    l7.setAlignment(Qt.AlignLeft)

    ## Create grid for the plot configuration rows
    grid = QGridLayout()
    grid.addWidget(l1, 1, 1)
    grid.addWidget(l2, 1, 2)
    grid.addWidget(l3, 1, 3)
    grid.addWidget(l4, 1, 4)
    grid.addWidget(l5, 1, 5)
    grid.addWidget(l6, 1, 6)
    grid.addWidget(l7, 1, 7)

    for i in range(2, self.NoOfPlots + 1):
        # Checkboxes
        self.checkboxShow[i] = QtGui.QCheckBox('', self)
        self.checkboxShow[i].setChecked(True)

        # Combo box 1 - Plot nr
        self.PlotNo[i] = QtGui.QComboBox(self)
        self.PlotNo[i].addItem("1")
        self.PlotNo[i].addItem("2")
        self.PlotNo[i].setFixedWidth(50)

        # Combo box 2 - Slave nr
        self.unitNo[i] = QtGui.QComboBox(self)
        self.unitNo[i].addItem("1")
        self.unitNo[i].addItem("2")
        self.unitNo[i].addItem("3")
        self.unitNo[i].addItem("4")
        self.unitNo[i].addItem("5")
        self.unitNo[i].addItem("6")
        self.unitNo[i].addItem("7")
        self.unitNo[i].setFixedWidth(50)

        # Combo box 3 - Sensor Data
        self.sensorData[i] = QtGui.QComboBox(self)
        self.sensorData[i].addItem("Temperature 1")
        self.sensorData[i].addItem("Temperature 2")
        self.sensorData[i].addItem("Temperature 3")
        self.sensorData[i].addItem("Humidity 1")
        self.sensorData[i].addItem("Light 1")
        self.sensorData[i].setFixedWidth(150)

        # Offset
        line = QtGui.QLineEdit(self)
        line.setFixedWidth(50)

        # Plot Color
        colorPath = "C:/Users/KWFO/Desktop/Python_GUI/plot_colors/"
        self.comboColor2[i] = QtGui.QComboBox(self)
        self.comboColor2[i].addItem(QIcon(colorPath + "black.png"), "")
        self.comboColor2[i].addItem(QIcon(colorPath + "blue.png"), "")
        self.comboColor2[i].addItem(QIcon(colorPath + "red1.png"), "")
        # self.comboColor2[i].addItem(QIcon(colorPath + "yellow1.png"), "")
        self.comboColor2[i].addItem(QIcon(colorPath + "green.png"), "")
        self.comboColor2[i].addItem(QIcon(colorPath + "orange.png"), "")
        self.comboColor2[i].addItem(QIcon(colorPath + "magenta.png"), "")
        self.comboColor2[i].addItem(QIcon(colorPath + "cyan2.png"), "")
        self.comboColor2[i].setFixedWidth(50)
        # Set a different color for each row at startup
        self.comboColor2[i].setCurrentIndex(i - 2)

        # Plot Style
        self.comboStyle2[i] = QtGui.QComboBox(self)
        self.comboStyle2[i].addItem("solid")
        self.comboStyle2[i].addItem("dashed")
        self.comboStyle2[i].addItem("dots")
        self.comboStyle2[i].addItem("solid + dots")
        self.comboStyle2[i].setFixedWidth(90)
        grid.addWidget(self.checkboxShow[i], i, 1)
        grid.addWidget(self.unitNo[i], i, 2)
        grid.addWidget(self.sensorData[i], i, 3)
        grid.addWidget(self.PlotNo[i], i, 4)
        grid.addWidget(line, i, 5)
        grid.addWidget(self.comboColor2[i], i, 6)
        grid.addWidget(self.comboStyle2[i], i, 7)

    b1 = QPushButton("Plot incoming data")
    b1.clicked.connect(self.b1_clicked)
    b1.setFixedHeight(40)
    b1.setFixedWidth(125)

    b2 = QPushButton("Stop plotting")
    b2.clicked.connect(self.b2_clicked)
    b2.setFixedHeight(40)
    b2.setFixedWidth(125)

    serial_Setup = QGridLayout()
    com_port = QLabel()
    com_port.setText("COM Port")
    baudrate = QLabel()
    baudrate.setText("Baud Rate")

    self.comboCOMport = QtGui.QComboBox(self)
    self.comboCOMport.addItem("COM1")
    self.comboCOMport.addItem("COM2")
    self.comboCOMport.addItem("COM3")
    self.comboCOMport.addItem("COM4")
    self.comboCOMport.addItem("COM5")
    self.comboCOMport.addItem("COM6")
    self.comboCOMport.addItem("COM7")
    self.comboCOMport.addItem("COM8")
    self.comboCOMport.addItem("COM9")

    self.comboBaudRate = QtGui.QComboBox(self)
    self.comboBaudRate.addItem("9600")
    self.comboBaudRate.addItem("18200")
    self.comboBaudRate.addItem("36400")
    self.comboBaudRate.addItem("72800")
    self.comboBaudRate.addItem("115600")

    serial_Setup.addWidget(com_port, 1, 1)
    serial_Setup.addWidget(self.comboCOMport, 2, 1)
    serial_Setup.addWidget(baudrate, 1, 2)
    serial_Setup.addWidget(self.comboBaudRate, 2, 2)

    buttons = QtGui.QHBoxLayout()
    buttons.addWidget(b1)
    buttons.addWidget(b2)
    buttons.addSpacing(100)
    buttons.addLayout(serial_Setup)
    buttons.addStretch()

    self.show_plot_1 = QtGui.QCheckBox('Show Plot 1', self)
    self.show_plot_1.setChecked(True)
    self.show_plot_2 = QtGui.QCheckBox('Show Plot 2', self)
    self.show_plot_2.setChecked(True)

    # Input data on left side of screen
    input_data = QtGui.QVBoxLayout()
    input_data.addLayout(buttons)
    input_data.addSpacing(20)
    input_data.addWidget(self.show_plot_1)
    input_data.addWidget(self.show_plot_2)
    input_data.addSpacing(40)
    input_data.addLayout(grid)
    input_data.addStretch()

    hbox = QtGui.QHBoxLayout()
    hbox.addLayout(input_data)
    hbox.addWidget(self.canvas)
    self.tab1.setLayout(hbox)

    ## ---------------------------------------------------
    # Tab 2
    self.txt_data = {}
    self.txt_data2 = {}
    self.unit_1 = {}
    self.unit_2 = {}
    # self.lineEdit = ''
    self.tab2 = QtGui.QWidget()
    self.addTab(self.tab2, "Load Saved Data")

    buttonLoadData = QPushButton("Load Data")
    buttonLoadData.clicked.connect(self.loadData_clicked)
    buttonLoadData.setFixedHeight(40)
    buttonLoadData.setFixedWidth(125)

    self.figure2 = plt.figure(figsize=(30, 15))
    self.canvas2 = FigureCanvas(self.figure2)

    tab2_hbox = QtGui.QHBoxLayout()
    tab2_hbox.addWidget(buttonLoadData)
    tab2_hbox.addWidget(self.canvas2)
    self.tab2.setLayout(tab2_hbox)

    # -------------- Tab 3
    self.tab3 = QtGui.QWidget()
    self.addTab(self.tab3, "Tab3 Test")

    btnLoadData = QPushButton("Load Data")
    btnLoadData.clicked.connect(self.loadData_clicked)
    btnLoadData.setFixedHeight(40)
    btnLoadData.setFixedWidth(125)

    load_table = QTableWidget()
    load_table.setWindowTitle("Loaded Data")
    # load_table.resize(800, 800)
    load_table.setRowCount(5)
    load_table.setColumnCount(2)

    view_table = QTableWidget()
    view_table.setWindowTitle("Data to View")
    # view_table.resize(800, 800)
    view_table.setRowCount(5)
    view_table.setColumnCount(2)

    table_hbox = QtGui.QHBoxLayout()
    table_hbox.addWidget(load_table)
    table_hbox.addSpacing(50)
    table_hbox.addWidget(view_table)

    myData = DataClass(6)
    myObj = ActionClass(self.tab3, myData)

    input_data2 = QtGui.QVBoxLayout()
    input_data2.addWidget(btnLoadData)
    input_data2.addLayout(myObj.grid)
    input_data2.addLayout(table_hbox)
    input_data2.addStretch()

    self.figure3 = plt.figure(figsize=(30, 15))
    self.canvas3 = FigureCanvas(self.figure3)

    tab3_hbox = QtGui.QHBoxLayout()
    tab3_hbox.addLayout(input_data2)
    tab3_hbox.addWidget(self.canvas3)
    self.tab3.setLayout(tab3_hbox)

    # Plot example
    tab3_data = [1, 2, 3, 4, 6, 8]
    self.figure3.clf()
    tab3_ax = self.figure3.add_subplot(111)
    tab3_ax.plot(tab3_data, '--', color='blue')
    self.canvas3.draw()

    # Plot First Time
    self.plot()
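
    # Usage sketch (illustrative, not part of the original source): the window
    # built above would typically be shown from a QApplication event loop, e.g.
    #
    #   app = QtGui.QApplication(sys.argv)
    #   window = mainWindow()
    #   window.show()
    #   sys.exit(app.exec_())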
# from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import math
import random
from math import sqrt
from matplotlib import style
from os import listdir
from os.path import isfile, join

style.use('ggplot')

dc = DataClass()

files = [f for f in listdir("data") if isfile(join("data", f))]
print(files)

centroids = []
plotdata = 1
plot_original_data = 0
n_series = len(files)
n_series = 1
experiment_id = 2
class MachineLearningMain:
    def __init__(self, use_scikit=True):
        self.dc = DataClass()
        self.data = []
        self.node_data = []
        self.assignments_series = []
        self.min_final = None
        self.max_final = None
        self.files = [
            f for f in listdir("data/sensors")
            if isfile(join("data/sensors", f))
        ]
        print(self.files)
        self.n_nodes = len(self.files)
        self.n_series_disp = 10
        # self.i_run = int(self.n_nodes/2)
        self.i_run2 = 1
        # self.use_previous_cluster_data = False
        self.centroids = None
        self.final_centroids = None
        self.final_clusters = None
        self.clusters = []
        self.node_centroids = []
        self.partial_sample_index = 0
        self.use_scikit = use_scikit

    def set_lib(self, use_scikit):
        self.use_scikit = use_scikit

    def init(self):
        self.final_centroids = None
        self.centroids = None
        self.read_data()
        # self.assign_class_to_nodes()

    def assign_class_to_nodes(self):
        print("machine learning: assign class to nodes")
        assignment_index = 0
        node_id = 0
        for node in self.node_data:
            cluster = 0
            # get average cluster index for node
            n_series_node = len(self.data[node_id]["series"])
            # get the assignments for the time series corresponding to the node
            node_assignments = [None] * n_series_node
            for i in range(n_series_node):
                # cluster += self.assignments_series[assignment_index]["cluster"]
                if assignment_index < len(self.assignments_series):
                    node_assignments[i] = self.assignments_series[assignment_index]["cluster"]
                    assignment_index += 1
            # node["class"] = int(cluster/n_series_node)
            # get class with max number of occurrences in list
            node["class"] = max(node_assignments, key=node_assignments.count)
            node["demand"] = int(self.clusters[node["class"]]["avg_demand"])
            node["priority"] = int(self.clusters[node["class"]]["priority"])
            # print(node)
            node_id += 1
        return self.node_data

    def get_info(self, node_id=None):
        if node_id is None:
            info = {"n_nodes": len(self.node_data), "nodes": self.node_data}
        else:
            info = self.node_data[node_id]
        return info

    def read_data(self):
        """
        read data from files
        each file has the data for a measurement node over a time frame of n days, for every hour
        :return:
        """
        self.data = []
        self.node_data = []
        for i, f in enumerate(self.files[0:self.n_nodes]):
            # print(str(i) + ". reading: " + f)
reading: " + f) fdata = self.dc.read_data(join("data/sensors/", f)) data = copy.copy(fdata) self.data.append(data) node = Constants.NODE_MODEL node["id"] = i self.node_data.append(copy.deepcopy(node)) def get_raw_data(self, node=0): t_start = time.time() # self.read_data() data = self.data[node] imax = data["series"].shape[0] imax_all = 0 for i in range(len(data)): data_array = data["series"][i] imax_all += data_array.shape[0] # print('imax: ' + str(imax)) t_end = time.time() min = int(np.min(data["series"])) max = int(np.max(data["series"])) dt = t_end - t_start info = { "description": "Raw data", "details": { "node": node, "n_series": imax, "n_nodes": len(data), "n_series_total": imax_all, "dt": int(dt * 1000), "min": min, "max": max }, "headers": np.ndarray.tolist(data["headers"]), "dt": dt, "lines": data["series"].shape[0], "columns": data["series"].shape[1] } return (np.ndarray.tolist(data["series"][:self.n_series_disp]), info) def get_array_of_arrays(self, a): array = [] for ag in a: for ag1 in ag: array.append(ag1) return array def get_display_data(self, d, global_scale=False): if d is not None: # centroids = d[0] # info = d[1] # return np.ndarray.tolist(centroids[:self.n_series_disp]), info ddata = d[0] info = d[1] start = len(ddata) - self.n_series_disp - 1 if start < 0: start = 0 end = len(ddata) # start = 0 # end = len(ddata) # if end > self.n_series_disp - 1: # end = self.n_series_disp - 1 ddata = ddata[start:end] if global_scale and self.min_final is not None: # print("use global scale") min = self.min_final max = self.max_final else: min = int(np.min(ddata)) max = int(np.max(ddata)) info["details"]["min"] = min info["details"]["max"] = max return np.ndarray.tolist(ddata), info else: return None def get_centroids(self, data, n_clusters=8, init=None): if self.use_scikit: if n_clusters is not None: if init is not None: kmeans = KMeans(n_clusters=n_clusters, init=init) else: kmeans = KMeans(n_clusters=n_clusters) else: n_clusters_range = range(2, 10) max_silhouette_avg = [0] * len(n_clusters_range) # data = np.array(data) for (i, k) in enumerate(n_clusters_range): kmeans = KMeans(n_clusters=k) a = kmeans.fit_predict(data) # print(data.shape) # print(a) # The silhouette_score gives the average value for all the samples. 
                    # This gives a perspective into the density and separation
                    # of the formed clusters.
                    silhouette_avg = silhouette_score(data, a)
                    # print("For n_clusters =", k,
                    #       "The average silhouette_score is :", silhouette_avg)
                    max_silhouette_avg[i] = silhouette_avg
                n_clusters = n_clusters_range[max_silhouette_avg.index(max(max_silhouette_avg))]
                kmeans = KMeans(n_clusters=n_clusters)
            a = kmeans.fit(data)
            centroids = a.cluster_centers_
            return centroids, a
        else:
            if n_clusters is None:
                n_clusters = 3
            dcluster.reinit(data, n_clusters)
            # dcluster.add_new_data(data, n_clusters)
            centroids, a = dcluster.k_means_clust_dynamic()
            # print(centroids)
            return centroids, a

    def get_assignments(self, a, data):
        if self.use_scikit:
            return a.predict(data)
        else:
            return a

    def assign_sample_to_cluster(self, node_id, sample_id):
        data = self.data[node_id]["series"]
        data1 = data[sample_id]
        data1 = [data1]
        assignments = self.get_assignments(self.final_clusters, data1)
        return assignments[0]

    def assign_partial_sample_to_cluster(self, node_id, sample_id, init=False):
        data = list(self.data[node_id]["series"][sample_id])
        if init:
            self.partial_sample_index = 0
        index = self.partial_sample_index
        min_dist = 0
        min_index = 0
        for (i, c) in enumerate(self.final_centroids):
            d = ml.euclid_dist(data[0:index], c[0:index])
            if i == 0:
                min_dist = d
            else:
                if d < min_dist:
                    min_dist = d
                    min_index = i
        partial_time_series = [0] * len(data)
        partial_time_series[0:index] = data[0:index]
        assignment = min_index
        if self.partial_sample_index < len(data) - 1:
            self.partial_sample_index += 1
        else:
            self.partial_sample_index = 0
        # # get assignments of time series to the final clusters
        partial_time_series = np.array(partial_time_series)
        return assignment, partial_time_series

    def assign_partial_sample_to_cluster_default(self, node_id, sample_id, init=False):
        data = list(self.data[node_id]["series"][sample_id])
        if init:
            self.partial_sample_index = 0
        data1 = [0] * len(data)
        partial_time_series = [0] * len(data)
        # print(data1)
        cluster_mean = list(np.mean(self.final_centroids, axis=0))
        # print(cluster_mean)
        # print(data)
        for i in range(0, len(data)):
            if i <= self.partial_sample_index:
                data1[i] = data[i]
                partial_time_series[i] = data[i]
            elif i > self.partial_sample_index:
                data1[i] = cluster_mean[i]
        assignments = self.get_assignments(self.final_clusters, [data1])
        if self.partial_sample_index < len(data1) - 1:
            self.partial_sample_index += 1
        else:
            self.partial_sample_index = 0
        # # get assignments of time series to the final clusters
        partial_time_series = np.array(partial_time_series)
        return assignments[0], partial_time_series

    def run_clustering_on_partial_sample(self, node_id, sample_id, init=False):
        assignment, partial_time_series = self.assign_partial_sample_to_cluster(
            node_id, sample_id, init)
        min = int(np.min(partial_time_series))
        max = int(np.max(partial_time_series))
        info = {
            "description": "Partial node data loading vs global clusters",
            "headers": ["new sample"],
            "dt": 0,
            "details": {
                "node_id": node_id,
                "node_sample": sample_id,
                "assignment": int(assignment),
                "min": min,
                "max": max
            },
            "assignments": None
        }
        # print(partial_time_series)
        partial_time_series = [list(partial_time_series)]
        for (i, c) in enumerate(self.final_centroids):
            partial_time_series.append(list(c))
            info["headers"].append("cluster " + str(i))
        partial_time_series = np.array(partial_time_series)
        return partial_time_series, info

    def update_node_clusters_with_partial_sample(self, node_id, sample_id, init=False):
        data = self.node_centroids[node_id]["centroids"]
        info = {
            "description": "Node clusters loading vs global clusters",
vs global clusters", "headers": ["data"], "dt": 0, "details": { "node_id": node_id, "node_sample": sample_id, "min": 0, "max": 0 }, "assignments": None } # print(partial_time_series) # partial_time_series = [list(partial_time_series)] # for (i, c) in enumerate(self.final_centroids): # partial_time_series.append(list(c)) # info["headers"].append("cluster " + str(i)) # # partial_time_series = np.array(partial_time_series) return data, info def run_clustering_on_node_id(self, node_id, nclusters, partial_sample_until_id=None, add_deviation_value=None): """ Run clustering on specified node. The data from the node is an array of arrays (for each day there is an array of 24 values) The result is the consumer behaviour over the analyzed time frame :param node_id: :param nclusters: :return: """ t_start = time.time() # print(self.data) data = copy.deepcopy(self.data[node_id]["series"]) if partial_sample_until_id is not None: data = data[0:partial_sample_until_id] if add_deviation_value is not None: data[partial_sample_until_id] += add_deviation_value if nclusters is not None and nclusters > len(data): print("node " + str(node_id) + "nclusters > len(data): " + str(nclusters) + "," + str(len(data))) return [], None, data res = self.get_centroids(data, nclusters) centroids = res[0] nc = len(centroids) centroids_np = np.array(centroids) desc = "Clusters from all data (single clustering)" # assign each time series to a cluster assignments = [] headers = [] for i in range(len(centroids_np)): headers.append("cluster " + str(i)) # the assignments of the data series to the clusters assignments_series = [None] * len(assignments) for (i, a) in enumerate(assignments): assignments_series[i] = { "series": i, "cluster": int(assignments[i]) } t_end = time.time() dt = t_end - t_start min = int(np.min(centroids_np)) max = int(np.max(centroids_np)) append = True for n in self.node_centroids: if n["id"] == node_id: n["centroids"] = centroids_np append = False break if append: self.node_centroids.append({ "id": node_id, "centroids": centroids_np }) info = { "description": desc, "headers": headers, "dt": t_end - t_start, "details": { "node": node_id, "new_node": node_id, "n_clusters": nc, "n_nodes": len(self.data), "dt": int(dt * 1000), "min": min, "max": max }, "assignments": assignments_series } return centroids_np, info, data def run_clustering_on_node_range(self, r, nclusters): """ Run clustering on specified node range. The data from a node is an array of arrays (for each day there is an array of 24 values). 
        The clusters are calculated separately for each node and added to the
        cluster array (various consumer behaviours in the network)
        :param start:
        :param end:
        :param nclusters:
        :return:
        """
        t_start = time.time()
        centroid_vect = []
        raw_data_vect = []
        if r is None:
            r = list(range(0, len(self.data)))
        # run clustering for each node and save clusters into array
        for node_id in r:
            res = self.run_clustering_on_node_id(node_id, nclusters)
            centroid_vect.append(res[0])
            raw_data_vect.append(res[2])
        centroid_vect = self.get_array_of_arrays(centroid_vect)
        # raw_data_vect = self.get_array_of_arrays(raw_data_vect)
        centroids_np = np.array(centroid_vect)
        headers = []
        for i in range(len(centroids_np)):
            headers.append("cluster " + str(i))
        t_end = time.time()
        dt = t_end - t_start
        min = int(np.min(centroids_np))
        max = int(np.max(centroids_np))
        info = {
            "description": "Clusters from node range (single clustering)",
            "headers": headers,
            "dt": t_end - t_start,
            "details": {
                "node_range": r,
                "n_clusters": len(centroids_np),
                "n_nodes": len(self.data),
                "dt": int(dt * 1000),
                "min": min,
                "max": max
            },
            "assignments": None
        }
        return centroids_np, info

    def run_single_clustering_on_node_range(self, r, nclusters, n_data=None):
        """
        Run clustering on specified node.
        The data from the node is an array of arrays (for each day there is an array of 24 values)
        The result is the consumer behaviour over the analyzed time frame
        :param n_data: the number of samples to be used from the data
        :param node_id:
        :param nclusters:
        :return:
        """
        if r is None:
            r = list(range(0, len(self.data)))
        t_start = time.time()
        # print(self.data)
        # data = self.data[node_id]["series"]
        # data = [[series for series in node_series["series"]] for node_series in self.data]
        # data = np.array(data)
        # print(data.shape)
        # data = np.array([])
        data = []
        for id in r:
            for (i, s) in enumerate(self.data[id]["series"]):
                if n_data is not None:
                    if i < n_data:
                        data.append(s)
                else:
                    data.append(s)
        data = np.array(data)
        # print(data.shape)
        # print(self.data[0]["series"])
        # print(data)
        res = self.get_centroids(data, nclusters)
        centroids = res[0]
        nc = len(centroids)
        centroids_np = np.array(centroids)
        desc = "Clusters from all data from all nodes (single clustering)"
        # assign each time series to a cluster
        assignments = []
        headers = []
        for i in range(len(centroids_np)):
            headers.append("cluster " + str(i))
        # the assignments of the data series to the clusters
        assignments_series = [None] * len(assignments)
        for (i, a) in enumerate(assignments):
            assignments_series[i] = {
                "series": i,
                "cluster": int(assignments[i])
            }
        t_end = time.time()
        dt = t_end - t_start
        min = int(np.min(centroids_np))
        max = int(np.max(centroids_np))
        info = {
            "description": desc,
            "headers": headers,
            "dt": t_end - t_start,
            "details": {
                "n_clusters": nc,
                "n_nodes": len(self.data),
                "dt": int(dt * 1000),
                "min": min,
                "max": max
            },
            "assignments": assignments_series
        }
        return centroids_np, info, data

    def run_dual_clustering_on_node_range(self, r, nclusters, nclusters_final):
        """
        Run dual clustering on specified node range.
        The data from a node is an array of arrays (for each day there is an array of 24 values).
        The clusters are calculated separately for each node and added to the cluster array.
        Then, there is another clustering on this cluster array which returns
        the final clusters for all the network (consumer types in the network)
        :param r:
        :param nclusters:
        :param nclusters_final:
        :return:
        """
        t_start = time.time()
        centroid_vect = []
        raw_data_vect = []
        if r is None:
            r = list(range(0, len(self.data)))
        print("node range: ", r)
        # run clustering for each node and save clusters into array
        for node_id in r:
            res = self.run_clustering_on_node_id(node_id, nclusters)
            centroid_vect.append(res[0])
            raw_data_vect.append(res[2])
        centroid_vect = self.get_array_of_arrays(centroid_vect)
        raw_data_vect = self.get_array_of_arrays(raw_data_vect)
        n_clusters_total = len(centroid_vect)
        centroids_np = np.array(centroid_vect)
        # run clustering again for the previous clusters
        res = self.get_centroids(centroids_np, nclusters_final, self.final_centroids)
        centroids = res[0]
        self.final_centroids = res[0]
        self.final_clusters = res[1]
        nc = len(centroids)
        centroids_np = np.array(centroids)
        # get assignments of time series to the final clusters
        assignments = self.get_assignments(res[1], raw_data_vect)
        n = len(centroids_np)
        headers = [None] * n
        self.clusters = []
        demands = []
        for i in range(n):
            headers[i] = "cluster " + str(i)
            cluster = Constants.CLUSTER_MODEL
            cluster["id"] = assignments[i]
            avg_demand = np.average(centroids_np[i])
            cluster["avg_demand"] = avg_demand
            demands.append(avg_demand)
            cluster["centroid"] = centroids_np[i]
            self.clusters.append(copy.deepcopy(cluster))
        demands = np.array(demands)
        temp = demands.argsort()
        ranks = np.empty_like(temp)
        ranks[temp] = np.arange(len(demands))
        for i in range(n):
            self.clusters[i]["priority"] = ranks[i]
        # print(self.clusters)
        # the assignments of the data series to the clusters
        self.assignments_series = [None] * len(assignments)
        for (i, a) in enumerate(assignments):
            self.assignments_series[i] = {"series": i, "cluster": int(a)}
        t_end = time.time()
        dt = t_end - t_start
        min = int(np.min(centroids_np))
        max = int(np.max(centroids_np))
        self.min_final = min
        self.max_final = max
        info = {
            "description": "Clusters from node range (dual clustering)",
            "headers": headers,
            "dt": t_end - t_start,
            "details": {
                "node_range": r,
                "n_clusters": nc,
                "n_nodes": len(self.data),
                "dt": int(dt * 1000),
                "min": min,
                "max": max
            },
            "assignments": self.assignments_series
        }
        return centroids_np, info

    def run_clustering_twice(self, node=None):
        """
        NOTE: DEPRECATED
        node == None => run clustering for all nodes and then run clustering again on all clusters
        node > 0 => run clustering for the selected node
        :param plot:
        :return:
        :param node:
        :return:
        """
        t_start = time.time()
        nclusters = 2
        nclusters_final = 3
        centroids = []
        data = []
        desc = ""
        assignments = []
        data_array_for_all = []
        try:
            if node is None:
                # print("consumer nodes: " + str(len(self.data)))
                # for i in range(0, len(self.data)):
                for i in range(0, self.i_run2):
                    data_array = self.data[i]["series"]
                    # data_array_for_all.append([d.tolist() for d in data_array])
                    for data_array1 in data_array:
                        data_array_for_all.append(data_array1)
                    # data_array has multiple time series from the same consumer
                    len_data = len(data_array)
                    data_array1 = data_array
                    # data_array1 = data_array[0:int(len_data / 2)]
                    # if self.centroids is None:
                    #     kmeans = KMeans(n_clusters=nclusters)
                    # else:
                    #     kmeans = KMeans(n_clusters=nclusters, init=self.centroids[i])
                    kmeans = KMeans(n_clusters=nclusters)
                    # print kmeans
                    # Compute cluster centers and predict cluster index for each sample.
                    a = kmeans.fit(data_array1)
                    # print a.cluster_centers_
                    assignments = a.predict(data_array1)
                    centroid = a.cluster_centers_
                    # print(centroid)
                    centroids.append(centroid)
                self.centroids = centroids
                centroids_all = []
                for centroid_group in centroids:
                    for centroid in centroid_group:
                        centroids_all.append(centroid)
                n_clusters_total = len(centroids_all)
                centroids_np = centroids_all
                centroids_np = np.array(centroids_np)
                desc = "Final clusters (double clustering)"
                if self.final_centroids is None:
                    kmeans = KMeans(n_clusters=nclusters_final)
                else:
                    kmeans = KMeans(n_clusters=nclusters_final, init=self.final_centroids)
                # kmeans = KMeans(n_clusters=nclusters_final)
                # print kmeans
                # Compute cluster centers and predict cluster index for each sample.
                a = kmeans.fit(centroids_np)
                assignments = a.predict(data_array_for_all)
                self.final_centroids = a.cluster_centers_
                data = self.final_centroids
            else:
                desc = "Clusters from all data (single clustering)"
                data_array = self.data[node]["series"]
                kmeans = KMeans(n_clusters=nclusters_final)
                # print kmeans
                # Compute cluster centers and predict cluster index for each sample.
                a = kmeans.fit(data_array)
                # print a.cluster_centers_
                assignments = a.predict(data_array)
                centroids = a.cluster_centers_
                n_clusters_total = len(centroids)
                data = centroids
            headers = []
            for i in range(len(data)):
                headers.append("cluster " + str(i))
            assignments_series = [None] * len(assignments)
            for (i, a) in enumerate(assignments):
                assignments_series[i] = {
                    "series": i,
                    "cluster": int(assignments[i])
                }
            t_end = time.time()
            dt = t_end - t_start
            min = np.min(data)
            max = np.max(data)
            # print("min: " + str(np.min(data)))
            info = {
                "description": desc,
                "headers": headers,
                "dt": t_end - t_start,
                "details": {
                    "node": node,
                    "new_node": self.i_run2,
                    "n_clusters": n_clusters_total,
                    "n_nodes": len(self.data),
                    "dt": int(dt * 1000),
                    "min": min,
                    "max": max
                },
                "assignments": assignments_series
            }
        except:
            info = "failed"
        self.i_run2 += 1
        if self.i_run2 >= self.n_nodes:
            self.i_run2 = 1
        return np.ndarray.tolist(data[:self.n_series_disp]), info
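
# Usage sketch (illustrative, not part of the original module): run the two-stage
# clustering over all nodes and derive per-node classes. Assumes DataClass,
# Constants and the data/sensors files are available as the class above expects.
if __name__ == "__main__":
    ml_main = MachineLearningMain(use_scikit=True)
    ml_main.init()  # read all node files
    centroids, info = ml_main.run_dual_clustering_on_node_range(None, 2, 3)
    nodes = ml_main.assign_class_to_nodes()
    print(info["description"], "->", len(centroids), "final clusters")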
class ML_dualVsSingleNumberOfClusters:
    def __init__(self, use_scikit=True):
        self.dc = DataClass()
        self.data = []
        self.node_data = []
        self.assignments_series = []
        self.min_final = None
        self.max_final = None
        self.files = [
            f for f in listdir("data/sensors")
            if isfile(join("data/sensors", f))
        ]
        print(self.files)
        self.n_nodes = len(self.files)
        self.n_series_disp = 10
        # self.i_run = int(self.n_nodes/2)
        self.i_run2 = 1
        # self.use_previous_cluster_data = False
        self.centroids = None
        self.final_centroids = None
        self.final_clusters = None
        self.clusters = []
        self.node_centroids = []
        self.partial_sample_index = 0
        self.use_scikit = use_scikit

    def set_lib(self, use_scikit):
        self.use_scikit = use_scikit

    def init(self):
        self.final_centroids = None
        self.centroids = None
        self.read_data()

    def read_data(self):
        """
        read data from files
        each file has the data for a measurement node over a time frame of n days, for every hour
        :return:
        """
        self.data = []
        self.node_data = []
        for i, f in enumerate(self.files[0:self.n_nodes]):
            # print(str(i) + ". reading: " + f)
            fdata = self.dc.read_data(join("data/sensors/", f))
            data = copy.copy(fdata)
            self.data.append(data)
            node = Constants.NODE_MODEL
            node["id"] = i
            self.node_data.append(copy.deepcopy(node))

    def test_single_scenario(self):
        """
        test with different number of clusters for stage 2 (2 stage clustering)
        comparing the deviation from single stage clustering
        """
        self.set_lib(True)
        # res_dual = self.run_dual_clustering_on_node_range(None, None, 3)
        # n_data = len(self.data[0])
        n_data = 81
        n_clusters = 3
        # n_clusters_for_nodes = 80
        n_clusters_for_nodes = None
        print("n_data: ", n_data)
        print("n_clusters_for_nodes: ", n_clusters_for_nodes)
        res_dual1 = run_dual_clustering_on_node_range(self.data, None,
                                                      n_clusters_for_nodes, n_clusters)
        res_single1 = run_clustering_for_all_nodes_at_once(
            self.data, None, n_clusters, n_data)
        res_all1 = np.concatenate((res_dual1, res_single1), axis=0)
        comp, ca, rd = get_comp(res_dual1, res_single1)
        print("comp_avg: " + str(ca))
        res_all = copy.copy(res_all1)
        res_dual = copy.copy(res_dual1)
        res_single = copy.copy(res_single1)
        comp_avg = ca
        res_diff = rd
        colors = ['b'] * n_clusters
        colors2 = ['g:'] * n_clusters
        cluster_labels1 = ["cd" + str(i + 1) for i in range(n_clusters)]
        cluster_labels2 = ["cs" + str(i + 1) for i in range(n_clusters)]
        plot_from_matrix(res_all, colors + colors2)
        plt.legend(cluster_labels1 + cluster_labels2)
        if n_clusters_for_nodes is None:
            n_clusters_for_nodes = "auto"
        plt.title("number of clusters for nodes: " + str(n_clusters_for_nodes) +
                  ", average deviation: " + str(int(comp_avg)))
        plt.xlabel("Time of day (hours)")
        plt.ylabel("Value of cluster centroids ($m^3 / s$)")
        plt.show()

    def test_full_range(self):
        """
        test with different number of clusters for stage 2 (2 stage clustering)
        comparing the deviation from single stage clustering
        """
        self.set_lib(True)
        # res_dual = self.run_dual_clustering_on_node_range(None, None, 3)
        n_clusters = 3
        nc_max = 81
        n_data = nc_max
        r1 = list(range(2, nc_max))
        # r1 = [2, 10, 82]
        n_clusters_for_nodes_range = [None] + r1
        comp_avg_vect = [0] * len(n_clusters_for_nodes_range)
        # test_index = 0
        test_index = len(n_clusters_for_nodes_range) - 1
        for (i, k) in enumerate(n_clusters_for_nodes_range):
            ncn = k
            print("n_clusters_for_nodes: " + str(k))
            res_dual1 = run_dual_clustering_on_node_range(
                self.data, None, ncn, n_clusters)
            res_single1 = run_clustering_for_all_nodes_at_once(
                self.data, None, n_clusters, n_data)
            res_all1 = np.concatenate((res_dual1, res_single1), axis=0)
            comp, ca, rd = get_comp(res_dual1, res_single1)
            comp_avg_vect[i] = ca
            print("comp_avg: " + str(ca))
            if i == test_index:
                res_all = copy.copy(res_all1)
                res_dual = copy.copy(res_dual1)
                res_single = copy.copy(res_single1)
                n_clusters_for_nodes = k
                comp_avg = ca
                res_diff = rd
        colors = ['b'] * n_clusters
        colors2 = ['g:'] * n_clusters
        cluster_labels1 = ["cd" + str(i + 1) for i in range(n_clusters)]
        cluster_labels2 = ["cs" + str(i + 1) for i in range(n_clusters)]
        plot_from_matrix(res_all, colors + colors2)
        plt.legend(cluster_labels1 + cluster_labels2)
        if n_clusters_for_nodes is None:
            n_clusters_for_nodes = "auto"
        plt.title("number of clusters for nodes: " + str(n_clusters_for_nodes) +
                  ", average deviation: " + str(int(comp_avg)))
        plt.figure()
        comp_avg_dynamic = comp_avg_vect[0]
        comp_avg_vect = comp_avg_vect[1:]
        n_clusters_for_nodes_range = n_clusters_for_nodes_range[1:]
        print("comp_avg_trim")
        print(comp_avg_vect)
        print("comp_dynamic")
        print(comp_avg_dynamic)
        result_obj = [0] * len(n_clusters_for_nodes_range)
        for (i, nc) in enumerate(n_clusters_for_nodes_range):
            result_obj[i] = {"nc": nc, "val": comp_avg_vect[i]}
        result_b_obj = {"nc": None, "val": comp_avg_dynamic}
        print(result_obj)
        vmax = max(node["val"] for node in result_obj)
        vmin = min(node["val"] for node in result_obj)
        imin = result_obj[0]["nc"]
        imax = result_obj[0]["nc"]
        # ncmax = result_obj
        for obj in result_obj:
            if obj["val"] == vmax:
                imax = obj["nc"]
            if obj["val"] == vmin:
                imin = obj["nc"]
        if result_b_obj["val"] < vmin:
            result_b_obj["nc"] = imin
        if result_b_obj["val"] > vmax:
            result_b_obj["nc"] = imax
        for (i, res) in enumerate(result_obj):
            if i < len(result_obj) - 1:
                if (result_obj[i]["val"] <= result_b_obj["val"]
                        and result_b_obj["val"] <= result_obj[i + 1]["val"]) or \
                        (result_obj[i]["val"] >= result_b_obj["val"]
                         and result_b_obj["val"] >= result_obj[i + 1]["val"]):
                    result_b_obj["nc"] = result_obj[i]["nc"]
                    break
        print(result_b_obj)
        # return True
        width = 0.35  # the width of the bars
        plt.bar(n_clusters_for_nodes_range, comp_avg_vect, width)
        plt.bar(result_b_obj["nc"] - width, result_b_obj["val"], width)
        plt.xlabel("number of clusters for nodes")
        plt.ylabel("average deviation from single clustering")
        plt.show()
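
# Usage sketch (illustrative, not part of the original module): the two test
# methods above rely on module-level helpers (run_dual_clustering_on_node_range,
# run_clustering_for_all_nodes_at_once, get_comp, plot_from_matrix) defined
# elsewhere in this project; with those in place the experiment runs as:
if __name__ == "__main__":
    experiment = ML_dualVsSingleNumberOfClusters(use_scikit=True)
    experiment.init()                  # load node data from data/sensors
    experiment.test_single_scenario()  # single comparison, fixed cluster counts
    # experiment.test_full_range()     # sweep the stage-1 cluster count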
class ML_Anomaly:
    def __init__(self, use_scikit=True):
        self.dc = DataClass()
        self.data = []
        self.node_data = []
        self.assignments_series = []
        self.min_final = None
        self.max_final = None
        self.files = [
            f for f in listdir("data/sensors")
            if isfile(join("data/sensors", f))
        ]
        print(self.files)
        self.n_nodes = len(self.files)
        self.n_series_disp = 10
        # self.i_run = int(self.n_nodes/2)
        self.i_run2 = 1
        # self.use_previous_cluster_data = False
        self.centroids = None
        self.final_centroids = None
        self.final_clusters = None
        self.clusters = []
        self.node_centroids = []
        self.partial_sample_index = 0
        self.use_scikit = use_scikit

    def set_lib(self, use_scikit):
        self.use_scikit = use_scikit

    def init(self):
        self.final_centroids = None
        self.centroids = None
        self.read_data()

    def read_data(self):
        """
        read data from files
        each file has the data for a measurement node over a time frame of n days, for every hour
        :return:
        """
        self.data = []
        self.node_data = []
        for i, f in enumerate(self.files[0:self.n_nodes]):
            # print(str(i) + ". reading: " + f)
            fdata = self.dc.read_data(join("data/sensors/", f))
            data = copy.copy(fdata)
            self.data.append(data)
            node = Constants.NODE_MODEL
            node["id"] = i
            self.node_data.append(copy.deepcopy(node))

    def run_test_3(self):
        """
        adding an anomaly at a certain point (constant additional demand)
        comparing the evolution of clusters (with partial clustering) to the normal evolution of clusters
        observing the velocity of change that gives the time until steady state error
        """
        day_start_deviation = 10
        day_end = 20
        deviation = 200
        # iterations = day_end - day_start_deviation
        iterations = day_end
        deviation_element_vect = [None] * iterations
        deviation_total_vect = [None] * iterations
        res_partial_whole_vect = [None] * iterations
        res_partial_whole_anomaly_vect = [None] * iterations
        data = copy.deepcopy(self.data[0]["series"])
        data_with_constant_anomaly = copy.deepcopy(self.data[0]["series"])
        # add constant deviation (anomaly) to the second data set
        # starting with day_start_deviation (index for the day of the start of the anomaly)
        for i, d in enumerate(range(day_start_deviation, day_end)):
            for j in range(len(data_with_constant_anomaly[d])):
                data_with_constant_anomaly[d][j] += deviation
        centroids_init = dcluster.reinit(data[0:day_start_deviation - 1], 2, 5)
        res_partial_whole, a = dcluster.k_means_clust_dynamic()
        # for i, d in enumerate(range(day_start_deviation, day_end)):
        for i, d in enumerate(range(day_end)):
            print(str(i) + "," + str(d))
            res_partial_whole_vect[i] = copy.deepcopy(
                dcluster.k_means_clust_dynamic_partial_update_whole(data[d]))
        # run clustering update for second data set with anomalies
        dcluster.reinit(data_with_constant_anomaly[0:day_start_deviation - 1],
                        2, 5, centroids_init)
        res_partial_whole_anomaly, a = dcluster.k_means_clust_dynamic()
        # for i, d in enumerate(range(day_start_deviation, day_end)):
        for i, d in enumerate(range(day_end)):
            print(str(i) + "," + str(d))
            res_partial_whole_anomaly_vect[i] = copy.deepcopy(
                dcluster.k_means_clust_dynamic_partial_update_whole(
                    data_with_constant_anomaly[d]))
        # plot the results (deviation between the 2 data sets)
        for i, d in enumerate(range(day_end)):
            total_deviation, average_deviation, deviation = get_comp(
                res_partial_whole_anomaly_vect[i], res_partial_whole_vect[i])
            print(average_deviation)
            deviation_total_vect[i] = average_deviation
            deviation_element_vect[i] = deviation
            plt.clf()
            for ts in res_partial_whole_vect[i]:
                plt.plot(ts)
            for ts in res_partial_whole_anomaly_vect[i]:
                plt.plot(ts)
            plt.gca().set_title("deviation " + str(d) + ", with anomaly from day " +
                                str(day_start_deviation))
day " + str(day_start_deviation)) plt.pause(0.2) print(deviation_total_vect) plt.clf() # plt.subplot(212) plt.plot(deviation_total_vect) # plt.ylim([-100, 100]) plt.gca().set_title("anomaly transient effect on cluster centroids") plt.xlabel("time (days)") plt.ylabel("standard deviation") plt.show()