def __init__(self, dataContext, name):
    clusteringResultsPath = getClusteringResultsPath() + getTimeSeriesToyDatasetName() + '/'
    self.name = name
    super().__init__(dataContext, clusteringResultsPath)
    self.ownResourcesFolder = self.clusteringResultsPath + self.name + '/'
    self.microClustersFolder = self.ownResourcesFolder
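# Hedged sketch (not in the source): given the folders set up in the constructor above, a store
# method on this handler would plausibly mirror the storeResult() pattern shown further down,
# writing one CSV per snapshot time into self.microClustersFolder. The function name and the
# snapshot keys ("time", "result") are assumptions taken from that later module.
def storeMicroClustersSketch(microClustersFolder, snapshot):
    import os
    import numpy as np
    os.makedirs(microClustersFolder, exist_ok=True)            # create the folder lazily
    targetFile = microClustersFolder + str(snapshot["time"]) + '.csv'
    np.savetxt(targetFile, snapshot["result"], delimiter=',')  # one CSV per snapshot time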
# obtain the data sets from the csv files
non_time_series_datasets = getDatasetsFromFolder(getNonTimeSeriesDatasetsPath())
# clustering algorithms
kmeans = KMeans()
dbscan = DBSCAN(eps=0.2, min_samples=10)
algorithms = [(getmKMeansName(), kmeans), (getDbscanName(), dbscan)]
# iterate over the data sets
for datIndx in range(len(non_time_series_datasets)):  # row index
    X = non_time_series_datasets[datIndx]['dataset']
    dName = non_time_series_datasets[datIndx]['name']
    k = non_time_series_datasets[datIndx]['k']
    baseFolder = getClusteringResultsPath() + dName + '/'
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)
    # iterate over the algorithms
    for algIndx in range(len(algorithms)):  # column index
        algoName, algo = algorithms[algIndx]
        # if the algorithm is 'kmeans', k must be set specifically for the data set
        if algoName.lower() == 'kmeans':
            algo.set_params(n_clusters=k)
        # assign labels
        labels = algo.fit_predict(X)
        print(" ", len(set(labels)), " clusters")
        clusteringInfo(labels)
        # result contains every scaled element with the corresponding label
        result = np.c_[X, labels]
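# clusteringInfo() is a reporting helper defined elsewhere in the repo. A minimal sketch of what
# it plausibly does (assumption, not from the source): summarise cluster sizes and, for DBSCAN,
# the noise points labelled -1.
def clusteringInfoSketch(labels):
    import numpy as np
    labels = np.asarray(labels)
    noise = int(np.sum(labels == -1))        # DBSCAN marks noise with the label -1
    clusters = set(labels.tolist()) - {-1}
    for c in sorted(clusters):
        print('cluster', c, '->', int(np.sum(labels == c)), 'elements')
    print('noise points:', noise)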
dataset = dataset[dataset['to_timestamp'].apply(validate_timestamp, date=datetime.datetime(2018, 10, 2, 10, 30))]
# data context
dataContext = [
    # hardcoded bounding box for Sweden
    BoundingBox(minimun=11.0273686052, maximun=23.9033785336),  # longitude limits
    BoundingBox(minimun=55.3617373725, maximun=69.1062472602),  # latitude limits
]
# dyclee
dyclee = Dyclee(dataContext=dataContext,
                relativeSize=params["relativeSize"],
                speed=params["speed"],
                lambd=params["lambd"],
                periodicRemovalAt=params["periodicRemovalAt"],
                periodicUpdateAt=params["periodicUpdateAt"],
                timeWindow=params["timeWindow"],
                closenessThreshold=params["closenessThreshold"])
tGlobal = 6000  # following the notebook: a global clustering roughly every half hour
ac = 0  # amount of processed elements
folder = getClusteringResultsPath() + getRealDatasetName() + '/' + getDycleeName() + '/'  # for saving the prepared results
if will_calculate_DBCV:
    resetStorage(folder)
first = True  # first clustering; the 'end' picture will be added at the beginning (to understand the loop)
for ind, p in dataset.iterrows():
    if end(p['to_timestamp']):
        # stop here: we only want the insights at 11:00 and 16:00 of day 2
        print(p, ' ... (the end)')
        break
    if not validate_p(p):
        # discard points that fall outside Sweden or outside the day under analysis
        continue
    ac += 1
    point = [p.latitude, p.longitude]
    # the dataset is now complete; it still has to be subset
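# The filtering helpers used above (validate_timestamp, validate_p, end) are defined elsewhere in
# the repo. Hedged sketches of what they plausibly check, based only on how they are called here;
# the exact comparison rules, cut-off time and bounding-box values are assumptions.
import datetime

def validate_timestamp_sketch(ts, date):
    # keep only rows whose timestamp is at or after the reference date
    return ts >= date

def validate_p_sketch(p, lat_bounds=(55.36, 69.11), lon_bounds=(11.03, 23.90)):
    # keep only points that fall inside the hardcoded Sweden bounding box
    return (lat_bounds[0] <= p.latitude <= lat_bounds[1]
            and lon_bounds[0] <= p.longitude <= lon_bounds[1])

def end_sketch(ts, cutoff=datetime.datetime(2018, 10, 2, 16, 0)):
    # stop the stream once the timestamp passes the last insight of interest
    return ts > cutoff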
import json
import numpy as np
import os
import shutil
from config import getClusteringResultsPath, getMbkName, getTimeSeriesDatasetsPath, getTimeSeriesToyDatasetName

folder = getClusteringResultsPath() + getTimeSeriesToyDatasetName() + '/' + getMbkName() + '/'

def resetStorage():
    if os.path.exists(folder):
        shutil.rmtree(folder)

def createDirectoryIfNotExists(folder):
    # check if resourcesFolder needs to be created
    if not os.path.exists(folder):
        os.makedirs(folder)

def storeResult(snapshot):
    createDirectoryIfNotExists(folder)
    time = snapshot.get("time")
    result = snapshot.get("result")
    targetFile = folder + str(time) + '.csv'
    np.savetxt(targetFile, result, delimiter=',')

def storeAlgoConfig(dict):
    createDirectoryIfNotExists(folder)
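# storeAlgoConfig() is cut off above. Given that json is imported at the top of this module, a
# plausible completion (an assumption, not from the source; the output file name is hypothetical)
# would dump the algorithm's parameter dict next to the result CSVs:
def storeAlgoConfigSketch(config):
    createDirectoryIfNotExists(folder)                # reuse the module-level target folder
    with open(folder + 'algoConfig.json', 'w') as f:  # hypothetical file name
        json.dump(config, f, indent=4)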
def __init__(self, dataContext):
    clusteringResultsPath = getClusteringResultsPath()
    print('clusteringResultsPath', clusteringResultsPath)
    super().__init__(dataContext, clusteringResultsPath)
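# Both constructors in this section delegate to super().__init__(dataContext, clusteringResultsPath).
# A minimal sketch of the shared base class they appear to rely on (the class name is hypothetical;
# the source only shows the call signature and that self.clusteringResultsPath is set):
class ClusteringResultsHandlerSketch:
    def __init__(self, dataContext, clusteringResultsPath):
        self.dataContext = dataContext                      # bounding boxes describing the data space
        self.clusteringResultsPath = clusteringResultsPath  # root folder for this handler's output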