# Imports assumed from usage below (multiprocessing worker + result queue):
from multiprocessing import Process, Queue
from Queue import Empty  # Python 3: from queue import Empty


def evaluate(run_identifier, control_params, params, function_to_evaluate,
             out_folder, force=False):
    experiment_db = ExperimentDB(out_folder, function_to_evaluate.__name__,
                                 run_identifier, dump_also_as_json=True)

    # Reuse a cached result unless the caller explicitly forces a re-run.
    previous_result = experiment_db.get_experiment_result(params)
    if previous_result is not None and not force:
        print("already_exists:", function_to_evaluate.__name__ + " for",
              run_identifier, "with", params)
        return previous_result

    print(function_to_evaluate.__name__ + " for", run_identifier, "with", params)

    # Run the evaluation in a separate process so a crash in the evaluated
    # function cannot take down the caller; the child communicates its
    # result back via the queue.
    result_q = Queue()
    p = Process(target=function_to_evaluate,
                args=(result_q, control_params, params))
    p.start()
    p.join()  # this blocks until the process terminates

    try:
        res = result_q.get_nowait()
    except Empty:
        print("no result available for this call. the process most likely "
              "failed with exit code %s" % str(p.exitcode))
        res = None

    experiment_db.add_experiment(control_params, params, res)
    return (control_params, params, res)
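# Usage sketch (hypothetical): the evaluated function must accept the result
# queue as its first argument and put its result on that queue before
# returning. The worker below and all parameter values are illustrative
# only; the 'info'/'task' layout mirrors how params are read elsewhere in
# this module.
def do_kmeans(result_q, control_params, params):
    # ... run the clustering here ...
    result_q.put({'duration_kmeans': 1.23,
                  'iteration_changes': [50, 12, 3, 0]})

if __name__ == '__main__':
    (control, task, res) = evaluate(
        run_identifier='run-001',
        control_params={'seed': 0},
        params={'info': {'dataset_name': 'example', 'algorithm': 'kmeans'},
                'task': {'no_clusters': 10, 'run': 0}},
        function_to_evaluate=do_kmeans,
        out_folder='results/')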
def setName(self, name):
    oldname = self.name
    if not self._isFinished.is_set():
        # Refuse to rename while the experiment is still running.
        return {'oldname': oldname, 'name': self.name,
                'error': 'Experiment is currently running.'}
    result = ExperimentDB.rename(oldname, name)
    if 'error' not in result:
        self.name = name
    return result
def runCurrentExperiment(self, expType="Standard", isLoad=False):
    """Creates an experiment runner for the current model and starts
    running the model in a separate thread."""
    if self.experimentRunner:
        self.stopCurrentExperiment()
    self.datasets[self.currentDataset].rewind()
    if isLoad:
        # Restore the model description from the stored experiment metadata.
        modelInfo = json.loads(ExperimentDB.get(self.name)['metadata'])
        modelDescriptionText = modelInfo['modelDescriptionText']
        subDescriptionText = modelInfo['subDescriptionText']
        self.loadDescriptionFile(modelDescriptionText, subDescriptionText)
    else:
        data = dict(modelDescriptionText=self.descriptionText,
                    subDescriptionText=self.subDescriptionText)
        ExperimentDB.add(self.name, data)
    self.__currentModelData = []
    if expType == "Standard":
        self.experimentRunner = ExperimentRunner(
            name=self.name,
            modelDescription=self.models[self.currentModel],
            control=self.control,
            dataset=self.datasets[self.currentDataset])
    elif expType == "Anomaly":
        self.experimentRunner = AnomalyRunner(
            name=self.name,
            modelDescription=self.models[self.currentModel],
            control=self.control,
            dataset=self.datasets[self.currentDataset])
    if isLoad:
        self.experimentRunner.load()
    else:
        self.experimentRunner.run()
    return self.getExperimentInfo(self.models[self.currentModel])
def getProtosAtTime(self, timestep):
    collection = ExperimentDB.getExperimentDB(self.name)
    experimentData = collection.find_one({"_id": timestep})
    experimentData['protos'] = []
    predictedField = self._modelDescription["predictedField"]
    predictedFieldIndex = self.getFieldNames().index(predictedField)
    dists = json.loads(experimentData['classificationDist'])
    for distId in json.loads(experimentData["classificationIdx"]):
        # Fetch the records surrounding each prototype so some context
        # around the matched timestep is available.
        distSurroundingValues = collection.find(
            {"_id": {"$gt": distId - 10, "$lt": distId + 10}}
        ).sort('_id', pymongo.ASCENDING)
        experimentData['protos'].append(dict(
            ids=[], actual=[], prediction=[], anomaly=[], anomalyLabel=[],
            dist=dists.pop(0), index=distId))
        protosId = len(experimentData['protos']) - 1
        for distSurroundingValue in distSurroundingValues:
            inferences = json.loads(distSurroundingValue["inferences"])
            actual = distSurroundingValue["actual"]
            inference = inferences[InferenceElement.multiStepBestPredictions]
            step = min(inference.iterkeys())
            prediction = inference[step]
            if prediction is None:
                prediction = 0.0
            anomaly = inferences[InferenceElement.anomalyScore]
            anomalyLabel = inferences[InferenceElement.anomalyLabel]
            proto = experimentData['protos'][protosId]
            proto["ids"].append(distSurroundingValue["_id"])
            proto["actual"].append(actual)
            proto["prediction"].append(prediction)
            proto["anomaly"].append(anomaly)
            proto["anomalyLabel"].append(anomalyLabel)
    return experimentData
def result_evaluation_dataset_speed_comparison(out_folder, out_folder_csv):
    for fcnt, plotname in [('do_kmeans', 'kmeans_speeds')]:
        print(plotname)
        run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)
        result_data = {}
        for run_identifier in run_identifiers:
            db = ExperimentDB(out_folder, fcnt, run_identifier)
            for resid in db.get_algorithm_run_ids():
                (control_params, params, res) = \
                    db.get_experiment_result_from_run_id(resid)
                if res is None:
                    continue
                ds = params['info']['dataset_name']
                alg = params['info']['algorithm']
                no_clusters = params['task']['no_clusters']
                run = params['task']['run']
                duration_kmeans = res['duration_kmeans']
                no_iterations = len(res['iteration_changes'])

                # Build the nested result structure lazily.
                if ds not in result_data:
                    result_data[ds] = {'results': {}, 'infos': {}}
                results = result_data[ds]['results']
                if no_clusters not in results:
                    results[no_clusters] = {}
                if alg not in results[no_clusters]:
                    results[no_clusters][alg] = {}
                if 'duration' not in results[no_clusters][alg]:
                    results[no_clusters][alg]['duration'] = {}
                if 'no_iterations' not in results[no_clusters][alg]:
                    results[no_clusters][alg]['no_iterations'] = {}

                results[no_clusters][alg]['duration'][run] = duration_kmeans
                # Include the SVD preprocessing time for the PCA variants.
                if 'truncated_svd' in res:
                    results[no_clusters][alg]['duration'][run] += \
                        res['truncated_svd']['duration']
                result_data[ds]['infos']['input_dimension'] = res['input_dimension']
                result_data[ds]['infos']['input_samples'] = res['input_samples']
                result_data[ds]['infos']['input_annz'] = res['input_annz']
                results[no_clusters][alg]['no_iterations'][run] = no_iterations

        remove_incomplete_data(result_data)
        print("Result data:")
        pprint(result_data)
        create_plot(output_folder=out_folder_csv, plot_name=plotname,
                    pdata=result_data)
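# Shape of result_data built above (illustrative only; dataset name and
# values are invented):
#
# {
#   'example_dataset': {
#     'results': {10: {'kmeans': {'duration': {0: 12.3},
#                                 'no_iterations': {0: 25}}}},
#     'infos': {'input_dimension': 1000,
#               'input_samples': 50000,
#               'input_annz': 42.0},
#   },
# }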
def getDetailsAtTime(self, timestep):
    collection = ExperimentDB.getExperimentDB(self.name)
    return collection.find_one({"_id": timestep})
def _runExperimentLoop(self, queue):
    collection = ExperimentDB.getExperimentDB(self.name)
    while self._maxiterations == -1 or self._iteration <= self._maxiterations:
        try:
            # Get next record
            record = self._dataset.getNextRecord()
        except StopIteration:
            self._isFinished.set()
            return None
        if self._stop.isSet():
            break

        # Feed record to model and get prediction
        modelResult = self._model.run(record)
        if modelResult is None:
            continue
        if modelResult.inferences[InferenceElement.anomalyVector] is not None:
            modelResult.inferences[InferenceElement.anomalyVector] = \
                modelResult.inferences[InferenceElement.anomalyVector].nonzero()[0].tolist()

        # Collect the five nearest KNN classifier prototypes for this record.
        distances = self._model._classifier_distances  # classifier.getSelf().getLatestDistances()
        sortedDistIdx = []
        sortedDists = []
        if distances is not None and len(distances) > 0:
            sortedDistIdx = distances.argsort()
            sortedDists = distances[sortedDistIdx[:5]].tolist()
            idList = self._model._classifier_indexes  # classifier.getSelf().getParameter('categoryRecencyList')
            if len(idList) > 0:
                # Map prototype indices back to record ids, compensating
                # for the classification delay.
                sortedDistIdx = [
                    idList[i] + self._model._classificationDelay - 1
                    for i in sortedDistIdx[:min(5, len(sortedDistIdx))]]
            else:
                sortedDistIdx = []
        # (debug) classifier.getSelf()._knn._Memory holds the prototype matrix.

        predictedField = self._modelDescription["predictedField"]
        predictedFieldIndex = self.getFieldNames().index(predictedField)
        # Encodings are large and not needed downstream; drop them before
        # serializing.
        modelResult.inferences['encodings'] = None
        modelResult.sensorInput.dataEncodings = None
        actual = modelResult.sensorInput.dataRow[predictedFieldIndex]
        dbelem = {"_id": self._iteration,
                  "actual": actual,
                  "inferences": json.dumps(modelResult.inferences),
                  "classificationIdx": json.dumps(sortedDistIdx),
                  "classificationDist": json.dumps(sortedDists)}
        collection.insert(dbelem)
        self._dataQ.put(dbelem)
        self._iteration += 1
        gevent.sleep(0)
    self._isFinished.set()
def _runExperimentLoadLoop(self, queue):
    collection = ExperimentDB.getExperimentDB(self.name)
    experimentData = collection.find()
    for record in experimentData:
        self._dataQ.put(record)
        gevent.sleep(0)
def getDataAtTime(self, dataInput):
    timestep = int(dataInput['timestep'])
    collection = ExperimentDB.getExperimentDB(self.name)
    experimentData = collection.find_one({"_id": timestep})
    return experimentData
def _runExperimentLoop(self, queue):
    self.prevFieldPred = {}
    self._model.resetSequenceStates()
    # Redirect stdout into a scratch file so the model's verbose output can
    # be captured per record.
    cOut = os.fdopen(os.open("/tmp/cerebro.cout", os.O_RDWR | os.O_CREAT), 'w+')
    oldC = os.dup(1)
    collection = ExperimentDB.getExperimentDB(self.name)
    while self._maxiterations == -1 or self._iteration <= self._maxiterations:
        try:
            # Get next record
            record = self._dataset.getNextRecord()
        except StopIteration:
            self._isFinished.set()
            return None
        if self._stop.isSet():
            break

        # Feed record to model and get prediction. Capture all the stdout as well.
        os.dup2(cOut.fileno(), 1)
        modelResult = self._model.run(record)
        os.dup2(oldC, 1)
        cOut.seek(0)
        verboseOutput = cOut.read()
        cOut.truncate(0)

        modelResult.inferences['encodings'] = None
        modelResult.sensorInput.dataEncodings = None

        model = self._model
        sensor = model._getSensorRegion()
        sp = model._getSPRegion()
        tp = model._getTPRegion()
        cl = model._getClassifierRegion()
        spImp = None
        tpImp = None
        if sp is not None:
            spImp = sp.getSelf()._sfdr
        if tp is not None:
            tpImp = tp.getSelf()._tfdr
        clImp = cl.getSelf()._claClassifier

        # Copy all the pertinent data
        sourceScalars = copy.deepcopy(sensor.getOutputData('sourceOut'))
        sensorBits = sensor.getOutputData('dataOut')
        sensorBUOut = sensorBits.nonzero()[0].tolist()

        SPBUOut = []
        nConnectedInputs = []
        overlaps = []
        if spImp is not None:
            SPBUOut = sp.getOutputData('bottomUpOut').nonzero()[0].tolist()
            nConnectedInputs = spImp._allConnectedM.nNonZerosPerRow()[SPBUOut].astype('int32').tolist()
            overlaps = zip(SPBUOut, spImp._overlapsNoBoost[SPBUOut].astype('int32').tolist())

        TPTDOut = tp.getOutputData('topDownOut') if tp else None
        sensorTDIn = sensor.getInputData('temporalTopDownIn')

        permanences = {}
        predictedCols = ()
        predictedConfidences = ()
        tpInfActiveCells = ()
        tpLrnActiveCells = ()
        tpInfPredT_1 = ()
        tpInfPredT = ()
        tpPredCells = []
        if TPTDOut is not None:
            predictedCols = TPTDOut.nonzero()[0].tolist()
            predictedConfidences = TPTDOut[predictedCols].tolist()
            tpInfActiveCells = self._formatActiveCells(tpImp.infActiveState['t'])
            tpLrnActiveCells = self._formatActiveCells(tpImp.lrnActiveState['t'])
            tpInfPredT_1 = self._formatActiveCells(tpImp.infPredictedState['t-1'])
            tpInfPredT = self._formatActiveCells(tpImp.infPredictedState['t'])
            tpPredCells = tpImp.infPredictedState['t'].nonzero()[0].tolist()

        sensorPredBits = []
        if sensorTDIn is not None:
            sensorPredBits = sensorTDIn

        if self.prevPredictedCols is None:
            self.prevPredictedCols = []
            self.prevTPPredictedCells = []
            self.prevPredictedConfs = []
            self.prevTPPredicted = []

        clPattern = clImp._patternNZHistory[-1]
        step = clImp.steps[0]
        bitHistories = {}

        # Per-field activations and top-down predictions, keyed by field name.
        fieldActivations = {}
        fieldPredictions = {}
        for fieldName, (start, stop) in self.fieldRanges.iteritems():
            nzBits = sensorBits[start:stop].nonzero()[0]
            fieldActivations[fieldName] = nzBits.tolist()
            nzBits = sensorPredBits[start:stop].nonzero()[0]
            fieldPredictions[fieldName] = nzBits.tolist()

        predictedField = self._modelDescription["predictedField"]
        predictedFieldIndex = self.getFieldNames().index(predictedField)
        actual = modelResult.sensorInput.dataRow[predictedFieldIndex]

        dthandler = lambda obj: obj.isoformat() \
            if isinstance(obj, datetime.datetime) else None
        record = {"_id": self._iteration,
                  "actual": actual,
                  "SPBUOut": SPBUOut,
                  "overlaps": overlaps,
                  "predictedCols": self.prevPredictedCols,
                  "tpInfActive": tpInfActiveCells,
                  "tpLrnActive": tpLrnActiveCells,
                  "tpPredicted": self.prevTPPredictedCells,
                  "tpInfPredT_1": tpInfPredT_1,
                  "tpInfPredT": tpInfPredT,
                  "permanences": permanences,
                  "inferences": json.dumps(modelResult.inferences),
                  "record": json.dumps(modelResult.rawInput, default=dthandler),
                  "fieldActivations": fieldActivations,
                  # TODO: for some reason, field predictions don't need to be shifted??
                  "fieldPredictions": fieldPredictions,
                  "verboseOutput": verboseOutput,
                  }
        collection.insert(record)
        self._dataQ.put(record)

        # Shift predictions by one step so each stored record pairs the
        # actual value with what was predicted for it.
        self.prevPredictedCols = predictedCols
        self.prevTPPredictedCells = tpPredCells
        self.prevPredictedConfs = predictedConfidences
        # self.prevTPPredicted = tpPredCells
        self.prevTPPredicted = None
        self.prevFieldPred = fieldPredictions
        self._iteration += 1
        gevent.sleep(0)

    os.close(oldC)
    cOut.close()
    self._isFinished.set()
def getDataAtTime(self, dataInput):
    timestep = int(dataInput['timestep'])
    collection = ExperimentDB.getExperimentDB(self.name)
    experimentData = collection.find_one({"_id": timestep})
    return experimentData
def POST(self):
    name = web.input()["name"]
    return json.dumps(ExperimentDB.delete(name))
def GET(self):
    return json.dumps(ExperimentDB.list())
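# Minimal wiring sketch (assumed, not from the original source): in web.py,
# handler classes such as the ones holding the POST/GET methods above are
# mapped to URL paths via an urls tuple. The class names and paths below are
# hypothetical; only web.input()/web.application are actual web.py API.
import web

urls = ("/experiments/delete", "ExperimentDelete",  # class with the POST above
        "/experiments", "ExperimentList")           # class with the GET above
app = web.application(urls, globals())

if __name__ == "__main__":
    app.run()  # web.py's built-in server, port 8080 by default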
def result_evaluation_minibatch_best_params(out_folder, out_folder_csv,
                                            remove_incomplete=False,
                                            ignore_datasets={}):
    fcnt, plotname = ('do_minibatch_best_params', 'kmeans_params')
    print(plotname)
    run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)
    result_data = OrderedDict()
    for run_identifier in run_identifiers:
        db = ExperimentDB(out_folder, fcnt, run_identifier)
        print(run_identifier)
        for resid in db.get_algorithm_run_ids():
            (control_params, params, res) = \
                db.get_experiment_result_from_run_id(resid)
            print(resid, control_params, params)
            if res is None:
                continue
            ds = params['info']['dataset_name']
            alg = params['info']['algorithm']
            no_clusters = params['task']['no_clusters']
            run = params['task']['run']
            duration_kmeans = res['duration_kmeans']
            no_iterations = len(res['iteration_changes'])
            iteration_durations = res['iteration_durations']
            iteration_changes = res['iteration_changes']
            iteration_wcssd = res['iteration_wcssd']

            # The swept parameter depends on the algorithm variant.
            if 'pca' in alg:
                param_percent = params['info']['truncated_svd_annz_percentage']
            elif 'bv' in alg:
                param_percent = params['task']['bv_annz']
            else:
                param_percent = 0

            if ds in ignore_datasets:
                continue

            # Build the nested result structure lazily.
            if ds not in result_data:
                result_data[ds] = OrderedDict()
                result_data[ds]['results'] = OrderedDict()
                result_data[ds]['infos'] = OrderedDict()
            if no_clusters not in result_data[ds]['results']:
                result_data[ds]['results'][no_clusters] = OrderedDict()
            if alg not in result_data[ds]['results'][no_clusters]:
                result_data[ds]['results'][no_clusters][alg] = OrderedDict()
            alg_data = result_data[ds]['results'][no_clusters][alg]
            for descr in ['iteration_durations', 'iteration_changes',
                          'iteration_wcssd', 'duration', 'no_iterations']:
                if descr not in alg_data:
                    alg_data[descr] = OrderedDict()
            for descr in ['iteration_durations', 'iteration_changes',
                          'iteration_wcssd', 'duration']:
                if run not in alg_data[descr]:
                    alg_data[descr][run] = OrderedDict()

            kmeans_duration_this_run = duration_kmeans
            if 'truncated_svd' in res:
                kmeans_duration_this_run += res['truncated_svd']['duration']

            if param_percent in alg_data['duration'][run]:
                raise Exception(
                    "dataset=%s no_clusters=%s alg=%s duration run=%s "
                    "already added !!! %s %s"
                    % (ds, str(no_clusters), alg, str(run),
                       control_params, params))

            alg_data['duration'][run][param_percent] = kmeans_duration_this_run
            result_data[ds]['infos']['input_dimension'] = res['input_dimension']
            result_data[ds]['infos']['input_samples'] = res['input_samples']
            result_data[ds]['infos']['input_annz'] = res['input_annz']

            if run in alg_data['no_iterations']:
                if alg_data['no_iterations'][run] != no_iterations:
                    print(alg, run, no_iterations,
                          alg_data['no_iterations'][run],
                          ds, no_clusters, param_percent, resid)
                    raise Exception(
                        "Number of iterations is not identical! "
                        "len(res['iteration_changes']) = %d, "
                        "no_iterations = %d, resid = %d"
                        % (no_iterations, alg_data['no_iterations'][run],
                           resid))
            else:
                alg_data['no_iterations'][run] = no_iterations

            alg_data['iteration_durations'][run][param_percent] = iteration_durations
            alg_data['iteration_changes'][run][param_percent] = iteration_changes
            alg_data['iteration_wcssd'][run][param_percent] = iteration_wcssd

    if remove_incomplete:
        remove_incomplete_data(result_data)
    return result_data
def result_evaluation_memory_consumption(out_folder, out_folder_csv):
    for fcnt, plotname in [('do_kmeans', 'kmeans_speeds')]:
        print(plotname)
        run_identifiers = ExperimentDB.get_identifiers(out_folder, fcnt)
        result_data = {}
        for run_identifier in run_identifiers:
            db = ExperimentDB(out_folder, fcnt, run_identifier)
            for resid in db.get_algorithm_run_ids():
                (control_params, params, res) = \
                    db.get_experiment_result_from_run_id(resid)
                if res is None:
                    continue
                ds = params['info']['dataset_name']
                alg = params['info']['algorithm']
                no_clusters = params['task']['no_clusters']
                run = params['task']['run']
                duration_kmeans = res['duration_kmeans']
                no_iterations = len(res['iteration_changes'])

                if ds not in result_data:
                    result_data[ds] = {'results': {}, 'infos': {}}
                results = result_data[ds]['results']
                if no_clusters not in results:
                    results[no_clusters] = {}
                if alg not in results[no_clusters]:
                    results[no_clusters][alg] = {}
                if 'duration' not in results[no_clusters][alg]:
                    results[no_clusters][alg]['duration'] = {}
                if 'no_iterations' not in results[no_clusters][alg]:
                    results[no_clusters][alg]['no_iterations'] = {}

                no_samples = res['input_samples']
                # Element sizes in bytes: float64 values, int32 column
                # indices, int64 row pointers (CSR layout).
                size_of_data_storage_element = 8
                size_of_key_storage_element = 4
                size_of_pointer_storage_element = 8

                if alg != 'kmeans':
                    no_clusters_remaining = res['no_clusters_remaining']

                if alg == 'kmeans':
                    mem_consumption = 0
                elif alg == 'elkan':
                    # elkan stores two dense matrices:
                    # 1. lower_bound_matrix = no_samples x no_clusters_remaining
                    # 2. distance_between_clusters_matrix =
                    #    no_clusters_remaining x no_clusters_remaining
                    lower_bound_matrix_mem_consumption = (
                        no_samples * no_clusters_remaining
                        * size_of_data_storage_element)
                    distance_between_clusters_matrix_mem_consumption = (
                        no_clusters_remaining * no_clusters_remaining
                        * size_of_data_storage_element)
                    mem_consumption = (
                        lower_bound_matrix_mem_consumption
                        + distance_between_clusters_matrix_mem_consumption)
                elif alg == 'pca_elkan':
                    # pca_elkan stores the two elkan matrices plus the SVD
                    # projection matrices:
                    # 1. lower_bound_matrix = no_samples x no_clusters_remaining
                    # 2. distance_between_clusters_matrix =
                    #    no_clusters_remaining x no_clusters_remaining
                    # 3. orthonormal_basis_matrix = no_components x no_features
                    # 4. projected_matrix_samples = no_samples x no_components
                    # 5. projected_matrix_clusters =
                    #    no_clusters_remaining x no_components
                    lower_bound_matrix_mem_consumption = (
                        no_samples * no_clusters_remaining
                        * size_of_data_storage_element)
                    distance_between_clusters_matrix_mem_consumption = (
                        no_clusters_remaining * no_clusters_remaining
                        * size_of_data_storage_element)
                    # The projection matrices are stored in sparse (CSR)
                    # format even though they are almost completely dense;
                    # this could be changed in the future.
                    orthonormal_basis_matrix_mem_consumption = (
                        res['truncated_svd']['no_components']
                        * res['truncated_svd']['no_features']
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((res['truncated_svd']['no_components'] + 1)
                           * size_of_pointer_storage_element)
                    projected_matrix_samples_mem_consumption = (
                        no_samples * res['truncated_svd']['no_components']
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((no_samples + 1) * size_of_pointer_storage_element)
                    projected_matrix_clusters_mem_consumption = (
                        no_clusters_remaining
                        * res['truncated_svd']['no_components']
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((no_clusters_remaining + 1)
                           * size_of_pointer_storage_element)
                    mem_consumption = (
                        lower_bound_matrix_mem_consumption
                        + distance_between_clusters_matrix_mem_consumption
                        + orthonormal_basis_matrix_mem_consumption
                        + projected_matrix_samples_mem_consumption
                        + projected_matrix_clusters_mem_consumption)
                elif alg == 'pca_kmeans':
                    # pca_kmeans stores the SVD projection matrices only:
                    # 1. orthonormal_basis_matrix = no_components x no_features
                    # 2. projected_matrix_samples = no_samples x no_components
                    # 3. projected_matrix_clusters =
                    #    no_clusters_remaining x no_components
                    # These matrices are stored in sparse (CSR) format even
                    # though they are almost completely dense; this could be
                    # changed in the future.
                    orthonormal_basis_matrix_mem_consumption = (
                        res['truncated_svd']['no_components']
                        * res['truncated_svd']['no_features']
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((res['truncated_svd']['no_components'] + 1)
                           * size_of_pointer_storage_element)
                    projected_matrix_samples_mem_consumption = (
                        no_samples * res['truncated_svd']['no_components']
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((no_samples + 1) * size_of_pointer_storage_element)
                    projected_matrix_clusters_mem_consumption = (
                        no_clusters_remaining
                        * res['truncated_svd']['no_components']
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((no_clusters_remaining + 1)
                           * size_of_pointer_storage_element)
                    mem_consumption = (
                        orthonormal_basis_matrix_mem_consumption
                        + projected_matrix_samples_mem_consumption
                        + projected_matrix_clusters_mem_consumption)
                elif alg == 'kmeans_optimized':
                    # kmeans_optimized stores the block-vector projections:
                    # 1. projected_matrix_samples = no_samples x dim
                    # 2. projected_matrix_clusters = no_clusters_remaining x dim
                    annz_projected_matrix_samples = res['block_vector_data']['annz']
                    # annz of the projected cluster matrix was not measured;
                    # we use annz_projected_matrix_samples as an approximation.
                    annz_projected_matrix_clusters = annz_projected_matrix_samples
                    projected_matrix_samples_mem_consumption = (
                        annz_projected_matrix_samples * no_samples
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((no_samples + 1) * size_of_pointer_storage_element)
                    projected_matrix_clusters_mem_consumption = (
                        annz_projected_matrix_clusters * no_clusters_remaining
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((no_clusters_remaining + 1)
                           * size_of_pointer_storage_element)
                    mem_consumption = (
                        projected_matrix_samples_mem_consumption
                        + projected_matrix_clusters_mem_consumption)
                elif alg == 'yinyang':
                    # yinyang keeps a dense matrix with one lower bound per
                    # sample for each of the t cluster groups.
                    t = no_clusters_remaining / 10
                    mem_consumption = (no_samples * t
                                       * size_of_data_storage_element)
                elif alg == 'fast_yinyang':
                    # fast_yinyang adds the block-vector projection matrices
                    # to the yinyang lower-bound matrix:
                    # 1. lower_bound_group_matrix = no_samples x t
                    # 2. projected_matrix_samples = no_samples x dim
                    # 3. projected_matrix_clusters = no_clusters_remaining x dim
                    t = no_clusters_remaining / 10
                    lower_bound_group_matrix_mem_consumption = (
                        no_samples * t * size_of_data_storage_element)
                    annz_projected_matrix_samples = res['block_vector_data']['annz']
                    # annz of the projected cluster matrix was not measured;
                    # we use annz_projected_matrix_samples as an approximation.
                    annz_projected_matrix_clusters = annz_projected_matrix_samples
                    projected_matrix_samples_mem_consumption = (
                        annz_projected_matrix_samples * no_samples
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((no_samples + 1) * size_of_pointer_storage_element)
                    projected_matrix_clusters_mem_consumption = (
                        annz_projected_matrix_clusters * no_clusters_remaining
                        * (size_of_data_storage_element
                           + size_of_key_storage_element)) \
                        + ((no_clusters_remaining + 1)
                           * size_of_pointer_storage_element)
                    mem_consumption = (
                        lower_bound_group_matrix_mem_consumption
                        + projected_matrix_samples_mem_consumption
                        + projected_matrix_clusters_mem_consumption)
                else:
                    raise Exception(
                        "please provide details for the memory consumption "
                        "of %s" % alg)

                kmeans_duration_this_run = duration_kmeans
                if 'truncated_svd' in res:
                    kmeans_duration_this_run += res['truncated_svd']['duration']

                # Convert bytes to MiB.
                mem_consumption = (mem_consumption / 1024.0) / 1024.0
                results[no_clusters][alg]['duration'][run] = (
                    float(mem_consumption), kmeans_duration_this_run)
                result_data[ds]['infos']['input_dimension'] = res['input_dimension']
                result_data[ds]['infos']['input_samples'] = res['input_samples']
                result_data[ds]['infos']['input_annz'] = res['input_annz']
                results[no_clusters][alg]['no_iterations'][run] = no_iterations

        remove_incomplete_data(result_data)
        print("Result data:")
        pprint(result_data)
        create_plot(output_folder=out_folder_csv, plot_name=plotname,
                    pdata=result_data)
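# The CSR memory estimate above repeats the same formula for every sparse
# matrix: nnz values + nnz column indices + (rows + 1) row pointers. A small
# helper capturing it (hypothetical, not part of the original code):
def csr_mem_bytes(n_rows, nnz,
                  data_bytes=8,      # float64 values
                  index_bytes=4,     # int32 column indices
                  pointer_bytes=8):  # int64 row pointers
    """Approximate memory footprint of a CSR matrix in bytes."""
    return nnz * (data_bytes + index_bytes) + (n_rows + 1) * pointer_bytes

# Example: the projected samples matrix of the PCA variants is dense but
# stored in CSR form, so nnz = no_samples * no_components:
# projected_matrix_samples_mem_consumption = csr_mem_bytes(
#     n_rows=no_samples,
#     nnz=no_samples * res['truncated_svd']['no_components'])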