def clusterTestDataUsingMultipleClusterings(self, eventlog, testData):
    """Assign a cluster label to every test event using the per-activity
    clustering models trained earlier.

    For each activity occurring in testData the events are framed,
    predicted with that activity's model, and the predicted label is
    appended to each event row. Activities with fewer than two events or
    without a trained model get the constant label 0.
    """
    eventAttributes = eventlog.data["attributes"]["event"]
    activities = eventlog.getActivityOccurrences(testData)
    for activityId, activity in activities.items():
        occurrences = activity["occ"]
        numEvents = len(occurrences)
        writeLog("Clustering %d test events for activity: %s (id: %s)" %
                 (numEvents, activity["name"], activityId))
        model = self.model[activityId] if activityId in self.model else None
        if (numEvents < 2) or (model is None):
            # Too little data or no trained model: constant cluster 0.
            for e in occurrences:
                e.append(0)
            continue
        # Event attribute values start at index 2 of each event row.
        maxLen = len(eventAttributes) + 2
        events = [e[2:maxLen] for e in occurrences]
        df = pd.DataFrame(events, columns=eventAttributes)
        labels = self.predict(df, model, self.vectorizer[activityId],
                              self.known_values[activityId])
        for e, label in zip(occurrences, labels):
            e.append(label)
def predict(self, df, model, vectorizer, known_values):
    """Predict cluster labels for the events in df with a trained model.

    Values that were filtered out as "unusual" during training (i.e. not
    present in known_values for the column) are replaced by OTHER_TOKEN
    before vectorization, so the vectorizer only sees feature values it
    was fitted on.
    """
    threshold = self.parameters["ignore_values_threshold"] * len(df)
    if threshold > 0:
        for col in df.columns:
            writeLog("Replacing unusual values in column %s." % (col))
            if col in known_values:
                isin = df[col].isin(known_values[col])
                # Fix: the original "df[col].loc[-isin] = ..." relied on
                # deprecated unary minus on a boolean Series and on chained
                # assignment, which can silently fail to write back.
                # Assigning through df.loc with ~mask updates the frame.
                df.loc[~isin, col] = OTHER_TOKEN
    writeLog("Vectorizing data frame of shape: %s" % (str(df.shape)))
    XX = vectorizer.transform(df.to_dict(orient='records'))
    alg = algorithms[self.algorithm]
    return alg["predict"](XX, model)
def setTrainingSize(self, parameters, pTraining):
    """Split the log's cases into training and test sets.

    pTraining is the fraction of cases to use for training. If
    "max_num_cases_in_training" is set and smaller than the number of
    cases, a random subset of that size is drawn first.
    """
    cases = np.asarray(self.data["cases"])
    maxNumCases = parameters["max_num_cases_in_training"]
    if (maxNumCases is not None) and (maxNumCases < len(cases)):
        writeLog("Filtering out %d cases out of %d" %
                 (maxNumCases, len(cases)))
        # Draw through self.rng (not the global np.random) so the subsample
        # is reproducible together with the permutation below.
        cases = self.rng.choice(cases, maxNumCases, replace=False)
    self.data["cases"] = cases
    nTraining = int(len(cases) * pTraining)
    indexes = self.rng.permutation(len(cases))
    self.trainingData = cases[indexes[:nTraining]]
    self.testData = cases[indexes[nTraining:]]
    self.initializeDerivedData()
def testPaused(parameters):
    """Block while a "pause" marker file exists; return once it is gone.

    The marker file is searched for as given, then under the input
    directory, then under the output directory.
    """
    wasPaused = False
    while True:
        name = parameters["pause_filename"]
        candidates = (name, getInputPath() + name, getOutputPath() + name)
        # First existing candidate, or None when the marker is gone.
        existing = next((c for c in candidates if isFile(c)), None)
        if existing is None:
            break
        if not wasPaused:
            writeLog("Tests paused until file is removed: %s" % existing)
            wasPaused = True
        sleep(1)
    if wasPaused:
        writeLog("Tests continued...")
def performCrossValidatedTestsForFullEventLog(self):
    """Run k-fold cross-validation over the whole event log."""
    parameters = self.parameters
    nSplits = parameters["cross-validation-splits"]
    writeLog("Performing cross-validation using %d splits" % (nSplits))
    fullTestData = np.asarray(self.data["cases"])
    self.initializationReport()
    kf = KFold(n_splits=nSplits, random_state=self.rng, shuffle=True)
    for runIndex, (trainIndex, testIndex) in enumerate(
            kf.split(fullTestData), start=1):
        # Record the current run number so it ends up in the result rows.
        parameters["cross-validation-run"] = runIndex
        self.performCrossValidationRun(fullTestData, trainIndex, testIndex,
                                       parameters)
def train_hashvalue(df, parameters):
    """Cluster rows by exact content: each distinct row hash gets a label.

    Returns (hashes, labels, allLabels) where hashes maps row-hash -> label,
    labels holds one label per input row, and allLabels lists every label
    that was actually assigned.
    """
    hashes = {}
    nextHashId = 0
    labels = []
    for row in df:
        hashValue = hash(tuple(row))
        if hashValue in hashes:
            hashId = hashes[hashValue]
        else:
            nextHashId += 1
            hashId = hashes[hashValue] = nextHashId
        labels.append(hashId)
    writeLog(
        "Hashvalue clustering resulted into %d unique hash values for %d rows."
        % (len(hashes), len(labels)))
    # Labels are assigned starting from 1, so the complete label list is
    # 1..nextHashId. (The original returned range(nextHashId), which listed
    # a label 0 that is never assigned and omitted label nextHashId.)
    return hashes, labels, list(range(1, nextHashId + 1))
def initializeDerivedData(self, forSplittedEventLog=False):
    """Build lookup tables derived from the raw event log data.

    Populates self.activities (keyed by activity id), self.activitiesByLabel
    (keyed by name with spaces replaced by underscores) and — unless this log
    is a split of another log — the per-case activity occurrence counts
    stored in each case's "occ" field. Always resets self.flows.
    """
    self.activities = {}
    self.activitiesByLabel = {}
    if "activities" in self.data:
        activityList = self.data["activities"]
        for activity in activityList:
            self.activities[activity["id"]] = {
                "name": activity["name"],
                "occ": []
            }
            self.activitiesByLabel[activity["name"].replace(" ", "_")] = activity
        if not forSplittedEventLog:
            writeLog("Initializing activity counts for %d cases" %
                     (len(self.data["cases"])))
            for case in self.data["cases"]:
                # Count how often each activity id occurs in the case's trace.
                counts = collections.Counter(event[0] for event in case["t"])
                case["occ"] = [counts[activity["id"]] for activity in activityList]
    self.flows = {}
def trainForCaseClustering(self, eventlog, cases):
    """Cluster whole cases and store the cluster id in each case's
    "_cluster" field.

    Feature columns are assembled from case attributes (prefixed "A_")
    and/or per-activity occurrence counts (prefixed "O_"), depending on the
    four flags returned by getCaseFeatureGroupsToInclude(). When case
    attributes are fully disabled, every case gets cluster 0 and nothing is
    trained; when only raw case attributes are enabled, the unusual-value
    filter is computed but no clustering model is trained.
    """
    if self.parameters["disable_case_attributes"] and self.parameters[
            "disable_raw_case_attributes"]:
        writeLog("Case clustering not needed. Skipping it.")
        for t in cases:
            t["_cluster"] = 0
        return
    writeLog("Clustering %d cases" % (len(cases)))
    t0 = time()
    data = []
    cols = []
    # ica/iao = include case attributes / activity occurrences; separate
    # flags control whether each group is used for clustering vs filtering.
    ica_clustering, iao_clustering, ica_filtering, iao_filtering = self.getCaseFeatureGroupsToInclude(
    )
    ica_cols = []
    iao_cols = []
    if ica_filtering:
        # When both groups are filtered, each row is attributes + occurrence
        # counts; the occurrence columns are named in the next block.
        data += [c["a"] + c["occ"] for c in cases
                 ] if iao_filtering else [c["a"] for c in cases]
        ica_cols = ["A_" + a for a in eventlog.data["attributes"]["case"]]
        cols += ica_cols
    if iao_filtering:
        if not ica_filtering:
            data += [c["occ"] for c in cases]
        iao_cols = ["O_" + a["name"] for a in eventlog.data["activities"]]
        cols += iao_cols
    df = pd.DataFrame(data, columns=cols)
    # Filtering is applied to all included columns...
    self.known_values = self.filterUnusualValues(df, self.parameters)
    # ...but columns belonging to a group that is filter-only are dropped
    # before clustering.
    if (ica_filtering and (not ica_clustering)):
        df = df.drop(ica_cols, axis=1)
    if (iao_filtering and (not iao_clustering)):
        df = df.drop(iao_cols, axis=1)
    # NOTE(review): "Cost" looks like a dataset-specific column exclusion —
    # confirm whether it should be configurable.
    if ("Cost" in df.columns):
        df = df.drop(["Cost"], axis=1)
    if ("_cluster" in df.columns):
        df = df.drop(["_cluster"], axis=1)
    if not self.parameters["disable_case_attributes"]:
        self.model, self.vectorizer, labels = self.train(
            df, self.parameters)
        for i, d in enumerate(labels):
            cases[i]["_cluster"] = d
        writeLog("Case clustering done in %0.3fs" % (time() - t0))
    else:
        # Raw attributes only: the filter (known_values) is kept, but no
        # clustering model is trained and no cluster ids are assigned here.
        self.model = None
        self.vectorizer = None
        writeLog("Case data filtering done in %0.3fs" % (time() - t0))
def __init__(self,
             parameters,
             rng,
             filename=None,
             pTraining=0.0,
             modelCluster=None,
             inputJson=None):
    """Initialize an event log from a JSON string or a file.

    parameters: global parameter dict (copied).
    rng: random number generator used for sampling and splitting.
    filename: dataset file to load when inputJson is not given; resolved
        through getInputDatasetFilename when not an existing path.
    pTraining: fraction of cases used for training; None skips splitting.
    modelCluster: optional trained cluster whose first model supplies
        missing "activities"/"attributes" metadata.
    inputJson: JSON string with the log contents, takes precedence over
        filename.
    """
    writeLog("Initializing event log")
    self.rng = rng
    self.parameters = dict(parameters)
    self.trainingData = []
    self.testData = []
    if inputJson is not None:
        self.data = json.loads(inputJson)
        self.filename = "unnamed"
        self.filepath = ""
    elif filename is not None:
        path = Path(filename)
        if not path.is_file():
            filename = getInputDatasetFilename(filename)
        self.filepath = filename
        self.filename = ntpath.basename(filename)
        with open(filename) as f:
            self.data = json.load(f)
    else:
        # No data source given: leave the object only partially initialized.
        return
    self.pTraining = pTraining
    if pTraining is None:
        return
    if modelCluster is not None:
        # Borrow metadata from an already-trained model when missing.
        model = modelCluster.models[0]
        if "activities" not in self.data:
            self.data["activities"] = model.eventlogActivities
        if "attributes" not in self.data:
            self.data["attributes"] = model.eventlogAttributes
    self.setTrainingSize(parameters, pTraining)
    self.initializationReport()
def __init__(self,
             algorithm=None,
             globalParameters=None,
             parameters=None,
             copyFrom=None):
    """Create a clustering configuration.

    Either copies algorithm and parameters from copyFrom, or uses the given
    algorithm with parameters layered on top of globalParameters (when
    globalParameters is given, it is copied and updated with parameters;
    otherwise parameters is used as-is).
    """
    if copyFrom is not None:
        self.algorithm = copyFrom.algorithm
        self.parameters = dict(copyFrom.parameters)
    else:
        self.algorithm = algorithm
        if globalParameters is not None:
            self.parameters = dict(globalParameters)
            self.parameters.update(parameters)
        else:
            self.parameters = parameters
    writeLog("Creating new clustering object for algorithm: " +
             self.algorithm)
    self.model = None
    self.vectorizer = None
    self.known_values = None
    self.labels = []
def waitForConfiguration(origFilename, parameters):
    """Wait until a configuration file appears, then load and consume it.

    The file is searched for as given, then under the input directory, then
    under the output directory. Once found it is loaded with
    loadConfiguration, deleted, and the loaded configuration returned.
    """
    wasPaused = False
    filename = None
    while True:
        candidates = (origFilename, getInputPath() + origFilename,
                      getOutputPath() + origFilename)
        filename = next((c for c in candidates if isFile(c)), None)
        if filename is not None:
            break
        if not wasPaused:
            writeLog(
                "Tests paused until a new configuration file appears in: %s"
                % origFilename)
            wasPaused = True
        sleep(1)
    if wasPaused:
        writeLog("Got new configuration. Continuing...")
    # Fixed "%a" (ascii() conversion, which would log the path quoted) to
    # the intended "%s".
    writeLog("Reading new configuration from %s" % filename)
    result = loadConfiguration(filename, parameters)
    os.remove(filename)
    return result
def train_xmeans(df, parameters):
    """Estimate a good cluster count with X-Means, then run k-means with it.

    X-Means is seeded with K-Means++ centers (instead of random ones) and
    allowed to grow up to "max_num_clusters" clusters; the resulting count
    and centers are handed to do_train_kmeans for the final model.
    """
    max_num_clusters = parameters["max_num_clusters"]
    num_clusters = parameters["num_clusters"]
    # Seed with at most one center per available row.
    seed_count = min(df.shape[0], num_clusters)
    initial_centers = kmeans_plusplus_initializer(df, seed_count).initialize()
    instance = xmeans(df, initial_centers, ccore=True, kmax=max_num_clusters)
    instance.process()
    clusters = instance.get_clusters()
    writeLog(
        "X-means clustered using %d clusters (init: %d, max: %d). Using that as the desired number of clusters for k-means."
        % (len(clusters), num_clusters, max_num_clusters))
    return do_train_kmeans(df, len(clusters), instance.get_centers())
def performCrossValidationRun(self, fullTestData, trainIndex, testIndex,
                              parameters):
    """Execute one cross-validation fold: train on the cases selected by
    trainIndex, then test on the cases selected by testIndex.

    A fresh empty copy of this event log is used for each phase. The model
    cluster is configured with separate case- and event-clustering setups
    built from the parameter dict.
    """
    maxNumCases = parameters["max_num_cases_in_training"]
    cvRunIndex = parameters["cross-validation-run"]
    nSplits = parameters["cross-validation-splits"]
    writeLog("Starting cross-validation run %d of %d" %
             (cvRunIndex, nSplits))
    if (maxNumCases != None) and (maxNumCases < len(trainIndex)):
        # Cap the training fold size by random subsampling.
        writeLog("Filtering out %d cases out of %d" %
                 (maxNumCases, len(trainIndex)))
        trainIndex = np.random.choice(trainIndex, maxNumCases, replace=False)
    # --- Training phase ---
    runEventLog = self.createEmptyCopy()
    runEventLog.data["cases"] = fullTestData[trainIndex]
    runEventLog.pTraining = parameters["test_data_percentage"]
    runEventLog.setTrainingSize(parameters, runEventLog.pTraining)
    runEventLog.initializationReport()
    m = ModelCluster(runEventLog.rng)
    m.initialize(
        parameters=parameters,
        case_clustering=Clustering(
            parameters["case_clustering_method"], parameters, {
                "num_clusters": parameters["num_case_clusters"],
                "max_num_clusters": parameters["max_num_case_clusters"],
                "ignore_values_threshold":
                parameters["ignore_values_threshold_for_case_attributes"]
            }),
        event_clustering=Clustering(
            parameters["event_clustering_method"], parameters, {
                "num_clusters": parameters["num_event_clusters"],
                "max_num_clusters": parameters["max_num_event_clusters"],
                "ignore_values_threshold":
                parameters["ignore_values_threshold_for_event_attributes"]
            }),
        rng=runEventLog.rng)
    trainResult = m.train(runEventLog)
    # --- Test phase: all fold-test cases become test data, none training ---
    writeLog("Starting cross-validation test for run %d" % (cvRunIndex))
    runEventLog = self.createEmptyCopy()
    runEventLog.data["cases"] = fullTestData[testIndex]
    runEventLog.testData = fullTestData[testIndex]
    runEventLog.trainingData = []
    runEventLog.pTraining = 0.0
    runEventLog.initializeDerivedData()
    runEventLog.initializationReport()
    maxNumTraces = parameters[
        "max_num_traces_in_testing"] if "max_num_traces_in_testing" in parameters else None
    m.test(runEventLog, 1.0, trainResult, maxNumTraces)
def splitLog(self, eventlog, onlyTest=False):
    """Split an event log into one sub-log per sub-model by k-means
    clustering the cases on their activity-occurrence vectors.

    When onlyTest is False a MiniBatchKMeans model and a DictVectorizer are
    fitted on the training cases; test cases are always assigned with the
    already-fitted model/vectorizer. Returns the list of sub-logs.

    NOTE(review): with onlyTest=True this relies on caseClusterModel and
    caseClusterVectorizer having been fitted (or loaded) earlier — confirm
    callers guarantee that.
    """
    self.eventlog = eventlog
    true_k = len(self.models)
    if (true_k == 1):
        # Single model: no splitting needed.
        return [self.eventlog]
    t0 = time()
    result = [
        self.eventlog.createEmptyCopy(self.parameters)
        for model in self.models
    ]
    if (not onlyTest):
        # Feature matrix: per-case activity occurrence counts.
        cases = np.array([c["occ"] for c in self.eventlog.trainingData])
        df = pd.DataFrame(
            cases,
            columns=[a["name"] for a in self.eventlog.data["activities"]])
        self.caseClusterVectorizer = DictVectorizer(sparse=False)
        X = self.caseClusterVectorizer.fit_transform(
            df.to_dict(orient='records'))
        writeLog("Event log splitting done in %fs" % (time() - t0))
        writeLog("n_samples: %d, n_features: %d" % X.shape)
        # MiniBatchKMeans is used for speed on potentially large case sets.
        self.caseClusterModel = MiniBatchKMeans(n_clusters=true_k,
                                                init='k-means++',
                                                n_init=1,
                                                init_size=1000,
                                                batch_size=1000,
                                                verbose=False)
        t0 = time()
        x = self.caseClusterModel.fit(X)
        writeLog("done in %0.3fs" % (time() - t0))
        # Route each training case to the sub-log of its cluster.
        for i, d in enumerate(x.labels_):
            result[d].addTrace(self.eventlog.trainingData[i], True)
    # Test cases are assigned with the fitted model (never re-fitted here).
    cases = np.array([c["occ"] for c in self.eventlog.testData])
    df = pd.DataFrame(
        cases,
        columns=[a["name"] for a in self.eventlog.data["activities"]])
    XX = self.caseClusterVectorizer.transform(df.to_dict(orient='records'))
    x = self.caseClusterModel.predict(XX)
    for i, d in enumerate(x):
        result[d].addTrace(self.eventlog.testData[i], False)
    for eventlog in result:
        eventlog.initializeDerivedData(True)
    return result
def load(self, filename, parameters):
    """Restore a previously saved model cluster from a pickle file.

    The file is looked up as given, falling back to the output directory.
    Saved parameters override the passed-in ones. All neural-network
    hyperparameters are restored from the "nn_params" section and each
    sub-model is loaded from "saved_models".

    SECURITY NOTE: pickle.load executes arbitrary code from the file —
    only load model files from trusted sources.
    """
    path = Path(filename)
    if (not path.is_file()):
        filename = getOutputPath() + filename
    with open(filename, 'rb') as f:
        saved = pickle.load(
            f
        )  # https://groups.google.com/d/msg/lasagne-users/w8safJOJYvI/SvdiuIHIDQAJ
    # Saved parameters take precedence over the caller-supplied defaults.
    self.parameters = dict(parameters)
    self.parameters.update(saved["parameters"])
    self.caseClusterModel = saved["case_cluster_model"]
    self.caseClusterVectorizer = saved["case_cluster_vectorizer"]
    self.case_clustering = saved["case_clustering"]
    self.event_clustering = saved["event_clustering"]
    # Neural network hyperparameters.
    self.algorithm = saved["nn_params"]["algorithm"]
    self.num_layers = saved["nn_params"]["num_layers"]
    self.optimizer = saved["nn_params"]["optimizer"]
    self.learning_rate = saved["nn_params"]["learning_rate"]
    self.batch_size = saved["nn_params"]["batch_size"]
    self.num_callbacks = saved["nn_params"]["num_callbacks"]
    self.case_name = saved["nn_params"]["case_name"]
    self.hidden_dim_size = saved["nn_params"]["hidden_dim_size"]
    self.num_iterations_between_reports = saved["nn_params"][
        "num_iterations_between_reports"]
    self.grad_clipping = saved["nn_params"]["grad_clipping"]
    self.predict_only_outcome = saved["nn_params"]["predict_only_outcome"]
    self.final_trace_only = saved["nn_params"]["final_trace_only"]
    self.max_num_words = saved["nn_params"]["max_num_words"]
    self.trace_length_modifier = saved["nn_params"][
        "trace_length_modifier"]
    self.truncate_unknowns = saved["nn_params"]["truncate_unknowns"]
    self.num_models = saved["nn_params"]["num_models"]
    self.models = []
    for i in range(self.num_models):
        writeLog("Loading model %d of %d" % (i + 1, self.num_models))
        model = Model(self.parameters)
        self.models.append(model)
        model.load(saved["saved_models"][i])
def train(self, eventlog):
    """Train every sub-model on its share of the split event log.

    Splits the log across the sub-models, trains each in turn, and
    aggregates per-model statistics. Returns a dict with the overall
    success rate, dataset size, time usage and averaged per-model metrics.
    """
    self.eventlogs = self.splitLog(eventlog)
    writeLog("Trace distribution by models:")
    trainDatasetSize = 0
    for i, eventlog in enumerate(self.eventlogs):
        writeLog(
            "Model #%d: Train: %d traces, Test: %d traces" %
            (i + 1, len(eventlog.trainingData), len(eventlog.testData)))
        trainDatasetSize += len(eventlog.trainingData) + len(
            eventlog.testData)
    tutrain = 0
    numSuccess = 0
    numFail = 0
    titu = 0
    litu = 0
    numEpochs = []
    ivs = []
    bestIterations = []
    for i, eventlog in enumerate(self.eventlogs):
        model = self.models[i]
        writeLog("Training model %d of %d" % (i + 1, len(self.eventlogs)))
        ns, ne, tu = model.train(eventlog)
        numEpochs.append(model.epoch)
        ivs.append(len(model.word_to_index))
        bestIterations.append(model.best_iteration)
        tutrain += tu
        numSuccess += ns
        numFail += ne
        titu += model.train_initialization_time_used
        litu += model.layer_initialization_time_used
    # Success rate = successes / all attempts. (The original divided by the
    # number of failures, which is an odds ratio, not a rate, and raised
    # ZeroDivisionError whenever training had no failures — the same
    # successes/total formula is used for sr_test in test().)
    numTotal = numSuccess + numFail
    srtrain = (numSuccess / numTotal) if numTotal > 0 else 0.0
    writeLog("Total time used in training: %d (success rate = %f)" %
             (tutrain, srtrain))
    return {
        "success_rate": srtrain,
        "train_dataset_size": trainDatasetSize,
        "train_time_used": tutrain,
        "train_init_time_used": titu,
        "layer_init_time_used": litu,
        "num_epochs": np.mean(np.asarray(numEpochs)),
        "test_iterations": self.parameters["num_callbacks"],
        "input_vector_size": np.mean(ivs),
        "best_iteration": np.mean(bestIterations)
    }
def do_train_kmeans(df, num_clusters, centers=None):
    """Fit a k-means model with the requested number of clusters.

    With centers=None a MiniBatchKMeans model is fitted from scratch;
    otherwise a plain KMeans is started from the provided centers (e.g.
    those produced by X-means) and refined for a single iteration.
    Degenerate input (no columns, or fewer than two clusters) yields no
    model and constant labels. Returns (model, labels, allLabels).
    """
    if (df.shape[1] == 0) or (num_clusters < 2):
        writeLog(
            "No columns in the table to be clustered. Returning constant labels."
        )
        model = None
        labels = len(df) * [0]
        return model, labels, [0]
    # "is None" instead of "== None": centers may be an ndarray, for which
    # == None broadcasts element-wise and breaks the truth test.
    if centers is None:
        model = MiniBatchKMeans(n_clusters=num_clusters,
                                init='k-means++',
                                n_init=1,
                                init_size=1000,
                                batch_size=1000,
                                verbose=False)
    else:
        model = KMeans(n_clusters=num_clusters,
                       init=np.asarray(centers),
                       n_init=1,
                       max_iter=1)
    x = model.fit(df)
    writeLog("K-means model created for %d clusters." % (model.n_clusters))
    return model, x.labels_, list(range(model.n_clusters))
def filterUnusualValues(self, df, parameters):
    """Replace rarely occurring values in every column with OTHER_TOKEN.

    A value is kept when it occurs in strictly more rows than
    ignore_values_threshold * len(df). Mutates df in place and returns a
    dict mapping each column name to the index of its kept values.
    """
    writeLog("Number of colums to filter unusual values from %d" %
             (len(df.columns)))
    t0 = time()
    threshold = parameters["ignore_values_threshold"] * len(df)
    known_values = {}
    for col in df.columns:
        writeLog(
            "Replacing unusual values in column '%s' with minimum usage of %d rows."
            % (col, threshold))
        counts = df[col].value_counts()
        toKeep = counts[counts > threshold].index
        toRemove = counts[counts <= threshold].index
        known_values[col] = toKeep
        writeLog(
            "Remaining known values: %s (removed %d values out of %d values)"
            % (str([i for i in toKeep]), len(toRemove), len(toKeep)))
        if len(toRemove) > 0:
            df[col].replace(toRemove, OTHER_TOKEN, inplace=True)
    writeLog("Unusual value filtering done in %f s" % (time() - t0))
    return known_values
def preProcessForTraining(self, parameters):
    """Collect transition-duration statistics from the training data.

    Unless durations are disabled, every consecutive activity pair in each
    training case is recorded as a "flow" keyed "from->to", accumulating
    the time deltas between the two events. Each flow then gets summary
    statistics (min/max/avg/median/percentiles) plus the "fast"/"slow"
    thresholds used for duration bucketing.
    """
    disableDurations = parameters["disable_durations"]
    if not disableDurations:
        numEvents = 0
        writeLog("Pre-processing %d cases" % (len(self.trainingData)))
        for c in self.trainingData:
            prev = None
            prevDate = None
            evts = c["t"]
            numEvents += len(evts)
            for e in evts:
                # e[0] = activity id, e[1] = timestamp string.
                eDate = parse_date(e[1])
                if prev is not None:
                    key = "%s->%s" % (prev[0], e[0])
                    if key in self.flows:
                        flow = self.flows[key]
                    else:
                        flow = self.flows[key] = {"name": key, "occ": []}
                    delta = eDate - prevDate
                    flow["occ"].append(delta)
                prevDate = eDate
                prev = e
        writeLog(
            "Total number of events in training data: %d (Average case length: %f)"
            % (numEvents, (numEvents / len(self.trainingData))))
    writeLog("Pre-processing %d flows" % (len(self.flows)))
    for key in self.flows:
        f = self.flows[key]
        nOcc = len(f["occ"])
        f["occ"].sort()
        if nOcc > 0:
            # (Removed unused locals "min"/"max" that shadowed the builtins.)
            f["min"] = f["occ"][0]
            f["max"] = f["occ"][nOcc - 1]
            f["avg"] = np.mean(f["occ"])
            f["med"] = np.median(f["occ"])
            f["perc10"] = np.percentile(f["occ"], 10)
            f["perc25"] = np.percentile(f["occ"], 25)
            f["perc75"] = np.percentile(f["occ"], 75)
            f["perc90"] = np.percentile(f["occ"], 90)
            f["diff"] = f["max"] - f["min"]
            # Bucketing thresholds used when labeling durations.
            f["fast"] = f["perc10"]
            f["slow"] = f["perc90"]
# ---------------------------------------------------------------------------
# Module-level test-runner bootstrap.
# NOTE(review): the first statements below duplicate the tail of main() —
# this looks like a file-extraction artifact; confirm the intended
# execution order against the original source before relying on it.
# ---------------------------------------------------------------------------
os.remove(started_tests_filename)
if ("test_config_filename" in default_parameters) and (
        default_parameters["test_config_filename"] != None):
    # Wait for the next configuration file to appear and start a new round.
    parameters = dict(default_parameters)
    configuration = waitForConfiguration(
        parameters["test_config_filename"], parameters)
    main(configuration, parameters)
writeLog("Tests finished.")

# File used to persist the not-yet-finished test configurations so an
# interrupted run can be resumed.
started_tests_filename = default_parameters[
    "output_directory"] + "current-tests.json"
if (opts.configuration_from_standard_input):
    writeLog("Reading configuration from standard input")
    jsonConfig = sys.stdin.readline()
    configuration = json.loads(jsonConfig)
    writeLog("Standard input reading finished")
parameters = dict(default_parameters)
# NOTE(review): this resets any configuration read from standard input
# above — presumably only correct when that option is unset; verify.
configuration = None
if (opts.configuration_filename != None):
    configuration = loadConfiguration(opts.configuration_filename,
                                      parameters)
configure(parameters["input_directory"], parameters["output_directory"],
          opts.log_to_file_only)
writeLog(__doc__)

if __name__ == '__main__':
    main(configuration, parameters)
def main(configuration, parameters):
    """Run (or resume) the queued test configurations.

    When a configuration is given, test configurations are either resumed
    from started_tests_filename or generated via collect(); the remaining
    queue is re-persisted after every test so an interrupted run can
    continue where it left off. Afterwards, if a test_config_filename is
    configured, the function waits for the next configuration file and
    recurses into a new round.
    """
    def saveConfigs(testConfigs):
        # Persist the remaining queue so a crash/restart can resume.
        jsonConfig = json.dumps(testConfigs)
        with open(started_tests_filename, "w") as f:
            f.write(jsonConfig)

    if configuration != None:
        tests = []
        if Path(started_tests_filename).is_file():
            # Resume: the queue file holds the tests not yet finished.
            tsts = None
            with open(started_tests_filename) as data:
                tsts = json.load(data)
            for t in tsts:
                # Each stored test overrides the defaults.
                ts = dict(default_parameters)
                ts.update(t)
                tests.append(ts)
            writeLog("Loaded remaining %d test configurations from %s." %
                     (len(tests), started_tests_filename))
        else:
            # Fresh start: expand the configuration into concrete tests.
            if (not collect(configuration, default_parameters, tests)):
                writeLog("Exit requested. Finishing tests...")
                return
            saveConfigs(tests)
            writeLog("Generated %d test configurations." % (len(tests)))
            # NOTE(review): skip is applied only to freshly generated
            # queues here (a resumed queue was saved post-skip) — confirm.
            if opts.skip_tests > 0:
                tests = tests[opts.skip_tests:]
                saveConfigs(tests)
                writeLog(
                    "Skipping %d first test configurations leaving total of %d test remaining."
                    % (opts.skip_tests, len(tests)))
        testPaused(parameters)
        nTests = len(tests)
        i = 1
        while (len(tests) > 0):
            writeLog("Starting test %d of %d." % (i, nTests))
            try:
                run(tests[0])
            except:
                # Broad catch: one failing test must not kill the queue;
                # the traceback is logged instead.
                writeLog("Exception: " + traceback.format_exc())
            tests = tests[1:]
            saveConfigs(tests)
            testPaused(parameters)
            i = i + 1
        os.remove(started_tests_filename)
    if ("test_config_filename" in default_parameters) and (
            default_parameters["test_config_filename"] != None):
        # Wait for the next configuration file and recurse into a new round.
        parameters = dict(default_parameters)
        configuration = waitForConfiguration(
            parameters["test_config_filename"], parameters)
        main(configuration, parameters)
    writeLog("Tests finished.")
def test(self, eventlog, tracePercentage=1.0, trainResult=None,
         maxNumTraces=None):
    """Test all sub-models on their share of the split event log.

    Splits the log (test-only), runs each sub-model's test, writes one
    positional result row via writeTestResultRow, and returns a dict
    mapping trace id to {"outcome", "p", "expected"}.

    NOTE(review): train_input_vector_size and train_best_iteration are
    assigned only when trainResult is given but referenced unconditionally
    below — a NameError when trainResult is None. Confirm callers always
    pass a train result.
    """
    self.eventlogs = self.splitLog(eventlog, True)
    writeLog("Trace distribution by models:")
    for i, eventlog in enumerate(self.eventlogs):
        writeLog(
            "Model #%d: Train: %d cases, Test: %d cases" %
            (i + 1, len(eventlog.trainingData), len(eventlog.testData)))
    traces = []
    predictions = []
    probs = []
    numSuccess = 0
    t0 = time()
    for i, model in enumerate(self.models):
        writeLog("Testing model %d of %d" % (i + 1, len(self.eventlogs)))
        t, pred, prob, ns = model.test(self.eventlogs[i], tracePercentage,
                                       maxNumTraces)
        traces += t
        predictions += pred
        probs += prob
        numSuccess += ns
    tutest = (time() - t0)
    sr_test = numSuccess / len(predictions)
    writeLog("Success rate for test data: %d/%d (=%f%%)" %
             (numSuccess, len(predictions), 100 * sr_test))
    # Defaults used in the result row when no training statistics exist.
    train_success_rate = ""
    train_time_used = ""
    train_init_time_used = ""
    train_layer_init_time_used = ""
    num_epochs = ""
    test_iterations = ""
    train_dataset_size = 0
    if trainResult != None:
        train_success_rate = trainResult["success_rate"]
        train_time_used = trainResult["train_time_used"]
        train_init_time_used = trainResult["train_init_time_used"]
        train_layer_init_time_used = trainResult["layer_init_time_used"]
        train_dataset_size = trainResult["train_dataset_size"]
        num_epochs = trainResult["num_epochs"]
        test_iterations = trainResult["test_iterations"]
        train_input_vector_size = trainResult["input_vector_size"]
        train_best_iteration = trainResult["best_iteration"]
    # Positional result row — the column order is fixed by the report
    # format; do not reorder.
    writeTestResultRow([
        datetime.now().replace(microsecond=0).isoformat(), "ok-test",
        self.parameters["test_name"], self.case_name,
        self.parameters["dataset_name"] if
        (("dataset_name" in self.parameters) and
         (self.parameters["dataset_name"] != None)) else
        self.eventlog.filename,
        self.parameters["cross-validation-run"] if
        (("cross-validation-run" in self.parameters) and
         (self.parameters["cross-validation-run"] != None)) else "",
        train_dataset_size, len(traces), len(traces), self.algorithm,
        self.num_layers, self.hidden_dim_size, self.optimizer,
        self.learning_rate, "", train_input_vector_size, self.batch_size,
        self.grad_clipping, self.num_iterations_between_reports,
        train_best_iteration, test_iterations, "", num_epochs,
        train_init_time_used, train_layer_init_time_used, train_time_used,
        train_time_used, train_time_used, tutest, tutest,
        train_success_rate, sr_test, "", "", "", "", "", "", "", "", "",
        "", self.predict_only_outcome, self.final_trace_only,
        self.trace_length_modifier,
        self.num_iterations_between_reports * self.num_callbacks ==
        100000 * 50, self.max_num_words, self.truncate_unknowns,
        not self.parameters["disable_activity_labels"],
        not self.parameters["disable_durations"],
        not self.parameters["disable_event_attributes"],
        not self.parameters["disable_case_attributes"],
        not self.parameters["disable_raw_event_attributes"],
        not self.parameters["disable_raw_case_attributes"],
        self.parameters["predict_next_activity"],
        self.parameters["use_single_event_clustering"],
        self.parameters["duration_split_method"],
        self.parameters["case_clustering_method"],
        self.parameters["event_clustering_method"],
        self.parameters["case_clustering_include_activity_occurrences"],
        self.parameters["case_clustering_include_case_attributes"],
        self.
        parameters["include_activity_occurrences_as_raw_case_attributes"],
        self.parameters["use_single_value_for_duration"],
        self.parameters["max_num_case_clusters"],
        self.parameters["max_num_event_clusters"],
        self.parameters["ignore_values_threshold_for_case_attributes"],
        self.parameters["ignore_values_threshold_for_event_attributes"]
    ])
    writeLog("Collecting results...")
    result = {}
    for i, trace in enumerate(traces):
        pred = predictions[i]
        result[trace.traceId] = {
            # Strip the outcome-selection prefix when present.
            "outcome":
            pred[len(OUTCOME_SELECTION_TOKEN_PREFIX):]
            if pred.startswith(OUTCOME_SELECTION_TOKEN_PREFIX) else pred,
            "p": probs[i],
            "expected": trace.outcome if trace.outcome != None else ""
        }
    return result
def __init__(self, rng):
    """Create an empty model cluster, seeding lasagne's RNG with rng."""
    lasagne.random.set_rng(rng)
    writeLog("Creating new model cluster object")
def train(self, df, parameters):
    """Vectorize df and cluster it with the configured algorithm.

    Degenerate inputs (fewer than two rows, or no columns) are not
    clustered; a constant label 0 is returned instead. The widest label
    set seen so far is remembered in self.labels. Returns
    (model, vectorizer, labels).
    """
    writeLog("Number of colums to cluster %d" % (len(df.columns)))
    t0 = time()
    vectorizer = DictVectorizer(sparse=False)
    writeLog("Vectorizing data frame of shape: %s" % (str(df.shape)))
    X = vectorizer.fit_transform(df.to_dict(orient='records'))
    writeLog("Data vectorization done in %fs" % (time() - t0))
    writeLog("n_samples: %d, n_features: %d" % X.shape)
    t0 = time()
    alg = algorithms[self.algorithm]
    nRows, nCols = df.shape
    if nRows < 2:
        writeLog("One row or less to cluster. Returning constant labels.")
        model, labels, allLabels = None, nRows * [0], [0]
    elif nCols == 0:
        writeLog(
            "No columns in the table to be clustered. Returning constant labels."
        )
        model, labels, allLabels = None, nRows * [0], [0]
    else:
        # Delegate the actual clustering to the configured algorithm.
        model, labels, allLabels = alg["train"](X, parameters)
    if len(allLabels) > len(self.labels):
        self.labels = allLabels
    writeLog("Clustering using %s done in %fs" %
             (self.algorithm, time() - t0))
    return model, vectorizer, labels
def trainForEventClustering(self, eventlog, cases):
    """Cluster the events of the given cases and append the resulting
    cluster label to every event row.

    Depending on "use_single_event_clustering", either one clustering is
    trained over all events (stored under the key "primary") or one
    clustering per activity (keyed by activity id). When event attributes
    are disabled, only the unusual-value filters (known_values) are
    computed and no models are trained.
    """
    if self.parameters["disable_event_attributes"] and self.parameters[
            "disable_raw_event_attributes"]:
        # Nothing to cluster: give every event the constant label 0.
        writeLog("Event clustering not needed. Skipping it.")
        for c in cases:
            for e in c["t"]:
                e.append(0)
        return
    writeLog("Clustering events in %d cases" % (len(cases)))
    t0 = time()
    if (self.parameters["use_single_event_clustering"]):
        # One shared clustering over the events of all activities.
        events = []
        for c in cases:
            for e in c["t"]:
                # Event attribute values start at index 2; None -> "".
                events.append(["" if i == None else i for i in e[2:]])
        df = pd.DataFrame(events,
                          columns=eventlog.data["attributes"]["event"])
        known_values = self.filterUnusualValues(df, self.parameters)
        if not self.parameters["disable_event_attributes"]:
            model, vectorizer, labels = self.train(df, self.parameters)
            # labels are in the same order the events were collected above.
            i = 0
            for c in cases:
                for e in c["t"]:
                    e.append(labels[i])
                    i += 1
            self.vectorizer = {"primary": vectorizer}
            self.model = {"primary": model}
        else:
            # NOTE(review): only locals are reset here; self.model /
            # self.vectorizer are not assigned in this branch — confirm
            # downstream code tolerates that.
            model = None
            vectorizer = None
        self.known_values = {"primary": known_values}
    else:
        # One clustering per activity.
        self.model = {}
        self.vectorizer = {}
        self.known_values = {}
        eventAttributes = eventlog.data["attributes"]["event"]
        activities = eventlog.getActivityOccurrences(cases)
        for activityId, activity in activities.items():
            t0 = time()
            writeLog("Clustering %d events for activity: %s (id: %s)" %
                     (len(activity["occ"]), activity["name"], activityId))
            events = [None] * len(activity["occ"])
            # Event attribute values occupy indices 2..maxLen-1.
            maxLen = len(eventAttributes) + 2
            for i, e in enumerate(activity["occ"]):
                events[i] = e[2:maxLen]
            if (len(events) < 1):
                # No events for this activity: constant label 0.
                i = 0
                for e in activity["occ"]:
                    e.append(0)
                    i += 1
                continue
            df = pd.DataFrame(events,
                              columns=eventlog.data["attributes"]["event"])
            self.known_values[activityId] = self.filterUnusualValues(
                df, self.parameters)
            if not self.parameters["disable_event_attributes"]:
                self.model[activityId], self.vectorizer[
                    activityId], labels = self.train(df, self.parameters)
            i = 0
            # NOTE(review): this repeats the disable_event_attributes check
            # made just above — redundant but harmless.
            if not self.parameters["disable_event_attributes"]:
                for e in activity["occ"]:
                    e.append(labels[i])
                    i += 1
            else:
                self.model[activityId] = None
                self.vectorizer[activityId] = None
    # NOTE(review): t0 is reset per activity in the loop above, so this
    # duration covers only the last activity in the per-activity mode.
    writeLog("Event clustering done in %0.3fs" % (time() - t0))
def convertTracesFromInputData(self, data, parameters,
                               trace_length_modifier):
    """Convert raw cases into TraceData objects (event "sentences").

    Each case may be expanded into multiple prefix traces
    (split_traces_to_prefixes) and is truncated to max_trace_length. Every
    event becomes a word of the form duration|activity|cluster; the
    duration part is bucketed against the flow statistics collected in
    preProcessForTraining. A finisher variant ("_f") is appended for the
    last, untruncated trace of a case.
    """
    writeLog("Converting %d cases into event traces." % (len(data)))
    enableDurations = not parameters["disable_durations"]
    splitDurationsInto5Buckets = parameters[
        "duration_split_method"] == "5-buckets"
    addOnlyFullTraceForFinisher = not parameters["predict_next_activity"]
    useSingleValueForDuration = parameters["use_single_value_for_duration"]
    includeActivityOccurrencesAsRawCaseAttributes = parameters[
        "include_activity_occurrences_as_raw_case_attributes"]
    disableEventAttributes = parameters["disable_event_attributes"]
    splitTracesToPrefixes = parameters["split_traces_to_prefixes"]
    minPrefixLength = parameters["min_splitted_trace_prefix_length"]
    maxTraceLength = parameters["max_trace_length"]
    result = []
    numFilteredCases = 0
    numFilteredTraces = 0
    for c in data:
        traces = []
        l = len(c["t"])
        finisherTraceFiltered = False
        # Cases shorter than the minimum prefix length yield no traces.
        if l > minPrefixLength:
            if splitTracesToPrefixes:
                if (l > maxTraceLength):
                    numFilteredCases += 1
                    numFilteredTraces += l - maxTraceLength - minPrefixLength
                    l = maxTraceLength
                    finisherTraceFiltered = True
                # One trace per prefix length.
                for i in range(minPrefixLength, l):
                    traces.append(c["t"][:i])
            else:
                if (l > maxTraceLength):
                    numFilteredCases += 1
                    numFilteredTraces += 1
                    finisherTraceFiltered = True
                    traces.append(c["t"][:maxTraceLength])
                else:
                    traces.append(c["t"])
        if len(traces) == 0:
            continue
        lastTrace = traces[len(traces) - 1]
        for trace in traces:
            sentence = []
            durations = []
            cAttributes = (
                c["a"] + c["occ"]
            ) if includeActivityOccurrencesAsRawCaseAttributes else c["a"]
            prev = None
            prevDate = None
            eAttributes = []
            for e in trace:
                # e[0] = activity id, e[1] = timestamp, e[-1] = cluster id.
                eDate = parse_date(e[1])
                durationPart = DURATION_TOKEN_PREFIX + "normal"
                dp = 0.5
                if enableDurations and prev is not None:
                    key = "%s->%s" % (prev[0], e[0])
                    flow = self.flows[key] if key in self.flows else None
                    delta = eDate - prevDate
                    if (flow != None) and ("slow" in flow):
                        if splitDurationsInto5Buckets:
                            # Bucket by percentile; dp maps the bucket to
                            # a numeric value in [0, 1] (fast = 1.0).
                            if (delta > flow["perc90"]):
                                durationPart = DURATION_TOKEN_PREFIX + "perc90"
                                dp = 0.0
                            elif (delta > flow["perc75"]):
                                durationPart = DURATION_TOKEN_PREFIX + "perc75"
                                dp = 0.25
                            elif (delta > flow["perc25"]):
                                durationPart = DURATION_TOKEN_PREFIX + "perc25"
                                dp = 0.5
                            elif (delta > flow["perc10"]):
                                durationPart = DURATION_TOKEN_PREFIX + "perc10"
                                dp = 0.75
                            else:
                                durationPart = DURATION_TOKEN_PREFIX + "perc0"
                                dp = 1.0
                        else:
                            # Three buckets: slow / normal / fast.
                            if (delta > flow["slow"]):
                                durationPart = DURATION_TOKEN_PREFIX + "slow"
                                dp = 0.0
                            elif (delta < flow["fast"]):
                                durationPart = DURATION_TOKEN_PREFIX + "fast"
                                dp = 1.0
                actPart = self.activities[e[0]]["name"]
                eAttributes += [
                    e[2:(len(e) - 1) if disableEventAttributes else -1]
                ]
                clusterPart = EVENT_ATTRIBUTE_TOKEN_PREFIX + str(
                    e[len(e) - 1])
                sentence.append(durationPart + WORD_PART_SEPARATOR +
                                actPart.replace(WORD_PART_SEPARATOR, "_") +
                                WORD_PART_SEPARATOR + clusterPart)
                if useSingleValueForDuration:
                    durations.append(dp)
                prevDate = eDate
                prev = e
            # A trace "finishes" its case when it is the full, untruncated
            # last trace (or the case carries an explicit "f" flag).
            finisher = c["f"] if "f" in c else (
                (trace == lastTrace) and (not finisherTraceFiltered))
            cluster = c["_cluster"] if ("_cluster" in c) else None
            if (not (addOnlyFullTraceForFinisher and finisher)):
                result.append(
                    TraceData(c["n"], c["s"] if "s" in c else None,
                              "s" in c, cAttributes, eAttributes, cluster,
                              sentence, durations, parameters,
                              trace_length_modifier, self.model, False))
            if (finisher):
                result.append(
                    TraceData(c["n"] + "_f", c["s"] if "s" in c else None,
                              "s" in c, cAttributes, eAttributes, cluster,
                              sentence, durations, parameters,
                              trace_length_modifier, self.model, True))
    writeLog("Generated %d event traces out of %d cases." %
             (len(result), len(data)))
    if (numFilteredTraces > 0):
        writeLog(
            "Filtered %d traces in %d cases due to them having more than maximum allowed number of events (%d)"
            % (numFilteredTraces, numFilteredCases, maxTraceLength))
    return result
def train_skmeans(df, parameters):
    """Cluster the rows of df with MiniBatchKMeans, picking k by silhouette score.

    Tries every k in [2, min(max_num_clusters, len(df))], scores each fit with
    the silhouette coefficient on a precomputed euclidean distance matrix, and
    keeps the best-scoring model. When df exceeds the configured training-sample
    cap, fitting happens on a random subsample and labels for the full df are
    obtained via predict().

    Parameters
    ----------
    df : 2-D array-like of feature vectors (one row per sample).
    parameters : dict with "max_num_clusters" and
        "max_num_samples_training_cluster".

    Returns
    -------
    (model, labels, cluster_ids) where labels assigns every row of df to a
    cluster and cluster_ids is list(range(model.n_clusters)).
    """
    # (An elbow-method k selection was considered previously; silhouette scoring
    # is used instead, cf.
    # https://datascience.stackexchange.com/questions/34187/kmeans-using-silhouette-score)
    max_num_clusters = parameters["max_num_clusters"]
    Ks = range(2, min(max_num_clusters, len(df)) + 1)
    kms = [
        MiniBatchKMeans(n_clusters=i,
                        init='k-means++',
                        n_init=1,
                        init_size=1000,
                        batch_size=1000,
                        verbose=False) for i in Ks
    ]
    writeLog("Performing K-means for cluster sizes 2 - %d" %
             (min(max_num_clusters, len(df))))
    sil_coeff = []
    all_labels = []
    distance_matrix = None
    max_num_samples_training_cluster = parameters[
        "max_num_samples_training_cluster"]
    if len(df) > max_num_samples_training_cluster:
        writeLog(
            "The number of samples to be clustered (%d) exceeds the configured maximum of %d. Taking random sample of the configured maximum size."
            % (len(df), max_num_samples_training_cluster))
        traindf = df[np.random.choice(
            df.shape[0], max_num_samples_training_cluster, replace=False), :]
    else:
        traindf = df
    for i, km in enumerate(kms):
        x = km.fit(traindf)
        if (i == 0):
            # The distance matrix depends only on the data, not on k — compute once.
            distance_matrix = pairwise_distances(traindf, metric="euclidean")
        score = 0.0
        try:
            score = silhouette_score(distance_matrix,
                                     x.labels_,
                                     metric='precomputed')
            writeLog("sihouette_score for cluster size %d = %f" %
                     (km.n_clusters, score))
        except Exception:
            # silhouette_score raises e.g. when all samples land in one cluster;
            # keep the default score. (Was a bare except:, which would also have
            # swallowed KeyboardInterrupt/SystemExit.)
            writeLog(
                "Unable to calculate sihouette_score for cluster size %d. Using %f."
                % (km.n_clusters, score))
        if len(traindf) < len(df):
            # Fitted on a subsample: label the full data set via predict().
            labels = km.predict(df)
        else:
            labels = x.labels_
        sil_coeff.append(score)
        all_labels.append(labels)
        if score >= 1.0:
            writeLog(
                "Maximum silhouette score reached. No need to consider any more clusters."
            )
            break
    # Pick the k with the highest silhouette coefficient.
    max_index = np.asarray(sil_coeff).argmax(axis=0)
    model = kms[max_index]
    labels = all_labels[max_index]
    writeLog("Optimum number of clusters: " + str(model.n_clusters))
    return model, labels, list(range(model.n_clusters))
def run(parameters):
    """Entry point: train and/or test a ModelCluster according to parameters.

    Three modes, chosen from the parameter dict:
    1. "model_filename" set  -> load a saved model and predict on the test data
       (from "test_filename" and/or standard input); results are written to a
       JSON file and printed.
    2. otherwise, "input_filename" or stdin data present -> train a new model
       (optionally cross-validated), save it, and if "test_filename" is set,
       reload and evaluate it.

    Uses PEP 8 identity comparisons (is/is not None) throughout.
    """
    rng = np.random.RandomState(random_seed)
    writeLog("Running test using parameters: " + json.dumps(parameters))
    inputJson = None
    if (opts.input_data_from_standard_input):
        writeLog("Reading from standard input")
        inputJson = sys.stdin.readline()
        writeLog("Standard input reading finished")
        if (parameters["write_input_to_file"]):
            # Persist the received payload for later debugging/replay.
            filename = get_filename(
                "testdata_", "%s_%s_%s" % (parameters["file_handle"], "", ""),
                "json")
            with open(filename, "w") as f:
                f.write(inputJson)
    if (parameters["model_filename"] is not None):
        # Prediction-only mode with a previously saved model.
        m = ModelCluster(rng)
        m.load(parameters["model_filename"], parameters)
        inputFilename = None if parameters[
            "test_filename"] is None else parameters["test_filename"]
        if (inputFilename is not None):
            writeLog("Reading test data from file: " + inputFilename)
        el = EventLog(parameters,
                      rng,
                      inputFilename,
                      modelCluster=m,
                      inputJson=inputJson)
        jsonResult = "{}"
        if (len(el.testData) > 0):
            writeLog("Test set contains %d cases." % (len(el.testData)))
            result = m.test(el)
            jsonResult = json.dumps(result)
            filename = get_filename(
                "predict_result", "%s_%s_%s" %
                (parameters["file_handle"], m.case_name, m.eventlog.filename),
                "json")
            with open(filename, "w") as f:
                f.write(jsonResult)
            writeLog("Generated results saved into file: %s" % filename)
        else:
            writeLog("Test set is empty. No results created.")
        print(jsonResult)
    elif ((parameters["input_filename"] is not None) or (inputJson is not None)):
        # Training mode.
        if parameters["cross-validation-splits"] is not None:
            EventLog.performCrossValidatedTests(parameters, inputJson, rng)
            return
        e = EventLog(parameters,
                     rng,
                     parameters["input_filename"],
                     parameters["test_data_percentage"],
                     inputJson=inputJson)
        m = ModelCluster(rng)
        m.initialize(
            parameters=parameters,
            case_clustering=Clustering(
                parameters["case_clustering_method"], parameters, {
                    "num_clusters": parameters["num_case_clusters"],
                    "max_num_clusters": parameters["max_num_case_clusters"],
                    "ignore_values_threshold":
                    parameters["ignore_values_threshold_for_case_attributes"]
                }),
            event_clustering=Clustering(
                parameters["event_clustering_method"], parameters, {
                    "num_clusters": parameters["num_event_clusters"],
                    "max_num_clusters": parameters["max_num_event_clusters"],
                    "ignore_values_threshold":
                    parameters["ignore_values_threshold_for_event_attributes"]
                }),
            rng=rng)
        trainResult = m.train(e)
        filename = m.save(parameters["file_handle"], parameters)
        writeLog("Generated model saved into file: %s" % filename)
        print(filename)
        if (parameters["test_filename"] is not None):
            # Round-trip check: reload the freshly saved model and evaluate it.
            m = ModelCluster(rng)
            m.load(filename, parameters)
            el = EventLog(parameters,
                          rng,
                          parameters["test_filename"],
                          modelCluster=m)
            result = m.test(el, 1.0, trainResult)
            jsonResult = json.dumps(result)
            filename = get_filename(
                "predict_result", "%s_%s_%s" %
                (parameters["file_handle"], m.case_name, m.eventlog.filename),
                "json")
            with open(filename, "w") as f:
                f.write(jsonResult)
            writeLog("Generated results saved into file: %s" % filename)
            print(jsonResult)
def __init__(self, num_layers, algorithm, num_units, hidden_dim_size,
             grad_clipping, optimizer, learning_rate):
    """Build a stacked recurrent network (GRU or LSTM) with Theano/Lasagne.

    Parameters
    ----------
    num_layers : int - number of stacked recurrent layers.
    algorithm : str - "gru" for GRULayer, anything else uses LSTMLayer.
    num_units : int - input feature size and softmax output size (vocabulary).
    hidden_dim_size : int - hidden units per recurrent layer.
    grad_clipping : float - gradient clipping threshold (exploding gradients).
    optimizer : str - one of sgd/adagrad/adadelta/momentum/nesterov_momentum/
        rmsprop; any other value falls back to adam.
    learning_rate : float - optimizer learning rate.

    Compiles self.train (one training step, returns the cross-entropy cost)
    and self.propabilities (softmax output probabilities for a batch).
    """
    self.traces_train = []
    self.traces_test = []
    self.num_layers = num_layers
    self.algorithm = algorithm
    self.num_units = num_units
    self.hidden_dim_size = hidden_dim_size
    self.grad_clipping = grad_clipping
    self.optimizer = optimizer
    self.learning_rate = learning_rate
    writeLog("Preparing " + str(self.num_layers) +
             " layers for algorithm: " + self.algorithm)
    # Recurrent layers expect input of shape (batch size, SEQ_LENGTH, num_features).
    # The mask layer marks valid timesteps in variable-length sequences.
    mask_var = T.matrix('mask')
    l_in = lasagne.layers.InputLayer(shape=(None, None, num_units))
    l_mask = lasagne.layers.InputLayer((None, None), mask_var)
    self.l_layers = [l_in]
    # Only the first recurrent layer gets the mask; only the last returns just
    # its final state (which feeds the dense softmax layer).
    if (self.algorithm == "gru"):
        layerCreatorFunc = lambda parentLayer, isFirstLayer, isLastLayer: lasagne.layers.GRULayer(
            parentLayer,
            self.hidden_dim_size,
            grad_clipping=self.grad_clipping,
            mask_input=l_mask if isFirstLayer else None,
            only_return_final=isLastLayer)
    else:
        layerCreatorFunc = lambda parentLayer, isFirstLayer, isLastLayer: lasagne.layers.LSTMLayer(
            parentLayer,
            self.hidden_dim_size,
            grad_clipping=self.grad_clipping,
            mask_input=l_mask if isFirstLayer else None,
            # By convention, the cell nonlinearity is tanh in an LSTM.
            nonlinearity=lasagne.nonlinearities.tanh,
            only_return_final=isLastLayer)
    for layerId in range(self.num_layers):
        self.l_layers.append(
            layerCreatorFunc(self.l_layers[layerId], layerId == 0,
                             layerId == self.num_layers - 1))
    # Final hidden state -> softmax over num_units classes.
    self.l_out = lasagne.layers.DenseLayer(
        self.l_layers[len(self.l_layers) - 1],
        num_units=num_units,
        W=lasagne.init.Normal(),
        nonlinearity=lasagne.nonlinearities.softmax)
    self.l_layers.append(self.l_out)
    # Integer class targets; loss is mean categorical cross-entropy.
    target_values = T.ivector('target_output')
    network_output = lasagne.layers.get_output(self.l_out)
    cost = T.nnet.categorical_crossentropy(network_output,
                                           target_values).mean()
    all_params = lasagne.layers.get_all_params(self.l_out, trainable=True)
    writeLog("Computing updates...")
    writeLog("Using optimizer: " + self.optimizer)
    if (self.optimizer == "sgd"):
        updates = lasagne.updates.sgd(cost, all_params, self.learning_rate)
    elif (self.optimizer == "adagrad"):
        updates = lasagne.updates.adagrad(cost, all_params,
                                          self.learning_rate)
    elif (self.optimizer == "adadelta"):
        # BUGFIX: this branch previously called lasagne.updates.adagrad
        # (with 0.95 landing in adagrad's epsilon parameter); it now calls
        # adadelta with rho=0.95 as intended.
        updates = lasagne.updates.adadelta(cost, all_params,
                                           self.learning_rate, 0.95)
    elif (self.optimizer == "momentum"):
        updates = lasagne.updates.momentum(cost, all_params,
                                           self.learning_rate, 0.9)
    elif (self.optimizer == "nesterov_momentum"):
        updates = lasagne.updates.nesterov_momentum(
            cost, all_params, self.learning_rate, 0.9)
    elif (self.optimizer == "rmsprop"):
        updates = lasagne.updates.rmsprop(cost, all_params,
                                          self.learning_rate, 0.9)
    else:
        updates = lasagne.updates.adam(cost,
                                       all_params,
                                       self.learning_rate,
                                       beta1=0.9,
                                       beta2=0.999)
    # Theano functions for training and computing cost.
    writeLog("Compiling train function...")
    self.train = theano.function(
        [l_in.input_var, target_values, l_mask.input_var],
        cost,
        updates=updates,
        allow_input_downcast=True)
    writeLog("Compiling train cost computing function...")
    # To generate predictions we need the probability distribution of the next
    # token given the network state and an input seed.
    writeLog("Compiling propabilities computing function...")
    self.propabilities = theano.function([l_in.input_var, l_mask.input_var],
                                         network_output,
                                         allow_input_downcast=True)
def initializationReport(self):
    """Write a one-time summary of the freshly initialized event log:
    case counts (total / train / test), activity count, attribute counts,
    and — when a split ratio was given — the training set percentage."""
    cases = self.data["cases"]
    activities = self.data["activities"]
    attributes = self.data["attributes"]
    writeLog("Initialized event log %s" % (self.filename))
    writeLog(" # cases: %d (train: %d, test: %d)" %
             (len(cases), len(self.trainingData), len(self.testData)))
    writeLog(" # activities: %d" % (len(activities)))
    writeLog(" # case attributes: %d" % (len(attributes["case"])))
    writeLog(" # event attributes: %d" % (len(attributes["event"])))
    if self.pTraining is not None:
        writeLog(" Training set percentage: %d" % (int(self.pTraining * 100)))