def user_requested_anomaly7():
    """
    Tells whether an anomaly is currently requested, and consumes one request.

    The summon file is expected to hold an integer counter on its first line.
    When that counter is positive, it is decremented by one in the file and
    True is returned. In every other case (missing/empty file, unparsable or
    non-positive counter) the file is left untouched and False is returned.
    """
    if not is_nonzero_file7(summon_filename):
        return False

    with open(get_full_path(summon_filename)) as f:
        lines = f.readlines()
    if not lines:
        return False

    try:
        remaining = int(lines[0])
    except Exception as e:
        append_logs("ERROR:" + str(e), name4logs, "always")
        return False

    if remaining <= 0:
        return False

    # Disable summoning of anomalies after the requested number of anomalies were added
    with open(get_full_path(summon_filename), "w") as f:
        f.write(str(remaining - 1))
    return True
def ask_model(lmodel, observations_df, scaling):
    """
    Scores the most recent observation with the given model.

    Args:
        lmodel: an object exposing execute(datapoint)
        observations_df: a dataframe; only its last row is scored
        scaling: currently unused (see the TODO below)

    Returns:
        whatever lmodel.execute returns for the last row, or 0 if anything fails
    """
    # TODO: use scaling for KitNET too
    datapoint = None  # kept outside the try so the error log can show it
    try:
        datapoint = observations_df.to_numpy()[-1]
        return lmodel.execute(datapoint)
    except Exception as e:
        append_logs(
            f"ERROR: KitNET ask_model failed. datapoint: {datapoint!s} . Exception: {e!s}",
            name4logs, "always")
        return 0
def launch_scripts():
    """
    Starts every python script that Thio needs, in a fixed order.

    To disable Telemanom and/or KitNET, comment out the corresponding entries
    in the table below and restart 0launcher.
    """
    # Start from an empty scripts_to_run file.
    with open("state_controls/scripts_to_run.txt", "w") as scripts_f:
        scripts_f.write("")

    # (script filename, second argument passed to run)
    scripts = (
        ('lib_telemanom_train.py', "thio_telemanom"),
        ('lib_telemanom_infer.py', "thio_telemanom"),
        ('lib_KitNET_train.py', "thio_kitnet"),
        ('lib_KitNET_infer.py', "thio_kitnet"),
        ('gui.py', "thio_kitnet"),
    )
    try:
        # A failure stops the launch sequence at that script; it is only logged.
        for script_name, label in scripts:
            run(script_name, label)
    except Exception as e:
        helper_funcs.append_logs(str(e), name4logs, "always")
def execute(self, x):
    """
    Scores one instance with the trained ensemble, without training on it.

    Args:
        x: an indexable instance (e.g. a numpy array); each ensemble member
           receives the sub-instance x[self.v[a]]

    Returns:
        the value returned by self.outputLayer.execute for the vector of
        per-autoencoder results

    Raises:
        RuntimeError: if no feature mapping (self.v) was learned or provided
    """
    if self.v is None:
        e_msg = 'cant execute x, because a feature mapping not learned or provided. Try running process(x) instead.'
        append_logs(e_msg, "KitNET lib", "always")
        # Raise with the message built above (previously an unrelated `msg` was used).
        raise RuntimeError(e_msg)

    self.n_executed += 1

    # Ensemble Layer: one result per autoencoder, each fed its own feature subset
    s_l1 = np.zeros(len(self.ensembleLayer))
    for a, autoencoder in enumerate(self.ensembleLayer):
        s_l1[a] = autoencoder.execute(x[self.v[a]])

    # OutputLayer
    return self.outputLayer.execute(s_l1)
def python_script_running7(script_filename):
    """
    Returns True if the script with the given filename is currently running, False otherwise.

    A process matches when script_filename occurs in the second element of its
    command line (the first argument after the executable).

    Args:
        script_filename (str): e.g. "lib_telemanom_train.py"
    """
    try:
        for p in psutil.process_iter():
            try:
                # Read the command line once: the process may exit between calls.
                cmdline = p.cmdline()
            except psutil.Error:
                # This process vanished or is not accessible to us. Skip it
                # instead of aborting the scan of the remaining processes.
                continue
            if len(cmdline) > 1 and script_filename in cmdline[1]:
                return True
    except Exception as e:
        append_logs("Exception: " + str(e), name4logs, "always")
    return False
def fetched_data_to_dataframe(filename, last_n_values=-1):
    """
    Loads the dataset file into a pandas dataframe with one column per channel.

    The file is parsed with ";" and "§" as separators into 3 * len(channels)
    columns, of which only every third one is kept.

    Args:
        filename (str): the dataset filename (e.g. "syntheticData.txt")
        last_n_values (int): number of the latest datapoints to read;
            -1 means the whole file is read

    Returns:
        the resulting dataframe; it is empty if reading failed
    """
    cols_number = 3 * len(channels)
    my_cols = [str(i) for i in range(cols_number)]  # create some row names
    # Positions of all columns except every third one.
    cols2delete = [c for c in range(cols_number) if (c + 1) % 3 != 0]

    df = pd.DataFrame()
    try:
        # TODO: use isNonZeroFile7 to check if non zero
        import io
        source = helper_funcs.get_full_path(filename)
        if last_n_values != -1:
            tail = helper_funcs.read_last_lines(source, last_n_values)
            source = io.StringIO('\n'.join(tail))
        df = pd.read_csv(source,
                         sep=";|§",
                         names=my_cols,
                         header=None,
                         engine="python")
    except Exception as e:
        helper_funcs.append_logs(
            "ERROR in fetchedData_to_DataFrame upon trying to open " + filename + " : " + str(e),
            "parser", "always", "print")

    if df.empty:
        return df

    df = df.drop(df.columns[cols2delete], axis=1)
    # the columns in the fetched file are sorted alphabetically.
    # We sort it here too - to make them be the same columns
    df.columns = sorted(channels)
    return df
def __init__(self,
             n,
             max_autoencoder_size=10,
             fm_grace_period=None,
             ad_grace_period=10000,
             learning_rate=0.1,
             hidden_ratio=0.75,
             feature_map=None):
    """
    Sets up the detector state.

    Args:
        n: stored as self.n and passed to CorClust (the number of features,
            judging by the log message in train)
        max_autoencoder_size: stored as self.m; values <= 0 are replaced by 1
        fm_grace_period: stored as self.FM_grace_period; when None, the value
            of ad_grace_period is used instead
        ad_grace_period: stored as self.AD_grace_period
        learning_rate: stored as self.lr
        hidden_ratio: stored as self.hr
        feature_map: stored as self.v; when given, the anomaly detector is
            created right away via __createAD__
    """
    # Parameters:
    self.AD_grace_period = ad_grace_period
    self.FM_grace_period = ad_grace_period if fm_grace_period is None else fm_grace_period
    self.m = 1 if max_autoencoder_size <= 0 else max_autoencoder_size
    self.lr = learning_rate
    self.hr = hidden_ratio
    self.n = n

    # Variables
    self.n_trained = 0  # the number of training instances so far
    self.n_executed = 0  # the number of executed instances so far
    self.v = feature_map

    # These must exist BEFORE __createAD__ is called: initialising them
    # afterwards (as was done previously) discards the detector that was just
    # built from a user-supplied feature map.
    self.ensembleLayer = []
    self.outputLayer = None
    # incremental feature clustering for the feature mapping process
    self.FM = CorClust(self.n)

    if self.v is None:
        append_logs(
            "Feature-Mapper: train-mode, Anomaly-Detector: off-mode",
            "KitNET lib", "verbose")
    else:
        self.__createAD__()
        append_logs(
            "Feature-Mapper: execute-mode, Anomaly-Detector: train-mode",
            "KitNET lib", "verbose")
def get_model(input_dataframe):
    """
    Trains a fresh KitNET instance on every row of the given dataframe.

    The first 10% of the rows (rounded down) are assigned to learning the
    feature mapping, the rest to training the anomaly detector.

    Returns:
        a tuple (model, None, True); model is None if the dataframe has no rows
    """
    input_arr = input_dataframe.to_numpy()
    dataset_size = len(input_dataframe.index)

    # KitNET params:
    max_ae = 10  # maximum size for any autoencoder in the ensemble layer
    # the number of instances taken to learn the feature mapping (the ensemble's architecture)
    fm_grace = int(dataset_size * 0.1)
    # the number of instances used to train the anomaly detector (ensemble itself)
    ad_grace = dataset_size - fm_grace

    append_logs(
        f"Dataset_size: {dataset_size!s} . FMgrace: {fm_grace!s} . ADgrace: {ad_grace!s}",
        name4logs, "verbose")
    append_logs(f"numpy.ndarray tail my input_arr:\n{input_arr[-3:]!s}",
                name4logs, "verbose")

    # Build KitNET
    kit_net_obj = KitNET(input_arr.shape[1], max_ae, fm_grace, ad_grace)

    model = None
    for row_idx, row in enumerate(input_arr):
        if row_idx % 1000 == 0:
            # save_model_to_pickle(model, -1, "pickled_models/kitnet_test_" + str(i) + ".pkl")
            append_logs(f"progress: {row_idx!s}", name4logs, "verbose")
        model = kit_net_obj.train(row)
    return model, None, True
def train(self, x):
    """
    Feeds one instance to the model and returns the model itself.

    While no feature mapping exists and the feature-mapping grace period is not
    over, the instance only updates the feature mapper; on the last instance of
    that period the mapping is clustered and the anomaly detector is created.
    Afterwards every instance trains the ensemble and the output layer.
    """
    fm_in_train_mode = self.n_trained <= self.FM_grace_period and self.v is None
    if fm_in_train_mode:
        # update the incremental correlation matrix
        self.FM.update(x)
        if self.n_trained == self.FM_grace_period:
            # the feature mapping should be instantiated now
            self.v = self.FM.cluster(self.m)
            self.__createAD__()
            t_msg = "The Feature-Mapper found a mapping: " + str(
                self.n) + " features to " + str(len(
                    self.v)) + " autoencoders."
            append_logs(t_msg, "KitNET lib", "verbose")
            t_msg = "Feature-Mapper: execute-mode, Anomaly-Detector: train-mode"
            append_logs(t_msg, "KitNET lib", "verbose")
    else:
        # Ensemble Layer: each autoencoder trains on its own sub-instance
        s_l1 = np.zeros(len(self.ensembleLayer))
        for a, autoencoder in enumerate(self.ensembleLayer):
            s_l1[a] = autoencoder.train(x[self.v[a]])
        # OutputLayer
        self.outputLayer.train(s_l1)
        if self.n_trained == self.AD_grace_period + self.FM_grace_period:
            t_msg = "Feature-Mapper: execute-mode, Anomaly-Detector: exeute-mode"
            append_logs(t_msg, "KitNET lib", "verbose")

    self.n_trained += 1
    return self
def fetch_and_save_datapoint(data_channels, use_synthetic_data7):
    """
    Fetches one datapoint, writes it to the latest_datapoint file and returns it.

    The returned string looks like this:
    1582830400.15; bitcoin; eur; 8080.99 § litecoin; eur; 58.08

    Args:
        data_channels (list of strings): names of channels
        use_synthetic_data7 (bool): True if synthetic data is used, False otherwise

    Returns:
        the datapoint string, or None if fetching or saving failed
    """
    try:
        price_dic, ts = recieve_datapoint(data_channels, use_synthetic_data7)
        data_point_str = price_dict_to_str(price_dic, "eur", ts)

        data_filename_for_saving = (
            "dataset/latest_datapoint_synthetic.txt"
            if use_synthetic_data7
            else "dataset/latest_datapoint_fetched.txt"
        )

        # TODO: move it to 0launcher
        list_to_file(data_filename_for_saving, [data_point_str], "w")
    except Exception as fetch_e:
        data_point_str = None
        append_logs("failed to get a datapoint: " + str(fetch_e),
                    "0launcher", "always", "print")
    return data_point_str
def data_sanity_check(use_synthetic_data7, data_channels):
    """
    Checks if the dataset contains corrupted data, and terminates the program if it does.

    The checks cover the case where the user has changed the number of channels,
    but forgot to delete the old data that still has the old number of channels.

    The dataset is parsed with ";" and "§" as separators into
    3 * len(data_channels) named columns. It is considered corrupted if parsing
    raises, if it yields no rows, or if the last index entry of the resulting
    dataframe contains "nan" (presumably the index holds the timestamps — TODO confirm).
    A missing or zero-size dataset file is only logged.

    Args:
        use_synthetic_data7 (bool): if True, the synthetic data is used
        data_channels (list of strings): channel names

    Raises:
        SystemExit: if the dataset is found to be corrupted
    """
    # Local import: sys.exit is used instead of the interactive-only exit() helper.
    import sys

    # TODO: check if there are at least 2 channels in configs, otherwise KitNET will not work

    if use_synthetic_data7:
        dataset_filename = "dataset/syntheticData.txt"
    else:
        dataset_filename = "dataset/fetchedData.txt"

    bad_shape_msg = dataset_filename + " seems to be in a bad shape, as reading it into a dataframe causes an error" \
                                       " or meaningless output. If you changed the number of channels, deleting " \
                                       "the data that has the previous number of channels could help "

    if not is_nonzero_file7(dataset_filename):
        append_logs(
            dataset_filename + " doesn't exist or of zero size. First launch?",
            name4logs, "always", "print")
        return

    # TODO: remove code duplication, as a similar code is used in fetched_data_to_dataframe
    cols_number = 3 * len(data_channels)
    my_cols = [str(i) for i in range(cols_number)]  # create some row names
    print("checking...", dataset_filename)

    try:
        df = pd.read_csv(get_full_path(dataset_filename),
                         sep=";|§",
                         names=my_cols,
                         header=None,
                         engine="python")
    except Exception as e:
        append_logs(bad_shape_msg + " " + str(e), name4logs, "always", "print")
        sys.exit()

    timestamps = pd.DataFrame(df.index).to_numpy()
    # A non-empty file that parses to zero rows is also meaningless output;
    # without this guard timestamps[-1] would raise an IndexError.
    if len(timestamps) == 0 or "nan" in str(timestamps[-1]):
        append_logs(bad_shape_msg, name4logs, "always", "print")
        sys.exit()
def get_price_from_substr(istr, original_str):
    """Extracts the float value (e.g. 7.5) and the channel_name from a string like this:
    "channel_name; unit; 7.5".

    The price is whatever follows the last "; " and the name is whatever
    precedes the first ";". A price of -1 signals a problem: no ";" in the
    input, a "None"/"invalid" marker instead of a number, or a value that
    can't be converted to float.

    Args:
        istr (str): a string like this: "bitcoin; eur; 7.5"
        original_str (str): at the very first stage of parsing, before this func is called,
            we receive a string like:
            1585474566.27; bitcoin; eur; 3.664121010741326 § ethereum; eur; 1.0710547987175814 ...
            We pass it here for debug purposes.

    Returns:
        a tuple (price, name_str)
    """
    semicolon_positions = find_all_positions_of_character(istr, ';')

    if len(semicolon_positions) == 0:
        msg = "get_price_from_substr: len(temp_list) is zero. Caused by this istr: " + istr
        helper_funcs.append_logs(msg, "parser", "always", "print")
        return -1, ""

    # everything after the last "; " (the +2 skips the separator itself)
    price_str = istr[semicolon_positions[-1] + 2:]

    if "None" in price_str or "invalid" in price_str:
        price = -1
        helper_funcs.append_logs(
            "get_price_from_substr: -None- or -invalid- in the input string. Could be just a missing data, "
            "or a sign of something bad. Input: " + str(istr) + " . Original str: " + original_str,
            "parser", "always")
    else:
        try:
            price = float(price_str)
        except Exception as e:
            price = -1  # if can't parse the price, return "-1"
            msg = "ERROR: get_price_from_substr: price = float(price_str) caused an arror: " + str(
                e) + " . Inputs: istr = " + str(istr)
            helper_funcs.append_logs(msg, "parser", "always", "print")

    # everything before the first ";"
    name_str = istr[:semicolon_positions[0]]
    return price, name_str
from dataset_preprocessing import cleanup_dataset, data_sanity_check

# how often should the data be fetched from the data provider, in seconds
time_between_fetches = 1.0
# number of the latest observations to save to a separate file
this_many_last_observations = 500

# Reset states
delete_logs()  # delete logs from the previous sessions

with open(get_full_path("state_controls/summonAnomaly.txt"), "w") as f:
    f.write("0")

with open(get_full_path("state_controls/exit7.txt"), "w") as f:
    f.write("0")

data_channels = read_configs()["data_channels"]
use_synthetic_data7 = synthetic_data7()

append_logs(f"use_synthetic_data7 : {use_synthetic_data7!s}", "0launcher",
            "always", "print")
append_logs(f"data_channels : {data_channels!s}", "0launcher", "always",
            "print")

# Prepare and validate the dataset before any worker script is started.
cleanup_dataset(use_synthetic_data7)
data_sanity_check(use_synthetic_data7, data_channels)

launch_scripts()

# Pick the file pair that matches the data source in use.
data_filename, last_n_filename = (
    ("dataset/syntheticData.txt", "dataset/lastNpoints_synthetic.txt")
    if use_synthetic_data7
    else ("dataset/fetchedData.txt", "dataset/lastNpoints_fetched.txt")
)
from helper_funcs import append_logs, infer_and_save_results, exit7, synthetic_data7
import lib_KitNET_calc

ask_model_func = lib_KitNET_calc.ask_model

useOnlyThisManyLatestOfLastN = 50000

filename2write = "risk_scores/kitnet_anomaly.txt"
name4logs = "lib_KitNET_infer"
method_name = "kitnet"

# The pickled model file differs between synthetic and fetched data.
output_postfix = "_synthetic" if synthetic_data7() else "_fetched"
modelpath = f"pickled_models/{method_name}{output_postfix}.pkl"

append_logs("Starting the main circle", name4logs, "always")

# State carried from one inference round to the next.
old_modification_ts = -1
old_meta_model_dic = None
scales_dic = None

while True:
    exit7()
    (old_modification_ts,
     old_meta_model_dic,
     scales_dic) = infer_and_save_results(ask_model_func,
                                          modelpath,
                                          old_modification_ts,
                                          old_meta_model_dic,
                                          useOnlyThisManyLatestOfLastN,
                                          method_name,
                                          scales_dic)
    time.sleep(1.0)
def data_send_loop(add_data_callback_func):
    """Regularly reads the data to plot, and emits it.

    Each round emits a list [prices, kitnet risk, telemanom risk]. A source
    whose reader returned None is not read again until the next reset, which
    happens every 60th round. If a round fails, [None, None, None] is emitted.

    Args:
        add_data_callback_func: the callable connected to the data signal
    """
    # Setup the signal-slot mechanism.
    source = Communicate()
    source.data_signal.connect(add_data_callback_func)

    data_filename = (
        "dataset/latest_datapoint_synthetic.txt"
        if use_synthetic_data7
        else "dataset/latest_datapoint_fetched.txt"
    )

    # TODO: generate this dic automatically
    paused = {
        "prices_dic": False,
        "kitnet_risk": False,
        "telemanom_risk": False
    }

    rounds_done = 0
    while True:
        # TODO: generate this list and the dicts automatically
        payload = [None, None, None]
        try:
            prices = {}
            kitnet = {}
            telemanom = {}

            # to prevent flooding the log with the entries about non-existing file during the first start
            if not paused["prices_dic"]:
                prices = read_prediction_for_aggregation(
                    data_filename, "realtime_graph")
                paused["prices_dic"] = prices is None

            if not paused["kitnet_risk"]:
                # TODO: calculate the number from configs
                kitnet = get_max_anomaly_from_latest(
                    "risk_scores/kitnet_anomaly" + anomaly_file_postfix + ".txt", 10)
                paused["kitnet_risk"] = kitnet is None

            if not paused["telemanom_risk"]:
                # TODO: calculate the number from configs
                telemanom = get_max_anomaly_from_latest(
                    "risk_scores/telemanom_anomaly" + anomaly_file_postfix + ".txt", 10)
                paused["telemanom_risk"] = telemanom is None

            payload = [prices, kitnet, telemanom]
            append_logs(str(prices), name4logs, "verbose")
        except Exception as e:
            append_logs(str(e), name4logs, "always")

        time.sleep(sleep_time)  # in seconds
        source.data_signal.emit(payload)  # <- Here you emit a signal!

        rounds_done += 1
        if rounds_done % 60 == 0:
            # Give every paused source another chance.
            for channel_key in paused:
                paused[channel_key] = False
"""
Keeps the Telemanom model up to date by repeatedly training it on the latest N datapoints.

Training lives in its own script so that it runs in parallel with
lib_telemanom_infer.py, i.e. training and inference work as separate processes.
"""

from helper_funcs import append_logs, train_and_save_model, exit7
import lib_telemanom_calc

name4logs = "lib_telemanom_train"
method_name = "telemanom"

# bigger number means better AI but more compute required
use_this_many_latest_dp = 30000

get_model_func = lib_telemanom_calc.get_model

# main circle
append_logs("starting the training circling", name4logs, "always", "print")
while True:
    exit7()
    train_and_save_model(use_this_many_latest_dp, get_model_func, method_name)
The corresponding license texts are at end of this file. """ import numpy as np import time from scipy.cluster.hierarchy import linkage, to_tree import pickle from helper_funcs import append_logs, get_full_path, synthetic_data7 use_synthetic_data7 = synthetic_data7() name4logs = "lib_KitNET_calc" msg = "Reading Sample dataset..." append_logs(msg, name4logs, "verbose") if use_synthetic_data7: filename = "dataset/syntheticData.txt" else: filename = "dataset/fetchedData.txt" np.seterr(all='ignore') def sigmoid(x): return 1. / (1 + np.exp(-x)) class DenoisingAutoencoderParams: """A data class for storing the Denoising Autoencoder params."""