def optimal_parameters(train_signal, train_labels, test_signal, test_labels,
                       window_size, paa_segments, vocab_size, gamma):
    S = np.zeros((len(window_size), len(paa_segments), len(vocab_size)))
    max_S = 0
    optimal_labels = np.zeros_like(test_labels)
    for w in range(len(window_size)):
        for p in range(len(paa_segments)):
            for v in range(len(vocab_size)):
                train_sax = sax_via_window(train_signal, window_size[w],
                                           paa_segments[p], vocab_size[v],
                                           nr_strategy='none',
                                           z_threshold=0.01)
                test_sax = sax_via_window(test_signal, window_size[w],
                                          paa_segments[p], vocab_size[v],
                                          nr_strategy='none')
                train_probs = extract_ngram_probs(train_sax)
                train_prof = extract_train_prof(train_probs, thres=1)
                alarm_regions = extract_alarm_regions(test_sax, train_prof)
                # print("Attacks detected: ", len(alarm_regions))
                if not alarm_regions:
                    continue
                predicted_labels = np.zeros_like(test_labels)
                predicted_labels[alarm_regions] = 1
                s = compute_s(test_labels, predicted_labels, gamma)
                if s > max_S:
                    max_S = s
                    optimal_labels = predicted_labels
                S[w, p, v] = s
    return S, optimal_labels
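The grid search above depends on project-specific helpers (`extract_ngram_probs`, `extract_train_prof`, `extract_alarm_regions`, `compute_s`) that are not shown, so a full run cannot be reproduced here. As a minimal invocation sketch, with synthetic signals and illustrative parameter grids (all of these values are assumptions, not from the source):

import numpy as np

rng = np.random.default_rng(0)
train_signal = rng.normal(size=1000)   # synthetic stand-in for a sensor signal
train_labels = np.zeros(1000, dtype=int)
test_signal = rng.normal(size=1000)
test_labels = np.zeros(1000, dtype=int)
test_labels[400:450] = 1               # pretend attack region

S, labels = optimal_parameters(train_signal, train_labels,
                               test_signal, test_labels,
                               window_size=[50, 100], paa_segments=[3, 5],
                               vocab_size=[3, 5], gamma=0.5)
# S holds one score per (window, paa, vocab) grid cell; pick the best:
best = np.unravel_index(np.argmax(S), S.shape)
print("best grid cell (window, paa, vocab):", best)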
def discretization(ano_set, train_set, anomaly_set, win_size, paa_size,
                   alphabet_size, signal):
    print('Computing discrete model for ', signal, ' .......')
    # Find discrete sequences in the training data
    sax1 = sax_via_window(series=np.array(train_set[signal]),
                          win_size=win_size,
                          paa_size=paa_size,
                          alphabet_size=alphabet_size,
                          nr_strategy='exact',
                          z_threshold=0.01)
    # Find discrete sequences in the anomalous data
    sax2 = sax_via_window(series=np.array(ano_set[signal]),
                          win_size=win_size,
                          paa_size=paa_size,
                          alphabet_size=alphabet_size,
                          nr_strategy='exact',
                          z_threshold=0.01)
    # List discrete sequences that are in the anomaly set,
    # but not in the training set
    attacklist = []
    for k, v in sax2.items():
        if k not in sax1:
            for e in v:
                attacklist.append(ano_set.index[e])
    # Sort the dates that are found and remove None values
    attacklist = [i for i in attacklist if i is not None]
    attacklist.sort()
    # Add the found anomalies to the output and return
    for i in attacklist:
        anomaly_set.at[i, 'anomaly'] = 1
    return anomaly_set
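`discretization` expects pandas DataFrames sharing a signal column, plus an output frame with an `anomaly` column indexed like the anomalous set. A minimal, hypothetical call might look like this (column names and data are illustrative only):

import numpy as np
import pandas as pd

idx = pd.date_range("2020-01-01", periods=200, freq="h")
train_set = pd.DataFrame({"flow": np.random.randn(200)}, index=idx)
ano_set = pd.DataFrame({"flow": np.random.randn(200)}, index=idx)
anomaly_set = pd.DataFrame({"anomaly": np.zeros(200, dtype=int)}, index=idx)

flagged = discretization(ano_set, train_set, anomaly_set,
                         win_size=24, paa_size=4, alphabet_size=4,
                         signal="flow")
print(flagged["anomaly"].sum(), "timestamps flagged")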
def test_series_to_wordbag():
    """Test TS to vector."""
    dat = np.array([
        0., 0., 0., 0., 0.,
        -0.270340178359072, -0.367828308500142, 0.666980581124872,
        1.87088147328446, 2.14548907684624, -0.480859313143032,
        -0.72911654245842, -0.490308602315934, -0.66152028906509,
        -0.221049033806403, 0.367003418871239, 0.631073992586373,
        0.0487728723414486, 0.762655178750436, 0.78574757843331,
        0.338239686422963, 0.784206454089066, -2.14265084073625,
        2.11325193044223, 0.186018356196443, 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.519132472499234, -2.604783141655,
        -0.244519550114012, -1.6570790528784, 3.34184602886343,
        2.10361226260999, 1.9796808733979, -0.822247322003058,
        1.06850578033292, -0.678811824405992, 0.804225748913681,
        0.57363964388698, 0.437113583759113, 0.437208643628268,
        0.989892093383503, 1.76545983424176, 0.119483882364649,
        -0.222311941138971, -0.74669456611669, -0.0663660879732063,
        0., 0., 0., 0., 0.
    ])
    sax_none = sax_via_window(dat, 6, 3, 3, "none", 0.01)
    wordbag = series_to_wordbag(dat, 6, 3, 3, "none", 0.01)
    wordbag2 = manyseries_to_wordbag(np.matrix([dat, dat]), 6, 3, 3,
                                     "none", 0.01)
    frequencies = {}
    for k, v in sax_none.items():
        frequencies[k] = len(v)
    for k, v in wordbag.items():
        assert v == frequencies[k]
    for k, v in wordbag2.items():
        assert v == frequencies[k] * 2
def test_via_window():
    """Test SAX via window."""
    dat = np.array([
        0., 0., 0., 0., 0.,
        -0.270340178359072, -0.367828308500142, 0.666980581124872,
        1.87088147328446, 2.14548907684624, -0.480859313143032,
        -0.72911654245842, -0.490308602315934, -0.66152028906509,
        -0.221049033806403, 0.367003418871239, 0.631073992586373,
        0.0487728723414486, 0.762655178750436, 0.78574757843331,
        0.338239686422963, 0.784206454089066, -2.14265084073625,
        2.11325193044223, 0.186018356196443, 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.519132472499234, -2.604783141655,
        -0.244519550114012, -1.6570790528784, 3.34184602886343,
        2.10361226260999, 1.9796808733979, -0.822247322003058,
        1.06850578033292, -0.678811824405992, 0.804225748913681,
        0.57363964388698, 0.437113583759113, 0.437208643628268,
        0.989892093383503, 1.76545983424176, 0.119483882364649,
        -0.222311941138971, -0.74669456611669, -0.0663660879732063,
        0., 0., 0., 0., 0.
    ])
    sax_none = sax_via_window(dat, 6, 3, 3, "none", 0.01)
    elements_num = 0
    for key in sax_none:
        elements_num += len(sax_none[key])
    assert len(dat) - 6 == elements_num
    cca = sax_none['cca']
    assert np.array_equal(np.array(cca), np.array([0, 1]))
    sax_exact = sax_via_window(dat, 6, 3, 3, "exact", 0.01)
    cca = sax_exact['cca']
    assert np.array_equal(np.array(cca), np.array([0]))
    sax_mindist = sax_via_window(dat, 6, 3, 3, "mindist", 0.01)
    cca = sax_mindist['cca']
    bbc = sax_mindist['bbc']
    assert np.array_equal(np.array(cca), np.array([0]))
    assert np.array_equal(np.array(bbc), np.array([2]))
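What these tests rely on: `sax_via_window` returns a dict-like mapping from each SAX word (e.g. 'cca') to the list of start positions of the sliding windows that produced it. A quick way to inspect that structure, using the same positional signature as the tests above (the random input is just a placeholder):

from saxpy.sax import sax_via_window
import numpy as np

dat = np.random.randn(60)
sax = sax_via_window(dat, 6, 3, 3, "none", 0.01)
for word, positions in sax.items():
    # each word maps to the window start indices where it occurred
    print(word, "->", positions)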
def find_discords_hotsax(series, win_size=100, num_discords=2, alphabet_size=3,
                         paa_size=3, znorm_threshold=0.01, sax_type='unidim'):
    """HOT-SAX-driven discords discovery."""
    discords = list()
    global_registry = set()

    # Z-normalized versions for every subsequence.
    znorms = np.array([
        znorm(series[pos:pos + win_size], znorm_threshold)
        for pos in range(len(series) - win_size + 1)
    ])

    # SAX words for every subsequence.
    sax_data = sax_via_window(series, win_size=win_size, paa_size=paa_size,
                              alphabet_size=alphabet_size, nr_strategy=None,
                              znorm_threshold=0.01, sax_type=sax_type)

    """[2.0] build the 'magic' array"""
    magic_array = list()
    for k, v in sax_data.items():
        magic_array.append((k, len(v)))

    """[2.1] sort it ascending by the number of occurrences"""
    magic_array = sorted(magic_array, key=lambda tup: tup[1])

    while len(discords) < num_discords:
        best_discord = find_best_discord_hotsax(series, win_size,
                                                global_registry, sax_data,
                                                magic_array, znorms)
        if -1 == best_discord[0]:
            break
        discords.append(best_discord)
        mark_start = max(0, best_discord[0] - win_size + 1)
        mark_end = best_discord[0] + win_size
        for i in range(mark_start, mark_end):
            global_registry.add(i)
    return discords
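Example use of the driver above, assuming the matching `find_best_discord_hotsax` from the same library version is importable: find the two largest discords in a sine wave with an injected anomaly. The `(position, distance)` tuple format follows the return value of `find_best_discord_hotsax`.

import numpy as np

series = np.sin(np.linspace(0, 20 * np.pi, 2000))
series[700:750] += 1.5  # inject an anomaly
discords = find_discords_hotsax(series, win_size=100, num_discords=2)
for pos, dist in discords:
    print("discord at", pos, "nearest-neighbor distance", dist)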
def series_to_wordbag(series, win_size, paa_size, alphabet_size=3,
                      nr_strategy='exact', z_threshold=0.01):
    """VSM implementation."""
    sax = sax_via_window(series, win_size, paa_size, alphabet_size,
                         nr_strategy, z_threshold)
    # convert the dict to a wordbag
    frequencies = {}
    for k, v in sax.items():
        frequencies[k] = len(v)
    return frequencies
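`series_to_wordbag` collapses the word-to-positions dict into word frequencies, the "bag of words" used by vector-space (tf-idf style) models over time series. Minimal usage, assuming saxpy's `sax_via_window` is in scope:

import numpy as np

bag = series_to_wordbag(np.random.randn(100), win_size=10, paa_size=3,
                        alphabet_size=3, nr_strategy='exact')
print(bag)  # e.g. {'abc': 4, 'cba': 7, ...}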
def sax_conversion(nx):
    from saxpy.sax import sax_via_window
    # Discretize each (sample, channel) series. The original indexed
    # nx[0][:][2] (i.e. nx[0][2]) regardless of the loop variables and
    # returned only the last result; both look like bugs, fixed here by
    # indexing with the loop variables and collecting all results.
    sax_words = []
    for i in range(nx.shape[0]):
        for j in range(nx.shape[2]):
            sax_words.append(sax_via_window(nx[i, :, j], 5, 5, 3, 'none', 0.01))
    return sax_words
x1 = data2.iloc[1:, 1].values.flatten()  # pick a random sample from class 0
x1 = x1.astype(float)  # np.float was removed from modern NumPy
plt.plot(x1)
plt.show()

window_size = 50
word_size = 3
alphabet_size = 3
nr_strategy = "exact"
z_threshold = 0.01

# sax_via_window(data, win_size (segmentation), paa_size (word_size),
#                alphabet_size, nr_strategy='exact', z_threshold=0.01)
sax_window = sax_via_window(x1, window_size, word_size, alphabet_size,
                            nr_strategy, z_threshold)

sax_keys = list(sax_window.keys())
sax_values = list(sax_window.values())
# enumerate replaces the original manual counter, which was never incremented
for i, n_val in enumerate(sax_values):
    print(x1[n_val])
    print(n_val)
    print(sax_keys[i])
    x2 = list()
    for n1_val in n_val:
        # print(x1[n1_val], ",", x1[n1_val + 1], ",", x1[n1_val + 2])
        alpha_count = 0
        while alpha_count < alphabet_size:
            alpha_count += 1  # the original snippet is truncated at this loop
def find_best_discord_hotsax(series, win_size, a_size, paa_size,
                             znorm_threshold, globalRegistry):  # noqa: C901
    """Find the best discord with hotsax."""
    """[1.0] get the sax data first
    Convert the time series into a SAX dictionary
    (key: word string, value: list of window start indices)."""
    sax_none = sax_via_window(series, win_size, a_size, paa_size, "none", 0.01)

    """[2.0] build the 'magic' array
    magic_array: a list of tuples (word string, number of window indices)."""
    magic_array = list()
    for k, v in sax_none.items():
        magic_array.append((k, len(v)))

    """[2.1] sort the tuples ascending by the number of occurrences."""
    m_arr = sorted(magic_array, key=lambda tup: tup[1])

    """[3.0] define the key vars
    bestSoFarPosition: the window start index corresponding to
        bestSoFarDistance; this window is the discordant subsequence
        of the time series.
    bestSoFarDistance: max(min(distance)); for each window we compute its
        minimum distance to the other windows, then take the maximum over
        all those minima."""
    bestSoFarPosition = -1
    bestSoFarDistance = 0.
    distanceCalls = 0
    visit_array = np.zeros(len(series), dtype=int)  # np.int was removed from NumPy

    """[4.0] and we are off iterating over the magic array entries"""
    for entry in m_arr:
        """[5.0] some more of the vars"""
        curr_word = entry[0]
        # occurrences: list of window start indices for the current word
        occurrences = sax_none[curr_word]

        """[6.0] jumping around by the same word occurrences makes it easier
        to nail down the possibly small distance value
        -- so we can be efficient and all that..."""
        # curr_pos: start index of the current window
        for curr_pos in occurrences:
            # skip if this position is already in globalRegistry
            if curr_pos in globalRegistry:
                continue

            """[7.0] we don't want an overlapping subsequence"""
            mark_start = curr_pos - win_size
            mark_end = curr_pos + win_size
            # we look for the window closest to the current one; visit_set
            # holds the window start indices we have already examined
            visit_set = set(range(mark_start, mark_end))

            """[8.0] here is our subsequence in question"""
            # cur_seq: the z-normalized subsequence
            cur_seq = znorm(series[curr_pos:(curr_pos + win_size)],
                            znorm_threshold)

            """[9.0] let's see what is NN distance"""
            # nn_dist: the minimum distance between the current window and any
            # other window (the two windows must not overlap -- and not be
            # adjacent?)
            nn_dist = np.inf
            # flag: whether to fall back to random search
            do_random_search = 1

            """[10.0] ordered by occurrences search first"""
            # visiting occurrences of the same word first makes small
            # distances easier to find
            for next_pos in occurrences:
                """[11.0] skip bad pos"""
                # skip overlapping / already-visited subsequences
                if next_pos in visit_set:
                    continue
                else:
                    visit_set.add(next_pos)

                """[12.0] distance we compute"""
                dist = euclidean(
                    cur_seq,
                    znorm(series[next_pos:(next_pos + win_size)],
                          znorm_threshold))
                distanceCalls += 1

                """[13.0] keep the books up-to-date"""
                # update nn_dist
                if dist < nn_dist:
                    nn_dist = dist
                if dist < bestSoFarDistance:
                    do_random_search = 0
                    break

            """[13.0] if not broken above, we shall proceed with random search"""
            # if the loop above finished without an early break, do the
            # random search
            if do_random_search:
                """[14.0] build that random visit order array"""
                curr_idx = 0
                # why not len(series) - win_size + 1?
                for i in range(0, (len(series) - win_size)):
                    # window start indices not yet visited above
                    if not (i in visit_set):
                        # add them to visit_array
                        visit_array[curr_idx] = i
                        curr_idx += 1
                # curr_idx now equals the number of unvisited window starts
                # shuffle the order
                it_order = np.random.permutation(visit_array[0:curr_idx])
                curr_idx -= 1

                """[15.0] and go random"""
                while curr_idx >= 0:
                    # pick a random window start index, it_order[curr_idx]
                    rand_pos = it_order[curr_idx]
                    curr_idx -= 1
                    dist = euclidean(
                        cur_seq,
                        znorm(series[rand_pos:(rand_pos + win_size)],
                              znorm_threshold))
                    distanceCalls += 1

                    """[16.0] keep the books up-to-date again"""
                    # update nn_dist
                    if dist < nn_dist:
                        nn_dist = dist
                    if dist < bestSoFarDistance:
                        nn_dist = dist
                        break

            """[17.0] and BIGGER books"""
            # update bestSoFarDistance and bestSoFarPosition
            if (nn_dist > bestSoFarDistance) and (nn_dist < np.inf):
                bestSoFarDistance = nn_dist
                bestSoFarPosition = curr_pos

    return (bestSoFarPosition, bestSoFarDistance)
def prepare_data(window_len, word_len, alphabet_len, alpha_to_num):
    source = input("Source (1=CSV,2=Finance): ")
    if source == "1":
        source = input("CSV file (weather_JAN.csv): ")
        if source == "":
            source = "weather_JAN.csv"
        ts_data = pd.read_csv(source, index_col="date",
                              parse_dates=["date"], dtype=np.float32)
        sax_ret = sax_via_window(ts_data["x"].values, window_len, word_len,
                                 alphabet_size=alphabet_len,
                                 nr_strategy="none", z_threshold=0.01)
    else:
        source = input("Remote Source (yahoo): ")
        if source == "":
            source = "yahoo"
        contract = input("Contract (SPY): ")
        if contract == "":
            contract = "SPY"
        start_date = input("Start Date (2000-01-01): ")
        if start_date == "":
            start_date = "2000-01-01"
        end_date = input("End Date (now): ")
        if end_date == "":
            end_date = datetime.datetime.now()
        ts_data = get_asset_data(source, contract, start_date, end_date)
        if "Close" in ts_data:
            close_tag = "Close"
        elif "close" in ts_data:
            close_tag = "close"
        else:
            return {"Error": "Couldn't find Close data."}
        ts_data["x"] = ts_data[close_tag]
        sax_ret = sax_via_window(ts_data["x"].values, window_len, word_len,
                                 alphabet_size=alphabet_len,
                                 nr_strategy="none", z_threshold=0.01)
    result_x = result_y = []
    if sax_ret:
        my_sax = dict()
        for k, v in sax_ret.items():
            for i in v:
                my_sax[i] = k
        tmp_d = {"x": [], "y": []}
        for i in range(len(my_sax)):
            word = my_sax[i]
            if i < len(my_sax) - 1:
                pred = my_sax[i + 1][-1]
                num_list = [np.float32(alpha_to_num[char][1]) for char in word]
                increment_list = []
                for num in num_list:
                    increment_list.append(num)
                tmp_d["x"].append(np.array(increment_list))
                tmp_d["y"].append(
                    np.array([np.float32(alpha_to_num[pred][1])]))
        last_ts = ts_data["x"].values[-window_len:]
        last_sax = my_sax[len(my_sax) - 1]
        print("LAST WINDOW ITEMS: ", last_ts)
        print("MAX WINDOW ITEM  : ", max(last_ts))
        print("MIN WINDOW ITEM  : ", min(last_ts))
        print("     LAST SAX    : ", last_sax)
        # FORMAT:
        # result_x[0] = [1]          result_y[0] = 3
        # result_x[1] = [1,4]        result_y[1] = 3
        # result_x[2] = [1,4,2]      result_y[2] = 3
        # result_x[3] = [1,4,2,2]    result_y[3] = 3
        # result_x[4] = [1,4,2,2,4]  result_y[4] = 3
        #####
        opt_val_test = input("Validate and Test: ")
        if opt_val_test == "y":
            train_percent = float(input("Train %: "))
            # Separate dataset into train, val and test
            pos_train = int(len(tmp_d["x"]) * train_percent)
            pos_train = int(pos_train / window_len) * window_len
            pos_val = len(tmp_d["x"][pos_train:]) / 2
            pos_val = pos_train + int(pos_val / window_len) * window_len
            pos_test = pos_val
            result_x = dict()
            result_x["train"] = tmp_d["x"][:pos_train]
            result_x["val"] = tmp_d["x"][pos_train:pos_val]
            result_x["test"] = tmp_d["x"][pos_test:]
            result_y = dict()
            result_y["train"] = np.array(tmp_d["y"][:pos_train])
            result_y["val"] = np.array(tmp_d["y"][pos_train:pos_val])
            result_y["test"] = np.array(tmp_d["y"][pos_val:])
        else:
            result_x = {"train": tmp_d["x"]}
            result_y = {"train": np.array(tmp_d["y"])}
    else:
        print("Not enough data!")
    return result_x, result_y
def find_best_discord_hotsax(series, win_size, a_size, paa_size,
                             znorm_threshold, globalRegistry):  # noqa: C901
    """Find the best discord with hotsax."""
    """[1.0] get the sax data first"""
    sax_none = sax_via_window(series, win_size, a_size, paa_size, "none", 0.01)

    """[2.0] build the 'magic' array"""
    magic_array = list()
    for k, v in sax_none.items():
        magic_array.append((k, len(v)))

    """[2.1] sort it ascending by the number of occurrences"""
    m_arr = sorted(magic_array, key=lambda tup: tup[1])

    """[3.0] define the key vars"""
    bestSoFarPosition = -1
    bestSoFarDistance = 0.
    distanceCalls = 0
    visit_array = np.zeros(len(series), dtype=int)  # np.int was removed from NumPy

    """[4.0] and we are off iterating over the magic array entries"""
    for entry in m_arr:
        """[5.0] some more of the vars"""
        curr_word = entry[0]
        occurrences = sax_none[curr_word]

        """[6.0] jumping around by the same word occurrences makes it easier
        to nail down the possibly small distance value
        -- so we can be efficient and all that..."""
        for curr_pos in occurrences:
            if curr_pos in globalRegistry:
                continue

            """[7.0] we don't want an overlapping subsequence"""
            mark_start = curr_pos - win_size
            mark_end = curr_pos + win_size
            visit_set = set(range(mark_start, mark_end))

            """[8.0] here is our subsequence in question"""
            cur_seq = znorm(series[curr_pos:(curr_pos + win_size)],
                            znorm_threshold)

            """[9.0] let's see what is NN distance"""
            nn_dist = np.inf
            do_random_search = 1

            """[10.0] ordered by occurrences search first"""
            for next_pos in occurrences:
                """[11.0] skip bad pos"""
                if next_pos in visit_set:
                    continue
                else:
                    visit_set.add(next_pos)

                """[12.0] distance we compute"""
                dist = euclidean(
                    cur_seq,
                    znorm(series[next_pos:(next_pos + win_size)],
                          znorm_threshold))
                distanceCalls += 1

                """[13.0] keep the books up-to-date"""
                if dist < nn_dist:
                    nn_dist = dist
                if dist < bestSoFarDistance:
                    do_random_search = 0
                    break

            """[13.0] if not broken above, we shall proceed with random search"""
            if do_random_search:
                """[14.0] build that random visit order array"""
                curr_idx = 0
                for i in range(0, (len(series) - win_size)):
                    if not (i in visit_set):
                        visit_array[curr_idx] = i
                        curr_idx += 1
                it_order = np.random.permutation(visit_array[0:curr_idx])
                curr_idx -= 1

                """[15.0] and go random"""
                while curr_idx >= 0:
                    rand_pos = it_order[curr_idx]
                    curr_idx -= 1
                    dist = euclidean(
                        cur_seq,
                        znorm(series[rand_pos:(rand_pos + win_size)],
                              znorm_threshold))
                    distanceCalls += 1

                    """[16.0] keep the books up-to-date again"""
                    if dist < nn_dist:
                        nn_dist = dist
                    if dist < bestSoFarDistance:
                        nn_dist = dist
                        break

            """[17.0] and BIGGER books"""
            if (nn_dist > bestSoFarDistance) and (nn_dist < np.inf):
                bestSoFarDistance = nn_dist
                bestSoFarPosition = curr_pos

    return (bestSoFarPosition, bestSoFarDistance)
def _prepare_data(self, alpha_to_num):
    result_x = result_y = dict()
    last_sax_word = ""
    sax_ret = dict()
    if self.source_type == "csv":
        if "http://" in self.source or "https://" in self.source:
            csv_file = "tmp.csv"
            urlretrieve(self.source, csv_file)
            ts_data = pd.read_csv(csv_file)
            if "input" in ts_data:
                ts_data["input"] = pd.to_numeric(ts_data["input"],
                                                 downcast='float')
                sax_ret = sax_via_window(ts_data["input"].values,
                                         self.window_len,
                                         self.word_len,
                                         alphabet_size=self.alphabet_size,
                                         nr_strategy="none",
                                         z_threshold=0.01)
            os.remove(csv_file)
        else:
            log.error("Error: Invalid Link.")
            return result_x, result_y, last_sax_word
    elif self.source_type == "financial":
        ts_data = self._get_asset_data()
        if "Close" in ts_data:
            close_tag = "Close"
        elif "close" in ts_data:
            close_tag = "close"
        else:
            log.error("Error: Couldn't find Close data.")
            return result_x, result_y, last_sax_word
        ts_data["input"] = ts_data[close_tag]
        sax_ret = sax_via_window(ts_data["input"].values,
                                 self.window_len,
                                 self.word_len,
                                 alphabet_size=self.alphabet_size,
                                 nr_strategy="none",
                                 z_threshold=0.01)
    else:
        log.error("Invalid 'source_type'!")
        return result_x, result_y, last_sax_word
    if sax_ret:
        my_sax = dict()
        for k, v in sax_ret.items():
            for i in v:
                my_sax[i] = k
        tmp_d = {"x": [], "y": []}
        for i in range(len(my_sax)):
            word = my_sax[i]
            if i < len(my_sax) - 1:
                pred = my_sax[i + 1][-1]
                num_list = [
                    np.float32(alpha_to_num[char][1]) for char in word
                ]
                increment_list = []
                for num in num_list:
                    increment_list.append(num)
                tmp_d["x"].append(np.array(increment_list))
                tmp_d["y"].append(
                    np.array([np.float32(alpha_to_num[pred][1])]))
        result_x = {"train": tmp_d["x"]}
        result_y = {"train": np.array(tmp_d["y"])}
        if my_sax:
            last_sax_word = my_sax[len(my_sax) - 1]
        else:
            log.error("Not enough SAX data!")
    else:
        log.error("Not enough data!")
    return result_x, result_y, last_sax_word
def main():
    window_len = int(input("window_len: "))
    word_len = int(input("word_len: "))
    alphabet_len = int(input("alphabet_len: "))
    alpha_to_num_step = float(1 / alphabet_len)
    alpha_to_num_map = float(alpha_to_num_step / 2)
    source = "weather_JAN.csv"
    ts_data = pd.read_csv(source, index_col="date",
                          parse_dates=["date"], dtype=np.float32)
    sax_ret = sax_via_window(ts_data["temp"].values, window_len, word_len,
                             alphabet_size=alphabet_len,
                             nr_strategy="none", z_threshold=0.01)
    my_sax = dict()
    for k, v in sax_ret.items():
        for i in v:
            my_sax[i] = k
    tmp_d = {"x": [], "y": []}
    for k, v in my_sax.items():
        num_list = [
            np.float32(((ord(char) - 96) * alpha_to_num_step) -
                       alpha_to_num_map) for char in v[:-1]
        ]
        increment_list = []
        for num in num_list:
            increment_list.append(num)
        tmp_d["x"].append(np.array(increment_list))
        tmp_d["y"].append(
            np.array([
                np.float32("".join([
                    str(((ord(char) - 96) * alpha_to_num_step) -
                        alpha_to_num_map) for char in v[-1]
                ]))
            ]))
    # FORMAT:
    # result_x[0] = [1]          result_y[0] = 3
    # result_x[1] = [1,4]        result_y[1] = 3
    # result_x[2] = [1,4,2]      result_y[2] = 3
    # result_x[3] = [1,4,2,2]    result_y[3] = 3
    # result_x[4] = [1,4,2,2,4]  result_y[4] = 3
    #####
    result_x = dict()
    result_x["train"] = tmp_d["x"][:len(tmp_d["x"]) - 2000]
    result_x["test"] = tmp_d["x"][len(tmp_d["x"]) - 2000:len(tmp_d["x"]) - 1000]
    result_x["val"] = tmp_d["x"][len(tmp_d["x"]) - 1000:len(tmp_d["x"])]
    result_y = dict()
    result_y["train"] = np.array(tmp_d["y"][:len(tmp_d["y"]) - 2000])
    result_y["test"] = np.array(
        tmp_d["y"][len(tmp_d["y"]) - 2000:len(tmp_d["y"]) - 1000])
    result_y["val"] = np.array(
        tmp_d["y"][len(tmp_d["y"]) - 1000:len(tmp_d["y"])])
    batch_size = window_len * (word_len - 1)
    h_dims = word_len
    epochs = input("Epochs: ")
    if not epochs == "":
        epochs = int(epochs)
    else:
        epochs = 100
    start_time = time.time()
    model_file = "{}_epochs.model".format(epochs)
    if not os.path.exists(model_file):
        x = C.sequence.input_variable(1)
        z = create_model(x, h_dims)
        var_l = C.input_variable(1, dynamic_axes=z.dynamic_axes, name="y")
        learning_rate = 0.005
        lr_schedule = C.learning_parameter_schedule(learning_rate)
        loss = C.squared_error(z, var_l)
        error = C.squared_error(z, var_l)
        momentum_schedule = C.momentum_schedule(0.9, minibatch_size=batch_size)
        learner = C.fsadagrad(z.parameters, lr=lr_schedule,
                              momentum=momentum_schedule)
        trainer = C.Trainer(z, (loss, error), [learner])
        # training
        loss_summary = []
        start = time.time()
        for epoch in range(0, epochs):
            for x_batch, l_batch in next_batch(result_x, result_y, "train",
                                               batch_size):
                trainer.train_minibatch({x: x_batch, var_l: l_batch})
            if epoch % (epochs / 10) == 0:
                training_loss = trainer.previous_minibatch_loss_average
                loss_summary.append(training_loss)
                print("epoch: {}, loss: {:.4f}".format(epoch, training_loss))
        print("Training took {:.1f} sec".format(time.time() - start))
        # Print the train, validation and test errors
        for label_txt in ["train", "val", "test"]:
            print("mse for {}: {:.6f}".format(
                label_txt,
                get_mse(trainer, x, result_x, result_y, batch_size, var_l,
                        label_txt)))
        z.save(model_file)
    else:
        z = C.load_model(model_file)
        x = C.logging.find_all_with_name(z, "")[-1]
    # Print out all layers in the model
    print("Loading {} and printing all nodes:".format(model_file))
    node_outputs = C.logging.find_all_with_name(z, "")
    for n in node_outputs:
        print("  {}".format(n))
    results = []
    # predict
    # f, a = plt.subplots(2, 1, figsize=(12, 8))
    for j, ds in enumerate(["val", "test"]):
        fig = plt.figure()
        a = fig.add_subplot(2, 1, 1)
        results = []
        for x_batch, y_batch in next_batch(result_x, result_y, ds, batch_size):
            pred = z.eval({x: x_batch})
            results.extend(pred[:, 0])
        # because we normalized the input data we need to multiply the
        # prediction with SCALER to get the real values.
        a.plot((result_y[ds]).flatten(), label=ds + " raw")
        a.plot(np.array(results), label=ds + " pred")
        a.legend()
        fig.savefig("{}_chart_{}_epochs.jpg".format(ds, epochs))
    print("Delta: ", time.time() - start_time)
    return result_x, result_y, results
def prepare_data(window_len, word_len, alphabet_len, alpha_to_num,
                 train_percent):
    source = input("Source (1=CSV,2=Finance): ")
    if source == "1":
        source = "weather_JAN.csv"
        ts_data = pd.read_csv(source, index_col="date",
                              parse_dates=["date"], dtype=np.float32)
        sax_ret = sax_via_window(ts_data["temp"].values, window_len, word_len,
                                 alphabet_size=alphabet_len,
                                 nr_strategy="none", z_threshold=0.01)
    else:
        source = input("Remote Source (yahoo): ")
        if source == "":
            source = "yahoo"
        contract = input("Contract (SPY): ")
        if contract == "":
            contract = "SPY"
        start_date = input("Start Date (2000-01-01): ")
        if start_date == "":
            start_date = "2000-01-01"
        end_date = input("End Date (now): ")
        if end_date == "":
            end_date = datetime.datetime.now()
        ts_data = get_asset_data(source, contract, start_date, end_date)
        if "Close" in ts_data:
            close_tag = "Close"
        elif "close" in ts_data:
            close_tag = "close"
        else:
            return {"Error": "Couldn't find Close data."}
        sax_ret = sax_via_window(ts_data[close_tag].values, window_len,
                                 word_len, alphabet_size=alphabet_len,
                                 nr_strategy="none", z_threshold=0.01)
    my_sax = dict()
    for k, v in sax_ret.items():
        for i in v:
            my_sax[i] = k
    tmp_d = {"x": [], "y": []}
    for k, v in my_sax.items():
        num_list = [np.float32(alpha_to_num[char][1]) for char in v[:-1]]
        increment_list = []
        for num in num_list:
            increment_list.append(num)
        tmp_d["x"].append(np.array(increment_list))
        tmp_d["y"].append(
            np.array([np.float32(alpha_to_num[char][1]) for char in v[-1]]))
    # FORMAT:
    # result_x[0] = [1]          result_y[0] = 3
    # result_x[1] = [1,4]        result_y[1] = 3
    # result_x[2] = [1,4,2]      result_y[2] = 3
    # result_x[3] = [1,4,2,2]    result_y[3] = 3
    # result_x[4] = [1,4,2,2,4]  result_y[4] = 3
    #####
    # Separate dataset into train (80%), val (10%) and test (10%)
    pos_train = int(len(tmp_d["x"]) * train_percent)
    pos_train = int(pos_train / window_len) * window_len
    pos_val = len(tmp_d["x"][pos_train:]) / 2
    pos_val = pos_train + int(pos_val / window_len) * window_len
    pos_test = pos_val
    result_x = dict()
    result_x["train"] = tmp_d["x"][:pos_train]
    result_x["val"] = tmp_d["x"][pos_train:pos_val]
    result_x["test"] = tmp_d["x"][pos_test:]
    result_y = dict()
    result_y["train"] = np.array(tmp_d["y"][:pos_train])
    result_y["val"] = np.array(tmp_d["y"][pos_train:pos_val])
    result_y["test"] = np.array(tmp_d["y"][pos_val:])
    return result_x, result_y
def generate_candidates(self):
    candidates, cand_len = [], self.max_len
    # Fast shapelets (SAX representation)
    if self.fast_shapelets:
        while cand_len >= self.min_len:
            sax_time_series = []
            mask_patterns = []
            mask_dict = []
            sax_list = []
            masked_words = {}
            sax_index = []
            sax_counts = []
            max_count = 0
            if (self.fast_shapelet_arg['n_masking'] <
                    self.fast_shapelet_arg['alpha_size']):
                counter = 0
                while counter < self.fast_shapelet_arg['n_masking']:
                    mask_index = random.sample(
                        range(self.fast_shapelet_arg['alpha_size']),
                        self.fast_shapelet_arg['mask_len'])
                    if mask_index in mask_patterns:
                        continue
                    mask_patterns.append(mask_index)
                    counter += 1
            counter = 0
            for time_serie, label in zip(self.time_series, self.labels):
                sax = sax_via_window(time_serie, cand_len, 5, 5, "none", 0.01)
                sax_ = {}
                for key, indexes in sax.items():
                    for index in indexes:
                        sax_[index] = key
                sax = list(sax_.values())
                sax_time_series.append((sax, label))
                sax_list.extend(sax)
            for iteration in mask_patterns:
                masked_list = {}
                masked_words = []
                for sax_series, label in sax_time_series:
                    masked_sax = []
                    words = sax_series
                    for mask_index in iteration:
                        if mask_index == 0:
                            words = [word[1:] for word in words]
                        elif mask_index == (
                                self.fast_shapelet_arg['alpha_size'] - 1):
                            words = [
                                word[:self.fast_shapelet_arg['alpha_size'] - 1]
                                for word in words
                            ]
                        else:
                            words = [
                                (word[:mask_index] + word[mask_index + 1:])
                                for word in words
                            ]
                    masked_words.extend(words)
                    if label in masked_list:
                        masked_list[label] = masked_list[label] + words
                    else:
                        masked_list[label] = words
                if not sax_counts:
                    for word in masked_words:
                        sax_dict = {}
                        for label in masked_list:
                            sax_dict[label] = masked_list[label].count(word)
                            max_count = max(max_count, sax_dict[label])
                        sax_counts.append(sax_dict)
                else:
                    for index, word in enumerate(masked_words):
                        sax_dict = sax_counts[index]
                        for label in masked_list:
                            sax_dict[label] = (sax_dict[label] +
                                               masked_list[label].count(word))
                            max_count = max(max_count, sax_dict[label])
                        sax_counts[index] = sax_dict
            sax_complement = []
            sax_dist_pow = []
            for sax_count in sax_counts:
                sax_dict = {}
                sax_dist = {}
                for label in sax_count:
                    sax_dict[label] = max_count - sax_count[label]
                sax_dist = {
                    key: sax_dict[key] - sax_count.get(key, 0)
                    for key in sax_count.keys()
                }
                sax_complement.append(sax_dict)
                sax_dist_pow.append(sum(sax_dist.values()))
            max_dist_pow = sorted(
                range(len(sax_dist_pow)),
                key=lambda i: sax_dist_pow[i],
                reverse=True)[:self.shapelet_count]
            for max_index in max_dist_pow:
                index = 0
                for idx, time_serie in enumerate(sax_time_series):
                    index += len(time_serie[0])
                    if index >= max_index:
                        location = max_index - (index - len(time_serie[0]))
                        candidates.append(
                            (self.time_series[idx][location:location + cand_len],
                             time_serie[1]))
                        break
            if cand_len == self.min_len:
                break
            if self.shapelet_range:
                cand_len -= 1
            else:
                cand_len = self.min_len
    # Normal representation
    else:
        while cand_len >= self.min_len:
            for time_serie, label in zip(self.time_series, self.labels):
                for k in range(len(time_serie) - cand_len + 1):
                    candidates.append((time_serie[k:k + cand_len], label))
            if cand_len == self.min_len:
                break
            if self.shapelet_range:
                cand_len -= 1
            else:
                cand_len = self.min_len
    return pd.DataFrame(candidates)