def maximize_level_node(self, max_level):
    """
    Try to maximize the level value of this node.

    Repeatedly raises the node's level while every time series in the
    group still maps to the same pattern representation (SAX word),
    stopping at max_level or at the first level where the group diverges.

    :param max_level: upper bound on the level (alphabet size) to try
    :return: None; updates self.level and self.pattern_representation
    """
    values_group = list(self.group.values())
    original_level = self.level
    equal = True
    while equal and self.level < max_level:
        temp_level = self.level + 1
        # SAX word of the first series at the tentative level
        data = np.array(values_group[0])
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
        for index in range(1, len(values_group)):
            data = np.array(values_group[index])
            data_znorm = znorm(data)
            data_paa = paa(data_znorm, self.paa_value)
            pr_2 = ts_to_string(data_paa, cuts_for_asize(temp_level))
            if pr_2 != pr:
                equal = False
                break  # one mismatch is enough to reject this level
        if equal:
            self.level = temp_level
    if original_level != self.level:
        logger.info("New level for node: {}".format(self.level))
        # recompute the pattern representation at the final level
        data = np.array(values_group[0])
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        self.pattern_representation = ts_to_string(data_paa, cuts_for_asize(self.level))
    else:
        logger.info("Can't split again, max level already reached")
def update_features(self, indices):
    """Update positive/negative symbolic-feature coverage for the given indices.

    Converts the selected activations to SAX words, removes any newly
    covered feature from the test objective, refreshes both coverage
    ratios and returns them as (coverage_p, coverage_n).
    """
    self.minimalp = 0
    self.minimaln = 0
    act_pos, act_neg = self.get_activations()
    print("activation1")
    print(act_pos)
    print("activation2")
    print(act_neg)
    # SAX words of the selected positive / negative activations
    word_pos = ts_to_string(znorm(act_pos[indices]), cuts_for_asize(self.symbols))
    word_neg = ts_to_string(znorm(act_neg[indices]), cuts_for_asize(self.symbols))
    # features are stored as tuples of characters
    feat_pos = tuple(word_pos)
    feat_neg = tuple(word_neg)
    print("found symbolic feature for SQ-P:", word_pos)
    if feat_pos in self.testObjective.feature_p:
        self.minimalp = 1
        self.testObjective.feature_p.remove(feat_pos)
    self.coverage_p = 1 - len(self.testObjective.feature_p
                              ) / self.testObjective.originalNumOfFeature
    self.displayCoverage1()
    print("found symbolic feature for SQ-N:", word_neg)
    if feat_neg in self.testObjective.feature_n:
        self.minimaln = 1
        self.testObjective.feature_n.remove(feat_neg)
    self.coverage_n = 1 - len(self.testObjective.feature_n
                              ) / self.testObjective.originalNumOfFeature
    self.displayCoverage2()
    return self.coverage_p, self.coverage_n
def test_stringing():
    """Test string conversion."""
    # Cuts for alphabet size 11:
    # np.array([-np.inf, -1.33517773611894, -0.908457868537385,
    #           -0.604585346583237, -0.348755695517045,
    #           -0.114185294321428, 0.114185294321428, 0.348755695517045,
    #           0.604585346583237, 0.908457868537385, 1.33517773611894])
    cuts11 = alphabet.cuts_for_asize(11)
    # values straddling the lowest cut map to 'a' then 'b'
    assert 'ab' == sax.ts_to_string(
        np.array([-1.33517773611895, -1.33517773611894]), cuts11)
    # values at/above the highest cut map to 'k' then 'j'
    assert 'kj' == sax.ts_to_string(
        np.array([1.33517773611895, 1.33517773611894]), cuts11)
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', z_threshold=0.01):
    """Simple via window conversion implementation.

    Slides a window of win_size over series, z-normalizes each window,
    reduces it to paa_size segments with PAA and discretizes it to a SAX
    word; returns a dict mapping each word to the list of window start
    indices that produced it.

    :param nr_strategy: numerosity reduction -- 'exact' skips a window whose
        word equals the previous one, 'mindist' skips a window whose word is
        at SAX distance zero from the previous one.
    """
    cuts = cuts_for_asize(alphabet_size)
    sax = defaultdict(list)

    prev_word = ''

    # FIX: iterate to len(series) - win_size + 1 so the final full window
    # is included (the original range dropped the last window; compare the
    # sibling apply_adaptive_sax implementation).
    for i in range(0, len(series) - win_size + 1):

        sub_section = series[i:(i + win_size)]

        zn = znorm(sub_section, z_threshold)

        paa_rep = paa(zn, paa_size)

        curr_word = ts_to_string(paa_rep, cuts)

        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word

        sax[curr_word].append(i)

    return sax
def apply_adaptive_sax(ts, win_size, paa_size, alphabet_size, z_threshold):
    """
    Apply the SAX transformation to a 1-dim time series using adaptive
    break-points.

    :param ts: 1-dim time series
    :type ts: 1D array
    :param win_size: size of the sliding window that generates each sax word
    :type win_size: int
    :param paa_size: number of characters in a single sax word
    :type paa_size: int
    :param alphabet_size: number of unique characters to use in the sax
        representation
    :type alphabet_size: int
    :param z_threshold: z_threshold for the znorm method from saxpy
    :type z_threshold: float
    :return: the sax sequence, one string per window position
    :rtype: list of str
    """
    cuts = cuts_for_asize(alphabet_size)
    n_windows = len(ts) - win_size + 1
    sax_sequence = []
    for start in range(n_windows):
        window = ts[start:start + win_size]
        normalized = znorm(window, z_threshold)
        reduced = paa(normalized, paa_size)
        sax_sequence.append(ts_to_string(reduced, cuts))
    return sax_sequence
def SAXtransform(df, nbCuts, segmentSize):
    """Return a DataFrame whose columns hold the per-character SAX
    representation of each column of df after average discretisation."""
    discretised = AvgDiscretisation(df, segmentSize)
    cuts = cuts_for_asize(nbCuts)
    out = pd.DataFrame()
    for column in discretised.columns:
        # one SAX character per row of the discretised column
        out[column] = list(ts_to_string(discretised[column].values, cuts))
    return out
def discretizar(v):
    """Discretise series v into integer symbols 1..5 via a 5-letter SAX word."""
    alphabet_values = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
    # abc: the SAX word (a string), one character per element of v
    abc = ts_to_string(v, cuts_for_asize(5))
    return [alphabet_values[letter] for letter in abc]
def SAX(sequence: np.ndarray, alphabet_size: int, length: int = 0) -> str:
    """
    Computes SAX string of a sequence of numbers with specified alphabet size.

    Length of the output string may be specified; length 0 will generate a
    string as long as the sequence.
    """
    debug("Calculating SAX of {}, with alphabet of size {}".format(
        sequence, alphabet_size))
    if alphabet_size == 1:
        # A one-letter alphabet is all 'a's, no normalization needed.
        return "a" * (len(sequence) if length == 0 else length)
    cuts = cuts_for_asize(alphabet_size)
    normalized = znorm(sequence)
    if length in (0, len(sequence)):
        return ts_to_string(normalized, cuts)
    # PAA-reduce to the requested output length first.
    return ts_to_string(paa(normalized, length), cuts)
def update_features(self, data):
    """Mark as covered (and count) the positive/negative symbolic features
    reached by the hidden state produced from data."""
    self.hidden = self.state_manager.get_hidden_state(data)
    act_p, act_n = self.get_activation()
    cuts = cuts_for_asize(self.symbols)
    word_p = ts_to_string(znorm(act_p[self.indices]), cuts)
    word_n = ts_to_string(znorm(act_n[self.indices]), cuts)
    feat_p = tuple(word_p)
    feat_n = tuple(word_n)
    if feat_p in self.feature_p:
        idx = self.feature_p.index(feat_p)
        self.covered_dict_p[idx] = True
        self.frequency_dict_p[idx] += 1
    if feat_n in self.feature_n:
        idx = self.feature_n.index(feat_n)
        self.covered_dict_n[idx] = True
        self.frequency_dict_n[idx] += 1
def update_features(self, data):
    """Mark the symbolic feature reached by data's hidden-state activations
    as covered."""
    self.hidden = self.state_manager.get_hidden_state(data)
    activation = self.get_activation()
    word = ts_to_string(znorm(activation[self.indices]),
                        cuts_for_asize(self.symbols))
    feature = tuple(word)
    if feature in self.feature:
        self.covered_dict[self.feature.index(feature)] = True
def saxrepresentation(matrix):
    """Return the 3-letter SAX word (PAA size 3, alphabet size 3) of every
    column of matrix.

    :param matrix: 2-D array; each column is treated as one time series
    :return: list of SAX strings, one per column
    """
    # FIX: removed dead local `index_ts` (assigned once, never used).
    result = []
    for ts in matrix.T:
        # z-normalize, reduce to 3 segments, then discretize to 3 symbols
        data_paa = paa(znorm(ts), 3)
        result.append(ts_to_string(data_paa, cuts_for_asize(3)))
    return result
def row_pattern_loss(row: np.ndarray, pr: Tuple[str, int]):
    """Distance between the z-normalized (and, if lengths differ,
    PAA-reduced) row and the cut values encoded by pr = (word, level)."""
    word, level = pr
    # drop the leading -inf cut; letters index cuts by offset from 'a'
    cuts = cuts_for_asize(level + 1)[1:]
    pattern = [cuts[ord(symbol) - ord('a')] for symbol in word]
    normalized_row = znorm(row)
    if len(pattern) != len(row):
        normalized_row = paa(normalized_row, len(pattern))
    return distance(normalized_row, pattern)
def sax_transform(time_series_df, num_cuts):
    """
    Applies SAX transformation for a number of cuts, e.g. 3, 5, ... to have
    symbolic representation.
    """
    preprocessed = _preprocess_ts(time_series_df)
    cuts = cuts_for_asize(num_cuts)
    # one SAX word per occurrence group
    aggregated = preprocessed.groupby(OCCURRENCE_INDEX).agg({
        'value_representation': lambda x: ts_to_string(x.to_numpy(), cuts)
    })
    return aggregated.reset_index()
def sax(self, cardinality):
    '''
    Creates SAX representation of the time series

    :param cardinality: number of symbols to use in SAX representation
    :return: None; sets self.cardinality, self.cuts, self.string,
        self.cuts_den, self.data['symbol'] and self.sax_freq
    '''
    self.cardinality = cardinality
    self.cuts = cuts_for_asize(self.cardinality)
    # SAX word of the normalized series (assumes self.norm_values is already
    # z-normalized -- TODO confirm against the constructor)
    self.string = ts_to_string(self.norm_values, self.cuts)
    # denormalize cuts for correct visualisation
    self.cuts_den = self.cuts * self.std + self.mean
    # one symbol column entry per data point
    self.data['symbol'] = list(self.string)
    self.sax_freq = self.generate_freq()
def update_features(self, data):
    """Mark every symbolic feature reached by data's hidden-state
    activations as covered."""
    self.hidden = self.state_manager.get_hidden_state([data])
    activation = self.get_activation()
    # standardize with the stored mean/std, then PAA-reduce each row
    standardized = (activation[:, self.indices] - self.mean) / self.std
    cuts = cuts_for_asize(self.symbols)
    for row in standardized:
        word = tuple(ts_to_string(paa(row, self.seq_len), cuts))
        if word in self.feature:
            self.covered_dict[self.feature.index(word)] = True
def fitness(self, hidden, sym):
    """Fitness of each normalized, PAA-reduced activation series against the
    target symbol sequence sym."""
    obj = self.testObjective
    activation = self.get_activations(hidden)
    normalized = Z_ScoreNormalization(
        activation[:, obj.indices], obj.mean, obj.std)
    reduced = [paa(series, obj.seq_len) for series in normalized]
    # close the last interval with +inf so every value falls in a bucket
    cuts = np.append(cuts_for_asize(obj.symbols), np.array([np.inf]))
    target_len = len(sym)
    return np.array([
        self.cal_fittness_seq(cuts, target_len, sym, series)
        for series in reduced
    ])
def extract_features(song_name):
    """Return the MFCC and chroma features of song_name in SAX representation.

    :param song_name: path to the audio file
    :return: list of SAX strings, one per feature row, or None when the
        file cannot be loaded
    """
    try:
        x, fs = librosa.load(song_name)
    except Exception:
        # FIX: narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; load failures still yield None.
        return None
    mfccs = librosa.feature.mfcc(x, sr=fs, n_mfcc=39)
    chroma = librosa.feature.chroma_stft(x, sr=fs)
    feature_matrix = np.concatenate((mfccs, chroma))
    # one SAX word per feature row: z-normalize, PAA-reduce, discretize
    # (removed the dead `sax_rep = []` assignment that was immediately
    # overwritten)
    return [
        ts_to_string(paa(znorm(feat), SAX_VOCAB_LENGTH),
                     cuts_for_asize(SAX_VOCAB_LENGTH))
        for feat in feature_matrix
    ]
def sax_via_window(series, win_size, paa_size, alphabet_size=3, nr_strategy='exact', z_threshold=0.01): """Simple via window conversion implementation.""" # 生成指定size的alphabet cuts cuts = cuts_for_asize(alphabet_size) # 初始化sax sax = defaultdict(list) prev_word = '' for i in range(0, len(series) - win_size): # series被当前窗口所围住的子部分 sub_section = series[i:(i + win_size)] # 标准化 zn = znorm(sub_section, z_threshold) # PAA分段聚合 将子部分降维到paa_size维 paa_rep = paa(zn, paa_size) # PAA后的序列转化为字符串 curr_word = ts_to_string(paa_rep, cuts) # if '' != prev_word: if 'exact' == nr_strategy and prev_word == curr_word: continue elif 'mindist' == nr_strategy and\ is_mindist_zero(prev_word, curr_word): continue prev_word = curr_word sax[curr_word].append(i) return sax
def update_features(self, hidden, test_num):
    """Update coverage bookkeeping for the symbolic features hit by hidden.

    Removes newly covered features from the objective, refreshes the overall
    coverage ratio, and for every still-uncovered feature keeps the best
    (lowest-fitness) test record seen so far.

    :param hidden: hidden states to evaluate
    :param test_num: index of the current test, used to tag records
    """
    activation = self.get_activations(hidden)
    dat_znorm = Z_ScoreNormalization(
        activation[:, self.testObjective.indices],
        self.testObjective.mean, self.testObjective.std)
    dat_znorm = [
        paa(item, self.testObjective.seq_len) for item in dat_znorm
    ]
    features = [
        tuple(
            ts_to_string(item,
                         cuts_for_asize(self.testObjective.symbols)))
        for item in dat_znorm
    ]
    self.cov_count += 1
    for feature in features:
        if feature in self.testObjective.feature:
            # new feature covered: reset stagnation counter and move the
            # feature from "pending" to "covered"
            self.cov_count = 0
            self.testObjective.feature.remove(feature)
            self.testObjective.covered_feature.append(feature)
            del self.testObjective.test_record[feature]
    self.coverage = 1 - len(self.testObjective.feature
                            ) / self.testObjective.originalNumOfFeature
    cov_fitness = np.array([
        self.fitness(hidden, listElem)
        for listElem in self.testObjective.feature
    ])
    cov_index = np.min(cov_fitness, axis=1)
    cov_fitness = np.argmin(cov_fitness, axis=1)
    # NOTE(review): cov_index holds minimum fitness values and cov_fitness
    # holds argmin indices -- the names look swapped and the stored record
    # mixes them; preserved as-is, confirm against consumers of test_record.
    for idx, feature in enumerate(self.testObjective.feature):
        test_record = self.testObjective.test_record[feature]
        # FIX: `is None` instead of `== None` (identity test for None)
        if test_record is None or test_record[1] > cov_fitness[idx]:
            self.testObjective.test_record[feature] = list(
                [test_num + cov_index[idx], cov_fitness[idx]])
    self.displayCoverage()
def znorm_paa_sax(time_series, alpha, w=3, missing='z'):
    """Z-normalize a real-valued series, PAA-reduce it to w segments and
    return its SAX word over an alphabet of size alpha.

    time_series: array holding a time series of one measurement for one patient
    w: the dimensionality to reduce to using PAA
    alpha: number of discretized segments that the SAX representation reflects
    missing: value returned when the series is empty
    """
    # A string input becomes a list of its characters, e.g. 'abc' ->
    # ['a', 'b', 'c'], because that is the structure required below.
    if isinstance(time_series, str):
        time_series = list(time_series)
    if len(time_series) == 0:
        return missing
    # normalize, reduce dimensionality to w, then discretize
    normalized = znorm(np.array(time_series))
    reduced = paa(normalized, w)
    return ts_to_string(reduced, cuts_for_asize(alpha))
def recycle_bad_leaves(p_value, good_leaf_nodes, bad_leaf_nodes, suppressed_nodes, paa_value):
    """
    Recycle bad-leaves phase of (k,P)-anonymity.

    Groups bad leaves by level and, starting from the highest level, merges
    leaves that share a pattern representation; merged leaves reaching
    p_value records become good leaves, the rest are demoted one level and
    retried. Leaves that can never be recycled are suppressed.

    :param p_value: minimum number of records a good leaf must contain
    :param good_leaf_nodes: output list, receives recycled good leaves
    :param bad_leaf_nodes: input list of bad leaves to recycle
    :param suppressed_nodes: output list, receives unrecyclable leaves
    :param paa_value: PAA size used to recompute pattern representations
    """
    # bucket the bad leaves by their current level
    bad_leaf_nodes_dict = dict()
    for node in bad_leaf_nodes:
        if node.level in bad_leaf_nodes_dict.keys():
            bad_leaf_nodes_dict[node.level].append(node)
        else:
            bad_leaf_nodes_dict[node.level] = [node]
    bad_leaf_nodes_size = sum([node.size for node in bad_leaf_nodes])
    if bad_leaf_nodes_size >= p_value:
        # max bad level: start recycling from the most specific patterns
        current_level = max(bad_leaf_nodes_dict.keys())
        while bad_leaf_nodes_size >= p_value:
            if current_level in bad_leaf_nodes_dict.keys():
                # group the nodes of this level by pattern representation;
                # keys_to_be_removed tracks patterns seen only once
                merge_dict = dict()
                keys_to_be_removed = list()
                merge = False
                for current_level_node in bad_leaf_nodes_dict[
                        current_level]:
                    pr_node = current_level_node.pattern_representation
                    if pr_node in merge_dict.keys():
                        merge = True
                        merge_dict[pr_node].append(current_level_node)
                        if pr_node in keys_to_be_removed:
                            keys_to_be_removed.remove(pr_node)
                    else:
                        merge_dict[pr_node] = [current_level_node]
                        keys_to_be_removed.append(pr_node)
                if merge:
                    # drop singleton patterns: only real merges are processed
                    for k in keys_to_be_removed:
                        del merge_dict[k]
                    for pr, node_list in merge_dict.items():
                        group = dict()
                        for node in node_list:
                            bad_leaf_nodes_dict[current_level].remove(node)
                            group.update(node.group)
                        if current_level > 1:
                            level = current_level
                        else:
                            level = 1
                        leaf_merge = Node(level=level,
                                          pattern_representation=pr,
                                          group=group, paa_value=paa_value)
                        if leaf_merge.size >= p_value:
                            # merged leaf is big enough: promote it
                            leaf_merge.label = "good-leaf"
                            good_leaf_nodes.append(leaf_merge)
                            bad_leaf_nodes_size -= leaf_merge.size
                        else:
                            # still too small: keep it among the bad leaves
                            leaf_merge.label = "bad-leaf"
                            bad_leaf_nodes_dict[current_level].append(
                                leaf_merge)
                # demote the remaining nodes of this level by one level and
                # recompute their pattern representation accordingly
                temp_level = current_level - 1
                for node in bad_leaf_nodes_dict[current_level]:
                    if temp_level > 1:
                        values_group = list(node.group.values())
                        data = np.array(values_group[0])
                        data_znorm = znorm(data)
                        data_paa = paa(data_znorm, paa_value)
                        pr = ts_to_string(data_paa,
                                          cuts_for_asize(temp_level))
                    else:
                        # level 1 collapses every series to the same word
                        pr = "a" * paa_value
                    node.level = temp_level
                    node.pattern_representation = pr
                if current_level > 0:
                    # move this level's bucket down to temp_level
                    if temp_level not in bad_leaf_nodes_dict.keys():
                        bad_leaf_nodes_dict[
                            temp_level] = bad_leaf_nodes_dict.pop(
                                current_level)
                    else:
                        bad_leaf_nodes_dict[temp_level] = bad_leaf_nodes_dict[
                            temp_level] + bad_leaf_nodes_dict.pop(
                                current_level)
                current_level -= 1
            else:
                break
        # suppress the remaining series
        # NOTE(review): only the first bucket of the dict is suppressed here;
        # presumably all remaining nodes have been merged into one level by
        # the loop above -- confirm.
        remaining_bad_leaf_nodes = list(bad_leaf_nodes_dict.values())[0]
        for node in remaining_bad_leaf_nodes:
            suppressed_nodes.append(node)
# Read a file of newline-separated floats; 'null' lines separate series.
with open(sys.argv[1], 'r') as h:
    lines = h.readlines()

DATA = []
time_series = []
for line in lines:
    line = line.strip()
    if line != 'null' and line != '\n':
        time_series.append(float(line))
    else:
        # separator reached: close the current series and start a new one
        DATA.append(time_series)
        time_series = []

# Convert each series to a SAX word over its first differences.
# NOTE(review): sax_words, w and a are defined elsewhere in the file --
# presumably the output list, PAA size and alphabet size; confirm.
for data in DATA:
    data = np.asfarray(data, float)
    data = np.diff(data)
    data_znorm = znorm(data)
    data_paa = paa(data_znorm, w)
    sax_words.append(ts_to_string(data_paa, cuts_for_asize(a)))

# sax_words_ri = []
# i = 0
# for word in sax_words:
#     perms = set([''.join(p) for p in permutations(word)])
#     sax_words_ri.append(perms)
#     i+=1

# Write all SAX words to file only once instead of generating again and again
with open('sax_words_ri_norot_w=' + str(w) + '_a=' + str(a),
          'w+') as sax_words_file:
    for l in sax_words:
        sax_words_file.write(l + '\n')
        # sax_words_file.write('\n')
def main(argv):
    """Score anomalies in a sensor/power time series via SAX words and
    random projections.

    Loads the configured measurement series, slides a window over it,
    converts each window to a smoothed SAX string, buckets its substrings
    with random projections and prints one anomaly score per window.
    """
    # load configuration
    parameters = load_configuration()

    # dataset parameters
    path_to_dataset = parameters['path_to_dataset']
    load_size = parameters['load_size']
    # SAX parameters
    alphabet_size = parameters['alphabet_size']
    paa_size = parameters['paa_size']
    window_size = parameters['window_size']
    step = parameters['step']
    substring_size = parameters['substring_size']
    # smoothing
    threshold_freq = parameters['threshold_freq']
    # projections
    prj_size = parameters['prj_size']
    prj_iterations = parameters['prj_iterations']
    anomaly_threshold = parameters['anomaly_threshold']

    # loading data
    loader = DataLoader.DataLoader(path_to_dataset)
    data = DataTypes.Data()
    loader.load_subset(data, load_size, 100)

    # period from which to extract anomalies
    begin_date = datetime.datetime.fromtimestamp(data.index_to_time[0])
    end_date = datetime.datetime.fromtimestamp(
        data.index_to_time[load_size - 1])

    if parameters['power_type'] == -1:
        tank = parameters['tank']
        sensor_type = parameters['sensor_type']
        print("Loading of %i tank %i data from %s to %s " %
              (sensor_type, tank, begin_date, end_date))
        s_values = [
            data.measures[i][0][tank][sensor_type]
            for i in range(0, len(data.measures))
        ]
    else:
        power_type = parameters['power_type']
        print("Loading measures of power %i from %s to %s " %
              (power_type, begin_date, end_date))
        s_values = [
            data.measures[i][1][power_type]
            for i in range(0, len(data.measures))
        ]

    len_serie = len(s_values)
    # hash table indexed by all substrings of length prj_size over the
    # first alphabet_size letters
    alphabet = get_alphabet_letters(alphabet_size)
    hash_table_substrings = get_hash_table(alphabet, prj_size)
    # one score per analyzed window
    anomalies_score = []
    for index in range(0, len_serie, step):
        begin = index
        end = begin + window_size
        if end >= len_serie:
            break
        window_values = s_values[begin:end]
        # BUG FIX: normalize the current window, not the whole series --
        # the original called znorm(s_values) and left window_values unused.
        window_znorm = znorm(window_values)
        window_paa = paa(window_znorm, paa_size)
        window_string = ts_to_string(window_paa,
                                     cuts_for_asize(alphabet_size))
        # each character of the string corresponds to k values of the series
        k = window_size // paa_size
        # smooth the string before projecting
        window_smoothed = smoothing(window_string, threshold_freq)
        # fill the hash table by applying random projection
        hash_table_substrings = put_in_bucket(hash_table_substrings,
                                              window_smoothed, begin,
                                              prj_iterations, prj_size,
                                              substring_size, k)
        total = 0
        for key, values in hash_table_substrings.items():
            total = total + len(values)
        buckets_with_anomalies, bucket_freq = analyzed_bucket(
            hash_table_substrings, total, anomaly_threshold)
        # number of buckets with anomalies
        n_buckets_anomalies = len(buckets_with_anomalies.keys())
        # score for the current window
        avg_window_score = getting_score(hash_table_substrings,
                                         buckets_with_anomalies,
                                         n_buckets_anomalies)
        anomalies_score.append(avg_window_score)
        # reset the table for the next window
        hash_table_substrings = get_hash_table(alphabet, prj_size)
    print(anomalies_score)
########## hist = yf.download(tickers = "DJI", period = 'max') words = [] dow_df = ent.util_pattern_space(hist_sma, lag = 1, dim = 50) dow_df = dow_df[:] for i in range(len(dow_df)): dat_znorm = znorm(dow_df[i,:]) dat_paa= paa(dat_znorm, 3) word = ts_to_string(dat_paa, cuts_for_asize(2)) words.append(word) print(words) print(collections.Counter(words)) from sklearn.preprocessing import LabelEncoder le=LabelEncoder() sqn = le.fit_transform(words) nb_classes = len(np.unique(sqn)) from keras.utils import to_categorical onehot = to_categorical(sqn)
def discretise(data, number_of_bins):
    """Return the SAX word of data: z-normalised, then mapped onto
    number_of_bins symbols."""
    cuts = cuts_for_asize(number_of_bins)
    normalised = znorm(data)
    return ts_to_string(normalised, cuts)
def test_sizing():
    """Test alphabet sizes."""
    # cuts_for_asize(s) must yield exactly s break-points for each size
    for size in range(2, 20):
        cuts = alphabet.cuts_for_asize(size)
        assert len(cuts) == size
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', znorm_threshold=0.01,
                   sax_type='unidim'):
    """Simple via window conversion implementation.

    # SAX-ENERGY
    >>> sax_via_window([[1, 2, 3], [4, 5, 6]], win_size=1, paa_size=3, sax_type='energy', nr_strategy=None)['abc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=1, paa_size=4, sax_type='energy', nr_strategy=None)['aacc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=4, sax_type='energy', nr_strategy=None)['aaccaacc']
    [0]

    # SAX-REPEAT
    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=2, paa_size=2, sax_type='repeat', nr_strategy=None)['ab']
    [0, 1]

    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=1, paa_size=1, sax_type='repeat', nr_strategy=None)['a']
    [0, 1, 2]

    # SAX-INDEPENDENT
    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acacacac']
    [0]

    >>> sax_via_window([[1, 2], [4, 5], [7, 8]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0, 1]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acca']
    [1]
    """
    # Convert to numpy array.
    series = np.array(series)

    # Check on dimensions.
    if len(series.shape) > 2:
        raise ValueError('Please reshape time-series to stack dimensions along the 2nd dimension, so that the array shape is a 2-tuple.')

    # PAA size is the length of the PAA sequence.
    if sax_type != 'energy' and paa_size > win_size:
        raise ValueError('PAA size cannot be greater than the window size.')

    if sax_type == 'energy' and len(series.shape) == 1:
        raise ValueError('Must pass a multidimensional time-series to SAX-ENERGY.')

    # Breakpoints.
    cuts = cuts_for_asize(alphabet_size)

    # Dictionary mapping SAX words to indices.
    sax = defaultdict(list)

    if sax_type == 'repeat':
        # Maps indices to multi-dimensional SAX words.
        multidim_sax_dict = []

        # List of all the multi-dimensional SAX words.
        multidim_sax_list = []

        # Sliding window across time dimension.
        for i in range(series.shape[0] - win_size + 1):

            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            # Z-normalized subsection.
            if win_size == 1:
                zn = sub_section
            else:
                zn = znorm(sub_section, znorm_threshold)

            # PAA representation of subsection.
            paa_rep = paa(zn, paa_size, 'repeat')

            # SAX representation of subsection, but in terms of multi-dimensional vectors.
            multidim_sax = get_sax_list(paa_rep, cuts)

            # Update data-structures.
            multidim_sax_dict.append(multidim_sax)
            multidim_sax_list.extend(multidim_sax)

        # Cluster with k-means++.
        kmeans = KMeans(n_clusters=alphabet_size, random_state=0).fit(multidim_sax_list)

        # Cluster indices in sorted order (so letters are assigned
        # deterministically by centroid position).
        order = np.lexsort(np.rot90(kmeans.cluster_centers_))

        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):

            # Map cluster indices to new SAX letters.
            curr_word_list = map(lambda cluster_index: idx2letter(order[cluster_index]), kmeans.predict(multidim_sax_dict[i]))
            curr_word = ''.join(curr_word_list)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word

            sax[curr_word].append(i)

    else:
        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):

            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            if sax_type == 'energy':
                # One unidimensional SAX word per energy distribution,
                # concatenated into the window's word.
                curr_word = ''
                for energy_dist in sub_section:
                    # Normalize energy distribution.
                    energy_zn = znorm(energy_dist, znorm_threshold)

                    # PAA representation of energy distribution.
                    paa_rep = paa(energy_zn, paa_size, 'unidim')

                    # SAX representation of the energy distribution.
                    energy_word = ts_to_string(paa_rep, cuts)

                    # Add to current word.
                    curr_word += energy_word

            elif sax_type == 'independent':
                # One unidimensional SAX word per dimension, concatenated.
                curr_word = ''
                for dim in range(sub_section.shape[1]):
                    # Obtain the subsequence restricted to one dimension.
                    one_dimension_sub_section = sub_section[:, dim]

                    # Z-normalized subsection.
                    zn = znorm(one_dimension_sub_section, znorm_threshold)

                    # PAA representation of subsection.
                    paa_rep = paa(zn, paa_size, 'unidim')

                    # Get the SAX word - just a unidimensional SAX.
                    one_dim_word = ts_to_string(paa_rep, cuts)

                    # Add this dimensions' representation to the overall SAX word.
                    curr_word += one_dim_word

            else:
                # Z-normalized subsection.
                zn = znorm(sub_section, znorm_threshold)

                # PAA representation of subsection.
                paa_rep = paa(zn, paa_size, sax_type)

                # SAX representation of subsection.
                curr_word = ts_to_string(paa_rep, cuts)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word

            sax[curr_word].append(i)

    return sax
def sax_by_chunking(series, paa_size, alphabet_size=3, z_threshold=0.01):
    """Simple chunking conversion implementation."""
    # normalize, PAA-reduce, then discretize the whole series at once
    normalized = znorm(series, z_threshold)
    reduced = paa(normalized, paa_size)
    return ts_to_string(reduced, cuts_for_asize(alphabet_size))
def start_splitting(self, p_value: int, max_level: int, good_leaf_nodes: list, bad_leaf_nodes: list):
    """
    Splitting Node Naive algorithm of (k, P)-anonymity.

    Recursively labels this node as a good or bad leaf, or tentatively
    splits it into child nodes at level + 1 and recurses on the children.

    :param p_value: minimum number of records a good leaf must contain
    :param max_level: maximum SAX level allowed for a node
    :param good_leaf_nodes: output list, receives good-leaf nodes
    :param bad_leaf_nodes: output list, receives bad-leaf nodes
    :return: None; appends to good_leaf_nodes / bad_leaf_nodes and
        grows self.child_node
    """
    # Too few records to ever satisfy P: bad leaf, stop here.
    if self.size < p_value:
        logger.info("size:{}, p_value:{} == bad-leaf".format(self.size, p_value))
        self.label = "bad-leaf"
        bad_leaf_nodes.append(self)
        return
    # Cannot refine the pattern any further: good leaf.
    if self.level == max_level:
        logger.info("size:{}, p_value:{} == good-leaf".format(self.size, p_value))
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return
    # Between P and 2P records: cannot split into two good leaves, so
    # maximize this node's level instead and keep it as a good leaf.
    if p_value <= self.size < 2*p_value:
        logger.info("Maximize-level, size:{}, p_value:{} == good-leaf".format(self.size, p_value))
        self.maximize_level_node(max_level)
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return
    """
    Otherwise, we need to check if node N has to be split. The checking relies
    on a tentative split performed on N. Suppose that, by increasing the level
    of N, N is tentatively split into a number of child nodes. If all these
    child nodes contain fewer than P time series, no real split is performed
    and the original node N is labeled as good-leaf and the recursion
    terminates on N. Otherwise, there must exist tentative child node(s) whose
    size >= P, also called TG-node(s) (Tentative Good Nodes). The rest
    children whose size < P are called TB-nodes (Tentative Bad Nodes), if any.
    If the total number of records in all TB-nodes under N is no less than P,
    we merge them into a single tentative node, denoted by childmerge, at the
    level of N.level. If the above tentative process produces nc tentative
    child nodes (including TB and TG) and nc >= 2, N will really be split into
    nc children and then the node splitting procedure will be recursively
    invoked on each of them.
    """
    # Tentatively re-encode every series at level + 1 and bucket the keys
    # by the resulting pattern representation.
    tentative_child_node = dict()
    temp_level = self.level + 1
    for key, value in self.group.items():
        # to reduce dimensionality
        data = np.array(value)
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, self.paa_value)
        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
        if pr in tentative_child_node.keys():
            tentative_child_node[pr].append(key)
        else:
            tentative_child_node[pr] = [key]
    length_all_tentative_child = [len(x) for x in list(tentative_child_node.values())]
    # If every tentative child is below P, no real split happens.
    good_leaf = np.all(np.array(length_all_tentative_child) < p_value)
    if good_leaf:
        logger.info("Good-leaf, all_tentative_child are < {}".format(p_value))
        self.label = "good-leaf"
        good_leaf_nodes.append(self)
        return
    else:
        logger.info("N can be split")
        logger.info("Compute tentative good nodes and tentative bad nodes")
        # tentative good nodes: children whose size is at least p_value
        pr_keys = list(tentative_child_node.keys())
        pattern_representation_tg = list()
        tg_nodes_index = list(np.where(np.array(length_all_tentative_child) >= p_value)[0])
        tg_nodes = list()
        for index in tg_nodes_index:
            keys_elements = tentative_child_node[pr_keys[index]]
            dict_temp = dict()
            for key in keys_elements:
                dict_temp[key] = self.group[key]
            tg_nodes.append(dict_temp)
            pattern_representation_tg.append(pr_keys[index])
        # tentative bad nodes: children whose size is below p_value
        tb_nodes_index = list(np.where(np.array(length_all_tentative_child) < p_value)[0])
        tb_nodes = list()
        pattern_representation_tb = list()
        for index in tb_nodes_index:
            keys_elements = tentative_child_node[pr_keys[index]]
            dict_temp = dict()
            for key in keys_elements:
                dict_temp[key] = self.group[key]
            tb_nodes.append(dict_temp)
            pattern_representation_tb.append(pr_keys[index])
        total_size_tb_nodes = 0
        for tb_node in tb_nodes:
            total_size_tb_nodes += len(tb_node)
        if total_size_tb_nodes >= p_value:
            # TB-nodes together reach P: merge them into one good leaf
            # (childmerge) kept at this node's level.
            logger.info("Merge all bad nodes in a single node, and label it as good-leaf")
            child_merge_node_group = dict()
            for tb_node in tb_nodes:
                for key, value in tb_node.items():
                    child_merge_node_group[key] = value
            node_merge = Node(level=self.level, pattern_representation=self.pattern_representation, label="good-leaf", group=child_merge_node_group, parent=self)
            self.child_node.append(node_merge)
            good_leaf_nodes.append(node_merge)
            # tb_nodes: a bit unsure about this tb_nodes (original author note)
            nc = len(tg_nodes) + len(tb_nodes)
            logger.info("Split only tg_nodes {0}".format(len(tg_nodes)))
            if nc >= 2:
                # real split: recurse on each tentative good node
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index], label="intermediate", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
            else:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index], label="good-leaf", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    good_leaf_nodes.append(node)
        else:
            # TB-nodes cannot reach P even merged: each becomes a bad leaf.
            # tb_nodes: a bit unsure about this tb_nodes (original author note)
            nc = len(tg_nodes) + len(tb_nodes)
            logger.info("Label all tb_node {0} as bad-leaf and split only tg_nodes {1}".format(len(tb_nodes),len(tg_nodes)))
            for index in range(0, len(tb_nodes)):
                node = Node(level=self.level, pattern_representation=pattern_representation_tb[index], label="bad-leaf", group=tb_nodes[index], parent=self)
                self.child_node.append(node)
                bad_leaf_nodes.append(node)
            if nc >= 2:
                # real split: recurse on each tentative good node
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index], label="intermediate", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
            else:
                for index in range(0, len(tg_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tg[index], label="good-leaf", group=tg_nodes[index], parent=self)
                    self.child_node.append(node)
                    good_leaf_nodes.append(node)