def bin_and(self, keys, grps, m_grp):
    n = self.d_set.attr_size
    min_supp = self.d_set.thd_supp
    pattern = GP()
    gi = GI.parse_gi(keys[0])
    pattern.add_gradual_item(gi)
    # bin_1 = grps[0]['bins']
    # main_bin = [bin_1[str(x)][:] for x in range(self.d_set.seg_count)]
    for i in range(len(keys)):
        if i == 0:
            continue
        bin_2 = grps[i]['bins']
        # temp_bin = [np.multiply(temp_bin[k], bin_2[str(k)][:]) for k in range(self.d_set.seg_count)]
        # temp_bin = []
        bin_sum = 0
        for k in range(self.d_set.seg_count):
            m_grp[str(k)][...] = np.multiply(m_grp[str(k)][:], bin_2[str(k)][:])
            bin_sum += np.sum(m_grp[str(k)][:])
            # temp_bin.append(arr)
        supp = float(bin_sum) / float(n * (n - 1.0) / 2.0)
        if supp >= min_supp:
            # main_bin = temp_bin
            gi = GI.parse_gi(keys[i])
            pattern.add_gradual_item(gi)
            pattern.set_support(supp)
    # print(str(pattern.to_string()) + ' : ' + str(pattern.support))
    return pattern
def generate_d(self):
    # 1. Fetch valid bins group
    v_bins = self.d_set.valid_bins
    attr_keys = [GI(x[0], x[1].decode()).as_string() for x in v_bins[:, 0]]

    # 2. Initialize an empty d-matrix
    n = len(attr_keys)
    d = np.zeros((n, n), dtype=np.dtype('i8'))  # cumulative sum of all segments
    for i in range(n):
        for j in range(n):
            if GI.parse_gi(attr_keys[i]).attribute_col == GI.parse_gi(attr_keys[j]).attribute_col:
                # 2a. Ignore similar attributes (+ or/and -)
                continue
            else:
                bin_1 = v_bins[i][1]
                bin_2 = v_bins[j][1]
                # 2b. Calculate sum from bin ranks (in chunks):
                # cumulative sum of all segments for 2x2 (all attributes) gradual items
                bin_sum = 0
                for k in range(len(bin_1)):
                    bin_sum += np.sum(np.multiply(bin_1[k], bin_2[k]))
                d[i][j] += bin_sum
    # print(d)
    return d, attr_keys
def generate_d(self):
    # 1. Fetch valid bins group
    # v_items = self.d_set.valid_items
    attr_keys = self.d_set.valid_items  # [GI(x[0], x[1].decode()).as_string() for x in v_items[:, 0]]
    ranks = self.d_set.rank_matrix

    # 2. Initialize an empty d-matrix
    n = len(attr_keys)
    d = np.zeros((n, n), dtype=np.dtype('i8'))  # cumulative sum of all segments
    for i in range(n):
        for j in range(n):
            gi_1 = GI.parse_gi(attr_keys[i])
            gi_2 = GI.parse_gi(attr_keys[j])
            if gi_1.attribute_col == gi_2.attribute_col:
                # Ignore similar attributes (+ or/and -)
                continue
            else:
                bin_1 = ranks[:, gi_1.attribute_col].copy()
                bin_2 = ranks[:, gi_2.attribute_col].copy()
                # 2b. Reconstruct if negative (swap 0.5 and 1, leave 0 as 0)
                if gi_1.is_decrement():
                    bin_1 = np.where(bin_1 == 0.5, 1, np.where(bin_1 == 1, 0.5, 0))
                if gi_2.is_decrement():
                    bin_2 = np.where(bin_2 == 0.5, 1, np.where(bin_2 == 1, 0.5, 0))
                # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                temp_bin = np.where(bin_1 == bin_2, 1, 0)
                d[i][j] += np.sum(temp_bin)
    # print(d)
    return d, attr_keys
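# ----------------------------------------------------------------------------
# Illustration (not part of the class above): a minimal standalone sketch of how
# a single d[i][j] entry is derived from two rank-matrix columns. The toy arrays
# and variable names below are assumptions for demonstration only; the swap and
# counting expressions mirror the ones used in generate_d().
import numpy as np

# Toy rank columns for two attributes, one value (1, 0.5 or 0) per object pair.
rank_col_a = np.array([1, 0.5, 1, 0], dtype=np.float16)
rank_col_b = np.array([0.5, 1, 1, 0], dtype=np.float16)

# Reconstruct attribute b as a decrement ('-') item: swap 0.5 and 1, leave 0 as 0.
rank_col_b_neg = np.where(rank_col_b == 0.5, 1, np.where(rank_col_b == 1, 0.5, 0))

# d entry: count positions where the two (reconstructed) rank columns hold the
# same value (note this also counts positions where both are 0, i.e. ties).
d_entry = np.sum(np.where(rank_col_a == rank_col_b_neg, 1, 0))
print(d_entry)  # 3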
def generate_d(self):
    # 1a. Retrieve/Generate distance matrix (d)
    grp_name = 'dataset/' + self.d_set.step_name + '/valid_items'
    attr_keys = [x.decode() for x in self.d_set.read_zarr_dataset(grp_name)]
    grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
    d = self.d_set.read_zarr_dataset(grp_name)
    if d.size > 0:
        return d, attr_keys

    # 1b. Fetch valid bins group
    z_root = zarr.open(self.d_set.z_file, 'r')
    grp_name = 'dataset/' + self.d_set.step_name + '/rank_matrix'
    ranks = z_root[grp_name][:]  # [:] TO BE REMOVED

    # 2. Initialize an empty d-matrix
    n = len(attr_keys)
    d = np.zeros((n, n), dtype=np.dtype('i8'))  # cumulative sum of all segments
    for i in range(n):
        for j in range(n):
            gi_1 = GI.parse_gi(attr_keys[i])
            gi_2 = GI.parse_gi(attr_keys[j])
            if gi_1.attribute_col == gi_2.attribute_col:
                # Ignore similar attributes (+ or/and -)
                continue
            else:
                # for s in ranks.iter_chunks():
                bin_1 = ranks[:, gi_1.attribute_col].copy()
                bin_2 = ranks[:, gi_2.attribute_col].copy()
                # 2b. Reconstruct if negative (swap 0.5 and 1, leave 0 as 0)
                if gi_1.is_decrement():
                    bin_1 = np.where(bin_1 == 0.5, 1, np.where(bin_1 == 1, 0.5, 0))
                if gi_2.is_decrement():
                    bin_2 = np.where(bin_2 == 0.5, 1, np.where(bin_2 == 1, 0.5, 0))
                # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                temp_bin = np.where(bin_1 == bin_2, 1, 0)
                d[i][j] += np.sum(temp_bin)
    # print(d)
    grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
    self.d_set.add_zarr_dataset(grp_name, d, compress=True)
    return d, attr_keys
def generate_aco_gp(self, p_matrix):
    attr_keys = self.attr_keys
    v_matrix = self.d
    pattern = GP()

    # 1. Generate gradual items with highest pheromone and visibility
    m = p_matrix.shape[0]
    for i in range(m):
        combine_feature = np.multiply(v_matrix[i], p_matrix[i])
        total = np.sum(combine_feature)
        with np.errstate(divide='ignore', invalid='ignore'):
            probability = combine_feature / total
        cum_prob = np.cumsum(probability)
        r = np.random.random_sample()
        try:
            j = np.nonzero(cum_prob > r)[0][0]
            gi = GI.parse_gi(attr_keys[j])
            if not pattern.contains_attr(gi):
                pattern.add_gradual_item(gi)
        except IndexError:
            continue

    # 2. Evaporate pheromones by factor e
    p_matrix = (1 - self.e_factor) * p_matrix
    return pattern, p_matrix
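# ----------------------------------------------------------------------------
# Illustration (not part of the class above): a minimal sketch of the roulette-
# wheel step inside generate_aco_gp(). The visibility/pheromone rows are toy
# values (assumptions); the selection logic is the same: normalise the combined
# scores, take the cumulative sum, draw a random number r, and pick the first
# index whose cumulative probability exceeds r.
import numpy as np

visibility_row = np.array([4.0, 0.0, 2.0, 3.0])  # e.g. one row of the d-matrix
pheromone_row = np.array([1.0, 1.0, 1.0, 1.0])   # e.g. one row of p_matrix

combine_feature = np.multiply(visibility_row, pheromone_row)
total = np.sum(combine_feature)
with np.errstate(divide='ignore', invalid='ignore'):
    probability = combine_feature / total  # [4/9, 0, 2/9, 3/9]
cum_prob = np.cumsum(probability)          # [4/9, 4/9, 6/9, 1.0]

r = np.random.random_sample()
j = np.nonzero(cum_prob > r)[0][0]  # index drawn proportionally to its combined score
print(r, j)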
def generate_random_gp(self):
    p = self.p_matrix
    n = len(self.attr_index)
    pattern = GP()
    attrs = np.random.permutation(n)
    for i in attrs:
        max_extreme = n * 100
        x = float(rand.randint(1, max_extreme) / max_extreme)
        pos = float(p[i][0] / (p[i][0] + p[i][1] + p[i][2]))
        neg = float((p[i][0] + p[i][1]) / (p[i][0] + p[i][1] + p[i][2]))
        if x < pos:
            temp = GI(self.attr_index[i], '+')
        elif (x >= pos) and (x < neg):
            temp = GI(self.attr_index[i], '-')
        else:
            # temp = GI(self.attr_index[i], 'x')
            continue
        pattern.add_gradual_item(temp)
    return pattern
def generate_d(self):
    # 1a. Retrieve/Generate distance matrix (d)
    grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
    d = self.d_set.read_h5_dataset(grp_name)
    if d.size > 0:
        # 1b. Fetch valid bins group
        grp_name = 'dataset/' + self.d_set.step_name + '/valid_bins/'
        h5f = h5py.File(self.d_set.h5_file, 'r')
        attr_keys = list(h5f[grp_name].keys())
        h5f.close()
        return d, attr_keys

    # 1b. Fetch valid bins group
    grp_name = 'dataset/' + self.d_set.step_name + '/valid_bins/'
    h5f = h5py.File(self.d_set.h5_file, 'r')
    grp = h5f[grp_name]
    attr_keys = list(grp.keys())

    # 2. Initialize an empty d-matrix
    n = len(grp)
    d = np.zeros((n, n), dtype=float)  # cumulative sum of all segments

    # 2a. For each segment do a binary AND
    for k in range(self.d_set.seg_count):
        for i in range(n):
            for j in range(n):
                bin_1 = grp[attr_keys[i]]
                bin_2 = grp[attr_keys[j]]
                if GI.parse_gi(attr_keys[i]).attribute_col == GI.parse_gi(attr_keys[j]).attribute_col:
                    # Ignore similar attributes (+ or/and -)
                    continue
                else:
                    # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                    d[i][j] += np.sum(np.multiply(bin_1['bins'][str(k)][:], bin_2['bins'][str(k)][:]))

    # 3. Save d_matrix in HDF5 file
    h5f.close()
    grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
    self.d_set.add_h5_dataset(grp_name, d)
    return d, attr_keys
def generate_d(self):
    # 1a. Fetch valid attribute keys
    grp_name = 'dataset/' + self.d_set.step_name + '/valid_bins/'
    h5f = h5py.File(self.d_set.h5_file, 'r')
    bin_grp = h5f[grp_name]
    attr_keys = list(bin_grp.keys())

    # 1b. Retrieve/Generate distance matrix (d)
    grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
    d = self.d_set.read_h5_dataset(grp_name)
    if d.size > 0:
        h5f.close()
        return d, attr_keys

    # 2. Initialize an empty d-matrix
    n = len(attr_keys)
    d = np.zeros((n, n), dtype=np.dtype('i8'))  # cumulative sum of all segments
    for i in range(n):
        for j in range(n):
            gi_1 = GI.parse_gi(attr_keys[i])
            gi_2 = GI.parse_gi(attr_keys[j])
            if gi_1.attribute_col == gi_2.attribute_col:
                # 2a. Ignore similar attributes (+ or/and -)
                continue
            else:
                bin_1 = bin_grp[gi_1.as_string()]  # v_bins[i][1]
                bin_2 = bin_grp[gi_2.as_string()]  # v_bins[j][1]
                # 2b. Calculate sum from bin ranks (in chunks):
                # cumulative sum of all segments for 2x2 (all attributes) gradual items
                # print(bin_1[k])
                bin_sum = 0
                for k in range(len(bin_1)):
                    bin_sum += np.sum(np.multiply(bin_1[k], bin_2[k]))
                d[i][j] += bin_sum
    h5f.close()
    grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
    self.d_set.add_h5_dataset(grp_name, d, compress=True)
    return d, attr_keys
def init_gp_attributes(self, attr_data=None):
    # 1. Transpose csv array data
    if attr_data is None:
        attr_data = self.data.T
        self.attr_size = self.row_count
    else:
        self.attr_size = len(attr_data[self.attr_cols[0]])

    # 2. Initialize (k x attr) matrix
    n = self.attr_size
    m = self.col_count
    k = int(n * (n - 1) / 2)
    self.rank_matrix = np.zeros((k, m), dtype=np.float16)

    # 3. Determine binary rank (fuzzy: 0, 0.5, 1) and calculate support of pattern
    valid_count = 0
    for col in self.attr_cols:
        col_data = np.array(attr_data[col], dtype=float)
        incr = GI(col, '+')
        decr = GI(col, '-')

        # 3a. Determine gradual ranks
        tmp_rank = np.where(col_data < col_data[:, np.newaxis], 1,
                            np.where(col_data > col_data[:, np.newaxis], 0.5, 0))
        tmp_rank = tmp_rank[np.triu_indices(n, k=1)]
        # 3a (equivalent pairwise loop, kept for reference):
        # bin_sum = 0
        # row = 0
        # for i in range(n):
        #     for j in range(i + 1, n):
        #         if col_data[i] > col_data[j]:
        #             self.rank_matrix[row][col] = 1
        #             bin_sum += 1
        #         elif col_data[j] > col_data[i]:
        #             self.rank_matrix[row][col] = 0.5
        #             bin_sum += 1
        #         row += 1

        # 3b. Check support of each generated item-set
        supp = float(np.count_nonzero(tmp_rank)) / float(n * (n - 1.0) / 2.0)
        if supp >= self.thd_supp:
            self.rank_matrix[:, col] = tmp_rank
            self.valid_items.append(incr.as_string())
            self.valid_items.append(decr.as_string())
            valid_count += 2

    if valid_count < 3:
        self.no_bins = True
    del self.data
    del attr_data
    gc.collect()
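# ----------------------------------------------------------------------------
# Illustration (not part of the class above): a standalone sketch of steps 3a/3b
# for one column, using a toy column (an assumption for demonstration). It shows
# how the pairwise comparison matrix is collapsed to its upper triangle and how
# support is measured against the n * (n - 1) / 2 object pairs.
import numpy as np

col_data = np.array([30.0, 35.0, 40.0, 50.0])
n = col_data.size

# Fuzzy rank per ordered pair: 1 if the value decreases, 0.5 if it increases, 0 on ties.
tmp_rank = np.where(col_data < col_data[:, np.newaxis], 1,
                    np.where(col_data > col_data[:, np.newaxis], 0.5, 0))
tmp_rank = tmp_rank[np.triu_indices(n, k=1)]  # keep each unordered pair once

supp = float(np.count_nonzero(tmp_rank)) / float(n * (n - 1.0) / 2.0)
print(tmp_rank)  # [0.5 0.5 0.5 0.5 0.5 0.5] -> strictly increasing column
print(supp)      # 1.0 -> no ties, so the column passes any support threshold <= 1.0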
def init_gp_attributes(self, attr_data=None):
    # 1. Transpose csv array data
    if attr_data is None:
        attr_data = self.data.T
        self.attr_size = self.row_count
    else:
        self.attr_size = len(attr_data[self.attr_cols[0]])
    self.step_name = 'step_' + str(int(self.row_count - self.attr_size))

    # 2. Initialize Zarr groups to store class attributes
    self.init_zarr_groups()
    z_root = zarr.open(self.z_file, 'r+')

    # 3. Initialize (k x attr) matrix
    n = self.attr_size
    m = self.col_count
    k = int(n * (n - 1) / 2)
    chunk_size = 5  # note: not used below
    grp_name = 'dataset/' + self.step_name + '/rank_matrix'
    compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
    rank_matrix = z_root.create_dataset(grp_name, shape=(k, m), dtype=np.float16,
                                        compressor=compressor)
    # rank_matrix = np.memmap(self.np_file, dtype=float, mode='w+', shape=(k, m))

    # 4. Determine binary rank (fuzzy: 0, 0.5, 1) and calculate support of pattern
    valid_count = 0
    valid_items = []
    for col in self.attr_cols:
        col_data = np.array(attr_data[col], dtype=float)
        incr = GI(col, '+')
        decr = GI(col, '-')

        # 4a. Determine gradual ranks
        tmp_rank = np.where(col_data < col_data[:, np.newaxis], 1,
                            np.where(col_data > col_data[:, np.newaxis], 0.5, 0))
        tmp_rank = tmp_rank[np.triu_indices(n, k=1)]

        # 4b. Check support of each generated item-set
        supp = float(np.count_nonzero(tmp_rank)) / float(n * (n - 1.0) / 2.0)
        if supp >= self.thd_supp:
            rank_matrix[:, col] = tmp_rank[:]
            valid_items.append(incr.as_string())
            valid_items.append(decr.as_string())
            valid_count += 2

    grp_name = 'dataset/' + self.step_name + '/valid_items'
    self.add_zarr_dataset(grp_name, np.array(valid_items).astype('S'))
    data_size = np.array([self.col_count, self.row_count, self.attr_size, valid_count])
    self.add_zarr_dataset('dataset/size_arr', data_size)
    if valid_count < 3:
        self.no_bins = True
    del self.data
    del attr_data
    del valid_items
    # print(rank_matrix[:])
    gc.collect()
def init_gp_attributes(self, attr_data=None):
    # 1. Transpose csv array data
    if attr_data is None:
        attr_data = self.data.T
        self.attr_size = self.row_count
    else:
        self.attr_size = len(attr_data[self.attr_cols[0]])
    self.step_name = 'step_' + str(int(self.row_count - self.attr_size))

    # 2. Initialize h5 groups to store class attributes
    self.init_h5_groups()
    h5f = h5py.File(self.h5_file, 'r+')

    # 3. Initialize (k x attr) matrix
    n = self.attr_size
    m = self.col_count
    k = int(n * (n - 1) / 2)
    # if k > 10000:
    #     ch = 10000
    # else:
    #     ch = k
    grp_name = 'dataset/' + self.step_name + '/rank_matrix'
    rank_matrix = h5f.create_dataset(grp_name, (k, m), dtype=np.float16, chunks=True,
                                     compression="gzip", compression_opts=9, shuffle=True)
    # rank_matrix = np.memmap(self.np_file, dtype=float, mode='w+', shape=(k, m))

    # 4. Determine binary rank (fuzzy: 0, 0.5, 1) and calculate support of pattern
    valid_count = 0
    valid_items = []
    for col in self.attr_cols:
        col_data = np.array(attr_data[col], dtype=float)
        incr = GI(col, '+')
        decr = GI(col, '-')

        # 4a. Determine gradual ranks
        tmp_rank = np.where(col_data < col_data[:, np.newaxis], 1,
                            np.where(col_data > col_data[:, np.newaxis], 0.5, 0))
        tmp_rank = tmp_rank[np.triu_indices(n, k=1)]

        # 4b. Check support of each generated item-set
        supp = float(np.count_nonzero(tmp_rank)) / float(n * (n - 1.0) / 2.0)
        if supp >= self.thd_supp:
            rank_matrix[:, col] = tmp_rank[:]
            valid_items.append(incr.as_string())
            valid_items.append(decr.as_string())
            valid_count += 2

    h5f.close()
    grp_name = 'dataset/' + self.step_name + '/valid_items'
    self.add_h5_dataset(grp_name, np.array(valid_items).astype('S'))
    data_size = np.array([self.col_count, self.row_count, self.attr_size, valid_count])
    self.add_h5_dataset('dataset/size_arr', data_size)
    if valid_count < 3:
        self.no_bins = True
    # rank_matrix.flush()
    del self.data
    del attr_data
    del valid_items
    gc.collect()
def generate_d(self):
    # 1a. Retrieve/Generate distance matrix (d)
    grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
    d = self.d_set.read_h5_dataset(grp_name)
    if d.size > 0:
        return d

    # 2. Initialize an empty d-matrix
    attr_keys = self.attr_keys
    n = len(attr_keys)
    d = np.zeros((n, n), dtype=float)  # cumulative sum of all segments
    attr_combs = list(combinations(attr_keys, 2))
    h5f = h5py.File(self.d_set.h5_file, 'r+')
    for str_i, str_j in attr_combs:
        gi_1 = GI.parse_gi(str_i)
        gi_2 = GI.parse_gi(str_j)
        if gi_1.attribute_col == gi_2.attribute_col:
            # Ignore similar attributes (+ or/and -)
            continue
        else:
            # Cumulative sum of all segments for 2x2 (all attributes) gradual items
            col_data_1 = self.d_set.attr_data[gi_1.attribute_col]
            col_data_2 = self.d_set.attr_data[gi_2.attribute_col]
            grp1 = 'dataset/' + self.d_set.step_name + '/temp_bin1'
            if gi_1.symbol == '+':
                bin_1 = h5f.create_dataset(grp1, data=col_data_1 > col_data_1[:, np.newaxis], chunks=True)
            else:
                bin_1 = h5f.create_dataset(grp1, data=col_data_1 < col_data_1[:, np.newaxis], chunks=True)
            grp2 = 'dataset/' + self.d_set.step_name + '/temp_bin2'
            if gi_2.symbol == '+':
                bin_2 = h5f.create_dataset(grp2, data=col_data_2 > col_data_2[:, np.newaxis], chunks=True)
            else:
                bin_2 = h5f.create_dataset(grp2, data=col_data_2 < col_data_2[:, np.newaxis], chunks=True)
            for k in bin_1.iter_chunks():
                i = attr_keys.index(gi_1.as_string())
                j = attr_keys.index(gi_2.as_string())
                bin_sum = np.sum(np.multiply(bin_1[k], bin_2[k]))
                d[i][j] += bin_sum
                d[j][i] += bin_sum
            del h5f[grp1]
            del h5f[grp2]

    # 3. Save d_matrix in HDF5 file
    h5f.close()
    grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
    self.d_set.add_h5_dataset(grp_name, d)
    return d
def init_gp_attributes(self, attr_data=None):
    # (check) implement parallel multiprocessing
    # 1. Initialize h5 groups to store class attributes
    self.init_h5_groups()
    h5f = h5py.File(self.h5_file, 'r+')  # needed in both branches below

    # 2. Transpose csv array data
    if attr_data is None:
        attr_data = h5f['dataset/attr_data']
        self.attr_size = self.row_count
    else:
        self.attr_size = len(attr_data[self.attr_cols[0]])
    self.step_name = 'step_' + str(int(self.row_count - self.attr_size))

    # 3. Construct and store 1-item_set valid bins:
    # execute binary rank to calculate support of pattern
    n = self.attr_size
    valid_count = 0
    # valid_bins = list()
    for col in self.attr_cols:
        col_data = np.array(attr_data[col], dtype=float)
        incr = GI(col, '+')
        decr = GI(col, '-')

        # 3a. Chunk col_data into segments
        col_segs = np.array_split(col_data, self.chunks)
        col_bins_pos = []
        col_bins_neg = []
        bin_sum = 0
        # print(col_segs)
        for i in range(self.chunks):
            for j in range(self.chunks):
                with np.errstate(invalid='ignore'):
                    tmp_bin = col_segs[i] > col_segs[j][:, np.newaxis]
                    bin_sum += np.sum(tmp_bin)
                    col_bins_pos.append(tmp_bin)
                    tmp_bin = col_segs[i] < col_segs[j][:, np.newaxis]
                    col_bins_neg.append(tmp_bin)

        # 3b. Check support of each generated item-set
        supp = float(bin_sum) / float(n * (n - 1.0) / 2.0)
        if supp >= self.thd_supp:
            grp_name = 'dataset/' + self.step_name + '/valid_bins/' + incr.as_string()
            # self.add_h5_dataset(grp_name, col_bins_pos)
            h5f.create_dataset(grp_name, data=col_bins_pos)
            grp_name = 'dataset/' + self.step_name + '/valid_bins/' + decr.as_string()
            # self.add_h5_dataset(grp_name, col_bins_neg)
            h5f.create_dataset(grp_name, data=col_bins_neg)
            valid_count += 2
            # valid_bins.append(np.array([incr.tolist(), col_bins_pos], dtype=object))
            # valid_bins.append(np.array([decr.tolist(), col_bins_neg], dtype=object))

    # self.valid_bins = np.array(valid_bins)
    # print(self.valid_bins)
    # grp_name = 'dataset/' + self.step_name + '/valid_items'
    # self.add_h5_dataset(grp_name, np.array(valid_items).astype('S'))
    h5f.close()
    data_size = np.array([self.col_count, self.row_count, self.attr_size, valid_count])
    self.add_h5_dataset('dataset/size_arr', data_size)
    if valid_count < 3:
        self.no_bins = True
    # rank_matrix.flush()
    del attr_data
    # del valid_items
    gc.collect()
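# ----------------------------------------------------------------------------
# Illustration (not part of the class above): a minimal sketch of the segment-based
# comparison in step 3a, on a toy column split into 2 chunks (the values and chunk
# count are assumptions). Summing the chunk-pair comparison blocks reproduces the
# count obtained from the full n x n comparison matrix.
import numpy as np

col_data = np.array([5.0, 3.0, 4.0, 1.0])
n = col_data.size
chunks = 2

col_segs = np.array_split(col_data, chunks)
bin_sum = 0
for i in range(chunks):
    for j in range(chunks):
        # one block of the full "greater-than" comparison matrix
        bin_sum += np.sum(col_segs[i] > col_segs[j][:, np.newaxis])

full_sum = np.sum(col_data > col_data[:, np.newaxis])  # same count without chunking
print(bin_sum, full_sum)  # 6 6

supp = float(bin_sum) / float(n * (n - 1.0) / 2.0)
print(supp)  # 1.0 -> no ties, so all 6 ordered pairs support the '+' item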