Example No. 1
    def bin_and(self, keys, grps, m_grp):
        n = self.d_set.attr_size
        min_supp = self.d_set.thd_supp
        pattern = GP()

        gi = GI.parse_gi(keys[0])
        pattern.add_gradual_item(gi)
        # bin_1 = grps[0]['bins']
        # main_bin = [bin_1[str(x)][:] for x in range(self.d_set.seg_count)]
        for i in range(len(keys)):
            if i == 0:
                continue
            bin_2 = grps[i]['bins']
            # temp_bin = [np.multiply(temp_bin[k], bin_2[str(k)][:]) for k in range(self.d_set.seg_count)]
            # temp_bin = []
            bin_sum = 0
            for k in range(self.d_set.seg_count):
                m_grp[str(k)][...] = np.multiply(m_grp[str(k)][:],
                                                 bin_2[str(k)][:])
                bin_sum += np.sum(m_grp[str(k)][:])
                # temp_bin.append(arr)
            supp = float(bin_sum) / float(n * (n - 1.0) / 2.0)
            if supp >= min_supp:
                # main_bin = temp_bin
                gi = GI.parse_gi(keys[i])
                pattern.add_gradual_item(gi)
                pattern.set_support(supp)
        # print(str(pattern.to_string()) + ' : ' + str(pattern.support))
        return pattern
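
A minimal standalone sketch of the support computation used above, assuming plain in-memory NumPy bins instead of the HDF5 segment groups; the two 4-row bins below are made up for illustration:

    import numpy as np

    # Hypothetical boolean bins (one per gradual item) over n = 4 rows.
    n = 4
    bin_a = np.array([[0, 1, 1, 1],
                      [0, 0, 1, 1],
                      [0, 0, 0, 1],
                      [0, 0, 0, 0]])
    bin_b = np.array([[0, 1, 0, 1],
                      [0, 0, 1, 1],
                      [0, 0, 0, 0],
                      [0, 0, 0, 0]])
    bin_and = np.multiply(bin_a, bin_b)  # row pairs that support both items
    supp = float(np.sum(bin_and)) / float(n * (n - 1.0) / 2.0)
    print(supp)  # 4 / 6 ≈ 0.67
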
Example No. 2
    def generate_d(self):
        v_bins = self.d_set.valid_bins
        # 1. Fetch valid bins group
        attr_keys = [GI(x[0], x[1].decode()).as_string() for x in v_bins[:, 0]]

        # 2. Initialize an empty d-matrix
        n = len(attr_keys)
        d = np.zeros((n, n),
                     dtype=np.dtype('i8'))  # cumulative sum of all segments
        for i in range(n):
            for j in range(n):
                if GI.parse_gi(attr_keys[i]).attribute_col == GI.parse_gi(
                        attr_keys[j]).attribute_col:
                    # 2a. Ignore similar attributes (+ or/and -)
                    continue
                else:
                    bin_1 = v_bins[i][1]
                    bin_2 = v_bins[j][1]
                    # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                    # 2b. calculate sum from bin ranks (in chunks)
                    bin_sum = 0
                    for k in range(len(bin_1)):
                        bin_sum += np.sum(np.multiply(bin_1[k], bin_2[k]))
                    d[i][j] += bin_sum
        # print(d)
        return d, attr_keys
Example No. 3
    def generate_d(self):
        # v_items = self.d_set.valid_items
        # 1. Fetch valid bins group
        attr_keys = self.d_set.valid_items  # [GI(x[0], x[1].decode()).as_string() for x in v_items[:, 0]]
        ranks = self.d_set.rank_matrix

        # 2. Initialize an empty d-matrix
        n = len(attr_keys)
        d = np.zeros((n, n), dtype=np.dtype('i8'))  # cumulative sum of all segments
        for i in range(n):
            for j in range(n):
                gi_1 = GI.parse_gi(attr_keys[i])
                gi_2 = GI.parse_gi(attr_keys[j])
                if gi_1.attribute_col == gi_2.attribute_col:
                    # Ignore similar attributes (+ or/and -)
                    continue
                else:
                    bin_1 = ranks[:, gi_1.attribute_col].copy()
                    bin_2 = ranks[:, gi_2.attribute_col].copy()

                    # 2b. Reconstruct if negative (swap 0.5 and 1, leave 0 as 0)
                    if gi_1.is_decrement():
                        bin_1 = np.where(bin_1 == 0.5, 1, np.where(bin_1 == 1, 0.5, 0))

                    if gi_2.is_decrement():
                        bin_2 = np.where(bin_2 == 0.5, 1, np.where(bin_2 == 1, 0.5, 0))

                    # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                    temp_bin = np.where(bin_1 == bin_2, 1, 0)
                    d[i][j] += np.sum(temp_bin)
        # print(d)
        return d, attr_keys
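
The rank reconstruction above turns an increment column into a decrement column by swapping 0.5 and 1 while leaving 0 untouched. A minimal sketch of that swap on a toy rank column (values made up for illustration):

    import numpy as np

    # Toy rank column: 1 means the first row of the pair is larger, 0.5 means
    # the second row is larger, 0 means the two values are equal.
    bin_incr = np.array([1, 0.5, 0, 1, 0.5])
    # Reconstruct the decrement item: swap 0.5 and 1, leave 0 as 0.
    bin_decr = np.where(bin_incr == 0.5, 1, np.where(bin_incr == 1, 0.5, 0))
    print(bin_decr)  # [0.5 1.  0.  0.5 1. ]
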
Example No. 4
    def generate_d(self):
        # 1a. Retrieve/Generate distance matrix (d)
        grp_name = 'dataset/' + self.d_set.step_name + '/valid_items'
        attr_keys = [
            x.decode() for x in self.d_set.read_zarr_dataset(grp_name)
        ]

        grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
        d = self.d_set.read_zarr_dataset(grp_name)
        if d.size > 0:
            # 1b. Fetch valid bins group
            return d, attr_keys

        # 1b. Fetch valid bins group
        z_root = zarr.open(self.d_set.z_file, 'r')
        grp_name = 'dataset/' + self.d_set.step_name + '/rank_matrix'
        ranks = z_root[grp_name][:]  # [:] TO BE REMOVED

        # 2. Initialize an empty d-matrix
        n = len(attr_keys)
        d = np.zeros((n, n),
                     dtype=np.dtype('i8'))  # cumulative sum of all segments
        for i in range(n):
            for j in range(n):
                gi_1 = GI.parse_gi(attr_keys[i])
                gi_2 = GI.parse_gi(attr_keys[j])
                if gi_1.attribute_col == gi_2.attribute_col:
                    # Ignore similar attributes (+ or/and -)
                    continue
                else:
                    # for s in ranks.iter_chunks():
                    bin_1 = ranks[:, gi_1.attribute_col].copy()
                    bin_2 = ranks[:, gi_2.attribute_col].copy()

                    # 2b. Reconstruct if negative (swap 0.5 and 1, leave 0 as 0)
                    if gi_1.is_decrement():
                        bin_1 = np.where(bin_1 == 0.5, 1,
                                         np.where(bin_1 == 1, 0.5, 0))

                    if gi_2.is_decrement():
                        bin_2 = np.where(bin_2 == 0.5, 1,
                                         np.where(bin_2 == 1, 0.5, 0))

                    # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                    temp_bin = np.where(bin_1 == bin_2, 1, 0)
                    d[i][j] += np.sum(temp_bin)
        # print(d)
        grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
        self.d_set.add_zarr_dataset(grp_name, d, compress=True)
        return d, attr_keys
Example No. 5
    def generate_aco_gp(self, p_matrix):
        attr_keys = self.attr_keys
        v_matrix = self.d
        pattern = GP()

        # 1. Generate gradual items with highest pheromone and visibility
        m = p_matrix.shape[0]
        for i in range(m):
            combine_feature = np.multiply(v_matrix[i], p_matrix[i])
            total = np.sum(combine_feature)
            with np.errstate(divide='ignore', invalid='ignore'):
                probability = combine_feature / total
            cum_prob = np.cumsum(probability)
            r = np.random.random_sample()
            try:
                j = np.nonzero(cum_prob > r)[0][0]
                gi = GI.parse_gi(attr_keys[j])
                if not pattern.contains_attr(gi):
                    pattern.add_gradual_item(gi)
            except IndexError:
                continue

        # 2. Evaporate pheromones by factor e
        p_matrix = (1 - self.e_factor) * p_matrix
        return pattern, p_matrix
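
The selection step above is a roulette wheel over pheromone × visibility. A standalone sketch with hypothetical pheromone and visibility rows:

    import numpy as np

    pheromone = np.array([1.0, 1.0, 2.0, 1.0])   # hypothetical p_matrix row
    visibility = np.array([3.0, 0.0, 5.0, 2.0])  # hypothetical d-matrix row
    combine = np.multiply(visibility, pheromone)
    with np.errstate(divide='ignore', invalid='ignore'):
        probability = combine / np.sum(combine)
    cum_prob = np.cumsum(probability)   # approx. [0.2, 0.2, 0.87, 1.0]
    r = np.random.random_sample()
    j = np.nonzero(cum_prob > r)[0][0]  # index of the selected gradual item
    print(j)
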
Example No. 6
    def generate_random_gp(self):
        p = self.p_matrix
        n = len(self.attr_index)
        pattern = GP()
        attrs = np.random.permutation(n)
        for i in attrs:
            max_extreme = n * 100
            x = float(rand.randint(1, max_extreme) / max_extreme)
            pos = float(p[i][0] / (p[i][0] + p[i][1] + p[i][2]))
            neg = float((p[i][0] + p[i][1]) / (p[i][0] + p[i][1] + p[i][2]))
            if x < pos:
                temp = GI(self.attr_index[i], '+')
            elif (x >= pos) and (x < neg):
                temp = GI(self.attr_index[i], '-')
            else:
                # temp = GI(self.attr_index[i], 'x')
                continue
            pattern.add_gradual_item(temp)
        return pattern
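
Example No. 6 splits the unit interval with the pheromone counts p[i] = [p+, p-, p×]; a draw past the second boundary skips the attribute. A small sketch with made-up counts, using a plain uniform draw in place of rand.randint:

    import random as rand

    p_i = [4.0, 2.0, 2.0]            # hypothetical [p_plus, p_minus, p_none]
    total = p_i[0] + p_i[1] + p_i[2]
    pos = p_i[0] / total             # 0.5  -> '+' when x < 0.5
    neg = (p_i[0] + p_i[1]) / total  # 0.75 -> '-' when 0.5 <= x < 0.75
    x = rand.random()
    symbol = '+' if x < pos else ('-' if x < neg else None)  # None: skip
    print(x, symbol)
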
Example No. 7
    def generate_d(self):
        # 1a. Retrieve/Generate distance matrix (d)
        grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
        d = self.d_set.read_h5_dataset(grp_name)
        if d.size > 0:
            # 1b. Fetch valid bins group
            grp_name = 'dataset/' + self.d_set.step_name + '/valid_bins/'
            h5f = h5py.File(self.d_set.h5_file, 'r')
            attr_keys = list(h5f[grp_name].keys())
            h5f.close()
            return d, attr_keys

        # 1b. Fetch valid bins group
        grp_name = 'dataset/' + self.d_set.step_name + '/valid_bins/'
        h5f = h5py.File(self.d_set.h5_file, 'r')
        grp = h5f[grp_name]
        attr_keys = list(grp.keys())

        # 2. Initialize an empty d-matrix
        n = len(grp)
        d = np.zeros((n, n), dtype=float)  # cumulative sum of all segments
        for k in range(self.d_set.seg_count):
            # 2. For each segment do a binary AND
            for i in range(n):
                for j in range(n):
                    bin_1 = grp[attr_keys[i]]
                    bin_2 = grp[attr_keys[j]]
                    if GI.parse_gi(attr_keys[i]).attribute_col == GI.parse_gi(
                            attr_keys[j]).attribute_col:
                        # Ignore similar attributes (+ or/and -)
                        continue
                    else:
                        # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                        d[i][j] += np.sum(
                            np.multiply(bin_1['bins'][str(k)][:],
                                        bin_2['bins'][str(k)][:]))

        # 3. Save d_matrix in HDF5 file
        h5f.close()
        grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
        self.d_set.add_h5_dataset(grp_name, d)
        return d, attr_keys
Example No. 8
    def generate_d(self):
        # 1a. Fetch valid attribute keys
        grp_name = 'dataset/' + self.d_set.step_name + '/valid_bins/'
        h5f = h5py.File(self.d_set.h5_file, 'r')
        bin_grp = h5f[grp_name]
        attr_keys = list(bin_grp.keys())

        # 1b. Retrieve/Generate distance matrix (d)
        grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
        d = self.d_set.read_h5_dataset(grp_name)
        if d.size > 0:
            # 1b. Fetch valid bins group
            h5f.close()
            return d, attr_keys

        # 2. Initialize an empty d-matrix
        n = len(attr_keys)
        d = np.zeros((n, n),
                     dtype=np.dtype('i8'))  # cumulative sum of all segments
        for i in range(n):
            for j in range(n):
                gi_1 = GI.parse_gi(attr_keys[i])
                gi_2 = GI.parse_gi(attr_keys[j])
                if gi_1.attribute_col == gi_2.attribute_col:
                    # 2a. Ignore similar attributes (+ or/and -)
                    continue
                else:
                    bin_1 = bin_grp[gi_1.as_string()]  # v_bins[i][1]
                    bin_2 = bin_grp[gi_2.as_string()]  # v_bins[j][1]
                    # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                    # 2b. calculate sum from bin ranks (in chunks)
                    #print(bin_1[k])
                    bin_sum = 0
                    for k in range(len(bin_1)):
                        bin_sum += np.sum(np.multiply(bin_1[k], bin_2[k]))
                    d[i][j] += bin_sum

        h5f.close()
        grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
        self.d_set.add_h5_dataset(grp_name, d, compress=True)
        return d, attr_keys
Example No. 9
    def init_gp_attributes(self, attr_data=None):
        # 1. Transpose csv array data
        if attr_data is None:
            attr_data = self.data.T
            self.attr_size = self.row_count
        else:
            self.attr_size = len(attr_data[self.attr_cols[0]])

        # 2. Initialize (k x attr) matrix
        n = self.attr_size
        m = self.col_count
        k = int(n * (n - 1) / 2)
        self.rank_matrix = np.zeros((k, m), dtype=np.float16)

        # 3. Determine binary rank (fuzzy: 0, 0.5, 1) and calculate support of pattern
        valid_count = 0
        for col in self.attr_cols:
            col_data = np.array(attr_data[col], dtype=float)
            incr = GI(col, '+')
            decr = GI(col, '-')

            # 3a. Determine gradual ranks
            tmp_rank = np.where(
                col_data < col_data[:, np.newaxis], 1,
                np.where(col_data > col_data[:, np.newaxis], 0.5, 0))
            tmp_rank = tmp_rank[np.triu_indices(n, k=1)]

            # 3a. Determine gradual ranks
            # bin_sum = 0
            # row = 0
            # for i in range(n):
            #    for j in range(i + 1, n):
            #        if col_data[i] > col_data[j]:
            #            self.rank_matrix[row][col] = 1
            #            bin_sum += 1
            #        elif col_data[j] > col_data[i]:
            #            self.rank_matrix[row][col] = 0.5
            #            bin_sum += 1
            #        row += 1

            # 3b. Check support of each generated item-set
            supp = float(np.count_nonzero(tmp_rank)) / float(n *
                                                             (n - 1.0) / 2.0)
            if supp >= self.thd_supp:
                self.rank_matrix[:, col] = tmp_rank
                self.valid_items.append(incr.as_string())
                self.valid_items.append(decr.as_string())
                valid_count += 2

        if valid_count < 3:
            self.no_bins = True
        del self.data
        del attr_data
        gc.collect()
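
The upper-triangle extraction above keeps one fuzzy rank (0, 0.5, 1) per unordered row pair, i.e. k = n(n-1)/2 entries per attribute. A toy sketch on a 4-value column (values made up for illustration):

    import numpy as np

    col_data = np.array([3.0, 1.0, 2.0, 2.0])
    n = col_data.size                  # k = n*(n-1)/2 = 6 row pairs
    tmp_rank = np.where(col_data < col_data[:, np.newaxis], 1,
                        np.where(col_data > col_data[:, np.newaxis], 0.5, 0))
    tmp_rank = tmp_rank[np.triu_indices(n, k=1)]
    supp = float(np.count_nonzero(tmp_rank)) / float(n * (n - 1.0) / 2.0)
    print(tmp_rank, supp)  # [1. 1. 1. 0.5 0.5 0. ] 0.8333...
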
Example No. 10
    def init_gp_attributes(self, attr_data=None):
        # 1. Transpose csv array data
        if attr_data is None:
            attr_data = self.data.T
            self.attr_size = self.row_count
        else:
            self.attr_size = len(attr_data[self.attr_cols[0]])
        self.step_name = 'step_' + str(int(self.row_count - self.attr_size))

        # 2. Initialize h5 groups to store class attributes
        self.init_zarr_groups()
        z_root = zarr.open(self.z_file, 'r+')

        # 3. Initialize (k x attr) matrix
        n = self.attr_size
        m = self.col_count
        k = int(n * (n - 1) / 2)
        chunk_size = 5

        grp_name = 'dataset/' + self.step_name + '/rank_matrix'
        compressor = Blosc(cname='zstd', clevel=3, shuffle=Blosc.BITSHUFFLE)
        rank_matrix = z_root.create_dataset(grp_name,
                                            shape=(k, m),
                                            dtype=np.float16,
                                            compressor=compressor)
        # rank_matrix = np.memmap(self.np_file, dtype=float, mode='w+', shape=(k, m))

        # 4. Determine binary rank (fuzzy: 0, 0.5, 1) and calculate support of pattern
        valid_count = 0
        valid_items = []
        for col in self.attr_cols:
            col_data = np.array(attr_data[col], dtype=float)
            incr = GI(col, '+')
            decr = GI(col, '-')

            # 4a. Determine gradual ranks
            tmp_rank = np.where(
                col_data < col_data[:, np.newaxis], 1,
                np.where(col_data > col_data[:, np.newaxis], 0.5, 0))
            tmp_rank = tmp_rank[np.triu_indices(n, k=1)]

            # 4b. Check support of each generated item-set
            supp = float(np.count_nonzero(tmp_rank)) / float(n *
                                                             (n - 1.0) / 2.0)
            if supp >= self.thd_supp:
                rank_matrix[:, col] = tmp_rank[:]
                valid_items.append(incr.as_string())
                valid_items.append(decr.as_string())
                valid_count += 2

        grp_name = 'dataset/' + self.step_name + '/valid_items'
        self.add_zarr_dataset(grp_name, np.array(valid_items).astype('S'))
        data_size = np.array(
            [self.col_count, self.row_count, self.attr_size, valid_count])
        self.add_zarr_dataset('dataset/size_arr', data_size)
        if valid_count < 3:
            self.no_bins = True
        del self.data
        del attr_data
        del valid_items
        # print(rank_matrix[:])
        gc.collect()
Example No. 11
    def init_gp_attributes(self, attr_data=None):
        # 1. Transpose csv array data
        if attr_data is None:
            attr_data = self.data.T
            self.attr_size = self.row_count
        else:
            self.attr_size = len(attr_data[self.attr_cols[0]])
        self.step_name = 'step_' + str(int(self.row_count - self.attr_size))

        # 2. Initialize h5 groups to store class attributes
        self.init_h5_groups()
        h5f = h5py.File(self.h5_file, 'r+')

        # 3. Initialize (k x attr) matrix
        n = self.attr_size
        m = self.col_count
        k = int(n * (n - 1) / 2)
        # if k > 10000:
        #    ch = 10000
        # else:
        #    ch = k

        grp_name = 'dataset/' + self.step_name + '/rank_matrix'
        rank_matrix = h5f.create_dataset(grp_name, (k, m),
                                         dtype=np.float16,
                                         chunks=True,
                                         compression="gzip",
                                         compression_opts=9,
                                         shuffle=True)
        # rank_matrix = np.memmap(self.np_file, dtype=float, mode='w+', shape=(k, m))

        # 4. Determine binary rank (fuzzy: 0, 0.5, 1) and calculate support of pattern
        valid_count = 0
        valid_items = []
        for col in self.attr_cols:
            col_data = np.array(attr_data[col], dtype=float)
            incr = GI(col, '+')
            decr = GI(col, '-')

            # 4a. Determine gradual ranks
            tmp_rank = np.where(
                col_data < col_data[:, np.newaxis], 1,
                np.where(col_data > col_data[:, np.newaxis], 0.5, 0))
            tmp_rank = tmp_rank[np.triu_indices(n, k=1)]

            # 4b. Check support of each generated item-set
            supp = float(np.count_nonzero(tmp_rank)) / float(n *
                                                             (n - 1.0) / 2.0)
            if supp >= self.thd_supp:
                rank_matrix[:, col] = tmp_rank[:]
                valid_items.append(incr.as_string())
                valid_items.append(decr.as_string())
                valid_count += 2

        h5f.close()
        grp_name = 'dataset/' + self.step_name + '/valid_items'
        self.add_h5_dataset(grp_name, np.array(valid_items).astype('S'))
        data_size = np.array(
            [self.col_count, self.row_count, self.attr_size, valid_count])
        self.add_h5_dataset('dataset/size_arr', data_size)
        if valid_count < 3:
            self.no_bins = True
        # rank_matrix.flush()
        del self.data
        del attr_data
        del valid_items
        gc.collect()
Example No. 12
    def generate_d(self):
        # 1a. Retrieve/Generate distance matrix (d)
        grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
        d = self.d_set.read_h5_dataset(grp_name)
        if d.size > 0:
            return d

        # 2. Initialize an empty d-matrix
        attr_keys = self.attr_keys
        n = len(attr_keys)
        d = np.zeros((n, n), dtype=float)  # cumulative sum of all segments
        attr_combs = list(combinations(attr_keys, 2))
        h5f = h5py.File(self.d_set.h5_file, 'r+')

        for str_i, str_j in attr_combs:
            gi_1 = GI.parse_gi(str_i)
            gi_2 = GI.parse_gi(str_j)
            if gi_1.attribute_col == gi_2.attribute_col:
                # Ignore similar attributes (+ or/and -)
                continue
            else:
                # Cumulative sum of all segments for 2x2 (all attributes) gradual items
                col_data_1 = self.d_set.attr_data[gi_1.attribute_col]
                col_data_2 = self.d_set.attr_data[gi_2.attribute_col]

                grp1 = 'dataset/' + self.d_set.step_name + '/temp_bin1'
                if gi_1.symbol == '+':
                    bin_1 = h5f.create_dataset(
                        grp1,
                        data=col_data_1 > col_data_1[:, np.newaxis],
                        chunks=True)
                else:
                    bin_1 = h5f.create_dataset(
                        grp1,
                        data=col_data_1 < col_data_1[:, np.newaxis],
                        chunks=True)

                grp2 = 'dataset/' + self.d_set.step_name + '/temp_bin2'
                if gi_2.symbol == '+':
                    bin_2 = h5f.create_dataset(
                        grp2,
                        data=col_data_2 > col_data_2[:, np.newaxis],
                        chunks=True)
                else:
                    bin_2 = h5f.create_dataset(
                        grp2,
                        data=col_data_2 < col_data_2[:, np.newaxis],
                        chunks=True)

                for k in bin_1.iter_chunks():
                    i = attr_keys.index(gi_1.as_string())
                    j = attr_keys.index(gi_2.as_string())
                    bin_sum = np.sum(np.multiply(bin_1[k], bin_2[k]))
                    d[i][j] += bin_sum
                    d[j][i] += bin_sum
                del h5f[grp1]
                del h5f[grp2]

        # 3. Save d_matrix in HDF5 file
        h5f.close()
        grp_name = 'dataset/' + self.d_set.step_name + '/d_matrix'
        self.d_set.add_h5_dataset(grp_name, d)
        return d
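
The temporary bins above are streamed back chunk by chunk through h5py's Dataset.iter_chunks(). A minimal sketch of that chunked multiply-and-sum, using a hypothetical temporary file and a single column for both bins (which also shows why same-attribute pairs are skipped: the '+' and '-' bins of one attribute never overlap):

    import h5py
    import numpy as np

    col = np.random.rand(100)                    # hypothetical column data
    with h5py.File('temp_bins.h5', 'w') as h5f:  # hypothetical temp file
        bin_1 = h5f.create_dataset('bin_1', data=col > col[:, np.newaxis],
                                   chunks=True)
        bin_2 = h5f.create_dataset('bin_2', data=col < col[:, np.newaxis],
                                   chunks=True)
        bin_sum = 0
        for k in bin_1.iter_chunks():            # k is a tuple of slices
            bin_sum += np.sum(np.multiply(bin_1[k], bin_2[k]))
    print(bin_sum)                               # 0 for a single attribute
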
Example No. 13
    def init_gp_attributes(self, attr_data=None):
        # (check) implement parallel multiprocessing
        # 1. Initialize h5 groups to store class attributes
        self.init_h5_groups()
        # Open the HDF5 file once; create_dataset() below needs it in both branches
        h5f = h5py.File(self.h5_file, 'r+')

        # 2. Transpose csv array data
        if attr_data is None:
            attr_data = h5f['dataset/attr_data']
            self.attr_size = self.row_count
        else:
            self.attr_size = len(attr_data[self.attr_cols[0]])
        self.step_name = 'step_' + str(int(self.row_count - self.attr_size))

        # 3. Construct and store 1-item_set valid bins
        # execute binary rank to calculate support of pattern
        n = self.attr_size
        valid_count = 0
        # valid_bins = list()
        for col in self.attr_cols:
            col_data = np.array(attr_data[col], dtype=float)
            incr = GI(col, '+')
            decr = GI(col, '-')

            # 3a. Chunk col_data into segments
            col_segs = np.array_split(col_data, self.chunks)
            col_bins_pos = []
            col_bins_neg = []
            bin_sum = 0
            # print(col_segs)
            for i in range(self.chunks):
                for j in range(self.chunks):
                    with np.errstate(invalid='ignore'):
                        tmp_bin = col_segs[i] > col_segs[j][:, np.newaxis]
                        bin_sum += np.sum(tmp_bin)
                        col_bins_pos.append(tmp_bin)
                        tmp_bin = col_segs[i] < col_segs[j][:, np.newaxis]
                        col_bins_neg.append(tmp_bin)

            # 3b. Check support of each generated itemset
            supp = float(bin_sum) / float(n * (n - 1.0) / 2.0)
            if supp >= self.thd_supp:
                grp_name = ('dataset/' + self.step_name + '/valid_bins/'
                            + incr.as_string())
                # self.add_h5_dataset(grp_name, col_bins_pos)
                h5f.create_dataset(grp_name, data=col_bins_pos)
                grp_name = ('dataset/' + self.step_name + '/valid_bins/'
                            + decr.as_string())
                # self.add_h5_dataset(grp_name, col_bins_neg)
                h5f.create_dataset(grp_name, data=col_bins_neg)
                valid_count += 2
                # valid_bins.append(np.array([incr.tolist(), col_bins_pos], dtype=object))
                # valid_bins.append(np.array([decr.tolist(), col_bins_neg], dtype=object))
        # self.valid_bins = np.array(valid_bins)
        # print(self.valid_bins)
        # h5f.close()
        # grp_name = 'dataset/' + self.step_name + '/valid_items'
        # self.add_h5_dataset(grp_name, np.array(valid_items).astype('S'))
        h5f.close()
        data_size = np.array(
            [self.col_count, self.row_count, self.attr_size, valid_count])
        self.add_h5_dataset('dataset/size_arr', data_size)
        if valid_count < 3:
            self.no_bins = True
        # rank_matrix.flush()
        del attr_data
        # del valid_items
        gc.collect()
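
Example No. 13 segments each column with np.array_split and compares every segment against every other, so the chunked sum equals the full n x n comparison. A toy sketch with 2 chunks (values made up for illustration):

    import numpy as np

    col_data = np.array([3.0, 1.0, 2.0, 5.0])
    chunks = 2
    col_segs = np.array_split(col_data, chunks)
    bin_sum = 0
    for i in range(chunks):
        for j in range(chunks):
            with np.errstate(invalid='ignore'):
                bin_sum += np.sum(col_segs[i] > col_segs[j][:, np.newaxis])
    n = col_data.size
    supp = float(bin_sum) / float(n * (n - 1.0) / 2.0)
    print(bin_sum, supp)  # 6 1.0 (all values are distinct)
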