def clustering_oka(nec_set, k=25):
    """Group NEC clusters into clusters of at least ``k`` records (OKA).

    OKA: one-time-pass k-means. The steps are:

    1. Clusters in ``nec_set`` that already hold ``k`` or more records become
       candidate clusters directly.
    2. From the remaining small clusters, ``remain // k`` are picked at random
       as extra seeds (``remain`` is the total record count of the small
       clusters); every other small cluster is merged into the candidate
       selected by ``find_best_cluster_iloss``.
    3. Candidates larger than ``k`` are passed to ``adjust_cluster`` together
       with the ``residual`` list; candidates smaller than ``k`` are set aside
       as ``less_clusters``.
    4. Residual records are placed into ``less_clusters`` first (a cluster is
       promoted to the result once it reaches ``k``), then into the result
       clusters once no undersized cluster is left.
    5. Undersized clusters that are still left over are dissolved and their
       members distributed over the result clusters.

    :param nec_set: list of cluster objects supporting ``len()``,
        ``merge_cluster``, ``add_record`` and a ``member`` attribute
    :param k: minimum cluster size
    :return: list of resulting clusters
    """
    can_clusters = [cluster for cluster in nec_set if len(cluster) >= k]
    small_clusters = [cluster for cluster in nec_set if len(cluster) < k]
    remain = sum(len(cluster) for cluster in small_clusters)
    clusters = []

    # Randomly choose seeds among the small clusters.  Floor division keeps
    # the sample size an int on Python 3 (true division would yield a float,
    # which random.sample rejects) and matches Python 2's int "/" exactly.
    seed_index = random.sample(range(len(small_clusters)), remain // k)
    for index in seed_index:
        can_clusters.append(small_clusters[index])
    seed_lookup = set(seed_index)  # built once, not once per element
    pending = [
        cluster
        for position, cluster in enumerate(small_clusters)
        if position not in seed_lookup
    ]

    # Merge every non-seed small cluster into its best candidate.
    while pending:
        nec = pending.pop()
        index = find_best_cluster_iloss(nec, can_clusters)
        can_clusters[index].merge_cluster(nec)

    residual = []
    less_clusters = []
    for cluster in can_clusters:
        if len(cluster) < k:
            less_clusters.append(cluster)
        else:
            if len(cluster) > k:
                # presumably trims the cluster to k and pushes the surplus
                # records onto residual -- confirm against adjust_cluster
                adjust_cluster(cluster, residual, k)
            clusters.append(cluster)

    while residual:
        record = residual.pop()
        if less_clusters:
            index = _assign_to_best_cluster(record, residual, less_clusters)
            if len(less_clusters[index]) >= k:
                clusters.append(less_clusters.pop(index))
        else:
            _assign_to_best_cluster(record, residual, clusters)

    # Sometimes the residual records cannot fill every undersized cluster,
    # so the leftovers are dissolved into the finished clusters.
    # NOTE(review): if ``clusters`` is empty here the behaviour depends on how
    # find_best_cluster_iloss treats an empty candidate list -- confirm.
    if less_clusters:
        for cluster in less_clusters:
            residual.extend(cluster.member)
        while residual:
            record = residual.pop()
            _assign_to_best_cluster(record, residual, clusters)
    return clusters


def _assign_to_best_cluster(record, residual, candidates):
    """Add ``record`` to the candidate picked by ``find_best_cluster_iloss``.

    After the record is added, ``residual_handle`` is called so that records
    at the tail of ``residual`` sharing the same QI key join the same cluster.

    :return: index of the chosen cluster within ``candidates``
    """
    record_key = qid_to_key(record[:QI_LEN])
    index = find_best_cluster_iloss(record, candidates)
    candidates[index].add_record(record)
    residual_handle(residual, record_key, candidates[index])
    return index
def NCP(record):
    """Compute the NCP (Normalized Certainty Penalty) of a generalized record.

    The result is memoised in ``NCP_CACHE`` under ``qid_to_key(record)``.
    For each of the first ``QI_LEN`` attributes a width is computed and
    divided by ``QI_RANGE[i]``; the NCP is the sum of these ratios:

    * when ``IS_CAT[i]`` is ``False``: a value that ``float()`` accepts has
      width 0; otherwise the value is split on ``','`` and the width is the
      second part minus the first part;
    * otherwise: the width is ``len(ATT_TREES[i][record[i]])``.

    :param record: the (generalized) record to score
    :return: the NCP as a float
    """
    list_key = qid_to_key(record)
    try:
        cached = NCP_CACHE[list_key]
    except KeyError:
        pass
    else:
        return cached

    total = 0.0
    for idx in range(QI_LEN):
        span = 0.0
        if IS_CAT[idx] is not False:
            span = len(ATT_TREES[idx][record[idx]]) * 1.0
        else:
            value = record[idx]
            try:
                # a single number (not an interval) keeps a width of zero
                float(value)
            except ValueError:
                bounds = value.split(',')
                span = float(bounds[1]) - float(bounds[0])
        total += span / QI_RANGE[idx]

    NCP_CACHE[list_key] = total
    return total
def residual_handle(residual, record_key, cluster):
    """Move trailing records with the same QI key from ``residual`` to ``cluster``.

    While the last record of ``residual`` has a key (``qid_to_key`` of its
    first ``QI_LEN`` values) equal to ``record_key``, it is popped and added
    to ``cluster``. The loop stops at the first non-matching record or when
    ``residual`` is empty.

    :param residual: list of records, consumed from its end (mutated in place)
    :param record_key: key the trailing records are compared against
    :param cluster: object whose ``add_record`` receives each matching record
    """
    while residual and record_key == qid_to_key(residual[-1][:QI_LEN]):
        cluster.add_record(residual.pop())
def create_nec(data):
    """Create NECs from a dataset, grouped in a dict.

    Records are grouped by ``qid_to_key`` of their first ``QI_LEN`` values.
    The first record seen for a key creates ``Cluster([record], record)``;
    later records with the same key are passed to ``add_same_record``.

    :param data: dataset (iterable of records)
    :return: NEC in dict format: key is the ``qid_to_key`` result, value is
        a Cluster
    """
    nec_dict = {}
    for record in data:
        key = qid_to_key(record[:QI_LEN])
        # Only the dict lookup is guarded: a KeyError raised inside
        # add_same_record must propagate instead of being mistaken for a
        # missing key and silently replacing the existing cluster.
        try:
            cluster = nec_dict[key]
        except KeyError:
            nec_dict[key] = Cluster([record], record)
        else:
            cluster.add_same_record(record)
    return nec_dict