Ejemplo n.º 1
0
def conditional_entropy(universe, feature_b, feature_a, display=False):
    """
    Calculate the conditional entropy H(B|A) (or H(B|A1,A2,...)).

    The value is computed as

        H(B|A) = -sum_a p(a) * sum_b p(b|a) * log2(p(b|a))

    with p(a) = |a| / |U| and p(b|a) = |a & b| / |a|, where ``a`` and ``b``
    run over the blocks returned by ``partition`` for ``feature_a`` and
    ``feature_b`` respectively.

    :param universe: the universe of objects(feature vector/sample/instance);
        ``universe.shape[0]`` is used as the number of objects |U|
    :param feature_b: list, features' index of B (the conditioned features)
    :param feature_a: list, features' index of A (the conditioning features)
    :param display: if True, print every conditional probability p(b|a)
    :return: the conditional entropy in bits; 0 when there is nothing to sum
    """
    partitions_a = partition(universe, feature_a)
    # Sets give O(1) membership tests for the intersection count below.
    blocks_b = [set(b) for b in partition(universe, feature_b)]
    total = 0
    for a in partitions_a:
        size_a = len(a)
        if size_a == 0:
            # An empty block has probability 0 and contributes nothing.
            continue
        inner_total = 0
        for b in blocks_b:
            # p(b|a): the share of block a that also falls into block b.
            probability = sum(1 for i in a if i in b) / size_a
            if display:
                print(probability)
            if probability > 0:
                inner_total += probability * math.log2(probability)
        # Weight the inner sum by p(a) = |a| / |U|. The size of the *outer*
        # block is used on purpose: the inner loop variable must not leak here.
        total += inner_total * size_a / universe.shape[0]
    return -1 * total
 def proximity_of_objects_in_boundary_region_to_mean_positive_region_based_distance(
         universe, features_1, features_2, distance):
     """
     Proximity of the boundary objects to the mean of the positive region.

     For every block of ``partition(universe, features_2)`` the boundary and
     positive objects with respect to ``features_1`` are collected through
     ``boundary_region_of_sample_subset`` and
     ``positive_region_of_sample_subset``. The result is the reciprocal of the
     summed distance between the positive-region mean and each boundary object.

     Special cases handled before any distance is computed:
     * no boundary objects -> 1
     * no positive objects -> 1 / (number of boundary objects)

     :param universe: the universe of objects(feature vector/sample/instance)
     :param features_1: list, a set of features' serial number
     :param features_2: list, a set of features' serial number
     :param distance: callable invoked as ``distance(mean, universe[y], features_1)``
     :return: float (or the int 1 when there are no boundary objects)
     """
     boundary_objects = []
     positive_objects = []
     for block in partition(universe, features_2):
         boundary_objects.extend(boundary_region_of_sample_subset(universe, block, features_1))
         # Drop duplicates after every block, exactly like the accumulation order requires.
         boundary_objects = list(set(boundary_objects))
         positive_objects.extend(positive_region_of_sample_subset(universe, block, features_1))

     if not boundary_objects:
         return 1
     if not positive_objects:
         return 1 / len(boundary_objects)

     positive_mean = NoiseResistantDependencyMeasure.mean_positive_region(
         universe, positive_objects, features_1)
     summed_distance = 0
     for index in boundary_objects:
         summed_distance += distance(positive_mean, universe[index], features_1)
     # NOTE(review): raises ZeroDivisionError if every boundary object has
     # distance 0 to the mean - confirm whether that can happen.
     return 1 / summed_distance
Ejemplo n.º 3
0
def joint_entropy(universe, feature_a, feature_b):
    """
    Calculate the joint entropy H(a,b) of two feature sets.

    Every pair of blocks (one from each partition) contributes
    p * log2(p), where p = |a & b| / |U|; pairs that do not overlap are skipped.

    :param universe: the universe of objects(feature vector/sample/instance);
        ``universe.shape[0]`` is used as the number of objects
    :param feature_a: list, features' index
    :param feature_b: list, features' index
    :return: the joint entropy in bits
    """
    blocks_a = partition(universe, feature_a)
    blocks_b = partition(universe, feature_b)
    accumulated = 0
    for block_a in blocks_a:
        for block_b in blocks_b:
            shared = [item for item in block_a if item in block_b]
            joint_probability = len(shared) / universe.shape[0]
            if joint_probability <= 0:
                # log2(0) is undefined; an empty overlap adds nothing.
                continue
            accumulated += joint_probability * math.log2(joint_probability)
    return -accumulated
Ejemplo n.º 4
0
def classical_rough_set_partition_all():
    """
    Time ``partition_by_equal_array`` and ``partition`` on the mushroom data.

    Reads "mushroom.csv" (no header row) from the working directory, prints
    the shape of the data, then runs both partition functions over all
    columns and prints the wall-clock time each one took.

    :return: None
    """
    data = np.array(pd.read_csv("mushroom.csv", header=None))
    print(data.shape)
    attributes = list(range(data.shape[1]))

    print("partition_by_array_equal")
    began = time.time()
    partition_by_equal_array(data, attributes)
    elapsed = time.time() - began
    print('The time used: {} seconds'.format(elapsed))

    print("partition")
    began = time.time()
    partition(data, attributes)
    elapsed = time.time() - began
    print('The time used: {} seconds'.format(elapsed))
    print()
Ejemplo n.º 5
0
    def calculate_positive_region(self, attributes):
        """
        Collect, per label, the instances whose margin is positive.

        Notes carried over from the original author:
        "FH" hints at the NT and "FM" hints at the NS, where
        NT : the nearest instance from the same label with x
        NS : the nearest instance whose label is different from x
        Steps 4-9 first compute the margin of instance x; if it cannot be
        computed, steps 11-18 are executed (the step numbers refer to an
        algorithm description that is not part of this file).

        The margin of x is taken as 0 when its elementary set has a single
        member, as 1 when exactly one instance lies outside the elementary
        set, and otherwise as the second-smallest distance to the outside
        instances minus the second-smallest distance inside the elementary set.

        :param attributes: feature indexes used to build the distance matrix
        :return: list with one entry per label in ``self.labels``; each entry
            is the list of instances with margin > 0
        """
        # Pick the matrix builder that matches the configured distance function.
        if self.distance == euclidean_distance:
            distances = generate_euclidean_distance_matrix_by_vector(
                self.universe, attributes)
        elif self.distance == standardized_euclidean_distance:
            distances = generate_euclidean_distance_matrix_by_vector(
                self.universe, attributes, standard=True)
        else:
            distances = generate_distance_matrix(
                self.universe, attributes, self.distance)

        regions = []
        for label in self.labels:  # handle every label separately
            accepted = []
            # partition of the universe induced by this label
            for same_class in partition(self.universe, [label]):
                other_class = [
                    index for index in range(self.universe.shape[0])
                    if index not in same_class
                ]
                if len(same_class) == 1:
                    # Same class: margin is 0 for the lone member, never accepted.
                    continue
                if len(other_class) == 1:
                    # Different class: margin is 1 for every member, all accepted.
                    accepted.extend(same_class)
                    continue
                for x in same_class:
                    # NOTE(review): index [1] is used on both sides. Inside the
                    # elementary set it presumably skips x's own distance; x is
                    # not in other_class, so [0] may have been intended there -
                    # confirm. With fewer than two outside instances (e.g. none)
                    # the [1] lookup raises IndexError.
                    outside = heapq.nsmallest(2, distances[x][other_class])[1]
                    inside = heapq.nsmallest(2, distances[x][same_class])[1]
                    if outside - inside > 0:
                        accepted.append(x)
            regions.append(accepted)
        return regions
Ejemplo n.º 6
0
def entropy(universe, feature):
    """
    Calculate the entropy H(a) of a feature set.

    Each block of ``partition(universe, feature)`` contributes
    p * log2(p) with p = |block| / |U|.

    :param universe: the universe of objects(feature vector/sample/instance);
        ``universe.shape[0]`` is used as the number of objects
    :param feature: list, features' index
    :return: the entropy in bits
    """
    accumulated = 0
    for block in partition(universe, feature):
        share = len(block) / universe.shape[0]
        # NOTE(review): an empty block would make log2 raise ValueError;
        # presumably partition never returns one - confirm.
        accumulated += share * math.log2(share)
    return -accumulated
 def noisy_dependency_of_feature_subset_d_on_feature_subset_c(universe, feature_subset_c, feature_subset_d):
     """
     Noisy dependency of feature subset d on feature subset c.

     Sums, over every block of ``partition(universe, feature_subset_d)``, the
     value returned by ``NoiseResistantDependencyMeasure.
     proximity_of_boundary_region_to_positive_region_based_portion`` for that
     block and ``feature_subset_c``.

     :param universe: the universe of objects(feature vector/sample/instance)
     :param feature_subset_c: list, a set of features' serial number
     :param feature_subset_d: list, a set of features' serial number
     :return: the accumulated dependency (0 when the partition is empty)
     """
     measure = NoiseResistantDependencyMeasure
     dependency = 0
     for block in partition(universe, feature_subset_d):
         dependency += measure.proximity_of_boundary_region_to_positive_region_based_portion(
             universe, block, feature_subset_c)
     return dependency
Ejemplo n.º 8
0
def lower_approximations_of_universe_neighborhood(universe, attributes, labels, delta):
    """
    Get the lower approximations of U/R based on delta neighborhoods.

    A neighborhood produced by ``generate_delta_neighborhood`` is kept when
    ``set_is_include`` accepts it against the label partition; its first
    element (presumably the object the neighborhood was built around -
    confirm against ``generate_delta_neighborhood``) is what gets reported.

    :param universe: the universe of objects(feature vector/sample/instance)
    :param attributes: features' index
    :param labels: labels' index
    :param delta: radius
    :return: list, the first elements of the accepted neighborhoods in
        ascending order
    """
    neighborhoods = generate_delta_neighborhood(universe, attributes, delta)
    label_blocks = partition(universe, labels)
    return sorted(
        neighborhood[0]
        for neighborhood in neighborhoods
        if set_is_include(neighborhood, label_blocks)
    )
    def dep_density(self, attributes):
        """
        Dependency degree based on density neighborhoods.

        For every density neighborhood, find the decision blocks that contain
        the neighborhood's first element and count how many members of the
        neighborhood fall into such a block, minus one. The counts are summed
        and divided by the number of objects in the universe.

        :param attributes: feature indexes used to build the density neighborhoods
        :return: 0 when ``attributes`` is empty, otherwise the ratio above
        """
        if len(attributes) == 0:
            return 0
        neighborhoods = generate_density_neighborhood(
            self.universe, attributes, distance=self.distance)
        decision_blocks = partition(self.universe, self.decision_features)

        agreeing = 0
        for neighborhood in neighborhoods:
            for block in decision_blocks:
                if neighborhood[0] not in block:
                    continue
                # Members sharing the block, minus one.
                agreeing += sum(1 for member in neighborhood if member in block) - 1
        return agreeing / self.universe.shape[0]
 def proximity_of_boundary_region_to_positive_region_based_portion(universe, sample_subset, feature_subset):
     """
     A noise measure function.

     For every elementary set of ``partition(universe, feature_subset)`` the
     value of ``NoiseResistantDependencyMeasure.related_information_of_subset_b``
     against ``sample_subset`` is obtained; values equal to 1 are left out, the
     rest are summed, and the sum is divided by the number of elementary sets.

     :param universe: the universe of objects(feature vector/sample/instance)
     :param sample_subset: list, a set of objects' serial number
     :param feature_subset: list, a set of features' serial number
     :return: float, the proximity
     """
     elementary_sets = partition(universe, feature_subset)
     accumulated = 0
     for block in elementary_sets:
         information = NoiseResistantDependencyMeasure.related_information_of_subset_b(
             block, sample_subset)
         if information == 1:
             # Values of exactly 1 are excluded from the sum.
             continue
         accumulated += information
     # NOTE(review): an empty partition makes this a division by zero.
     return accumulated / len(elementary_sets)
Ejemplo n.º 11
0
def main():
    """
    Demo run of the entropy helpers on the watermelon training data.

    Reads "./../Resources/watermelon2 train.csv" (no header row) and prints,
    in order: the entropy of column 0, the partition by column 0 with the
    part entropy of each block w.r.t. column 6, and the probabilities shown by
    the conditional entropy of columns 0-5 paired with column 6.

    :return: None
    """
    data = pd.read_csv("./../Resources/watermelon2 train.csv", header=None)

    def fresh_universe():
        # A new array for every call, so no call sees another call's array.
        return np.array(data)

    # entropy
    print(entropy(fresh_universe(), [0]))

    # part entropy
    partitions_ = partition(fresh_universe(), [0])
    print(partitions_)
    for part in partitions_:
        print(part_entropy(fresh_universe(), part, [6]))

    # conditional entropy
    for feature in range(6):
        print(feature, "#")
        conditional_entropy(fresh_universe(), [feature], [6], True)
    conditional_entropy(fresh_universe(), [4], [6])
    # conditional mutual information(no example to check)
    # conditional_mutual_information(np.array(data), [], [], [])