Ejemplo n.º 1
0
 def _calculate_information_gain(self, y, y1, y2):
     """Return the entropy reduction obtained by splitting y into y1 and y2.

     The gain is the entropy of ``y`` minus the entropies of ``y1`` and
     ``y2``, weighted by ``len(y1) / len(y)`` and its complement.

     NOTE(review): presumably y1 and y2 partition y; this is not checked.
     An empty ``y`` raises ZeroDivisionError.
     """
     left_fraction = len(y1) / len(y)
     parent_entropy = calculate_entropy(y)
     left_term = left_fraction * calculate_entropy(y1)
     right_term = (1 - left_fraction) * calculate_entropy(y2)
     return parent_entropy - left_term - right_term
Ejemplo n.º 2
0
 def _calculate_information_gain(self, y, y1, y2):
     """Compute the information gain of splitting ``y`` into ``y1``/``y2``.

     Each subset's entropy is weighted by its share of ``y`` (the share of
     ``y2`` is taken as one minus the share of ``y1``) and subtracted from
     the entropy of ``y``.

     NOTE(review): assumes ``y`` is non-empty; otherwise the share
     computation divides by zero.
     """
     share_first = len(y1) / len(y)
     share_second = 1 - share_first
     entropy_before = calculate_entropy(y)
     weighted_first = share_first * calculate_entropy(y1)
     weighted_second = share_second * calculate_entropy(y2)
     return entropy_before - weighted_first - weighted_second
 def _calculate_information_gain(self, y, y1, y2):
     """Return entropy(y) minus the weighted entropies of y1 and y2.

     ``y1`` is weighted by ``len(y1) / len(y)`` and ``y2`` by the
     complementary weight.

     NOTE(review): an empty ``y`` raises ZeroDivisionError.
     """
     p = len(y1) / len(y)
     gain = calculate_entropy(y)
     # Subtract one weighted subset entropy at a time, left to right.
     for weight, subset in ((p, y1), (1 - p, y2)):
         gain = gain - weight * calculate_entropy(subset)
     return gain
Ejemplo n.º 4
0
    def get_diversity_metrics(self, checkpoint, x_test, y_test, num_samples=10, num_iterations=3):
        """Print entropy and n-gram diversity of repeated decodings of x_test.

        Every row of ``x_test``/``y_test`` is repeated ``num_samples`` times
        and the weights stored at ``checkpoint`` are restored into a fresh
        session. For each of ``num_iterations`` passes the repeated inputs
        are decoded, every consecutive group of ``num_samples`` predictions
        is tokenized into one corpus, and that corpus is scored with
        ``utils.calculate_entropy`` and ``utils.calculate_ngram_diversity``.
        Per-pass scores are summed and divided by ``len(x_test)``; the mean
        over all passes is printed. Nothing is returned.

        NOTE(review): presumably decoding is stochastic, so the repeated
        inputs yield different sentences - confirm against the model graph.
        """
        repeated_inputs = np.repeat(x_test, num_samples, axis=0)
        repeated_targets = np.repeat(y_test, num_samples, axis=0)
        num_inputs = len(x_test)

        entropy_per_pass = []
        distinct1_per_pass = []
        distinct2_per_pass = []

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            tf.train.Saver().restore(sess, checkpoint)

            for _ in tqdm(range(num_iterations)):
                # Decode the whole repeated test set batch by batch.
                decoded = []
                batches = utils.get_batches_xy(repeated_inputs, repeated_targets, self.batch_size)
                for input_batch, _, source_lengths, _ in batches:
                    feed = {
                        self.input_data: input_batch,
                        self.source_sentence_length: source_lengths,
                        self.keep_prob: 1.0,
                        self.word_dropout_keep_prob: 1.0,
                        self.z_temperature: self.z_temp,
                    }
                    decoded.extend(sess.run(self.inference_logits, feed_dict=feed))

                entropy_sum = 0
                distinct1_sum = 0
                distinct2_sum = 0
                group = []

                # zip bounds the loop by the shorter of inputs and predictions.
                for position, (_, token_ids) in enumerate(zip(repeated_inputs, decoded), start=1):
                    words = [self.decoder_idx_word[token]
                             for token in token_ids
                             if token not in [self.pad, self.eos]]
                    group.append(" ".join(words))

                    if position % num_samples != 0:
                        continue

                    # A full group of samples for one source sentence is ready.
                    corpus = [token for sentence in group for token in word_tokenize(sentence)]
                    entropy_sum += utils.calculate_entropy(corpus)
                    diversity = utils.calculate_ngram_diversity(corpus)
                    distinct1_sum += diversity[0]
                    distinct2_sum += diversity[1]
                    group = []

                entropy_per_pass.append(entropy_sum / num_inputs)
                distinct1_per_pass.append(distinct1_sum / num_inputs)
                distinct2_per_pass.append(distinct2_sum / num_inputs)

        summary = 'Entropy = {:>.3f} | Distinct-1 = {:>.3f} | Distinct-2 = {:>.3f}'
        print(summary.format(np.mean(entropy_per_pass),
                             np.mean(distinct1_per_pass),
                             np.mean(distinct2_per_pass)))
Ejemplo n.º 5
0
 def __init__(self, features, labels, num_cls):
     """Initialise a tree node from its samples and their labels.

     Args:
         features: List[List[any]], one row per sample.
         labels: List[int], one label per row of ``features``.
         num_cls: int, stored unchanged on the node.

     Sets ``cls_max`` to the most frequent label (the first one in
     ``np.unique`` order on ties), ``entropy`` from the per-label counts,
     and ``splittable`` to whether the node can be divided further.
     ``cls_max`` is left unset when ``labels`` is empty.
     """
     self.features = np.array(features)
     self.labels = labels
     self.children = []
     self.num_cls = num_cls

     # Occurrence count of every distinct label, in np.unique order.
     distinct_labels = np.unique(labels)
     branch = [self.labels.count(label) for label in distinct_labels]

     # Remember the most common label of this node.
     best_count = 0
     for label, occurrences in zip(distinct_labels, branch):
         if occurrences > best_count:
             best_count = occurrences
             self.cls_max = label

     self.entropy = Util.calculate_entropy(branch)

     # Not splittable when fewer than two labels are present or when the
     # first sample has no features.
     self.splittable = len(distinct_labels) >= 2 and len(self.features[0]) != 0

     self.dim_split = None  # the index of the feature to be split
     self.feature_uniq_split = None  # the possible unique values of the feature to be split
Ejemplo n.º 6
0
    def create_node(arr):
        """Build a decision node (or a leaf) for the samples in ``arr``.

        Returns a leaf ``Node(None)`` whose ``result`` is
        ``InputData.BENIGN`` when every sample is benign, and
        ``InputData.MALIGNANT`` when every sample is non-benign or ``arr``
        is empty. Otherwise it picks the ``InputType`` with the largest
        information gain and returns a node with one child per feature
        value 1..10. Children that receive no samples get ``result = -1``.

        NOTE(review): feature values are assumed to lie in 1..10; any other
        value raises KeyError while counting.
        """
        def make_key(input_type, label, reading):
            # Counter key: feature identifier + class label + feature value.
            return input_type.value + str(label) + str(reading)

        input_types = list(InputType)
        readings = range(1, 11)

        # Zero-initialise a counter for every (feature, class, value) triple.
        tally = {}
        for reading in readings:
            for input_type in input_types:
                tally[make_key(input_type, InputData.BENIGN, reading)] = 0
                tally[make_key(input_type, InputData.MALIGNANT, reading)] = 0

        benign_total = 0
        malignant_total = 0
        for sample in arr:
            if sample.type == InputData.BENIGN:
                benign_total += 1
            else:
                malignant_total += 1

            for input_type in input_types:
                key = make_key(input_type, sample.type, sample.get_value(input_type))
                tally[key] = tally[key] + 1

        # Pure (or empty) sample sets become leaves.
        if benign_total == 0 or malignant_total == 0:
            leaf = Node(None)
            if benign_total > 0:
                leaf.result = InputData.BENIGN
            else:
                leaf.result = InputData.MALIGNANT
            return leaf

        sample_total = len(arr)
        base_entropy = calculate_entropy(benign_total, malignant_total)

        # Gain per feature = base entropy minus the contribution of each value.
        gains = {input_type: base_entropy for input_type in input_types}
        for reading in readings:
            for input_type in input_types:
                contribution = calculate_total_entropy(
                    tally[make_key(input_type, InputData.BENIGN, reading)],
                    tally[make_key(input_type, InputData.MALIGNANT, reading)],
                    sample_total)
                gains[input_type] = gains[input_type] - contribution

        # The first feature with the strictly largest gain wins.
        best_gain = -sys.maxsize - 1
        best_type = None
        for input_type, gain in gains.items():
            if gain > best_gain:
                best_gain = gain
                best_type = input_type

        node = Node(best_type)
        node.arr = arr

        for reading in readings:
            child = Node(best_type, reading)
            for sample in arr:
                if reading == sample.get_value(best_type):
                    child.add_item(sample)
            if child.length == 0:
                child.result = -1
            node.add_child(child)

        return node
Ejemplo n.º 7
0
    def create_node(arr):
        """Build a decision node (or a leaf) for the weather samples in ``arr``.

        When every sample has ``result == 1`` a leaf with
        ``InputData.POSITIVE`` is returned; when no sample does (and ``arr``
        is non-empty) a leaf with ``InputData.NEGATIVE`` is returned.
        Otherwise the attribute among Outlook, Temperature, Humidity and
        Wind with the largest information gain is chosen (earlier
        attributes win ties). The returned node has one child per value of
        that attribute, each holding the matching samples.
        """
        # (enum class, sample attribute name, child member names in order).
        # Members are named rather than referenced so they are only resolved
        # for the attribute that is actually selected.
        attributes = (
            (Outlook, "outlook", ("SUNNY", "OVERCAST", "RAIN")),
            (Temperature, "temperature", ("HOT", "MILD", "COOL")),
            (Humidity, "humidity", ("HIGH", "NORMAL")),
            (Wind, "wind", ("WEAK", "STRONG")),
        )

        def make_leaf(label):
            # Leaf node carrying only a final classification.
            leaf = Node(None)
            leaf.result = label
            return leaf

        # Zero-initialise a counter per (attribute value, class) pair.
        tally = {}
        for enum_cls, _, _ in attributes:
            for member in list(enum_cls):
                tally[str(member.name) + InputData.NEGATIVE] = 0
                tally[str(member.name) + InputData.POSITIVE] = 0

        positives = 0
        negatives = 0
        for sample in arr:
            if sample.result == 1:
                positives += 1
            else:
                negatives += 1

            for enum_cls, attr_name, _ in attributes:
                for member in list(enum_cls):
                    if getattr(sample, attr_name) == member:
                        key = str(member.name) + str(sample.result)
                        tally[key] = tally[key] + 1

        if negatives == 0 and positives > 0:
            return make_leaf(InputData.POSITIVE)

        if positives == 0 and negatives > 0:
            return make_leaf(InputData.NEGATIVE)

        total = len(arr)
        base_entropy = calculate_entropy(positives, negatives)

        # Information gain of each attribute, in declaration order.
        gains = []
        for enum_cls, _, _ in attributes:
            gain = base_entropy
            for member in list(enum_cls):
                gain -= calculate_total_entropy(
                    tally[str(member.name) + InputData.POSITIVE],
                    tally[str(member.name) + InputData.NEGATIVE], total)
            gains.append((enum_cls, gain))

        # Strict comparison keeps the earliest attribute on ties.
        best_gain = -sys.maxsize - 1
        best_cls = None
        for enum_cls, gain in gains:
            if gain > best_gain:
                best_gain = gain
                best_cls = enum_cls

        node = Node(best_cls)
        node.arr = arr

        for enum_cls, attr_name, member_names in attributes:
            if best_cls == enum_cls:
                branches = []
                for member_name in member_names:
                    member = getattr(enum_cls, member_name)
                    branches.append((member, Node(enum_cls, member)))

                for _, child in branches:
                    node.add_child(child)

                # Route every sample to the child matching its value.
                for sample in arr:
                    for member, child in branches:
                        if getattr(sample, attr_name) == member:
                            child.add_item(sample)

        return node