forked from iisc-sa-open/trsl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
node.py
137 lines (119 loc) · 4.74 KB
/
node.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#! /usr/bin/env python2
"""
Node Class implemented
todo: Add functionality to serialize and load nodes,
which could in turn be used by another function to do the same
with the entire deciset_dataon tree.
"""
from question import Question
import math
class Node(object):
    """
    A class which holds all the attributes of a node in the
    decision tree.

    todo: Evaluate if a named tuple or dictionary would suffice
    """

    def __init__(self):
        # Left / right children in the decision tree.
        self.lchild = None
        self.rchild = None
        # The set of predictor words and the word index this node's
        # splitting question was asked about.
        self.set = None
        self.dist = None
        self.predictor_variable_index = None
        # Indices (into the ngram table) of the rows that reached this node.
        self.row_fragment_indices = None
        # Entropy weighted by this node's reach probability, and the
        # unweighted entropy of the node's target-word distribution.
        self.probabilistic_entropy = 0
        self.absolute_entropy = 0
        self.parent = None
        self.depth = 0
        # Probability of a data row reaching this node from the root.
        self.probability = 0
        self.best_question = None

    def question_already_asked(self, x_index, set_data):
        """
        Check whether the same question (same set and predictor
        variable index) was already asked by any ancestor up to the root.

        The rationale is that asking the same question again on a subset
        of the data it was asked on before would send all the data down
        one of the YES or NO paths, which is unnecessary computation.

        Returns True if an ancestor asked this exact question, else False.
        """
        parent = self.parent
        while parent is not None:
            if parent.set == set_data and parent.predictor_variable_index == x_index:
                return True
            else:
                parent = parent.parent
        return False

    def eval_question(self, ngram_table, pred_var_set_pair):
        """
        Evaluate a question by computing the average conditional entropy,
        entropy reduction, and the belongs-to / not-belongs-to probabilities.

        pred_var_set_pair is a (predictor variable index, word set) tuple.
        Returns a populated Question instance; if the question was already
        asked by an ancestor, returns a default Question (reduction 0).
        """
        x_index, set_data = pred_var_set_pair
        question = Question()
        if self.question_already_asked(x_index, set_data):
            # The reduction is 0 by default for a fresh Question,
            # so a repeated question never looks attractive.
            return question
        question.set = set_data
        question.predictor_variable_index = x_index
        self.count_target_word_frequencies(ngram_table, x_index, set_data, question)
        question.b_dist_entropy = self.frequencies_to_probabilities_and_entropy(question.b_dist)
        question.nb_dist_entropy = self.frequencies_to_probabilities_and_entropy(question.nb_dist)
        size_row_fragment = (
            len(self.row_fragment_indices)
        )
        # Guard against an empty fragment; `== 0` (not `is 0`) because
        # identity comparison with int literals is an implementation accident.
        question.b_probability = 0 if size_row_fragment == 0 else (
            self.probability * float(len(question.b_indices)) / size_row_fragment
        )
        question.nb_probability = 0 if size_row_fragment == 0 else (
            self.probability * float(len(question.nb_indices)) / size_row_fragment
        )
        # Average conditional entropy: branch entropies weighted by the
        # probability of taking each branch.
        question.avg_conditional_entropy = (
            (question.b_probability * question.b_dist_entropy)
            +
            (question.nb_probability * question.nb_dist_entropy)
        )
        # Information gain of asking this question at this node.
        question.reduction = (
            self.probabilistic_entropy - question.avg_conditional_entropy
        )
        return question

    def count_target_word_frequencies(self, ngram_table, x_index, set_data, question):
        """
        Count target word frequencies for rows whose predictor variable
        belongs to the set (question.b_dist) and for rows whose predictor
        variable does not belong to the set (question.nb_dist).

        Also records the row indices taking each branch in
        question.b_indices / question.nb_indices.
        """
        for table_index in self.row_fragment_indices:
            predictor_word = ngram_table[table_index, x_index]
            # The last column of the ngram window is the target word.
            target_word = ngram_table[
                table_index, ngram_table.ngram_window_size - 1
            ]
            if predictor_word in set_data:
                question.b_indices.append(table_index)
                try:
                    question.b_dist[target_word] += 1.0
                except KeyError:
                    question.b_dist[target_word] = 1.0
            else:
                question.nb_indices.append(table_index)
                try:
                    question.nb_dist[target_word] += 1.0
                except KeyError:
                    question.nb_dist[target_word] = 1.0

    def frequencies_to_probabilities_and_entropy(self, hashmap):
        """
        Convert frequencies of occurrence to probabilities in place and
        return the Shannon entropy (base 2) of the resulting distribution.

        An empty hashmap yields entropy 0 (the loop never divides).
        """
        frequency_sum = sum(hashmap.values())
        entropy = 0
        for key in hashmap.keys():
            frequency = hashmap[key]
            probability = frequency / frequency_sum
            probability_of_info_gain = (
                probability * math.log(probability, 2)
            )
            # Replace the raw count with the normalized probability.
            hashmap[key] = probability
            # p*log2(p) is negative; negate so entropy accumulates positive.
            entropy += -probability_of_info_gain
        return entropy