from math import log  # `entropy` and `cut_point_information_gain` are module-level helpers

def MDLPC_criterion(self, data, feature, cut_point):
    '''
    Determines whether a partition is accepted according to the MDLPC criterion
    :param data: dataframe partition of interest
    :param feature: feature of interest
    :param cut_point: proposed cut point
    :return: True/False, whether to accept the partition
    '''
    # get dataframe only with desired attribute and class columns, and split by cut_point
    data_partition = data.copy(deep=True)
    data_left = data_partition[data_partition[feature] <= cut_point]
    data_right = data_partition[data_partition[feature] > cut_point]

    # compute information gain obtained when splitting data at cut_point
    cut_point_gain = cut_point_information_gain(dataset=data_partition, cut_point=cut_point,
                                                feature_label=feature, class_label=self._class_name)

    # compute delta term in MDLPC criterion (Fayyad & Irani, 1993)
    N = len(data_partition)  # number of examples in current partition
    partition_entropy = entropy(data_partition[self._class_name])
    k = len(data_partition[self._class_name].unique())
    k_left = len(data_left[self._class_name].unique())
    k_right = len(data_right[self._class_name].unique())
    entropy_left = entropy(data_left[self._class_name])    # entropies of the two sub-partitions
    entropy_right = entropy(data_right[self._class_name])
    # delta = log2(3^k - 2) - [k*Ent(S) - k1*Ent(S1) - k2*Ent(S2)]
    delta = log(3 ** k - 2, 2) - (k * partition_entropy
                                  - k_left * entropy_left
                                  - k_right * entropy_right)

    # to split or not to split
    gain_threshold = (log(N - 1, 2) + delta) / N
    return cut_point_gain > gain_threshold
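# --- Illustrative sketch (not from the original source): the rule above is the
# Fayyad & Irani (1993) MDLP stopping criterion, which accepts a cut when
# gain > (log2(N - 1) + delta) / N. The toy data and `_toy_entropy` helper
# below are hypothetical stand-ins for the module's own helpers.
from math import log
from collections import Counter

def _toy_entropy(labels):
    # Shannon entropy (base 2) of a sequence of class labels.
    n = len(labels)
    return -sum((c / n) * log(c / n, 2) for c in Counter(labels).values())

labels = ['a'] * 8 + ['b'] * 2        # parent partition, N = 10
left, right = labels[:8], labels[8:]  # a candidate cut that separates the classes

N = len(labels)
k, k1, k2 = len(set(labels)), len(set(left)), len(set(right))
gain = (_toy_entropy(labels)
        - (len(left) / N) * _toy_entropy(left)
        - (len(right) / N) * _toy_entropy(right))
delta = log(3 ** k - 2, 2) - (k * _toy_entropy(labels)
                              - k1 * _toy_entropy(left)
                              - k2 * _toy_entropy(right))
print(gain > (log(N - 1, 2) + delta) / N)  # True: the clean split is accepted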
def Gain(S, A):
    '''Information gain of splitting the examples S on attribute A.
    `masterkey` (class column), `countPN`, and the two-argument `entropy`
    are module-level names.'''
    lenS = len(S)
    Spos, Sneg = countPN(S, masterkey)
    S_entropy = entropy(Spos, Sneg)

    # tally positive/negative class counts for each observed value of A
    values = {}
    for s in S:
        value = s[A]
        if value not in values:
            values[value] = [0, 0]
        if s[masterkey] == "yes":
            values[value][0] += 1
        elif s[masterkey] == "no":
            values[value][1] += 1

    # weighted entropy of the subsets induced by A
    weighted_entropy = 0.0
    for pos, neg in values.values():
        proportion = (pos + neg) / lenS
        weighted_entropy += proportion * entropy(pos, neg)

    return S_entropy - weighted_entropy
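# --- Usage sketch: `masterkey`, `countPN`, and the two-argument `entropy` are
# module-level names assumed by Gain() above; minimal stand-ins and toy
# records are supplied here purely for illustration.
from math import log

masterkey = "play"  # hypothetical name of the class column

def entropy(pos, neg):
    # Binary Shannon entropy (base 2) from positive/negative counts.
    total = pos + neg
    ent = 0.0
    for count in (pos, neg):
        if count:
            p = count / total
            ent -= p * log(p, 2)
    return ent

def countPN(S, key):
    # Count "yes"/"no" class labels under the given key.
    pos = sum(1 for s in S if s[key] == "yes")
    return pos, len(S) - pos

records = [
    {"outlook": "sunny",    "play": "no"},
    {"outlook": "sunny",    "play": "no"},
    {"outlook": "overcast", "play": "yes"},
    {"outlook": "rain",     "play": "yes"},
]
print(Gain(records, "outlook"))  # 1.0: "outlook" fully separates the classes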
from collections import defaultdict

def compute_weighted_entropy(attribute_vals, ys):
    '''Entropy of the labels `ys`, weighted by the size of each group
    induced by the attribute's match values.'''
    # pivot labels by match value
    ys_by_match = defaultdict(list)
    for is_match, y in zip(attribute_vals, ys):
        ys_by_match[is_match].append(y)

    # compute entropy of each group, weighted by group size
    wt_entropy = 0.0
    num_items = float(len(ys))
    for ismatch, lst in ys_by_match.items():
        ent = entropy(lst)
        wt = len(lst) / num_items
        wt_entropy += wt * ent
    return wt_entropy
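# --- Usage sketch: the list-based `entropy` helper is assumed by the function
# above; a minimal stand-in (base-2 Shannon entropy) and toy inputs are given
# here for illustration only.
from math import log
from collections import Counter

def entropy(labels):
    n = len(labels)
    return -sum((c / n) * log(c / n, 2) for c in Counter(labels).values())

attribute_vals = [True, True, False, False, False]
ys = ['spam', 'spam', 'ham', 'ham', 'spam']
print(compute_weighted_entropy(attribute_vals, ys))  # ~0.551 bits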
import numpy as np
from queue import Queue

def set_entropy(self):
    '''Breadth-first pass over the tree: each node's entropy is computed
    from the count distribution of its children (leaves get entropy 0).
    `entropy` is a module-level helper (e.g. scipy.stats.entropy).'''
    queue = Queue()
    queue.put(self.root)
    while not queue.empty():
        node = queue.get()
        cnts = []
        for child in node.children.values():
            queue.put(child)
            cnts.append(child.count)
        if cnts:
            node.entropy = entropy(np.array(cnts))
        else:
            node.entropy = 0
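# --- Minimal sketch with hypothetical node/tree classes (the originals are not
# shown in this snippet). Assumes `entropy` is scipy.stats.entropy, which
# normalizes a vector of counts before computing entropy.
from scipy.stats import entropy

class _Node:
    def __init__(self, count):
        self.count = count
        self.children = {}
        self.entropy = None

class _Tree:
    def __init__(self, root):
        self.root = root

root = _Node(10)
root.children = {'a': _Node(7), 'b': _Node(3)}
set_entropy(_Tree(root))  # call the plain function above with the stand-in tree
print(root.entropy)       # entropy of the [7, 3] child-count distribution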
])
testCol = ['EGITIM', 'YAS', 'CINSIYET']  # education, age, gender
testtData = pd.DataFrame(testManual, columns=testCol)
columns2 = ['EGITIM', 'YAS', 'CINSIYET', 'KABUL']  # ... plus the class column (acceptance)
testDataManual = pd.DataFrame(testDataManual, columns=columns2)

# alternative datasets from earlier experiments:
# data = 'Qualitative_Bankruptcy.data.csv'
# cov19 = 'TimeAge.csv'
# df = pd.DataFrame(np.array(pd.read_csv(data)))
# columns = ['Industrial Risk', 'Management Risk', 'Financial Flexibility', 'Credibility',
#            'Competitiveness', 'Operating Risk', 'Class']
# blueWins = df[['blueWins']]
# df.drop(columns=['blueWins'], inplace=True)
# df.insert(len(df.columns), 'blueWins', blueWins)
# print(df.head())

e = entropy(testDataManual, 'karci', resultCol='KABUL', column_list=columns2)
e.calc()
e.result(testtData)