def testDiag2Chain(self):
    # Perform the diagonalization for a 2-site chain. Compare
    # an exact diagonalization against scipy.

    # First calculate X,P
    calc = Calculate()
    polygon = sp.array([[0, 0], [2, 0]])
    maple_link = maple.MapleLink("/opt/maple13/bin/maple -tu")
    precision = 25
    X, P = calc.correlations(polygon, maple_link, precision)
    entropy = calc.entropy(X, P, 1, precision, False)

    # Manual diag.
    # [ a, b,
    #   c, d ]
    XP = X * P
    a = XP[0, 0]
    b = XP[0, 1]
    c = XP[1, 0]
    d = XP[1, 1]
    eig_plus = ((a + d) + sympy.mpmath.sqrt((a + d)**2 - 4 * (a * d - b * c))) / 2
    eig_minus = ((a + d) - sympy.mpmath.sqrt((a + d)**2 - 4 * (a * d - b * c))) / 2

    sqrt_eigs = [sympy.mpmath.sqrt(eig_plus), sympy.mpmath.sqrt(eig_minus)]

    S = 0
    for vk in sqrt_eigs:
        S += ((vk + 0.5) * sympy.mpmath.log(vk + 0.5) -
              (vk - 0.5) * sympy.mpmath.log(vk - 0.5))

    # Scipy operates in double, so we check equality up to a tolerance
    bound = 10**(-13)
    eq_(True, abs(S - entropy) < bound)
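For reference, the manual step above is just the trace/determinant closed form for 2x2 eigenvalues, lambda = (tr +/- sqrt(tr^2 - 4*det)) / 2, followed by the bosonic entropy sum over nu_k = sqrt(lambda_k). Below is a minimal, self-contained sketch of the same scipy cross-check, assuming only mpmath and scipy are available (Calculate, maple.MapleLink and eq_ are project-specific, and newer sympy releases no longer bundle mpmath as sympy.mpmath); the matrix is made up for illustration.

# Sketch: closed-form 2x2 eigenvalues vs. scipy (illustrative matrix).
import mpmath
import scipy.linalg

XP = [[2.0, 0.3],
      [0.3, 1.5]]
a, b = XP[0]
c, d = XP[1]

# lambda_pm = (tr +/- sqrt(tr^2 - 4*det)) / 2
tr, det = a + d, a * d - b * c
eig_plus = (tr + mpmath.sqrt(tr**2 - 4 * det)) / 2
eig_minus = (tr - mpmath.sqrt(tr**2 - 4 * det)) / 2

# scipy diagonalizes in double precision, so agreement is only
# expected up to roughly 1e-13, matching the bound in the test.
scipy_eigs = sorted(scipy.linalg.eigvals(XP).real)
assert abs(float(eig_minus) - scipy_eigs[0]) < 1e-13
assert abs(float(eig_plus) - scipy_eigs[1]) < 1e-13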
def __fit_without_prune(self, data, features, target):
    '''
    Build the entire decision tree without pruning
    '''
    # Separate continuous from discrete attributes
    continuous_features = list()
    discrete_features = list()
    for feature in features:
        if len(list(data[feature])) > 0:
            is_continue = self.is_attr_continue(list(data[feature]))
            if is_continue:
                continuous_features.append(feature)
            else:
                discrete_features.append(feature)

    # No continuous attribute: plain ID3 can handle the data
    if not continuous_features:
        return MyID3(self.gain_ratio).fit(data, features, target)

    # Continuous attribute
    # If only one target value exists, return a pure leaf
    entropy_data_target = Calculate.entropy(data[target])
    if entropy_data_target == 0:
        value_list = Calculate.get_unique_data(data, target)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        return Tree(
            Node(None,
                 0.0,  # Entropy must be 0 since only one value exists
                 value_dict,
                 result=data[target][0],
                 is_leaf=True))

    # No attribute left: return a leaf labelled with the majority class
    if len(features) == 0:
        value_list = Calculate.get_unique_data(data, target)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        return Tree(
            Node(None,
                 entropy_data_target,
                 value_dict,
                 result=Calculate.most_label(data[target]),
                 is_leaf=True))

    # Find the best attribute and build the tree recursively
    best_attr = ''
    best_point = 0
    is_discrete = False
    best_splitter = 0
    chosen_edge = ['', '']
    for feature in continuous_features:
        best_threshold = self.find_threshold(data[[feature]], data[[target]])
        if best_threshold[1] > best_point:
            best_attr = str(feature)
            chosen_edge[0] = best_attr + ' > ' + str(best_threshold[0])
            chosen_edge[1] = best_attr + ' <= ' + str(best_threshold[0])
            best_point = best_threshold[1]
            best_splitter = best_threshold[0]
    for feature in discrete_features:
        point = Calculate.info_gain(data[feature], data[target])
        if point > best_point:
            best_point = point
            best_attr = str(feature)
            is_discrete = True

    value_list = Calculate.get_unique_data(data, target)
    value_dict = dict()
    for key, value in value_list.items():
        value_dict[key] = len(value_list[key])
    dtree = Tree(Node(best_attr, best_point, value_dict))

    # Scan all possible values to generate the subtrees
    if is_discrete:
        list_attribute = Calculate.get_unique_data(data, best_attr)
    else:
        list_attribute = Calculate.split_by_threshold(
            data, best_attr, best_splitter)

    i = 0
    for attribute in list_attribute:
        # Bind the subset to a new name so the parent's data is not rebound
        subset = pd.DataFrame(
            data=list_attribute[attribute]).reset_index(drop=True)
        dtree.add_child(self.__fit_without_prune(subset, features, target))
        if is_discrete:
            dtree.children[i].value.edge = attribute
        else:
            dtree.children[i].value.edge = chosen_edge[i]
        i += 1
    return dtree
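The method above leans on several project helpers. As a point of reference, here is a self-contained sketch of the scoring math, assuming Calculate.entropy and Calculate.info_gain follow the textbook definitions and that find_threshold scans midpoints between adjacent sorted values and returns a (threshold, gain) pair, which is how its result is consumed above. The real helpers take single-column DataFrames and may differ in detail; plain Series are used here for brevity.

import numpy as np
import pandas as pd

def entropy(labels):
    # H(T) = -sum_i p_i * log2(p_i) over the label distribution
    probs = labels.value_counts(normalize=True)
    return float(-(probs * np.log2(probs)).sum())

def info_gain(feature, labels):
    # IG(T, a) = H(T) - sum_v |T_v|/|T| * H(T_v)
    weighted = sum(len(subset) / len(labels) * entropy(subset)
                   for _, subset in labels.groupby(feature))
    return entropy(labels) - weighted

def find_threshold(values, labels):
    # Try each midpoint between adjacent sorted values and keep the
    # binary <=/> split with the highest information gain.
    best = (None, 0.0)
    ordered = np.unique(values)
    for lo, hi in zip(ordered, ordered[1:]):
        t = (lo + hi) / 2
        gain = info_gain(values <= t, labels)
        if gain > best[1]:
            best = (t, gain)
    return best

values = pd.Series([64, 65, 68, 69, 70, 71])
labels = pd.Series(['yes', 'no', 'yes', 'yes', 'yes', 'no'])
print(find_threshold(values, labels))  # (best threshold, its gain)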
def fit(self, data, attributes, target_name):
    '''
    Build and return a decision tree using the ID3 algorithm
    '''
    data_target = data[target_name]

    # Data target contains a single label: return a pure leaf
    entropy_data_target = Calculate.entropy(data_target)
    if entropy_data_target == 0:
        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        # Set current_node, info_gain, values
        tree = Tree(
            Node(None,
                 entropy_data_target,
                 value_dict,
                 result=data_target[0],
                 is_leaf=True))
        return tree

    # No attribute left to choose: return a majority-label leaf
    if len(attributes) == 0:
        # Set current_node, info_gain, values
        value_list = Calculate.get_unique_data(data, target_name)
        value_dict = dict()
        for key, value in value_list.items():
            value_dict[key] = len(value_list[key])
        tree = Tree(
            Node(None,
                 entropy_data_target,
                 value_dict,
                 result=Calculate.most_label(data_target),
                 is_leaf=True))
        return tree

    # Find the best attribute to split on, scored by either
    # information gain or gain ratio
    best_attr = ''
    best_point = 0
    for attr in attributes:
        if self.gain_ratio:
            point = Calculate.gain_ratio(data[attr], data_target)
        else:
            point = Calculate.info_gain(data[attr], data_target)
        if point > best_point:
            best_point = point
            best_attr = attr

    value_list = Calculate.get_unique_data(data, target_name)
    value_dict = dict()
    for key, value in value_list.items():
        value_dict[key] = len(value_list[key])

    # Build the decision tree recursively
    dtree = Tree(Node(best_attr, best_point, value_dict))

    # Remove the used attribute; copy the list first so the removal
    # does not leak into sibling branches of the caller
    attributes = list(attributes)
    attributes.remove(best_attr)

    # Scan all possible values to generate the subtrees
    list_attribute = Calculate.get_unique_data(data, best_attr)
    i = 0
    for attribute in list_attribute:
        subset = pd.DataFrame(
            data=list_attribute[attribute]).reset_index(drop=True)
        subset.drop(best_attr, axis=1, inplace=True)
        dtree.add_child(self.fit(subset, attributes, target_name))
        dtree.children[i].value.edge = attribute
        i += 1
    return dtree
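A hypothetical usage sketch to tie the two fitters together: the C4.5 code above falls back to MyID3(self.gain_ratio), so this fit method presumably lives on a MyID3 class whose constructor takes the gain_ratio flag positionally. The toy DataFrame and its column names are invented for illustration.

import pandas as pd

df = pd.DataFrame({
    'outlook': ['sunny', 'sunny', 'overcast', 'rain', 'rain'],
    'windy': ['false', 'true', 'false', 'false', 'true'],
    'play': ['no', 'no', 'yes', 'yes', 'no'],
})

# False = score splits by plain information gain rather than gain ratio
tree = MyID3(False).fit(df, ['outlook', 'windy'], 'play')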