def get_gini_of_split(self, Y1, Y2):
    """Return the size-weighted average of ``gini(Y1)`` and ``gini(Y2)``.

    Each side of the split contributes its ``gini`` value in proportion to
    the share of samples it holds, so a lower result means a better split.

    Args:
        Y1: labels that fall on the first side of the split (any sized
            container accepted by ``gini``).
        Y2: labels that fall on the second side of the split.

    Returns:
        ``len(Y1) / n * gini(Y1) + len(Y2) / n * gini(Y2)`` with
        ``n = len(Y1) + len(Y2)``, or ``0.0`` when both sides are empty.
    """
    size1, size2 = len(Y1), len(Y2)
    total = size1 + size2
    if total == 0:
        # No samples on either side: there is nothing to weigh, and dividing
        # by the total would raise ZeroDivisionError.
        return 0.0

    gini1 = gini(Y1)
    gini2 = gini(Y2)

    # float() forces true division even where ``/`` on two ints truncates
    # (Python 2 without ``from __future__ import division``).
    total = float(total)
    return size1 / total * gini1 + size2 / total * gini2
def occupation_gini(matrix, motif, G):
    """Return the ``gini`` value of the normalized per-site weights of a motif.

    Every site in ``motif`` is scored with ``score_seq(matrix, site)`` and
    turned into the weight ``exp(-score)``.  The weights are divided by their
    sum plus the background term ``Zb_from_matrix(matrix, G)``, and the
    resulting list is handed to ``gini``.

    Args:
        matrix: scoring matrix forwarded to ``score_seq`` and
            ``Zb_from_matrix``.
        motif: iterable of sites to score.
        G: second argument of ``Zb_from_matrix`` (presumably the genome
            length -- confirm against that helper).
    """
    # Score every site first, then exponentiate, mirroring the two passes of
    # the original implementation.
    site_scores = []
    for site in motif:
        site_scores.append(score_seq(matrix, site))

    site_weights = []
    for score in site_scores:
        site_weights.append(exp(-score))

    background_term = Zb_from_matrix(matrix, G)
    normalizer = sum(site_weights) + background_term

    occupancies = [weight / normalizer for weight in site_weights]
    return gini(occupancies)
def run_once(self, times):
    """Run the simulation for ``times`` steps and return per-step statistics.

    The model is reset first.  Each step updates wages, resets the
    government accounts, collects taxes and redistributes revenue, then
    optionally pays the negative income tax and/or the basic income
    depending on ``self.param``.  The overall incomes are then sampled and
    summarized before the population is updated.

    Args:
        times: number of simulation steps to run.

    Returns:
        A DataFrame with one row per step and the columns ``alpha``,
        ``x_min``, ``error`` (from ``IncomeDist.estimate_params``), ``gini``,
        ``mean``, ``pop``, ``gdp``, ``rev``, ``exp``, ``btm`` and ``top``.
    """
    column_names = ('alpha', 'x_min', 'error', 'gini', 'mean', 'pop',
                    'gdp', 'rev', 'exp', 'btm', 'top')
    # One list per output column, created in the final column order.
    history = {name: [] for name in column_names}

    self.reset()
    for _ in range(times):
        # Advance the economy by one step.
        self.society.update_all_wages()
        self.government.reset_accounts()
        self.government.collect_taxes()
        self.government.redistribute_revenue(self.param.bias)
        if self.param.negative_tax:
            self.government.pay_nit(self.param.exemption_quantile,
                                    self.param.subsidy_rate)
        if self.param.basic_income:
            self.government.pay_ubi(self.param.fixed_benefit,
                                    self.param.variable_benefit)

        # Summarize the income distribution reached after this step.
        incomes = self.society.get_overall_incomes()
        alpha, x_min, error = IncomeDist.estimate_params(incomes)
        history['alpha'].append(alpha)
        history['x_min'].append(x_min)
        history['error'].append(error)
        history['gini'].append(utils.gini(incomes))

        total_income = sum(incomes)
        history['gdp'].append(total_income)
        history['mean'].append(total_income / self.society.size)
        history['rev'].append(self.government.revenue)
        history['exp'].append(self.government.expenses)
        history['pop'].append(self.society.size)

        bottom_share, top_share = utils.bottom_top(incomes)
        history['btm'].append(bottom_share)
        history['top'].append(top_share)

        # Population changes only take effect for the next step.
        self.society.update_population()

    return DataFrame.from_dict(history, orient='columns')
def split(self, x, y, depth, tree_idx):
    """Grow the subtree stored at ``self.tree[tree_idx]`` from ``x`` and ``y``.

    The node becomes a leaf ``[None, None, None, None, label]`` holding the
    most common label when ``depth`` exceeds ``self.max_depth`` or when
    ``self.check_terminate`` accepts the labels.  Otherwise every midpoint
    between consecutive sorted values of each candidate feature is tried as
    a threshold, the one with the lowest size-weighted impurity is stored as
    ``[feature, threshold, greater_idx, others_idx, None]`` and both halves
    are split recursively.

    Differences from the previous implementation:
      * thresholds that leave one side empty are never selected (they used
        to win ties and trigger another search round);
      * when no feature of ``x`` has more than one distinct value the node
        becomes a leaf instead of searching forever;
      * an unknown ``self.criterion`` raises ``ValueError`` instead of
        failing later on ``x[None]``.

    Args:
        x: feature table indexed by feature name and by boolean mask
            (presumably a pandas DataFrame -- confirm against callers).
        y: labels aligned with the rows of ``x``; must provide
            ``to_numpy()`` and boolean-mask indexing.
        depth: depth of this node, compared against ``self.max_depth``.
        tree_idx: index of the slot in ``self.tree`` to fill.

    Raises:
        ValueError: if a split is needed and ``self.criterion`` is neither
            ``'gini'`` nor ``'entropy'``.
    """

    def make_leaf():
        # A leaf predicts the most common label among its samples.
        majority_label = Counter(y.to_numpy()).most_common(1)[0][0]
        self.tree[tree_idx] = [None, None, None, None, majority_label]

    def weighted_impurity(larger_y, others_y):
        # Impurity of both sides, weighted by the number of samples in each.
        measure = utils.gini if self.criterion == 'gini' else utils.entropy
        larger_n, others_n = len(larger_y), len(others_y)
        total = larger_n * measure(larger_y.to_numpy()) \
            + others_n * measure(others_y.to_numpy())
        return total / (larger_n + others_n)

    depth_exceeded = self.max_depth is not None and depth > self.max_depth
    if depth_exceeded or self.check_terminate(y.to_numpy()):
        make_leaf()
        return

    if self.criterion not in ('gini', 'entropy'):
        raise ValueError(
            'unsupported split criterion: {!r}'.format(self.criterion))

    while True:
        best_split_feature = None
        best_split_value = None
        best_criteria = None

        for feature_name in self.pickup_feature_names(x.keys()):
            column = x[feature_name]
            ordered = column.sort_values().to_numpy()
            previous_value = None
            for idx in range(1, len(ordered)):
                split_value = (ordered[idx - 1] + ordered[idx]) / 2
                # Sorted input yields non-decreasing midpoints, so a repeated
                # threshold is always adjacent; re-evaluating it cannot beat
                # the strict ``<`` comparison below.
                if idx > 1 and split_value == previous_value:
                    continue
                previous_value = split_value

                larger_y = y[column > split_value]
                others_y = y[column <= split_value]
                if len(larger_y) == 0 or len(others_y) == 0:
                    # Degenerate threshold: it separates nothing.
                    continue

                criteria = weighted_impurity(larger_y, others_y)
                if best_criteria is None or criteria < best_criteria:
                    best_criteria = criteria
                    best_split_feature = feature_name
                    best_split_value = split_value

        if best_split_feature is not None:
            break

        # No usable threshold among the features offered in this round.
        # If no feature at all can separate the samples, stop here.
        if all(x[name].nunique() <= 1 for name in x.keys()):
            make_leaf()
            return
        # Otherwise ask for another set of candidate features.
        # NOTE(review): this still loops forever if pickup_feature_names
        # keeps returning only constant features -- presumably it samples
        # randomly; confirm.

    larger_mask = x[best_split_feature] > best_split_value
    others_mask = x[best_split_feature] <= best_split_value

    # Children are appended at the end of the flat node list.
    greater_idx = len(self.tree)
    others_idx = greater_idx + 1
    self.tree[tree_idx] = [
        best_split_feature, best_split_value, greater_idx, others_idx, None
    ]
    self.tree.append([])
    self.tree.append([])

    self.split(x[larger_mask], y[larger_mask], depth + 1, greater_idx)
    self.split(x[others_mask], y[others_mask], depth + 1, others_idx)
corr = sess.run(model.num_correct, feed_dict={ model.input: [test_image], model.label: [original_label] }) total_corr += corr IG = integrated_gradients(sess, reference_image, test_image, original_label, model, gradient_func='output_input_gradient', steps=num_steps) IG_vector = IG.flatten() gini_v = gini(IG_vector) total_gini += gini_v log_file.write('%d %.4f\n' % (corr, gini_v)) acc = total_corr / num_eval_examples adv_gini = total_gini / num_eval_examples print('Accuracy: {:.2f}%'.format(100 * acc)) print('Average Gini: {:.4f}'.format(adv_gini)) log_file.close()
saver.restore(sess, checkpoint) e = shap.DeepExplainer((model.input, model.output), reference_images) for i in range(num_eval_examples): test_image = X[i] original_label = y[i] corr = sess.run(model.num_correct, feed_dict={ model.input: [test_image], model.label: [original_label] }) total_corr += corr shap_value = e.shap_values(X[i:i + 1])[0] shap_value_vector = shap_value.flatten() gini_v = gini(shap_value_vector) total_gini += gini_v log_file.write('%d %.4f\n' % (corr, gini_v)) acc = total_corr / num_eval_examples avg_gini = total_gini / num_eval_examples print('Accuracy: {:.2f}%'.format(100 * acc)) print('Average Gini: {:.4f}'.format(avg_gini)) log_file.close()
import dash from dash.dependencies import Input, Output import dash_core_components as dcc import dash_html_components as html import plotly.graph_objs as go import plotly.figure_factory as ff import numpy as np from utils import gini app = dash.Dash('') data1 = np.random.power(50, 500) data2 = np.random.uniform(0, 1, 10000) print 'Gini of Data1: ', gini(data1) print 'Gini of Data2: ', gini(data2) trace0 = go.Scatter(x=[0, 1], y=[0, 1], name='Line of Ideal Centralization', mode='lines') trace1 = go.Histogram(x=data1, histnorm='probability density', name='Lorenz Curve', cumulative=dict(enabled=True)) data = [trace0, trace1] layout = go.Layout(title='ETH Mining Decentralization (by Block Reward)', showlegend=True,
y = data['Class'].values print('Testing entropy, information gain, gain ratio...') assert (utils.entropy([1, 0, 0, 1, 0, 1]) == 1) assert (utils.entropy([1, 1, 1]) == 0) assert (utils.entropy([0]) == 0) outlook_index = np.where(original_attributes == 'Outlook')[0][0] Xs, ys, d = utils.split_categ(X, y, outlook_index, list(set(X[:, outlook_index]))) assert (np.isclose(utils.information_gain(y, ys), 0.246, rtol=1e-2)) assert (np.isclose(utils.gain_ratio(y, ys, y), 0.156, rtol=1e-2)) print('Testing gini index...') assert (utils.gini_impurity([1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0]) == 0.5) assert (utils.gini_impurity([0, 0, 0, 0, 0]) == 0) print('Testing gini...') assert (utils.gini([0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1], [[0, 1, 0, 0], [1, 1, 0, 0, 1, 1, 0, 1]]) == 0.0625) print('Testing Decision Tree...') m = dt.DecisionTreeClassifier(missing_branch=False) m.fit(X, y) m.to_pdf(original_attributes, out='tree1.pdf') assert (m.predict((['OVERCAST', 80, 90, 'T'])) == 'PLAY') assert (m.predict(['RAIN', 80, 50, 'F']) == 'PLAY') assert (m.predict(['RAIN', 80, 70, 'T']) == "DON'T PLAY") assert (m.predict(['SUNNY', 50, 50, 'T']) == 'PLAY') assert (m.predict(['SUNNY', 50, 91, 'T']) == "DON'T PLAY") assert (m.predict([np.nan, 50, 91, 'T']) == "DON'T PLAY") print('Testing Decision Tree with missing values (branch_nan = True)...') m = dt.DecisionTreeClassifier(missing_branch=True) # data, attributes, categories = read.readData(data_path = '../Dados/Test_with_nan.csv', class_name='Class',