def main(): print("-- XGBoost --") from sklearn import preprocessing from sklearn.preprocessing import LabelEncoder with pyRAPL.Measurement('Read_data', output=csv_output): df = pd.read_csv('/home/gabi/Teste/BaseSintetica/1k_5att.csv') X = df.iloc[:, :-1].values y = df.iloc[:, -1].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, seed=2) csv_output.save() clf = XGBoost() clf.fit(X_train, y_train) y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy)
def predict_value(self, x, tree=None):
    """ Do a recursive search down the tree and make a prediction of
    the data sample by the value of the leaf that we end up at """
    with pyRAPL.Measurement('Predict_value', output=csv_output):
        if tree is None:
            tree = self.root

        # If we have a value (i.e. we're at a leaf) => use it as the prediction
        if tree.value is not None:
            prediction = tree.value
        else:
            # Choose the feature that we will test
            feature_value = x[tree.feature_i]

            # Determine if we will follow left or right branch
            branch = tree.false_branch
            if isinstance(feature_value, (int, float)):
                if feature_value >= tree.threshold:
                    branch = tree.true_branch
            elif feature_value == tree.threshold:
                branch = tree.true_branch

            # Test subtree
            prediction = self.predict_value(x, branch)
    csv_output.save()
    return prediction
def fit(self, X, y, loss=None):
    """ Build decision tree """
    with pyRAPL.Measurement('Fit_1', output=csv_output):
        self.one_dim = len(np.shape(y)) == 1
        self.root = self._build_tree(X, y)
        self.loss = None
    csv_output.save()
def _split(self, y):
    """ y contains y_true in left half of the middle column and
    y_pred in the right half. Split and return the two matrices """
    with pyRAPL.Measurement('Split', output=csv_output):
        col = int(np.shape(y)[1] / 2)
        y, y_pred = y[:, :col], y[:, col:]
    csv_output.save()
    return y, y_pred
def __init__(self, params):
    self.params = params
    try:
        pyRAPL.setup()
        self.rapl_enabled = True
        self.meter_rapl = pyRAPL.Measurement("bar")
        self.meter_rapl.begin()
    except Exception:
        # RAPL is unavailable (e.g. unsupported platform or missing permissions)
        self.rapl_enabled = False
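# The constructor above starts a meter but never stops it. A hedged sketch of
# the matching teardown one would expect elsewhere in the class (the method
# name `stop_rapl` is an assumption, not from the source):
def stop_rapl(self):
    if self.rapl_enabled:
        self.meter_rapl.end()
        return self.meter_rapl.result  # pyRAPL.Result: energy in micro-joules
    return None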
def fit(self, features, target):
    with pyRAPL.Measurement('Fit', output=csv_output):
        self.root = CART()
        if self.tree == 'cls':
            self.root._grow_tree(features, target, self.criterion)
        else:
            self.root._grow_tree(features, target, 'mse')
        self.root._prune(self.prune, self.max_depth,
                         self.min_criterion, self.root.n_samples)
    csv_output.save()
def _calculate_information_gain(self, y, y1, y2):
    with pyRAPL.Measurement('Information_gain', output=csv_output):
        # Calculate information gain
        p = len(y1) / len(y)
        entropy = calculate_entropy(y)
        info_gain = entropy - p * calculate_entropy(y1) \
                            - (1 - p) * calculate_entropy(y2)
    csv_output.save()
    return info_gain
def _predict(self, d):
    with pyRAPL.Measurement('Predict', output=csv_output):
        if self.feature is not None:
            if d[self.feature] <= self.threshold:
                result = self.left._predict(d)
            else:
                result = self.right._predict(d)
        else:
            result = self.label
    csv_output.save()
    return result
def _approximate_update(self, y):
    with pyRAPL.Measurement('Approximate_update', output=csv_output):
        # y split into y, y_pred
        y, y_pred = self._split(y)
        # Newton's Method
        gradient = np.sum(y * self.loss.gradient(y, y_pred), axis=0)
        hessian = np.sum(self.loss.hess(y, y_pred), axis=0)
        update_approximation = gradient / hessian
    csv_output.save()
    return update_approximation
def _gain_by_taylor(self, y, y1, y2):
    with pyRAPL.Measurement('Gain_by_taylor', output=csv_output):
        # Split
        y, y_pred = self._split(y)
        y1, y1_pred = self._split(y1)
        y2, y2_pred = self._split(y2)

        true_gain = self._gain(y1, y1_pred)
        false_gain = self._gain(y2, y2_pred)
        gain = self._gain(y, y_pred)
    csv_output.save()
    return true_gain + false_gain - gain
def _majority_vote(self, y):
    with pyRAPL.Measurement('Majority_vote', output=csv_output):
        most_common = None
        max_count = 0
        for label in np.unique(y):
            # Count number of occurrences of samples with label
            count = len(y[y == label])
            if count > max_count:
                most_common = label
                max_count = count
    csv_output.save()
    return most_common
def _show_tree(self, depth, cond):
    with pyRAPL.Measurement('Show_tree', output=csv_output):
        base = ' ' * depth + cond
        if self.feature is not None:
            print(base + 'if X[' + str(self.feature) + '] <= ' + str(self.threshold))
            self.left._show_tree(depth + 1, 'then ')
            self.right._show_tree(depth + 1, 'else ')
        else:
            print(base + '{value: ' + str(self.label) +
                  ', samples: ' + str(self.n_samples) + '}')
    csv_output.save()
def fit(self, X, y):
    with pyRAPL.Measurement('Fit_3', output=csv_output):
        y = to_categorical(y)

        y_pred = np.zeros(np.shape(y))
        for i in self.bar(range(self.n_estimators)):
            tree = self.trees[i]
            y_and_pred = np.concatenate((y, y_pred), axis=1)
            tree.fit(X, y_and_pred)
            update_pred = tree.predict(X)
            y_pred -= np.multiply(self.learning_rate, update_pred)
    csv_output.save()
def measure_power(self, label):
    meter = pyRAPL.Measurement(label=label)
    meter.begin()
    time.sleep(self.MEASURE_TIME)
    meter.end()
    m_energy = meter._results.pkg[self.SOCKET]  # micro-J
    m_time = meter._results.duration            # micro-s
    power = m_energy / m_time                   # watts
    return power
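# Usage sketch for measure_power: pkg energy is in micro-joules and duration
# in micro-seconds, so their ratio is already watts. `PowerMonitor` and the
# attribute values are assumed names for the class that owns the method above:
monitor = PowerMonitor()
monitor.MEASURE_TIME = 1.0  # seconds to sleep while the meter runs
monitor.SOCKET = 0          # CPU package to read
print("idle power: %.2f W" % monitor.measure_power('idle'))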
def _split_tree(self, features, target, criterion):
    with pyRAPL.Measurement('Split_tree', output=csv_output):
        features_l = features[features[:, self.feature] <= self.threshold]
        target_l = target[features[:, self.feature] <= self.threshold]
        self.left = CART()
        self.left.depth = self.depth + 1
        self.left._grow_tree(features_l, target_l, criterion)

        features_r = features[features[:, self.feature] > self.threshold]
        target_r = target[features[:, self.feature] > self.threshold]
        self.right = CART()
        self.right.depth = self.depth + 1
        self.right._grow_tree(features_r, target_r, criterion)
    csv_output.save()
def rapl_power(label, powertime, sockets):
    meter = pyRAPL.Measurement(label=label)
    # Retry until the meter returns a valid package reading
    while meter._results is None or meter._results.pkg is None:
        meter.begin()
        time.sleep(powertime)
        meter.end()
    results = {}
    m_time = meter._results.duration  # micro-s
    for skt in sockets:
        m_energy = meter._results.pkg[skt]  # micro-J
        results[skt] = m_energy / m_time    # watts
    return results
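# Usage sketch for rapl_power (socket ids are machine-dependent; the retry
# loop above guards against an occasional empty first reading):
pyRAPL.setup()
watts = rapl_power('baseline', powertime=1.0, sockets=[0])
print(watts)  # e.g. {0: 14.7} -- package power in watts per socket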
def before(self):
    try:
        pyRAPL.setup()
        self.meter = pyRAPL.Measurement('bar')
        self.meter.begin()
        self.successful = True
    except FileNotFoundError:
        logging.warning(
            "RAPL file not found. Perhaps you are using a platform that does not support RAPL (for example Windows)"
        )
        self.successful = False
    except PermissionError:
        logging.warning(
            "PermissionError occurred while reading RAPL file. Fix with \"sudo chmod -R a+r /sys/class/powercap/intel-rapl\""
        )
        self.successful = False
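# The before() hook only starts the meter; a matching after() hook would stop
# it and collect the result. A hedged sketch (everything beyond self.meter and
# self.successful is an assumption):
def after(self):
    if not self.successful:
        return None
    self.meter.end()
    return self.meter.result  # pyRAPL.Result with pkg/dram energy readings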
def _calc_impurity(self, criterion, target):
    with pyRAPL.Measurement('Calc_impurity', output=csv_output):
        if criterion == 'gini':
            impurity = 1.0 - sum([(float(len(target[target == c])) /
                                   float(target.shape[0]))**2.0
                                  for c in np.unique(target)])
        elif criterion == 'mse':
            impurity = np.mean((target - np.mean(target))**2.0)
        else:
            entropy = 0.0
            for c in np.unique(target):
                p = float(len(target[target == c])) / target.shape[0]
                if p > 0.0:
                    entropy -= p * np.log2(p)
            impurity = entropy
    csv_output.save()
    return impurity
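# Quick sanity check of the three impurity formulas on a toy target vector,
# assuming the CART class above and the pyRAPL setup sketched earlier:
import numpy as np

node = CART()
target = np.array([0, 0, 1, 1])
node._calc_impurity('gini', target)     # 1 - (0.5**2 + 0.5**2)   = 0.5
node._calc_impurity('entropy', target)  # -2 * 0.5 * log2(0.5)    = 1.0
node._calc_impurity('mse', target)      # mean((target - 0.5)**2) = 0.25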
def _grow_tree(self, features, target, criterion='gini'):
    with pyRAPL.Measurement('Grow_tree', output=csv_output):
        self.n_samples = features.shape[0]

        if len(np.unique(target)) == 1:
            # Pure node => make it a leaf
            self.label = target[0]
        else:
            best_gain = 0.0
            best_feature = None
            best_threshold = None

            if criterion in {'gini', 'entropy'}:
                self.label = max([(c, len(target[target == c]))
                                  for c in np.unique(target)],
                                 key=lambda x: x[1])[0]
            else:
                self.label = np.mean(target)

            impurity_node = self._calc_impurity(criterion, target)

            for col in range(features.shape[1]):
                feature_level = np.unique(features[:, col])
                # Candidate thresholds: midpoints between consecutive values
                thresholds = (feature_level[:-1] + feature_level[1:]) / 2.0

                for threshold in thresholds:
                    target_l = target[features[:, col] <= threshold]
                    impurity_l = self._calc_impurity(criterion, target_l)
                    n_l = float(target_l.shape[0]) / self.n_samples

                    target_r = target[features[:, col] > threshold]
                    impurity_r = self._calc_impurity(criterion, target_r)
                    n_r = float(target_r.shape[0]) / self.n_samples

                    impurity_gain = impurity_node - (n_l * impurity_l +
                                                     n_r * impurity_r)
                    if impurity_gain > best_gain:
                        best_gain = impurity_gain
                        best_feature = col
                        best_threshold = threshold

            self.feature = best_feature
            self.gain = best_gain
            self.threshold = best_threshold
            self._split_tree(features, target, criterion)
    csv_output.save()
def predict(self, X):
    with pyRAPL.Measurement('Predict_2', output=csv_output):
        y_pred = None
        # Make predictions
        for tree in self.trees:
            # Estimate gradient and update prediction
            update_pred = tree.predict(X)
            if y_pred is None:
                y_pred = np.zeros_like(update_pred)
            y_pred -= np.multiply(self.learning_rate, update_pred)

        # Turn into probability distribution (Softmax)
        y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1, keepdims=True)
        # Set label to the value that maximizes probability
        y_pred = np.argmax(y_pred, axis=1)
    csv_output.save()
    return y_pred
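# Minimal numeric check of the softmax step used above: each row of raw
# scores becomes a probability distribution, and argmax picks the label.
import numpy as np

scores = np.array([[2.0, 1.0, 0.1]])
probs = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)
print(probs.sum(axis=1))         # [1.] -- rows sum to one
print(np.argmax(probs, axis=1))  # [0]  -- highest-scoring class wins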
def predict(self, X):
    with pyRAPL.Measurement('Predict_2', output=csv_output):
        y_preds = np.empty((X.shape[0], len(self.trees)))
        # Let each tree make a prediction on the data
        for i, tree in enumerate(self.trees):
            # Indices of the features that the tree has trained on
            idx = tree.feature_indices
            # Make a prediction based on those features
            prediction = tree.predict(X[:, idx])
            y_preds[:, i] = prediction

        y_pred = []
        # For each sample
        for sample_predictions in y_preds:
            # Select the most common class prediction
            y_pred.append(np.bincount(sample_predictions.astype('int')).argmax())
    csv_output.save()
    return y_pred
def classification_example():
    with pyRAPL.Measurement('Read_data', output=csv_output):
        df = pd.read_csv('/home/gabrieli/Documentos/BaseSintetica/1k_5att.csv')
        #df = df.apply(LabelEncoder().fit_transform)
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    csv_output.save()

    cls = CART(tree='cls', criterion='gini', prune='depth')
    cls.fit(X_train, y_train)
    cls.print_tree()

    pred = cls.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, pred))
def print_tree(self, tree=None, indent=" "):
    """ Recursively print the decision tree """
    with pyRAPL.Measurement('Print_tree', output=csv_output):
        if not tree:
            tree = self.root

        # If we're at a leaf => print the label
        if tree.value is not None:
            print(tree.value)
        # Go deeper down the tree
        else:
            # Print test
            print("%s:%s? " % (tree.feature_i, tree.threshold))
            # Print the true scenario
            print("%sT->" % (indent), end="")
            self.print_tree(tree.true_branch, indent + indent)
            # Print the false scenario
            print("%sF->" % (indent), end="")
            self.print_tree(tree.false_branch, indent + indent)
    csv_output.save()
def main(): print("-- Classification Tree --") with pyRAPL.Measurement('Read_data', output=csv_output): dataset = pd.read_csv('/home/gabi/Teste/BaseSintetica/1k_5att.csv') X = dataset.iloc[:, 0:5].values y = dataset.iloc[:, 5].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) csv_output.save() clf = ClassificationTree() clf.fit(X_train, y_train) y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print("Accuracy:", accuracy)
def test_context_measure(fs_one_socket):
    """
    Test to measure the energy consumption of a function using the Measurement
    class as a context manager

    - launch the measure
    - write a new value to the RAPL power measurement api file
    - launch a function
    - end the measure

    Test if:
    - the energy consumption measured is the delta between the first and the
      last value in the RAPL power measurement file
    """
    pyRAPL.setup()
    out = dummyOutput()
    with pyRAPL.Measurement('toto', output=out):
        measurable_function(1)

    assert out.data.pkg == [(POWER_CONSUMPTION_PKG - PKG_0_VALUE)]
    assert out.data.dram == [(POWER_CONSUMPTION_DRAM - DRAM_0_VALUE)]
def test_normal_measure_bench(fs_one_socket):
    """
    Test to measure the energy consumption of a function using the Measurement
    class with explicit begin()/end() calls

    - launch the measure
    - write a new value to the RAPL power measurement api file
    - launch a function
    - end the measure

    Test if:
    - the energy consumption measured is the delta between the first and the
      last value in the RAPL power measurement file
    """
    pyRAPL.setup()
    measure = pyRAPL.Measurement('toto')
    measure.begin()
    measurable_function(1)
    measure.end()

    assert measure.result.pkg == [(POWER_CONSUMPTION_PKG - PKG_0_VALUE)]
    assert measure.result.dram == [(POWER_CONSUMPTION_DRAM - DRAM_0_VALUE)]
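# Both tests rely on a dummyOutput helper that is not shown. A plausible
# minimal sketch (assumption: an in-memory pyRAPL output that keeps the last
# Result so the assertions can inspect it):
import pyRAPL.outputs

class dummyOutput(pyRAPL.outputs.Output):
    def add(self, result):
        # Keep the Result handed over by the Measurement for later assertions
        self.data = result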
def regression_example():
    with pyRAPL.Measurement('Read_data', output=csv_output):
        df = pd.read_csv('/home/gabrieli/Documentos/BaseSintetica/1k_5att.csv')
        #df = df.apply(LabelEncoder().fit_transform)
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    csv_output.save()

    # Fit regression model
    reg = CART(tree='reg', criterion='mse', prune='depth')
    reg.fit(X_train, y_train)
    reg.print_tree()

    pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    print("MSE:", mse)
def _prune(self, method, max_depth, min_criterion, n_samples):
    with pyRAPL.Measurement('Prune', output=csv_output):
        if self.feature is not None:
            self.left._prune(method, max_depth, min_criterion, n_samples)
            self.right._prune(method, max_depth, min_criterion, n_samples)

            pruning = False
            if method == 'impurity' and self.left.feature is None \
                    and self.right.feature is None:
                if (self.gain * float(self.n_samples) / n_samples) < min_criterion:
                    pruning = True
            elif method == 'depth' and self.depth >= max_depth:
                pruning = True

            if pruning is True:
                self.left = None
                self.right = None
                self.feature = None
    csv_output.save()
def fit(self, X, y):
    with pyRAPL.Measurement('Fit_3', output=csv_output):
        n_features = np.shape(X)[1]
        # If max_features has not been defined => select it as sqrt(n_features)
        if not self.max_features:
            self.max_features = int(math.sqrt(n_features))

        # Choose one random subset of the data for each tree
        subsets = get_random_subsets(X, y, self.n_estimators)

        for i in self.progressbar(range(self.n_estimators)):
            X_subset, y_subset = subsets[i]
            # Feature bagging (select random subsets of the features)
            idx = np.random.choice(range(n_features),
                                   size=self.max_features,
                                   replace=True)
            # Save the indices of the features for prediction
            self.trees[i].feature_indices = idx
            # Choose the features corresponding to the indices
            X_subset = X_subset[:, idx]
            # Fit the tree to the data
            self.trees[i].fit(X_subset, y_subset)
    csv_output.save()
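# Note the replace=True in the feature draw above: with replacement, the same
# feature index can be selected more than once for a given tree. For example:
import numpy as np

idx = np.random.choice(range(10), size=3, replace=True)
print(idx)  # e.g. [7 2 7] -- duplicate indices are possible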
def _build_tree(self, X, y, current_depth=0):
    """ Recursive method which builds out the decision tree and splits X and
    respective y on the feature of X which (based on impurity) best separates
    the data """
    with pyRAPL.Measurement('Build_tree', output=csv_output):
        largest_impurity = 0
        best_criteria = None  # Feature index and threshold
        best_sets = None      # Subsets of the data

        # Check if expansion of y is needed
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # Add y as last column of X
        Xy = np.concatenate((X, y), axis=1)
        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            # Calculate the impurity for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the impurity
                for threshold in unique_values:
                    # Divide X and y depending on if the feature value of X at
                    # index feature_i meets the threshold
                    Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)

                    if len(Xy1) > 0 and len(Xy2) > 0:
                        # Select the y-values of the two sets
                        y1 = Xy1[:, n_features:]
                        y2 = Xy2[:, n_features:]

                        # Calculate impurity
                        impurity = self._impurity_calculation(y, y1, y2)

                        # If this threshold resulted in a higher information gain
                        # than previously recorded, save the threshold value and
                        # the feature index
                        if impurity > largest_impurity:
                            largest_impurity = impurity
                            best_criteria = {"feature_i": feature_i,
                                             "threshold": threshold}
                            best_sets = {
                                "leftX": Xy1[:, :n_features],   # X of left subtree
                                "lefty": Xy1[:, n_features:],   # y of left subtree
                                "rightX": Xy2[:, :n_features],  # X of right subtree
                                "righty": Xy2[:, n_features:]   # y of right subtree
                            }

        if largest_impurity > self.min_impurity:
            # Build subtrees for the right and left branches
            true_branch = self._build_tree(best_sets["leftX"],
                                           best_sets["lefty"],
                                           current_depth + 1)
            false_branch = self._build_tree(best_sets["rightX"],
                                            best_sets["righty"],
                                            current_depth + 1)
            node = DecisionNode(feature_i=best_criteria["feature_i"],
                                threshold=best_criteria["threshold"],
                                true_branch=true_branch,
                                false_branch=false_branch)
        else:
            # We're at a leaf => determine value
            leaf_value = self._leaf_value_calculation(y)
            node = DecisionNode(value=leaf_value)
    csv_output.save()
    return node