Example #1
def main():

    print("-- XGBoost --")

    from sklearn.preprocessing import LabelEncoder

    with pyRAPL.Measurement('Read_data', output=csv_output):
        df = pd.read_csv('/home/gabi/Teste/BaseSintetica/1k_5att.csv')

        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.4,
                                                            random_state=2)

    csv_output.save()

    clf = XGBoost()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

def predict_value(self, x, tree=None):
    """ Do a recursive search down the tree and make a prediction of the data
        sample by the value of the leaf that we end up at """
    with pyRAPL.Measurement('Predict_value', output=csv_output):
        if tree is None:
            tree = self.root

        # If we have a value (i.e. we're at a leaf) => return the value as the prediction
        if tree.value is not None:
            prediction = tree.value
        else:
            # Choose the feature that we will test
            feature_value = x[tree.feature_i]

            # Determine whether to follow the true or the false branch
            branch = tree.false_branch
            if isinstance(feature_value, (int, float)):
                if feature_value >= tree.threshold:
                    branch = tree.true_branch
            elif feature_value == tree.threshold:
                branch = tree.true_branch

            # Test the subtree
            prediction = self.predict_value(x, branch)

    csv_output.save()
    return prediction

def fit(self, X, y, loss=None):
    """ Build the decision tree """
    with pyRAPL.Measurement('Fit_1', output=csv_output):
        self.one_dim = len(np.shape(y)) == 1
        self.root = self._build_tree(X, y)
        self.loss = None
    csv_output.save()
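
All of the snippets on this page write to a module-level csv_output and assume pyRAPL has already been initialized, but that shared preamble is never shown. A minimal sketch of what it would look like (the CSV filename is an arbitrary assumption):

import pyRAPL
import pyRAPL.outputs

# Initialize RAPL access once per process (requires a supported Intel/Linux platform)
pyRAPL.setup()

# Buffered CSV output shared by all measurements; .save() flushes the buffer to disk
csv_output = pyRAPL.outputs.CSVOutput('energy_results.csv')
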
Example #4
def _split(self, y):
    """ y contains y_true in the left half of the middle column and
        y_pred in the right half. Split and return the two matrices """
    with pyRAPL.Measurement('Split', output=csv_output):
        col = int(np.shape(y)[1] / 2)
        y, y_pred = y[:, :col], y[:, col:]
    csv_output.save()
    return y, y_pred
Example #5
def __init__(self, params):
    self.params = params
    try:
        pyRAPL.setup()
        self.rapl_enabled = True
        self.meter_rapl = pyRAPL.Measurement("bar")
        self.meter_rapl.begin()
    except Exception:  # RAPL unavailable (unsupported platform or missing permissions)
        self.rapl_enabled = False

def fit(self, features, target):
    with pyRAPL.Measurement('Fit', output=csv_output):
        self.root = CART()
        if self.tree == 'cls':
            self.root._grow_tree(features, target, self.criterion)
        else:
            self.root._grow_tree(features, target, 'mse')
        self.root._prune(self.prune, self.max_depth, self.min_criterion,
                         self.root.n_samples)
    csv_output.save()

def _calculate_information_gain(self, y, y1, y2):
    with pyRAPL.Measurement('Information_gain', output=csv_output):
        # Information gain = entropy(parent) - weighted entropy of the two children
        p = len(y1) / len(y)
        entropy = calculate_entropy(y)
        info_gain = (entropy - p * calculate_entropy(y1)
                     - (1 - p) * calculate_entropy(y2))
    csv_output.save()
    return info_gain

def _predict(self, d):
    with pyRAPL.Measurement('Predict', output=csv_output):
        if self.feature is not None:
            # Internal node: descend into the branch selected by the threshold test
            if d[self.feature] <= self.threshold:
                prediction = self.left._predict(d)
            else:
                prediction = self.right._predict(d)
        else:
            # Leaf node: return the stored label
            prediction = self.label
    csv_output.save()
    return prediction
Example #9
def _approximate_update(self, y):
    with pyRAPL.Measurement('Approximate_update', output=csv_output):
        # y split into y, y_pred
        y, y_pred = self._split(y)
        # Newton's method: step = sum(gradients) / sum(hessians)
        gradient = np.sum(y * self.loss.gradient(y, y_pred), axis=0)
        hessian = np.sum(self.loss.hess(y, y_pred), axis=0)
        update_approximation = gradient / hessian
    csv_output.save()
    return update_approximation
Example #10
def _gain_by_taylor(self, y, y1, y2):
    with pyRAPL.Measurement('Gain_by_taylor', output=csv_output):
        # Split each matrix into its y_true and y_pred halves
        y, y_pred = self._split(y)
        y1, y1_pred = self._split(y1)
        y2, y2_pred = self._split(y2)

        true_gain = self._gain(y1, y1_pred)
        false_gain = self._gain(y2, y2_pred)
        gain = self._gain(y, y_pred)
    csv_output.save()
    return true_gain + false_gain - gain

def _majority_vote(self, y):
    with pyRAPL.Measurement('Majority_vote', output=csv_output):
        most_common = None
        max_count = 0
        for label in np.unique(y):
            # Count the number of occurrences of samples with this label
            count = len(y[y == label])
            if count > max_count:
                most_common = label
                max_count = count
    csv_output.save()
    return most_common

def _show_tree(self, depth, cond):
    with pyRAPL.Measurement('Show_tree', output=csv_output):
        base = '    ' * depth + cond
        if self.feature is not None:
            print(base + 'if X[' + str(self.feature) + '] <= ' +
                  str(self.threshold))
            self.left._show_tree(depth + 1, 'then ')
            self.right._show_tree(depth + 1, 'else ')
        else:
            print(base + '{value: ' + str(self.label) + ', samples: ' +
                  str(self.n_samples) + '}')
    csv_output.save()
Example #13
def fit(self, X, y):
    with pyRAPL.Measurement('Fit_3', output=csv_output):
        y = to_categorical(y)

        y_pred = np.zeros(np.shape(y))
        for i in self.bar(range(self.n_estimators)):
            tree = self.trees[i]
            # Each tree is fit against the current ensemble predictions
            y_and_pred = np.concatenate((y, y_pred), axis=1)
            tree.fit(X, y_and_pred)
            update_pred = tree.predict(X)

            y_pred -= np.multiply(self.learning_rate, update_pred)
    csv_output.save()

def measure_power(self, label):
    meter = pyRAPL.Measurement(label=label)

    meter.begin()
    time.sleep(self.MEASURE_TIME)
    meter.end()

    # Note: _results is a private pyRAPL attribute; the public accessor is meter.result
    m_energy = meter._results.pkg[self.SOCKET]  # package energy in micro-joules
    m_time = meter._results.duration            # duration in micro-seconds

    power = m_energy / m_time  # micro-joules / micro-seconds = joules/second = watts

    return power
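
For context, a sketch of how such a helper might be hosted and called; the class name and constant values are hypothetical, and meter.result is pyRAPL's public alternative to the private _results attribute used above:

import time
import pyRAPL

pyRAPL.setup()

class PowerMonitor:
    MEASURE_TIME = 1.0  # seconds to sleep while sampling (assumed value)
    SOCKET = 0          # CPU socket index to read (assumed value)

    def measure_power(self, label):
        meter = pyRAPL.Measurement(label=label)
        meter.begin()
        time.sleep(self.MEASURE_TIME)
        meter.end()
        # energy (uJ) / duration (us) = joules per second = watts
        return meter.result.pkg[self.SOCKET] / meter.result.duration

print(PowerMonitor().measure_power('idle'))  # average package power in watts
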

def _split_tree(self, features, target, criterion):
    with pyRAPL.Measurement('Split_tree', output=csv_output):
        # Rows at or below the threshold go to the left subtree
        features_l = features[features[:, self.feature] <= self.threshold]
        target_l = target[features[:, self.feature] <= self.threshold]
        self.left = CART()
        self.left.depth = self.depth + 1
        self.left._grow_tree(features_l, target_l, criterion)

        # Rows above the threshold go to the right subtree
        features_r = features[features[:, self.feature] > self.threshold]
        target_r = target[features[:, self.feature] > self.threshold]
        self.right = CART()
        self.right.depth = self.depth + 1
        self.right._grow_tree(features_r, target_r, criterion)
    csv_output.save()
def rapl_power(label, powertime, sockets):
    meter = pyRAPL.Measurement(label=label)

    while meter._results is None or meter._results.pkg is None:
        meter.begin()
        time.sleep(powertime)
        meter.end()

    results = {}
    m_time = meter._results.duration # micro-s
    for skt in sockets:
        m_energy = meter._results.pkg[skt] # micro-J
        results[skt] = m_energy / m_time # watts

    return results
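
A hedged usage sketch for this helper (the label, interval, and socket list are arbitrary):

results = rapl_power('idle', powertime=1.0, sockets=[0])
print(results)  # e.g. {0: <average package power of socket 0, in watts>}
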

def before(self):
    try:
        pyRAPL.setup()
        self.meter = pyRAPL.Measurement('bar')
        self.meter.begin()
        self.successful = True
    except FileNotFoundError:
        logging.warning(
            "RAPL file not found. Perhaps you are using a platform that does not support RAPL (for example Windows)"
        )
        self.successful = False
    except PermissionError:
        logging.warning(
            "PermissionError occurred while reading RAPL file. Fix with \"sudo chmod -R a+r /sys/class/powercap/intel-rapl\""
        )
        self.successful = False
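
The matching teardown hook is not shown on this page. A minimal sketch of what it could look like, assuming the self.meter and self.successful attributes set in before() and using pyRAPL's public result property:

def after(self):
    # Only read the meter if setup succeeded in before()
    if self.successful:
        self.meter.end()
        result = self.meter.result  # pyRAPL.Result: label, timestamp, duration, pkg, dram
        logging.info("pkg=%s uJ, dram=%s uJ over %s us",
                     result.pkg, result.dram, result.duration)
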

def _calc_impurity(self, criterion, target):
    with pyRAPL.Measurement('Calc_impurity', output=csv_output):
        if criterion == 'gini':
            impurity = 1.0 - sum([(float(len(target[target == c])) /
                                   float(target.shape[0]))**2.0
                                  for c in np.unique(target)])
        elif criterion == 'mse':
            impurity = np.mean((target - np.mean(target))**2.0)
        else:
            # Shannon entropy
            entropy = 0.0
            for c in np.unique(target):
                p = float(len(target[target == c])) / target.shape[0]
                if p > 0.0:
                    entropy -= p * np.log2(p)
            impurity = entropy
    csv_output.save()
    return impurity

def _grow_tree(self, features, target, criterion='gini'):
    with pyRAPL.Measurement('Grow_tree', output=csv_output):
        self.n_samples = features.shape[0]

        # Pure node: nothing left to split
        if len(np.unique(target)) == 1:
            self.label = target[0]
        else:
            best_gain = 0.0
            best_feature = None
            best_threshold = None

            if criterion in {'gini', 'entropy'}:
                self.label = max([(c, len(target[target == c]))
                                  for c in np.unique(target)],
                                 key=lambda x: x[1])[0]
            else:
                self.label = np.mean(target)

            impurity_node = self._calc_impurity(criterion, target)

            for col in range(features.shape[1]):
                feature_level = np.unique(features[:, col])
                # Candidate thresholds: midpoints between consecutive unique values
                thresholds = (feature_level[:-1] + feature_level[1:]) / 2.0

                for threshold in thresholds:
                    target_l = target[features[:, col] <= threshold]
                    impurity_l = self._calc_impurity(criterion, target_l)
                    n_l = float(target_l.shape[0]) / self.n_samples

                    target_r = target[features[:, col] > threshold]
                    impurity_r = self._calc_impurity(criterion, target_r)
                    n_r = float(target_r.shape[0]) / self.n_samples

                    impurity_gain = impurity_node - (n_l * impurity_l +
                                                     n_r * impurity_r)
                    if impurity_gain > best_gain:
                        best_gain = impurity_gain
                        best_feature = col
                        best_threshold = threshold

            self.feature = best_feature
            self.gain = best_gain
            self.threshold = best_threshold
            self._split_tree(features, target, criterion)
    csv_output.save()
Example #20
def predict(self, X):
    with pyRAPL.Measurement('Predict_2', output=csv_output):
        y_pred = None
        # Make predictions
        for tree in self.trees:
            # Estimate the gradient and update the prediction
            update_pred = tree.predict(X)
            if y_pred is None:
                y_pred = np.zeros_like(update_pred)
            y_pred -= np.multiply(self.learning_rate, update_pred)

        # Turn into a probability distribution (softmax)
        y_pred = np.exp(y_pred) / np.sum(np.exp(y_pred), axis=1, keepdims=True)
        # Set the label to the value that maximizes probability
        y_pred = np.argmax(y_pred, axis=1)
    csv_output.save()
    return y_pred
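
One caveat in the snippet above: np.exp overflows for large raw scores. A numerically stable softmax (a standard trick, not part of the original code) subtracts the per-row maximum first, which leaves the resulting distribution unchanged:

z = y_pred - np.max(y_pred, axis=1, keepdims=True)  # shift each row so its max is 0
y_pred = np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
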

def predict(self, X):
    with pyRAPL.Measurement('Predict_2', output=csv_output):
        y_preds = np.empty((X.shape[0], len(self.trees)))
        # Let each tree make a prediction on the data
        for i, tree in enumerate(self.trees):
            # Indices of the features that the tree has trained on
            idx = tree.feature_indices
            # Make a prediction based on those features
            prediction = tree.predict(X[:, idx])
            y_preds[:, i] = prediction
        y_pred = []
        # For each sample, select the most common class prediction
        for sample_predictions in y_preds:
            y_pred.append(
                np.bincount(sample_predictions.astype('int')).argmax())
    csv_output.save()
    return y_pred
def classification_example():
    with pyRAPL.Measurement('Read_data', output=csv_output):
        df = pd.read_csv('/home/gabrieli/Documentos/BaseSintetica/1k_5att.csv')
        #df = df.apply(LabelEncoder().fit_transform)

        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.4)
    csv_output.save()

    cls = CART(tree='cls', criterion='gini', prune='depth')
    cls.fit(X_train, y_train)
    cls.print_tree()

    pred = cls.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, pred))

def print_tree(self, tree=None, indent=" "):
    """ Recursively print the decision tree """
    with pyRAPL.Measurement('Print_tree', output=csv_output):
        if not tree:
            tree = self.root

        # If we're at a leaf => print the label
        if tree.value is not None:
            print(tree.value)
        # Go deeper down the tree
        else:
            # Print the test
            print("%s:%s? " % (tree.feature_i, tree.threshold))
            # Print the true scenario
            print("%sT->" % (indent), end="")
            self.print_tree(tree.true_branch, indent + indent)
            # Print the false scenario
            print("%sF->" % (indent), end="")
            self.print_tree(tree.false_branch, indent + indent)
    csv_output.save()
def main():
    print("-- Classification Tree --")

    with pyRAPL.Measurement('Read_data', output=csv_output):
        dataset = pd.read_csv('/home/gabi/Teste/BaseSintetica/1k_5att.csv')
        X = dataset.iloc[:, 0:5].values
        y = dataset.iloc[:, 5].values
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.4)
    csv_output.save()

    clf = ClassificationTree()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)
def test_context_measure(fs_one_socket):
    """
    Test to measure the energy consumption of a function using the Measurement class

    - launch the measure
    - write a new value to the RAPL power measurement api file
    - launch a function
    - end the measure

    Test if:
      - the energy consumption measured is the delta between the first and the last value in the RAPL power measurement
        file
    """
    pyRAPL.setup()
    out = dummyOutput()
    with pyRAPL.Measurement('toto', output=out):
        measurable_function(1)

    assert out.data.pkg == [(POWER_CONSUMPTION_PKG - PKG_0_VALUE)]
    assert out.data.dram == [(POWER_CONSUMPTION_DRAM - DRAM_0_VALUE)]
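
This test relies on fixtures defined elsewhere in the suite (fs_one_socket, measurable_function, and the POWER_CONSUMPTION_* and *_0_VALUE constants). For orientation, dummyOutput could be as simple as an Output subclass that stores the last reported result; this is a sketch of a plausible fixture, not pyRAPL's actual one:

import pyRAPL.outputs

class dummyOutput(pyRAPL.outputs.Output):
    """Minimal output that keeps the last Result so the test can assert on it."""

    def __init__(self):
        self.data = None

    def add(self, result):
        self.data = result
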
Example #26
def test_normal_measure_bench(fs_one_socket):
    """
    Test to measure the energy consumption of a function using the Measurement class

    - launch the measure
    - write a new value to the RAPL power measurement api file
    - launch a function
    - end the measure

    Test if:
      - the energy consumption measured is the delta between the first and the last value in the RAPL power measurement
        file
    """
    pyRAPL.setup()
    measure = pyRAPL.Measurement('toto')
    measure.begin()
    measurable_function(1)
    measure.end()

    assert measure.result.pkg == [(POWER_CONSUMPTION_PKG - PKG_0_VALUE)]
    assert measure.result.dram == [(POWER_CONSUMPTION_DRAM - DRAM_0_VALUE)]
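
Besides the context-manager and begin()/end() styles exercised by these two tests, pyRAPL also provides a function decorator. A minimal sketch (the workload and CSV filename are arbitrary placeholders):

import pyRAPL
import pyRAPL.outputs

pyRAPL.setup()
csv_output = pyRAPL.outputs.CSVOutput('energy_results.csv')

@pyRAPL.measureit(output=csv_output)
def payload(n):
    # Hypothetical workload standing in for the function under measurement
    return sum(i * i for i in range(n * 100000))

payload(1)
csv_output.save()
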
Example #27
def regression_example():
    with pyRAPL.Measurement('Read_data', output=csv_output):
        df = pd.read_csv('/home/gabrieli/Documentos/BaseSintetica/1k_5att.csv')
        #df = df.apply(LabelEncoder().fit_transform)

        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.4)
    csv_output.save()

    # Fit regression model
    reg = CART(tree='reg', criterion='mse', prune='depth')
    reg.fit(X_train, y_train)
    reg.print_tree()

    pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    print("MSE:", mse)

def _prune(self, method, max_depth, min_criterion, n_samples):
    with pyRAPL.Measurement('Prune', output=csv_output):
        if self.feature is not None:
            self.left._prune(method, max_depth, min_criterion, n_samples)
            self.right._prune(method, max_depth, min_criterion, n_samples)

            pruning = False

            # Prune a node whose children are leaves and whose weighted gain falls
            # below the threshold, or any node past the maximum depth
            if method == 'impurity' and self.left.feature is None and self.right.feature is None:
                if (self.gain * float(self.n_samples) /
                        n_samples) < min_criterion:
                    pruning = True
            elif method == 'depth' and self.depth >= max_depth:
                pruning = True

            if pruning:
                self.left = None
                self.right = None
                self.feature = None
    csv_output.save()

def fit(self, X, y):
    with pyRAPL.Measurement('Fit_3', output=csv_output):
        n_features = np.shape(X)[1]
        # If max_features has not been defined => default to sqrt(n_features)
        if not self.max_features:
            self.max_features = int(math.sqrt(n_features))

        # Choose one random subset of the data for each tree
        subsets = get_random_subsets(X, y, self.n_estimators)

        for i in self.progressbar(range(self.n_estimators)):
            X_subset, y_subset = subsets[i]
            # Feature bagging (select random subsets of the features)
            idx = np.random.choice(range(n_features),
                                   size=self.max_features,
                                   replace=True)
            # Save the indices of the features for prediction
            self.trees[i].feature_indices = idx
            # Choose the features corresponding to the indices
            X_subset = X_subset[:, idx]
            # Fit the tree to the data
            self.trees[i].fit(X_subset, y_subset)
    csv_output.save()

def _build_tree(self, X, y, current_depth=0):
    """ Recursive method which builds out the decision tree and splits X and
        the respective y on the feature of X which (based on impurity) best
        separates the data """
    with pyRAPL.Measurement('Build_tree', output=csv_output):
        largest_impurity = 0
        best_criteria = None  # Feature index and threshold
        best_sets = None  # Subsets of the data

        # Check if expansion of y is needed
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # Add y as the last column of X
        Xy = np.concatenate((X, y), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            # Calculate the impurity for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the impurity
                for threshold in unique_values:
                    # Divide X and y depending on whether the feature value of X
                    # at index feature_i meets the threshold
                    Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)

                    if len(Xy1) > 0 and len(Xy2) > 0:
                        # Select the y-values of the two sets
                        y1 = Xy1[:, n_features:]
                        y2 = Xy2[:, n_features:]

                        # Calculate impurity
                        impurity = self._impurity_calculation(y, y1, y2)

                        # If this threshold resulted in a higher information gain
                        # than previously recorded, save the threshold value and
                        # the feature index
                        if impurity > largest_impurity:
                            largest_impurity = impurity
                            best_criteria = {
                                "feature_i": feature_i,
                                "threshold": threshold
                            }
                            best_sets = {
                                "leftX": Xy1[:, :n_features],   # X of left subtree
                                "lefty": Xy1[:, n_features:],   # y of left subtree
                                "rightX": Xy2[:, :n_features],  # X of right subtree
                                "righty": Xy2[:, n_features:]   # y of right subtree
                            }

        if largest_impurity > self.min_impurity:
            # Build subtrees for the right and left branches
            true_branch = self._build_tree(best_sets["leftX"],
                                           best_sets["lefty"],
                                           current_depth + 1)
            false_branch = self._build_tree(best_sets["rightX"],
                                            best_sets["righty"],
                                            current_depth + 1)
            node = DecisionNode(feature_i=best_criteria["feature_i"],
                                threshold=best_criteria["threshold"],
                                true_branch=true_branch,
                                false_branch=false_branch)
        else:
            # We're at a leaf => determine the value
            leaf_value = self._leaf_value_calculation(y)
            node = DecisionNode(value=leaf_value)

    csv_output.save()
    return node