Ejemplo n.º 1
0
    def reduce_data(self, X, y):
        if self.classifier == None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in xrange(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict(sample) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Ejemplo n.º 2
0
def _my_lrap(y_true, y_score):
    """Simple implementation of label ranking average precision"""
    y_true, y_score = check_arrays(y_true, y_score)
    n_samples, n_labels = y_true.shape
    score = np.empty((n_samples, ))
    for i in range(n_samples):
        # The best rank correspond to 1. Rank higher than 1 are worse.
        # The best inverse ranking correspond to n_labels.
        unique_rank, inv_rank = np.unique(y_score[i], return_inverse=True)
        n_ranks = unique_rank.size
        rank = n_ranks - inv_rank

        # Rank need to be corrected to take into account ties
        # ex: rank 1 ex aequo means that both label are rank 2.
        corr_rank = np.bincount(rank, minlength=n_ranks + 1).cumsum()
        rank = corr_rank[rank]

        relevant = y_true[i].nonzero()[0]
        if relevant.size == 0 or relevant.size == n_labels:
            score[i] = 1
            continue

        score[i] = 0.
        for label in relevant:
            # Let's count the number of relevant label with better rank
            # (smaller rank).
            n_ranked_above = sum(rank[r] <= rank[label] for r in relevant)

            # Weight by the rank of the actual label
            score[i] += n_ranked_above / rank[label]

        score[i] /= relevant.size

    return score.mean()
Ejemplo n.º 3
0
Archivo: utils.py Proyecto: spolakh/rep
def calc_hist_with_errors(x, weight=None, bins=60, normed=True, x_range=None, ignored_sideband=0.0):
    """
    Calculate data for error bar (for plot pdf with errors)

    :param x: data
    :type x: list or numpy.array
    :param weight: weights
    :type weight: None or list or numpy.array

    :return: tuple (x-points (list), y-points (list), y points errors (list), x points errors (list))
    """
    weight = numpy.ones(len(x)) if weight is None else weight
    x, weight = check_arrays(x, weight)

    if x_range is None:
        x_range = numpy.percentile(x, [100 * ignored_sideband, 100 * (1 - ignored_sideband)])

    ans, bins = numpy.histogram(x, bins=bins, normed=normed, weights=weight, range=x_range)
    yerr = []
    normalization = 1.0
    if normed:
        normalization = float(len(bins) - 1) / float(sum(weight)) / (x_range[1] - x_range[0])
    for i in range(len(bins) - 1):
        weight_bin = weight[(x > bins[i]) * (x <= bins[i + 1])]
        yerr.append(numpy.sqrt(sum(weight_bin * weight_bin)) * normalization)
    bins_mean = [0.5 * (bins[i] + bins[i + 1]) for i in range(len(ans))]
    xerr = [0.5 * (bins[i + 1] - bins[i]) for i in range(len(ans))]
    return bins_mean, ans, yerr, xerr
Ejemplo n.º 4
0
    def reduce_data(self, X, y):

        X, y = check_arrays(X, y, sparse_format="csr")

        if self.classifier == None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]

        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict(sample) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)

        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Ejemplo n.º 5
0
    def reduce_data(self, X, y):
        if self.classifier == None:
            self.classifier = KNeighborsClassifier(
                n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)
        nn_idx = self.classifier.kneighbors(X,
                                            n_neighbors=2,
                                            return_distance=False)
        nn_idx = nn_idx.T[1]

        mask = [
            nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]]
            for index in xrange(nn_idx.shape[0])
        ]
        mask = ~np.asarray(mask)
        if self.keep_class != None and self.keep_class in self.classes_:
            mask[y == self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Ejemplo n.º 6
0
Archivo: cnn.py Proyecto: dvro/ml
    def reduce_data(self, X, y):
        
        X, y = check_arrays(X, y, sparse_format="csr")

        if self.classifier == None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)

        prots_s = []
        labels_s = []

        classes = np.unique(y)
        self.classes_ = classes

        for cur_class in classes:
            mask = y == cur_class
            insts = X[mask]
            prots_s = prots_s + [insts[np.random.randint(0, insts.shape[0])]]
            labels_s = labels_s + [cur_class]


        self.classifier.fit(prots_s, labels_s)
        for sample, label in zip(X, y):
            if self.classifier.predict(sample) != [label]:
                prots_s = prots_s + [sample]
                labels_s = labels_s + [label]
                self.classifier.fit(prots_s, labels_s)
       
        self.X_ = np.asarray(prots_s)
        self.y_ = np.asarray(labels_s)
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Ejemplo n.º 7
0
    def transform(self, X, y=None):
        """Project the data by using matrix product with the random matrix

        Parameters
        ----------
        X : numpy array or scipy.sparse of shape [n_samples, n_features]
            The input data to project into a smaller dimensional space.

        y : is not used: placeholder to allow for usage in a Pipeline.

        Returns
        -------
        X_new : numpy array or scipy sparse of shape [n_samples, n_components]
            Projected array.

        """
        X, y = check_arrays(X, y)

        if self.components_ is None:
            raise ValueError('No random projection matrix had been fit.')

        if X.shape[1] != self.components_.shape[1]:
            raise ValueError(
                'Impossible to perform projection:'
                'X at fit stage had a different number of features.'
                '(%s != %s)' % (X.shape[1], self.components_.shape[1]))

        if not sp.issparse(X):
            X = np.atleast_2d(X)

        X_new = safe_sparse_dot(X, self.components_.T,
                                dense_output=self.dense_output)
        return X_new
Ejemplo n.º 8
0
    def reduce_data(self, X, y):
        X, y = check_arrays(X, y, sparse_format="csr")

        if self.classifier == None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        # loading inicial groups
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        self._merge()
        self._pruning()
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Ejemplo n.º 9
0
    def reduce_data(self, X, y):
        if self.classifier == None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        if self.n_neighbors >= len(X):
            self.X_ = np.array(X)
            self.y_ = np.array(y)
            self.reduction_ = 0.0
            return self.X_, self.y_

        mask = np.zeros(y.size, dtype=bool)

        tmp_m = np.ones(y.size, dtype=bool)
        for i in xrange(y.size):
            tmp_m[i] = not tmp_m[i]
            self.classifier.fit(X[tmp_m], y[tmp_m])
            sample, label = X[i], y[i]

            if self.classifier.predict(sample) == [label]:
                mask[i] = not mask[i]

            tmp_m[i] = not tmp_m[i]

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)
        return self.X_, self.y_
Ejemplo n.º 10
0
    def transform(self, X, y=None):
        """Project the data by using matrix product with the random matrix

        Parameters
        ----------
        X : numpy array or scipy.sparse of shape [n_samples, n_features]
            The input data to project into a smaller dimensional space.

        y : is not used: placeholder to allow for usage in a Pipeline.

        Returns
        -------
        X_new : numpy array or scipy sparse of shape [n_samples, n_components]
            Projected array.

        """
        X, y = check_arrays(X, y)

        if self.components_ is None:
            raise ValueError('No random projection matrix had been fit.')

        if X.shape[1] != self.components_.shape[1]:
            raise ValueError(
                'Impossible to perform projection:'
                'X at fit stage had a different number of features.'
                '(%s != %s)' % (X.shape[1], self.components_.shape[1]))

        if not sp.issparse(X):
            X = np.atleast_2d(X)

        X_new = safe_sparse_dot(X, self.components_.T,
                                dense_output=self.dense_output)
        return X_new
    def fit(self, X, y, sample_weight=None):
        X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        sample_weight = normalize_weight(y, sample_weight, sig_weight=self.sig_weight)
        self.random_state = check_random_state(self.random_state)
        self.estimators = []
        score = numpy.zeros(len(X), dtype=float)
        y_signed = 2 * y - 1

        self.w_sig = []
        self.w_bck = []

        for _ in range(self.n_estimators):
            residual = y_signed
            # numpy.exp(- y_signed * score)
            # residual[y > 0.5] /= numpy.mean(residual[y > 0.5])
            # residual[y < 0.5] /= -numpy.mean(residual[y < 0.5])

            trainX, testX, trainY, testY, trainW, testW, trainR, testR, trainS, testS = \
                train_test_split(X, y, sample_weight, residual, score,
                                 train_size=self.train_part, test_size=self.test_size, random_state=self.random_state)

            tree = DecisionTreeRegressor(criterion=self.criterion, splitter=self.splitter,
                                         max_depth=self.max_depth, min_samples_leaf=self.min_samples_leaf,
                                         max_features=self.max_features, random_state=self.random_state)

            # fitting
            tree.fit(trainX, trainR, sample_weight=trainW, check_input=False)

            # post-pruning
            self.update_terminal_regions(tree.tree_, testX, testY, testW, testS)

            # updating score
            # score += self.learning_rate * tree.predict(X)
            self.estimators.append(tree)
Ejemplo n.º 12
0
    def reduce_data(self, X, y):
        X, y = check_arrays(X, y, sparse_format="csr")

        if self.classifier == None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors)
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        classes = np.unique(y)
        self.classes_ = classes

        minority_class = self.pos_class
        if self.pos_class == None:
            minority_class = min(set(y), key = list(y).count)

        # loading inicial groups
        self.groups = []
        for label in classes:
            mask = y == label
            self.groups = self.groups + [_Group(X[mask], label)]

        self._main_loop()
        self._generalization_step()
        min_groups = filter(lambda g: g.label == minority_class, self.groups)
        self._merge()
        self._pruning()
        max_groups = filter(lambda g: g.label != minority_class, self.groups)
        self.groups = min_groups + max_groups
        self.X_ = np.asarray([g.rep_x for g in self.groups])
        self.y_ = np.asarray([g.label for g in self.groups])
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)
        return self.X_, self.y_
Ejemplo n.º 13
0
def cvm_flatness(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """ The most simple way to compute Cramer-von Mises flatness, this is however very slow
    if you need to compute it many times
    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features (i.e. test dataset)
    :param uniform_variables: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class, for which uniformity is measured (usually, 0 is bck, 1 is signal)
    :param knn: number of nearest neighbours used in knn

    Example of usage:
    proba = classifier.predict_proba(testX)
    cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    y, proba = check_arrays(y, proba)
    assert len(y) == len(proba) == len(X), 'Different lengths'
    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)

    X = pandas.DataFrame(X)

    signal_mask = y == label
    groups_indices = computeSignalKnnIndices(uniform_variables=uniform_variables, dataframe=X,
                                             is_signal=signal_mask, n_neighbors=knn)
    groups_indices = groups_indices[signal_mask, :]

    return ut.group_based_cvm(proba[:, label], mask=signal_mask, groups_indices=groups_indices,
                              sample_weight=sample_weight)
Ejemplo n.º 14
0
def plot_score_variable_correlation(y_true, y_pred, correlation_values, cuts, sample_weight=None, classifier_name="",
                                    var_name="", score_function=efficiency_score, bins_number=20):
    """
    Different score functions available: Efficiency, Precision, Recall, F1Score, and other things from sklearn.metrics
    :param y_pred: numpy.array, of shape [n_samples]
    :param y_true: numpy.array, of shape [n_samples] with float predictions
    :param correlation_values: numpy.array of shape [n_samples], usually that is masses of events
    :param cuts: array-like of cuts, for each cut a separate
    :param sample_weight: numpy.array or None, shape = [n_samples]
    :param classifier_name: str, used only in label
    :param var_name: str, i.e. 'mass'
    :param score_function: any function with signature (y_true, y_pred, sample_weight=None)
    :param bins_number: int, the number of bins
    """
    y_true, y_pred, correlation_values = check_arrays(y_true, y_pred, correlation_values)
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)

    binner = Binner(correlation_values, n_bins=bins_number)
    bins_data = binner.split_into_bins(correlation_values, y_true, y_pred, sample_weight)
    for cut in cuts:
        x_values = []
        y_values = []
        for bin_data in bins_data:
            bin_masses, bin_y_true, bin_proba, bin_weight = bin_data
            y_values.append(score_function(bin_y_true, bin_proba[:, 1] > cut, sample_weight=bin_weight))
            x_values.append(numpy.mean(bin_masses))
        pylab.plot(x_values, y_values, '.-', label="cut = %0.3f" % cut)

    pylab.title("Correlation with results of " + classifier_name)
    pylab.xlabel(var_name)
    pylab.ylabel(score_function.__name__)
    pylab.legend(loc="lower right")
Ejemplo n.º 15
0
Archivo: utils.py Proyecto: spolakh/rep
def reorder_by_first(*arrays):
    """
    Applies the same permutation to all passed arrays,
    permutation sorts the first passed array
    """
    arrays = check_arrays(*arrays)
    order = numpy.argsort(arrays[0])
    return [arr[order] for arr in arrays]
    def fit(self, X, y, sample_weight=None):
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        assert len(X) == len(y), 'Different lengths of X and y'
        X = pandas.DataFrame(X)
        y = numpy.array(column_or_1d(y), dtype=int)
        assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
        self.check_params()

        self.estimators = []
        self.scores = []

        n_samples = len(X)
        n_inbag = int(self.subsample * len(X))
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)

        # preparing for fitting in trees
        X = self.get_train_vars(X)
        self.n_features = X.shape[1]
        X, y = check_arrays(X, y, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        y_pred = numpy.zeros(len(X), dtype=float)

        if self.init_estimator is not None:
            y_signed = 2 * y - 1
            self.init_estimator.fit(X, y_signed, sample_weight=sample_weight)
            y_pred += numpy.ravel(self.init_estimator.predict(X))

        for stage in range(self.n_estimators):
            # tree creation
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features,
                random_state=self.random_state,
                max_leaf_nodes=self.max_leaf_nodes)

            # tree learning
            residual = self.loss.negative_gradient(y_pred)
            train_indices = self.random_state.choice(n_samples, size=n_inbag, replace=False)

            tree.fit(X[train_indices], residual[train_indices],
                     sample_weight=sample_weight[train_indices], check_input=False)
            # update tree leaves
            if self.update_tree:
                self.loss.update_tree(tree.tree_, X=X, y=y, y_pred=y_pred, sample_weight=sample_weight,
                                      update_mask=numpy.ones(len(X), dtype=bool), residual=residual)

            y_pred += self.learning_rate * tree.predict(X)
            self.estimators.append(tree)
            self.scores.append(self.loss(y_pred))
        return self
Ejemplo n.º 17
0
    def fit(self, X):
        """Creates a biclustering for X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        """
        X, = check_arrays(X, sparse_format='csr', dtype=np.float64)
        check_array_ndim(X)
        self._check_parameters()
        self._fit(X)
Ejemplo n.º 18
0
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """ This is shorter ans simpler version og log_loss, which supports sample_weight """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = numpy.append(1 - T, T, axis=1)

    # Clipping
    Y = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    T, Y = check_arrays(T, Y)

    # Renormalize
    Y /= Y.sum(axis=1)[:, numpy.newaxis]
    loss = -(T * numpy.log(Y) * sample_weight[:, numpy.newaxis]).sum() / numpy.sum(sample_weight)
    return loss
Ejemplo n.º 19
0
    def fit(self, X):
        """Creates a biclustering for X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        """
        X, = check_arrays(X, sparse_format="csr", dtype=np.float64)
        check_array_ndim(X)
        self._check_parameters()
        self._fit(X)
Ejemplo n.º 20
0
def plot_roc(y_true, y_pred, sample_weight=None, classifier_name="", is_cut=False, mask=None):
    """Plots ROC curve in the way physicists like it
    :param y_true: numpy.array, shape=[n_samples]
    :param y_pred: numpy.array, shape=[n_samples]
    :param sample_weight: numpy.array | None, shape = [n_samples]
    :param classifier_name: str, the name of classifier for label
    :param is_cut: predictions are binary
    :param mask: plot ROC curve only for events that have mask=True
    """
    if is_cut:
        assert len(numpy.unique(y_pred)) == 2, 'Cut assumes that prediction are 0 and 1 (or True/False)'

    MAX_STEPS = 500
    y_true, y_pred = check_arrays(y_true, y_pred)
    if mask is not None:
        mask = numpy.array(mask, dtype=bool)  # converting to bool, just in case
        y_true = y_true[mask]
        y_pred = y_pred[mask]
        if sample_weight is not None:
            sample_weight = sample_weight[mask]

    fpr, tpr, thresholds = check_arrays(*roc_curve(y_true, y_pred, sample_weight=sample_weight))
    roc_auc = auc(fpr, tpr)
    # tpr = recall = isSasS / isS = signal efficiency
    # fpr = isBasS / isB = 1 - specificity = 1 - backgroundRejection
    bg_rejection = 1. - fpr

    if len(fpr) > MAX_STEPS:
        # decreasing the number of points in plot
        targets = numpy.linspace(0, 1, MAX_STEPS)
        x_ids = numpy.searchsorted(tpr, targets)
        y_ids = numpy.searchsorted(fpr, targets)
        indices = numpy.concatenate([x_ids, y_ids, [0, len(tpr) - 1]], )
        indices = numpy.unique(indices)
        tpr = tpr[indices]
        bg_rejection = bg_rejection[indices]
    if not is_cut:
        pylab.plot(tpr, bg_rejection, label='%s (area = %0.3f)' % (classifier_name, roc_auc))
    else:
        pylab.plot(tpr[1:2], bg_rejection[1:2], 'o', label='%s' % classifier_name)
Ejemplo n.º 21
0
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """ This is shorter ans simpler version og log_loss, which supports sample_weight """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    lb = LabelBinarizer()
    T = lb.fit_transform(y_true)
    if T.shape[1] == 1:
        T = numpy.append(1 - T, T, axis=1)

    # Clipping
    Y = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    T, Y = check_arrays(T, Y)

    # Renormalize
    Y /= Y.sum(axis=1)[:, numpy.newaxis]
    loss = -(T * numpy.log(Y) *
             sample_weight[:, numpy.newaxis]).sum() / numpy.sum(sample_weight)
    return loss
 def fit(self, X, y, sample_weight=None):
     sample_weight = check_sample_weight(y,  sample_weight=sample_weight)
     X, y = check_arrays(X, y)
     assert X.shape[1] == 1
     X = numpy.ravel(X)
     indices = numpy.argsort(X)
     self.sorted_x = X[indices]
     self.sorted_y = y[indices]
     self.sorted_w = sample_weight[indices]
     window = numpy.hamming(2 * self.knn + 1)
     window[self.knn] = 0
     self.sig_w = numpy.convolve(self.sorted_w * self.sorted_y,       window, mode='same')
     self.bck_w = numpy.convolve(self.sorted_w * (1 - self.sorted_y), window, mode='same')
     assert len(self.sig_w) == len(self.bck_w) == len(self.sorted_y) == len(X)
Ejemplo n.º 23
0
def plot_roc(y_true, y_pred, sample_weight=None, classifier_name=""):
    """Plots ROC curve in the way physicist like it
    :param y_true: numpy.array, shape=[n_samples]
    :param y_pred: numpy.array, shape=[n_samples]
    :param sample_weight: numpy.array | None, shape = [n_samples]
    :param classifier_name: str, the name of classifier for label
    """
    y_true, y_pred = check_arrays(y_true, y_pred)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred, sample_weight=sample_weight)
    # tpr = recall = isSasS / isS = signal efficiency
    # fpr = isBasS / isB = 1 - specificity ?=?  1 - backgroundRejection
    bg_rejection = 1. - numpy.array(fpr)
    roc_auc = auc(fpr, tpr)
    pylab.plot(tpr, bg_rejection, label='%s (area = %0.3f)' % (classifier_name, roc_auc))
Ejemplo n.º 24
0
    def reduce_data(self, X, y):
        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        self.main_loop(X, y)

        best_index = np.argmax(self.evaluations)
        mask = np.asarray(self.chromosomes[best_index], dtype=bool)
        self.X_ = X[mask]
        self.y_ = y[mask]
        self.reduction_ = 1.0 - float(len(self.y_))/len(y)

        return self.X_, self.y_
    def staged_predict_proba(self, X):
        results=numpy.zeros([len(X), self.n_estimators], dtype=float)
        X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        w_s = numpy.zeros(len(X))
        w_b = numpy.zeros(len(X))

        for i, (estimator, e_ws, e_wb) in enumerate(zip(self.estimators, self.w_sig, self.w_bck)):
            indices = estimator.predict(X).astype(int)
            assert numpy.all(indices == estimator.tree_.apply(X))
            results[:, i] = e_ws[indices] / e_wb[indices]

        for i in range(1, self.n_estimators):
            score = numpy.median(results[:, :i], axis=1)
            proba = numpy.zeros([len(X), 2], dtype=float)
            proba[:, 1] = expit(score)
            proba[:, 0] = expit(-score)
            yield proba
Ejemplo n.º 26
0
def squared_error(y_true, y_pred):
    """Compute the squared error.
    
    Parameters
    ----------
    y_true : array-like of shape = [n_samples]
        Ground truth (correct) target values.

    y_pred : array-like of shape = [n_samples]
        Estimated target values.

    Returns
    -------
    errors : array of shape = [n_samples]
        Squared error of each sample.
    
    """
    y_true, y_pred = check_arrays(y_true, y_pred)
    return (y_pred - y_true)**2
Ejemplo n.º 27
0
Archivo: renn.py Proyecto: dvro/ml
    def reduce_data(self, X, y):
        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        enn = ENN(n_neighbors = self.n_neighbors)
        p_, l_, r_ = X, y, 1.0

        while r_ != 0:
            enn.reduce_data(p_, l_)
            p_ = enn.X_
            l_ = enn.y_
            r_ = enn.reduction_
             
        self.X_ = p_
        self.y_ = l_
        self.reduction_ = 1.0 - float(l_.shape[0]) / y.shape[0]

        return self.X_, self.y_
Ejemplo n.º 28
0
    def reduce_data(self, X, y):
        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        enn = ENN(n_neighbors = self.n_neighbors)
        p_, l_, r_ = X, y, 1.0

        while r_ != 0:
            enn.reduce_data(p_, l_)
            p_ = enn.X_
            l_ = enn.y_
            r_ = enn.reduction_
             
        self.X_ = p_
        self.y_ = l_
        self.reduction_ = 1.0 - float(l_.shape[0]) / y.shape[0]

        return self.X_, self.y_
    def predict_proba(self, X, percentile=50):
        results=numpy.zeros([len(X), self.n_estimators], dtype=float)
        X, = check_arrays(X, dtype=DTYPE, sparse_format="dense", check_ccontiguous=True)
        w_s = numpy.zeros(len(X))
        w_b = numpy.zeros(len(X))

        for i, (estimator, e_ws, e_wb) in enumerate(zip(self.estimators, self.w_sig, self.w_bck)):
            indices = estimator.predict(X).astype(int)

            assert numpy.all(indices == estimator.tree_.apply(X))
            results[:, i] = e_ws[indices] / e_wb[indices]
            w_s += e_ws[indices]
            w_b += e_wb[indices]

        score = numpy.percentile(results, percentile, axis=1)
        # score = w_s / (w_s + w_b + 0.01)

        proba = numpy.zeros([len(X), 2], dtype=float)
        proba[:, 1] = expit( score)
        proba[:, 0] = expit(-score)
        return proba
Ejemplo n.º 30
0
def gain_curve(y_true, y_score, pos_label=None):
    y_true, y_score = check_arrays(y_true, y_score)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    # ensure binary classification if pos_label is not specified
    classes = np.unique(y_true)
    if (pos_label is None
            and not (np.all(classes == [0, 1]) or np.all(classes == [-1, 1])
                     or np.all(classes == [0]) or np.all(classes == [-1])
                     or np.all(classes == [1]))):
        raise ValueError("Data is not binary and pos_label is not specified")
    elif pos_label is None:
        pos_label = 1.

    # sort scores and corresponding truth values. -1 to invert order
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]

    # make y_true a boolean vector to deal with
    y_true = (y_true == pos_label)

    # accumulate the true positives with decreasing threshold
    total = len(y_true)
    _, total_target = np.bincount(y_true)
    tps_cum = np.cumsum(y_true)
    percentages = np.asarray(range(10, 110, 10))
    percentages_indices = np.divide(np.multiply(total, percentages), 100) - 1
    tps_cum_percentages = np.divide(
        np.multiply(100, tps_cum[percentages_indices]), total_target)

    # Add an extra threshold position if necessary
    percentages = np.r_[0, percentages]
    tps_cum_percentages = np.r_[0, tps_cum_percentages]
    # print 'Indexes of customers on the first percentil'
    # for idx in range(0,percentages_indices[0]):
    #     if y_true[idx]:
    #         print str(desc_score_indices[idx])

    return percentages, tps_cum_percentages
Ejemplo n.º 31
0
    def fit(self, X, y, sample_weight=None, iterations=100, loss=None):
        X, y = check_arrays(X, y)
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        if loss is None:
            loss = BinomialDevianceLossFunction()
        loss.fit(X, y, sample_weight=sample_weight)
        self.n_features = X.shape[1]
        self.coeffs = numpy.zeros([self.n_features, self.max_categories], dtype='float')
        assert numpy.max(X) < self.max_categories
        assert numpy.min(X) >= 0

        for iteration in range(iterations):
            # this line could be skipped, but we need it to avoid
            # mistakes after too many steps of computations
            y_pred = self.decision_function(X)
            print(iteration, loss(y_pred))

            for feature in range(self.n_features):
                ngradient = loss.negative_gradient(y_pred)
                nominator = numpy.bincount(X[:, feature], weights=ngradient, minlength=self.max_categories)
                nominator -= self.l2_reg * self.coeffs[feature, :] + self.l1_reg * numpy.sign(self.coeffs[feature, :])

                denominator = numpy.abs(ngradient) * (1. - numpy.abs(ngradient))
                denominator = numpy.bincount(X[:, feature], weights=denominator, minlength=self.max_categories)
                denominator += 2 * self.l2_reg + 5

                gradients = nominator / denominator
                right_gradients = gradients
                # those already zeros not to become nonzero
                mask = (self.coeffs[feature, :] == 0) & (numpy.abs(gradients) < self.l1_reg)
                right_gradients[mask] = 0
                # those already not zeros
                old_coeffs = self.coeffs[feature, :]
                new_coeffs = old_coeffs + self.learning_rate * right_gradients
                new_coeffs[new_coeffs * old_coeffs < 0] = 0
                y_pred += numpy.take(new_coeffs - old_coeffs, X[:, feature])
                self.coeffs[feature, :] = new_coeffs

        return self
Ejemplo n.º 32
0
Archivo: allknn.py Proyecto: dvro/ml
    def reduce_data(self, X, y):
        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        edited_nn = ENN(n_neighbors = 1)
        p_, l_, r_ = X, y, 1.0

        for k in range(1, self.n_neighbors + 1):
            if l_.shape[0] > k + 1:
                edited_nn.n_neighbors = k
                edited_nn.fit(p_, l_)
                p_ = edited_nn.X_
                l_ = edited_nn.y_
                r_ = edited_nn.reduction_
             
        self.X_ = p_
        self.y_ = l_
        self.reduction_ = 1.0 - float(l_.shape[0]) / y.shape[0]

        return self.X_, self.y_
Ejemplo n.º 33
0
    def reduce_data(self, X, y):
        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes

        edited_nn = ENN(n_neighbors=1)
        p_, l_, r_ = X, y, 1.0

        for k in range(1, self.n_neighbors + 1):
            if l_.shape[0] > k + 1:
                edited_nn.n_neighbors = k
                edited_nn.fit(p_, l_)
                p_ = edited_nn.X_
                l_ = edited_nn.y_
                r_ = edited_nn.reduction_

        self.X_ = p_
        self.y_ = l_
        self.reduction_ = 1.0 - float(l_.shape[0]) / y.shape[0]

        return self.X_, self.y_
Ejemplo n.º 34
0
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting,  performing the search over parameters."""
        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)

        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
        if not self.dataset_filenames:
            self.save_dataset_filename(X, y, cv)

        dataset_filenames = self.dataset_filenames

        client = Client()
        lb_view = client.load_balanced_view()

        if self.verbose > 0:
            print("Number of CPU core %d" % len(client.ids()))

        self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params)
                        for dataset_filename in dataset_filenames], params)
                            for params in parameter_iterable]
        if self.sync:
            self.wait()
            self.set_grid_scores()
            self.set_best_score_params()

            if self.refit:
                self.set_best_estimator(estimator)
        return self
Ejemplo n.º 35
0
Archivo: utils.py Proyecto: spolakh/rep
def calc_ROC(prediction, signal, sample_weight=None, max_points=10000):
    """
    Calculate roc curve

    :param prediction: predictions
    :type prediction: array or list
    :param signal: true labels
    :type signal: array or list
    :param sample_weight: weights
    :type sample_weight: None or array or list
    :param int max_points: maximum of used points on roc curve

    :return: (tpr, tnr), (err_tnr, err_tpr), thresholds
    """
    sample_weight = numpy.ones(len(signal)) if sample_weight is None else sample_weight
    prediction, signal, sample_weight = check_arrays(prediction, signal, sample_weight)

    assert set(signal) == {0, 1}, "the labels should be 0 and 1, labels are " + str(set(signal))
    fpr, tpr, thresholds = roc_curve(signal, prediction, sample_weight=sample_weight)
    tpr = numpy.insert(tpr, 0, 0.)
    fpr = numpy.insert(fpr, 0, 0.)
    thresholds = numpy.insert(thresholds, 0, thresholds[0] + 1.)
    tnr = 1 - fpr

    weight_bck = sample_weight[signal == 0]
    weight_sig = sample_weight[signal == 1]
    err_tnr = numpy.sqrt(tnr * (1 - tnr) * numpy.sum(weight_bck ** 2)) / numpy.sum(weight_bck)
    err_tpr = numpy.sqrt(tpr * (1 - tpr) * numpy.sum(weight_sig ** 2)) / numpy.sum(weight_sig)

    if len(prediction) > max_points:
        sum_weights = numpy.cumsum((fpr + tpr) / 2.)
        sum_weights /= sum_weights[-1]
        positions = numpy.searchsorted(sum_weights, numpy.linspace(0, 1, max_points))
        tpr, tnr = tpr[positions], tnr[positions]
        err_tnr, err_tpr = err_tnr[positions], err_tpr[positions]
        thresholds = thresholds[positions]

    return (tpr, tnr), (err_tnr, err_tpr), thresholds
Ejemplo n.º 36
0
    def reduce_data(self, X, y):
        if self.classifier == None:
            self.classifier = KNeighborsClassifier(n_neighbors=self.n_neighbors, algorithm='brute')
        if self.classifier.n_neighbors != self.n_neighbors:
            self.classifier.n_neighbors = self.n_neighbors

        X, y = check_arrays(X, y, sparse_format="csr")

        classes = np.unique(y)
        self.classes_ = classes
        self.classifier.fit(X, y)
        nn_idx = self.classifier.kneighbors(X, n_neighbors=2, return_distance=False)
        nn_idx = nn_idx.T[1]

        mask = [nn_idx[nn_idx[index]] == index and y[index] != y[nn_idx[index]] for index in xrange(nn_idx.shape[0])]
        mask = ~np.asarray(mask) 
        if self.keep_class != None and self.keep_class in self.classes_:
            mask[y==self.keep_class] = True

        self.X_ = np.asarray(X[mask])
        self.y_ = np.asarray(y[mask])
        self.reduction_ = 1.0 - float(len(self.y_)) / len(y)

        return self.X_, self.y_
Ejemplo n.º 37
0
Archivo: utils.py Proyecto: spolakh/rep
def get_efficiencies(prediction, spectator, sample_weight=None, bins_number=20,
                     thresholds=None, errors=False, ignored_sideband=0.0):
    """
    Construct efficiency function dependent on spectator for each threshold

    Different score functions available: Efficiency, Precision, Recall, F1Score,
    and other things from sklearn.metrics

    Parameters:
    -----------
    :param prediction: list of probabilities
    :param spectator: list of spectator's values
    :param bins_number: int, count of bins for plot

    :param thresholds: list of prediction's threshold

        (default=prediction's cuts for which efficiency will be [0.2, 0.4, 0.5, 0.6, 0.8])

    :return:
        if errors=False
        OrderedDict threshold -> (x_values, y_values)

        if errors=True
        OrderedDict threshold -> (x_values, y_values, y_err, x_err)

        All the parts: x_values, y_values, y_err, x_err are numpy.arrays of the same length.
    """
    prediction, spectator = \
        check_arrays(prediction, spectator)

    spectator_min, spectator_max = numpy.percentile(spectator, [100 * ignored_sideband, 100 * (1. - ignored_sideband)])
    mask = (spectator >= spectator_min) & (spectator <= spectator_max)
    spectator = spectator[mask]
    prediction = prediction[mask]
    bins_number = min(bins_number, len(prediction))
    sample_weight = sample_weight if sample_weight is None else numpy.array(sample_weight)[mask]

    if thresholds is None:
        thresholds = [weighted_percentile(prediction, percentiles=1 - eff, sample_weight=sample_weight)
                      for eff in [0.2, 0.4, 0.5, 0.6, 0.8]]

    binner = Binner(spectator, bins_number=bins_number)
    bins_data = binner.split_into_bins(spectator, prediction)

    bin_edges = numpy.array([spectator_min] + list(binner.limits) + [spectator_max])
    xerr = numpy.diff(bin_edges) / 2.
    result = OrderedDict()
    for threshold in thresholds:
        x_values = []
        y_values = []
        for num, (masses, probabilities) in enumerate(bins_data):
            y_values.append(numpy.mean(probabilities > threshold))
            if errors:
                x_values.append((bin_edges[num + 1] + bin_edges[num]) / 2.)
            else:
                x_values.append(numpy.mean(masses))

        x_values, y_values = check_arrays(x_values, y_values)
        if errors:
            result[threshold] = (x_values, y_values, numpy.sqrt(y_values * (1 - y_values) / len(y_values)), xerr)
        else:
            result[threshold] = (x_values, y_values)
    return result
Ejemplo n.º 38
0
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(
            self.loss_func, self.score_func, self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)'
                                 % (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(fit_grid_point_extended)(
                    X, y, base_estimator, parameters, train, test,
                    self.scorer_, self.verbose, **self.fit_params)
                for parameters in parameter_iterable
                for train, test in cv)
        
#         out = []
#         for parameters in parameter_iterable:
#             fold = 1
#             for train, test in cv:
#                 print "Processing fold", fold, self.fit_params
#                 out.append(fit_grid_point_extended(X, y, base_estimator, parameters, train, test, self.scorer_, self.verbose, **self.fit_params))
#                 fold += 1

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_extras = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            all_extras = list()
            for this_score, parameters, this_n_test_samples, extra in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                all_extras.append(extra)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(_CVScoreTuple(
                parameters,
                score,
                np.array(all_scores)))
            grid_extras.append(all_extras)
        # Store the computed scores
        self.grid_scores_ = grid_scores
        self.extras_ = grid_extras

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Ejemplo n.º 39
0
    def fit(self,
            X,
            y,
            sample_mask=None,
            X_argsorted=None,
            check_input=True,
            sample_weight=None):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32`` for maximum
            efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
            Use ``dtype=np.float64`` and ``order='C'`` for maximum
            efficiency.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted. Splits
            that would create child nodes with net zero or negative weight are
            ignored while searching for a split in each node. In the case of
            classification, splits are also ignored if they would result in any
            single class carrying a negative weight in either child node.

        check_input : boolean, (default=True)
            Allow to bypass several input checking.
            Don't use this parameter unless you know what you do.

        Returns
        -------
        self : object
            Returns self.
        """
        random_state = check_random_state(self.random_state)

        # Deprecations
        if sample_mask is not None:
            warn(
                "The sample_mask parameter is deprecated as of version 0.14 "
                "and will be removed in 0.16.", DeprecationWarning)

        if X_argsorted is not None:
            warn(
                "The X_argsorted parameter is deprecated as of version 0.14 "
                "and will be removed in 0.16.", DeprecationWarning)

        # Convert data
        if check_input:
            X, = check_arrays(X,
                              dtype=DTYPE,
                              sparse_format="dense",
                              check_ccontiguous=True)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)

        if y.ndim == 1:
            # reshape is necessary to preserve the data contiguity against vs
            # [:, np.newaxis] that does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            for k in xrange(self.n_outputs_):
                classes_k, y[:, k] = unique(y[:, k], return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (2**31) - 1 if self.max_depth is None else self.max_depth

        if isinstance(self.max_features, six.string_types):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE
                    or not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(sample_weight,
                                                     dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        # Set min_samples_split sensibly
        min_samples_split = max(self.min_samples_split,
                                2 * self.min_samples_leaf)

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_)

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion, max_features,
                                                self.min_samples_leaf,
                                                random_state)

        self.criterion_ = criterion
        self.splitter_ = splitter
        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_,
                          splitter, max_depth, min_samples_split,
                          self.min_samples_leaf, random_state)

        self.tree_.build(X, y, sample_weight=sample_weight)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
Ejemplo n.º 40
0
    def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True):
        """Build a decision tree from the training set (X, y).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            The training input samples. Use ``dtype=np.float32``
            and ``order='F'`` for maximum efficiency.

        y : array-like, shape = [n_samples] or [n_samples, n_outputs]
            The target values (integers that correspond to classes in
            classification, real numbers in regression).
            Use ``dtype=np.float64`` and ``order='C'`` for maximum
            efficiency.

        sample_mask : array-like, shape = [n_samples], dtype = bool or None
            A bit mask that encodes the rows of ``X`` that should be
            used to build the decision tree. It can be used for bagging
            without the need to create of copy of ``X``.
            If None a mask will be created that includes all samples.

        X_argsorted : array-like, shape = [n_samples, n_features] or None
            Each column of ``X_argsorted`` holds the row indices of ``X``
            sorted according to the value of the corresponding feature
            in ascending order.
            I.e. ``X[X_argsorted[i, k], k] <= X[X_argsorted[j, k], k]``
            for each j > i.
            If None, ``X_argsorted`` is computed internally.
            The argument is supported to enable multiple decision trees
            to share the data structure and to avoid re-computation in
            tree ensembles. For maximum efficiency use dtype np.int32.

        Returns
        -------
        self : object
            Returns self.
        """
        if check_input:
            X, y = check_arrays(X, y)
        self.random_state = check_random_state(self.random_state)

        # set min_samples_split sensibly
        self.min_samples_split = max(self.min_samples_split,
                                     2 * self.min_samples_leaf)

        # Convert data
        if getattr(X, "dtype", None) != DTYPE or \
           X.ndim != 2 or not X.flags.fortran:
            X = array2d(X, dtype=DTYPE, order="F")

        n_samples, self.n_features_ = X.shape

        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)
        if y.ndim == 1:
            y = y[:, np.newaxis]

        self.classes_ = []
        self.n_classes_ = []
        self.n_outputs_ = y.shape[1]

        if is_classification:
            y = np.copy(y)

            for k in xrange(self.n_outputs_):
                unique = np.unique(y[:, k])
                self.classes_.append(unique)
                self.n_classes_.append(unique.shape[0])
                y[:, k] = np.searchsorted(unique, y[:, k])

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        if is_classification:
            criterion = CLASSIFICATION[self.criterion](self.n_outputs_,
                                                       self.n_classes_)
        else:
            criterion = REGRESSION[self.criterion](self.n_outputs_)

        # Check parameters
        max_depth = np.inf if self.max_depth is None else self.max_depth

        if isinstance(self.max_features, basestring):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        else:
            max_features = self.max_features

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if self.min_samples_split <= 0:
            raise ValueError("min_samples_split must be greater than zero.")
        if self.min_samples_leaf <= 0:
            raise ValueError("min_samples_leaf must be greater than zero.")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if self.min_density < 0.0 or self.min_density > 1.0:
            raise ValueError("min_density must be in [0, 1]")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")
        if sample_mask is not None:
            sample_mask = np.asarray(sample_mask, dtype=np.bool)

            if sample_mask.shape[0] != n_samples:
                raise ValueError("Length of sample_mask=%d does not match "
                                 "number of samples=%d" %
                                 (sample_mask.shape[0], n_samples))

        if X_argsorted is not None:
            X_argsorted = np.asarray(X_argsorted, dtype=np.int32, order='F')
            if X_argsorted.shape != X.shape:
                raise ValueError("Shape of X_argsorted does not match "
                                 "the shape of X")

        # Build tree
        self.tree_ = _tree.Tree(self.n_features_, self.n_classes_,
                                self.n_outputs_, criterion, max_depth,
                                self.min_samples_split, self.min_samples_leaf,
                                self.min_density, max_features,
                                self.find_split_, self.random_state)

        self.tree_.build(X,
                         y,
                         sample_mask=sample_mask,
                         X_argsorted=X_argsorted)

        if self.compute_importances:
            self.feature_importances_ = \
                self.tree_.compute_feature_importances()

        return self
Ejemplo n.º 41
0
    def fit(self, X, y, sample_weight=None, neighbours_matrix=None):
        """Build a boosted classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like of shape = [n_samples, n_features]
            The training input samples.

        y : array-like of shape = [n_samples]
            The target values (integers that correspond to classes).

        sample_weight : array-like of shape = [n_samples], optional
            Sample weights. If None, the sample weights are initialized to
            ``1 / n_samples``.

        neighbours_matrix: array-like of shape [n_samples, n_neighbours],
            each row contains indices of signal neighbours
            (neighbours should be computed for background too),
            if None, this matrix is computed.

        Returns
        -------
        self : object
            Returns self.
        """
        if self.smoothing < 0:
            raise ValueError("Smoothing must be non-negative")
        if not isinstance(self.base_estimator, BaseEstimator):
            raise TypeError("estimator must be a subclass of BaseEstimator")
        if self.n_estimators <= 0:
            raise ValueError("n_estimators must be greater than zero.")
        if self.learning_rate <= 0:
            raise ValueError("learning_rate must be greater than zero")

        # Check that algorithm is supported
        if self.algorithm not in ('SAMME', 'SAMME.R'):
            raise ValueError("algorithm %s is not supported"
                             % self.algorithm)

        if self.algorithm == 'SAMME.R':
            if not hasattr(self.base_estimator, 'predict_proba'):
                raise TypeError(
                    "uBoostBDT with algorithm='SAMME.R' requires "
                    "that the weak learner have a predict_proba method.\n"
                    "Please change the base estimator or set "
                    "algorithm='SAMME' instead.")

        assert np.in1d(y, [0, 1]).all(), \
            "only two-class classification is implemented"

        if neighbours_matrix is not None:
            assert np.shape(neighbours_matrix) == (len(X), self.n_neighbors), \
                "Wrong shape of neighbours_matrix"
            self.knn_indices = neighbours_matrix
        else:
            assert self.uniform_variables is not None,\
                "uniform_variables should be set"
            self.knn_indices = computeSignalKnnIndices(
                self.uniform_variables, X, y > 0.5, self.n_neighbors)

        if sample_weight is None:
            # Initialize weights to 1 / n_samples
            sample_weight = np.ones(len(X), dtype=np.float) / len(X)
        else:
            # Normalize existing weights
            assert np.all(sample_weight >= 0.),\
                'the weights should be non-negative'
            sample_weight /= np.sum(sample_weight)

        # Clear any previous fit results
        self.estimators_ = []
        self.estimator_weights_ = []
        # score cuts correspond to
        # global efficiency == target_efficiency on each iteration.
        self.score_cuts_ = []

        X_train_variables = self.get_train_vars(X)
        y = np.ravel(y)
        X_train_variables, y = check_arrays(
            X_train_variables, y, sparse_format="dense")

        # A dictionary to keep all intermediate weights, efficiencies and so on
        if self.keep_debug_info:
            self.debug_dict = defaultdict(list)

        self.random_generator = check_random_state(self.random_state)

        self._boost(X_train_variables, y, sample_weight)

        self.score_cut = compute_bdt_cut(
            self.target_efficiency, y, self.predict_score(X))
        assert abs(self.score_cut - self.score_cuts_[-1] < 1e-4),\
            "score cut doesn't appear to coincide with the staged one"
        assert len(self.estimators_) == len(self.estimator_weights_) == len(self.score_cuts_)
        return self
Ejemplo n.º 42
0
def _check_rows_and_columns(a, b):
    """Unpacks the row and column arrays and checks their shape."""
    a_rows, a_cols = check_arrays(*a)
    b_rows, b_cols = check_arrays(*b)
    return a_rows, a_cols, b_rows, b_cols
Ejemplo n.º 43
0
    def _fit(self, X, y, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

        self.scorer_ = _deprecate_loss_and_score_funcs(self.loss_func,
                                                       self.score_func,
                                                       self.scoring)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
            y = np.asarray(y)
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(n_jobs=self.n_jobs,
                       verbose=self.verbose,
                       pre_dispatch=pre_dispatch)(delayed(fit_grid_point)(
                           X, y, base_estimator, parameters, train, test,
                           self.scorer_, self.verbose, **{
                               'sample_weight': balance_weights(y[train])
                           }) for parameters in parameter_iterable
                                                  for train, test in cv)

        # Out is a list of triplet: score, estimator, n_test_samples
        n_fits = len(out)
        n_folds = len(cv)

        scores = list()
        grid_scores = list()
        for grid_start in range(0, n_fits, n_folds):
            n_test_samples = 0
            score = 0
            all_scores = []
            for this_score, parameters, this_n_test_samples in \
                    out[grid_start:grid_start + n_folds]:
                all_scores.append(this_score)
                if self.iid:
                    this_score *= this_n_test_samples
                    n_test_samples += this_n_test_samples
                score += this_score
            if self.iid:
                score /= float(n_test_samples)
            else:
                score /= float(n_folds)
            scores.append((score, parameters))
            # TODO: shall we also store the test_fold_sizes?
            grid_scores.append(
                _CVScoreTuple(parameters, score, np.array(all_scores)))
        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X,
                                   y,
                                   sample_weight=balance_weights(y),
                                   **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator
        return self
Ejemplo n.º 44
0
def _check_rows_and_columns(a, b):
    """Unpacks the row and column arrays and checks their shape."""
    a_rows, a_cols = check_arrays(*a)
    b_rows, b_cols = check_arrays(*b)
    return a_rows, a_cols, b_rows, b_cols
Ejemplo n.º 45
0
 def _prepare_data_for_fitting(self, X, y, sample_weight):
     """By default the same format used as for trees """
     X = self.get_train_vars(X)
     X, y = check_arrays(X, y, dtype=self.dtype, sparse_format="dense", check_ccontiguous=True)
     return X, y, sample_weight
Ejemplo n.º 46
0
    def _fit(self, X, y, sample_weight, parameter_iterable):
        """Actual fitting, performing the search over parameters."""

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y, sample_weight = check_arrays(X,
                                           y,
                                           sample_weight,
                                           allow_lists=True,
                                           sparse_format='csr')

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
            y = np.asarray(y)

        if sample_weight is not None:
            sample_weight = np.asarray(sample_weight)

        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print(
                    "Fitting {0} folds for each of {1} candidates, totalling"
                    " {2} fits".format(len(cv), n_candidates,
                                       n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        # first fit at each grid point using the maximum n_estimators
        param_grid = self.param_grid.copy()
        param_grid['n_estimators'] = [self.max_n_estimators]
        grid = ParameterGrid(param_grid)

        pre_dispatch = self.pre_dispatch

        clfs = Parallel(n_jobs=self.n_jobs,
                        verbose=self.verbose,
                        pre_dispatch=pre_dispatch)(delayed(fit_grid_point)(
                            base_estimator, clf_params, X, y, sample_weight,
                            train, test, self.verbose, **self.fit_params)
                                                   for clf_params in grid
                                                   for train, test in cv)

        # now use the already fitted ensembles but trancate to N estimators for
        # N from 1 to n_estimators_max - 1 (inclusive)
        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(
                delayed(score_each_boost)
                (clf, clf_params, self.min_n_estimators, X, y, sample_weight,
                 self.score_func, train, test, self.verbose)
                for clf, clf_params, train, test in clfs)

        out = reduce(operator.add, [zip(*stage) for stage in out])
        # out is now a list of triplet: score, estimator_params, n_test_samples

        n_estimators_points = self.max_n_estimators - self.min_n_estimators + 1
        n_fits = len(out)
        n_folds = len(cv)

        grid_scores = list()
        for block in range(0, n_fits, n_folds * n_estimators_points):
            for grid_start in range(block, block + n_estimators_points):
                n_test_samples = 0
                score = 0
                all_scores = list()
                for this_score, parameters, this_n_test_samples in \
                        out[grid_start:
                            grid_start + n_folds * n_estimators_points:
                            n_estimators_points]:
                    all_scores.append(this_score)
                    if self.iid:
                        this_score *= this_n_test_samples
                    score += this_score
                    n_test_samples += this_n_test_samples
                if self.iid:
                    score /= float(n_test_samples)
                else:
                    score /= float(n_folds)
                grid_scores.append(
                    _CVScoreTuple(parameters, score, np.array(all_scores)))

        # Store the computed scores
        self.grid_scores_ = grid_scores

        # Find the best parameters by comparing on the mean validation score:
        # note that `sorted` is deterministic in the way it breaks ties
        best = sorted(grid_scores,
                      key=lambda x: x.mean_validation_score,
                      reverse=True)[0]
        self.best_params_ = best.parameters
        self.best_score_ = best.mean_validation_score

        if self.refit:
            fit_params = self.fit_params
            if sample_weight is not None:
                fit_params = fit_params.copy()
                fit_params['sample_weight'] = sample_weight
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(
                **best.parameters)
            if y is not None:
                best_estimator.fit(X, y, **fit_params)
            else:
                best_estimator.fit(X, **fit_params)
            self.best_estimator_ = best_estimator
        return self
Ejemplo n.º 47
0
    def fit(self, X, y, sample_weight=None):
        shuffler = Shuffler(X, random_state=self.random_state)
        X, y = check_arrays(X,
                            y,
                            dtype=DTYPE,
                            sparse_format="dense",
                            check_ccontiguous=True)
        y = column_or_1d(y, warn=True)
        n_samples = len(X)
        n_inbag = int(self.subsample * n_samples)
        sample_weight = check_sample_weight(
            y, sample_weight=sample_weight).copy()
        self.random_state = check_random_state(self.random_state)

        # skipping all checks
        assert self.update_on in ['all', 'same', 'other', 'random']
        y_pred = numpy.zeros(len(y), dtype=float)

        self.classifiers = []
        self.learning_rates = []
        self.loss_values = []
        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=sample_weight)
        iter_X = shuffler.generate(0.)

        prev_smearing = 1
        for iteration in range(self.n_estimators):
            if iteration % self.recount_step == 0:
                if prev_smearing > 0:
                    iter_smearing = interpolate(self.smearing, iteration,
                                                self.n_estimators)
                    prev_smearing = iter_smearing
                    iter_X = shuffler.generate(iter_smearing)
                    iter_X, = check_arrays(iter_X,
                                           dtype=DTYPE,
                                           sparse_format="dense",
                                           check_ccontiguous=True)
                    y_pred = numpy.zeros(len(y))
                    y_pred += sum(
                        cl.predict(X) * rate for rate, cl in zip(
                            self.learning_rates, self.classifiers))

            self.loss_values.append(
                self.loss(y, y_pred, sample_weight=sample_weight))
            tree = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_depth=interpolate(self.max_depth, iteration,
                                      self.n_estimators),
                min_samples_split=self.min_samples_split,
                min_samples_leaf=interpolate(self.min_samples_leaf,
                                             iteration,
                                             self.n_estimators,
                                             use_log=True),
                max_features=self.max_features,
                random_state=self.random_state)

            sample_mask = _random_sample_mask(n_samples, n_inbag,
                                              self.random_state)
            loss_weight = sample_weight if self.weights_in_loss else numpy.ones(
                len(sample_weight))
            tree_weight = sample_weight if not self.weights_in_loss else numpy.ones(
                len(sample_weight))
            residual = self.loss.negative_gradient(y,
                                                   y_pred,
                                                   sample_weight=loss_weight)

            tree.fit(numpy.array(iter_X)[sample_mask, :],
                     residual[sample_mask],
                     sample_weight=tree_weight[sample_mask],
                     check_input=False)
            # update tree leaves
            if self.update_tree:
                if self.update_on == 'all':
                    update_mask = numpy.ones(len(sample_mask), dtype=bool)
                elif self.update_on == 'same':
                    update_mask = sample_mask
                elif self.update_on == 'other':
                    update_mask = ~sample_mask
                else:  # random
                    update_mask = _random_sample_mask(n_samples, n_inbag,
                                                      self.random_state)
                self.loss.update_terminal_regions(tree.tree_,
                                                  X=iter_X,
                                                  y=y,
                                                  residual=residual,
                                                  pred=y_pred,
                                                  sample_mask=update_mask,
                                                  sample_weight=sample_weight)
            iter_learning_rate = interpolate(self.learning_rate,
                                             iteration,
                                             self.n_estimators,
                                             use_log=True)
            y_pred += iter_learning_rate * tree.predict(X)
            self.classifiers.append(tree)
            self.learning_rates.append(iter_learning_rate)

        return self
Ejemplo n.º 48
0
    def fit(self, X, y=None):
        """Generate a sparse random projection matrix

        Parameters
        ----------
        X : numpy array or scipy.sparse of shape [n_samples, n_features]
            Training set: only the shape is used to find optimal random
            matrix dimensions based on the theory referenced in the
            afore mentioned papers.

        y : is not used: placeholder to allow for usage in a Pipeline.

        Returns
        -------
        self

        """
        X, y = check_arrays(X, y)

        if not sp.issparse(X):
            X = np.atleast_2d(X)

        n_samples, n_features = X.shape

        if self.n_components == 'auto':
            self.n_components_ = johnson_lindenstrauss_min_dim(
                n_samples=n_samples, eps=self.eps)

            if self.n_components_ <= 0:
                raise ValueError(
                    'eps=%f and n_samples=%d lead to a target dimension of '
                    '%d which is invalid' % (
                        self.eps, n_samples, self.n_components_))

            elif self.n_components_ > n_features:
                raise ValueError(
                    'eps=%f and n_samples=%d lead to a target dimension of '
                    '%d which is larger than the original space with '
                    'n_features=%d' % (self.eps, n_samples, self.n_components_,
                                       n_features))
        else:
            if self.n_components <= 0:
                raise ValueError("n_components must be greater than 0, got %s"
                                 % self.n_components_)

            elif self.n_components > n_features:
                warnings.warn(
                    "The number of components is higher than the number of"
                    " features: n_features > n_components (%s > %s)."
                    "The dimensionality of the problem will not be reduced."
                    % (n_features, self.n_components))

            self.n_components_ = self.n_components

        # Generate a projection matrix of size [n_components, n_features]
        self.components_ = self._make_random_matrix(self.n_components_,
                                                    n_features)

        # Check contract
        assert_equal(
            self.components_.shape,
            (self.n_components_, n_features),
            err_msg=('An error has occurred the self.components_ matrix has '
                     ' not the proper shape.'))

        return self