def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees,
                          verbose=0, class_weight=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)

    return tree
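
For reference, a minimal sketch (assuming scikit-learn's sklearn.utils.class_weight.compute_sample_weight) of the 'balanced' weighting the examples on this page rely on: each sample receives n_samples / (n_classes * count(its class)).

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

y = np.array([0, 0, 0, 1])
w = compute_sample_weight('balanced', y)
# class 0 appears 3 times: 4 / (2 * 3) ~= 0.667; class 1 once: 4 / (2 * 1) = 2.0
print(w)  # [0.667 0.667 0.667 2.0]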
Example #2
def _parallel_build_trees(rotation_matrix, tree, forest, X, y, sample_weight, tree_idx, n_trees,
                          verbose=0, class_weight=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        tree.fit(X.dot(rotation_matrix), y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X.dot(rotation_matrix), y, sample_weight=sample_weight, check_input=False)

    return tree
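
The rotation variant above expects a precomputed rotation_matrix whose source is not shown. As a hypothetical helper (an assumption, not part of the original), one common way to draw a random rotation is the QR decomposition of a Gaussian matrix:

import numpy as np

def random_rotation_matrix(n_features, random_state=None):
    # Hypothetical helper: QR of a Gaussian matrix yields an orthogonal Q;
    # fixing the signs of R's diagonal makes the factorization unique.
    rng = np.random.RandomState(random_state)
    q, r = np.linalg.qr(rng.normal(size=(n_features, n_features)))
    return q * np.sign(np.diag(r))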
Example #3
def fitmodel(model, x_train, y_train, x_test, y_test, batch_size, epochs):
    # print('X Train ', x_train.shape)
    # print(' Y Train ', y_train.shape)

    # print('X Test ', x_test.shape)
    print('Y Test ', y_test.shape)
    print(y_test)
    print(np.unique(y_train))

    cls_weight_dict = [{
        0: 1,
        1: 1
    }, {
        0: 1,
        1: 80
    }]  # two-class mapping of weights
    val_sample_weights = compute_sample_weight(cls_weight_dict, y_test)

    weights = compute_sample_weight(class_weight="balanced", y=y_train)
    #weights = compute_sample_weight(class_weight="None", y=y_train)
    #class_weights = compute_class_weight('balanced', y_train,  y_train)

    history = model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        sample_weight=weights,
        class_weight={
            0: 1.,
            1: 80.
        },
        #class_weight={0: 1., 1: 100.},
        validation_data=(x_test, y_test))

    # model.fit(np.array(x_train), np.array(y_train),
    # batch_size = batch_size,
    # epochs =epochs,
    # validation_data=(np.array(x_test), np.array(y_test)))

    model_structure = model.to_json()
    with open("BugAITwoClass_model.json", "w") as json_file:
        json_file.write(model_structure)

    conf_matrix(history, model, x_test, y_test)
    plotresults(history, y_train)
Example #4
def get_dataloaders(train_ds, valid_ds,
                    batch_size,
                    num_workers,
                    fast=False,
                    train_sizes=None,
                    balance=False,
                    balance_datasets=False,
                    balance_unlabeled=False):
    sampler = None
    weights = None
    num_samples = 0

    if balance_unlabeled:
        labeled_mask = (train_ds.targets != UNLABELED_CLASS).astype(np.uint8)
        weights = compute_sample_weight('balanced', labeled_mask)
        num_samples = int(np.mean(train_sizes))

    if balance:
        weights = compute_sample_weight('balanced', train_ds.targets)
        hist = np.bincount(train_ds.targets)
        min_class_counts = int(min(hist))
        num_classes = len(np.unique(train_ds.targets))
        num_samples = min_class_counts * num_classes

    if balance_datasets:
        assert train_sizes is not None
        dataset_balancing_term = []

        for subset_size in train_sizes:
            full_dataset_size = float(sum(train_sizes))
            dataset_balancing_term.extend([full_dataset_size / subset_size] * subset_size)

        dataset_balancing_term = np.array(dataset_balancing_term)
        if weights is None:
            weights = np.ones(len(train_ds.targets))

        weights = weights * dataset_balancing_term
        num_samples = int(np.mean(train_sizes))

    # If we do balancing, let's go for fixed number of batches (half of dataset)
    if weights is not None:
        sampler = WeightedRandomSampler(weights, num_samples)

    if fast:
        weights = np.ones(len(train_ds))
        sampler = WeightedRandomSampler(weights, 16)

    train_dl = DataLoader(train_ds, batch_size=batch_size,
                          shuffle=sampler is None, sampler=sampler,
                          pin_memory=True, drop_last=True,
                          num_workers=num_workers)
    valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False,
                          pin_memory=True, drop_last=False,
                          num_workers=num_workers)

    return train_dl, valid_dl
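
A minimal sketch of the balancing idea above, assuming only PyTorch and scikit-learn: 'balanced' sample weights fed to a WeightedRandomSampler draw the classes roughly uniformly.

import numpy as np
from torch.utils.data import WeightedRandomSampler
from sklearn.utils.class_weight import compute_sample_weight

targets = np.array([0] * 90 + [1] * 10)                  # 9:1 imbalance
weights = compute_sample_weight('balanced', targets)
sampler = WeightedRandomSampler(weights, num_samples=len(targets))
drawn = np.bincount([targets[i] for i in sampler])
print(drawn)  # roughly [50 50]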
Example #5
    def _parallel_build_trees(self, tree, forest, X, y, sample_weight, tree_idx, n_trees,
                              verbose=0, class_weight=None):
        """
        Private function used to fit a single tree in parallel.

        Copied from sklearn.ensemble.forest and converted to a class method that
        performs undersampling before fitting the single tree.

        :param tree: base_estimator {default=DecisionTreeClassifier()}
        :param forest: self {BalancedRandomForestClassifier object}
        :param X: {array-like, sparse matrix}, shape (n_samples, n_features)
               Matrix containing the training data.
        :param y: array-like, shape (n_samples,)
               Corresponding label for each sample in X.
        :param sample_weight: array-like of shape = [n_samples], optional
               Sample weights.
        :param tree_idx: index for specific tree
        :param n_trees: total number of trees
        :param verbose: int, optional (default=0)
               Controls the verbosity of the building process.
        :param class_weight: dict, list of dicts, "balanced", "balanced_subsample" or None, optional (default=None)
               Weights associated with classes in the form ``{class_label: weight}``.
               If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts
               can be provided in the same order as the columns of y.
        :return: fitted tree
        """
        if verbose > 1:
            print("building tree %d of %d" % (tree_idx + 1, n_trees))

        X_res, y_res, indices = self.rus.fit_sample(X, y)
        if forest.bootstrap:
            n_samples = X_res.shape[0]
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
            else:
                curr_sample_weight = sample_weight[indices]

            indices = _generate_sample_indices(tree.random_state, n_samples)
            sample_counts = np.bincount(indices, minlength=n_samples)
            curr_sample_weight *= sample_counts

            if class_weight == 'subsample':
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore', DeprecationWarning)
                    curr_sample_weight *= compute_sample_weight('auto', y, indices)
            elif class_weight == 'balanced_subsample':
                curr_sample_weight *= compute_sample_weight('balanced', y, indices)

            tree.fit(X_res, y_res, sample_weight=curr_sample_weight, check_input=False)
        else:
            tree.fit(X_res, y_res, sample_weight=sample_weight, check_input=False)

        return tree
Example #6
def getClassifierSampleWeights(dataframe):
    binning = np.linspace(PTMIN, PTMAX, PTBINS + 1)
    digitizedSamples = np.digitize(np.clip(dataframe['TransverseMass'].values,
                                           PTMIN, PTMAX - 1.0),
                                   bins=binning,
                                   right=False).astype(np.float32)
    weights = compute_sample_weight('balanced', digitizedSamples)

    return weights
Example #7
def _parallel_build_trees(tree,
                          forest,
                          X,
                          y,
                          sample_weight,
                          tree_idx,
                          n_trees,
                          verbose=0,
                          class_weight=None,
                          n_samples_bootstrap=None):
    """Private function used to fit a single tree in parallel, adjusted for pipeline trees."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    # name of step of final estimator in pipeline
    estimator = tree.steps[-1][0]

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples,
                                           n_samples_bootstrap)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            with catch_warnings():
                simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        fit_params = {
            f'{estimator}__sample_weight': curr_sample_weight,
            f'{estimator}__check_input': True
        }
        tree.fit(X, y, **fit_params)

    else:
        fit_params = {
            f'{estimator}__sample_weight': sample_weight,
            f'{estimator}__check_input': True
        }
        tree.fit(X, y, **fit_params)

    return tree
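
The step-name routing used above in standalone form: Pipeline.fit forwards any '<step>__<param>' keyword to that step's fit method, which is how sample_weight reaches the tree at the end of the pipeline. A minimal sketch:

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

pipe = Pipeline([('scale', StandardScaler()),
                 ('tree', DecisionTreeClassifier())])
X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
# 'tree__sample_weight' is routed to DecisionTreeClassifier.fit(sample_weight=...)
pipe.fit(X, y, tree__sample_weight=np.array([1.0, 1.0, 2.0, 2.0]))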
Example #8
def _parallel_build_trees(
    tree,
    forest,
    X,
    y,
    sample_weight,
    tree_idx,
    n_trees,
    verbose=0,
    class_weight=None,
    n_samples_bootstrap=None,
):
    """
    Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))  # noqa: T001

    # name of step of final estimator in pipeline
    final_estimator = tree.steps[-1][1]
    final_estimator_name = tree.steps[-1][0]

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(final_estimator.random_state,
                                           n_samples, n_samples_bootstrap)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == "subsample":
            with catch_warnings():
                simplefilter("ignore", DeprecationWarning)
                curr_sample_weight *= compute_sample_weight("auto", y, indices)
        elif class_weight == "balanced_subsample":
            curr_sample_weight *= compute_sample_weight("balanced", y, indices)
        fit_params = {
            f"{final_estimator_name}__sample_weight": curr_sample_weight
        }
        tree.fit(X, y, **fit_params)
    else:
        fit_params = {f"{final_estimator_name}__sample_weight": sample_weight}
        tree.fit(X, y, **fit_params)

    return tree
Example #9
def get_num_samples(samples_path, params, dontcare):
    """
    Function to retrieve number of samples, either from config file or directly from hdf5 file.
    :param samples_path: (str) Path to samples folder
    :param params: (dict) Parameters found in the yaml config file.
    :param dontcare: unused here.
    :return: (dict) number of samples for trn, val and tst.
    """
    num_samples = {'trn': 0, 'val': 0, 'tst': 0}
    weights = []
    samples_weight = None
    for i in ['trn', 'val', 'tst']:
        if get_key_def(f"num_{i}_samples", params['training'], None) is not None:
            num_samples[i] = params['training'][f"num_{i}_samples"]

            with h5py.File(samples_path.joinpath(f"{i}_samples.hdf5"), 'r') as hdf5_file:
                file_num_samples = len(hdf5_file['map_img'])
            if num_samples[i] > file_num_samples:
                raise IndexError(f"The number of training samples in the configuration file ({num_samples[i]}) "
                                 f"exceeds the number of samples in the hdf5 training dataset ({file_num_samples}).")
        else:
            with h5py.File(samples_path.joinpath(f"{i}_samples.hdf5"), "r") as hdf5_file:
                num_samples[i] = len(hdf5_file['map_img'])

        with h5py.File(samples_path.joinpath(f"{i}_samples.hdf5"), "r") as hdf5_file:
            if i == 'trn':
                for x in range(num_samples[i]):
                    label = hdf5_file['map_img'][x]
                    unique_labels = np.unique(label)
                    weights.append(''.join([str(int(i)) for i in unique_labels]))
                samples_weight = compute_sample_weight('balanced', weights)

    return num_samples, samples_weight
Example #10
def get_sample_weight(y_output):
    #news = pd.read_csv(file,encoding='utf-8',header=None)
    #tags = news.ix[:,1:].as_matrix()
    sample_weight = compute_sample_weight('balanced', y_output)
    sample_weight = sample_weight[:, np.newaxis]
    return sample_weight
Example #11
def f1_accuracy(truth, pred):
    avg = 'binary'
    if len(np.unique(truth)) > 2:
        avg = 'weighted'

    wts = compute_sample_weight('balanced', truth)
    return f1_score(truth, pred, average=avg, sample_weight=wts)
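
Usage sketch for f1_accuracy, assuming the scikit-learn imports the function relies on: the balanced weights make minority-class mistakes count as much as majority-class ones.

import numpy as np
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_sample_weight

truth = np.array([0, 0, 0, 0, 1, 1])
pred = np.array([0, 0, 0, 1, 1, 0])
print(f1_accuracy(truth, pred))  # binary F1 with balanced sample weights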
Example #12
def plotresults(history, y_train):

    weights = compute_sample_weight(class_weight="balanced", y=y_train)

    #print("Weights :::", weights)

    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']

    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epoch = range(1, len(acc) + 1)

    plt.plot(epoch, acc, 'bo', label='Training acc')
    plt.plot(epoch, val_acc, 'b', label='Validation acc')

    plt.title('Training and Validation accuracy')
    plt.legend()
    plt.figure()

    plt.plot(epoch, loss, 'bo', label='Training loss')
    plt.plot(epoch, val_loss, 'b', label='Validation loss')
    plt.title('Training and Validation loss')
    plt.legend()
    plt.show()
Example #13
def get_weight_matrix(batch_size=500):
    tags = np.load('./tags.npy', 'r')
    with mini_batch(tags=tags) as mb:
        length = int(np.ceil(len(mb.id_list) / batch_size))
        y_data = pd.DataFrame()
        classes = 2
        for i in range(length):
            mb.get_ordered_ids(batch_size, i * batch_size)
            x_test_batch, y_test_batch = mb.load_mini_batch()
            y_data = pd.concat([y_data, pd.DataFrame(y_test_batch)],
                               ignore_index=True)
            print('batch:', i)
        # tags_num = y_data.shape[1]
        tags_num = len(tags)
        # sample_num = y_data.shape[0]
        weight_matrix = np.zeros(shape=(2, tags_num))
        recip_freq = compute_sample_weight('balanced', y_data)
        recip_freq = np.vstack(recip_freq).T
        for i in range(tags_num):
            recip_freq = np.bincount(y_data.iloc[:, i]) / len(
                y_data.iloc[:, i].values)

            # if recip_freq.shape[1] == classes:
            #     weight_matrix_column = y_data.ix[:,i].replace({0:recip_freq[1],1:recip_freq[0]}).tolist()
            # else:
            #     weight_matrix_column = 1
            # weight_matrix .append(weight_matrix_column)
            weight_matrix[:, i] = recip_freq
            print('tag:', i)
        return weight_matrix
Example #14
def _parallel_build_trees_under(tree,
                                forest,
                                X,
                                y,
                                sample_weight,
                                tree_idx,
                                n_trees,
                                verbose=0,
                                class_weight=None):
    """Private function used to fit a single tree in parallel."""
    if verbose > 1:
        print("building tree %d of %d" % (tree_idx + 1, n_trees))

    # Undersample X and y first
    if forest.undersample is not None:
        rus = RandomUnderSampler(ratio=lambda y: {
            0: int(Counter(y)[0] / forest.undersample),
            1: Counter(y)[1]
        },
                                 return_indices=True)
        X, y, indices_under = rus.fit_sample(X, y)
        if sample_weight is not None:
            sample_weight = sample_weight[indices_under]

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto', y, indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced', y, indices)

        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)

    return tree
Example #15
    def _validate_y_class_weight(self, y):
        check_classification_targets(y)

        y = np.copy(y)
        expanded_class_weight = None

        if self.class_weight is not None:
            y_original = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        y_store_unique_indices = np.zeros(y.shape, dtype=int)
        for k in range(self.n_outputs_):
            classes_k, y_store_unique_indices[:, k] = np.unique(y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
        y = y_store_unique_indices

        if self.class_weight is not None:
            valid_presets = ('auto', 'balanced', 'subsample', 'balanced_subsample')
            if isinstance(self.class_weight, six.string_types):
                if self.class_weight not in valid_presets:
                    raise ValueError('Valid presets for class_weight include '
                                     '"balanced" and "balanced_subsample". Given "%s".'
                                     % self.class_weight)
                if self.class_weight == "subsample":
                    warn("class_weight='subsample' is deprecated in 0.17 and"
                         "will be removed in 0.19. It was replaced by "
                         "class_weight='balanced_subsample' using the balanced"
                         "strategy.", DeprecationWarning)
                if self.warm_start:
                    warn('class_weight presets "balanced" or "balanced_subsample" are '
                         'not recommended for warm_start if the fitted data '
                         'differs from the full dataset. In order to use '
                         '"balanced" weights, use compute_class_weight("balanced", '
                         'classes, y). In place of y you can use a large '
                         'enough sample of the full training set target to '
                         'properly estimate the class frequency '
                         'distributions. Pass the resulting weights as the '
                         'class_weight parameter.')

            if (self.class_weight not in ['subsample', 'balanced_subsample'] or
                    not self.bootstrap):
                if self.class_weight == 'subsample':
                    class_weight = 'auto'
                elif self.class_weight == "balanced_subsample":
                    class_weight = "balanced"
                else:
                    class_weight = self.class_weight
                with warnings.catch_warnings():
                    if class_weight == "auto":
                        warnings.simplefilter('ignore', DeprecationWarning)
                    expanded_class_weight = compute_sample_weight(class_weight,
                                                                  y_original)

        return y, expanded_class_weight
Example #17
def domainAdaptationWeights(dataframe, targetPath):
    target = getSamples([targetPath])
    quantiles = np.linspace(0.0, 1.0, 11)[1:-1]

    totalSampleWeights = np.zeros(dataframe.shape[0])
    for variable in inputVariables:
        minimum = np.min(dataframe.loc[:, variable])
        maximum = np.max(dataframe.loc[:, variable])
        input = dataframe.loc[:, variable]
        clippedTarget = np.clip(target.loc[:, variable], minimum, maximum)
        edges = np.quantile(input, quantiles)
        binning = np.insert(edges, 0, minimum)
        binning = np.append(binning, maximum)
        dataframeIndexed = np.digitize(input, binning, right=True)

        # Fix the difference between digitize and histogram binnings
        dataframeIndexed[dataframeIndexed == 0] = 1
        dataframeIndexed = dataframeIndexed - 1

        binCountsDataframe, _ = np.histogram(input, binning)
        binCountsTarget, _ = np.histogram(clippedTarget, binning)
        binCountsDataframe = binCountsDataframe / dataframe.shape[0]
        binCountsTarget = binCountsTarget / target.shape[0]

        reweightingFactorPerBin = np.zeros(len(binCountsTarget))
        for i in range(len(binCountsTarget)):
            if binCountsTarget[i] != 0:
                reweightingFactorPerBin[
                    i] = binCountsTarget[i] / binCountsDataframe[i]
            else:
                # if the target bin is empty, do not reweight the training samples at all
                reweightingFactorPerBin[i] = 1

        sampleWeights = [reweightingFactorPerBin[i] for i in dataframeIndexed]

        totalSampleWeights = totalSampleWeights + sampleWeights

    # Additionally, small factors to account for the true-fake imbalance in the
    # dataset and the imbalance between different algos
    trueFakeWeights = compute_sample_weight('balanced', dataframe.trk_isTrue)
    algoWeights = compute_sample_weight('balanced', dataframe.trk_algo)

    totalSampleWeights = totalSampleWeights + trueFakeWeights + algoWeights
    return totalSampleWeights
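
The digitize/histogram index alignment handled mid-loop above, shown in isolation:

import numpy as np

edges = np.array([0.0, 1.0, 2.0])
x = np.array([0.0, 0.5, 1.5, 2.0])
idx = np.digitize(x, edges, right=True)  # returns 0 only for x <= edges[0]
idx[idx == 0] = 1
idx = idx - 1                            # now idx matches np.histogram's bins
counts, _ = np.histogram(x, edges)
print(idx, counts)                       # [0 0 1 1] [2 2]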
Example #18
def test_unbalanced_iris():
    # Check class rebalancing.
    unbalanced_X = iris.data[:125]
    unbalanced_y = iris.target[:125]
    sample_weight = compute_sample_weight("balanced", unbalanced_y)

    for name, TreeClassifier in CLF_TREES.items():
        clf = TreeClassifier(random_state=0)
        clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
        assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)
Example #19
def get_balanced_weights(masks_dir):
    mask_paths = glob.glob(os.path.join(masks_dir, '*'))  # collect mask files
    labels = []
    for mask_path in mask_paths:
        mask = fs.read_image_as_is(mask_path)
        unique_labels = np.unique(mask)
        labels.append(''.join([str(int(i)) for i in unique_labels]))

    weights = compute_sample_weight('balanced', labels)
    return weights
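
The trick above balances over combinations of classes present in each mask rather than over pixels; a minimal check of that behaviour:

from sklearn.utils.class_weight import compute_sample_weight

labels = ['01', '01', '0', '012']       # class combinations seen per mask
print(compute_sample_weight('balanced', labels))
# -> [0.667 0.667 1.333 1.333]: the repeated '01' combination is down-weighted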
Example #21
def getAdversarySampleWeights(dataframe):
    binning = np.linspace(PTMIN, PTMAX, PTBINS + 1)
    digitizedSamples = np.digitize(np.clip(dataframe['TransverseMass'].values,
                                           PTMIN, PTMAX - 1.0),
                                   bins=binning,
                                   right=False).astype(np.float32)
    weights = np.ones(dataframe.shape[0])
    weights[dataframe.eventType != 0] = compute_sample_weight(
        'balanced', digitizedSamples[dataframe.eventType != 0])
    weights[dataframe.eventType == 0] = 0

    return weights
Example #22
def _parallel_build_trees(tree,
                          forest,
                          X,
                          y,
                          sample_weight,
                          tree_idx,
                          n_trees,
                          verbose=0,
                          class_weight=None,
                          n_samples_bootstrap=None):

    if forest.bootstrap:
        n_samples = X.shape[0]
        if sample_weight is None:
            curr_sample_weight = np.ones((n_samples, ), dtype=np.float64)
        else:
            curr_sample_weight = sample_weight.copy()

        indices = _generate_sample_indices(tree.random_state, n_samples,
                                           n_samples_bootstrap)
        sample_counts = np.bincount(indices, minlength=n_samples)
        curr_sample_weight *= sample_counts

        if class_weight == 'subsample':
            with catch_warnings():
                simplefilter('ignore', DeprecationWarning)
                curr_sample_weight *= compute_sample_weight('auto',
                                                            y,
                                                            indices=indices)
        elif class_weight == 'balanced_subsample':
            curr_sample_weight *= compute_sample_weight('balanced',
                                                        y,
                                                        indices=indices)

        tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    else:
        tree.fit(X, y, sample_weight=sample_weight, check_input=False)

    return tree
Example #23
    def _validate_y_class_weight(self, y):
        check_classification_targets(y)

        y = np.copy(y)
        expanded_class_weight = None

        if self.class_weight is not None:
            y_original = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        y_store_unique_indices = np.zeros(y.shape, dtype=int)
        for k in range(self.n_outputs_):
            classes_k, y_store_unique_indices[:, k] = \
                np.unique(y[:, k], return_inverse=True)
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
        y = y_store_unique_indices

        if self.class_weight is not None:
            if isinstance(self.class_weight, str):
                valid_presets = ('balanced', 'balanced_subsample')
                if self.class_weight not in valid_presets:
                    raise ValueError('Valid presets for class_weight include '
                                     '"balanced" and "balanced_subsample".'
                                     'Given "%s".'
                                     % self.class_weight)
                if self.warm_start:
                    warn('class_weight presets "balanced" or '
                         '"balanced_subsample" are '
                         'not recommended for warm_start if the fitted data '
                         'differs from the full dataset. In order to use '
                         '"balanced" weights, use compute_class_weight '
                         '("balanced", classes, y). In place of y you can use '
                         'a large enough sample of the full training set '
                         'target to properly estimate the class frequency '
                         'distributions. Pass the resulting weights as the '
                         'class_weight parameter.')

            if (self.class_weight != 'balanced_subsample' or
                    not self.bootstrap):
                if self.class_weight == "balanced_subsample":
                    class_weight = "balanced"
                else:
                    class_weight = self.class_weight
                expanded_class_weight = compute_sample_weight(class_weight,
                                                              y_original)

        return y, expanded_class_weight
Example #24
def mx_compute_sample_weight(y, class_weight={1: 1., 0: .7}):
    """Compute sample weights for more balanced training. For Bandits this is
    applied to the reward. That is, samples that result in rewards are weighted
    more highly than no reward samples."""

    if isinstance(y, mxnet.ndarray.ndarray.NDArray):
        y = y.asnumpy()

    try:
        weights = compute_sample_weight(class_weight=class_weight, y=y)
    except ValueError as e:
        value = int(np.unique(y)[0])
        weights = np.full(y.shape[0], class_weight[value])

    return nd.array(weights)
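
Hypothetical usage, assuming MXNet is installed: reward (class 1) samples keep full weight while no-reward samples are slightly down-weighted.

from mxnet import nd

rewards = nd.array([1, 0, 0, 1, 0])
print(mx_compute_sample_weight(rewards))  # [1.0, 0.7, 0.7, 1.0, 0.7]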
Example #25
    def __fit_one_tree(self, X, y):
        dt = tree.DecisionTreeClassifier(criterion=random.choice(['gini', 'entropy']),
                                         max_depth=self.max_depth,
                                         class_weight=random.choice(['balanced', None]),
                                         splitter=random.choice(['best', 'random'])
                                         )
        # counter += 1
        # if self.verbose >= 2 and counter % 10 == 0:
        #    print('Fitting tree %d of %d ' % (counter, self.n_iter))
        _X_train = X.sample(frac=self.features_fraction, axis=1)
        _y_train = y[_X_train.index]
        dt_fitted = dt.fit(_X_train, _y_train, sample_weight=compute_sample_weight(class_weight='balanced', y=_y_train))
        rules_tuple, rules_set = get_rules_of_decision_tree(dt_fitted, list(_X_train.columns), percent_threshold=self.percent_threshold,
                                                            proportion_threshold=self.proportion_threshold
                                                            )
        rule_list = rules_tuple
        rule_set = rules_set
        return rule_list, rule_set
Example #26
    def _validate_y_class_weight(self, y):
        y = np.copy(y)
        expanded_class_weight = None

        if self.class_weight is not None:
            y_original = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []
        for k in range(self.n_outputs_):
            classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
            # remove the smallest label (assumed to be the same, i.e. the smallest, across all n_outputs)
            self.classes_.append(classes_k[1:])
            self.n_classes_.append(classes_k.shape[0] - 1)

        if self.class_weight is not None:
            valid_presets = ('auto', 'subsample')
            if isinstance(self.class_weight, six.string_types):
                if self.class_weight not in valid_presets:
                    raise ValueError('Valid presets for class_weight include '
                                     '"auto" and "subsample". Given "%s".' %
                                     self.class_weight)
                if self.warm_start:
                    warn('class_weight presets "auto" or "subsample" are '
                         'not recommended for warm_start if the fitted data '
                         'differs from the full dataset. In order to use '
                         '"auto" weights, use compute_class_weight("auto", '
                         'classes, y). In place of y you can use a large '
                         'enough sample of the full training set target to '
                         'properly estimate the class frequency '
                         'distributions. Pass the resulting weights as the '
                         'class_weight parameter.')

            if self.class_weight != 'subsample' or not self.bootstrap:
                if self.class_weight == 'subsample':
                    class_weight = 'auto'
                else:
                    class_weight = self.class_weight
                expanded_class_weight = compute_sample_weight(
                    class_weight, y_original)
        return y, expanded_class_weight
Example #27
def test_balancing():
    train = pd.read_csv('aptos2015_train.csv')
    test = pd.read_csv('aptos2015_test.csv')
    train = pd.concat((train, test), sort=False)

    dataset_size = len(train)
    print(dataset_size)

    x = np.arange(dataset_size)
    y = train['level'].values

    weights = compute_sample_weight('balanced', y)
    weights = np.sqrt(weights)
    # class_weight = compute_class_weight('balanced', np.arange(5), y)
    # weights = class_weight[y]

    sampler = WeightedRandomSampler(weights, dataset_size)
    loader = DataLoader(x, sampler=sampler, batch_size=60)
    hits = np.zeros(dataset_size)

    plt.figure()
    plt.hist(y)
    plt.title('Original distribution')
    plt.show()

    labels = []
    for batch in loader:
        for image in batch:
            label = y[image]
            labels.append(label)
            hits[image] += 1

    plt.figure()
    plt.hist(labels)
    plt.title('Balanced distribution')
    plt.show()

    plt.figure()
    plt.hist(hits)
    plt.title('Hits')
    plt.show()
Example #28
    def _validate_y_class_weight(self, y):

        y = np.copy(y)
        expanded_class_weight = None

        if self.class_weight is not None:
            y_original = np.copy(y)

        self.classes_ = []
        self.n_classes_ = []

        y_store_unique_indices = np.zeros(y.shape, dtype=int)
        for k in range(self.n_outputs_):
            classes_k, y_store_unique_indices[:, k] = np.unique(
                y[:, k], return_inverse=True
            )
            self.classes_.append(classes_k)
            self.n_classes_.append(classes_k.shape[0])
        y = y_store_unique_indices

        if self.class_weight is not None:
            valid_presets = ("balanced", "balanced_subsample")
            if isinstance(self.class_weight, str):
                if self.class_weight not in valid_presets:
                    raise ValueError(
                        "Valid presets for class_weight include "
                        '"balanced" and "balanced_subsample".'
                        'Given "%s".' % self.class_weight
                    )

            if self.class_weight != "balanced_subsample" or not self.bootstrap:
                if self.class_weight == "balanced_subsample":
                    class_weight = "balanced"
                else:
                    class_weight = self.class_weight
                expanded_class_weight = compute_sample_weight(
                    class_weight, y_original
                )

        return y, expanded_class_weight
Example #29
    def create_model_from_training_data(self):
        training_comments = []
        training_ratings = []

        self.logger.info("Training sentiment classifier model..")
        for sentidata in self.training_data:
            comments = preprocess_text(sentidata.text)
            training_comments.append(comments)
            training_ratings.append(sentidata.rating)
        self.logger.info('Text Preprocessing done')

        # Discard stopwords, apply stemming, and discard words present in less than 3 comments
        self.vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem,
                                          sublinear_tf=True,
                                          max_df=0.5,
                                          stop_words=mystop_words,
                                          min_df=3)
        # Saving TFIDF vectors as .pkl file for future use
        X_train = self.vectorizer.fit_transform(training_comments).toarray()
        Y_train = np.array(training_ratings)
        joblib.dump(self.vectorizer, f'{self.models_dir}/tfidf_vectorizer.pkl')
        self.logger.info('TF-IDF vectorization done')

        sample_weights = compute_sample_weight({
            -1: 0.4,
            0: 0.3,
            1: 0.3
        }, Y_train)
        model = XGBClassifier()
        model.fit(X_train, Y_train, sample_weight=sample_weights)

        self.logger.info('Model Training done\n')

        # Saving XGB model as .pkl file for future use
        joblib_file = f'{self.models_dir}/{self.algo}_senti.pkl'
        joblib.dump(model, joblib_file)

        return model
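
compute_sample_weight also accepts an explicit {class_label: weight} dict, as used above; a minimal check of the mapping:

import numpy as np
from sklearn.utils.class_weight import compute_sample_weight

print(compute_sample_weight({-1: 0.4, 0: 0.3, 1: 0.3}, np.array([-1, 0, 1, 1])))
# -> [0.4 0.3 0.3 0.3]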
Example #30
    def __fit_one_tree(self, X, y):
        dt = tree.DecisionTreeClassifier(
            criterion=random.choice(['gini', 'entropy']),
            max_depth=self.max_depth,
            class_weight=random.choice(['balanced', None]),
            splitter=random.choice(['best', 'random']))
        # counter += 1
        # if self.verbose >= 2 and counter % 10 == 0:
        #    print('Fitting tree %d of %d ' % (counter, self.n_iter))
        _X_train = X.sample(frac=self.features_fraction, axis=1)
        _y_train = y[_X_train.index]
        dt_fitted = dt.fit(_X_train,
                           _y_train,
                           sample_weight=compute_sample_weight(
                               class_weight='balanced', y=_y_train))
        rules_tuple, rules_set = get_rules_of_decision_tree(
            dt_fitted,
            list(_X_train.columns),
            percent_threshold=self.percent_threshold,
            proportion_threshold=self.proportion_threshold)
        rule_list = rules_tuple
        rule_set = rules_set
        return rule_list, rule_set
Example #31
def get_sample_matrix(batch_size=500):
    tags = np.load('./tags.npy', 'r')
    with mini_batch(tags=tags) as mb:
        length = int(np.ceil(len(mb.id_list) / batch_size))
        y_data = pd.DataFrame()
        classes = 2
        for i in range(length):
            mb.get_ordered_ids(batch_size, i * batch_size)
            x_test_batch, y_test_batch = mb.load_mini_batch()
            y_data = pd.concat([y_data, pd.DataFrame(y_test_batch)],
                               ignore_index=True)
            print('batch:', i)
        # tags_num = y_data.shape[1]
        tags_num = len(tags)
        # sample_num = y_data.shape[0]
        sample_weight = compute_sample_weight('balanced', y_data)
        sample_weight = sample_weight[:, np.newaxis]
        class_weight = []
        for i in range(tags_num):
            class_weight_tag = compute_class_weight('balanced',
                                                    classes=np.array([0, 1]),
                                                    y=y_data.iloc[:, i])
            class_weight.append(class_weight_tag)
        class_weight = np.vstack(class_weight).T
        return class_weight, sample_weight
Example #32
    def fit(self, X, y, sample_weight=None, check_input=True,
            X_idx_sorted=None):
        random_state = check_random_state(self.random_state)
        if check_input:
            X, y = check_X_y(X, y, dtype=DTYPE, multi_output=False)

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1:
            # reshape is necessary to preserve data contiguity;
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            check_classification_targets(y)
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            if self.class_weight is not None:
                y_original = np.copy(y)

            y_encoded = np.zeros(y.shape, dtype=int)
            for k in range(self.n_outputs_):
                classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                       return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])
            y = y_encoded

            if self.class_weight is not None:
                expanded_class_weight = compute_sample_weight(
                    self.class_weight, y_original)

        else:
            self.classes_ = [None] * self.n_outputs_
            self.n_classes_ = [1] * self.n_outputs_

        self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = ((2 ** 31) - 1 if self.max_depth is None
                     else self.max_depth)

        if isinstance(self.min_samples_split, (numbers.Integral, np.integer)):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s"
                                 % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0. < self.min_samples_split <= 1.:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s"
                                 % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE or
                    not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(
                    sample_weight, dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                         n_samples)
        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion,
                                                random_state)
        self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_)
        builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                        max_depth)
        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)

        if self.n_outputs_ == 1:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
Example #33
    def fit(self,
            X,
            y,
            sample_weight=None,
            check_input=True,
            X_idx_sorted=None):

        random_state = check_random_state(self.random_state)

        if X.dtype != np.uint8:
            msg = "The dtype of `X` should be `np.uint8`, but got {} instead."
            raise RuntimeError(msg.format(X.dtype))

        if check_input:
            # Need to validate separately here.
            # We can't pass multi_output=True because that would allow y to be
            # csr.
            check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
            check_y_params = dict(ensure_2d=False, dtype=None)
            X, y = self._validate_data(X,
                                       y,
                                       validate_separately=(check_X_params,
                                                            check_y_params))

        # Determine output settings
        n_samples, self.n_features_ = X.shape

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1:
            # reshape is necessary to preserve data contiguity;
            # [:, np.newaxis] does not.
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        # `classes_` and `n_classes_` were set by the forest.
        if not hasattr(self, "classes_") and is_classifier(self):
            check_classification_targets(y)
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            if self.class_weight is not None:
                y_original = np.copy(y)

            y_encoded = np.zeros(y.shape, dtype=int)
            for k in range(self.n_outputs_):
                classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                       return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])
            y = y_encoded

            if self.class_weight is not None:
                expanded_class_weight = compute_sample_weight(
                    self.class_weight, y_original)

            self.n_classes_ = np.array(self.n_classes_, dtype=np.int32)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = (np.iinfo(np.int32).max
                     if self.max_depth is None else self.max_depth)

        if isinstance(self.min_samples_leaf, numbers.Integral):
            if not 1 <= self.min_samples_leaf:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = self.min_samples_leaf
        else:  # float
            if not 0.0 < self.min_samples_leaf <= 0.5:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

        if isinstance(self.min_samples_split, numbers.Integral):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s" % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0.0 < self.min_samples_split <= 1.0:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s" % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)

        min_samples_split = max(min_samples_split, 2 * min_samples_leaf)

        if isinstance(self.max_features, str):
            if self.max_features in ["auto", "sqrt"]:
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError("Invalid value for max_features. "
                                 "Allowed string values are 'auto', "
                                 "'sqrt' or 'log2'.")
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, numbers.Integral):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1,
                                   int(self.max_features * self.n_features_))
            else:
                max_features = 0

        self.max_features_ = max_features

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Set min_weight_leaf from min_weight_fraction_leaf
        if sample_weight is None:
            min_weight_leaf = self.min_weight_fraction_leaf * n_samples
        else:
            min_weight_leaf = self.min_weight_fraction_leaf * np.sum(
                sample_weight)

        min_impurity_split = self.min_impurity_split
        if min_impurity_split is not None:
            warnings.warn(
                "The min_impurity_split parameter is deprecated. "
                "Its default value has changed from 1e-7 to 0 in "
                "version 0.23, and it will be removed in 0.25. "
                "Use the min_impurity_decrease parameter instead.",
                FutureWarning,
            )

            if min_impurity_split < 0.0:
                raise ValueError("min_impurity_split must be greater than "
                                 "or equal to 0")
        else:
            min_impurity_split = 0

        if self.min_impurity_decrease < 0.0:
            raise ValueError("min_impurity_decrease must be greater than "
                             "or equal to 0")

        if self.presort != "deprecated":
            warnings.warn(
                "The parameter 'presort' is deprecated and has no "
                "effect. It will be removed in v0.24. You can "
                "suppress this warning by not passing any value "
                "to the 'presort' parameter.",
                FutureWarning,
            )

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classifier(self):
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                         n_samples)

        SPLITTERS = DENSE_SPLITTERS

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](
                criterion,
                self.max_features_,
                min_samples_leaf,
                min_weight_leaf,
                random_state,
            )

        if is_classifier(self):
            self.tree_ = Tree(self.n_features_, self.n_classes_,
                              self.n_outputs_)
        else:
            self.tree_ = Tree(
                self.n_features_,
                # TODO: tree shouldn't need this in this case
                np.array([1] * self.n_outputs_, dtype=np.int32),
                self.n_outputs_,
            )

        builder = DepthFirstTreeBuilder(
            splitter,
            min_samples_split,
            min_samples_leaf,
            min_weight_leaf,
            max_depth,
            self.min_impurity_decrease,
            min_impurity_split,
        )

        builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)

        if self.n_outputs_ == 1 and is_classifier(self):
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        # Only return the essential data for using a tree for prediction
        feature = self.tree_.feature
        threshold = self.tree_.threshold
        children = np.vstack(
            (self.tree_.children_left, self.tree_.children_right)).T
        value = self.tree_.value

        return feature, threshold, children, value
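
# A note on the compact representation returned above: the four arrays
# (feature, threshold, children, value) are all that inference needs. Below is
# a minimal traversal sketch under the assumption that, as in scikit-learn, a
# child index of -1 (TREE_LEAF) marks a leaf; predict_from_arrays is an
# illustrative helper, not part of the example above.
import numpy as np


def predict_from_arrays(feature, threshold, children, value, X):
    """Send each row of X from the root to a leaf; return the leaf values."""
    out = np.empty((X.shape[0],) + value.shape[1:], dtype=value.dtype)
    for i, x in enumerate(X):
        node = 0
        while children[node, 0] != -1:  # -1 (TREE_LEAF) marks a leaf
            if x[feature[node]] <= threshold[node]:
                node = children[node, 0]  # left: feature value <= threshold
            else:
                node = children[node, 1]  # right: feature value > threshold
        out[i] = value[node]
    return out
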
Example #34
    def fit(self,
            X,
            y,
            sample_weight=None,
            check_input=True,
            X_idx_sorted=None):

        feature_weight = self.feature_weight  # get feature_weight from attribute

        random_state = check_random_state(self.random_state)
        if check_input:
            X = check_array(X, dtype=DTYPE, accept_sparse="csc")
            y = check_array(y, ensure_2d=False, dtype=None)
            if issparse(X):
                X.sort_indices()

                if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
                    raise ValueError("No support for np.int64 index based "
                                     "sparse matrices")

        # Determine output settings
        n_samples, self.n_features_ = X.shape
        is_classification = isinstance(self, ClassifierMixin)

        y = np.atleast_1d(y)
        expanded_class_weight = None

        if y.ndim == 1 and is_classification:
            # np.reshape preserves the data's contiguity, unlike indexing
            # with [:, np.newaxis].
            y = np.reshape(y, (-1, 1))

        self.n_outputs_ = y.shape[1]

        if is_classification:
            check_classification_targets(y)
            y = np.copy(y)

            self.classes_ = []
            self.n_classes_ = []

            if self.class_weight is not None:
                y_original = np.copy(y)

            y_encoded = np.zeros(y.shape, dtype=np.intp)
            for k in range(self.n_outputs_):
                classes_k, y_encoded[:, k] = np.unique(y[:, k],
                                                       return_inverse=True)
                self.classes_.append(classes_k)
                self.n_classes_.append(classes_k.shape[0])
            y = y_encoded
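            # e.g. y[:, k] == ['b', 'a', 'b'] yields classes_k == ['a', 'b']
            # and y_encoded[:, k] == [1, 0, 1]; np.unique(return_inverse=True)
            # returns the sorted classes and per-sample indices in one pass.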

            if self.class_weight is not None:
                expanded_class_weight = compute_sample_weight(
                    self.class_weight, y_original)
            self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

        if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
            y = np.ascontiguousarray(y, dtype=DOUBLE)

        # Check parameters
        max_depth = ((2**31) - 1 if self.max_depth is None else self.max_depth)
        max_leaf_nodes = (-1 if self.max_leaf_nodes is None else
                          self.max_leaf_nodes)

        if isinstance(self.min_samples_leaf, (numbers.Integral, np.integer)):
            if not 1 <= self.min_samples_leaf:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = self.min_samples_leaf
        else:  # float
            if not 0. < self.min_samples_leaf <= 0.5:
                raise ValueError("min_samples_leaf must be at least 1 "
                                 "or in (0, 0.5], got %s" %
                                 self.min_samples_leaf)
            min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))

        if isinstance(self.min_samples_split, (numbers.Integral, np.integer)):
            if not 2 <= self.min_samples_split:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the integer %s" % self.min_samples_split)
            min_samples_split = self.min_samples_split
        else:  # float
            if not 0. < self.min_samples_split <= 1.:
                raise ValueError("min_samples_split must be an integer "
                                 "greater than 1 or a float in (0.0, 1.0]; "
                                 "got the float %s" % self.min_samples_split)
            min_samples_split = int(ceil(self.min_samples_split * n_samples))
            min_samples_split = max(2, min_samples_split)

        min_samples_split = max(min_samples_split, 2 * min_samples_leaf)
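        # Each side of a valid split must retain at least min_samples_leaf
        # samples, so a node needs at least 2 * min_samples_leaf samples for a
        # split to be possible at all.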

        if isinstance(self.max_features, str):
            if self.max_features == "auto":
                if is_classification:
                    max_features = max(1, int(np.sqrt(self.n_features_)))
                else:
                    max_features = self.n_features_
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            if self.max_features > 0.0:
                max_features = max(1,
                                   int(self.max_features * self.n_features_))
            else:
                max_features = 0
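        # e.g. with n_features_ == 100: "sqrt" -> 10, "log2" -> 6
        # (int(np.log2(100)) == 6), and a float of 0.25 -> 25 features
        # considered per split.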

        self.max_features_ = max_features

        if len(y) != n_samples:
            raise ValueError("Number of labels=%d does not match "
                             "number of samples=%d" % (len(y), n_samples))
        if not 0 <= self.min_weight_fraction_leaf <= 0.5:
            raise ValueError("min_weight_fraction_leaf must in [0, 0.5]")
        if max_depth <= 0:
            raise ValueError("max_depth must be greater than zero. ")
        if not (0 < max_features <= self.n_features_):
            raise ValueError("max_features must be in (0, n_features]")
        if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
            raise ValueError("max_leaf_nodes must be integral number but was "
                             "%r" % max_leaf_nodes)
        if -1 < max_leaf_nodes < 2:
            raise ValueError(("max_leaf_nodes {0} must be either None "
                              "or larger than 1").format(max_leaf_nodes))

        if sample_weight is not None:
            if (getattr(sample_weight, "dtype", None) != DOUBLE
                    or not sample_weight.flags.contiguous):
                sample_weight = np.ascontiguousarray(sample_weight,
                                                     dtype=DOUBLE)
            if len(sample_weight.shape) > 1:
                raise ValueError("Sample weights array has more "
                                 "than one dimension: %d" %
                                 len(sample_weight.shape))
            if len(sample_weight) != n_samples:
                raise ValueError("Number of weights=%d does not match "
                                 "number of samples=%d" %
                                 (len(sample_weight), n_samples))

        if feature_weight is not None:
            if (getattr(feature_weight, "dtype", None) != DOUBLE
                    or not feature_weight.flags.contiguous):
                feature_weight = np.ascontiguousarray(feature_weight,
                                                      dtype=DOUBLE)
            if len(feature_weight.shape) > 1:
                raise ValueError("Feature weights array has more "
                                 "than one dimension: %d" %
                                 len(feature_weight.shape))
            if len(feature_weight) != self.n_features_:
                raise ValueError("Number of weights=%d does not match "
                                 "number of features=%d" %
                                 (len(feature_weight), self.n_features_))
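        # e.g. a per-feature importance prior over four features
        # (illustrative values only):
        #
        #     feature_weight = np.ascontiguousarray([0.4, 0.3, 0.2, 0.1],
        #                                           dtype=np.float64)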

        if expanded_class_weight is not None:
            if sample_weight is not None:
                sample_weight = sample_weight * expanded_class_weight
            else:
                sample_weight = expanded_class_weight

        # Set min_weight_leaf from min_weight_fraction_leaf
        if sample_weight is None:
            min_weight_leaf = (self.min_weight_fraction_leaf * n_samples)
        else:
            min_weight_leaf = (self.min_weight_fraction_leaf *
                               np.sum(sample_weight))

        if self.min_impurity_split is not None:
            warnings.warn(
                "The min_impurity_split parameter is deprecated and"
                " will be removed in version 0.21. "
                "Use the min_impurity_decrease parameter instead.",
                DeprecationWarning)
            min_impurity_split = self.min_impurity_split
        else:
            min_impurity_split = 1e-7

        if min_impurity_split < 0.:
            raise ValueError("min_impurity_split must be greater than "
                             "or equal to 0")

        if self.min_impurity_decrease < 0.:
            raise ValueError("min_impurity_decrease must be greater than "
                             "or equal to 0")

        # Resolve presort locally rather than mutating the public parameter.
        presort = self.presort
        if presort == 'deprecated':
            presort = 'auto'
        # 'auto' means True if the dataset is dense, otherwise False.
        if presort == 'auto':
            presort = not issparse(X)

        if presort is True and issparse(X):
            raise ValueError("Presorting is not supported for sparse "
                             "matrices.")

        # If multiple trees are built on the same dataset, we only want to
        # presort once. Splitters now can accept presorted indices if desired,
        # but do not handle any presorting themselves. Ensemble algorithms
        # which desire presorting must do presorting themselves and pass that
        # matrix into each tree.
        if X_idx_sorted is None and presort:
            X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
                                             dtype=np.int32)

        if presort and X_idx_sorted.shape != X.shape:
            raise ValueError("The shape of X (X.shape = {}) doesn't match "
                             "the shape of X_idx_sorted (X_idx_sorted"
                             ".shape = {})".format(X.shape,
                                                   X_idx_sorted.shape))
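        # e.g. an ensemble built on the same dense X can share one argsort
        # across all of its trees (sketch; `estimators` is illustrative):
        #
        #     X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
        #                                      dtype=np.int32)
        #     for est in estimators:
        #         est.fit(X, y, X_idx_sorted=X_idx_sorted)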

        # Build tree
        criterion = self.criterion
        if not isinstance(criterion, Criterion):
            if is_classification:
                criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
                                                         self.n_classes_)
            else:
                criterion = CRITERIA_REG[self.criterion](self.n_outputs_,
                                                         n_samples)

        SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS

        splitter = self.splitter
        if not isinstance(self.splitter, Splitter):
            splitter = SPLITTERS[self.splitter](criterion, self.max_features_,
                                                min_samples_leaf,
                                                min_weight_leaf, random_state,
                                                presort)

        if is_classification:
            self.tree_ = Tree(self.n_features_, self.n_classes_,
                              self.n_outputs_)
        else:
            self.tree_ = Tree(self.n_features_,
                              np.array([1] * self.n_outputs_, dtype=np.intp),
                              self.n_outputs_)

        # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
        if max_leaf_nodes < 0:
            builder = DepthFirstTreeBuilder(splitter, min_samples_split,
                                            min_samples_leaf, min_weight_leaf,
                                            max_depth,
                                            self.min_impurity_decrease,
                                            min_impurity_split)
        else:
            builder = BestFirstTreeBuilder(splitter, min_samples_split,
                                           min_samples_leaf, min_weight_leaf,
                                           max_depth, max_leaf_nodes,
                                           self.min_impurity_decrease,
                                           min_impurity_split)

        builder.build(self.tree_, X, y, sample_weight, feature_weight,
                      X_idx_sorted)

        if self.n_outputs_ == 1 and is_classification:
            self.n_classes_ = self.n_classes_[0]
            self.classes_ = self.classes_[0]

        return self
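
# Sketch of driving the feature-weighted fit() above end to end. The estimator
# class name (FeatureWeightedTreeClassifier) and the way feature_weight is
# attached are assumptions; the example only shows that fit() reads
# self.feature_weight and validates it against n_features_.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import mutual_info_classif

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

# One plausible weighting: normalized mutual information with the target.
mi = mutual_info_classif(X, y, random_state=0)
feature_weight = np.ascontiguousarray(mi / mi.sum(), dtype=np.float64)

tree = FeatureWeightedTreeClassifier(random_state=0)  # hypothetical class
tree.feature_weight = feature_weight  # read via self.feature_weight in fit()
tree.fit(X, y)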