def _fit_resample(self, X, y):
        random_state = check_random_state(self.random_state)

        idx_under = np.empty((0, ), dtype=int)

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                index_target_class = random_state.choice(
                    range(np.count_nonzero(y == target_class)),
                    size=n_samples,
                    replace=self.replacement)
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (idx_under,
                 np.flatnonzero(y == target_class)[index_target_class]),
                axis=0)

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        else:
            return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
    def _fit_resample(self, X, y):
        if self.return_indices:
            deprecate_parameter(self, '0.4', 'return_indices',
                                'sample_indices_')

        random_state = check_random_state(self.random_state)
        target_stats = Counter(y)

        sample_indices = range(X.shape[0])

        for class_sample, num_samples in self.sampling_strategy_.items():
            target_class_indices = np.flatnonzero(y == class_sample)
            indices = random_state.randint(low=0,
                                           high=target_stats[class_sample],
                                           size=num_samples)

            sample_indices = np.append(sample_indices,
                                       target_class_indices[indices])
        self.sample_indices_ = np.array(sample_indices)

        if self.return_indices:
            return (safe_indexing(X, sample_indices),
                    safe_indexing(y, sample_indices), sample_indices)
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices))
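The two private `_fit_resample` methods above back random under- and over-sampling; a hedged usage sketch of the corresponding public imbalanced-learn API (class names assumed from imbalanced-learn, check your installed version):

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# toy imbalanced problem: roughly 90% / 10% class balance
X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)

X_over, y_over = RandomOverSampler(random_state=0).fit_resample(X, y)
X_under, y_under = RandomUnderSampler(random_state=0, replacement=False).fit_resample(X, y)
print(Counter(y), Counter(y_over), Counter(y_under))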
Example 3
def test_safe_indexing_mock_pandas():
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = MockDataFrame(X)
    inds = np.array([1, 2])
    X_df_indexed = safe_indexing(X_df, inds)
    X_indexed = safe_indexing(X, inds)
    assert_array_equal(np.array(X_df_indexed), X_indexed)
Example 4
def test_safe_indexing_axis_0(asarray):
    X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    inds = np.array([1, 2]) if asarray else [1, 2]
    X_inds = safe_indexing(X, inds)
    X_arrays = safe_indexing(np.array(X), inds)
    assert_array_equal(np.array(X_inds), X_arrays)
    assert_array_equal(np.array(X_inds), np.array(X)[inds])
Example 5
def sample_data(data, train_idx, test_idx):
    sample = bunch.Bunch(train=bunch.Bunch(), test=bunch.Bunch(), target_names=None)

    # sample.target_names = data.target_names

    # sample.train.data = safe_indexing(data.train.data,train_idx)
    sample.train.target = safe_indexing(data.train.target,train_idx)
    sample.train.bow = safe_indexing(data.train.bow,train_idx)
    sample.train.remaining = []
    sample.train.validation = []
    sample.train.revisit = []

    sample.train.snippets=safe_indexing(data.train.snippets,train_idx)
    sample.train.sizes=safe_indexing(data.train.sizes,train_idx)
    sample.train.snippet_cost = safe_indexing(data.train.snippet_cost,train_idx)


    if len(test_idx) > 0: #if there are test indexes
        # sample.test.data = safe_indexing(data.train.target,test_idx)
        sample.test.target = safe_indexing(data.train.target,test_idx)
        sample.test.bow = safe_indexing(data.train.bow, test_idx)
        sample.test.snippets = safe_indexing(data.train.snippets, test_idx)
        sample.test.sizes = safe_indexing(data.train.sizes, test_idx)
        sample.test.snippet_cost = safe_indexing(data.train.snippet_cost, test_idx)

    else:
        sample.test = data.test

    return sample.train, sample.test
Example 6
    def generate_train_set(self, train_size=None, test_size=None, rand_state=None):
        """



        :param test_size:
        :param rand_state:
        :param train_size: float or int (default=20)
            If float, should be between 0.0 and 1.0 and represent the
            proportion of the dataset to include in the train split. If
            int, represents the absolute number of train samples.
        :return:
        """
        # self.probe.clear()
        # self.gallery.clear()

        if train_size is None and test_size is None:
            self.probe.files_train, self.probe.files_test = [], self.probe.files
            self.gallery.files_train, self.gallery.files_test = [], self.gallery.files
            self.train_indexes, self.test_indexes = [], list(range(0, len(self.probe.files)))
        else:
            n_samples = len(self.probe.files)
            cv = ShuffleSplit(n_samples, test_size=test_size, train_size=train_size, random_state=rand_state)
            train_indexes, test_indexes = next(iter(cv))
            arrays = [self.probe.files, self.gallery.files]
            self.probe.files_train, self.probe.files_test, self.gallery.files_train, self.gallery.files_test = \
                list(chain.from_iterable((safe_indexing(a, train_indexes),
                                          safe_indexing(a, test_indexes)) for a in arrays))
            self.train_indexes, self.test_indexes = train_indexes, test_indexes

        self.train_size = len(self.train_indexes)
        self.test_size = len(self.test_indexes)
Example 7
    def _split_fit_score_trial(self, X, y, idx=0):
        """
        Splits the dataset, fits a clone of the estimator, then scores it
        according to the required metrics.

        The index of the split is added to the random_state if the
        random_state is not None; this ensures that every split is shuffled
        differently but in a deterministic fashion for testing purposes.
        """
        random_state = self.random_state
        if random_state is not None:
            random_state += idx

        splitter = self._check_cv(self.cv, random_state)

        for train_index, test_index in splitter.split(X, y):
            # Safe indexing handles multiple types of inputs including
            # DataFrames and structured arrays - required for generic splits.
            X_train = safe_indexing(X, train_index)
            y_train = safe_indexing(y, train_index)
            X_test = safe_indexing(X, test_index)
            y_test = safe_indexing(y, test_index)

            model = clone(self.estimator)
            model.fit(X_train, y_train)

            if hasattr(model, "predict_proba"):
                # Get the probabilities for the positive class
                y_scores = model.predict_proba(X_test)[:,1]
            else:
                # Use the decision function to get the scores
                y_scores = model.decision_function(X_test)

            # Compute the curve metrics and thresholds
            curve_metrics = precision_recall_curve(y_test, y_scores)
            precision, recall, thresholds = curve_metrics

            # Compute the F-beta score from precision and recall
            # Don't need to warn for F, precision/recall would have warned
            with np.errstate(divide='ignore', invalid='ignore'):
                beta = self.fbeta ** 2
                f_score = ((1 + beta) * precision * recall /
                   (beta * precision + recall))

            # Ensure thresholds ends at 1
            thresholds = np.append(thresholds, 1)

            # Compute the queue rate
            queue_rate = np.array([
                (y_scores >= threshold).mean()
                for threshold in thresholds
            ])

            yield {
                'thresholds': thresholds,
                'precision': precision,
                'recall': recall,
                'fscore': f_score,
                'queue_rate': queue_rate
            }
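A small worked check of the F-beta expression used above; with fbeta = 1.0 the weight `beta` is 1 and the formula reduces to the usual F1 (values below are illustrative only):

import numpy as np

precision = np.array([0.5, 0.75, 1.0])
recall = np.array([1.0, 0.6, 0.2])

beta = 1.0 ** 2  # fbeta squared, as in the loop above
f_score = (1 + beta) * precision * recall / (beta * precision + recall)
# first entry: 2 * 0.5 * 1.0 / (0.5 + 1.0) = 0.666...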
Example 8
def test_safe_indexing():
    X = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    inds = np.array([1, 2])
    X_inds = safe_indexing(X, inds)
    X_arrays = safe_indexing(np.array(X), inds)
    assert_array_equal(np.array(X_inds), X_arrays)
    assert_array_equal(np.array(X_inds), np.array(X)[inds])
Example 9
    def __data_generation(self, indices_head, indices_tail):
        l = np.random.beta(self.alpha, self.alpha, self.batch_size)
        X_l = l.reshape(self.batch_size, 1, 1, 1)
        y_l = l.reshape(self.batch_size, 1)

        X1_tmp = safe_indexing(self.X, indices_head)
        X2_tmp = safe_indexing(self.X, indices_tail)
        n, _, w, _ = X1_tmp.shape
        X1 = np.zeros((n, w, w, 1))
        X2 = np.zeros((n, w, w, 1))

        for i in range(self.batch_size):
            X1[i] = crop_image(X1_tmp[i])
            X2[i] = crop_image(X2_tmp[i])

        X = X1 * X_l + X2 * (1.0 - X_l)

        y1 = safe_indexing(self.y, indices_head)
        y2 = safe_indexing(self.y, indices_tail)
        y = y1 * y_l + y2 * (1.0 - y_l)

        if self.datagen is not None:
            for i in range(self.batch_size):
                X[i] = self.datagen.random_transform(X[i])
                X[i] = self.datagen.standardize(X[i])

        return X, y
Example 10
def _safe_split(estimator, X, y, indices, train_indices=None):
    """Create subset of dataset and properly handle kernels"""
    from sklearn.gaussian_process.kernels import Kernel as GPKernel

    if (hasattr(estimator, 'kernel') and callable(estimator.kernel)
            and not isinstance(estimator.kernel, GPKernel)):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        if getattr(estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_subset = [X[index] for index in indices]
    else:
        if getattr(estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            if train_indices is None:
                X_subset = X[np.ix_(indices, indices)]
            else:
                X_subset = X[np.ix_(indices, train_indices)]
        else:
            X_subset = safe_indexing(X, indices)

    if y is not None:
        y_subset = safe_indexing(y, indices)
    else:
        y_subset = None
    return X_subset, y_subset
Example 13
def _safe_split(estimator, X, y, indices, train_indices=None):
    """Create subset of dataset and properly handle kernels."""
    if hasattr(estimator, 'kernel') and callable(estimator.kernel):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        if getattr(estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_subset = [X[idx] for idx in indices]
    else:
        if getattr(estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            if train_indices is None:
                X_subset = X[np.ix_(indices, indices)]
            else:
                X_subset = X[np.ix_(indices, train_indices)]
        else:
            X_subset = safe_indexing(X, indices)

    if y is not None:
        y_subset = safe_indexing(y, indices)
    else:
        y_subset = None

    return X_subset, y_subset
Example 14
def _safe_split(depthmaps, offset_points_projected, direction_vectors, true_joints, indices):
    depth_subset = safe_indexing(depthmaps, indices)
    offsets_subset = safe_indexing(offset_points_projected, indices)
    directions_subset = safe_indexing(direction_vectors, indices)
    truths_subset = safe_indexing(true_joints, indices)
    
    return depth_subset, offsets_subset, directions_subset, truths_subset
Example 16
File: sem.py Project: Tbabm/PerRec
def sem_cross_validate(estimator,
                       X,
                       y,
                       scoring,
                       n_splits=10,
                       similarities=None):
    # similarities: doc-doc partial similarities, according to shuffled indexes
    cv = KFold(n_splits=n_splits)
    scores = {}
    result_sims = []
    for key, scorer in scoring.items():
        scores["test_" + key] = []
    for i, (train_idxes, test_idxes) in enumerate(cv.split(X)):
        # create new estimator
        cur_estimator = clone(estimator)
        train_X = safe_indexing(X, train_idxes)
        train_y = safe_indexing(y, train_idxes)
        test_X = safe_indexing(X, test_idxes)
        test_y = safe_indexing(y, test_idxes)
        cur_estimator.fit(train_X, train_y)
        if similarities is None:
            rec_perm_lists = cur_estimator.transform(test_X)
            result_sims.append(cur_estimator.sims_)
        else:
            rec_perm_lists = cur_estimator.transform(test_X, similarities[i])
        for key, scorer in scoring.items():
            cur_score = scorer._sign * scorer._score_func(
                test_y, rec_perm_lists, **scorer._kwargs)
            scores["test_" + key].append(cur_score)
    result_sims = np.array(result_sims)
    return scores, result_sims
Example 17
    def _fit_resample(self, X, y):
        n_samples = X.shape[0]

        # convert y to z_score
        y_z = (y - y.mean()) / y.std()

        index0 = np.arange(n_samples)
        index_negative = index0[y_z > self.negative_thres]
        index_positive = index0[y_z <= self.positive_thres]
        index_unclassified = [
            x for x in index0
            if x not in index_negative and x not in index_positive
        ]

        y_z[index_negative] = 0
        y_z[index_positive] = 1
        y_z[index_unclassified] = -1

        ros = RandomOverSampler(sampling_strategy=self.sampling_strategy,
                                random_state=self.random_state,
                                ratio=self.ratio)
        _, _ = ros.fit_resample(X, y_z)
        sample_indices = ros.sample_indices_

        print("Before sampler: %s. Total after: %s" %
              (Counter(y_z), sample_indices.shape))

        self.sample_indices_ = np.array(sample_indices)

        if self.return_indices:
            return (safe_indexing(X, sample_indices),
                    safe_indexing(y, sample_indices), sample_indices)
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices))
Example 18
    def _fit_resample(self, X, y):
        if self.return_indices:
            deprecate_parameter(self, '0.4', 'return_indices',
                                'sample_indices_')
        random_state = check_random_state(self.random_state)

        idx_under = np.empty((0, ), dtype=int)

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                index_target_class = random_state.choice(
                    range(np.count_nonzero(y == target_class)),
                    size=n_samples,
                    replace=self.replacement)
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (idx_under,
                 np.flatnonzero(y == target_class)[index_target_class]),
                axis=0)

        self.sample_indices_ = idx_under

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
Example 19
    def _fit_resample(self, X, y):
        n_samples = X.shape[0]

        # convert y to z_score
        y_z = (y - y.mean()) / y.std()

        index0 = np.arange(n_samples)
        index_negative = index0[y_z > self.negative_thres]
        index_positive = index0[y_z <= self.positive_thres]
        index_unclassified = [x for x in index0
                              if x not in index_negative
                              and x not in index_positive]

        y_z[index_negative] = 0
        y_z[index_positive] = 1
        y_z[index_unclassified] = -1

        ros = RandomOverSampler(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            ratio=self.ratio)
        _, _ = ros.fit_resample(X, y_z)
        sample_indices = ros.sample_indices_

        print("Before sampler: %s. Total after: %s"
              % (Counter(y_z), sample_indices.shape))

        self.sample_indices_ = np.array(sample_indices)

        if self.return_indices:
            return (safe_indexing(X, sample_indices),
                    safe_indexing(y, sample_indices),
                    sample_indices)
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices))
Example 20
def _safe_split(estimator, X, y, dy, indices, train_indices=None):
    """Create subset of dataset and properly handle kernels.
    Slice X, y according to indices for cross-validation, but take care of
    precomputed kernel-matrices or pairwise affinities / distances.
    If ``estimator._pairwise is True``, X needs to be square and
    we slice rows and columns. If ``train_indices`` is not None,
    we slice rows using ``indices`` (assumed the test set) and columns
    using ``train_indices``, indicating the training set.
    Labels y will always be indexed only along the first axis.
    Parameters
    ----------
    estimator : object
        Estimator to determine whether we should slice only rows or rows and
        columns.
    X : array-like, sparse matrix or iterable
        Data to be indexed. If ``estimator._pairwise is True``,
        this needs to be a square array-like or sparse matrix.
    y : array-like, sparse matrix or iterable
        Targets to be indexed.
    indices : array of int
        Rows to select from X and y.
        If ``estimator._pairwise is True`` and ``train_indices is None``
        then ``indices`` will also be used to slice columns.
    train_indices : array of int or None, default=None
        If ``estimator._pairwise is True`` and ``train_indices is not None``,
        then ``train_indices`` will be use to slice the columns of X.
    Returns
    -------
    X_subset : array-like, sparse matrix or list
        Indexed data.
    y_subset : array-like, sparse matrix or list
        Indexed targets.
    """
    if getattr(estimator, "_pairwise", False):
        if not hasattr(X, "shape"):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        if train_indices is None:
            X_subset = X[np.ix_(indices, indices)]
        else:
            X_subset = X[np.ix_(indices, train_indices)]
    else:
        X_subset = safe_indexing(X, indices)

    if y is not None:
        y_subset = safe_indexing(y, indices)
    else:
        y_subset = None

    if dy is not None:
        dy_subset = safe_indexing(dy, indices)
    else:
        dy_subset = None

    return X_subset, y_subset, dy_subset
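A minimal sketch of the pairwise branch described in the docstring, assuming an estimator that still exposes the legacy `_pairwise` attribute (e.g. SVC(kernel='precomputed') in older scikit-learn releases):

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.svm import SVC

X = np.random.RandomState(0).randn(6, 3)
K = rbf_kernel(X, X)  # square precomputed kernel matrix
train, test = np.array([0, 1, 2, 3]), np.array([4, 5])

est = SVC(kernel='precomputed')
K_train, _, _ = _safe_split(est, K, None, None, train)        # rows and columns sliced with `train`
K_test, _, _ = _safe_split(est, K, None, None, test, train)   # rows from `test`, columns from `train`
print(K_train.shape, K_test.shape)  # (4, 4) (2, 4)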
Example 21
def _safe_split(y, exog, train, test):
    """Performs the CV indexing given the indices"""
    y_train, y_test = y.take(train), y.take(test)
    if exog is None:
        exog_train = exog_test = None
    else:
        exog_train, exog_test = \
            safe_indexing(exog, train), safe_indexing(exog, test)
    return y_train, y_test, exog_train, exog_test
Example 22
def train(excel_file, text_column, labels_column, train_test_idxs_file, n_jobs, model_file, n_accepted_probs, output_file):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [get_local_time_str()]
    torch.manual_seed(RANDOM_STATE)
    device = torch.device(f'cuda:{torch.cuda.current_device()}' \
                          if torch.cuda.is_available() \
                          else 'cpu')
    device_str = f'{device.type}:{device.index} ({torch.cuda.get_device_name(device.index)})' \
                 if device.type == 'cuda' \
                 else device.type
    print(f'Device: {device_str}')
    df = pd.read_excel(excel_file)
    df = df.fillna('NaN')
    corpus = df[text_column].tolist()
    labels = df[labels_column].tolist()
    train_test_idxs = load_json(train_test_idxs_file)
    train_idxs = train_test_idxs['train_idxs']
    test_idxs = train_test_idxs['test_idxs']
    corpus_train = utils.safe_indexing(corpus, train_idxs)
    corpus_test = utils.safe_indexing(corpus, test_idxs)
    y_train = utils.safe_indexing(labels, train_idxs)
    y_test = utils.safe_indexing(labels, test_idxs)
    train_set = BERTTokenizedDataset(corpus_train, y_train)
    val_set = BERTTokenizedDataset(corpus_test, y_test)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, num_workers=n_jobs-1)
    val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, num_workers=n_jobs-1)
    assert train_loader.dataset.classes_ == val_loader.dataset.classes_
    net = BERTNeuralNet(len(val_loader.dataset.classes_), freeze_bert=FREEZE_BERT)
    net.load_state_dict(torch.load(model_file, map_location=device)['model_state_dict'])
    net.additional_layers = nn.Sequential(*list(net.additional_layers.children())[0:-1])
    ft = FeatureExtractor(device, net)
    X_train = ft.extract_features(train_loader, 'X_train.pkl', 'X_train.dat')
    X_test = ft.extract_features(val_loader, 'X_test.pkl', 'X_test.dat')
    clfs = [
        ensemble.RandomForestClassifier(n_estimators=100, n_jobs=n_jobs, random_state=RANDOM_STATE),
        LinearSVC(random_state=RANDOM_STATE),
        dummy.DummyClassifier(strategy='stratified', random_state=RANDOM_STATE, constant=None),
        linear_model.SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3, n_jobs=n_jobs, random_state=RANDOM_STATE)
    ]
    predictions = {'y_true': y_test}
    for clf in tqdm(iterable=clfs, desc='Fitting classifiers', unit='clf'):
        clf.fit(X_train, y_train)
        dump_pickle(clf, '%s.pkl' % (clf.__class__.__name__))
    for clf in tqdm(iterable=clfs, desc='Obtaining probabilities', unit='clf'):
        y_predict_proba = clf.predict_proba(X_test)
        dicts = predict_proba_to_dicts(clf.classes_, y_predict_proba)
        predictions[clf.__class__.__name__] = dicts
    dump_json(predictions, 'predictions.json')
    execution_info['End date'] = [get_local_time_str()]
    execution_info['Excel file'] = [excel_file]
    execution_info['Text column'] = [text_column]
    execution_info['Label column'] = [labels_column]
    execution_info['Accepted probabilities'] = [n_accepted_probs]
    execution_info['Device'] = [device_str]
    execution_info['Base model'] = [model_file]
    execution_info['Batch size'] = [BATCH_SIZE]
    generate_report(execution_info, predictions, output_file)
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        """
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND, self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.ratio_.keys():
                n_samples = self.ratio_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled)
Example 24
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        """
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND, self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled)
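The `_sample` above is the core of imbalanced-learn's ClusterCentroids under-sampler; a hedged sketch of the public API (names assumed from imbalanced-learn):

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)
cc = ClusterCentroids(random_state=0, voting='soft')  # 'soft' keeps the centroids themselves
X_res, y_res = cc.fit_resample(X, y)
print(Counter(y), Counter(y_res))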
def _safe_split(X, y, indices):
    """Create subset of dataset"""
    X_subset = safe_indexing(X, indices)
    if y is not None:
        y_subset = safe_indexing(y, indices)
    else:
        y_subset = None

    return X_subset, y_subset
Example 26
def _fit_score(pipe, param_grid, X, y, train_idx, test_idx, cv_idx):
    """Fit a pipeline and score.

    Parameters
    ----------
    pipe : Estimator
        A scikit-learn pipeline.
    param_grid : ParameterGrid
        A ParameterGrid with all the parameters to try for the pipeline.
    X : ndarray, shape (n_samples, n_features)
        The full dataset.
    y : ndarray, shape (n_samples,)
        The associated target.
    train_idx : ndarray, (n_train_samples,)
        The training indexes.
    test_idx : ndarray, (n_test_samples,)
        The testing indexes.
    cv_idx : int
        The index of the fold.
    Returns
    -------
    cv_results : dict
        A dictionary containing the score and parameters.
    """
    cv_results = defaultdict(list)
    X_train, y_train = safe_indexing(X, train_idx), y[train_idx]
    X_test, y_test = safe_indexing(X, test_idx), y[test_idx]

    for param in param_grid:
        pipe_cv = clone(pipe)
        pipe_cv.set_params(**param)

        try:
            pipe_cv.fit(X_train, y_train)
        except ValueError:
            continue
        y_pred_proba_train = pipe_cv.predict_proba(X_train)
        y_pred_proba_test = pipe_cv.predict_proba(X_test)
        y_pred_train = pipe_cv.predict(X_train)
        y_pred_test = pipe_cv.predict(X_test)

        cv_results['auc_train_score'].append(
            roc_auc_score(y_train, y_pred_proba_train[:, 1]))
        cv_results['auc_test_score'].append(
            roc_auc_score(y_test, y_pred_proba_test[:, 1]))
        cv_results['bacc_train_score'].append(
            balanced_accuracy_score(y_train, y_pred_train))
        cv_results['bacc_test_score'].append(
            balanced_accuracy_score(y_test, y_pred_test))

        cv_results['cv_idx'].append(cv_idx)

        for k, v in param.items():
            cv_results[k].append(v)

    return cv_results
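A hedged sketch of how `_fit_score` could be driven across cross-validation folds; the pipeline, grid and data below are illustrative, and the helpers the function relies on (safe_indexing, roc_auc_score, balanced_accuracy_score) are assumed to be importable in the same module:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, random_state=0)
pipe = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression())])
param_grid = ParameterGrid({'clf__C': [0.1, 1.0, 10.0]})

all_results = []
for cv_idx, (train_idx, test_idx) in enumerate(StratifiedKFold(n_splits=5).split(X, y)):
    all_results.append(_fit_score(pipe, param_grid, X, y, train_idx, test_idx, cv_idx))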
Example 27
def fix_target(classes_, target_: np.ndarray, pred_: np.ndarray):
    # Remap class labels to their positional indices if they are not already 0..k-1.
    if not np.array_equal(classes_, np.arange(len(classes_))):
        for i_, c_ in enumerate(classes_):
            target_[target_ == c_] = -i_
        target_ *= -1

    # Keep only samples with non-negative (mapped) labels.
    valid_idx = np.where(target_ >= 0)[0]
    return safe_indexing(target_, valid_idx), safe_indexing(pred_, valid_idx)
Example 28
def stratify_split(df, y, cats, ratio):
    keys = df[cats]
    if y.dtype.name[:5] != 'float': keys = pd.concat([keys, y], axis=1)
    keys = keys.apply(lambda x: '~'.join([str(j) for j in x.values]), axis=1)

    sss = split_by_cats(train_size=1 - ratio, test_size=ratio)
    train, val = next(sss.split(df, keys))
    x_trn, x_val = safe_indexing(df, train), safe_indexing(df, val)
    y_trn, y_val = safe_indexing(y, train), safe_indexing(y, val)
    return x_trn, y_trn, x_val, y_val
Example 29
def train_test_split3(*arrays, **options):
    """Split arrays or matrices into random train, test and eval subsets
    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data
    into a single call for splitting (and optionally subsampling) data in a
    oneliner.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.
    shuffle : boolean, optional (default=True)
        Whether or not to shuffle the data before splitting.
    test_fold : array-like of int (required)
        Fold labels for each sample. Entries equal to 2 mark the eval
        subset; the remaining entries are passed to ``PredefinedThreeSplit``
        to build the train/test split.
    Returns
    -------
    splitting : list, length=3 * len(arrays)
        List containing train-test-eval split of inputs.
    """

    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")

    random_state = options.pop('random_state', None)
    shuffleresults = options.pop('shuffle', True)
    test_fold = options.pop('test_fold', None)
    if test_fold is None:
        raise TypeError("Parameter test_fold is required.")

    test_fold = np.array(test_fold, dtype=int)
    test_fold = column_or_1d(test_fold)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    evalu=np.where(test_fold==2)[0]
    if shuffleresults:
        rng = check_random_state(random_state)
        rng.shuffle(evalu)
    cv = PredefinedThreeSplit(test_fold=test_fold, shuffle=shuffleresults, random_state=random_state)
    train, test = next(cv.split())

    #print evalu
    if len(evalu)==0:
        return list(chain.from_iterable((safe_indexing(a, train),
                                         safe_indexing(a, test), np.array(0)) for a in arrays))
    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test),
                                     safe_indexing(a, evalu)) for a in arrays))
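A hedged usage sketch: fold label 2 marks the eval subset, and the remaining labels drive the train/test split through the project's own PredefinedThreeSplit (assumed to be importable alongside this function):

import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
test_fold = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2]

X_train, X_test, X_eval, y_train, y_test, y_eval = train_test_split3(
    X, y, test_fold=test_fold, random_state=0)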
Example 30
def test_safe_indexing_pandas():
    try:
        import pandas as pd
    except ImportError:
        raise SkipTest("Pandas not found")
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = pd.DataFrame(X)
    inds = np.array([1, 2])
    X_df_indexed = safe_indexing(X_df, inds)
    X_indexed = safe_indexing(X, inds)
    assert_array_equal(np.array(X_df_indexed), X_indexed)
Example 32
def test_safe_indexing_axis_1_sparse(idx, asarray):
    if isinstance(idx, Iterable) and asarray:
        idx = np.asarray(idx)
    X_true = safe_indexing(X_toy, idx, axis=1)

    # scipy matrix will always return a 2D array
    if X_true.ndim == 1:
        X_true = X_true[:, np.newaxis]

    X_sparse = sp.csc_matrix(X_toy)
    assert_array_equal(safe_indexing(X_sparse, idx, axis=1).toarray(), X_true)
Example 33
def test_safe_indexing_axis_1_pandas(idx_array, idx_df, asarray):
    pd = pytest.importorskip('pandas')
    if asarray and isinstance(idx_array, Iterable):
        idx_array = np.asarray(idx_array)
    if (asarray and
        (not isinstance(idx_df, str) and isinstance(idx_df, Iterable))):
        idx_df = np.asarray(idx_df)

    X_true = safe_indexing(X_toy, idx_array, axis=1)
    X_df = pd.DataFrame(X_toy, columns=['col_{}'.format(i) for i in range(3)])
    assert_array_equal(safe_indexing(X_df, idx_df, axis=1).values, X_true)
Example 34
def split_dataset(dataset):
    X = dataset.drop(y_col, axis=1)
    y = dataset[y_col]
    test_fold = (fold_pattern * (
        (dataset.shape[0] - 1) // len(fold_pattern) + 1))[:dataset.shape[0]]
    splitter = PredefinedSplit(test_fold)
    for train_index, test_index in splitter.split():
        X_train, X_test = safe_indexing(X, train_index), safe_indexing(
            X, test_index)
        y_train, y_test = safe_indexing(y, train_index), safe_indexing(
            y, test_index)
    return X_train, y_train, X_test, y_test
def train(excel_file, text_column, labels_column, train_test_idxs_file, n_jobs,
          n_accepted_probs, output_file):
    execution_info = pd.DataFrame()
    execution_info['Start date'] = [get_local_time_str()]
    df = pd.read_excel(excel_file)
    df = df.fillna('NaN')
    preprocessor = Preprocessor()
    corpus = preprocessor.preprocess(df[text_column])
    dump_json(corpus, 'preprocessed_corpus_ELMo.json')
    labels = df[labels_column].tolist()
    train_test_idxs = load_json(train_test_idxs_file)
    train_idxs = train_test_idxs['train_idxs']
    test_idxs = train_test_idxs['test_idxs']
    corpus_train = utils.safe_indexing(corpus, train_idxs)
    corpus_test = utils.safe_indexing(corpus, test_idxs)
    y_train = utils.safe_indexing(labels, train_idxs)
    y_test = utils.safe_indexing(labels, test_idxs)
    ft = FeatureExtractor()
    X_train = ft.extract_features(corpus_train, 'X_train_ELMo.pkl',
                                  'X_train_ELMo.dat')
    X_test = ft.extract_features(corpus_test, 'X_test_ELMo.pkl',
                                 'X_test_ELMo.dat')
    clfs = [
        ensemble.RandomForestClassifier(n_estimators=100,
                                        n_jobs=n_jobs,
                                        random_state=RANDOM_STATE),
        LinearSVC(random_state=RANDOM_STATE),
        dummy.DummyClassifier(strategy='stratified',
                              random_state=RANDOM_STATE,
                              constant=None),
        linear_model.SGDClassifier(loss='modified_huber',
                                   max_iter=1000,
                                   tol=1e-3,
                                   n_jobs=n_jobs,
                                   random_state=RANDOM_STATE)
    ]
    predictions = {'y_true': y_test}
    for clf in tqdm(iterable=clfs, desc='Fitting classifiers', unit='clf'):
        clf.fit(X_train, y_train)
        dump_pickle(clf, '%s.pkl' % (clf.__class__.__name__))
    for clf in tqdm(iterable=clfs, desc='Obtaining probabilities', unit='clf'):
        y_predict_proba = clf.predict_proba(X_test)
        dicts = predict_proba_to_dicts(clf.classes_, y_predict_proba)
        predictions[clf.__class__.__name__] = dicts
    dump_json(predictions, 'predictions.json')
    execution_info['End date'] = [get_local_time_str()]
    execution_info['Excel file'] = excel_file
    execution_info['Text column'] = text_column
    execution_info['Label column'] = labels_column
    execution_info['n_jobs'] = n_jobs
    execution_info['Accepted probabilities'] = n_accepted_probs
    generate_report(execution_info, predictions, output_file)
Example 36
def generator(X, y, sample_weight, indices, batch_size):
    while True:
        for index in range(0, len(indices), batch_size):
            X_res = safe_indexing(X, indices[index:index + batch_size])
            y_res = safe_indexing(y, indices[index:index + batch_size])
            # `keep_sparse` is a free variable taken from the enclosing function's scope
            if issparse(X_res) and not keep_sparse:
                X_res = X_res.toarray()
            if sample_weight is None:
                yield X_res, y_res
            else:
                sw_res = safe_indexing(sample_weight,
                                       indices[index:index + batch_size])
                yield X_res, y_res, sw_res
Example 37
def test_safe_indexing_1d_array_error(X_constructor):
    # check that we are raising an error if the array-like passed is 1D and
    # we try to index on the 2nd dimension
    X = list(range(5))
    if X_constructor == 'array':
        X_constructor = np.asarray(X)
    elif X_constructor == 'series':
        pd = pytest.importorskip("pandas")
        X_constructor = pd.Series(X)

    err_msg = "'X' should be a 2D NumPy array, 2D sparse matrix or pandas"
    with pytest.raises(ValueError, match=err_msg):
        safe_indexing(X_constructor, [0, 1], axis=1)
Example 39
def davies_bouldin_score_eu_cos(X, deltaX, labels):

    X, labels = check_X_y(X, labels)  # check the shapes of X and labels
    deltaX, labels = check_X_y(deltaX, labels)  # check the shapes of deltaX and labels
    # deltaX = np.diff(X, n = 1, axis = 1)  # first-order difference of X
    le = LabelEncoder()
    labels = le.fit_transform(labels)  # encode labels as integers 0..k-1
    n_samples, _ = X.shape  # number of samples
    n_labels = len(le.classes_)  # number of labels
    check_number_of_labels(n_labels, n_samples)  # validate the number of labels

    intra_dists = np.zeros(n_labels)  # intra-cluster distances
    # cluster centroids
    centroids = np.zeros((n_labels, len(X[0])), dtype=float)
    centroids_delta = np.zeros((n_labels, len(deltaX[0])), dtype=float)

    eu_X = pairwise_distances(X, metric='euclidean')
    eu_max, eu_min = np.max(eu_X), np.min(eu_X)
    cos_X = pairwise_distances(deltaX, metric='cosine')
    cos_max, cos_min = np.max(cos_X), np.min(cos_X)
    scaling_ratio = (eu_max - eu_min) / (cos_max - cos_min)

    for k in range(n_labels):
        cluster_k = safe_indexing(X, labels == k)  # samples with label k (the k-th cluster)
        delta_k = safe_indexing(deltaX, labels == k)  # first-order-difference samples with label k
        centroid = cluster_k.mean(axis=0)  # centroid of cluster k
        centroid_delta = delta_k.mean(axis=0)  # centroid of the first-order differences of cluster k
        centroids[k] = centroid
        centroids_delta[k] = centroid_delta
        # mean distance from the samples of cluster k to its centroid (per metric)
        a = 0.5
        b = 1 - a
        intra_dist_eu = np.average(
            pairwise_distances(cluster_k, [centroid], metric='euclidean'))
        intra_dist_cos = np.average(
            pairwise_distances(delta_k, [centroid_delta], metric='cosine'))
        intra_dists[k] = a * intra_dist_eu + b * intra_dist_cos * scaling_ratio

    # distances between the centroids of different clusters (per metric)
    centroid_distances_eu = pairwise_distances(centroids, metric='euclidean')
    centroid_distances_cos = pairwise_distances(centroids_delta,
                                                metric='cosine')
    centroid_distances = a * centroid_distances_eu + b * centroid_distances_cos * scaling_ratio

    # if intra- or inter-cluster distances are all close to zero, return 0
    if np.allclose(intra_dists, 0) or np.allclose(centroid_distances, 0):
        return 0.0

    score = (intra_dists[:, None] + intra_dists) / centroid_distances
    score[score == np.inf] = np.nan  # convert infinite values to nan
    return np.mean(np.nanmax(score, axis=1))
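A hedged usage sketch on synthetic data; `deltaX` is the first-order difference along the feature axis, as the commented-out line in the function suggests, and the helpers the function uses (check_X_y, check_number_of_labels, pairwise_distances, safe_indexing) are assumed to be imported in its module:

import numpy as np

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(20, 5), rng.randn(20, 5) + 3.0])  # two well-separated clusters
labels = np.array([0] * 20 + [1] * 20)
deltaX = np.diff(X, n=1, axis=1)

score = davies_bouldin_score_eu_cos(X, deltaX, labels)  # lower means better-separated clusters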
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        idx_under : ndarray, shape (n_samples, )
            If `return_indices` is `True`, the indices of the samples that
            were selected are also returned.

        """
        random_state = check_random_state(self.random_state)

        idx_under = np.empty((0, ), dtype=int)

        for target_class in np.unique(y):
            if target_class in self.ratio_.keys():
                n_samples = self.ratio_[target_class]
                index_target_class = random_state.choice(
                    range(np.count_nonzero(y == target_class)),
                    size=n_samples,
                    replace=self.replacement)
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (idx_under,
                 np.flatnonzero(y == target_class)[index_target_class]),
                axis=0)

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        else:
            return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        idx_under : ndarray, shape (n_samples, )
            If `return_indices` is `True`, the indices of the samples that
            were selected are also returned.

        """
        random_state = check_random_state(self.random_state)

        idx_under = np.empty((0, ), dtype=int)

        for target_class in np.unique(y):
            if target_class in self.ratio_.keys():
                n_samples = self.ratio_[target_class]
                index_target_class = random_state.choice(
                    range(np.count_nonzero(y == target_class)),
                    size=n_samples,
                    replace=self.replacement)
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (idx_under, np.flatnonzero(y == target_class)[
                    index_target_class]), axis=0)

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        else:
            return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
    def _sample(self, X, y):
        # FIXME: uncomment in version 0.6
        # self._validate_estimator()

        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = safe_indexing(X, target_class_indices)

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(X_class, y.dtype, class_sample,
                                              X_class, nns, n_samples, 1.0)

            if sparse.issparse(X_new):
                X_resampled = sparse.vstack([X_resampled, X_new])
                sparse_func = 'tocsc' if X.format == 'csc' else 'tocsr'
                X_resampled = getattr(X_resampled, sparse_func)()
            else:
                X_resampled = np.vstack((X_resampled, X_new))
            y_resampled = np.hstack((y_resampled, y_new))

        return X_resampled, y_resampled
def _index_param_value(X, v, indices):
    """Private helper function for parameter value indexing."""
    if not _is_arraylike(v) or _num_samples(v) != _num_samples(X):
        # pass through: skip indexing
        return v
    if sp.issparse(v):
        v = v.tocsr()
    return safe_indexing(v, indices)
Example 44
    def _fit_resample(self, X, y):
        # check for deprecated random_state
        if self.random_state is not None:
            deprecate_parameter(self, '0.4', 'random_state')

        # Find the nearest neighbour of every point
        nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs)
        nn.fit(X)
        nns = nn.kneighbors(X, return_distance=False)[:, 1]

        links = self.is_tomek(y, nns, self.sampling_strategy_)
        idx_under = np.flatnonzero(np.logical_not(links))

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        else:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under))
Example 45
    def extract_param(self, key, x, n):
        if self.cache is not None and (n, key) in self.cache:
            return self.cache[n, key]

        out = safe_indexing(x, self.splits[n][0]) if _is_arraylike(x) else x

        if self.cache is not None:
            self.cache[n, key] = out
        return out
Example 46
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        idx_under : ndarray, shape (n_samples, )
            If `return_indices` is `True`, an array containing the indices
            of the samples that have been selected is also returned.

        """
        # check for deprecated random_state
        if self.random_state is not None:
            deprecate_parameter(self, '0.4', 'random_state')

        # Find the nearest neighbour of every point
        nn = NearestNeighbors(n_neighbors=2, n_jobs=self.n_jobs)
        nn.fit(X)
        nns = nn.kneighbors(X, return_distance=False)[:, 1]

        links = self.is_tomek(y, nns, self.ratio_)
        idx_under = np.flatnonzero(np.logical_not(links))

        if self.return_indices:
            return (safe_indexing(X, idx_under),
                    safe_indexing(y, idx_under),
                    idx_under)
        else:
            return (safe_indexing(X, idx_under),
                    safe_indexing(y, idx_under))
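The `_sample` above corresponds to imbalanced-learn's TomekLinks cleaner; a hedged sketch of the public API (names assumed from imbalanced-learn):

from collections import Counter

from sklearn.datasets import make_classification
from imblearn.under_sampling import TomekLinks

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
X_res, y_res = TomekLinks().fit_resample(X, y)
print(Counter(y), Counter(y_res))  # only samples forming Tomek links are removed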
Example 47
    def __getitem__(self, index):
        X_resampled = safe_indexing(
            self.X, self.indices_[index * self.batch_size:
                                  (index + 1) * self.batch_size])
        y_resampled = safe_indexing(
            self.y, self.indices_[index * self.batch_size:
                                  (index + 1) * self.batch_size])
        if issparse(X_resampled) and not self.keep_sparse:
            X_resampled = X_resampled.toarray()
        if self.sample_weight is not None:
            sample_weight_resampled = safe_indexing(
                self.sample_weight,
                self.indices_[index * self.batch_size:
                              (index + 1) * self.batch_size])

        if self.sample_weight is None:
            return X_resampled, y_resampled
        else:
            return X_resampled, y_resampled, sample_weight_resampled
Example 48
    def _extract(self, X, y, n, is_x=True, is_train=True):
        if self.cache is not None and (n, is_x, is_train) in self.cache:
            return self.cache[n, is_x, is_train]

        inds = self.splits[n][0] if is_train else self.splits[n][1]
        result = safe_indexing(X if is_x else y, inds)

        if self.cache is not None:
            self.cache[n, is_x, is_train] = result
        return result
Example 49
def test_safe_indexing_pandas():
    try:
        import pandas as pd
    except ImportError:
        raise SkipTest("Pandas not found")
    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    X_df = pd.DataFrame(X)
    inds = np.array([1, 2])
    X_df_indexed = safe_indexing(X_df, inds)
    X_indexed = safe_indexing(X, inds)
    assert_array_equal(np.array(X_df_indexed), X_indexed)
    # fun with read-only data in dataframes
    # this happens in joblib memmapping
    X.setflags(write=False)
    X_df_readonly = pd.DataFrame(X)
    with warnings.catch_warnings(record=True):
        X_df_ro_indexed = safe_indexing(X_df_readonly, inds)

    assert_array_equal(np.array(X_df_ro_indexed), X_indexed)
Example 50
def _shuffle(y, groups, random_state):
    """Return a shuffled copy of y eventually shuffle among same groups."""
    if groups is None:
        indices = random_state.permutation(len(y))
    else:
        indices = np.arange(len(groups))
        for group in np.unique(groups):
            this_mask = (groups == group)
            indices[this_mask] = random_state.permutation(indices[this_mask])
    return safe_indexing(y, indices)
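A small self-contained check of the group-wise behaviour described in the docstring:

import numpy as np

rng = np.random.RandomState(0)
y = np.array([1, 2, 3, 4, 5, 6])
groups = np.array([0, 0, 0, 1, 1, 1])

y_shuffled = _shuffle(y, groups, rng)
# values 1-3 stay within the first three positions and 4-6 within the last
# three; each block is permuted independently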
    def _fit_resample(self, X, y):
        if self.return_indices:
            deprecate_parameter(self, '0.4', 'return_indices',
                                'sample_indices_')
        self._validate_estimator()

        random_state = check_random_state(self.random_state)
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        idx_under = np.empty((0, ), dtype=int)

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                # select a sample from the current class
                idx_maj = np.flatnonzero(y == target_class)
                sel_idx_maj = random_state.randint(
                    low=0, high=target_stats[target_class],
                    size=self.n_seeds_S)
                idx_maj_sample = idx_maj[sel_idx_maj]

                minority_class_indices = np.flatnonzero(y == class_minority)
                C_indices = np.append(minority_class_indices, idx_maj_sample)

                # create the set composed of all minority samples and one
                # sample from the current class.
                C_x = safe_indexing(X, C_indices)
                C_y = safe_indexing(y, C_indices)

                # create the set S by removing the seed sample(s), since they
                # are added back to the selection anyway
                idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0)
                S_x = safe_indexing(X, idx_maj_extracted)
                S_y = safe_indexing(y, idx_maj_extracted)
                self.estimator_.fit(C_x, C_y)
                pred_S_y = self.estimator_.predict(S_x)

                S_misclassified_indices = np.flatnonzero(pred_S_y != S_y)
                idx_tmp = idx_maj_extracted[S_misclassified_indices]
                idx_under = np.concatenate(
                    (idx_under, idx_maj_sample, idx_tmp), axis=0)
            else:
                idx_under = np.concatenate(
                    (idx_under, np.flatnonzero(y == target_class)), axis=0)

        X_resampled = safe_indexing(X, idx_under)
        y_resampled = safe_indexing(y, idx_under)

        # apply Tomek cleaning
        tl = TomekLinks(
            sampling_strategy=list(self.sampling_strategy_.keys()))
        X_cleaned, y_cleaned = tl.fit_resample(X_resampled, y_resampled)

        self.sample_indices_ = safe_indexing(idx_under, tl.sample_indices_)
        if self.return_indices:
            return (X_cleaned, y_cleaned, self.sample_indices_)
        return X_cleaned, y_cleaned
    def _fit_resample(self, X, y):
        random_state = check_random_state(self.random_state)
        target_stats = Counter(y)

        sample_indices = range(X.shape[0])

        for class_sample, num_samples in self.sampling_strategy_.items():
            target_class_indices = np.flatnonzero(y == class_sample)
            # draw `num_samples` indices at random, with replacement, from
            # the current class
            indices = random_state.randint(
                low=0, high=target_stats[class_sample], size=num_samples)

            sample_indices = np.append(sample_indices,
                                       target_class_indices[indices])

        if self.return_indices:
            return (safe_indexing(X, sample_indices), safe_indexing(
                    y, sample_indices), sample_indices)
        else:
            return (safe_indexing(X, sample_indices), safe_indexing(
                    y, sample_indices))
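
An end-to-end usage sketch for the random over-sampler whose internals appear above; RandomOverSampler and fit_resample are imbalanced-learn's public API (version 0.4 and later), while the toy data are made up.

import numpy as np
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 8 + [1] * 2)            # imbalanced: 8 vs 2

ros = RandomOverSampler(random_state=0)
X_res, y_res = ros.fit_resample(X, y)      # minority samples duplicated at random

print(Counter(y))       # Counter({0: 8, 1: 2})
print(Counter(y_res))   # Counter({0: 8, 1: 8})
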
Example 53
def _local_parallel_build_trees(sampler, tree, forest, X, y, sample_weight,
                                tree_idx, n_trees, verbose=0,
                                class_weight=None):
    # resample before fitting the tree
    X_resampled, y_resampled = sampler.fit_sample(X, y)
    if sample_weight is not None:
        sample_weight = safe_indexing(sample_weight, sampler.sample_indices_)
    tree = _parallel_build_trees(tree, forest, X_resampled, y_resampled,
                                 sample_weight, tree_idx, n_trees,
                                 verbose=verbose, class_weight=class_weight)
    return sampler, tree
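
A tiny NumPy sketch of the sample-weight bookkeeping performed above: after resampling, the original per-sample weights are re-indexed with the sampler's sample_indices_ so every resampled row keeps its own weight (the arrays are illustrative).

import numpy as np

sample_weight = np.array([0.5, 1.0, 2.0, 1.5])    # one weight per original sample
sample_indices_ = np.array([0, 2, 2, 3])          # e.g. sample 2 was drawn twice

# equivalent of safe_indexing(sample_weight, sampler.sample_indices_)
resampled_weight = sample_weight[sample_indices_]
print(resampled_weight)                           # [0.5 2.  2.  1.5]
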
    def _fit_resample(self, X, y):
        if self.return_indices:
            deprecate_parameter(self, '0.4', 'return_indices',
                                'sample_indices_')
        self._validate_estimator()

        target_stats = Counter(y)
        skf = StratifiedKFold(
            n_splits=self.cv, shuffle=False,
            random_state=self.random_state).split(X, y)
        probabilities = np.zeros(y.shape[0], dtype=float)

        for train_index, test_index in skf:
            X_train = safe_indexing(X, train_index)
            X_test = safe_indexing(X, test_index)
            y_train = safe_indexing(y, train_index)
            y_test = safe_indexing(y, test_index)

            self.estimator_.fit(X_train, y_train)

            probs = self.estimator_.predict_proba(X_test)
            classes = self.estimator_.classes_
            # probability assigned by the classifier to each sample's true class
            probabilities[test_index] = [
                probs[l, np.where(classes == c)[0][0]]
                for l, c in enumerate(y_test)
            ]

        idx_under = np.empty((0, ), dtype=int)

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                # keep the n_samples samples of this class that the
                # cross-validated classifier is most confident about
                threshold = np.percentile(
                    probabilities[y == target_class],
                    (1. - (n_samples / target_stats[target_class])) * 100.)
                index_target_class = np.flatnonzero(
                    probabilities[y == target_class] >= threshold)
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (idx_under,
                 np.flatnonzero(y == target_class)[index_target_class]),
                axis=0)

        self.sample_indices_ = idx_under

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
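
A plain NumPy sketch of the percentile rule used above: per class, the threshold is chosen so that roughly n_samples of the easiest samples (highest cross-validated probability for their true class) are kept; the probabilities below are made up.

import numpy as np

probabilities = np.array([0.9, 0.2, 0.8, 0.4, 0.7, 0.1])  # P(true class) per sample
n_samples, n_total = 3, probabilities.size

threshold = np.percentile(probabilities, (1.0 - n_samples / n_total) * 100.0)
kept = np.flatnonzero(probabilities >= threshold)
print(threshold, kept)   # 0.55 [0 2 4] -> the three most confident samples
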
    def _fit_resample(self, X, y):
        self._validate_estimator()

        idx_under = np.empty((0, ), dtype=int)

        self.nn_.fit(X)

        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                target_class_indices = np.flatnonzero(y == target_class)
                X_class = safe_indexing(X, target_class_indices)
                y_class = safe_indexing(y, target_class_indices)
                nnhood_idx = self.nn_.kneighbors(
                    X_class, return_distance=False)[:, 1:]
                nnhood_label = y[nnhood_idx]
                if self.kind_sel == 'mode':
                    # keep samples whose neighbours' majority label matches
                    # their own label
                    nnhood_label, _ = mode(nnhood_label, axis=1)
                    nnhood_bool = np.ravel(nnhood_label) == y_class
                elif self.kind_sel == 'all':
                    # keep samples whose neighbours all belong to the class
                    nnhood_label = nnhood_label == target_class
                    nnhood_bool = np.all(nnhood_label, axis=1)
                index_target_class = np.flatnonzero(nnhood_bool)
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (idx_under,
                 np.flatnonzero(y == target_class)[index_target_class]),
                axis=0)

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        else:
            return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
Example 56
    def _sample_regular(self, X, y):
        """Resample the dataset using the regular SMOTE implementation.

        Use the regular SMOTE algorithm proposed in [1]_.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        References
        ----------
        .. [1] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer,
           "SMOTE: synthetic minority over-sampling technique," Journal of
           Artificial Intelligence Research, 321-357, 2002.

        """

        X_resampled = X.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in self.ratio_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = safe_indexing(X, target_class_indices)

            self.nn_k_.fit(X_class)
            nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self._make_samples(X_class, class_sample, X_class,
                                              nns, n_samples, 1.0)

            if sparse.issparse(X_new):
                X_resampled = sparse.vstack([X_resampled, X_new])
            else:
                X_resampled = np.vstack((X_resampled, X_new))
            y_resampled = np.hstack((y_resampled, y_new))

        return X_resampled, y_resampled
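
The _make_samples call above is where the synthetic points are created; a minimal sketch of that interpolation step (the helper below is illustrative, not imbalanced-learn's actual implementation): each new sample lies on the segment between a minority sample and one of its nearest minority-class neighbours.

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X_class = rng.uniform(size=(6, 2))           # minority-class samples only

nn = NearestNeighbors(n_neighbors=3).fit(X_class)
nns = nn.kneighbors(X_class, return_distance=False)[:, 1:]   # drop self-neighbour

def make_one_sample(i):
    """x_new = x_i + step * (x_neighbour - x_i) with step ~ U(0, 1)."""
    j = nns[i, rng.randint(nns.shape[1])]    # pick one neighbour at random
    step = rng.uniform()
    return X_class[i] + step * (X_class[j] - X_class[i])

X_new = np.vstack([make_one_sample(rng.randint(len(X_class))) for _ in range(4)])
print(X_new.shape)                           # (4, 2) synthetic samples
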
    def _sample(self, X, y):
        """Resample the dataset.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.

        y : array-like, shape (n_samples,)
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : {ndarray, sparse matrix}, shape \
(n_samples_new, n_features)
            The array containing the resampled data.

        y_resampled : ndarray, shape (n_samples_new,)
            The corresponding label of `X_resampled`

        idx_under : ndarray, shape (n_samples, )
            If `return_indices` is `True`, an array containing the indices of
            the samples which have been selected is also returned.

        """
        self._validate_estimator()

        idx_under = np.empty((0, ), dtype=int)

        self.nn_.fit(X)

        for target_class in np.unique(y):
            if target_class in self.ratio_.keys():
                target_class_indices = np.flatnonzero(y == target_class)
                X_class = safe_indexing(X, target_class_indices)
                y_class = safe_indexing(y, target_class_indices)
                nnhood_idx = self.nn_.kneighbors(
                    X_class, return_distance=False)[:, 1:]
                nnhood_label = y[nnhood_idx]
                if self.kind_sel == 'mode':
                    nnhood_label, _ = mode(nnhood_label, axis=1)
                    nnhood_bool = np.ravel(nnhood_label) == y_class
                elif self.kind_sel == 'all':
                    nnhood_label = nnhood_label == target_class
                    nnhood_bool = np.all(nnhood_label, axis=1)
                index_target_class = np.flatnonzero(nnhood_bool)
            else:
                index_target_class = slice(None)

            idx_under = np.concatenate(
                (idx_under, np.flatnonzero(y == target_class)[
                    index_target_class]), axis=0)

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        else:
            return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
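
A compact NumPy sketch of the two neighbour-vote rules used above ('mode' vs 'all'), with made-up neighbour labels: 'mode' keeps a sample when the majority of its neighbours share its label, 'all' only when every neighbour belongs to the target class.

import numpy as np
from scipy.stats import mode

target_class = 1
y_class = np.array([1, 1, 1])            # labels of the class being cleaned
nnhood_label = np.array([[1, 1, 0],      # neighbour labels of each sample
                         [0, 0, 1],
                         [1, 1, 1]])

# 'mode': majority vote of the neighbours must match the sample's label
keep_mode = np.ravel(mode(nnhood_label, axis=1)[0]) == y_class   # [ True False  True]

# 'all': every neighbour must belong to the target class
keep_all = np.all(nnhood_label == target_class, axis=1)          # [False False  True]

print(np.flatnonzero(keep_mode), np.flatnonzero(keep_all))
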
    def _fit_resample(self, X, y):
        self._validate_estimator()

        if self.voting == 'auto':
            if sparse.issparse(X):
                self.voting_ = 'hard'
            else:
                self.voting_ = 'soft'
        else:
            if self.voting in VOTING_KIND:
                self.voting_ = self.voting
            else:
                raise ValueError("'voting' needs to be one of {}. Got {}"
                                 " instead.".format(VOTING_KIND, self.voting))

        X_resampled, y_resampled = [], []
        for target_class in np.unique(y):
            if target_class in self.sampling_strategy_.keys():
                n_samples = self.sampling_strategy_[target_class]
                self.estimator_.set_params(**{'n_clusters': n_samples})
                self.estimator_.fit(X[y == target_class])
                X_new, y_new = self._generate_sample(
                    X, y, self.estimator_.cluster_centers_, target_class)
                X_resampled.append(X_new)
                y_resampled.append(y_new)
            else:
                target_class_indices = np.flatnonzero(y == target_class)
                X_resampled.append(safe_indexing(X, target_class_indices))
                y_resampled.append(safe_indexing(y, target_class_indices))

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, np.array(y_resampled, dtype=y.dtype)
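
A usage sketch for the centroid-based under-sampler whose _fit_resample appears above; ClusterCentroids and fit_resample are imbalanced-learn's public API, although the default estimator and voting behaviour assumed here can vary between releases.

import numpy as np
from collections import Counter
from imblearn.under_sampling import ClusterCentroids

rng = np.random.RandomState(0)
X = rng.uniform(size=(100, 2))
y = np.array([0] * 90 + [1] * 10)          # imbalanced: 90 vs 10

cc = ClusterCentroids(random_state=0)
X_res, y_res = cc.fit_resample(X, y)       # majority class replaced by cluster centroids

print(Counter(y_res))                      # Counter({0: 10, 1: 10})
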