Example #1
def test_auto_init(n_samples, n_features, n_classes, n_components):
    # Test that 'auto' chooses the init as expected with every configuration
    # of the relative order of n_samples, n_features, n_classes and n_components.
    rng = np.random.RandomState(42)
    nca_base = NeighborhoodComponentsAnalysis(init='auto',
                                              n_components=n_components,
                                              max_iter=1,
                                              random_state=rng)
    if n_classes >= n_samples:
        pass
        # n_classes > n_samples is impossible, and n_classes == n_samples
        # throws an error from lda but is an absurd case
    else:
        X = rng.randn(n_samples, n_features)
        y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
        if n_components > n_features:
            # this would return a ValueError, which is already tested in
            # test_params_validation
            pass
        else:
            nca = clone(nca_base)
            nca.fit(X, y)
            if n_components <= min(n_classes - 1, n_features):
                nca_other = clone(nca_base).set_params(init='lda')
            elif n_components < min(n_features, n_samples):
                nca_other = clone(nca_base).set_params(init='pca')
            else:
                nca_other = clone(nca_base).set_params(init='identity')
            nca_other.fit(X, y)
            assert_array_almost_equal(nca.components_, nca_other.components_)
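The sizes above would normally be supplied by stacked pytest parametrizations; a hypothetical driver (the value grid below is illustrative, not necessarily the one used in the original suite) could look like this:

import pytest

@pytest.mark.parametrize('n_samples', [3, 5, 7, 11])
@pytest.mark.parametrize('n_features', [3, 5, 7, 11])
@pytest.mark.parametrize('n_classes', [5, 7, 11])
@pytest.mark.parametrize('n_components', [3, 5, 7, 11])
def test_auto_init_grid(n_samples, n_features, n_classes, n_components):
    # delegate to the test function defined above
    test_auto_init(n_samples, n_features, n_classes, n_components)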
Example #2
def calibrate_probs(labels, weights, probs, logistic=False, random_state=11, threshold=0., return_calibrator=False, symmetrize=False):
    """
    Calibrate output to probabilities using 2-folding to calibrate all data
    
    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels 
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1 
    :param logistic: bool, use logistic or isotonic regression
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    
    :return: calibrated probabilities
    """
    labels = (labels > threshold) * 1
    ind = numpy.arange(len(probs))
    ind_1, ind_2 = train_test_split(ind, random_state=random_state, train_size=0.5)
    
    calibrator = LogisticRegression(C=100) if logistic else IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
    est_calib_1, est_calib_2 = clone(calibrator), clone(calibrator)
    probs_1 = probs[ind_1]
    probs_2 = probs[ind_2]
    
    if logistic:
        probs_1 = numpy.clip(probs_1, 0.001, 0.999)
        probs_2 = numpy.clip(probs_2, 0.001, 0.999)
        probs_1 = logit(probs_1)[:, numpy.newaxis]
        probs_2 = logit(probs_2)[:, numpy.newaxis]
        if symmetrize:
            est_calib_1.fit(numpy.r_[probs_1, 1-probs_1], 
                            numpy.r_[labels[ind_1] > 0, labels[ind_1] <= 0])
            est_calib_2.fit(numpy.r_[probs_2, 1-probs_2], 
                            numpy.r_[labels[ind_2] > 0, labels[ind_2] <= 0])
        else:
            est_calib_1.fit(probs_1, labels[ind_1])
            est_calib_2.fit(probs_2, labels[ind_2])
    else:
        if symmetrize:
            est_calib_1.fit(numpy.r_[probs_1, 1-probs_1], 
                            numpy.r_[labels[ind_1] > 0, labels[ind_1] <= 0],
                            numpy.r_[weights[ind_1], weights[ind_1]])
            est_calib_2.fit(numpy.r_[probs_2, 1-probs_2], 
                            numpy.r_[labels[ind_2] > 0, labels[ind_2] <= 0],
                            numpy.r_[weights[ind_2], weights[ind_2]])
        else:
            est_calib_1.fit(probs_1, labels[ind_1], weights[ind_1])
            est_calib_2.fit(probs_2, labels[ind_2], weights[ind_2])
        
    calibrated_probs = numpy.zeros(len(probs))
    if logistic:
        calibrated_probs[ind_1] = est_calib_2.predict_proba(probs_1)[:, 1]
        calibrated_probs[ind_2] = est_calib_1.predict_proba(probs_2)[:, 1]
    else:
        calibrated_probs[ind_1] = est_calib_2.transform(probs_1)
        calibrated_probs[ind_2] = est_calib_1.transform(probs_2)
    if return_calibrator:
        return calibrated_probs, (est_calib_1, est_calib_2)
    else:
        return calibrated_probs
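A minimal usage sketch for calibrate_probs above, on synthetic scores; it assumes the module-level imports the function relies on (numpy, scipy.special.logit, and sklearn's clone, train_test_split, LogisticRegression, IsotonicRegression), and all names below are illustrative:

rng = numpy.random.RandomState(0)
true_labels = rng.choice([0, 1], size=1000)
raw_scores = numpy.clip(0.3 + 0.4 * true_labels + 0.2 * rng.randn(1000), 0.0, 1.0)
event_weights = numpy.ones(len(true_labels))

# two-fold cross-calibration with logistic regression on the logit of the scores
calibrated = calibrate_probs(true_labels, event_weights, raw_scores, logistic=True)
assert calibrated.shape == raw_scores.shape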
Example #3
 def __init__(
     self,
     estimator=LinearSVC(),
     masker=NiftiMasker(),
     labelizer=LabelEncoder(),
     reporter=Reporter(),
     estimated_name="coef_",
 ):
     self.estimator = clone(estimator)
     self.masker = clone(masker)
     self.labelizer = clone(labelizer)
     self.reporter = reporter
     self.estimated_name = estimated_name
Example #4
    def classification_metrics(self, X, y, n_iter=10, test_size=0.25, random_state=0):
        """
        returns the ROC AUC of the classifier (binary only) and the portion of correct predictions, estimated via CV
        @param y: all non-zero will be set to 1
        @param n_iter, test_size: StratifiedShuffleSplit parameters
        @param random_state: random state used for StratifiedShuffleSplit
        @return: roc, accuracy, accuracy_zero, accuracy_one
        """

        roc = 0
        accuracy = 0
        accuracy_zero = 0  # portion of zeros correctly predicted
        accuracy_one = 0  # portion of ones correctly predicted

        y = np.array([0 if d == 0 else 1 for d in y])
        prePipe = clone(self.common_preprocessing_pipe)
        pipeToUse = clone(self.classifier_pipe)
        cvObj = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)

        for trainInds, testInds in cvObj:  # all cv data
            trainX = X[trainInds]
            trainY = y[trainInds]
            testX = X[testInds]
            testY = y[testInds]

            trainX = prePipe.fit_transform(trainX)
            testX = prePipe.transform(testX)
            pipeToUse.fit(trainX, trainY)
            y_scores = pipeToUse.predict_proba(testX)
            y_pred = pipeToUse.predict(testX)

            temp = next((i for i in range(len(testY)) if y_pred[i] == 1), None)

            roc += roc_auc_score(testY, y_scores[:, 1])
            accuracy += sum(y_pred == testY) * 1.0 / len(testY)
            accuracy_zero += 1.0 * sum(np.logical_and(y_pred == testY, testY == 0)) / sum(testY == 0)
            accuracy_one += 1.0 * sum(np.logical_and(y_pred == testY, testY == 1)) / sum(testY == 1)

        roc /= n_iter
        accuracy_zero /= n_iter
        accuracy_one /= n_iter
        accuracy /= n_iter

        print ">>> The classifier has roc = %0.3f, zero-accuracy = %0.3f, " "one-accuracy = %0.3f, overall accuracy = %0.3f." % (
            roc,
            accuracy_zero,
            accuracy_one,
            accuracy,
        )

        return roc, accuracy, accuracy_zero, accuracy_one
Example #5
    def _fit_stage(self, X, y, rmTolerance):
        """
        fits one stage of gradient boosting
        @param X:
        @param y:
        @param rmTolerance: tolerance for 1D optimization
        @return: nothing
        """

        residuals = self.lossFunction.negative_gradient(y, self._currentPrediction)
        trainX, trainY, _, _ = splitTrainTest(X, residuals, 1-self.subsample)   # stochastic boosting. train only on a portion of the data

        if len(np.unique(trainY))==1:
            hm = MajorityPredictor().fit(trainY)
        else:
            cvObj = KFold(n=len(trainX), n_folds=self.cvNumFolds, indices=False, shuffle=True, random_state=self.randomState)

            # find the h that best mimics the negative gradient
            if self.n_jobs > 1:  # parallel
                n_jobs = max(1, self.n_jobs // len(self.learners), self.cvNumFolds)
                # n_jobs = 1
                pool = MyPool(processes=self.n_jobs, initializer=gbjjInit, initargs=(trainX, trainY, self.lossFunction, n_jobs, cvObj))
                temp = pool.map_async(gbjjInnerLoop, self.learners)
                temp.wait()
                h_res = temp.get()
                pool.close()
                pool.join()

            else:   # single thread
                h_res = []

                for learner in self.learners:
                    if self.verbosity >= 2:
                        print('Fitting learner:', learner)
                    l = clone(learner)
                    scores = jjcross_val_score(l, trainX, trainY, score_func=self.lossFunction, n_jobs=1, cv=cvObj)
                    h_res.append(scores.mean())

            hm = clone(self.learners[np.argsort(h_res)[0]])

        if self.verbosity>=1:
            print "The best classifier is", hm.__class__

        # find rm
        hm.fit(trainX, trainY)
        hmx = hm.predict(X)
        rm = minimize_scalar(lambda r: self.lossFunction(y, self._currentPrediction + r*hmx), tol=rmTolerance).x

        # append estimator and weight
        self._estimators.append((hm, rm))
    def fit(self, X, y, sample_weight=None):
        assert isinstance(self.base_estimators, dict), 'Estimators should be passed in a dictionary'
        assert len(X) == len(y), 'the lengths are different'
        assert sample_weight is None or len(sample_weight) == len(y), 'the lengths are different'
        if sample_weight is None:
            sample_weight = numpy.ones(len(y))
        assert self.feature_name in X.columns, 'there is no feature %s' % self.feature_name
        self.columns_order = X.columns

        column = numpy.array(X[self.feature_name])
        self.column_values = list(set(column))
        self.stayed_columns = dict()        # value -> list of columns
        self.common_features = dict()       # (value_from, value_to) -> list of columns
        self.classifiers = dict()           # (value_from, value_to, classifier_name) -> classifier
        self.final_classifiers = dict()     # (value, classifier_name) -> classifier
        rows_dict = dict()                  # (value) -> boolean list of rows
        self.final_columns_orders = dict()  # (value) -> list of features
        for value in self.column_values:
            rows = numpy.array(X[self.feature_name] == value)
            rows_dict[value] = rows
            x_part = X.loc[rows, :]
            cols = pandas.notnull(x_part).all()
            self.stayed_columns[value] = cols[cols==True].keys()

        for value_to, rows_to in rows_dict.items():
            columns_to = self.stayed_columns[value_to]
            new_features = pandas.DataFrame()
            for value_from, rows_from in rows_dict.items():
                if value_from == value_to:
                    continue
                common_columns = list(set(self.stayed_columns[value_from]).union(set(self.stayed_columns[value_to])))
                common_columns.remove(self.feature_name)
                self.common_features[value_from, value_to] = common_columns
                for name, estimator in self.base_estimators.items():
                    rows_from = rows_dict[value_from]
                    new_classifier = sklearn.clone(estimator)\
                        .fit(X.loc[rows_from, common_columns], y[rows_from], sample_weight=sample_weight[rows_from])

                    self.classifiers[value_from, value_to, name] = new_classifier
                    new_feature = new_classifier.predict_proba(X.loc[rows_to, common_columns])[:, 1]
                    new_features[str(value_from) + "_" + name] = new_feature
            X_to_part = X.loc[rows_to, columns_to]
            new_features = new_features.set_index(X_to_part.index)
            X_to_part = pandas.concat([X_to_part, new_features], axis=1)
            final_classifier = sklearn.clone(self.final_estimator)
            final_classifier.fit(X_to_part, y[rows_to], sample_weight=sample_weight[rows_to])
            self.final_columns_orders[value_to] = X_to_part.columns
            self.final_classifiers[value_to] = final_classifier
        return self
Example #7
    def fit(self, original, target, original_weight=None, target_weight=None):
        """
        Prepare reweighting formula by training a sequence of trees.

        :param original: values from original distribution, array-like of shape [n_samples, n_features]
        :param target: values from target distribution, array-like of shape [n_samples, n_features]
        :param original_weight: weights for samples of the original distribution
        :param target_weight: weights for samples of the target distribution
        :return: self
        """
        original, original_weight = self._normalize_input(original, original_weight, normalize=False)
        target, target_weight = self._normalize_input(target, target_weight, normalize=False)

        folds_original = self._get_folds_column(len(original))
        folds_target = self._get_folds_column(len(target))
        for _ in range(self.n_folds):
            self.reweighters.append(clone(self.base_reweighter))

        original = numpy.array(original)
        target = numpy.array(target)

        for i in range(self.n_folds):
            self.reweighters[i].fit(original[folds_original != i, :], target[folds_target != i, :],
                                    original_weight=original_weight[folds_original != i],
                                    target_weight=target_weight[folds_target != i])
        self.train_length = len(original)
        return self
    def train(self,num_examples,deltas=list(range(1,6)),use_transformations=False,use_weights=True,verbosity=0):
        '''Train ensemble of classifiers using newly generated data for every
        member of the ensemble.

        '''
        time_start = time.time()
        
        self.classifiers = dict()

        for delta in deltas:
            self.classifiers[delta] = [] # list for ensemble of classifiers
            for i in range(self.n_estimators):
                # base classifier
                clf = clone(self.base_classifier)
                
                if use_weights:
                    train_x,train_y,train_w = self.make_weighted_training_data(create_examples(num_examples=num_examples,deltas=[delta]),use_transformations=use_transformations)
                    if verbosity>1:
                        print ('delta={0}, #{1}: training with {2} weighted examples'.format(delta,i,len(train_x)))
                    # fit
                    clf.fit(train_x,train_y,sample_weight=train_w)
                else:
                    train_x,train_y = self.make_training_data(create_examples(num_examples=num_examples,deltas=[delta]),use_transformations=use_transformations)
                    if verbosity>1:
                        print ('delta={0}, #{1}: training with {2} examples'.format(delta,i,len(train_x)))
                    clf.fit(train_x,train_y)
                    
                self.classifiers[delta].append(clf)
        
        time_end = time.time()
        if verbosity>0:
            print('training completed in {0:.1f} seconds'.format(time_end-time_start))
    def train(self, examples, use_transformations=False,use_weights=True):
        time_start = time.time()

        self.classifiers = dict()
        
        deltas = set([e.delta for e in examples])
        
        for delta in deltas:
            # create classifier with same params as base classifier
            clf = clone(self.base_classifier)
            
            if use_weights:
                # weighted training data for current delta
                (train_x,train_y,train_w) = self.make_weighted_training_data([e for e in examples if e.delta==delta],use_transformations=use_transformations) 
                print ('delta={0}, training with {1} weighted examples'.format(delta,len(train_x)))
                # fit
                clf.fit(train_x,train_y,train_w)

            else:
                # training data for current delta            
                (train_x,train_y) = self.make_training_data([e for e in examples if e.delta==delta],use_transformations=use_transformations)            
                print ('delta={0}, training with {1} examples'.format(delta,len(train_x)))
                # fit
                clf.fit(train_x,train_y)

            # store
            self.classifiers[delta] = clf

        time_end = time.time()
        print('training completed in {0} seconds'.format(time_end-time_start))
def test_no_attributes_set_in_init(estimator, preprocessor):
  """Check setting during init. Adapted from scikit-learn."""
  estimator = clone(estimator)
  estimator.set_params(preprocessor=preprocessor)
  if hasattr(type(estimator).__init__, "deprecated_original"):
      return

  init_params = _get_args(type(estimator).__init__)
  parents_init_params = [param for params_parent in
                         (_get_args(parent) for parent in
                          type(estimator).__mro__)
                         for param in params_parent]

  # Test for no setting apart from parameters during init
  invalid_attr = (set(vars(estimator)) - set(init_params) -
                  set(parents_init_params))
  assert not invalid_attr, \
      ("Estimator %s should not set any attribute apart"
       " from parameters during init. Found attributes %s."
       % (type(estimator).__name__, sorted(invalid_attr)))
  # Ensure that each parameter is set in init
  invalid_attr = (set(init_params) - set(vars(estimator)) -
                  set(["self"]))
  assert not invalid_attr, \
      ("Estimator %s should store all parameters"
       " as an attribute during init. Did not find "
       "attributes %s." % (type(estimator).__name__, sorted(invalid_attr)))
def test_various_scoring_on_tuples_learners(estimator, build_dataset,
                                            with_preprocessor):
  """Tests that scikit-learn's scoring returns something finite,
  for other scoring than default scoring. (List of scikit-learn's scores can be
  found in sklearn.metrics.scorer). For each type of output (predict,
  predict_proba, decision_function), we test a bunch of scores.
  We only test on pairs learners because quadruplets don't have a y argument.
  """
  input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
  estimator = clone(estimator)
  estimator.set_params(preprocessor=preprocessor)
  set_random_state(estimator)

  # scores that need a predict function: every tuples learner should have a
  # predict function (whether the pair is of positive samples or negative
  # samples)
  for scoring in ['accuracy', 'f1']:
    check_score_is_finite(scoring, estimator, input_data, labels)
  # scores that need a predict_proba:
  if hasattr(estimator, "predict_proba"):
    for scoring in ['neg_log_loss', 'brier_score']:
      check_score_is_finite(scoring, estimator, input_data, labels)
  # scores that need a decision_function: every tuples learner should have a
  # decision function (the metric between points)
  for scoring in ['roc_auc', 'average_precision', 'precision', 'recall']:
    check_score_is_finite(scoring, estimator, input_data, labels)
Example #12
def best_pipelines_by_algo(grid, algo_tag = 'regressor__algorithm', bigger_is_better=True):
    results = []

    if bigger_is_better:
        best_mean = -np.inf
        def is_better(x, y): return x > y
    else:
        best_mean = np.inf
        def is_better(x, y): return x < y

    best_params_by_algo = {}
    for params, mean, std in grid.grid_scores_:
        # print(params)
        algo_type = type(params[algo_tag])
        if algo_type not in best_params_by_algo:
            best_params_by_algo[algo_type] = (mean, params)
        else:
            best_mean = best_params_by_algo[algo_type][0]
            if is_better(mean, best_mean):
                best_params_by_algo[algo_type] = (mean, params)

    for algo, (mean, params) in best_params_by_algo.items():
        new_estimator = clone(grid.estimator)
        new_estimator.set_params(**params)
        results.append(new_estimator)

    return results
def test_get_metric_works_does_not_raise(estimator, build_dataset):
  """Tests that the metric returned by get_metric does not raise errors (or
  warnings) similarly to the distance functions in scipy.spatial.distance"""
  input_data, labels, _, X = build_dataset()
  model = clone(estimator)
  set_random_state(model)
  model.fit(input_data, labels)
  metric = model.get_metric()

  list_test_get_metric_doesnt_raise = [(X[0], X[1]),
                                       (X[0].tolist(), X[1].tolist()),
                                       (X[0][None], X[1][None])]

  for u, v in list_test_get_metric_doesnt_raise:
    with pytest.warns(None) as record:
      metric(u, v)
    assert len(record) == 0

  # Test that the scalar case works
  model.transformer_ = np.array([3.1])
  metric = model.get_metric()
  for u, v in [(5, 6.7), ([5], [6.7]), ([[5]], [[6.7]])]:
    with pytest.warns(None) as record:
      metric(u, v)
    assert len(record) == 0
Example #14
def param_search(estimator, param_dict, n_iter=None, seed=None):
    """
    Returns a list of cloned copies of `estimator`, each set with parameters
    as specified by `param_dict`. `param_dict` can contain either lists
    of parameter values (grid search) or a scipy distribution function
    to be sampled from. If distributions are given, you must specify `n_iter`.

    Parameters:
    ___________

    estimator:
        sklearn-like estimator

    param_dict:
        dict of parameter name: values, where values can be an iterable
        or a distribution function

    n_iter:
        number of draws to take from parameter distributions

    seed:
        random state passed to ParameterSampler when sampling distributions
    """

    if n_iter is None:
        param_iter = ParameterGrid(param_dict)
    else:
        param_iter = ParameterSampler(param_dict,
                                      n_iter,
                                      random_state=seed)

    estimators = []
    for params in param_iter:
        new_estimator = sklearn.clone(estimator)
        new_estimator.set_params(**params)
        estimators.append(new_estimator)
    return estimators
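A quick illustration of param_search with both a grid and a sampled distribution (the estimator and parameter names are only examples):

from scipy.stats import uniform
from sklearn.linear_model import Ridge

# exhaustive grid: one cloned Ridge per alpha value
grid_models = param_search(Ridge(), {'alpha': [0.1, 1.0, 10.0]})
# random search: five draws from a uniform distribution over alpha
sampled_models = param_search(Ridge(), {'alpha': uniform(0, 10)}, n_iter=5, seed=0)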
 def _fit_best_estimator(self, X, y, sample_weight=None):
     # Training classifier once again
     self.best_estimator_ = sklearn.clone(self.base_estimator).set_params(**self.generator.best_params_)
     if sample_weight is None:
         self.best_estimator_.fit(X, y)
     else:
         self.best_estimator_.fit(X, y, sample_weight=sample_weight)
    def _fit(self, X, y, num_workers=1, verbose=False):
        t = time.time()

        param_grid = [{
            'C': [.001, .1, 10, 100],
        }]
        base_clf = sklearn.linear_model.LogisticRegression(
            fit_intercept=False, class_weight='auto',
            dual=False, penalty='l2')

        kfold = sklearn.cross_validation.StratifiedKFold(y, 3)
        param_iterator = sklearn.grid_search.ParameterGrid(param_grid)
        out = joblib.Parallel(n_jobs=num_workers, pre_dispatch=num_workers * 2)(
            joblib.delayed(fit_grid_point)(
                X[train], y[train], X[test], y[test], base_clf, clf_params)
            for clf_params in param_iterator for train, test in kfold
        )

        df = pandas.DataFrame(out, columns=['setting', 'score', 'entropy'])
        dfg = df.groupby('setting').mean()
        if verbose:
            print(dfg)

        dfg = dfg.sort(['score', 'entropy'], ascending=[0, 1])
        best_params = json.loads(dfg.index[0])
        best_score, best_entropy = dfg.ix[0].values
        print('Best at {}: {:.2f} | {:.2f} and took {:.2f} s'.format(best_params, best_score, best_entropy, time.time() - t))

        clf = sklearn.clone(base_clf)
        clf.set_params(**best_params)
        clf.fit(X, y)
        return clf, best_score, best_entropy
def make_per_customer_submission(
    dataset_name, estimator, x_transformation=identity, y_transformation=identity, include_variables=None
):
    """BROKEN FOR NOW!"""
    with open(j(DATA_DIR, dataset_name, "per-customer-train.pickle"), "rb") as f:
        dv, train_customers, train_y, train_x, train_weights = pickle.load(f)

    choose_columnns = lambda x: x[
        :, np.array(get_feature_indexi(dv.get_feature_names(), include_variables), dtype="int")
    ]
    x_all_transforms = lambda x: x_transformation(choose_columnns(x))

    model = clone(estimator)
    try:
        model.fit(x_all_transforms(train_x), y_transformation(train_y), sample_weight=train_weights)
    except TypeError:
        print("%s doesn't support `sample_weight`. Ignoring it." % str(model))
        model.fit(x_all_transforms(train_x), y_transformation(train_y))

    with open(j(DATA_DIR, dataset_name, "per-customer-test.pickle"), "rb") as f:
        _, test_customers, test_y, test_x, test_weights = pickle.load(f)

    with open(j("submissions", "%s.csv" % datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")), "w") as f:
        f.write("customer_ID,plan\n")
        for c, ps in zip(test_customers, model.predict(x_all_transforms(test_x))):
            f.write("%s,%s\n" % (c, "".join(str(pp) for pp in ps)))
Example #18
    def fit(self, X, feature):
        try:
            feature = int(feature)
        except Exception:
            self.logger("feature should be int")
            raise TypeError("feature should be int")

        X = X.view(np.ndarray)
        self.input_col_count = X.shape[1]
        self.feature = feature
        my_X = Misc.exclude_cols(X, self.feature)
        my_y = X[:, self.feature]
        y_mean = np.mean(my_y)
        y_std = np.std(my_y)

        # ref: http://www.sciencedirect.com/science/article/pii/S0893608004002102
        self._learner.C = max(abs(y_mean + 3 * y_std), abs(y_mean - 3 * y_std))

        cvs = cv.KFold(len(X), 10, shuffle=True)
        output_errors = np.empty(0)
        for train, test in cvs:
            tmp_l = sklearn.clone(self._learner)
            tmp_l.fit(my_X[train, :], X[train, self.feature])
            output_errors = np.hstack((output_errors, tmp_l.predict(my_X[test, :]) - X[test, self.feature]))

        self.error_std = np.std(output_errors)
        self.error_mean = np.mean(output_errors)

        self._learner.fit(my_X, X[:, self.feature])

        return self
Example #19
    def fit(self, X, y, sample_weight=None):
        label = self.uniform_label
        self.uniform_label = numpy.array([label]) if isinstance(label, numbers.Number) else numpy.array(label)

        sample_weight = check_sample_weight(y, sample_weight=sample_weight).copy()
        assert numpy.all(numpy.in1d(y, [0, 1])), 'only two-class classification is supported by now'
        y = column_or_1d(y)
        y_signed = 2 * y - 1

        X = pandas.DataFrame(X)
        knn_indices = computeKnnIndicesOfSameClass(self.uniform_variables, X, y, self.n_neighbours)

        # for those events with a non-uniform label we repeat their own index several times
        for label in [0, 1]:
            if label not in self.uniform_label:
                knn_indices[y == label, :] = numpy.arange(len(y))[y == label][:, numpy.newaxis]

        X = self.get_train_vars(X)
        cumulative_score = numpy.zeros(len(X))
        self.estimators = []

        for stage in range(self.n_estimators):
            classifier = sklearn.clone(self.base_estimator)
            classifier.fit(X, y, sample_weight=sample_weight)
            score = self.learning_rate * self.compute_score(classifier, X=X)
            cumulative_score += score
            sample_weight *= numpy.exp(- y_signed * numpy.take(score, knn_indices).mean(axis=1))
            sample_weight = self.normalize_weights(y=y, sample_weight=sample_weight)
            self.estimators.append(classifier)
Example #20
def lambda_choice(penalty, lambdas, n_folds, K, y, n_iter=10000, verbose=0, n_jobs=-1):
    estimator = fista.Fista(penalty=penalty, n_iter=n_iter)
    infos = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(_sub_info)(clone(estimator), K, y, K, y, lambda_)
           for lambda_ in lambdas)

    return infos
Example #21
    def fit_models(self, X, Y, bands=None):
        """ Fit timeseries models for `bands` within `Y` for a given `X`

        Args:
            X (np.ndarray): design matrix (number of observations x number of
                features)
            Y (np.ndarray): independent variable matrix (number of series x
                number of observations) observation in the X design matrix
            bands (iterable): Subset of bands of `Y` to fit. If None are
                provided, fit all bands in Y

        Returns:
            np.ndarray: fitted model objects

        """
        if bands is None:
            bands = np.arange(self.n_series)

        models = []
        for b in bands:
            y = Y.take(b, axis=0)
            model = sklearn.clone(self.lm).fit(X, y)  # TODO: no clone?

            # Add in RMSE calculation  # TODO: numba?
            model.rmse = ((y - model.predict(X)) ** 2).mean(axis=0) ** 0.5

            # Add intercept to intercept term of design matrix
            model.coef = model.coef_.copy()
            model.coef[0] += model.intercept_

            models.append(model)

        return np.array(models)
 def run(self, directory="datasets/"):
     loader = ArffLoader("{}/{}.arff".format(directory, self.dataset))
     inputs, labels = loader.get_dataset()
     n_features = inputs.shape[1]
     if self.subset_size >= n_features:
         return None
     if self.normalize:
         preprocessing.normalize(inputs, copy=False)
     results = {
         "experiment": self,
         "scores": {scorer_name: numpy.zeros(self.n_runs) for scorer_name, _ in self.scorers},
         "score_times": {scorer_name: numpy.zeros(self.n_runs) for scorer_name, _ in self.scorers},
         "errors": {classifier_name: numpy.zeros(self.n_runs) for classifier_name, _ in self.classifiers},
         "classifier_times": {classifier_name: numpy.zeros(self.n_runs) for classifier_name, _ in self.classifiers}
     }
     for run in range(self.n_runs):
         numpy.random.seed(run)
         indices = numpy.random.choice(n_features, size=self.subset_size, replace=False)
         inputs_subset = inputs[:, indices].copy()
         for scorer_name, scorer in self.scorers:
             score, t = self._execute_score_run(run, scorer, inputs_subset, labels)
             results["scores"][scorer_name][run] = score
             results["score_times"][scorer_name][run] = t
         for classifier_name, classifier in self.classifiers:
             error, t = self._execute_classifier_run(run, sklearn.clone(classifier), inputs_subset, labels)
             results["errors"][classifier_name][run] = error
             results["classifier_times"][classifier_name][run] = t
     return results
Example #23
        def score(parms):
            e = sklearn.clone(learner)
            pp = dict(parms)
            pp.pop("id")
            try:
                e.set_params(**pp)
                e.fit(*trainSet, maxIters=maxIters)

                if visualParams is not None:
                    imgPathBase = os.path.join(imageDestFolder, "{}".format(store.params["id"]))

                    # Write some images!
                    e.visualize(visualParams, path=imgPathBase + ".png")
                    e.visualize(visualParams, path=imgPathBase + "_example.png", inputs=testSet[0][0])

                if scoreModel is None:
                    return dict(score=e.score(*testSet))
                else:
                    return scoreModel(e, testSet)
            except Exception:
                sys.stderr.write("Error for {}:\n{}\n".format(parms, traceback.format_exc()))
                if e.UNSUPERVISED:
                    score = 1.0
                else:
                    score = -1.0
                e = None
def estimate_classifier(params_dict, base_estimator, X, y, folds, fold_checks,
                        score_function, sample_weight=None, label=1,
                        scorer_needs_x=False, catch_exceptions=True):
    """This function is needed to train classifier with some parameters on the cluster."""
    try:
        k_folder = StratifiedKFold(y=y, n_folds=folds, shuffle=True)
        score = 0.
        for train_indices, test_indices in islice(k_folder, fold_checks):
            trainX, trainY = X.irow(train_indices), y[train_indices]
            testX, testY = X.irow(test_indices), y[test_indices]
            estimator = sklearn.clone(base_estimator).set_params(**params_dict)

            train_options = {}
            test_options = {}
            if sample_weight is not None:
                train_weights, test_weights = \
                    sample_weight[train_indices], sample_weight[test_indices]
                train_options['sample_weight'] = train_weights
                test_options['sample_weight'] = test_weights
            if scorer_needs_x:
                test_options['X'] = testX

            estimator.fit(trainX, trainY, **train_options)
            proba = estimator.predict_proba(testX)
            score += score_function(testY, proba[:, label], **test_options)

        return score / fold_checks
    except Exception as e:
        # If there was some exception on the node, it will be returned
        if catch_exceptions:
            return e
        else:
            raise
Example #25
 def train(self, n_experts, instances, labels):
     experts = []
     self.weigher_sampler.train(instances)
     for centroid in self.centroid_picker.pick(instances, labels, n_experts):
         expert = LocalExpert(sklearn.clone(self.base_estimator), self.weigher_sampler)
         expert.train(instances, labels, centroid)
         experts.append(expert)
     return experts
Example #26
    def __init__(self, test_indices=None,
                 lm=sklearn.linear_model.Lasso(alpha=20),
                 **kwargs):
        self.test_indices = np.asarray(test_indices)
        self.lm = sklearn.clone(lm)

        self.n_record = 0
        self.record = []
Example #27
def single_learning_rate(mf, learning_rate, X_tr, X_te):
    mf = clone(mf)
    mf.set_params(learning_rate=learning_rate, verbose=5)
    cb = Callback(X_tr, X_te)
    mf.set_params(callback=cb)
    mf.fit(X_tr)
    return dict(time=cb.times,
                rmse=cb.rmse)
Example #28
 def __call__(self, base_estimator, params, X, y, sample_weight=None):
     cl = clone(base_estimator)
     cl.set_params(**params)
     if sample_weight is not None:
         cl.fit(X, y, sample_weight)
     else:
         cl.fit(X, y)
     return roc_auc_score(self.testY, cl.predict_proba(self.testX)[:, 1])
Example #29
 def custom(base_estimator, params, X, y, sample_weight=None):
     cl = clone(base_estimator)
     cl.set_params(**params)
     if sample_weight is not None:
         cl.fit(X, y, sample_weight)
     else:
         cl.fit(X, y)
     return roc_auc_score(labels, cl.predict_proba(test)[:, 1])
def fit_grid_point(X, y, X_val, y_val, base_clf, clf_params):
    clf = sklearn.clone(base_clf)
    clf.set_params(**clf_params)
    clf.fit(X, y)
    proba = clf.predict_proba(X_val)
    score = sklearn.metrics.accuracy_score(proba.argmax(1), y_val)
    entropy = (scipy.stats.distributions.entropy(proba.T) / np.log(proba.shape[1])).mean()
    return json.dumps(clf_params), score, entropy
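A small, hypothetical driver for fit_grid_point above; it assumes the module-level imports the function uses (sklearn, sklearn.metrics, scipy, numpy as np, json):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_all, y_all = make_classification(n_samples=400, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X_all, y_all, random_state=0)
params_json, accuracy, entropy = fit_grid_point(
    X_tr, y_tr, X_val, y_val, LogisticRegression(max_iter=1000), {'C': 1.0})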
Example #31
def test_get_metric_equivalent_to_explicit_mahalanobis(estimator,
                                                       build_dataset):
    """Tests that using the get_metric method of mahalanobis metric learners is
  equivalent to explicitely calling scipy's mahalanobis metric
  """
    rng = np.random.RandomState(42)
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y(estimator, input_data, labels))
    metric = model.get_metric()
    n_features = X.shape[1]
    a, b = (rng.randn(n_features), rng.randn(n_features))
    expected_dist = mahalanobis(a[None],
                                b[None],
                                VI=model.get_mahalanobis_matrix())
    assert_allclose(metric(a, b), expected_dist, rtol=1e-13)
Example #32
def test_score_pairs_dim(estimator, build_dataset):
    # scoring of 3D arrays should return 1D array (several tuples),
    # and scoring of 2D arrays (one tuple) should return an error (like
    # scikit-learn's error when scoring 1D arrays)
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    tuples = np.array(list(product(X, X)))
    assert model.score_pairs(tuples).shape == (tuples.shape[0], )
    context = make_context(estimator)
    msg = ("3D array of formed tuples expected{}. Found 2D array "
           "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n"
           .format(context, tuples[1]))
    with pytest.raises(ValueError) as raised_error:
        model.score_pairs(tuples[1])
    assert str(raised_error.value) == msg
Example #33
def test_data_valid(model, data, target):
    is_log = 0
    if is_log == 1:
        target = np.log(target)

    test_model = clone(model)
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.33,
                                                        random_state=1)
    test_model.fit(X_train, y_train)
    y_predict = test_model.predict(X_test)

    if (is_log == 1):
        rmse(y_test, y_predict)
    else:
        rmse_log(y_test, y_predict)
def crossval_fm_scores(numbered_params, fm, vecs, labels, groups, n_params):
    """Calculates the mean avg precision score for a given classifier via CV.

    Used at training time to find the MAP for the given classifier and params
    via cross validation.
    """
    assert isinstance(fm, FusionModel)
    params_number, params = numbered_params
    logger.debug(
        '%s: param %d/%d, starting CV for:\nparams: %s',
        fm.name, params_number, n_params, params)
    clf = clone(fm.clf)
    clf.set_params(**params)
    n_splits = config.FUSION_PARAM_TUNING_CV_KFOLD
    splits = GroupKFold(n_splits).split(vecs, labels, groups=groups)

    map_ = parallel_map if fm.parallelize_cv else map
    single_split_func = partial(
        crossval_fm_score_single_split,
        fm=fm,
        clf=clf,
        params=params,
        params_number=params_number,
        n_params=n_params,
        n_splits=n_splits,
        vecs=vecs,
        labels=labels,
        groups=groups,
    )
    cv_results = map_(single_split_func, enumerate(splits, 1))

    training_times, all_avg_precs = zip(*cv_results)
    all_avg_precs = list(itertools.chain.from_iterable(all_avg_precs))
    t = sum(training_times, timedelta())

    # if params_number % 10 == 0:
    #     logger.info('%s: completed crossval of param %d', name, params_number)
    mean_avg_prec = np.mean(all_avg_precs)
    std_avg_prec = np.std(all_avg_precs)
    logger.info(
        '%s: param %d/%d:\n'
        'params: %s\n'
        'CV results: score (MAP): %.3f (std: %.3f), training time: %s',
        fm.name, params_number, n_params, params, mean_avg_prec, std_avg_prec, t
    )
    return mean_avg_prec, std_avg_prec
Example #35
    def fit(self, X, y=None):
        """
        Fit the model using X, y as training data. Will also learn the groups that exist within the dataset.

        :param X: array-like, shape=(n_columns, n_samples,) training data.
        :param y: array-like, shape=(n_samples,) training data.
        :return: Returns an instance of self.
        """
        X, y = self.__prepare_input_data(X, y)

        if self.shrinkage is not None:
            self.__set_shrinkage_function()

        self.group_colnames_ = [str(_) for _ in as_list(self.groups)]

        if self.value_columns is not None:
            self.value_colnames_ = [str(_) for _ in as_list(self.value_columns)]
        else:
            self.value_colnames_ = [_ for _ in X.columns if _ not in self.group_colnames_]
        self.__validate(X, y)

        # List of all hierarchical subsets of columns
        self.group_colnames_hierarchical_ = expanding_list(self.group_colnames_, list)

        self.fallback_ = None

        if self.shrinkage is None and self.use_global_model:
            subset_x = X[self.value_colnames_]
            self.fallback_ = clone(self.estimator).fit(subset_x, y)

        if self.shrinkage is not None:
            self.estimators_ = {}

            for level_colnames in self.group_colnames_hierarchical_:
                self.estimators_.update(
                    self.__fit_grouped_estimator(X, y, self.value_colnames_, level_colnames)
                )
        else:
            self.estimators_ = self.__fit_grouped_estimator(X, y, self.value_colnames_, self.group_colnames_)

        self.groups_ = as_list(self.estimators_.keys())

        if self.shrinkage is not None:
            self.shrinkage_factors_ = self.__get_shrinkage_factor(X)

        return self
def test_validate_calibration_params_invalid_parameters_error_before__fit(
        estimator, build_dataset):
  """For all pairs metric learners (which currently all have a _fit method),
  make sure that calibration parameters are validated before fitting"""
  estimator = clone(estimator)
  input_data, labels, _, _ = build_dataset()

  def breaking_fun(**args):  # a function that fails so that we will miss
    # the calibration at the end and therefore the right error message from
    # validating params should be thrown before
    raise RuntimeError('Game over.')
  estimator._fit = breaking_fun
  expected_msg = ('Strategy can either be "accuracy", "f_beta" or '
                  '"max_tpr" or "max_tnr". Got "weird" instead.')
  with pytest.raises(ValueError) as raised_error:
    estimator.fit(input_data, labels, calibration_params={'strategy': 'weird'})
  assert str(raised_error.value) == expected_msg
def test_pipeline_consistency(estimator, build_dataset, with_preprocessor):
    # Adapted from scikit learn
    # check that make_pipeline(est) gives same score as est
    # we do this test on all except quadruplets (since they don't have a y
    # in fit):
    if estimator.__class__.__name__ not in [
            e.__class__.__name__ for (e, _) in quadruplets_learners
    ]:
        input_data, y, preprocessor, _ = build_dataset(with_preprocessor)

        def make_random_state(estimator, in_pipeline):
            rs = {}
            name_estimator = estimator.__class__.__name__
            if name_estimator[-11:] == '_Supervised':
                name_param = 'random_state'
                if in_pipeline:
                    name_param = name_estimator.lower() + '__' + name_param
                rs[name_param] = check_random_state(0)
            return rs

        estimator = clone(estimator)
        estimator.set_params(preprocessor=preprocessor)
        pipeline = make_pipeline(estimator)
        estimator.fit(*remove_y_quadruplets(estimator, input_data, y),
                      **make_random_state(estimator, False))
        pipeline.fit(*remove_y_quadruplets(estimator, input_data, y),
                     **make_random_state(estimator, True))

        if hasattr(estimator, 'score'):
            result = estimator.score(
                *remove_y_quadruplets(estimator, input_data, y))
            result_pipe = pipeline.score(
                *remove_y_quadruplets(estimator, input_data, y))
            assert_allclose_dense_sparse(result, result_pipe)

        if hasattr(estimator, 'predict'):
            result = estimator.predict(input_data)
            result_pipe = pipeline.predict(input_data)
            assert_allclose_dense_sparse(result, result_pipe)

        if issubclass(estimator.__class__, TransformerMixin):
            if hasattr(estimator, 'transform'):
                result = estimator.transform(input_data)
                result_pipe = pipeline.transform(input_data)
                assert_allclose_dense_sparse(result, result_pipe)
Example #38
    def fit(self, X, y, sample_weight=None):
        """
        Train the classifier; several base classifiers will be trained on
        overlapping subsets of the training dataset.

        :param X: pandas.DataFrame of shape [n_samples, n_features]
        :param y: labels of events - array-like of shape [n_samples]
        :param sample_weight: weight of events,
               array-like of shape [n_samples] or None if all weights are equal
        """
        if hasattr(self.base_estimator, 'features'):
            assert self.base_estimator.features is None, 'Base estimator must have None features! ' \
                                                         'Use features parameter in Folding to fix it'
        X, y, sample_weight = check_inputs(X,
                                           y,
                                           sample_weight=sample_weight,
                                           allow_none_weights=True)
        X = self._get_features(X)
        self._set_classes(y)
        folds_column = self._get_folds_column(len(X))

        for _ in range(self.n_folds):
            self.estimators.append(clone(self.base_estimator))

        if sample_weight is None:
            weights_iterator = (None for _ in range(self.n_folds))
        else:
            weights_iterator = (sample_weight[folds_column != index]
                                for index in range(self.n_folds))

        result = utils.map_on_cluster(self.ipc_profile, train_estimator,
                                      range(len(self.estimators)),
                                      self.estimators,
                                      (X.iloc[folds_column != index, :].copy()
                                       for index in range(self.n_folds)),
                                      (y[folds_column != index]
                                       for index in range(self.n_folds)),
                                      weights_iterator)
        for status, data in result:
            if status == 'success':
                name, classifier, spent_time = data
                self.estimators[name] = classifier
            else:
                print('Problem while training on the node, report:\n', data)
        return self
Example #39
def setup_dummy_YATSM(X, Y, dates, i_breaks):
    """ Setup a dummy YATSM model

    Args:
        X (np.ndarray): n x p features
        Y (np.ndarray): n_series x n independent data
        dates (np.ndarray): n dates
        i_breaks (iterable): indices of ``dates`` representing break dates
            (can be zero or nonzero, but len(i_breaks) is len(yatsm.record))

    Returns:
        YATSM model
    """
    n = dates.size
    yatsm = YATSM()
    yatsm.X, yatsm.Y, yatsm.dates = X, Y, dates
    yatsm.n_coef, yatsm.n_series = X.shape[1], Y.shape[0]
    yatsm.models = np.array(
        [sklearn.clone(yatsm.estimator) for i in range(yatsm.n_series)])
    yatsm.test_indices = np.arange(yatsm.n_series)
    n_models = len(i_breaks)
    yatsm.record = np.hstack([yatsm.record_template] * n_models)

    def populate_record(yatsm, i_rec, i_start, i_end, i_break):
        yatsm.record[i_rec]['start'] = yatsm.dates[i_start]
        yatsm.record[i_rec]['end'] = yatsm.dates[i_end]
        yatsm.record[i_rec]['break'] = (yatsm.dates[i_break]
                                        if i_break else i_break)
        yatsm.fit_models(X[i_start:i_end, :], Y[:, i_start:i_end])
        for i, m in enumerate(yatsm.models):
            yatsm.record[i_rec]['coef'][:, i] = m.coef
            yatsm.record[i_rec]['rmse'][i] = m.rmse
        return yatsm

    i_start = 0
    i_end = i_breaks[0] - 1 if i_breaks[0] else n - 1
    i_break = i_breaks[0]
    yatsm = populate_record(yatsm, 0, i_start, i_end, i_break)

    for idx, i_break in enumerate(i_breaks[1:]):
        i_start = i_breaks[idx] + 1
        i_end = i_break - 1 if i_break else n - 1
        yatsm = populate_record(yatsm, idx + 1, i_start, i_end, i_break)

    return yatsm
Example #40
    def __init__(self,
                 test_indices=None,
                 estimator={
                     'object': sklearn.linear_model.Lasso(alpha=20),
                     'fit': {}
                 },
                 **kwargs):
        self.test_indices = np.asarray(test_indices)
        self.estimator = sklearn.clone(estimator['object'])
        self.estimator_fit = estimator.get('fit', {})
        self.models = []  # leave empty, fill in during `fit`

        self.n_record = 0
        self.record = []

        self.n_series, self.n_features = 0, 0
        self.px = kwargs.get('px', 0)
        self.py = kwargs.get('py', 0)
def _train(train_data: DataFrame, time_series_predictor: Any,
           clusterer: Clustering) -> dict:
    models = dict()

    train_data = clusterer.cluster_data(train_data)

    for cluster in range(clusterer.n_clusters):

        cluster_train_df = train_data[cluster]
        if not cluster_train_df.empty:
            time_series_predictor.fit(cluster_train_df)

            models[cluster] = time_series_predictor
            time_series_predictor = clone(time_series_predictor, safe=False)
    return {
        ModelType.CLUSTERER.value: clusterer,
        ModelType.TIME_SERIES_PREDICTOR.value: models
    }
Example #42
def _setparams_clustering(method, masker, n_clusters, crop=False):
    """Setting the parameters of the clustering method

    method : sklearn clustering-like or Random Projections
    masker : NiftiMasker
    n_clusters : int
    crop: bool, only for slic

    """
    method = clone(method)
    if hasattr(method, 'n_clusters'):
        method.set_params(**{'masker': masker, 'n_clusters': n_clusters})
        if hasattr(method, 'crop'):
            method.set_params(crop=crop)
    else:
        method.set_params(n_components=n_clusters)

    return method
Example #43
    def fit(self, X, y, **kwargs):
        """Fit the estimator.

        If `prefit` is set to `True` then the base estimator is kept as is.
        Otherwise it is fitted from the provided arguments.
        """
        if self.estimator is None:
            raise ValueError(BASE_ESTIMATOR_NONE_ERROR_MESSAGE)

        if not self.prefit:
            self.estimator_ = clone(self.estimator).fit(X, y, **kwargs)
        else:
            try:
                check_is_fitted(self.estimator)
            except NotFittedError:
                warn(BASE_ESTIMATOR_NOT_FITTED_WARNING.format(type(self).__name__))
            self.estimator_ = self.estimator
        return self
def extra_trees(X, y, n_est):
    '''
    INPUT: Dataframe with features (X), target variable dataframe (y), number of estimators (parameter)
    OUTPUT: Score of ExtraTrees model
    '''
    # standardize features before splitting so train and test share the same scaling
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    ext = ExtraTreesRegressor(n_estimators=n_est)
    clf = clone(ext)
    clf.fit(X_train, y_train)

    scores = clf.score(X_test, y_test)

    # cols2 is assumed to be a module-level list of feature names
    return 'ExtraTrees Score: ' + str(scores), dict(zip(cols2, clf.feature_importances_))
Example #45
 def get_deep_copy(self):
     """
     This creates an untrained copy of the model structure
     """
     if self.mtype == mtype.SKL:
         return clone(self.model)
     elif self.mtype == mtype.XG:
         if isinstance(self.model, XGBRegressor):
             return XGBRegressor(**self.model.get_params())
         if isinstance(self.model, XGBClassifier):
             return XGBClassifier(**self.model.get_params())
     elif self.mtype == mtype.LGBM:
         # TODO
         pass
     elif self.mtype == mtype.KERAS:
         # TODO Consider whether ref is to KerasNN wrapper or the model itself.  Probably latter.
         #return KerasNN(self.model.neuronList, self.model.dropout, self.model.epochs)
         pass
def randomForests(X, y, X_test):
    predictions = []
    n_estimators = 200
    models = [
        DecisionTreeClassifier(max_depth=None),
        RandomForestClassifier(n_estimators=n_estimators),
        ExtraTreesClassifier(n_estimators=n_estimators),
        AdaBoostClassifier(DecisionTreeClassifier(max_depth=None),
                           n_estimators=n_estimators)
    ]

    for model in models:
        clf = clone(model)
        clf.fit(X, y)
        prediction = clf.predict(X_test)

        predictions.append(prediction)
    return (predictions)
    def train_predict(self, X_train, y_train, X_predict):
        # ~~~~~~~~~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~
        # This function will train on the entire training dataset and will then create predictions on the Kaggle test set.
        # This function is meant primarily for the purpose of submission.
        #
        # ~~~~~~~~~~~~~~~~ Parameters ~~~~~~~~~~~~~~~~~~~
        # Input:
        #	- X_train: Full train dataset to train on
        #	- y_train: Full y train to train on
        #	- X_predict: Full test dataset (without labels)
        # Output:
        #	- y_pred: predictions on X
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        instance = clone(self.model)
        instance.fit(X_train, y_train)

        y_pred = instance.predict(X_predict)
        return y_pred
Example #48
    def __init__(self,
                 n_trees=500,
                 min_leaf_size=10, max_depth=10,
                 subsample_ratio=0.7,
                 bootstrap=False,
                 lambda_reg=0.01,
                 propensity_model=LogisticRegression(penalty='l1', solver='saga',
                                                     multi_class='auto'),  # saga solver supports l1
                 model_Y=WeightedLassoCVWrapper(cv=3),
                 propensity_model_final=None,
                 model_Y_final=None,
                 categories='auto',
                 n_jobs=-1,
                 random_state=None):
        # Copy and/or define models
        self.propensity_model = clone(propensity_model, safe=False)
        self.model_Y = clone(model_Y, safe=False)
        self.propensity_model_final = clone(propensity_model_final, safe=False)
        self.model_Y_final = clone(model_Y_final, safe=False)
        if self.propensity_model_final is None:
            self.propensity_model_final = clone(self.propensity_model, safe=False)
        if self.model_Y_final is None:
            self.model_Y_final = clone(self.model_Y, safe=False)

        # Nuisance estimators shall be defined during fitting because they need to know the number of distinct
        # treatments
        nuisance_estimator = None
        second_stage_nuisance_estimator = None
        # Define parameter estimators
        parameter_estimator = DiscreteTreatmentOrthoForest.parameter_estimator_func
        second_stage_parameter_estimator =\
            DiscreteTreatmentOrthoForest.second_stage_parameter_estimator_gen(lambda_reg)
        # Define moment and mean gradient estimator
        moment_and_mean_gradient_estimator =\
            DiscreteTreatmentOrthoForest.moment_and_mean_gradient_estimator_func
        if categories != 'auto':
            categories = [categories]  # OneHotEncoder expects a 2D array with features per column
        self._one_hot_encoder = OneHotEncoder(categories=categories, sparse=False, drop='first')

        super(DiscreteTreatmentOrthoForest, self).__init__(
            nuisance_estimator,
            second_stage_nuisance_estimator,
            parameter_estimator,
            second_stage_parameter_estimator,
            moment_and_mean_gradient_estimator,
            n_trees=n_trees,
            min_leaf_size=min_leaf_size,
            max_depth=max_depth,
            subsample_ratio=subsample_ratio,
            bootstrap=bootstrap,
            n_jobs=n_jobs,
            random_state=random_state)
def test_predict_monotonous(estimator, build_dataset, with_preprocessor):
    """Test that there is a threshold distance separating points labeled as
  similar and points labeled as dissimilar """
    input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)
    pairs_train, pairs_test, y_train, y_test = train_test_split(
        input_data, labels)
    estimator.fit(pairs_train, y_train)
    distances = estimator.score_pairs(pairs_test)
    predictions = estimator.predict(pairs_test)
    min_dissimilar = np.min(distances[predictions == -1])
    max_similar = np.max(distances[predictions == 1])
    assert max_similar <= min_dissimilar
    separator = np.mean([min_dissimilar, max_similar])
    assert (predictions[distances > separator] == -1).all()
    assert (predictions[distances < separator] == 1).all()
Example #50
def adapt_ubm(ubm, X, adapt_params='m', adapt_iter=10):

    # clone UBM (n_components, covariance type, etc...)
    gmm = sklearn.clone(ubm)

    # initialize with UBM precomputed weights, means and covariance matrices
    gmm.n_init = 1
    gmm.init_params = ''
    gmm.weights_ = ubm.weights_
    gmm.means_ = ubm.means_
    gmm.covars_ = ubm.covars_

    # adapt only some parameters
    gmm.params = adapt_params
    gmm.n_iter = adapt_iter
    gmm.fit(X)

    return gmm
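# Hypothetical usage of adapt_ubm (a sketch, not a verified recipe): it assumes the
# legacy sklearn.mixture.GMM interface (weights_/means_/covars_, params, n_iter) that
# the function above relies on; recent scikit-learn replaced it with GaussianMixture,
# whose attributes differ, so treat the data and parameters below as illustrative.
import numpy as np
import sklearn.mixture

background = np.random.randn(2000, 13)   # e.g. MFCC-like features pooled over many speakers
enrollment = np.random.randn(300, 13)    # data from the target speaker

ubm = sklearn.mixture.GMM(n_components=64, covariance_type='diag', n_iter=50)
ubm.fit(background)

speaker_gmm = adapt_ubm(ubm, enrollment, adapt_params='m', adapt_iter=10)
llr = speaker_gmm.score(enrollment) - ubm.score(enrollment)   # per-frame log-likelihood ratio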
Example #51
0
    def __init__(self,
                 forecaster,
                 refit_interval=0,
                 refit_window_size=None,
                 refit_window_lag=0):
        self.forecaster = forecaster
        self.forecaster_ = clone(forecaster)

        self.refit_interval = refit_interval
        self.refit_window_size = refit_window_size
        self.refit_window_lag = refit_window_lag

        super(UpdateRefitsEvery, self).__init__()

        self.clone_tags(forecaster)

        # fit must be executed to fit the wrapped estimator and remember the cutoff
        self.set_tags(fit_is_empty=False)
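# Minimal usage sketch for the wrapper above (assumes sktime's NaiveForecaster and the
# usual fit/update interface; the exact refit_interval semantics are those documented by
# the wrapper, so read this as illustrative rather than authoritative).
from sktime.datasets import load_airline
from sktime.forecasting.naive import NaiveForecaster

y = load_airline()
forecaster = UpdateRefitsEvery(NaiveForecaster(strategy="mean"), refit_interval=12)
forecaster.fit(y[:120], fh=[1, 2, 3])
forecaster.update(y[120:132])   # assumed behaviour: once the cutoff advances far enough, update triggers a full refit
y_pred = forecaster.predict()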
Example #52
0
def learn_curve(model, train_x, train_label, cv=3, scoring='neg_log_loss'):
    model_c = clone(model)
    N, train_score, test_score = learning_curve(model_c,
                                                train_x,
                                                train_label,
                                                cv=cv,
                                                train_sizes=np.linspace(
                                                    0.5, 1, 5),
                                                scoring=scoring)

    plt.figure(figsize=(7, 4))
    plt.title('{}'.format(type(model).__name__))
    plt.plot(N, np.mean(train_score, 1), color='blue', label='training score')
    plt.plot(N, np.mean(test_score, 1), color='red', label='validation score')
    plt.xlabel('training sample')
    plt.ylabel(scoring)
    plt.legend(loc=0)
    plt.show()
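# Example call (a small sketch; the dataset and model are placeholders, and learn_curve
# assumes numpy, matplotlib.pyplot, clone and sklearn's learning_curve are imported).
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
learn_curve(LogisticRegression(max_iter=1000), X, y, cv=5, scoring='neg_log_loss')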
Example #53
0
    def _transform(self, X, y=None):
        """Transform X and return a transformed version.

        private _transform containing core logic, called from transform

        Parameters
        ----------
        X : Series or Panel of mtype X_inner_mtype
            if X_inner_mtype is list, _transform must support all types in it
            Data to be transformed
        y : Series or Panel of mtype y_inner_mtype, default=None
            Additional data, e.g., labels for transformation

        Returns
        -------
        transformed version of X
        """
        return clone(self.transformer).fit_transform(X=X, y=y)
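# Why clone here: fitting a clone leaves the stored transformer untouched, so the wrapper
# stays stateless between transform calls. A minimal illustration with a stand-in
# scikit-learn transformer (not the wrapper's actual transformer):
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler

template = StandardScaler()
transformed = clone(template).fit_transform([[0.0], [2.0]])
assert not hasattr(template, "mean_")   # the stored template itself was never fitted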
Example #54
0
def test_dont_overwrite_parameters(estimator, build_dataset,
                                   with_preprocessor):
    # Adapted from scikit-learn
    # check that fit method only changes or sets private attributes
    input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    dict_before_fit = estimator.__dict__.copy()

    estimator.fit(*remove_y_quadruplets(estimator, input_data, labels))
    dict_after_fit = estimator.__dict__

    public_keys_after_fit = [
        key for key in dict_after_fit.keys() if is_public_parameter(key)
    ]

    attrs_added_by_fit = [
        key for key in public_keys_after_fit
        if key not in dict_before_fit.keys()
    ]

    # check that fit doesn't add any public attribute
    assert not attrs_added_by_fit, (
        "Estimator adds public attribute(s) during"
        " the fit method."
        " Estimators are only allowed to add private "
        "attributes"
        " either started with _ or ended"
        " with _ but %s added" % ', '.join(attrs_added_by_fit))

    # check that fit doesn't change any public attribute
    attrs_changed_by_fit = [
        key for key in public_keys_after_fit
        if (dict_before_fit[key] is not dict_after_fit[key])
    ]

    assert not attrs_changed_by_fit, (
        "Estimator changes public attribute(s) during"
        " the fit method. Estimators are only allowed"
        " to change attributes started"
        " or ended with _, but"
        " %s changed" % ', '.join(attrs_changed_by_fit))
Example #55
0
    def __init__(self,
                 controls_model,
                 treated_model,
                 cate_controls_model=None,
                 cate_treated_model=None,
                 propensity_model=LogisticRegression(),
                 propensity_func=None):
        self.controls_model = clone(controls_model, safe=False)
        self.treated_model = clone(treated_model, safe=False)
        self.cate_controls_model = clone(cate_controls_model, safe=False)
        self.cate_treated_model = clone(cate_treated_model, safe=False)

        if self.cate_controls_model is None:
            self.cate_controls_model = clone(self.controls_model, safe=False)
        if self.cate_treated_model is None:
            self.cate_treated_model = clone(self.treated_model, safe=False)

        self.propensity_func = clone(propensity_func, safe=False)
        self.propensity_model = clone(propensity_model, safe=False)
        self.has_propensity_func = self.propensity_func is not None
Example #56
0
def test_cross_validation_manual_vs_scikit(estimator, build_dataset,
                                           with_preprocessor):
    """Tests that if we make a manual cross-validation, the result will be the
  same as scikit-learn's cross-validation (some code for generating the
  folds is taken from scikit-learn).
  """
    if any(hasattr(estimator, method) for method in ["predict", "score"]):
        input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
        estimator = clone(estimator)
        estimator.set_params(preprocessor=preprocessor)
        set_random_state(estimator)
        n_splits = 3
        kfold = KFold(shuffle=False, n_splits=n_splits)
        n_samples = input_data.shape[0]
        fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=int)
        fold_sizes[:n_samples % n_splits] += 1
        current = 0
        scores, predictions = [], np.zeros(input_data.shape[0])
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            current = stop
            test_slice = slice(start, stop)
            train_mask = np.ones(input_data.shape[0], bool)
            train_mask[test_slice] = False
            y_train, y_test = labels[train_mask], labels[test_slice]
            estimator.fit(*remove_y_quadruplets(
                estimator, input_data[train_mask], y_train))
            if hasattr(estimator, "score"):
                scores.append(
                    estimator.score(*remove_y_quadruplets(
                        estimator, input_data[test_slice], y_test)))
            if hasattr(estimator, "predict"):
                predictions[test_slice] = estimator.predict(
                    input_data[test_slice])
        if hasattr(estimator, "score"):
            assert all(scores == cross_val_score(
                estimator,
                *remove_y_quadruplets(estimator, input_data, labels),
                cv=kfold))
        if hasattr(estimator, "predict"):
            assert all(predictions == cross_val_predict(
                estimator,
                *remove_y_quadruplets(estimator, input_data, labels),
                cv=kfold))
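# The manual fold sizing above mirrors scikit-learn's unshuffled KFold; a quick check on
# a small, made-up example (11 samples, 3 splits -> fold sizes 4, 4, 3):
import numpy as np
from sklearn.model_selection import KFold

n_samples, n_splits = 11, 3
fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=int)
fold_sizes[:n_samples % n_splits] += 1
sk_sizes = [len(test) for _, test in
            KFold(n_splits=n_splits, shuffle=False).split(np.arange(n_samples))]
assert list(fold_sizes) == sk_sizes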
Example #57
0
    def fit(self, X, y=None):
        """
        Fit the model using X, y as training data. Will also learn the groups that exist within the dataset.

        :param X: array-like, shape=(n_samples, n_columns,) training data.
        :param y: array-like, shape=(n_samples,) training data.
        :return: Returns an instance of self.
        """

        X_group, X_value = _split_groups_and_values(X,
                                                    self.groups,
                                                    min_value_cols=0,
                                                    check_X=self.check_X,
                                                    **self._check_kwargs)

        X_group = self.__add_shrinkage_column(X_group)

        if y is not None:
            y = check_array(y, ensure_2d=False)

        if self.shrinkage is not None:
            self.__set_shrinkage_function()

        # List of all hierarchical subsets of columns
        self.group_colnames_hierarchical_ = expanding_list(
            X_group.columns, list)

        self.fallback_ = None

        if self.shrinkage is None and self.use_global_model:
            self.fallback_ = clone(self.estimator).fit(X_value, y)

        if self.shrinkage is not None:
            self.estimators_ = self.__fit_shrinkage_groups(X_group, X_value, y)
        else:
            self.estimators_ = self.__fit_grouped_estimator(
                X_group, X_value, y)

        self.groups_ = as_list(self.estimators_.keys())

        if self.shrinkage is not None:
            self.shrinkage_factors_ = self.__get_shrinkage_factor(X_group)

        return self
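# The fallback_/estimators_ pattern from fit, in isolation (a sketch with stand-in data
# and a stand-in estimator; the real class builds this per hierarchical group of columns):
import numpy as np
from sklearn.base import clone
from sklearn.linear_model import LinearRegression

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0.0, 1.0, 2.0, 3.0])
groups = np.array(["a", "a", "b", "b"])

template = LinearRegression()
estimators = {g: clone(template).fit(X[groups == g], y[groups == g])
              for g in np.unique(groups)}
fallback = clone(template).fit(X, y)      # global model used when a group was never seen
model = estimators.get("c", fallback)     # group "c" is unseen -> fall back to the global model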
Example #58
0
def stacking(model, train_data, train_target, test_data, test_target, n_fold):
    """
    :param model: base estimator
    :param train_data: training set (without the target feature to be predicted)
    :param train_target: target feature to be predicted
    :param test_data: test set
    :param test_target: test-set target (only used to report per-fold test accuracy)
    :param n_fold: number of cross-validation folds
    :return: test-set meta-predictions and out-of-fold training predictions
    """
    train_data = pd.DataFrame(train_data)
    train_target = pd.DataFrame(train_target)
    test_data = pd.DataFrame(test_data)
    skf = StratifiedKFold(n_splits=n_fold, shuffle=True)  # StratifiedKFold samples in a stratified way
    train_pred = np.zeros((train_data.shape[0], 1), int)  # out-of-fold predictions for the training set
    test_pred = np.zeros((test_data.shape[0], 1), int)  # test-set predictions, shape (len(test_data), 1)
    for skf_index, (train_index, val_index) in enumerate(skf.split(train_data, train_target)):
        print('Fold', skf_index + 1, 'of cross-validation starts...')
        # split the training data for this fold
        new_model = clone(model)
        x_train, x_val = train_data.iloc[train_index], train_data.iloc[val_index]
        y_train, y_val = train_target.iloc[train_index], train_target.iloc[val_index]
        # fit the model on the training fold
        y_train = np.ravel(y_train)  # flatten the target to a 1-D array
        new_model.fit(X=x_train, y=y_train)
        # evaluate on the validation fold
        accs = accuracy_score(y_val, new_model.predict(x_val))
        print('Fold', skf_index + 1, 'validation accuracy:', accs)

        # store out-of-fold predictions for the training set
        val_pred = new_model.predict(x_val)
        for i in range(len(val_index)):
            train_pred[val_index[i]] = val_pred[i]
        # store this fold's predictions on the test set

        print('Fold', skf_index + 1, 'test accuracy:', accuracy_score(test_target, new_model.predict(test_data)))
        test_pred = np.column_stack((test_pred, new_model.predict(test_data)))  # append as a new column

    # drop the initial all-zero column before averaging, then round to integer labels
    test_pred_mean = np.mean(test_pred[:, 1:], axis=1)  # row-wise mean across folds (may be fractional)
    test_pred_mean = pd.DataFrame(test_pred_mean)
    test_pred_mean = test_pred_mean.apply(lambda x: round(x))  # round fractions to integers
    test_set = np.ravel(test_pred_mean)
    return test_set.reshape(test_set.shape[0], 1), np.array(train_pred)
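# Hypothetical call (a sketch; the data split, model and fold count are placeholders, and
# stacking assumes pandas, numpy, StratifiedKFold, accuracy_score and clone are imported):
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)
test_meta, train_meta = stacking(RandomForestClassifier(random_state=0),
                                 X_tr, y_tr, X_te, y_te, n_fold=5)
# train_meta / test_meta can then feed a second-level (meta) model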
Example #59
0
def sWeights_to_proba(x,
                      sWeights,
                      model,
                      use_cross_val=False,
                      cv_params={
                          "cv": 4,
                          "n_jobs": 1
                      }):
    if model.get_params()['loss_function'] != 'ConstrainedRegression':
        raise ValueError(
            "Smart training requires catboost with ConstrainedRegression loss")
    if use_cross_val:
        raw_predictions = sklearn.model_selection.cross_val_predict(
            model, x, sWeights, **cv_params)
    else:
        model_ = sklearn.clone(model)
        model_.fit(x, sWeights)
        raw_predictions = model_.predict(x)
    return expit(raw_predictions)
def test_accuracy_toy_example(estimator, build_dataset):
    """Test that the default scoring for triplets (accuracy) works on some
  toy example"""
    triplets, _, _, X = build_dataset(with_preprocessor=False)
    estimator = clone(estimator)
    set_random_state(estimator)
    estimator.fit(triplets)
    # We take the first two points and build 4 regularly spaced points on the
    # line they define, so that it is easy to build triplets of different
    # similarities.
    X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4

    triplets_test = np.array([[X_test[0], X_test[2], X_test[1]],
                              [X_test[1], X_test[3], X_test[0]],
                              [X_test[1], X_test[2], X_test[3]],
                              [X_test[3], X_test[0], X_test[2]]])
    # we force the transformation to be identity so that we control what it does
    estimator.components_ = np.eye(X.shape[1])
    assert estimator.score(triplets_test) == 0.25