Code example #1
    def set_params(self, **params):
        """Set parameters for the wrapper and the wrapped estimator.

        This method is required for compatibility with GridSearchCV.
        :param params: A dictionary of parameters for the wrapper and wrapped estimator.
         If a key doesn't match the name of a wrapper parameter, it is assumed to be
         for the wrapped estimator.
         TODO: it would be better to do what sklearn's pipeline does and provide some
         namespacing in case the wrapper and wrapped class share a parameter name
        :return: self
        """

        if not params:
            return self
        valid_params = self.get_params(deep=True)
        model_params = self.model_params
        wrapper_params = {}
        for key, value in params.items():  # iteritems() was Python 2 only
            if key in valid_params:
                wrapper_params[key] = value
            else:
                model_params[key] = value

        wrapper_params['model_params'] = model_params
        BaseEstimator.set_params(self, **wrapper_params)
        return self
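For reference, the namespacing the TODO alludes to: sklearn's Pipeline prefixes every nested parameter with the step name and a double underscore, so a wrapper and a wrapped estimator can never collide on a name. A minimal sketch:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([("scale", StandardScaler()), ("clf", LogisticRegression())])
# Parameters are addressed as "<step>__<param>", so identically named
# parameters on different steps stay distinct.
pipe.set_params(clf__C=0.5)
print(pipe.get_params()["clf__C"])  # 0.5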
Code example #2
File: what_sklearn.py Project: sdvillal/whatami
def _check_all_monkeypatched():
    """Double-checks that instances sklearn estimators have acquired the proper "what" method.
    Raises an assertion error if it is not the case.
    """

    # Make sure we have added what to sklearn stuff
    whatamize_sklearn(check=False)

    # Trick to force python to populate part of the BaseEstimator hierarchy
    from sklearn.ensemble import RandomForestClassifier  # public import path; sklearn.ensemble.forest is private/removed
    assert BaseEstimator.__subclasscheck__(RandomForestClassifier)
    from sklearn.cluster import KMeans
    assert BaseEstimator.__subclasscheck__(KMeans)
    from sklearn.feature_extraction import DictVectorizer
    assert BaseEstimator.__subclasscheck__(DictVectorizer)
    from sklearn.decomposition import KernelPCA
    assert BaseEstimator.__subclasscheck__(KernelPCA)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for cls in all_subclasses(BaseEstimator):
            if not inspect.isabstract(cls):
                try:
                    obj = cls()
                    assert hasattr(obj, 'what'), cls.__name__
                    assert isinstance(obj.what(), What), cls.__name__
                except TypeError:
                    pass
    return True
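The snippet depends on an all_subclasses helper from whatami; a typical recursive implementation looks like this (a sketch, not the project's actual code):

def all_subclasses(cls):
    """Recursively collect every direct and indirect subclass of cls."""
    subclasses = set(cls.__subclasses__())
    for sub in cls.__subclasses__():
        subclasses |= all_subclasses(sub)
    return subclasses

# Importing sklearn modules populates the hierarchy, which the asserts above rely on.
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier  # noqa: F401
print(len(all_subclasses(BaseEstimator)))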
Code example #3
    def __init__(self, n_estimators=20,
                 max_depth=5, min_samples_split=10, min_samples_leaf=10,
                 random_state=0,
                 em_itrs=5,
                 regularization=0.05,
                 passive_dyn_func=None,
                 passive_dyn_ctrl=None,
                 passive_dyn_noise=None,
                 verbose=False):
        '''
        n_estimators        - number of models in the ensemble
        ...                 - a batch of parameters forwarded to RandomTreesEmbedding; see its documentation
        em_itrs             - maximum number of EM iterations to take
        regularization      - small positive scalar to prevent singularity in the matrix inversion
        passive_dyn_func    - function to evaluate the passive dynamics; None for a MaxEnt model
        passive_dyn_ctrl    - function returning the control matrix, which may depend on the state
        passive_dyn_noise   - covariance of a Gaussian noise; only applicable when the passive dynamics are Gaussian;
                              None for a MaxEnt model. Note this implies a dynamical system with constant input gain;
                              extending to state-dependent input gain would require a covariance for each data point.
        verbose             - output training information
        '''
        BaseEstimator.__init__(self)

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.em_itrs = em_itrs
        self.reg = regularization
        self.passive_dyn_func = passive_dyn_func
        self.passive_dyn_ctrl = passive_dyn_ctrl
        self.passive_dyn_noise = passive_dyn_noise
        self.verbose = verbose
Code example #4
File: feature_extractor.py Project: JieLuoSC/librosa
    def set_params(self, **kwargs):
        """Update the parameters of the feature extractor."""

        # We don't want non-functional arguments polluting kwargs
        params = kwargs.copy()
        for k in ['function', 'target']:
            params.pop(k, None)

        self.kwargs.update(params)
        # Returning the result preserves the sklearn convention that set_params returns self.
        return BaseEstimator.set_params(self, **kwargs)
Code example #5
    def test_vector_alignment(self):
        # Mock out a generic scikit-learn classifier
        mocked_model = BaseEstimator()
        mocked_model.fit = MagicMock()
        mocked_model.predict = MagicMock(return_value=[True])

        # Create a simple data frame extending to January 15
        date_sequence = pd.date_range('1/1/2011', periods=15, freq='D')
        time_series = pd.DataFrame({
            # This column will be accessed by name to generate the targets vector.
            'Violent Crime Committed?': [True, True] + [False]*13,

            # Actual time series used for nonsequential prediction will contain more than one column.
            # However, we just need to verify that it grabs the correct slices of each column,
            # so one stand-in column will suffice.
            'Other Data': [0]*10 + [1]*5
        }, index=date_sequence)

        # Construct a NonsequentialPredictor with the mock
        predictor = NonsequentialPredictor(time_series, model=mocked_model)

        # The date to predict comes before the end of the time series,
        # so all rows from the 13th on should be discarded
        date_to_predict = datetime.date(2011, 1, 13)

        # The mock always predicts True, so predict() should return True
        self.assertTrue(predictor.predict(date_to_predict))

        # And both fit and predict should have been called
        self.assertTrue(mocked_model.fit.called)
        self.assertTrue(mocked_model.predict.called)

        # When feeding training data to the sklearn model,
        # predict() needs to align each day of the time series with whether a violent crime was committed the NEXT day.
        # Thus, the first element of the Violent Crime Committed? column should have been removed
        #  before being used as the model's targets vector because it has no previous day to partner with.
        expected_targets = [True] + [False]*11

        # Similarly, the last element of any other column (in this case, 'Other Data')
        # should only go up to the day before the day we're trying to predict
        expected_features = [[0]]*10 + [[1]]*2

        # Get the two arguments passed to mocked_model
        fit_args = mocked_model.fit.call_args
        observed_features = fit_args[0][0]
        observed_targets = fit_args[0][1]

        # Equality tests with numpy arrays are wonky, so I convert numpy arrays to Python lists
        self.assertEqual(observed_targets.tolist(), expected_targets)
        self.assertEqual(observed_features.tolist(), expected_features)

        # Confirm the correct argument was passed to predict
        print(mocked_model.predict.call_args)
        observed_day_to_predict = mocked_model.predict.call_args[0][0]
        self.assertEqual(observed_day_to_predict.tolist(), [[1]])
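The call_args inspection used above is plain unittest.mock behavior; a standalone illustration:

from unittest.mock import MagicMock

m = MagicMock(return_value=[True])
m([[0], [1]], [False, True])

args, kwargs = m.call_args   # (positional args tuple, keyword args dict)
print(args[0])               # [[0], [1]]   -- the features argument
print(args[1])               # [False, True] -- the targets argument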
Code example #6
File: query_expansion.py Project: shatha2014/vec4ir
    def __init__(self, embedding, analyzer='word', m=10, verbose=0,
                 use_idf=True, **ev_params):
        """Expand a query by the nearest known tokens to its centroid."""
        self.embedding = embedding
        self.m = m
        self.vect = EmbeddedVectorizer(embedding,
                                       analyzer=analyzer,
                                       use_idf=use_idf,
                                       **ev_params)
        BaseEstimator.__init__(self)
Code example #7
File: eqlm.py Project: shatha2014/vec4ir
    def __init__(self, embedding, analyzer, m=10):
        """Initializes Embedding Based Query Expansion

        :embedding: TODO
        :analyzer: TODO
        :m: TODO

        """
        BaseEstimator.__init__(self)

        self._embedding = embedding
        self._m = m
        self._cv = CountVectorizer(analyzer=analyzer)
Code example #8
def train_fchl(rep_computer: FCHLRepresentation,
               model: BaseEstimator,
               mols: List[str],
               y: List[float],
               n_jobs: int = 1,
               y_lower: List[float] = None) -> BaseEstimator:
    """Retrain an FCHL-based model

    Args:
        rep_computer: Tool used to compute the FCHL-compatible representations for each molecule
        model: Model to be retrained
        mols: List of molecules (XYZ format) in training set
        y: List of target property values
        n_jobs: Number of threads to use for generating representations
        y_lower: Lower-fidelity estimate of the property. Used for delta learning models
    Returns:
        Retrained model
    """

    # Convert the input molecules into FCHL-ready inputs
    rep_computer.n_jobs = n_jobs
    reps = rep_computer.transform(mols)

    # Retrain the model
    if y_lower is not None:
        y = np.subtract(y, y_lower)
    return model.fit(reps, y)
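A self-contained sketch of the delta-learning trick used here: fit on the difference between the target and a low-fidelity estimate, then add the estimate back at prediction time. Ordinary sklearn pieces stand in for the FCHL representation:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y_lower = X @ np.array([1.0, 2.0, 3.0])   # cheap low-fidelity estimate
y = y_lower + 0.1 * X[:, 0]               # expensive high-fidelity target

model = LinearRegression().fit(X, np.subtract(y, y_lower))  # learn the delta
y_pred = np.add(model.predict(X), y_lower)                  # add it back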
Code example #9
    def fit(self, X, original_y):
        base_est = BaseEstimator()
        base_est.predict = lambda X: np.zeros(X.shape[0], dtype=float)
        self.estimators_ = [base_est]

        for i in range(self.n_estimators):
            grad = self.loss_grad(original_y, self._predict(X))
            estimator = deepcopy(self.base_regressor)
            estimator.fit(X, grad)

            self.estimators_.append(estimator)

        self.out_ = self._outliers(grad)
        self.feature_importances_ = self._calc_feature_imps()

        return self
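A runnable miniature of the same boosting loop for squared error, where the negative gradient that loss_grad computes is just the residual (the names and the 0.1 learning rate are illustrative):

import numpy as np
from copy import deepcopy
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(size=(200, 2))
y = np.sin(3 * X[:, 0]) + X[:, 1]

pred = np.zeros_like(y)
estimators = []
for _ in range(10):
    grad = y - pred                               # negative gradient of 0.5*(y - pred)^2
    tree = deepcopy(DecisionTreeRegressor(max_depth=2)).fit(X, grad)
    estimators.append(tree)
    pred += 0.1 * tree.predict(X)                 # shrink each correction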
Code example #10
File: generate.py Project: cesarrodrig/pos-tagger
def generate(model: base.BaseEstimator, sentences: List[List[str]]) -> None:
    """Tag the sentences with the given model.

    Parameters
    ----------
    model : base.BaseEstimator
        Trained tagging model whose predict method returns one tag per token.
    sentences : list
        List of lists of strings representing the sentences to tag.
    """
    print(f"Tagging {len(sentences)} sentences.")

    # Since the models were trained on the lemmatized version of the words,
    # we also lemmatize them when tagging unlabeled sentences.
    lemmatizer = stem.WordNetLemmatizer()

    for sentence in sentences:
        # Convert to the lemmatized versions
        lemmatized = [lemmatizer.lemmatize(w.lower()) for w in sentence]

        # Convert to conllu.TokenList because models expect that.
        # Since they are essentially dicts, we build them that way.
        tags = model.predict([[{"lemma": w} for w in lemmatized]])

        print("Word\tTag")
        for w, t in zip(sentence, tags[0]):
            print(f"{w}\t{t}")
        print()
Code example #11
def evaluate_fchl(rep_computer: FCHLRepresentation,
                  model: BaseEstimator,
                  mols: List[str],
                  n_jobs: int = 1,
                  y_lower: List[float] = None) -> np.ndarray:
    """Run an FCHL-based model

    Args:
        rep_computer: Tool used to compute the FCHL-compatible representations for each molecule
        model: Model to be evaluated
        mols: List of molecules (XYZ format) to evaluate
        n_jobs: Number of threads to use for generating representations
        y_lower: Lower-fidelity estimate of the property. Used for delta learning models
    Returns:
        Results from the inference
    """

    # Convert the input molecules into FCHL-ready inputs
    rep_computer.n_jobs = n_jobs
    reps = rep_computer.transform(mols)

    # Run the model
    y_pred = model.predict(reps)
    if y_lower is not None:
        # Delta-learning models predict the difference from the low-fidelity value
        y_pred = np.add(y_pred, y_lower)
    return y_pred
Code example #12
File: base.py Project: achennu/sklearn-xarray
    def get_params(self, deep=True):
        """ Get parameters for this estimator.

        Parameters
        ----------
        deep : boolean, optional
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : mapping of string to any
            Parameter names mapped to their values.
        """

        if self.compat:
            return BaseEstimator.get_params(self, deep)

        else:
            if self.estimator is not None:
                params = self.estimator.get_params(deep)
            else:
                # TODO: check if this is necessary
                params = dict()

            for p in self._get_param_names():
                params[p] = getattr(self, p, None)

            return params
Code example #13
File: study.py Project: adam2392/morf-demo
def summarize_feature_comparisons(
        base_clf: BaseEstimator, comparison_clfs: Dict[str, BaseEstimator], X_test, y_test
):
    from mlxtend.evaluate import mcnemar, cochrans_q, mcnemar_table

    summary_dict = collections.OrderedDict()
    mcnemar_tbs = dict()

    # create list of predicted values
    base_y_predict = base_clf.predict(X_test)
    y_predictions = [base_y_predict]
    for name, clf in comparison_clfs.items():
        # get the predicted labels
        y_predict = clf.predict(X_test)

        # form a McNemar table against the base classifier
        # (mcnemar_table returns a numpy array; store it directly --
        # arrays have no .values() method)
        tb = mcnemar_table(y_test, base_y_predict, y_predict)
        mcnemar_tbs[f"base vs {name}"] = tb

        # store predictions per classifier
        y_predictions.append(y_predict)

    # first run cochrans Q test
    qstat, pval = cochrans_q(y_test, *y_predictions)
    summary_dict["cochrans_q"] = qstat
    summary_dict["cochrans_q_pval"] = pval

    # run mcnemars test against all the predictions
    for name, table in mcnemar_tbs.items():
        chi2stat, pval = mcnemar(table, exact=True)
        summary_dict[f"mcnemar_{name}_chi2stat"] = chi2stat
        summary_dict[f"mcnemar_{name}_pval"] = pval

    return summary_dict
Code example #14
File: sklearn_utils.py Project: tchordia/ray
def has_cpu_params(estimator: BaseEstimator) -> bool:
    """Returns True if estimator has any CPU-related params."""
    return any(
        any(
            param.endswith(cpu_param_name)
            for cpu_param_name in SKLEARN_CPU_PARAM_NAMES)
        for param in estimator.get_params(deep=True))
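SKLEARN_CPU_PARAM_NAMES is a module-level list in the Ray utility; with a stand-in value, and assuming has_cpu_params above is in scope, the check behaves like this (the list contents here are an assumption, not Ray's actual definition):

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

SKLEARN_CPU_PARAM_NAMES = ["n_jobs"]  # hypothetical stand-in value

print(has_cpu_params(RandomForestClassifier()))  # True: exposes n_jobs
print(has_cpu_params(SVC()))                     # False: no CPU-count param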
Code example #15
    def out_of_fold(
            self,
            estimator: BaseEstimator,
            train_x, train_y,
            valid_x, valid_y):
        # For LightGBM and CatBoost, pass the following parameters at fit time
        fit_params = {}
        if type(estimator).__name__ in ('LGBMClassifier', 'CatBoostClassifier',):
            if 'eval_set' not in fit_params:
                fit_params['eval_set'] = [(valid_x, valid_y)]
            if 'early_stopping_rounds' not in fit_params:
                fit_params['early_stopping_rounds'] = 100

        estimator.fit(train_x, train_y, **fit_params)
        oof = self.make_pred(estimator, valid_x)
        return oof
Code example #16
def test_is_pairwise():
    """Test ``_is_pairwise``."""
    # Simple checks for _is_pairwise
    pca = KernelPCA(kernel='precomputed')
    with pytest.warns(None) as record:
        assert _is_pairwise(pca)
    assert not record

    # Pairwise attribute that is not consistent with the pairwise tag
    class IncorrectTagPCA(KernelPCA):
        """Class with incorrect _pairwise attribute."""

        _pairwise = False

    pca = IncorrectTagPCA(kernel='precomputed')
    msg = "_pairwise attribute is inconsistent with tags."
    with pytest.warns(FutureWarning, match=msg):
        assert not _is_pairwise(pca)

    # The _pairwise attribute is present and set to True while pairwise tag is
    # not present
    class TruePairwise(BaseEstimator):
        """Class without pairwise tag."""

        _pairwise = True

    true_pairwise = TruePairwise()
    with pytest.warns(FutureWarning, match=msg):
        assert _is_pairwise(true_pairwise)

    # Pairwise attribute is not defined thus tag is used
    est = BaseEstimator()
    with pytest.warns(None) as record:
        assert not _is_pairwise(est)
    assert not record
Code example #17
def standard_report(
    estimator: BaseEstimator,
    X_test: Union[pd.DataFrame, np.ndarray],
    y_test: Union[pd.Series, np.ndarray],
    zero_division: str = "warn",
) -> None:
    """Display standard report of diagnostic metrics and plots for classification.

    Parameters
    ----------
    estimator : BaseEstimator
        Fitted classification estimator for evaluation.
    X_test : DataFrame or ndarray of shape (n_samples, n_features)
        Predictor test set.
    y_test : Series or ndarray of shape (n_samples,)
        Target test set.
    zero_division : str, optional
        Value to return for division by zero: 0, 1, or 'warn'.
    """
    table = classification_report(y_test,
                                  estimator.predict(X_test),
                                  zero_division=zero_division,
                                  heatmap=True)
    classification_plots(estimator, X_test, y_test)
    display(table)
Code example #18
def test_is_pairwise():
    # simple checks for _is_pairwise
    pca = KernelPCA(kernel='precomputed')
    with pytest.warns(None) as record:
        assert _is_pairwise(pca)
    assert not record

    # pairwise attribute that is not consistent with the pairwise tag
    class IncorrectTagPCA(KernelPCA):
        _pairwise = False

    pca = IncorrectTagPCA(kernel='precomputed')
    msg = "_pairwise was deprecated in 0.24 and will be removed in 1.1"
    with pytest.warns(FutureWarning, match=msg):
        assert not _is_pairwise(pca)

    # the _pairwise attribute is present and set to True while pairwise tag is
    # not present
    class TruePairwise(BaseEstimator):
        _pairwise = True

    true_pairwise = TruePairwise()
    with pytest.warns(FutureWarning, match=msg):
        assert _is_pairwise(true_pairwise)

    # pairwise attribute is not defined thus tag is used
    est = BaseEstimator()
    with pytest.warns(None) as record:
        assert not _is_pairwise(est)
    assert not record
Code example #19
    def get_params(self, deep=True, **kwargs):
        params = BaseEstimator.get_params(self, deep=deep, **kwargs)
        # Callback parameters are not returned by .get_params and need
        # special treatment.
        params_cb = self._get_params_callbacks(deep=deep)
        params.update(params_cb)
        return params
Code example #20
File: sklearn_inference.py Project: nielm/beam
    def run_inference(
            self, batch: Sequence[numpy.ndarray], model: BaseEstimator,
            **kwargs) -> Iterable[PredictionResult]:
        # vectorize data for better performance
        vectorized_batch = numpy.stack(batch, axis=0)
        predictions = model.predict(vectorized_batch)
        return [PredictionResult(x, y) for x, y in zip(batch, predictions)]
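The stacking step is the whole optimization: one vectorized predict over the batch instead of one call per example. A small standalone check:

import numpy
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit([[0.0], [1.0]], [0.0, 1.0])
batch = [numpy.array([0.5]), numpy.array([2.0])]

vectorized_batch = numpy.stack(batch, axis=0)   # shape (2, 1)
predictions = model.predict(vectorized_batch)   # a single model call
print(list(zip(batch, predictions)))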
Code example #21
    def _predict_regression(
            self,
            X: np.ndarray,
            model: BaseEstimator,
            task_type: int,
            Y_train: Optional[np.ndarray] = None) -> np.ndarray:
        def send_warnings_to_log(
            message: Union[Warning, str],
            category: Type[Warning],
            filename: str,
            lineno: int,
            file: Optional[TextIO] = None,
            line: Optional[str] = None,
        ) -> None:
            self.logger.debug('%s:%s: %s:%s' %
                              (filename, lineno, str(category), message))
            return

        with warnings.catch_warnings():
            warnings.showwarning = send_warnings_to_log
            Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred
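The showwarning override is scoped: warnings.catch_warnings snapshots the handler and restores it on exit, so only predictions inside the with block are rerouted. A standalone sketch of the same pattern:

import warnings

def send_warnings_to_log(message, category, filename, lineno,
                         file=None, line=None):
    print(f"{filename}:{lineno}: {category.__name__}: {message}")

with warnings.catch_warnings():
    warnings.showwarning = send_warnings_to_log
    warnings.warn("routed to the handler above, not stderr", UserWarning)
# Here the original showwarning is back in place.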
Code example #22
    def get_params(self, deep=True):
        params = BaseEstimator.get_params(self, deep)
        params['dimensions'] = self.dimensions
        params['noise'] = self.noise
        params['epsilon'] = self.epsilon
        logging.debug("Getting params: %s", str(params))
        return params
Code example #23
File: uncertainty.py Project: IMDC/cappy
def classifier_margin(classifier: BaseEstimator, X: modALinput,
                      **predict_proba_kwargs) -> np.ndarray:
    """
    Classification margin uncertainty of the classifier for the provided samples. This uncertainty measure takes the
    first and second most likely predictions and computes the difference of their probabilities, which is the margin.

    Args:
        classifier: The classifier for which the prediction margin is to be measured.
        X: The samples for which the prediction margin of classification is to be measured.
        **predict_proba_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.

    Returns:
        Margin uncertainty, which is the difference of the probabilities of first and second most likely predictions.
    """
    try:
        classwise_uncertainty = classifier.predict_proba(
            X, **predict_proba_kwargs)
    except NotFittedError:
        return np.zeros(shape=(X.shape[0], ))

    if classwise_uncertainty.shape[1] == 1:
        return np.zeros(shape=(classwise_uncertainty.shape[0], ))

    part = np.partition(-classwise_uncertainty, 1, axis=1)
    margin = -part[:, 0] + part[:, 1]

    return margin
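To see what the np.partition step computes, a two-row numeric check (pure numpy, no classifier needed):

import numpy as np

proba = np.array([[0.10, 0.80, 0.10],    # confident prediction
                  [0.45, 0.50, 0.05]])   # ambiguous prediction

part = np.partition(-proba, 1, axis=1)   # two largest probabilities, negated
margin = -part[:, 0] + part[:, 1]        # top probability minus runner-up
print(margin)                            # [0.7  0.05]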
Code example #24
    def _predict_proba(
        self,
        X: np.ndarray,
        model: BaseEstimator,
        task_type: int,
        Y_train: Optional[np.ndarray] = None,
    ) -> np.ndarray:
        def send_warnings_to_log(
            message: Union[Warning, str],
            category: Type[Warning],
            filename: str,
            lineno: int,
            file: Optional[TextIO] = None,
            line: Optional[str] = None,
        ) -> None:
            self.logger.debug('%s:%s: %s:%s' %
                              (filename, lineno, str(category), message))
            return

        with warnings.catch_warnings():
            warnings.showwarning = send_warnings_to_log
            Y_pred = model.predict_proba(X, batch_size=1000)

        if Y_train is None:
            raise ValueError("Y_train is required for classification problems")

        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred
Code example #25
def max_std_sampling(regressor: BaseEstimator,
                     X: modALinput,
                     n_instances: int = 1,
                     random_tie_break=False,
                     **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
    """
    Regressor standard deviation sampling strategy.

    Args:
        regressor: The regressor for which the labels are to be queried.
        X: The pool of samples to query from.
        n_instances: Number of samples to be queried.
        random_tie_break: If True, shuffles utility scores to randomize the order. This
            can be used to break the tie when the highest utility score is not unique.
        **predict_kwargs: Keyword arguments to be passed to :meth:`predict` of the CommitteeRegressor.

    Returns:
        The indices of the instances from X chosen to be labelled;
        the instances from X chosen to be labelled.
    """
    _, std = regressor.predict(X, return_std=True, **predict_kwargs)
    std = std.reshape(X.shape[0], )

    if not random_tie_break:
        query_idx = multi_argmax(std, n_instances=n_instances)
    else:
        query_idx = shuffled_argmax(std, n_instances=n_instances)

    return query_idx, X[query_idx]
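The same strategy with a plain sklearn regressor that supports return_std; np.argsort stands in for modAL's multi_argmax to keep the sketch self-contained:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor

X_train = np.array([[0.0], [1.0]])
y_train = np.array([0.0, 1.0])
X_pool = np.linspace(-1.0, 2.0, 31).reshape(-1, 1)

gpr = GaussianProcessRegressor().fit(X_train, y_train)
_, std = gpr.predict(X_pool, return_std=True)

query_idx = np.argsort(std)[-1:]    # the single most uncertain sample
print(query_idx, X_pool[query_idx])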
Code example #26
    def decision_boundary(self, x: np.ndarray, y: np.ndarray,
                          model: BaseEstimator):
        x0 = x[:, 0]
        x1 = x[:, 1]

        x_min, x_max = x0.min() - 1, x0.max() + 1
        y_min, y_max = x1.min() - 1, x1.max() + 1

        xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                             np.arange(y_min, y_max, 0.1))

        z = model.predict(np.c_[xx.ravel(), yy.ravel()])
        z = z.reshape(xx.shape)
        z = z.astype(str)  # np.str was removed from NumPy; use the builtin

        y = [str(label) for label in y]

        fig = px.scatter(x=x0, y=x1, color=y)

        contour = go.Contour(z=z,
                             x=np.arange(x_min, x_max, 0.1),
                             y=np.arange(y_min, y_max, 0.1),
                             line_width=0,
                             colorscale=[[0, '#ff9900'], [1, '#6666ff']],
                             opacity=0.4,
                             showscale=False)

        fig.add_trace(contour)

        fig.update_layout(title='Decision boundary', legend_title='Label')

        pyo.iplot(fig)
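The meshgrid/ravel/np.c_ idiom above turns a grid into a two-column feature matrix the model can score in one call; in miniature:

import numpy as np

xx, yy = np.meshgrid(np.arange(0.0, 1.0, 0.5), np.arange(0.0, 1.0, 0.5))
grid = np.c_[xx.ravel(), yy.ravel()]   # one (x0, x1) row per grid point
print(grid.shape)                      # (4, 2): ready for model.predict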
Code example #27
File: train.py Project: ku222/GCP
    def evaluate_model(self, model: BaseEstimator, xtest: np.ndarray,
                       ytest: np.ndarray) -> ModelStats:
        """Get the accuracy, recall, and precision of this model."""
        ypreds = model.predict(xtest)
        # sklearn metrics expect (y_true, y_pred); precision and recall are
        # not symmetric in their arguments.
        return ModelStats(accuracy=accuracy_score(ytest, ypreds),
                          precision=precision_score(ytest, ypreds),
                          recall=recall_score(ytest, ypreds))
Code example #28
File: model.py Project: rharish101/AdvancedML
def finalize_model(
    model: BaseEstimator,
    X_train: CSVData,
    Y_train: CSVData,
    X_test: CSVData,
    test_ids: CSVData,
    output: str,
    smote_fn: SamplerFnType = None,
    outlier_detection: Any = None,
    header: Tuple[str, str] = ("id", "y"),
    label_indexing: int = 0,
    export_int: bool = False,
) -> None:
    """Train the model on the complete data and generate the submission file.

    Parameters
    ----------
    model: The model
    X_train: The training data
    Y_train: The training labels
    X_test: The test data
    test_ids: The IDs for the test data
    output: The path where to dump the output
    smote_fn: The function that takes labels and returns SMOTE
    outlier_detection: The outlier detector used to filter the training data
    header: The column names for the submission CSV
    label_indexing: What to start indexing the label from
    export_int: Whether to export the CSV as integers
    """
    print("Training model...")

    if outlier_detection is not None:
        outliers = outlier_detection.fit_predict(X_train)
        X_train = X_train[outliers == 1]
        Y_train = Y_train[outliers == 1]

    if smote_fn:
        smote = smote_fn(Y_train)
        X_train, Y_train = smote.fit_resample(X_train, Y_train)

    model.fit(X_train, Y_train)

    print("Model trained")
    Y_pred = model.predict(X_test) + label_indexing
    submission: Any = np.stack([test_ids, Y_pred], 1)  # Add IDs
    create_submission_file(output,
                           submission,
                           header=header,
                           export_int=export_int)
Code example #29
File: base.py Project: njtwomey/har_datasets
def instantiate_and_fit(
    index: pd.DataFrame,
    fold: pd.DataFrame,
    X: np.ndarray,
    y: pd.DataFrame,
    estimator: BaseEstimator,
    n_splits: int = 5,
    param_grid: Optional[Dict[str, Any]] = None,
) -> BaseEstimator:
    assert fold.shape[0] == index.shape[0]
    assert fold.shape[0] == X.shape[0]
    assert fold.shape[0] == y.shape[0]

    fold_vals = fold.ravel()

    train_inds = fold_vals == "train"
    val_inds = fold_vals == "val"

    if val_inds.sum():
        raise NotImplementedError(
            "Explicit validation indices not yet supported.")

    y = y.values.ravel()

    nan_row, nan_col = np.nonzero(np.isnan(X) | np.isinf(X))
    if len(nan_row):
        logger.warning(
            f"Setting {len(nan_row)} NaN elements to zero before fitting {estimator}."
        )
        X[nan_row, nan_col] = 0

    logger.info(f"Fitting {estimator} on data (shape: {X.shape})")

    if param_grid is not None:
        group_k_fold = GroupKFold(n_splits=n_splits).split(
            X[train_inds], y[train_inds], index.trial.values[train_inds])

        grid_search = GridSearchCV(estimator=estimator,
                                   param_grid=param_grid,
                                   verbose=10,
                                   cv=list(group_k_fold))
        grid_search.fit(X[train_inds], y[train_inds])

        return grid_search.best_estimator_

    estimator.fit(X[train_inds], y[train_inds])
    return estimator
Code example #30
def cv_best_hyperparams(model: BaseEstimator, X, y, k_folds, degree_range,
                        lambda_range):
    """
    Cross-validate to find best hyperparameters with k-fold CV.
    :param X: Training data.
    :param y: Training targets.
    :param model: sklearn model.
    :param lambda_range: Range of values for the regularization hyperparam.
    :param degree_range: Range of values for the degree hyperparam.
    :param k_folds: Number of folds for splitting the training data into.
    :return: A dict containing the best model parameters,
        with some of the keys as returned by model.get_params()
    """

    # TODO: Do K-fold cross validation to find the best hyperparameters
    #  Notes:
    #  - You can implement it yourself or use the built in sklearn utilities
    #    (recommended). See the docs for the sklearn.model_selection package
    #    http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
    #  - If your model has more hyperparameters (not just lambda and degree)
    #    you should add them to the search.
    #  - Use get_params() on your model to see what hyperparameters is has
    #    and their names. The parameters dict you return should use the same
    #    names as keys.
    #  - You can use MSE or R^2 as a score.

    # ====== YOUR CODE: ======
    DEGREE_PARAM = "bostonfeaturestransformer__degree"
    LAMBDA_PARAM = "linearregressor__reg_lambda"

    results = {}
    for degree in degree_range:
        for reg_lambda in lambda_range:
            params = model.get_params()
            params[DEGREE_PARAM] = degree
            params[LAMBDA_PARAM] = reg_lambda
            model.set_params(**params)
            scores = sklearn.model_selection.cross_val_score(
                model, X, y, scoring="neg_mean_squared_error", cv=k_folds)
            score = np.mean(scores)
            results[score] = params

    best_params = max(results.items(), key=lambda x: x[0])[1]

    # ========================

    return best_params
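The manual double loop can be collapsed into GridSearchCV, as the snippet's own notes suggest; a sketch reusing the same pipeline parameter names (assumed to match the model at hand):

from sklearn.model_selection import GridSearchCV

param_grid = {
    "bostonfeaturestransformer__degree": degree_range,
    "linearregressor__reg_lambda": lambda_range,
}
search = GridSearchCV(model, param_grid,
                      scoring="neg_mean_squared_error", cv=k_folds)
best_params = search.fit(X, y).best_params_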
Code example #31
    def __init__(self, model, periods=1, freq='30min'):
        """Lags a dataset.

        Lags all features.
        Missing data is dropped for fitting, and replaced with the mean for predict.

        :periods: Number of timesteps to lag by
        """
        assert isinstance(model, BaseEstimator), "`model` isn't a scikit-learn model"

        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)

        self.periods = periods
        self.freq = freq

        self.model = model
Code example #32
    def _validate_onnx_data(self, X):
        if X.dtype not in (numpy.float32, numpy.float64):
            raise ValueError(
                "Input X must have dtype float32 or float64.")
        X = BaseEstimator._validate_data(
            self, X, reset=False, dtype=[numpy.float64, numpy.float32],
            order='C')
        return X
Code example #33
File: blending.py Project: nyuge/xgboost
    def __init__(self, blending_regressor: BaseEstimator, model_name: str,
                 params: dict):
        super().__init__(model_name, params)
        self.blend_model = BlendingRegressor(
            blending_regressor.set_params(**params))
        self.MODELS_SERIALIZING_BASEPATH = self.path.join(
            self.MODELS_SERIALIZING_BASEPATH, MACHINE_LEARNING_TECHNIQUE_NAME)
        self.SERIALIZE_FILENAME_PREFIX = SERIALIZE_FILENAME_PREFIX
Code example #34
    def __init__(self, rf_estimator=None, lasso_estimator=None):
        """
        @param  rf_estimator     random forest estimator,
                                 :epkg:`sklearn:ensemble:RandomForestRegressor`
                                 by default
        @param  lasso_estimator  Lasso estimator,
                                 :epkg:`sklearn:linear_model:Lasso`
                                 by default
        """
        BaseEstimator.__init__(self)
        RegressorMixin.__init__(self)
        if rf_estimator is None:
            rf_estimator = RandomForestRegressor()
        if lasso_estimator is None:
            lasso_estimator = Lasso()
        self.rf_estimator = rf_estimator
        self.lasso_estimator = lasso_estimator
Code example #35
File: models.py Project: carloszanella/pypeline
    def __init__(self, model: BaseEstimator, multi_output: bool = False):
        name = type(model).__name__
        super().__init__(version=name)
        if multi_output:
            model = MultiOutputRegressor(model, n_jobs=-1)  # n_jobs is keyword-only in recent sklearn

        self.model = model
        self.params = model.get_params()
Code example #36
def test_unsupported():
    vec = CountVectorizer()
    clf = BaseEstimator()
    res = explain_prediction(clf, 'hello, world', vec=vec)
    assert 'BaseEstimator' in res.error
    for expl in format_as_all(res, clf):
        assert 'Error' in expl
        assert 'BaseEstimator' in expl
Code example #37
    def __init__(self, columns=None, remove=None, skip_errors=False, single=False):
        """
        @param      columns         specify a column selection
        @param      remove          modalities to remove
        @param      skip_errors     skip when new categories appear
        @param      single          use a single column per category, do not multiply them for each value

        The logging function displays a message when a new dense and big matrix
        is created when it should be sparse; a sparse matrix should be allocated instead.
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self.columns = columns if isinstance(
            columns, list) or columns is None else [columns]
        self.skip_errors = skip_errors
        self.remove = remove
        self.single = single
Code example #38
File: query_expansion.py Project: shatha2014/vec4ir
    def __init__(self, wv, m=10, analyzer=str.split, eqe=1, verbose=0, a=1,
                 c=0, n_jobs=1):
        """
        Initializes the embedding-based query language model query expansion
        technique.
        """
        BaseEstimator.__init__(self)
        self._wv = wv
        self._analyzer = analyzer
        if eqe not in [1, 2]:
            raise ValueError("eqe must be 1 or 2")
        self._eqe = eqe
        self.verbose = verbose
        self._a = a
        self._c = c
        self.m = m
        self.n_jobs = n_jobs
        self.vocabulary = None
Code example #39
File: core.py Project: shatha2014/vec4ir
    def __init__(self, retrieval_model, matching=None,
                 query_expansion=None, name='RM',
                 labels=None):
        """TODO: to be defined1.

        :retrieval_model: A retrieval model satisfying fit and query.
        :vectorizer: A vectorizer satisfying fit and transform (and fit_transform).
        :matching: A matching operation satisfying fit and predict.
        :query_expansion: A query operation satisfying fit and transform
        :labels: Pre-defined mapping of indices to identifiers, will be inferred during fit, if not given.

        """
        BaseEstimator.__init__(self)

        self._retrieval_model = retrieval_model
        self._matching = matching
        self._query_expansion = query_expansion
        self.name = name
        self.labels_ = np.asarray(labels) if labels is not None else None
Code example #40
    def __init__(self, columns=None, remove=None, skip_errors=False, single=False, fLOG=None):
        """
        constructor

        @param      columns         specify a column selection
        @param      remove          modalities to remove
        @param      skip_errors     skip when new categories appear
        @param      single          use a single column per category, do not multiply them for each value
        @param      fLOG            logging function

        The logging function displays a message when a new dense and big matrix
        is created when it should be sparse; a sparse matrix should be allocated instead.
        """
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        self._p_columns = columns if isinstance(
            columns, list) or columns is None else [columns]
        self._p_skip_errors = skip_errors
        self._p_remove = remove
        self._p_single = single
        self.fLOG = fLOG
Code example #41
    def get_params(self, deep=True):
        params = BaseEstimator.get_params(self, deep)
        params['max_dimensions'] = self.max_dimensions
        params['beta'] = self.beta
        params['C'] = self.C
        return params
Code example #42
    def get_params(self, deep=True):
        params = BaseEstimator.get_params(self, deep)
        params['beta'] = self.beta
        return params
Code example #43
File: base.py Project: YangHaha11514/skorch
    def get_params(self, deep=True):
        return BaseEstimator.get_params(self, deep=deep)
Code example #44
File: base.py Project: YangHaha11514/skorch
    def set_params(self, **params):
        # return self to match the sklearn convention for set_params
        return BaseEstimator.set_params(self, **params)
Code example #45
    def __init__(self, cost_func, n_class=2):
        BaseEstimator.__init__(self)
        self.n_class = n_class
        self.cost_func = cost_func