Example #1
0
 def fit(self, X):
     """Split the samples of ``X`` into two groups with a straight line.

     Starting from ``self.n_clusters`` randomly chosen samples, each
     iteration (at most 100) does the following:

     1. computes the slope of the line through the current centroids,
     2. builds the line orthogonal to it that passes through the mean of
        the centroids,
     3. labels every sample ``0`` when it lies on or above that line and
        ``1`` otherwise,
     4. recomputes the centroids as the per-label mean of the samples.

     Iteration stops early once the centroids no longer change.

     Parameters
     ----------
     X : numpy.ndarray
         Samples with exactly two columns; they are unpacked as ``x`` and
         ``y`` coordinates.

     Returns
     -------
     self
         With ``self._classes`` set to the 0/1 label of each sample and
         ``self._centroids`` set to the per-label means (float32, one row
         per label that actually occurs).
     """
     # Function-scope import as in the original, but executed once instead
     # of on every iteration of the loop.
     from pandas import DataFrame

     points = X[np.random.choice(len(X),
                                 size=self.n_clusters,
                                 replace=False)]
     points_history = np.zeros_like(points)

     for _ in range(100):
         # (y0 - y1 - ...) / (x0 - x1 - ...): slope between the centroids.
         slope = np.divide.reduce(
             np.subtract.reduce(np.fliplr(points), axis=0)[None, :],
             axis=1)[0]

         # Orthogonal line through the mean of the centroids.
         slope_orthogonal = -1 / slope
         xbar, ybar = points.mean(axis=0)
         intercept_orthogonal = ybar - (xbar * slope_orthogonal)

         # np.vectorize is kept on purpose so the per-sample arithmetic
         # stays exactly what it was before this cleanup.
         func = np.vectorize(
             lambda x, y: y >= (x * slope_orthogonal + intercept_orthogonal))

         # Label 0: on/above the separating line, label 1: below it.
         y = np.where(func(*X.T), 0, 1)

         # New centroids are the per-label means of the samples.
         points = DataFrame(
             X, dtype='f',
             columns=['x', 'y']).assign(c=y).groupby('c').mean().values

         # When every sample falls on one side, groupby returns a single
         # centroid. Comparing arrays of different sizes with np.allclose
         # raises ValueError, so treat a shape change as "not converged".
         if (points.shape == points_history.shape
                 and np.allclose(points, points_history)):
             break
         points_history = points

     self._classes = y
     self._centroids = points
     return self
Example #2
0
def invertTransform(transformed_data, transformer=None):
    """
    Invert the transformed data

    Inputs:
      > transformed_data: The data that has been transformed (ndarray,
        DataFrame, Series, or anything else the transformer accepts)
      > transformer (None by default): If not None, an object exposing
        ``inverse_transform`` (e.g. the scikit-learn transform returned by
        the transformData function)

    Output:
      > The inverted data (if transformer is passed) or the data unchanged
        (if transformer is None). A DataFrame input comes back as a
        DataFrame with the same columns and a Series input as a Series with
        the same name. A 1-D ndarray is reshaped to a single column before
        inversion, so its result is whatever the transformer returns for
        that column.
    """
    if transformer is None:
        return transformed_data

    print("Inverting data")

    if isinstance(transformed_data, DataFrame):
        columns = transformed_data.columns
        inverted_data = transformer.inverse_transform(transformed_data)
        return DataFrame(inverted_data, columns=columns)

    if isinstance(transformed_data, Series):
        name = transformed_data.name
        # Reshape the underlying values: transformers expect a 2-D column.
        column = reshape(transformed_data.values, (-1, 1))
        inverted_data = transformer.inverse_transform(column)
        return Series(inverted_data.ravel(), name=name)

    if isinstance(transformed_data, ndarray) and transformed_data.ndim == 1:
        print("  Reshaping transformed data")
        transformed_data = reshape(transformed_data, (-1, 1))

    # Previously only 1-D ndarrays were inverted here; 2-D arrays and other
    # array-likes fell through and hit an unassigned ``inverted_data``.
    return transformer.inverse_transform(transformed_data)
Example #3
0
def predict(test_data, results, model_name):
    """
    Return predictions based on model results.

    The design matrix is built from ``test_data`` with the model's formula,
    aligned with the fitted parameters through
    ``get_dataframes_intersections``, and each row is then multiplied
    element-wise by the parameters and summed.

    Parameters
    ----------
    test_data: DataFrame
        should be test data you are trying to predict
    results: dict
        should be dict of your models results wrapper and the formula used
        to produce it.
            ie.
            results['Model_Name'] = [
                <statsmodels.regression.linear_model.RegressionResultsWrapper>,
                "Price ~ I(Supply, Demand)"]
    model_name: str
        should be the name of your model. You can iterate through the results dict.

    Returns
    -------
    NumPy array
        Predictions in a flat NumPy array.

    Example
    -------
    results = {'Logit': [<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x117896650>,
               'survived ~ C(pclass) + C(sex) + age + sibsp  + C(embarked)']}
    compared_results = predict(test_data, results, 'Logit')

    """
    model_params = DataFrame(results[model_name][0].params)
    formula = results[model_name][1]

    # Create regression friendly test DataFrame; the response matrix that
    # dmatrices also returns is not needed for prediction.
    _, xt = dmatrices(formula, data=test_data, return_type='dataframe')
    xt, model_params = get_dataframes_intersections(xt, xt.columns,
                                                    model_params,
                                                    model_params.index)

    # Convert to NumPy arrays for performance
    xt = np.asarray(xt)
    coefficients = np.asarray(model_params).ravel()

    # Broadcasting multiplies every row of xt by the coefficient vector, so
    # there is no need to tile the vector once per row (the old tiling loop
    # also relied on the Python 2-only ``xrange``).
    return np.sum(np.multiply(xt, coefficients), axis=1)


# import predict
# __all__ = ['predict']
Example #4
0
def predict(test_data, results, model_name):
    """
    Return predictions based on model results.

    The design matrix is built from ``test_data`` with the model's formula,
    aligned with the fitted parameters through ``ka_df.get__intersections``,
    and each row is then multiplied element-wise by the parameters and
    summed.

    Parameters
    ----------
    test_data: DataFrame
        should be test data you are trying to predict
    results: dict
        should be dict of your models results wrapper and the formula used
        to produce it.
            ie.
            results['Model_Name'] = [<statsmodels.regression.linear_model.RegressionResultsWrapper>,
                                     "Price ~ I(Supply, Demand)"]
    model_name: str
        should be the name of your model. You can iterate through the results dict.

    Returns
    -------
    NumPy array
        Predictions in a flat NumPy array.

    Example
    -------
    results = {
        'Logit': [<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x117896650>,
                  'survived ~ C(pclass) + C(sex) + age + sibsp  + C(embarked)']
    }
    compared_results = predict(test_data, results, 'Logit')

    """
    model_params = DataFrame(results[model_name][0].params)
    formula = results[model_name][1]

    # Create regression friendly test DataFrame; the response matrix that
    # dmatrices also returns is not needed for prediction.
    _, xt = dmatrices(formula, data=test_data, return_type='dataframe')
    xt, model_params = ka_df.get__intersections(xt, xt.columns,
                                                model_params, model_params.index)

    # Convert to NumPy arrays for performance
    xt = np.asarray(xt)
    coefficients = np.asarray(model_params).ravel()

    # Broadcasting multiplies every row of xt by the coefficient vector, so
    # there is no need to tile the vector once per row (the old tiling loop
    # also relied on the Python 2-only ``xrange``).
    return np.sum(np.multiply(xt, coefficients), axis=1)
Example #5
0
def instantiate_and_fit(
    index: pd.DataFrame,
    fold: pd.DataFrame,
    X: np.ndarray,
    y: pd.DataFrame,
    estimator: BaseEstimator,
    n_splits: int = 5,
    param_grid: Optional[Dict[str, Any]] = None,
) -> BaseEstimator:
    """Fit ``estimator`` on the rows whose fold label is ``"train"``.

    Args:
        index: Per-row metadata; its ``trial`` column supplies the groups
            for ``GroupKFold`` when a parameter grid is given.
        fold: Per-row fold labels. Must expose ``.ravel()``.
            NOTE(review): annotated as a DataFrame, which has no ``ravel``;
            presumably an array is passed in practice - confirm with callers.
        X: Two-dimensional feature matrix. NaN/inf entries are overwritten
            with zero **in place** before fitting.
        y: Targets, flattened through ``y.values.ravel()``.
        estimator: Estimator to fit (or to tune when ``param_grid`` is set).
        n_splits: Number of ``GroupKFold`` splits used for the grid search.
        param_grid: When given, run ``GridSearchCV`` over this grid.

    Returns:
        The fitted ``estimator`` itself, or the grid search's
        ``best_estimator_`` when ``param_grid`` is given.

    Raises:
        NotImplementedError: If any row is labelled ``"val"``.
    """
    n_rows = fold.shape[0]
    assert n_rows == index.shape[0]
    assert n_rows == X.shape[0]
    assert n_rows == y.shape[0]

    split_labels = fold.ravel()
    is_train = split_labels == "train"
    is_val = split_labels == "val"

    if is_val.sum():
        raise NotImplementedError(
            "Explicit validation indices not yet supported.")

    targets = y.values.ravel()

    # Locate every non-finite entry (NaN or +/-inf) and zero it in place.
    bad_rows, bad_cols = np.nonzero(np.isnan(X) | np.isinf(X))
    n_bad = len(bad_rows)
    if n_bad:
        logger.warning(
            f"Setting {n_bad} NaN elements to zero before fitting {estimator}."
        )
        X[bad_rows, bad_cols] = 0

    logger.info(f"Fitting {estimator} on data (shape: {X.shape})")

    X_train = X[is_train]
    y_train = targets[is_train]

    # Plain fit when no hyper-parameter search was requested.
    if param_grid is None:
        estimator.fit(X_train, y_train)
        return estimator

    # Grouped folds are built from the trial identifiers of the train rows.
    trial_groups = index.trial.values[is_train]
    cv_splits = list(
        GroupKFold(n_splits=n_splits).split(X_train, y_train, trial_groups))

    searcher = GridSearchCV(estimator=estimator,
                            param_grid=param_grid,
                            verbose=10,
                            cv=cv_splits)
    searcher.fit(X_train, y_train)
    return searcher.best_estimator_
    def test_onnxt_iris_voting_regressor(self):
        """Check that the ONNX conversion of a VotingRegressor reproduces
        the scikit-learn predictions (to 6 decimals) on eight iris rows."""
        dataset = load_iris()
        features = dataset.data
        target = dataset.target.astype(numpy.float32)
        X_train, X_test, y_train, _ = train_test_split(
            features, target, random_state=11)

        # Ensemble of a linear model and a shallow regression tree.
        base_models = [
            ('lr', LinearRegression()),
            ('dt', DecisionTreeRegressor(max_depth=2)),
        ]
        ensemble = VotingRegressor(estimators=base_models)
        ensemble.fit(X_train, y_train)

        # Compare on the first four and last four test rows, in float32.
        X_test = X_test.astype(numpy.float32)
        sample = numpy.vstack([X_test[:4], X_test[-4:]])
        expected = ensemble.predict(sample).astype(numpy.float32)

        onnx_model = to_onnx(ensemble, X_train.astype(numpy.float32))
        session = OnnxInference(onnx_model, runtime='python')
        outputs = session.run({'X': sample})

        predicted = DataFrame(outputs['variable']).values
        self.assertEqualArray(expected, predicted.ravel(), decimal=6)
Example #7
0
def predict(test_data, results, model_name):
    """
    Return a NumPy array of predictions for a test file, based on your
    regression of a train file.

    The design matrix is built from ``test_data`` with the model's formula.
    Columns without a fitted parameter and parameters without a matching
    column are dropped, then each row is multiplied element-wise by the
    remaining parameters and summed.

    Parameters
    ----------
    test_data: pandas dataframe
        should be test data you are trying to predict
    results: dict
        should be dict of your models results wrapper and the formula used
        to produce it.
            ie.
            results['Model_Name'] = [<statsmodels.regression.linear_model.RegressionResultsWrapper>, "Price ~ I(Supply, Demand)"]
    model_name: str
        should be the name of your model. You can iterate through the results dict.

    Returns
    -------
    NumPy array
        Predictions in a flat NumPy array.

    Example
    -------
    results = {'Logit': [<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x117896650>,
               'survived ~ C(pclass) + C(sex) + age + sibsp  + C(embarked)']}
    compared_results = predict(test_data, results, 'Logit')
    """
    model_params = DataFrame(results[model_name][0].params)
    formula = results[model_name][1]

    # Create reg friendly test dataframe; the response matrix that
    # dmatrices also returns is not needed for prediction.
    _, xt = dmatrices(formula, data=test_data, return_type='dataframe')

    # Drop design-matrix columns the model has no parameter for.
    extra_columns = [c for c in xt.columns if c not in model_params.index]
    xt = xt.drop(extra_columns, axis=1)

    # Drop parameters with no matching column. This list used to be built
    # twice (comprehension plus loop), so every label appeared in it twice.
    extra_params = [c for c in model_params.index if c not in xt.columns]
    model_params = model_params.drop(extra_params)

    # Convert to NumPy arrays for performance
    xt = np.asarray(xt)
    coefficients = np.asarray(model_params).ravel()

    # Broadcasting multiplies every row of xt by the coefficient vector, so
    # there is no need to tile the vector once per row (the old tiling loop
    # also relied on the Python 2-only ``xrange``).
    return np.sum(np.multiply(xt, coefficients), axis=1)
Example #8
0
def predict(test_data, results, i):
    """
    Returns a NumPy array of predictions for a test file, based on your
    regression of a train file. Built for speed.

    The design matrix is built from ``test_data`` with the model's formula.
    Columns without a fitted parameter and parameters without a matching
    column are dropped, then each row is multiplied element-wise by the
    remaining parameters and summed.

    Parameters
    --
    test_data: should be test data you are trying to predict in a pandas dataframe
    results: should be dict of your models results wrapper and the formula used to produce it.
        ie.
        results['Model_Name'] = [<statsmodels.regression.linear_model.RegressionResultsWrapper>, "Price ~ I(Supply, Demand)"]
    i: should be the name of your model. You can iterate through the results dict.
    --

    Returns
    --
    Predictions in a flat NumPy array.
    AGC 2013
    """
    import numpy as np
    from pandas import DataFrame
    from patsy import dmatrices

    model_params = DataFrame(results[i][0].params)
    formula = results[i][1]

    # Create reg friendly test dataframe; the response matrix that
    # dmatrices also returns is not needed for prediction.
    _, xt = dmatrices(formula, data=test_data, return_type='dataframe')

    # Drop design-matrix columns the model has no parameter for.
    extra_columns = [c for c in xt.columns if c not in model_params.index]
    xt = xt.drop(extra_columns, axis=1)

    # Drop parameters that have no matching design-matrix column.
    extra_params = [c for c in model_params.index if c not in xt.columns]
    model_params = model_params.drop(extra_params)

    # Convert to NumPy arrays for performance
    xt = np.asarray(xt)
    coefficients = np.asarray(model_params).ravel()

    # Broadcasting multiplies every row of xt by the coefficient vector, so
    # there is no need to tile the vector once per row (the old tiling loop
    # also relied on the Python 2-only ``xrange``).
    return np.sum(np.multiply(xt, coefficients), axis=1)
Example #9
0
 def train(self, x_train: pd.DataFrame, y_train: pd.DataFrame):
     """Fit ``self.model`` on ``x_train`` with ``y_train`` flattened to 1-D.

     Args:
         x_train: Training features, passed to ``self.model.fit`` unchanged.
         y_train: Training targets. Flattened with ``numpy.ravel`` so that a
             DataFrame (which has no ``.ravel()`` method), a Series or an
             ndarray are all accepted.
     """
     # Function-scope import: only ``pd`` is visibly in scope for this method.
     import numpy as np

     self.model.fit(x_train, np.ravel(y_train))