def fit(self, X):
    """Split ``X`` into two clusters with an iteratively refined dividing line.

    Two distinct rows of ``X`` are drawn at random (``np.random.choice``) as
    the initial centroids.  On every iteration the samples are divided by the
    line that passes through the midpoint of the two centroids and is
    perpendicular to the segment joining them; the centroids are then
    recomputed as the per-class means.  Iteration stops as soon as the
    centroids stop moving, or after 100 rounds.

    Class 0 is the side of the dividing line with the larger ``y`` values
    (for a vertical dividing line: the side with the larger ``x`` values).

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, 2)
        Samples; the two columns are treated as ``x`` and ``y``.

    Returns
    -------
    self
        ``self._classes`` holds the 0/1 label of every sample and
        ``self._centroids`` the per-class means (float32, row ``k`` belongs
        to class ``k``).

    Raises
    ------
    ValueError
        If ``self.n_clusters`` is not 2 (a single line can only separate two
        groups) or if the two centroids coincide.
    """
    # Imported once here instead of on every loop iteration.
    from pandas import DataFrame

    if self.n_clusters != 2:
        raise ValueError(
            "fit() separates the data with a single line and therefore "
            "requires n_clusters == 2")

    points = X[np.random.choice(len(X), size=self.n_clusters, replace=False)]
    points_history = np.zeros_like(points)
    labels = None

    for _ in range(100):
        direction = points[0] - points[1]
        if not np.any(direction):
            raise ValueError(
                "the two centroids coincide; cannot place a dividing line")
        midpoint = points.mean(axis=0)

        # Signed position of each sample relative to the perpendicular
        # bisector.  Working with the projection instead of slope/intercept
        # avoids the division by zero that occurred when both centroids
        # shared the same y (or x) coordinate.
        side = np.dot(X - midpoint, direction)

        # Orient the direction vector so that "positive" always means
        # "above the line" (or "right of it" when the line is vertical);
        # this keeps the labels independent of the centroid order.
        dx, dy = direction
        if dy < 0 or (dy == 0 and dx < 0):
            side = -side
        labels = np.where(side >= 0, 0, 1)

        # New centroids: mean of the samples in each class.
        points = (DataFrame(X, dtype='f', columns=['x', 'y'])
                  .assign(c=labels)
                  .groupby('c')
                  .mean()
                  .values)

        if np.allclose(points.ravel(), points_history.ravel()):
            break
        points_history = points

    self._classes = labels
    self._centroids = points
    return self
def invertTransform(transformed_data, transformer=None):
    """
    Invert previously transformed data.

    Inputs:
      > transformed_data: The data that has been transformed
        (numpy ndarray, pandas DataFrame or pandas Series)
      > transformer (None by default): If not None, an object exposing
        ``inverse_transform`` (e.g. the scikit-learn transform returned
        by the transformData function)

    Output:
      > If transformer is None, ``transformed_data`` is returned untouched.
        Otherwise the inverted data in the same container type as the input:
          - ndarray   -> whatever ``inverse_transform`` returns; a 1-D input
                         is first reshaped into a single column
          - DataFrame -> DataFrame with the original column labels
                         (the index is not carried over)
          - Series    -> Series with the original name
                         (the index is not carried over)

    Raises:
      > TypeError if a transformer is given but ``transformed_data`` is not
        one of the supported container types.
    """
    if transformer is None:
        return transformed_data

    print("Inverting data")

    if isinstance(transformed_data, ndarray):
        if transformed_data.ndim == 1:
            print(" Reshaping transformed data")
            transformed_data = reshape(transformed_data, (-1, 1))
        return transformer.inverse_transform(transformed_data)

    if isinstance(transformed_data, DataFrame):
        inverted = transformer.inverse_transform(transformed_data)
        return DataFrame(inverted, columns=transformed_data.columns)

    if isinstance(transformed_data, Series):
        # Reshape the underlying array, not the Series itself: numpy would
        # otherwise try to wrap the 2-D result back into a Series.
        column = reshape(transformed_data.values, (-1, 1))
        inverted = transformer.inverse_transform(column)
        return Series(inverted.ravel(), name=transformed_data.name)

    # Previously this fell through to an UnboundLocalError.
    raise TypeError(
        "invertTransform expects a numpy ndarray, pandas DataFrame or "
        "pandas Series, got %s" % type(transformed_data).__name__)
def predict(test_data, results, model_name):
    """
    Return predictions based on model results.

    The prediction for each row is the sum of the design-matrix values
    multiplied by the fitted coefficients (a plain linear predictor).

    Parameters
    ----------
    test_data: DataFrame
        should be test data you are trying to predict
    results: dict
        should be dict of your models results wrapper and the formula used
        to produce it.
        ie.
        results['Model_Name'] = [fitted_statsmodels_results,
                                 "Price ~ I(Supply, Demand)"]
    model_name: str
        should be the name of your model. You can iterate through the
        results dict.

    Returns
    -------
    NumPy array
        Predictions in a flat NumPy array, one value per design-matrix row.

    Example
    -------
    results = {'Logit': [fitted_logit_results,
               'survived ~ C(pclass) + C(sex) + age + sibsp + C(embarked)']}
    compared_results = predict(test_data, results, 'Logit')
    """
    fitted_model, formula = results[model_name][0], results[model_name][1]
    model_params = DataFrame(fitted_model.params)

    # Create regression friendly test DataFrame; only the design matrix
    # (right-hand side) is needed for predicting.
    _, xt = dmatrices(formula, data=test_data, return_type='dataframe')

    # NOTE(review): presumably restricts both frames to the labels they
    # share -- confirm against get_dataframes_intersections.
    xt, model_params = get_dataframes_intersections(xt, xt.columns,
                                                    model_params,
                                                    model_params.index)

    # Convert to NumPy arrays for performance
    xt = np.asarray(xt)
    coefficients = np.asarray(model_params).ravel()

    # Broadcasting multiplies every row by the coefficient vector; this
    # replaces the Python-2-only xrange loop that tiled the vector once
    # per row and yields the same values.
    predictions = np.sum(np.multiply(xt, coefficients), axis=1)
    return predictions
# import predict
# __all__ = ['predict']
def predict(test_data, results, model_name):
    """
    Return predictions based on model results.

    Each prediction is the row-wise sum of the design-matrix values times
    the fitted coefficients (a plain linear predictor).

    Parameters
    ----------
    test_data: DataFrame
        should be test data you are trying to predict
    results: dict
        should be dict of your models results wrapper and the formula used
        to produce it.
        ie.
        results['Model_Name'] = [fitted_statsmodels_results,
                                 "Price ~ I(Supply, Demand)"]
    model_name: str
        should be the name of your model. You can iterate through the
        results dict.

    Returns
    -------
    NumPy array
        Predictions in a flat NumPy array, one value per design-matrix row.

    Example
    -------
    results = {
        'Logit': [fitted_logit_results,
                  'survived ~ C(pclass) + C(sex) + age + sibsp + C(embarked)']
    }
    compared_results = predict(test_data, results, 'Logit')
    """
    entry = results[model_name]
    model_params = DataFrame(entry[0].params)
    formula = entry[1]

    # Create regression friendly test DataFrame; the response part returned
    # by dmatrices is not needed to predict.
    _, xt = dmatrices(formula, data=test_data, return_type='dataframe')

    # NOTE(review): presumably keeps only the labels present in both
    # frames -- confirm against ka_df.get__intersections.
    xt, model_params = ka_df.get__intersections(xt, xt.columns,
                                                model_params,
                                                model_params.index)

    # Convert to NumPy arrays for performance
    design = np.asarray(xt)
    weights = np.asarray(model_params).ravel()

    # NumPy broadcasting applies the weight vector to every row, so there
    # is no need to tile it with a (Python-2-only) xrange loop.
    return np.sum(np.multiply(design, weights), axis=1)
def instantiate_and_fit(
    index: pd.DataFrame,
    fold: pd.DataFrame,
    X: np.ndarray,
    y: pd.DataFrame,
    estimator: BaseEstimator,
    n_splits: int = 5,
    param_grid: Optional[Dict[str, Any]] = None,
) -> BaseEstimator:
    """Fit ``estimator`` on the rows whose fold label is ``"train"``.

    Parameters
    ----------
    index:
        Per-row metadata; its ``trial`` column supplies the groups for
        ``GroupKFold`` when ``param_grid`` is given.
    fold:
        Per-row fold label.  Rows equal to ``"train"`` are used for fitting;
        any row equal to ``"val"`` is rejected.
    X:
        Feature matrix.  NaN and infinite entries are overwritten with zero
        IN PLACE, i.e. the caller's array is modified.
    y:
        Targets; flattened to one dimension before fitting.
    estimator:
        Object exposing ``fit``; fitted in place when no grid is given.
    n_splits:
        Number of ``GroupKFold`` splits used by the grid search.
    param_grid:
        If not ``None``, run a ``GridSearchCV`` over this grid.

    Returns
    -------
    The grid search's ``best_estimator_`` when ``param_grid`` is given,
    otherwise ``estimator`` itself after fitting.

    Raises
    ------
    AssertionError
        If ``index``, ``fold``, ``X`` and ``y`` disagree on the row count.
    NotImplementedError
        If any fold label equals ``"val"``.
    """
    n_rows = fold.shape[0]
    assert n_rows == index.shape[0]
    assert n_rows == X.shape[0]
    assert n_rows == y.shape[0]

    # np.asarray accepts DataFrame, Series and ndarray alike; DataFrame has
    # no ``ravel`` method, so ``fold.ravel()`` failed for the annotated type.
    fold_vals = np.asarray(fold).ravel()
    train_inds = fold_vals == "train"
    val_inds = fold_vals == "val"
    if val_inds.sum():
        raise NotImplementedError(
            "Explicit validation indices not yet supported.")

    y = y.values.ravel()

    # Replace non-finite features (NaN as well as +/-inf) with zero.
    nan_row, nan_col = np.nonzero(np.isnan(X) | np.isinf(X))
    if len(nan_row):
        logger.warning(
            f"Setting {len(nan_row)} NaN elements to zero before fitting {estimator}."
        )
        X[nan_row, nan_col] = 0

    logger.info(f"Fitting {estimator} on data (shape: {X.shape})")

    X_train, y_train = X[train_inds], y[train_inds]

    if param_grid is None:
        estimator.fit(X_train, y_train)
        return estimator

    # Group the CV splits by trial; the splits are materialised into a list
    # before being handed to GridSearchCV.
    group_k_fold = GroupKFold(n_splits=n_splits).split(
        X_train, y_train, index.trial.values[train_inds])
    grid_search = GridSearchCV(estimator=estimator,
                               param_grid=param_grid,
                               verbose=10,
                               cv=list(group_k_fold))
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_
def test_onnxt_iris_voting_regressor(self):
    """A VotingRegressor fitted on iris must give the same predictions
    through the ONNX conversion run with the 'python' runtime."""
    # Iris features with float32 targets, split with a fixed seed.
    dataset = load_iris()
    features = dataset.data
    targets = dataset.target.astype(numpy.float32)
    X_train, X_test, y_train, _unused = train_test_split(
        features, targets, random_state=11)

    # Ensemble of a linear model and a shallow tree.
    voters = [
        ('lr', LinearRegression()),
        ('dt', DecisionTreeRegressor(max_depth=2)),
    ]
    model = VotingRegressor(estimators=voters)
    model.fit(X_train, y_train)

    # Compare on the first four and last four test rows only.
    X_test = X_test.astype(numpy.float32)
    sample = numpy.vstack([X_test[:4], X_test[-4:]])
    expected = model.predict(sample).astype(numpy.float32)

    # Convert and run the converted model.
    onnx_model = to_onnx(model, X_train.astype(numpy.float32))
    session = OnnxInference(onnx_model, runtime='python')
    outputs = session.run({'X': sample})

    got = DataFrame(outputs['variable']).values
    self.assertEqualArray(expected, got.ravel(), decimal=6)
def predict(test_data, results, model_name):
    """
    Return a NumPy array of predictions for a test file based on your
    regression of a train file.

    Each prediction is the row-wise sum of the design-matrix values times
    the matching fitted coefficients (a plain linear predictor).

    Parameters
    ----------
    test_data: pandas dataframe
        should be test data you are trying to predict
    results: dict
        should be dict of your models results wrapper and the formula used
        to produce it.
        ie.
        results['Model_Name'] = [fitted_statsmodels_results,
                                 "Price ~ I(Supply, Demand)"]
    model_name: str
        should be the name of your model. You can iterate through the
        results dict.

    Returns
    -------
    NumPy array
        Predictions in a flat NumPy array, one value per design-matrix row.

    Example
    -------
    results = {'Logit': [fitted_logit_results,
               'survived ~ C(pclass) + C(sex) + age + sibsp + C(embarked)']}
    compared_results = predict(test_data, results, 'Logit')
    """
    model_params = DataFrame(results[model_name][0].params)
    formula = results[model_name][1]

    # Create reg friendly test dataframe; only the design matrix is used.
    _, xt = dmatrices(formula, data=test_data, return_type='dataframe')

    # Keep only the terms known to both the design matrix and the model,
    # and select the coefficients in design-matrix column order so every
    # column is multiplied by the coefficient of the same name.
    shared = [c for c in xt.columns if c in model_params.index]
    xt = xt[shared]
    model_params = model_params.loc[shared]

    # Convert to NumPy arrays for performance
    xt = np.asarray(xt)
    coefficients = np.asarray(model_params).ravel()

    # Broadcasting applies the coefficient vector to every row; this
    # replaces the Python-2-only xrange loop that tiled it row by row.
    predictions = np.sum(np.multiply(xt, coefficients), axis=1)
    return predictions
def predict(test_data, results, i):
    """
    Return a NumPy array of predictions for a test file based on your
    regression of a train file. Built for speed.

    Each prediction is the row-wise sum of the design-matrix values times
    the matching fitted coefficients (a plain linear predictor).

    Parameters
    ----------
    test_data: pandas dataframe
        should be test data you are trying to predict
    results: dict
        should be dict of your models results wrapper and the formula used
        to produce it.
        ie.
        results['Model_Name'] = [fitted_statsmodels_results,
                                 "Price ~ I(Supply, Demand)"]
    i: str
        should be the name of your model. You can iterate through the
        results dict.

    Returns
    -------
    NumPy array
        Predictions in a flat NumPy array, one value per design-matrix row.

    AGC 2013
    """
    import numpy as np
    from pandas import DataFrame
    from patsy import dmatrices

    fitted_model, formula = results[i][0], results[i][1]
    params = DataFrame(fitted_model.params)

    # Create reg friendly test dataframe; only the design matrix is used.
    _, xt = dmatrices(formula, data=test_data, return_type='dataframe')

    # Restrict both sides to the terms they have in common and pick the
    # coefficients in design-matrix column order, so each column meets the
    # coefficient carrying the same label.
    common = [c for c in xt.columns if c in params.index]
    design = np.asarray(xt[common])
    weights = np.asarray(params.loc[common]).ravel()  # flatten array

    # Broadcasting multiplies every row by the weight vector -- no need to
    # build a tiled copy with a (Python-2-only) xrange loop.
    predictions = np.sum(np.multiply(design, weights), axis=1)
    return predictions
def train(self, x_train: pd.DataFrame, y_train: pd.DataFrame):
    """Fit ``self.model`` on the given features and targets.

    Parameters
    ----------
    x_train:
        Training features, passed to ``self.model.fit`` unchanged.
    y_train:
        Training targets; flattened to a one-dimensional array before
        being passed to ``self.model.fit``.
    """
    import numpy as np

    # DataFrame has no ``ravel`` method, so ``y_train.ravel()`` raised
    # AttributeError for the annotated type.  np.asarray accepts DataFrame,
    # Series and ndarray inputs alike.
    self.model.fit(x_train, np.asarray(y_train).ravel())