def compute(): """Create PFA for kNN.""" inputs = io_helper.fetch_data() dep_var = inputs["data"]["dependent"][0] indep_vars = inputs["data"]["independent"] params = parameters.fetch_parameters() if dep_var['type']['name'] in ('polynominal', 'binominal'): job_type = 'classification' else: job_type = 'regression' logging.info('Creating new estimator') estimator = _create_estimator(job_type, params) featurizer = _create_featurizer(indep_vars) # convert variables into dataframe X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars) X = utils.remove_nulls(X) y = X.pop(dep_var['name']) X = featurizer.transform(X) # Drop NaN values estimator.fit(X, y) # Create PFA from the estimator types = [(var['name'], var['type']['name']) for var in indep_vars] pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa()) pfa['name'] = "kNN" # Save or update job_result logging.info('Saving PFA to job_results table...') pfa = json.dumps(pfa) io_helper.save_results(pfa, shapes.Shapes.PFA)
def compute(): """Create PFA for kNN.""" # Read intermediate inputs from jobs logging.info("Fetching intermediate data...") inputs = io_helper.fetch_data() indep_vars = inputs["data"]["independent"] # Extract hyperparameters from ENV variables k = parameters.get_param('n_clusters', int, DEFAULT_N_CLUSTERS) # featurization featurizer = _create_featurizer(indep_vars) # convert variables into dataframe X = io_helper.fetch_dataframe(variables=indep_vars) X = utils.remove_nulls(X, errors='ignore') X = featurizer.transform(X) estimator = KMeans(n_clusters=k) estimator.fit(X) # Generate PFA for kmeans types = [(var['name'], var['type']['name']) for var in indep_vars] pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa()) # Add centroids as metadata pfa['metadata'] = { 'centroids': json.dumps(estimator.cluster_centers_.tolist()) } # Save or update job_result logging.info('Saving PFA to job_results table') io_helper.save_results(json.dumps(pfa), shapes.Shapes.PFA) logging.info("DONE")
def test_estimator_to_pfa_mlp_classifier():
    """Check that converted PFA is giving the same results as MLPClassifier"""
    X, y, types = _classification_task()
    estimator = _mlp_classifier(X, y, classes=['a', 'b', 'c'])

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
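# NOTE (illustrative sketch): `_predict_pfa` is a test helper defined elsewhere. Assuming the
# tests score the PFA document with the titus2 engine, it might be implemented roughly as:
import numpy as np
from titus.genpy import PFAEngine

def _predict_pfa(X, types, pfa):
    engine, = PFAEngine.fromJson(pfa)
    columns = [name for name, _ in types]
    return np.array([engine.action(dict(zip(columns, row))) for row in X])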
def test_estimator_to_pfa_mlp_regressor():
    """Check that converted PFA is giving the same results as MLPRegressor"""
    X, y, types = _regression_task()
    estimator = _mlp_regressor(X, y)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert _arrays_equal(estimator_pred, pfa_pred)
def _generate_pfa_regressor(result, indep_vars, featurizer):
    # Create mock SGDRegressor for sklearn_to_pfa
    estimator = SGDRegressor()
    estimator.intercept_ = [result['intercept']]
    # NOTE: linearly dependent columns will be assigned 0
    estimator.coef_ = [
        result.get(c, {'coef': 0.})['coef'] for c in featurizer.columns if c != 'intercept'
    ]

    types = [(var['name'], var['type']['name']) for var in indep_vars]
    return sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())
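# Example (hypothetical values): `result` is assumed to map column names produced by the
# featurizer to dicts holding a 'coef' entry, plus a scalar 'intercept':
#
#     result = {
#         'intercept': 0.7,
#         'age': {'coef': 0.12},
#         'score_test1': {'coef': -0.05},
#     }
#     pfa = _generate_pfa_regressor(result, indep_vars, featurizer)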
def test_estimator_to_pfa_kneighborsclassifier():
    """Check that converted PFA is giving the same results as KNeighborsClassifier"""
    X, y, types = _classification_task()
    estimator = _kneighborsclassifier(X, y, n_neighbors=2)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
def test_estimator_to_pfa_kmeans():
    """Check that converted PFA is giving the same results as KMeans"""
    X, _, types = _classification_task()
    estimator = _kmeans(X, n_clusters=2)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
def test_estimator_to_pfa_mixednb(dtypes):
    """Check that converted PFA is giving the same results as MixedNB"""
    X, y, types = _classification_task(dtypes=dtypes)
    is_nominal = [t == 'n' for t in dtypes]
    estimator = _mixednb(X, y, is_nominal=is_nominal, classes=['a', 'b', 'c'])

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
def test_estimator_to_pfa_gradientboostingclassifier():
    """Check that converted PFA is giving the same results as GradientBoostingClassifier"""
    X, y, types = _classification_task()
    estimator = _gradientboostingclassifier(X, y, n_estimators=10, learning_rate=0.1)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
def test_estimator_to_pfa_gradientboostingregressor():
    """Check that converted PFA is giving the same results as GradientBoostingRegressor"""
    X, y, types = _regression_task()
    estimator = _gradientboostingregressor(X, y, n_estimators=10, learning_rate=0.1)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    np.testing.assert_almost_equal(estimator_pred, pfa_pred, decimal=5)
def test_estimator_to_pfa_mixednb_zero_prior():
    """Check that converted PFA is giving the same results as MixedNB when a category has no values."""
    dtypes = 'ccn'
    X, y, types = _classification_task(n_features=3, dtypes=dtypes)
    y[:] = 'a'

    is_nominal = [t == 'n' for t in dtypes]
    estimator = _mixednb(X, y, is_nominal=is_nominal, classes=['a', 'b', 'c'])

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
def test_estimator_to_pfa_multinomialnb():
    """Check that converted PFA is giving the same results as MultinomialNB"""
    X, y, types = _classification_task()

    # artificially create 0, 1 inputs from X because `MultinomialNB` works only with counts
    X = (X > 0).astype(int)

    estimator = _multinomialnb(X, y, classes=['a', 'b', 'c'])

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
def test_estimator_to_pfa_gradientboostingclassifier_nosplits():
    X, y, types = _classification_task()

    # `min_samples_split` guarantees there will be no splits
    estimator = _gradientboostingclassifier(X, y, min_samples_split=1000000, n_estimators=10, learning_rate=0.1)

    pfa = sklearn_to_pfa(estimator, types)

    estimator_pred = estimator.predict(X)
    pfa_pred = _predict_pfa(X, types, pfa)

    assert all(estimator_pred == pfa_pred)
def aggregate_kmeans(job_ids):
    """Compute merging of clusters according to least merging error (e.g. smallest distance between centroids).

    :input job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    data = [json.loads(io_helper.get_results(str(job_id)).data) for job_id in job_ids]

    local_centroids = [np.array(x['centroids']) for x in data if x['centroids']]
    indep_vars = data[0]['indep_vars']

    # Aggregate clusters remotely
    remote_centroids = remote.aggregate_clusters(local_centroids)
    logging.info("Centroids:\n{}".format(remote_centroids))

    # Create fake KMeans estimator and assign it our centroids
    estimator = KMeans()
    estimator.cluster_centers_ = np.array(remote_centroids)

    # Generate PFA for kmeans and add centroids to metadata
    featurizer = _create_featurizer(indep_vars)
    types = [(var['name'], var['type']['name']) for var in indep_vars]
    pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

    # Add serialized model as metadata
    pfa['metadata'] = {'centroids': json.dumps(np.array(remote_centroids).tolist())}

    # Save or update job_result
    logging.info('Saving PFA to job_results table')
    pfa = json.dumps(pfa)
    io_helper.save_results(pfa, shapes.Shapes.PFA)
    logging.info("DONE")
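# NOTE (illustrative sketch, not the project's implementation): `remote.aggregate_clusters` is
# assumed to merge the pooled local centroids by repeatedly combining the closest pair (the
# "least merging error" mentioned in the docstring) until a single set of k centroids remains:
import numpy as np

def aggregate_clusters(local_centroids, n_clusters=None):
    centroids = [c for arr in local_centroids for c in np.asarray(arr, dtype=float)]
    weights = [1.0] * len(centroids)
    n_clusters = n_clusters or len(local_centroids[0])

    while len(centroids) > n_clusters:
        # pick the pair of centroids with the smallest distance between them
        i, j = min(
            ((a, b) for a in range(len(centroids)) for b in range(a + 1, len(centroids))),
            key=lambda ab: np.linalg.norm(centroids[ab[0]] - centroids[ab[1]]),
        )
        w = weights[i] + weights[j]
        merged = (weights[i] * centroids[i] + weights[j] * centroids[j]) / w
        for idx in (j, i):  # delete the larger index first
            del centroids[idx]
            del weights[idx]
        centroids.append(merged)
        weights.append(w)

    return [c.tolist() for c in centroids]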
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))

        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have a single category ({}); gradient boosting cannot fit that.'.format(y.iloc[0])
                )
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(json.dumps(_estimator_metadata(estimator, X, y, featurizer)), shapes.Shapes.JSON)
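# NOTE (illustrative sketch, assumptions labelled): `deserialize_sklearn_estimator` and its
# serialization counterpart are not shown here. One way such a pair could work, not necessarily
# how this project implements it, is pickling plus base64 so the estimator fits inside the JSON
# stored in job_results:
import base64
import pickle

def serialize_sklearn_estimator(estimator):
    # turn the fitted estimator into a JSON-safe string
    return base64.b64encode(pickle.dumps(estimator)).decode('ascii')

def deserialize_sklearn_estimator(serialized):
    # rebuild the estimator from the string stored under 'estimator'
    return pickle.loads(base64.b64decode(serialized.encode('ascii')))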