def aggregate_stats(job_ids, graph_type=None):
    """Get all partial statistics from all nodes and aggregate them.

    :param job_ids: list of job_ids with intermediate results
    """
    # Read intermediate inputs from jobs
    logging.info("Fetching intermediate data...")
    results = io_helper.load_intermediate_json_results(map(str, job_ids))

    corr, columns, crosstab = _aggregate_results(results)

    graph_type = graph_type or parameters.get_parameter(
        'graph', str, 'correlation_heatmap')

    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        # save PCA graphs, but leave out the one with PCA scores
        logging.warning('Sample scores graph is not yet implemented for distributed PCA.')
        fig = _fig_pca(corr, columns, X=None)
    else:
        raise errors.UserError(
            'MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`')

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")
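
# A hedged sketch of the kind of pooling a helper like _aggregate_results
# could perform (illustrative only: the payload keys 'n', 's' and 'sxx' are
# assumptions, not the repo's actual format). Each node ships its row count,
# per-column sums and cross-products; the master combines them into one
# correlation matrix without ever seeing row-level data.
import numpy as np

def _pool_correlation_sketch(partials):
    """Pool per-node sufficient statistics into a correlation matrix."""
    n = sum(p['n'] for p in partials)
    s = sum(np.asarray(p['s'], dtype=float) for p in partials)
    sxx = sum(np.asarray(p['sxx'], dtype=float) for p in partials)
    cov = sxx / n - np.outer(s, s) / n ** 2  # pooled covariance
    d = np.sqrt(np.diag(cov))
    return cov / np.outer(d, d)  # normalize covariance to correlations
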
def compute(graph_type=None):
    """Perform both intermediate step and aggregation at once."""
    # Read inputs
    logging.info("Fetching data...")
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    result = _compute_intermediate_result(inputs)
    corr, columns, crosstab = _aggregate_results([result])

    graph_type = graph_type or parameters.get_parameter(
        'graph', str, 'correlation_heatmap')

    if graph_type == 'correlation_heatmap':
        fig = _fig_corr_heatmap(corr, columns, crosstab)
    elif graph_type == 'pca':
        X = io_helper.fetch_dataframe([dep_var] + indep_vars)
        fig = _fig_pca(corr, columns, X)
    else:
        raise errors.UserError(
            'MODEL_PARAM_graph only supports values `correlation_heatmap` and `pca`')

    logging.info("Results:\n{}".format(fig))
    io_helper.save_results(json.dumps(fig), shapes.Shapes.PLOTLY)
    logging.info("DONE")
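
# Minimal usage sketch contrasting the two entry points (the job ids are
# hypothetical and assume worker nodes already stored intermediate results):
def _usage_sketch():
    compute(graph_type='correlation_heatmap')          # single node, one step
    aggregate_stats(job_ids=[1, 2], graph_type='pca')  # master pools worker jobs
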
def main(clean_files=False):
    """
    :param clean_files: if True, clean files afterwards
    """
    # Read inputs
    inputs = io_helper.fetch_data()
    data = inputs["data"]

    beam = parameters.get_parameter('beam', int, 10)
    support = parameters.get_parameter('support', float, 0.00001)

    out_file = 'input.csv'
    rules_out_file = 'rules.txt'

    matrix, attributes = preprocess.to_matrix(data)
    preprocess.dump_to_csv(matrix, attributes, out_file)

    # Call hedwig with sensible defaults
    examples_file = out_file
    empty_bk = tempfile.mkdtemp()
    call([
        'python', '-m', 'hedwig.__main__', empty_bk, examples_file,
        '--beam', str(beam),
        '--support', str(support),
        '-f', 'csv',
        '-l',
        '-o', rules_out_file,
        '--nocache',
    ])

    with open(rules_out_file) as f:
        results = f.read()

    if clean_files:
        os.remove(out_file)
        os.remove(rules_out_file)

    io_helper.save_results(results.replace('less_than', '<'), shapes.Shapes.TEXT)
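
# Hedged alternative to the bare call(...) above: subprocess.run with
# check=True raises CalledProcessError on a non-zero hedwig exit status
# instead of letting the caller read a stale or missing rules file. The flags
# mirror the invocation above; sys.executable pins the current interpreter.
import subprocess
import sys

def _run_hedwig_checked(bk_dir, examples_file, beam, support, rules_out_file):
    subprocess.run([
        sys.executable, '-m', 'hedwig.__main__', bk_dir, examples_file,
        '--beam', str(beam), '--support', str(support),
        '-f', 'csv', '-l', '-o', rules_out_file, '--nocache',
    ], check=True)
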
def main():
    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # Read inputs
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]
    design = get_parameter(DESIGN_PARAM, str, DEFAULT_DESIGN)

    # Check dependent variable type (should be continuous)
    if dep_var["type"]["name"] not in ["integer", "real"]:
        raise errors.UserError('Dependent variable should be continuous!')

    # Extract data and parameters from inputs
    data = format_data(inputs["data"])

    # Compute anova and generate PFA output
    anova_results = format_output(
        compute_anova(dep_var, indep_vars, data, design).to_dict())

    # Store results
    io_helper.save_results(anova_results, Shapes.JSON)
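
# Hedged sketch of how a 'design' parameter commonly maps onto an ANOVA model
# formula (illustrative; compute_anova may build its formula differently):
# a factorial design crosses the factors so interaction terms are included,
# anything else falls back to additive main effects only.
def _anova_formula_sketch(dep_name, indep_names, design):
    op = ' * ' if design == 'factorial' else ' + '
    return '{} ~ {}'.format(dep_name, op.join(indep_names))

# _anova_formula_sketch('lefthippocampus', ['agegroup', 'gender'], 'factorial')
# -> 'lefthippocampus ~ agegroup * gender'
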
def main(job_id, generate_pfa):
    inputs = io_helper.fetch_data()
    dep_var = inputs["data"]["dependent"][0]
    indep_vars = inputs["data"]["independent"]

    # 'polynominal' and 'binominal' are the categorical type names used by the
    # upstream metadata (spelled this way there), hence classification
    if dep_var['type']['name'] in ('polynominal', 'binominal'):
        job_type = 'classification'
    else:
        job_type = 'regression'

    # Get existing results with partial model if they exist
    if job_id:
        job_result = io_helper.get_results(job_id=str(job_id))
        logging.info('Loading existing estimator')
        estimator = deserialize_sklearn_estimator(json.loads(job_result.data)['estimator'])
    else:
        logging.info('Creating new estimator')
        estimator = _create_estimator(job_type)

    # featurization
    featurizer = _create_featurizer(indep_vars, estimator)

    # convert variables into dataframe
    X = io_helper.fetch_dataframe(variables=[dep_var] + indep_vars)
    X = utils.remove_nulls(X, errors='ignore')
    y = X.pop(dep_var['name'])
    X = featurizer.transform(X)

    if len(X) == 0:
        # log error, but still save the estimator
        logging.warning("All data are NULL, cannot fit model")
    else:
        # Train a single step
        if hasattr(estimator, 'partial_fit'):
            if job_type == 'classification':
                estimator.partial_fit(X, y, classes=dep_var['type']['enumeration'])
            else:
                estimator.partial_fit(X, y)
        else:
            if not generate_pfa:
                logging.warning('{} does not support partial fit.'.format(estimator))
            if isinstance(estimator, GradientBoostingClassifier) and len(set(y)) == 1:
                raise errors.UserError(
                    'All outputs have single category ({}), Gradient boosting cannot fit that.'.format(y.iloc[0]))
            estimator.fit(X, y)

    if generate_pfa:
        # Create PFA from the estimator
        types = [(var['name'], var['type']['name']) for var in indep_vars]

        # Estimator was not trained on any data
        if not _is_fitted(estimator):
            raise errors.UserError('Model was not fitted on any data, cannot generate PFA.')

        pfa = sklearn_to_pfa(estimator, types, featurizer.generate_pretty_pfa())

        # Add serialized model as metadata
        pfa['metadata'] = _estimator_metadata(estimator, X, y, featurizer)

        model_type = parameters.get_parameter('type', str, 'linear_model')
        pfa['name'] = model_type

        # Save or update job_result
        logging.info('Saving PFA to job_results table')
        pfa = json.dumps(pfa)
        io_helper.save_results(pfa, shapes.Shapes.PFA)
    else:
        # Save or update job_result
        logging.info('Saving serialized estimator into job_results table')
        io_helper.save_results(json.dumps(_estimator_metadata(estimator, X, y, featurizer)), shapes.Shapes.JSON)
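
# Hedged sketch of the incremental-learning contract the classification branch
# above relies on: the first partial_fit call must declare every class up
# front (the code takes them from the variable's 'enumeration'), because later
# batches are not guaranteed to contain all categories. Data here is made up.
import numpy as np
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier()
X1, y1 = np.array([[0.0], [1.0]]), np.array(['CN', 'AD'])
X2, y2 = np.array([[2.0], [3.0]]), np.array(['AD', 'AD'])
clf.partial_fit(X1, y1, classes=np.array(['AD', 'CN', 'MCI']))  # declare all classes
clf.partial_fit(X2, y2)  # subsequent batches omit `classes`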