def run_aggregation_from_spec(spec, project_id, config=None, conditionals=None):
    '''
    Build a one- or two-dimensional contingency table for the aggregation
    variables named in spec.

    Args:
        spec: dict with keys 'aggregationVariablesNames', 'datasetId', and
            optionally 'dependentVariableName'.
        project_id: project identifier used for DB lookups.
        config: optional dict; 'weightVariableName' is read from it and it is
            forwarded to the table builders.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        (result dict, 200) on success, or an error message with status 400
        when no dataset id was passed.
    '''
    # Avoid mutable default arguments, which are shared across calls.
    config = {} if config is None else config
    conditionals = [] if conditionals is None else conditionals

    aggregation_variables_names = spec.get('aggregationVariablesNames')
    dataset_id = spec.get('datasetId')
    dependent_variable_name = spec.get('dependentVariableName')
    weight_variable_name = config.get('weightVariableName')
    num_variables = len(aggregation_variables_names)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_field_properties = db_access.get_field_properties(project_id, dataset_id)
    aggregation_variables = [
        next((fp for fp in all_field_properties if fp['name'] == n), None)
        for n in aggregation_variables_names
    ]
    dependent_variable = next(
        (fp for fp in all_field_properties if fp['name'] == dependent_variable_name),
        None)

    # Copy before extending: the original code aliased the spec's list and
    # mutated it in place via +=, a caller-visible side effect.
    subset_variables = list(aggregation_variables_names)
    if dependent_variable_name and dependent_variable_name != 'count':
        subset_variables.append(dependent_variable_name)
    if weight_variable_name and weight_variable_name != 'UNIFORM':
        subset_variables.append(weight_variable_name)
    subset_variables = get_unique(subset_variables, preserve_order=True)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[subset_variables]
    df_ready = df_subset.dropna(how='all')  # Remove fully-empty rows

    result = {}
    if num_variables == 1:
        result['one_dimensional_contingency_table'] = \
            create_one_dimensional_contingency_table(
                df_ready, aggregation_variables[0], dependent_variable,
                config=config)
    elif num_variables == 2:
        result['two_dimensional_contingency_table'] = \
            create_contingency_table(
                df_ready, aggregation_variables, dependent_variable,
                config=config)
    return result, 200
def get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config,
                                      df=None, precomputed=None,
                                      data_formats=None):
    '''
    Returns a dictionary containing data corresponding to spec (in
    automated-viz structure), and all necessary information to interpret data.

    There are three types of formats:
        Score: a dict of lists for scoring
        Visualize: a list of dicts (collection)
        Table: {columns: list, data: matrix}

    Args:
        spec: enumerated viz spec with 'generating_procedure', 'args', and
            'dataset_id' keys.
        project_id: project identifier used for DB lookups.
        conditionals: filter conditions applied to the dataframe.
        config: configuration dict forwarded to the data function.
        df: optional pre-loaded dataframe; loaded from the DB when None.
        precomputed: optional cache of precomputed intermediates (e.g.
            groupby results) forwarded to the data function.
        data_formats: list of 'score', 'visualize', 'table', or 'count';
            defaults to ['visualize', 'table', 'score'].

    Returns:
        data specified by spec, in specified format

    Raises:
        ValueError: if an unknown data format is requested.
    '''
    # Avoid mutable default arguments, which are shared across calls.
    precomputed = {} if precomputed is None else precomputed
    data_formats = ['visualize', 'table', 'score'] if data_formats is None \
        else data_formats

    for f in data_formats:
        if f not in [u'score', u'visualize', u'table', u'count']:
            raise ValueError('Passed incorrect data format', f)
    final_data = dict([(f, {}) for f in data_formats])

    gp = spec['generating_procedure']
    args = spec['args']
    dataset_id = spec['dataset_id']

    logger.debug('Generating Procedure: %s', gp)
    logger.debug('Arguments: %s', args)
    start_time = time()

    if df is None:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    id_fields = [fp for fp in db_access.get_field_properties(project_id, dataset_id)
                 if fp['is_id']]

    # Dispatch table from generating procedure to its data function.
    generating_procedure_to_data_function = {
        GeneratingProcedure.AGG.value: get_agg_data,
        GeneratingProcedure.IND_VAL.value: get_ind_val_data,
        GeneratingProcedure.BIN_AGG.value: get_bin_agg_data,
        GeneratingProcedure.MULTIGROUP_COUNT.value: get_multigroup_count_data,
        GeneratingProcedure.MULTIGROUP_AGG.value: get_multigroup_agg_data,
        GeneratingProcedure.VAL_BOX.value: get_val_box_data,
        GeneratingProcedure.VAL_AGG.value: get_val_agg_data,
        GeneratingProcedure.VAL_VAL.value: get_raw_comparison_data,
        GeneratingProcedure.VAL_COUNT.value: get_val_count_data,
        GeneratingProcedure.AGG_AGG.value: get_agg_agg_data,
    }

    data = generating_procedure_to_data_function[gp](
        df,
        args,
        id_fields=id_fields,
        precomputed=precomputed,
        config=config,
        data_formats=data_formats
    )

    logger.debug('Data for %s: %s', gp, time() - start_time)
    return data
def run_regression_from_spec(spec, project_id, conditionals=None):
    '''
    Wrapper function for five discrete steps:
    1) Parse arguments (in this function)
    2) Loading data from DB for fields and dataframe
    3) Construct / recommend models given those fields
    4) Run regressions described by those models
    5) Format results

    Args:
        spec: dict describing the regression (dependent/independent
            variables, interaction terms, transformations, table layout).
        project_id: project identifier used for DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        ({'table': ..., 'contribution_to_r_squared': ...}, 200) on success,
        or an error message with status 400 when required parameters are
        missing.
    '''
    # Avoid a mutable default argument, which is shared across calls.
    conditionals = [] if conditionals is None else conditionals

    # NOTE(review): model/estimator/degree/weights/functions are parsed but
    # not yet used downstream — kept for interface stability.
    model = spec.get('model', 'lr')
    regression_type = spec.get('regressionType')
    independent_variables_names = spec.get('independentVariables', [])
    dependent_variable_name = spec.get('dependentVariable', [])
    interaction_term_ids = spec.get('interactionTerms', [])
    transformations = spec.get('transformations', {})
    estimator = spec.get('estimator', 'ols')
    degree = spec.get('degree', 1)  # need to find quantitative, categorical
    weights = spec.get('weights', None)
    functions = spec.get('functions', [])
    dataset_id = spec.get('datasetId')
    table_layout = spec.get('tableLayout', MCT.LEAVE_ONE_OUT)

    if not (dataset_id and dependent_variable_name):
        return 'Not passed required parameters', 400

    dependent_variable, independent_variables, interaction_terms, df = \
        load_data(dependent_variable_name, independent_variables_names,
                  interaction_term_ids, dataset_id, project_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables,
                         transformations, interaction_terms,
                         table_layout=table_layout)
    raw_table_results = run_models(df, patsy_models, dependent_variable,
                                   regression_type)
    formatted_table_results = format_results(
        raw_table_results, dependent_variable, independent_variables,
        considered_independent_variables_per_model, interaction_terms)
    contribution_to_r_squared = get_contribution_to_r_squared_data(
        formatted_table_results, table_layout)

    return {
        'table': formatted_table_results,
        'contribution_to_r_squared': contribution_to_r_squared
    }, 200
def attach_data_to_viz_specs(enumerated_viz_specs, dataset_id, project_id,
                             conditionals, config, data_formats=None):
    '''
    Get data corresponding to each viz spec (before filtering and scoring).

    Args:
        enumerated_viz_specs: iterable of viz specs to attach data to.
        dataset_id: dataset identifier used for DB lookups.
        project_id: project identifier used for DB lookups.
        conditionals: filter conditions applied to the dataframe.
        config: configuration dict forwarded to the data functions.
        data_formats: list of formats to compute; defaults to
            ['score', 'table', 'visualize', 'count'].

    Returns:
        List of specs with a 'data' key attached. Specs that error out,
        return no data, or fall beyond the time cutoff are skipped.
    '''
    # Avoid a mutable default argument, which is shared across calls.
    if data_formats is None:
        data_formats = ['score', 'table', 'visualize', 'count']

    viz_specs_with_data = []
    start_time = time()
    TIME_CUTOFF = 10  # seconds; remaining specs are skipped past this budget

    # Get dataframe
    # NOTE(review): if project_id or dataset_id is falsy, conditioned_df is
    # never bound and the loop below raises NameError — confirm callers
    # always pass both.
    if project_id and dataset_id:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        conditioned_df = get_conditioned_data(project_id, dataset_id, df,
                                              conditionals)

    precomputed = {
        'groupby': {}
    }
    for i, spec in enumerate(enumerated_viz_specs):
        if (time() - start_time) > TIME_CUTOFF:
            continue
        # NOTE(review): this aliases (does not copy) the input spec, so the
        # 'data' key is also added to the caller's spec objects.
        viz_spec_with_data = spec

        # TODO Move this into another function
        if spec['args'].get('grouped_field'):
            grouped_field = spec['args']['grouped_field']['name']
            grouped_df = conditioned_df.groupby(grouped_field)
            precomputed['groupby'][grouped_field] = grouped_df

        try:
            data = get_viz_data_from_enumerated_spec(
                spec, project_id, conditionals, config,
                df=conditioned_df,
                precomputed=precomputed,
                data_formats=data_formats
            )
        except Exception as e:
            logger.error("Error getting viz data %s", e, exc_info=True)
            continue

        if not data:
            logger.info('No data for spec with generating procedure %s',
                        spec['generating_procedure'])
            continue

        viz_spec_with_data['data'] = data
        viz_specs_with_data.append(viz_spec_with_data)

    logger.debug('Attaching data to %s specs took %.3fs',
                 len(viz_specs_with_data), time() - start_time)
    return viz_specs_with_data
def run_correlation_from_spec(spec, project_id, conditionals=None):
    '''
    Compute pairwise correlations and scatterplot data for the variables
    named in spec.

    Args:
        spec: dict with 'datasetId' and 'correlationVariables' keys.
        project_id: project identifier used for DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        ({'table': ..., 'scatterplots': ...}, 200)
    '''
    # Avoid a mutable default argument, which is shared across calls.
    conditionals = [] if conditionals is None else conditionals

    dataset_id = spec.get("datasetId")
    correlation_variables = spec.get("correlationVariables")
    correlation_variables_names = correlation_variables

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df[correlation_variables_names]
    df_ready = df_subset.dropna(how='all')  # Remove fully-empty rows

    correlation_result = run_correlation(df_ready, correlation_variables)
    correlation_scatterplots = get_correlation_scatterplot_data(
        df_ready, correlation_variables)
    return {
        'table': correlation_result,
        'scatterplots': correlation_scatterplots
    }, 200
def run_correlation_from_spec(spec, project_id, conditionals=None):
    '''
    Compute pairwise correlations and scatterplot data for the variables
    named in spec.

    NOTE(review): this duplicates an earlier definition of the same name in
    this file; the later definition wins at import time — consider removing
    one copy.

    Args:
        spec: dict with 'datasetId' and 'correlationVariables' keys.
        project_id: project identifier used for DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        ({'table': ..., 'scatterplots': ...}, 200)
    '''
    # Avoid a mutable default argument, which is shared across calls.
    conditionals = [] if conditionals is None else conditionals

    dataset_id = spec.get("datasetId")
    correlation_variables = spec.get("correlationVariables")
    correlation_variables_names = correlation_variables

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df[correlation_variables_names]
    df_ready = df_subset.dropna(how='all')  # Remove fully-empty rows

    correlation_result = run_correlation(df_ready, correlation_variables)
    correlation_scatterplots = get_correlation_scatterplot_data(
        df_ready, correlation_variables)
    return {
        'table': correlation_result,
        'scatterplots': correlation_scatterplots
    }, 200
def run_comparison_from_spec(spec, project_id, conditionals=None):
    '''
    Run statistical comparisons (ANOVA and/or numerical comparison tests)
    between the dependent and independent variables named in spec.

    Args:
        spec: dict with 'datasetId', 'dependentVariablesNames',
            'independentVariablesNames', and optional 'significanceCutoff'.
        project_id: project identifier used for DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        (result dict, 200) on success, or an error message with status 400
        when no dataset id was passed.
    '''
    # Avoid a mutable default argument, which is shared across calls.
    conditionals = [] if conditionals is None else conditionals

    dependent_variables_names = spec.get('dependentVariablesNames', [])
    independent_variables_names = spec.get('independentVariablesNames', [])
    dataset_id = spec.get('datasetId')
    significance_cutoff = spec.get('significanceCutoff', 0.05)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_fields = db_access.get_field_properties(project_id, dataset_id)
    dependent_variables = [f for f in all_fields
                           if f['name'] in dependent_variables_names]
    independent_variables = [f for f in all_fields
                             if f['name'] in independent_variables_names]

    # Numerical comparison needs >= 2 continuous variables on one side and
    # none selected on the other side.
    can_run_numerical_comparison_independent = \
        len([iv for iv in independent_variables
             if iv['scale'] == 'continuous']) >= 2 \
        and len(dependent_variables_names) == 0
    can_run_numerical_comparison_dependent = \
        len([dv for dv in dependent_variables
             if dv['scale'] == 'continuous']) >= 2 \
        and len(independent_variables_names) == 0
    can_run_numerical_comparison = (can_run_numerical_comparison_dependent or
                                    can_run_numerical_comparison_independent)

    # ANOVA needs at least one variable on each side.
    can_run_anova = (len(dependent_variables) and len(independent_variables))

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df,
                                          conditionals)
    df_subset = df_conditioned[dependent_variables_names +
                               independent_variables_names]
    df_ready = df_subset.dropna(how='any')  # Remove unclean rows

    result = {}
    NUM_GROUPS_CUTOFF = 15
    if can_run_anova:
        anova = run_anova(df_ready, independent_variables_names,
                          dependent_variables_names,
                          NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        anova_boxplot_data = get_anova_boxplot_data(
            project_id, dataset_id, df_ready,
            independent_variables_names, dependent_variables_names,
            NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        pairwise_comparison_data = get_pairwise_comparison_data(
            df_ready, independent_variables_names, dependent_variables_names,
            significance_cutoff=significance_cutoff,
            NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        result.update({
            'anova': anova,
            'anova_boxplot': anova_boxplot_data,
            'pairwise_comparison': pairwise_comparison_data,
        })

    if can_run_numerical_comparison:
        if can_run_numerical_comparison_independent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(
                df_ready, independent_variables_names, independence=True)
        if can_run_numerical_comparison_dependent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(
                df_ready, dependent_variables_names, independence=False)
        result['numerical_comparison'] = numerical_comparison_data

    return result, 200
def run_regression_from_spec(spec, project_id, conditionals=None):
    '''
    Wrapper function for five discrete steps:
    1) Parse arguments (in this function)
    2) Loading data from DB for fields and dataframe
    3) Construct / recommend models given those fields
    4) Run regressions described by those models
    5) Format results

    NOTE(review): this duplicates an earlier definition of the same name in
    this file; the later definition wins at import time — consider removing
    one copy.

    Args:
        spec: dict describing the regression (dependent/independent
            variables, interaction terms, transformations, table layout).
        project_id: project identifier used for DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        ({'table': ..., 'contribution_to_r_squared': ...}, 200) on success,
        or an error message with status 400 when required parameters are
        missing.
    '''
    # Avoid a mutable default argument, which is shared across calls.
    conditionals = [] if conditionals is None else conditionals

    # NOTE(review): model/estimator/degree/weights/functions are parsed but
    # not yet used downstream — kept for interface stability.
    model = spec.get('model', 'lr')
    regression_type = spec.get('regressionType')
    independent_variables_names = spec.get('independentVariables', [])
    dependent_variable_name = spec.get('dependentVariable', [])
    interaction_term_ids = spec.get('interactionTerms', [])
    transformations = spec.get('transformations', {})
    estimator = spec.get('estimator', 'ols')
    degree = spec.get('degree', 1)  # need to find quantitative, categorical
    weights = spec.get('weights', None)
    functions = spec.get('functions', [])
    dataset_id = spec.get('datasetId')
    table_layout = spec.get('tableLayout', MCT.LEAVE_ONE_OUT)

    if not (dataset_id and dependent_variable_name):
        return 'Not passed required parameters', 400

    dependent_variable, independent_variables, interaction_terms, df = \
        load_data(dependent_variable_name, independent_variables_names,
                  interaction_term_ids, dataset_id, project_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables,
                         transformations, interaction_terms,
                         table_layout=table_layout)
    raw_table_results = run_models(df, patsy_models, dependent_variable,
                                   regression_type)
    formatted_table_results = format_results(
        raw_table_results, dependent_variable, independent_variables,
        considered_independent_variables_per_model, interaction_terms)
    contribution_to_r_squared = get_contribution_to_r_squared_data(
        formatted_table_results, table_layout)

    return {
        'table': formatted_table_results,
        'contribution_to_r_squared': contribution_to_r_squared
    }, 200