Example #1
0
def run_aggregation_from_spec(spec, project_id, config=None, conditionals=None):
    '''
    Build a one- or two-dimensional contingency table for the aggregation
    variables named in spec.

    Args:
        spec: dict with 'aggregationVariablesNames' (list of field names),
            'datasetId', and optionally 'dependentVariableName'.
        project_id: project identifier used for all DB lookups.
        config: optional dict; may contain 'weightVariableName'.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        (result dict, 200) on success, or ('Not passed required parameters', 400)
        when datasetId is missing.
    '''
    # Guard against mutable default arguments (shared across calls).
    config = {} if config is None else config
    conditionals = [] if conditionals is None else conditionals

    aggregation_variables_names = spec.get('aggregationVariablesNames')
    dataset_id = spec.get('datasetId')
    dependent_variable_name = spec.get('dependentVariableName')
    weight_variable_name = config.get('weightVariableName')
    num_variables = len(aggregation_variables_names)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_field_properties = db_access.get_field_properties(project_id, dataset_id)
    aggregation_variables = [ next((fp for fp in all_field_properties if fp['name'] == n), None) for n in aggregation_variables_names ]
    dependent_variable = next((fp for fp in all_field_properties if fp['name'] == dependent_variable_name), None)

    # Copy before extending: the original code aliased the list stored in
    # spec['aggregationVariablesNames'], so '+=' mutated the caller's spec.
    subset_variables = list(aggregation_variables_names)
    if dependent_variable_name and dependent_variable_name != 'count':
        subset_variables.append(dependent_variable_name)
    if weight_variable_name and weight_variable_name != 'UNIFORM':
        subset_variables.append(weight_variable_name)
    subset_variables = get_unique(subset_variables, preserve_order=True)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ subset_variables ]
    df_ready = df_subset.dropna(how='all')  # Remove unclean

    result = {}
    if num_variables == 1:
        result['one_dimensional_contingency_table'] = create_one_dimensional_contingency_table(df_ready, aggregation_variables[0], dependent_variable, config=config)
    elif num_variables == 2:
        result['two_dimensional_contingency_table'] = create_contingency_table(df_ready, aggregation_variables, dependent_variable, config=config)

    return result, 200
Example #2
0
def get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config, df=None, precomputed=None, data_formats=None):
    '''
    Returns a dictionary containing data corresponding to spec (in automated-viz
    structure), and all necessary information to interpret data.

    There are three types of formats:
        Score: a dict of lists for scoring
        Visualize: a list of dicts (collection)
        Table: {columns: list, data: matrix}

    Args:
        spec: viz spec with 'generating_procedure', 'args', and 'dataset_id'.
        project_id: project identifier used for DB lookups.
        conditionals: filter conditions applied when the dataframe is loaded here.
        config: configuration dict forwarded to the data function.
        df: optional preloaded dataframe; loaded (and conditioned) when None.
        precomputed: optional dict of precomputed intermediates (e.g. groupbys).
        data_formats: list of 'score', 'visualize', 'table', or 'count'.

    Returns:
        data specified by spec, in specified format

    Raises:
        ValueError: if any entry of data_formats is not a recognized format.
    '''
    # Guard against mutable default arguments (shared across calls).
    precomputed = {} if precomputed is None else precomputed
    data_formats = ['visualize', 'table', 'score'] if data_formats is None else data_formats

    for f in data_formats:
        if f not in [u'score', u'visualize', u'table', u'count']:
            raise ValueError('Passed incorrect data format', f)

    gp = spec['generating_procedure']
    args = spec['args']
    dataset_id = spec['dataset_id']

    logger.debug('Generating Procedure: %s', gp)
    logger.debug('Arguments: %s', args)
    start_time = time()

    # Only load (and condition) the data if the caller did not supply it.
    if df is None:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    id_fields = [ fp for fp in db_access.get_field_properties(project_id, dataset_id) if fp['is_id']]

    # Dispatch table: generating procedure -> data function.
    generating_procedure_to_data_function = {
        GeneratingProcedure.AGG.value: get_agg_data,
        GeneratingProcedure.IND_VAL.value: get_ind_val_data,
        GeneratingProcedure.BIN_AGG.value: get_bin_agg_data,
        GeneratingProcedure.MULTIGROUP_COUNT.value: get_multigroup_count_data,
        GeneratingProcedure.MULTIGROUP_AGG.value: get_multigroup_agg_data,
        GeneratingProcedure.VAL_BOX.value: get_val_box_data,
        GeneratingProcedure.VAL_AGG.value: get_val_agg_data,
        GeneratingProcedure.VAL_VAL.value: get_raw_comparison_data,
        GeneratingProcedure.VAL_COUNT.value: get_val_count_data,
        GeneratingProcedure.AGG_AGG.value: get_agg_agg_data,
    }
    data = generating_procedure_to_data_function[gp](df,
        args,
        id_fields=id_fields,
        precomputed=precomputed,
        config=config,
        data_formats=data_formats
    )

    logger.debug('Data for %s: %s', gp, time() - start_time)
    return data
Example #3
0
def get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config, df=None, precomputed=None, data_formats=None):
    '''
    Returns a dictionary containing data corresponding to spec (in automated-viz
    structure), and all necessary information to interpret data.

    There are three types of formats:
        Score: a dict of lists for scoring
        Visualize: a list of dicts (collection)
        Table: {columns: list, data: matrix}

    Args:
        spec: viz spec with 'generating_procedure', 'args', and 'dataset_id'.
        project_id: project identifier used for DB lookups.
        conditionals: filter conditions applied when the dataframe is loaded here.
        config: configuration dict forwarded to the data function.
        df: optional preloaded dataframe; loaded (and conditioned) when None.
        precomputed: optional dict of precomputed intermediates (e.g. groupbys).
        data_formats: list of 'score', 'visualize', 'table', or 'count'.

    Returns:
        data specified by spec, in specified format

    Raises:
        ValueError: if any entry of data_formats is not a recognized format.
    '''
    # Guard against mutable default arguments (shared across calls).
    precomputed = {} if precomputed is None else precomputed
    data_formats = ['visualize', 'table', 'score'] if data_formats is None else data_formats

    for f in data_formats:
        if f not in [u'score', u'visualize', u'table', u'count']:
            raise ValueError('Passed incorrect data format', f)

    gp = spec['generating_procedure']
    args = spec['args']
    dataset_id = spec['dataset_id']

    logger.debug('Generating Procedure: %s', gp)
    logger.debug('Arguments: %s', args)
    start_time = time()

    # Only load (and condition) the data if the caller did not supply it.
    if df is None:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    id_fields = [ fp for fp in db_access.get_field_properties(project_id, dataset_id) if fp['is_id']]

    # Dispatch table: generating procedure -> data function.
    generating_procedure_to_data_function = {
        GeneratingProcedure.AGG.value: get_agg_data,
        GeneratingProcedure.IND_VAL.value: get_ind_val_data,
        GeneratingProcedure.BIN_AGG.value: get_bin_agg_data,
        GeneratingProcedure.MULTIGROUP_COUNT.value: get_multigroup_count_data,
        GeneratingProcedure.MULTIGROUP_AGG.value: get_multigroup_agg_data,
        GeneratingProcedure.VAL_BOX.value: get_val_box_data,
        GeneratingProcedure.VAL_AGG.value: get_val_agg_data,
        GeneratingProcedure.VAL_VAL.value: get_raw_comparison_data,
        GeneratingProcedure.VAL_COUNT.value: get_val_count_data,
        GeneratingProcedure.AGG_AGG.value: get_agg_agg_data,
    }
    data = generating_procedure_to_data_function[gp](df,
        args,
        id_fields=id_fields,
        precomputed=precomputed,
        config=config,
        data_formats=data_formats
    )

    logger.debug('Data for %s: %s', gp, time() - start_time)
    return data
Example #4
0
def run_regression_from_spec(spec, project_id, conditionals=None):
    '''
    Wrapper function for five discrete steps:
    1) Parse arguments (in this function)
    2) Loading data from DB for fields and dataframe
    3) Construct / recommend models given those fields
    4) Run regressions described by those models
    5) Format results

    Args:
        spec: dict of regression parameters ('datasetId', 'dependentVariable',
            'independentVariables', 'interactionTerms', 'transformations',
            'regressionType', 'tableLayout').
        project_id: project identifier used for all DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        ({'table': ..., 'contribution_to_r_squared': ...}, 200) on success,
        or ('Not passed required parameters', 400) when required spec keys
        are missing.
    '''
    # Guard against a mutable default argument (shared across calls).
    conditionals = [] if conditionals is None else conditionals

    regression_type = spec.get('regressionType')
    independent_variables_names = spec.get('independentVariables', [])
    dependent_variable_name = spec.get('dependentVariable', [])
    interaction_term_ids = spec.get('interactionTerms', [])
    transformations = spec.get('transformations', {})
    dataset_id = spec.get('datasetId')
    table_layout = spec.get('tableLayout', MCT.LEAVE_ONE_OUT)
    # NOTE(review): 'model', 'estimator', 'degree', 'weights', and 'functions'
    # were parsed from spec but never used, so they are no longer read here.

    if not (dataset_id and dependent_variable_name):
        return 'Not passed required parameters', 400

    dependent_variable, independent_variables, interaction_terms, df = \
        load_data(dependent_variable_name, independent_variables_names, interaction_term_ids, dataset_id, project_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, transformations, interaction_terms, table_layout=table_layout)
    raw_table_results = run_models(df, patsy_models, dependent_variable,
                                   regression_type)

    formatted_table_results = format_results(
        raw_table_results, dependent_variable, independent_variables,
        considered_independent_variables_per_model, interaction_terms)

    contribution_to_r_squared = get_contribution_to_r_squared_data(
        formatted_table_results, table_layout)

    return {
        'table': formatted_table_results,
        'contribution_to_r_squared': contribution_to_r_squared
    }, 200
def attach_data_to_viz_specs(enumerated_viz_specs, dataset_id, project_id, conditionals, config, data_formats=None):
    '''
    Get data corresponding to each viz spec (before filtering and scoring).

    Args:
        enumerated_viz_specs: iterable of viz specs to attach data to.
        dataset_id, project_id: identifiers used to load the dataframe.
        conditionals: filter conditions applied to the loaded dataframe.
        config: configuration dict forwarded to the data function.
        data_formats: formats to compute for each spec.

    Returns:
        List of specs that received data; specs whose data retrieval failed
        or returned nothing are skipped. Processing stops once TIME_CUTOFF
        seconds have elapsed.
    '''
    # Guard against a mutable default argument (shared across calls).
    data_formats = ['score', 'table', 'visualize', 'count'] if data_formats is None else data_formats

    viz_specs_with_data = []

    start_time = time()
    TIME_CUTOFF = 10
    # Get dataframe. Previously a falsy project_id/dataset_id left
    # conditioned_df undefined and raised a NameError below; fail explicitly.
    if not (project_id and dataset_id):
        raise ValueError('attach_data_to_viz_specs requires project_id and dataset_id')
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    conditioned_df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    precomputed = {
        'groupby': {}
    }
    for i, spec in enumerate(enumerated_viz_specs):
        # Once over the cutoff no further spec is processed, so stop looping.
        if (time() - start_time) > TIME_CUTOFF:
            break
        viz_spec_with_data = spec
        # TODO Move this into another function
        if spec['args'].get('grouped_field'):
            grouped_field = spec['args']['grouped_field']['name']
            grouped_df = conditioned_df.groupby(grouped_field)
            precomputed['groupby'][grouped_field] = grouped_df

        try:
            data = get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config,
                df=conditioned_df,
                precomputed=precomputed,
                data_formats=data_formats
            )

        except Exception as e:
            logger.error("Error getting viz data %s", e, exc_info=True)
            continue

        if not data:
            logger.info('No data for spec with generating procedure %s', spec['generating_procedure'])
            continue

        viz_spec_with_data['data'] = data
        viz_specs_with_data.append(viz_spec_with_data)

    logger.debug('Attaching data to %s specs took %.3fs', len(viz_specs_with_data), time() - start_time)
    return viz_specs_with_data
Example #6
0
def run_correlation_from_spec(spec, project_id, conditionals=None):
    '''
    Run pairwise correlations over the variables named in spec.

    Args:
        spec: dict with 'datasetId' and 'correlationVariables' (field names).
        project_id: project identifier used for DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        ({'table': ..., 'scatterplots': ...}, 200)
    '''
    # Guard against a mutable default argument (shared across calls).
    conditionals = [] if conditionals is None else conditionals

    dataset_id = spec.get("datasetId")
    correlation_variables = spec.get("correlationVariables")
    correlation_variables_names = correlation_variables

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    df_subset = df[ correlation_variables_names ]
    # Drop rows that are entirely NaN before computing correlations.
    df_ready = df_subset.dropna(how='all')

    correlation_result = run_correlation(df_ready, correlation_variables)
    correlation_scatterplots = get_correlation_scatterplot_data(df_ready, correlation_variables)
    return {
        'table': correlation_result,
        'scatterplots': correlation_scatterplots
    }, 200
Example #7
0
def run_correlation_from_spec(spec, project_id, conditionals=None):
    '''
    Run pairwise correlations over the variables named in spec.

    Args:
        spec: dict with 'datasetId' and 'correlationVariables' (field names).
        project_id: project identifier used for DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        ({'table': ..., 'scatterplots': ...}, 200)
    '''
    # Guard against a mutable default argument (shared across calls).
    conditionals = [] if conditionals is None else conditionals

    dataset_id = spec.get("datasetId")
    correlation_variables = spec.get("correlationVariables")
    correlation_variables_names = correlation_variables

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    df_subset = df[correlation_variables_names]
    # Drop rows that are entirely NaN before computing correlations.
    df_ready = df_subset.dropna(how='all')

    correlation_result = run_correlation(df_ready, correlation_variables)
    correlation_scatterplots = get_correlation_scatterplot_data(
        df_ready, correlation_variables)
    return {
        'table': correlation_result,
        'scatterplots': correlation_scatterplots
    }, 200
Example #8
0
def run_comparison_from_spec(spec, project_id, conditionals=None):
    '''
    Run comparison tests (ANOVA and/or numerical comparisons) for the
    dependent and independent variables named in spec.

    Args:
        spec: dict with 'datasetId', 'dependentVariablesNames',
            'independentVariablesNames', 'significanceCutoff', 'independence'.
        project_id: project identifier used for all DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        (result dict, 200) on success, or ('Not passed required parameters', 400)
        when datasetId is missing. Result keys ('anova', 'anova_boxplot',
        'pairwise_comparison', 'numerical_comparison') are present only when
        the corresponding test can be run.
    '''
    # Guard against a mutable default argument (shared across calls).
    conditionals = [] if conditionals is None else conditionals

    dependent_variables_names = spec.get('dependentVariablesNames', [])
    independent_variables_names = spec.get('independentVariablesNames', [])  # [ iv[1] for iv in independent_variables ]
    dataset_id = spec.get('datasetId')
    significance_cutoff = spec.get('significanceCutoff', 0.05)
    independence = spec.get('independence', True)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_fields = db_access.get_field_properties(project_id, dataset_id)
    dependent_variables = [ f for f in all_fields if f['name'] in dependent_variables_names ]
    independent_variables = [ f for f in all_fields if f['name'] in independent_variables_names ]

    # Numerical comparison needs >= 2 continuous variables on one side and
    # none requested on the other.
    can_run_numerical_comparison_independent = len([ iv for iv in independent_variables if iv['scale'] == 'continuous' ]) >= 2 and len(dependent_variables_names) == 0
    can_run_numerical_comparison_dependent = len([ dv for dv in dependent_variables if dv['scale'] == 'continuous' ]) >= 2 and len(independent_variables_names) == 0
    can_run_numerical_comparison = (can_run_numerical_comparison_dependent or can_run_numerical_comparison_independent)

    # ANOVA needs at least one variable on each side.
    can_run_anova = (len(dependent_variables) and len(independent_variables))
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ dependent_variables_names + independent_variables_names ]
    df_ready = df_subset.dropna(how='any')  # Remove unclean

    result = {}
    NUM_GROUPS_CUTOFF = 15
    if can_run_anova:
        anova = run_anova(df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        anova_boxplot_data = get_anova_boxplot_data(project_id, dataset_id, df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        pairwise_comparison_data = get_pairwise_comparison_data(df_ready, independent_variables_names, dependent_variables_names, significance_cutoff=significance_cutoff, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        result.update({
            'anova': anova,
            'anova_boxplot': anova_boxplot_data,
            'pairwise_comparison': pairwise_comparison_data,
        })

    if can_run_numerical_comparison:
        if can_run_numerical_comparison_independent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, independent_variables_names, independence=True)
        if can_run_numerical_comparison_dependent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, dependent_variables_names, independence=False)
        result['numerical_comparison'] = numerical_comparison_data

    return result, 200
Example #9
0
def run_regression_from_spec(spec, project_id, conditionals=None):
    '''
    Wrapper function for five discrete steps:
    1) Parse arguments (in this function)
    2) Loading data from DB for fields and dataframe
    3) Construct / recommend models given those fields
    4) Run regressions described by those models
    5) Format results

    Args:
        spec: dict of regression parameters ('datasetId', 'dependentVariable',
            'independentVariables', 'interactionTerms', 'transformations',
            'regressionType', 'tableLayout').
        project_id: project identifier used for all DB lookups.
        conditionals: optional list of filter conditions applied to the data.

    Returns:
        ({'table': ..., 'contribution_to_r_squared': ...}, 200) on success,
        or ('Not passed required parameters', 400) when required spec keys
        are missing.
    '''
    # Guard against a mutable default argument (shared across calls).
    conditionals = [] if conditionals is None else conditionals

    regression_type = spec.get('regressionType')
    independent_variables_names = spec.get('independentVariables', [])
    dependent_variable_name = spec.get('dependentVariable', [])
    interaction_term_ids = spec.get('interactionTerms', [])
    transformations = spec.get('transformations', {})
    dataset_id = spec.get('datasetId')
    table_layout = spec.get('tableLayout', MCT.LEAVE_ONE_OUT)
    # NOTE(review): 'model', 'estimator', 'degree', 'weights', and 'functions'
    # were parsed from spec but never used, so they are no longer read here.

    if not (dataset_id and dependent_variable_name):
        return 'Not passed required parameters', 400

    dependent_variable, independent_variables, interaction_terms, df = \
        load_data(dependent_variable_name, independent_variables_names, interaction_term_ids, dataset_id, project_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, transformations, interaction_terms, table_layout=table_layout)
    raw_table_results = run_models(df, patsy_models, dependent_variable, regression_type)

    formatted_table_results = format_results(raw_table_results, dependent_variable, independent_variables, considered_independent_variables_per_model, interaction_terms)

    contribution_to_r_squared = get_contribution_to_r_squared_data(formatted_table_results, table_layout)

    return {
        'table': formatted_table_results,
        'contribution_to_r_squared': contribution_to_r_squared
    }, 200