Example #1
0
def join_datasets(project_id, left_dataset_id, right_dataset_id, on, left_on, right_on, how, left_suffix, right_suffix, new_dataset_name_prefix):
    left_df = get_data(project_id=project_id, dataset_id=left_dataset_id)
    right_df = get_data(project_id=project_id, dataset_id=right_dataset_id)

    project = db_access.get_project(project_id)
    original_left_dataset = db_access.get_dataset(project_id, left_dataset_id)
    original_right_dataset = db_access.get_dataset(project_id, right_dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_left_dataset_title = original_left_dataset['title']
    original_right_dataset_title = original_right_dataset['title']

    fallback_title = original_left_dataset_title[:20] + original_right_dataset_title[:20]
    original_dataset_title = original_left_dataset_title + original_right_dataset_title
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    left_columns = left_df.columns.values
    right_columns = right_df.columns.values
    on = list_elements_from_indices(left_columns, on)

    # Not using left_on or right_on for now
    df_joined = left_df.merge(right_df, how=how, on=on, suffixes=[left_suffix, right_suffix])

    return df_joined, new_dataset_title, new_dataset_name, new_dataset_path
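# Illustrative sketch (not part of the original module): the core merge that
# join_datasets performs, shown on two small hypothetical in-memory frames.
import pandas as pd

_left = pd.DataFrame({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})
_right = pd.DataFrame({'id': [2, 3, 4], 'value': ['x', 'y', 'z']})

# how='inner' keeps only ids present in both frames; overlapping non-key
# column names receive the given suffixes, mirroring the merge call above.
_joined = _left.merge(_right, how='inner', on=['id'], suffixes=['_left', '_right'])
# _joined columns: id, value_left, value_right
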
def get_initial_regression_model_recommendation(project_id, dataset_id, dependent_variable_id=None, recommendation_type=MRT.LASSO.value, table_layout=MCT.LEAVE_ONE_OUT.value, data_size_cutoff=current_app.config['ANALYSIS_DATA_SIZE_CUTOFF'], categorical_value_limit=current_app.config['ANALYSIS_CATEGORICAL_VALUE_LIMIT']):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    if len(df) > data_size_cutoff:
        df = df.sample(data_size_cutoff)
    field_properties = db_access.get_field_properties(project_id, dataset_id)
    quantitative_field_properties = [ fp for fp in field_properties if fp['general_type'] == 'q']

    dependent_variable = next((f for f in field_properties if f['id'] == dependent_variable_id), None) \
        if dependent_variable_id \
        else np.random.choice(quantitative_field_properties, size=1)[0]

    independent_variables = []
    for fp in field_properties:
        if (fp['name'] != dependent_variable['name']):
            if (fp['general_type'] == 'c' and (fp['is_unique'] or len(fp['unique_values']) > categorical_value_limit)):
                continue
            independent_variables.append(fp)

    recommendationTypeToFunction = {
        MRT.FORWARD_R2.value: forward_r2,
        MRT.LASSO.value: lasso,
        MRT.RFE.value: recursive_feature_elimination,
        MRT.FORWARD_F.value: f_regression
    }

    result = recommendationTypeToFunction[recommendation_type](df, dependent_variable, independent_variables)

    return {
        'recommended': True,
        'table_layout': table_layout,
        'recommendation_type': recommendation_type,
        'dependent_variable_id': dependent_variable['id'],
        'independent_variables_ids': [ x['id'] for x in result ],
    }
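# Illustrative sketch (hypothetical field-property dicts): how the loop above
# drops categorical fields that are unique or too high-cardinality before they
# are offered as independent variables.
_categorical_value_limit = 10
_field_properties = [
    {'name': 'price', 'general_type': 'q', 'is_unique': False, 'unique_values': []},
    {'name': 'user_id', 'general_type': 'c', 'is_unique': True, 'unique_values': []},
    {'name': 'region', 'general_type': 'c', 'is_unique': False, 'unique_values': ['N', 'S', 'E', 'W']},
]
_dependent = {'name': 'price'}

_independent = [
    fp for fp in _field_properties
    if fp['name'] != _dependent['name']
    and not (fp['general_type'] == 'c'
             and (fp['is_unique'] or len(fp['unique_values']) > _categorical_value_limit))
]
# _independent keeps only 'region': 'price' is the dependent variable and
# 'user_id' is a unique categorical field.
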
Example #3
0
def run_aggregation_from_spec(spec, project_id, config={}, conditionals=[]):
    aggregation_variables_names = spec.get('aggregationVariablesNames')
    dataset_id = spec.get('datasetId')
    dependent_variable_name = spec.get('dependentVariableName')
    weight_variable_name = config.get('weightVariableName')
    num_variables = len(aggregation_variables_names)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_field_properties = db_access.get_field_properties(project_id, dataset_id)
    aggregation_variables = [ next((fp for fp in all_field_properties if fp['name'] == n), None) for n in aggregation_variables_names ]
    dependent_variable = next((fp for fp in all_field_properties if fp['name'] == dependent_variable_name), None)

    subset_variables = aggregation_variables_names
    if dependent_variable_name and dependent_variable_name != 'count':
        subset_variables += [ dependent_variable_name ]
    if weight_variable_name and weight_variable_name != 'UNIFORM':
        subset_variables += [ weight_variable_name ]
    subset_variables = get_unique(subset_variables, preserve_order=True)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ subset_variables ]
    df_ready = df_subset.dropna(how='all')  # Remove unclean

    result = {}
    if num_variables == 1:
        result['one_dimensional_contingency_table'] = create_one_dimensional_contingency_table(df_ready, aggregation_variables[0], dependent_variable, config=config)
    elif num_variables == 2:
        result['two_dimensional_contingency_table'] = create_contingency_table(df_ready, aggregation_variables, dependent_variable, config=config)

    return result, 200
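# Illustrative sketch (standalone, pandas only): a two-dimensional count
# aggregation comparable in spirit to what create_contingency_table presumably
# returns; the real helpers also handle dependent and weight variables via config.
import pandas as pd

_df = pd.DataFrame({
    'color': ['red', 'red', 'blue', 'blue', 'blue'],
    'size': ['S', 'M', 'S', 'S', 'M'],
})
_counts = pd.crosstab(_df['color'], _df['size'])
# _counts is a color x size table of row counts
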
Example #4
0
    def post(self, field_id):
        args = fieldPostParser.parse_args()
        project_id = args.get('project_id')
        dataset_id = args.get('dataset_id')
        field_type = args.get('type')
        field_is_id = args.get('isId')
        field_color = args.get('color')

        if field_type:
            if (field_type not in quantitative_types) \
                and (field_type not in categorical_types) \
                and (field_type not in temporal_types):
                return make_response(jsonify({'status': 'Invalid field type.'}))
            general_type = specific_type_to_general_type[field_type]

            field_property = db_access.get_field_property(project_id, dataset_id, field_id)
            field_name = field_property['name']
            df = get_data(project_id=project_id, dataset_id=dataset_id)

            updated_properties = compute_single_field_property_nontype(field_name, df[field_name], field_type, general_type)

            field_property_document = \
                db_access.update_field_properties_type_by_id(project_id, field_id, field_type, general_type, updated_properties)

        if field_is_id is not None:
            field_property_document = \
                db_access.update_field_properties_is_id_by_id(project_id, field_id, field_is_id)

        if field_color is not None:
            field_property_document = \
                db_access.update_field_properties_color_by_id(project_id, field_id, field_color)

        return make_response(jsonify(field_property_document))
Example #5
0
def unpivot_dataset(project_id, dataset_id, pivot_fields, variable_name,
                    value_name, new_dataset_name_prefix):
    '''
    Returns unpivoted dataframe
    '''
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'],
                                   project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'],
                                   str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    columns = df.columns.values
    pivot_fields = list_elements_from_indices(columns, pivot_fields)
    preserved_fields = difference_of_lists(columns, pivot_fields)
    df_unpivoted = pd.melt(df,
                           id_vars=preserved_fields,
                           value_vars=pivot_fields,
                           var_name=variable_name,
                           value_name=value_name)

    return df_unpivoted, new_dataset_title, new_dataset_name, new_dataset_path
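# Illustrative sketch of the pd.melt call above on a tiny hypothetical frame:
# preserved columns become id_vars and the pivoted columns are stacked into
# (variable, value) pairs.
import pandas as pd

_df = pd.DataFrame({'country': ['US', 'FR'], '2019': [1, 2], '2020': [3, 4]})
_unpivoted = pd.melt(_df,
                     id_vars=['country'],
                     value_vars=['2019', '2020'],
                     var_name='year',
                     value_name='count')
# _unpivoted has one row per (country, year) pair
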
Example #6
0
def unpivot_dataset(project_id, dataset_id, pivot_fields, variable_name, value_name, new_dataset_name_prefix):
    '''
    Returns unpivoted dataframe
    '''
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    columns = df.columns.values
    pivot_fields = list_elements_from_indices(columns, pivot_fields)
    preserved_fields = difference_of_lists(columns, pivot_fields)
    df_unpivoted = pd.melt(df, id_vars=preserved_fields, value_vars=pivot_fields, var_name=variable_name, value_name=value_name)

    return df_unpivoted, new_dataset_title, new_dataset_name, new_dataset_path
Example #7
0
def load_data(dependent_variable_name, independent_variables_names,
              interaction_term_ids, dataset_id, project_id):
    '''
    Load DF and full field documents
    '''
    # Map variables to field documents
    all_fields = db_access.get_field_properties(project_id, dataset_id)
    interaction_terms = db_access.get_interaction_term_properties(
        interaction_term_ids)
    dependent_variable = next(
        (f for f in all_fields if f['name'] == dependent_variable_name), None)

    independent_variables = []
    if independent_variables_names:
        independent_variables = get_full_field_documents_from_field_names(
            all_fields, independent_variables_names)
    else:
        for field in all_fields:
            if (not (field['general_type'] == 'c' and field['is_unique']) \
                and field['name'] != dependent_variable_name):
                independent_variables.append(field)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    # Drop NAs
    df_subset = df[[dependent_variable_name] + independent_variables_names]
    df_ready = df_subset.dropna(axis=0, how='all')

    return dependent_variable, independent_variables, interaction_terms, df_ready
Example #8
0
def load_data(dependent_variable_name, independent_variables_names, interaction_term_ids, dataset_id, project_id):
    '''
    Load DF and full field documents
    '''
    # Map variables to field documents
    all_fields = db_access.get_field_properties(project_id, dataset_id)
    interaction_terms = db_access.get_interaction_term_properties(interaction_term_ids)
    dependent_variable = next((f for f in all_fields if f['name'] == dependent_variable_name), None)

    independent_variables = []
    if independent_variables_names:
        independent_variables = get_full_field_documents_from_field_names(all_fields, independent_variables_names)
    else:
        for field in all_fields:
            if (not (field['general_type'] == 'c' and field['is_unique']) \
                and field['name'] != dependent_variable_name):
                independent_variables.append(field)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    # Drop NAs
    df_subset = df[[dependent_variable_name] + independent_variables_names]
    df_ready = df_subset.dropna(axis=0, how='all')

    return dependent_variable, independent_variables, interaction_terms, df_ready
Example #9
0
def get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config, df=None, precomputed={}, data_formats=['visualize', 'table', 'score']):
    '''
    Returns a dictionary containing data corresponding to spec (in automated-viz
    structure), and all necessary information to interpret data.

    There are three types of formats:
        Score: a dict of lists for scoring
        Visualize: a list of dicts (collection)
        Table: {columns: list, data: matrix}

    Args:
        spec, project_id, conditionals, config, data_formats (list of 'score', 'visualize', 'table', or 'count')
    Returns:
        data specified by spec, in specified format

    '''
    for f in data_formats:
        if f not in [u'score', u'visualize', u'table', u'count']:
            raise ValueError('Passed incorrect data format', f)
    final_data = dict([(f, {}) for f in data_formats])

    gp = spec['generating_procedure']
    args = spec['args']
    dataset_id = spec['dataset_id']

    logger.debug('Generating Procedure: %s', gp)
    logger.debug('Arguments: %s', args)
    start_time = time()

    if df is None:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    id_fields = [ fp for fp in db_access.get_field_properties(project_id, dataset_id) if fp['is_id']]

    generating_procedure_to_data_function = {
        GeneratingProcedure.AGG.value: get_agg_data,
        GeneratingProcedure.IND_VAL.value: get_ind_val_data,
        GeneratingProcedure.BIN_AGG.value: get_bin_agg_data,
        GeneratingProcedure.MULTIGROUP_COUNT.value: get_multigroup_count_data,
        GeneratingProcedure.MULTIGROUP_AGG.value: get_multigroup_agg_data,
        GeneratingProcedure.VAL_BOX.value: get_val_box_data,
        GeneratingProcedure.VAL_AGG.value: get_val_agg_data,
        GeneratingProcedure.VAL_VAL.value: get_raw_comparison_data,
        GeneratingProcedure.VAL_COUNT.value: get_val_count_data,
        GeneratingProcedure.AGG_AGG.value: get_agg_agg_data,
    }
    data = generating_procedure_to_data_function[gp](df,
        args,
        id_fields=id_fields,
        precomputed=precomputed,
        config=config,
        data_formats=data_formats
    )

    logger.debug('Data for %s: %s', gp, time() - start_time)
    return data
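# Illustrative sketch (toy handlers, hypothetical names): the dispatch pattern
# used above, where a generating-procedure key selects the data function and
# all handlers share one keyword signature.
def _toy_agg(df, args, **kwargs):
    return {'type': 'agg', 'n': len(df)}

def _toy_count(df, args, **kwargs):
    return {'type': 'count', 'n': len(df)}

_dispatch = {'agg': _toy_agg, 'count': _toy_count}
_result = _dispatch['agg']([1, 2, 3], args={})
# _result == {'type': 'agg', 'n': 3}
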
Example #10
0
def get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config, df=None, precomputed={}, data_formats=['visualize', 'table', 'score']):
    '''
    Returns a dictionary containing data corresponding to spec (in automated-viz
    structure), and all necessary information to interpret data.

    There are three types of formats:
        Score: a dict of lists for scoring
        Visualize: a list of dicts (collection)
        Table: {columns: list, data: matrix}

    Args:
        spec, project_id, conditionals, config, data_formats (list of 'score', 'visualize', 'table', or 'count')
    Returns:
        data specified by spec, in specified format

    '''
    for f in data_formats:
        if f not in [u'score', u'visualize', u'table', u'count']:
            raise ValueError('Passed incorrect data format', f)
    final_data = dict([(f, {}) for f in data_formats])

    gp = spec['generating_procedure']
    args = spec['args']
    dataset_id = spec['dataset_id']

    logger.debug('Generating Procedure: %s', gp)
    logger.debug('Arguments: %s', args)
    start_time = time()

    if df is None:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    id_fields = [ fp for fp in db_access.get_field_properties(project_id, dataset_id) if fp['is_id']]

    generating_procedure_to_data_function = {
        GeneratingProcedure.AGG.value: get_agg_data,
        GeneratingProcedure.IND_VAL.value: get_ind_val_data,
        GeneratingProcedure.BIN_AGG.value: get_bin_agg_data,
        GeneratingProcedure.MULTIGROUP_COUNT.value: get_multigroup_count_data,
        GeneratingProcedure.MULTIGROUP_AGG.value: get_multigroup_agg_data,
        GeneratingProcedure.VAL_BOX.value: get_val_box_data,
        GeneratingProcedure.VAL_AGG.value: get_val_agg_data,
        GeneratingProcedure.VAL_VAL.value: get_raw_comparison_data,
        GeneratingProcedure.VAL_COUNT.value: get_val_count_data,
        GeneratingProcedure.AGG_AGG.value: get_agg_agg_data,
    }
    data = generating_procedure_to_data_function[gp](df,
        args,
        id_fields=id_fields,
        precomputed=precomputed,
        config=config,
        data_formats=data_formats
    )

    logger.debug('Data for %s: %s', gp, time() - start_time)
    return data
Example #11
0
def join_datasets(project_id, left_dataset_id, right_dataset_id, on, left_on,
                  right_on, how, left_suffix, right_suffix,
                  new_dataset_name_prefix):
    left_df = get_data(project_id=project_id, dataset_id=left_dataset_id)
    right_df = get_data(project_id=project_id, dataset_id=right_dataset_id)

    project = db_access.get_project(project_id)
    original_left_dataset = db_access.get_dataset(project_id, left_dataset_id)
    original_right_dataset = db_access.get_dataset(project_id,
                                                   right_dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'],
                                   project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'],
                                   str(project_id))

    original_left_dataset_title = original_left_dataset['title']
    original_right_dataset_title = original_right_dataset['title']

    fallback_title = (original_left_dataset_title[:20] +
                      original_right_dataset_title[:20])
    original_dataset_title = original_left_dataset_title + original_right_dataset_title
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    left_columns = left_df.columns.values
    right_columns = right_df.columns.values
    on = list_elements_from_indices(left_columns, on)

    # Not using left_on or right_on for now
    df_joined = left_df.merge(right_df,
                              how=how,
                              on=on,
                              suffixes=[left_suffix, right_suffix])

    return df_joined, new_dataset_title, new_dataset_name, new_dataset_path
Example #12
0
def run_correlation_from_spec(spec, project_id, conditionals=[]):
    dataset_id = spec.get("datasetId")
    correlation_variables = spec.get("correlationVariables")
    correlation_variables_names = correlation_variables

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    df_subset = df[ correlation_variables_names ]
    df_ready = df_subset.dropna(how='all')

    correlation_result = run_correlation(df_ready, correlation_variables)
    correlation_scatterplots = get_correlation_scatterplot_data(df_ready, correlation_variables)
    return {
        'table': correlation_result,
        'scatterplots': correlation_scatterplots
    }, 200
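# Illustrative sketch (pandas only, hypothetical columns): the kind of pairwise
# correlation table run_correlation presumably builds from the subsetted,
# NA-dropped frame.
import pandas as pd

_df = pd.DataFrame({'height': [150, 160, 170, 180],
                    'weight': [50, 60, 65, 80]})
_corr = _df[['height', 'weight']].dropna(how='all').corr()
# _corr is a 2x2 symmetric matrix of Pearson correlations
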
def attach_data_to_viz_specs(enumerated_viz_specs, dataset_id, project_id, conditionals, config, data_formats=['score', 'table', 'visualize', 'count']):
    '''
    Get data corresponding to each viz spec (before filtering and scoring)
    '''
    viz_specs_with_data = []

    start_time = time()
    TIME_CUTOFF = 10
    # Get dataframe
    if project_id and dataset_id:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        conditioned_df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    precomputed = {
        'groupby': {}
    }
    for i, spec in enumerate(enumerated_viz_specs):
        if (time() - start_time) > TIME_CUTOFF:
            continue
        viz_spec_with_data = spec
        # TODO Move this into another function
        if spec['args'].get('grouped_field'):
            grouped_field = spec['args']['grouped_field']['name']
            grouped_df = conditioned_df.groupby(grouped_field)
            precomputed['groupby'][grouped_field] = grouped_df

        try:
            data = get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config,
                df=conditioned_df,
                precomputed=precomputed,
                data_formats=data_formats
            )

        except Exception as e:
            logger.error("Error getting viz data %s", e, exc_info=True)
            continue

        if not data:
            logger.info('No data for spec with generating procedure %s', spec['generating_procedure'])
            continue

        viz_spec_with_data['data'] = data
        viz_specs_with_data.append(viz_spec_with_data)

    logger.debug('Attaching data to %s specs took %.3fs', len(viz_specs_with_data), time() - start_time)
    return viz_specs_with_data
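# Illustrative sketch (toy data): precomputing a groupby once and reusing it,
# as the precomputed['groupby'] cache above does so the conditioned frame is
# not regrouped for every spec that shares a grouped field.
import pandas as pd

_df = pd.DataFrame({'group': ['a', 'a', 'b'], 'value': [1, 2, 3]})
_precomputed = {'groupby': {}}

if 'group' not in _precomputed['groupby']:
    _precomputed['groupby']['group'] = _df.groupby('group')

_sums = _precomputed['groupby']['group']['value'].sum()
# _sums: a -> 3, b -> 3
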
Example #14
0
def run_correlation_from_spec(spec, project_id, conditionals=[]):
    dataset_id = spec.get("datasetId")
    correlation_variables = spec.get("correlationVariables")
    correlation_variables_names = correlation_variables

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    df_subset = df[correlation_variables_names]
    df_ready = df_subset.dropna(how='all')

    correlation_result = run_correlation(df_ready, correlation_variables)
    correlation_scatterplots = get_correlation_scatterplot_data(
        df_ready, correlation_variables)
    return {
        'table': correlation_result,
        'scatterplots': correlation_scatterplots
    }, 200
def run_comparison_from_spec(spec, project_id):
    # 1) Parse and validate arguments
    indep = spec.get('indep', [])
    dep = spec.get('dep', [])
    dataset_id = spec.get('dataset_id')
    test = spec.get('test', 'ttest')
    if not (dataset_id and dep):
        return 'Not passed required parameters', 400

    fields = db_access.get_field_properties(project_id, dataset_id)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = df.dropna()  # Remove unclean

    # 3) Run test based on parameters and arguments
    comparison_result = run_comparison(df, fields, indep, dep, test)
    return {'data': comparison_result}, 200
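# Illustrative sketch (scipy, toy samples): the default test name above is
# 'ttest'; a two-sample t-test like this is presumably what run_comparison
# dispatches to in that case.
from scipy import stats

_group_a = [5.1, 4.9, 5.3, 5.0]
_group_b = [5.8, 6.1, 5.9, 6.0]
_statistic, _p_value = stats.ttest_ind(_group_a, _group_b)
# a small _p_value suggests the two group means differ
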
Example #16
0
def compute_dataset_properties(dataset_id, project_id, path=None):
    ''' Compute and return dictionary containing whole-dataset properties '''

    print('compute_dataset_properties', 'path', path)

    if not path:
        dataset = db_access.get_dataset(project_id, dataset_id)
        path = dataset['path']
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    n_rows, n_cols = df.shape
    field_names = df.columns.values.tolist()

    # field_types = []
    # for (i, field_name) in enumerate(df):
    #     logger.debug('Calculating types for field %s', field_name)
    #     field_values = df[field_name]
    #     field_type, field_type_scores = calculate_field_type(field_name, field_values, i, n_cols)
    #     field_types.append(field_type)

    # Forgoing time series detection for now (expensive)
    # time_series = detect_time_series(df, field_types)
    # if time_series:
    #     time_series = True
    time_series = False

    structure = 'wide' if time_series else 'long'

    properties = {
        'n_rows': n_rows,
        'n_cols': n_cols,
        'field_names': field_names,
        # 'field_types': field_types,
        'field_accessors': [i for i in range(0, n_cols)],
        'structure': structure,
        'is_time_series': time_series,
    }

    return {
        'desc': 'Done computing dataset properties',
        'result': properties,
    }
Example #17
0
def run_comparison_from_spec(spec, project_id, conditionals=[]):
    dependent_variables_names = spec.get('dependentVariablesNames', [])
    independent_variables_names = spec.get('independentVariablesNames', [])  # [ iv[1] for iv in independent_variables ]
    dataset_id = spec.get('datasetId')
    significance_cutoff = spec.get('significanceCutoff', 0.05)
    independence = spec.get('independence', True)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_fields = db_access.get_field_properties(project_id, dataset_id)
    dependent_variables = [ f for f in all_fields if f['name'] in dependent_variables_names ]
    independent_variables = [ f for f in all_fields if f['name'] in independent_variables_names ]

    can_run_numerical_comparison_independent = len([ iv for iv in independent_variables if iv['scale'] == 'continuous' ]) >= 2 and len(dependent_variables_names) == 0
    can_run_numerical_comparison_dependent = len([ dv for dv in dependent_variables if dv['scale'] == 'continuous' ]) >= 2 and len(independent_variables_names) == 0
    can_run_numerical_comparison = (can_run_numerical_comparison_dependent or can_run_numerical_comparison_independent)

    can_run_anova = (len(dependent_variables) and len(independent_variables))
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ dependent_variables_names + independent_variables_names ]
    df_ready = df_subset.dropna(how='any')  # Remove unclean
    
    result = {}
    NUM_GROUPS_CUTOFF = 15
    if can_run_anova:
        anova = run_anova(df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        anova_boxplot_data = get_anova_boxplot_data(project_id, dataset_id, df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        pairwise_comparison_data = get_pairwise_comparison_data(df_ready, independent_variables_names, dependent_variables_names, significance_cutoff=significance_cutoff, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        result.update({
            'anova': anova,
            'anova_boxplot': anova_boxplot_data,
            'pairwise_comparison': pairwise_comparison_data,
        })

    if can_run_numerical_comparison:
        if can_run_numerical_comparison_independent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, independent_variables_names, independence=True)
        if can_run_numerical_comparison_dependent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, dependent_variables_names, independence=False)
        result['numerical_comparison'] = numerical_comparison_data

    return result, 200
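# Illustrative sketch (scipy, toy groups): a one-way ANOVA of the kind
# run_anova presumably wraps, comparing a dependent variable across the
# groups of one independent variable.
from scipy import stats

_groups = {
    'north': [2.1, 2.3, 1.9],
    'south': [3.0, 3.2, 2.8],
    'east': [2.0, 2.2, 2.1],
}
_f_statistic, _p_value = stats.f_oneway(*_groups.values())
# a small _p_value suggests at least one group mean differs
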
Example #18
0
def reduce_dataset(project_id, dataset_id, column_ids_to_keep, new_dataset_name_prefix):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    df_reduced = df.iloc[:, column_ids_to_keep]

    return df_reduced, new_dataset_title, new_dataset_name, new_dataset_path
Example #19
0
    def post(self, field_id):
        args = fieldPostParser.parse_args()
        project_id = args.get('project_id')
        dataset_id = args.get('dataset_id')
        field_type = args.get('type')
        field_is_id = args.get('isId')
        field_color = args.get('color')

        if field_type:
            if (field_type not in quantitative_types) \
                and (field_type not in categorical_types) \
                and (field_type not in temporal_types):
                return make_response(jsonify({'status':
                                              'Invalid field type.'}))
            general_type = specific_type_to_general_type[field_type]

            field_property = db_access.get_field_property(
                project_id, dataset_id, field_id)
            field_name = field_property['name']
            df = get_data(project_id=project_id, dataset_id=dataset_id)

            updated_properties = compute_single_field_property_nontype(
                field_name, df[field_name], field_type, general_type)

            field_property_document = \
                db_access.update_field_properties_type_by_id(project_id, field_id, field_type, general_type, updated_properties)

        if field_is_id is not None:
            field_property_document = \
                db_access.update_field_properties_is_id_by_id(project_id, field_id, field_is_id)

        if field_color is not None:
            field_property_document = \
                db_access.update_field_properties_color_by_id(project_id, field_id, field_color)

        return make_response(jsonify(field_property_document))
Example #20
0
def compute_all_field_properties(dataset_id, project_id, should_detect_hierarchical_relationships=True, track_started=True):
    '''
    Compute field properties of a specific dataset
    Currently only getting properties by column

    Arguments: project_id + dataset ids
    Returns a mapping from dataset_ids to properties
    '''

    logger.debug("Computing field properties for dataset_id %s", dataset_id)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    num_fields = len(df.columns)
    field_properties = [ {} for i in range(num_fields) ]

    palette = total_palette + [ '#007BD7' for i in range(0, num_fields - len(total_palette)) ]
    if num_fields <= len(total_palette):
        palette = sample_with_maximum_distance(total_palette, num_fields, random_start=True)

    # 1) Detect field types
    for (i, field_name) in enumerate(df):
        logger.info('[%s | %s] Detecting type for field %s', project_id, dataset_id, field_name)
        field_values = df[field_name]
        d = field_property_type_object = compute_single_field_property_type(field_name, field_values, field_position=i, num_fields=num_fields)
        field_properties[i].update({
            'index': i,
            'name': field_name,
        })
        field_properties[i].update(d)


    temporal_fields = [ fp for fp in field_properties if (fp['general_type'] == GDT.T.value)]

    # Necessary to coerce here?
    coerced_df = coerce_types(df, field_properties)
    IMD.insertData(dataset_id, coerced_df)

    # 2) Rest
    for (i, field_name) in enumerate(coerced_df):
        field_values = coerced_df[field_name]

        d = field_properties_nontype_object = compute_single_field_property_nontype(
            field_name,
            field_values,
            field_properties[i]['type'],
            field_properties[i]['general_type'],
            df=coerced_df,
            temporal_fields=temporal_fields
        )
        field_properties[i].update({
            'color': palette[i],
            'children': [],
            'parents': [],
            'one_to_ones': [],
            'manual': {}
        })
        field_properties[i].update(d)

    if should_detect_hierarchical_relationships:
        hierarchical_relationships = detect_hierarchical_relationships(coerced_df, field_properties)
        MAX_UNIQUE_VALUES_THRESHOLD = 100   
        for field_a, field_b in hierarchical_relationships:

            if [ field_b, field_a ] in hierarchical_relationships:
                field_properties[field_properties.index(field_a)]['one_to_ones'].append(field_b['name'])
            else:
                field_properties[field_properties.index(field_a)]['children'].append(field_b['name'])
                field_properties[field_properties.index(field_b)]['parents'].append(field_a['name'])

    return {
        'desc': 'Done computing field properties for %s fields' % len(field_properties),
        'result': field_properties
    }
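# Illustrative sketch (pandas dtypes only, hypothetical general types): a much
# simpler per-column pass than compute_single_field_property_type, just to show
# the shape of the per-field property records built above.
import pandas as pd

_df = pd.DataFrame({'age': [23, 35, 31], 'city': ['NYC', 'SF', 'NYC']})
_field_properties = []
for _i, _name in enumerate(_df):
    _general_type = 'q' if pd.api.types.is_numeric_dtype(_df[_name]) else 'c'
    _field_properties.append({
        'index': _i,
        'name': _name,
        'general_type': _general_type,
        'is_unique': _df[_name].is_unique,
    })
# _field_properties: [{'index': 0, 'name': 'age', 'general_type': 'q', ...}, ...]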