def get_anova_boxplot_data(project_id, dataset_id, df, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=15):
    anova_result = {}

    considered_independent_variable_name = independent_variables_names[0]
    considered_dependent_variable_name = dependent_variables_names[0]

    # Only return boxplot data if number of groups < THRESHOLD
    num_groups = len(get_unique(df[considered_independent_variable_name]))
    if num_groups > NUM_GROUPS_CUTOFF:
        return None

    val_box_spec = {
        'grouped_field': { 'name': considered_independent_variable_name },
        'boxed_field': { 'name': considered_dependent_variable_name }
    }

    viz_data = get_val_box_data(df, val_box_spec)

    result = {
        'project_id': project_id,
        'dataset_id': dataset_id,
        'spec': val_box_spec,
        'meta': {
            'labels': {
                'x': considered_independent_variable_name,
                'y': considered_dependent_variable_name
            },
        },
        'data': viz_data
    }
    return result
def compute_single_field_property_nontype(field_name, field_values, field_type, general_type, df=None, temporal_fields=[]):
    temporal = (len(temporal_fields) > 0)
    field_values_no_na = field_values.dropna(how='any')
    all_null = (len(field_values_no_na) == 0)
    num_na = len(field_values) - len(field_values_no_na)
    is_unique = detect_unique_list(field_values_no_na) if not temporal else get_temporal_uniqueness(field_name, field_type, general_type, df, temporal_fields)
    is_id = detect_id(field_name, field_type, is_unique)

    stats, contiguous, scale, viz_data, normality, unique_values = [ None ] * 6
    if not all_null:
        stats = calculate_field_stats(field_type, general_type, field_values)
        contiguous = get_contiguity(field_name, field_values, field_values_no_na, field_type, general_type)
        scale = get_scale(field_name, field_values, field_type, general_type, contiguous)
        viz_data = get_field_distribution_viz_data(field_name, field_values, field_type, general_type, scale, is_id, contiguous)
        normality = get_normality(field_name, field_values, field_type, general_type, scale)
        unique_values = [ e for e in get_unique(field_values_no_na) if not pd.isnull(e) ] if (scale in [ Scale.NOMINAL.value, Scale.ORDINAL.value ] and not is_unique) else None

    return {
        'scale': scale,  # Recompute if contiguous
        'contiguous': contiguous,
        'viz_data': viz_data,
        'is_id': is_id,
        'stats': stats,
        'num_na': num_na,
        'normality': normality,
        'is_unique': is_unique,
        'unique_values': unique_values,
        'manual': {}
    }
def run_aggregation_from_spec(spec, project_id, config={}, conditionals=[]):
    aggregation_variables_names = spec.get('aggregationVariablesNames')
    dataset_id = spec.get('datasetId')
    dependent_variable_name = spec.get('dependentVariableName')
    weight_variable_name = config.get('weightVariableName')
    num_variables = len(aggregation_variables_names)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_field_properties = db_access.get_field_properties(project_id, dataset_id)
    aggregation_variables = [ next((fp for fp in all_field_properties if fp['name'] == n), None) for n in aggregation_variables_names ]
    dependent_variable = next((fp for fp in all_field_properties if fp['name'] == dependent_variable_name), None)

    # Copy the list so appending below does not mutate the spec's own list
    subset_variables = list(aggregation_variables_names)
    if dependent_variable_name and dependent_variable_name != 'count':
        subset_variables += [ dependent_variable_name ]
    if weight_variable_name and weight_variable_name != 'UNIFORM':
        subset_variables += [ weight_variable_name ]
    subset_variables = get_unique(subset_variables, preserve_order=True)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ subset_variables ]
    df_ready = df_subset.dropna(how='all')  # Remove unclean rows

    result = {}
    if num_variables == 1:
        result['one_dimensional_contingency_table'] = create_one_dimensional_contingency_table(df_ready, aggregation_variables[0], dependent_variable, config=config)
    elif num_variables == 2:
        result['two_dimensional_contingency_table'] = create_contingency_table(df_ready, aggregation_variables, dependent_variable, config=config)

    return result, 200
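# A minimal sketch of the spec/config shapes consumed by run_aggregation_from_spec;
# the ids and field names below are hypothetical, and actually running the call
# requires an existing project/dataset behind db_access and get_data.
_example_aggregation_spec = {
    'datasetId': 42,
    'aggregationVariablesNames': ['species', 'island'],  # one field -> 1D table, two fields -> 2D table
    'dependentVariableName': 'body_mass_g'               # or 'count' to aggregate by counting rows
}
_example_aggregation_config = {
    'weightVariableName': 'UNIFORM',  # any other value is pulled into the subset as a weight column
    'binningConfigX': {},             # only consulted for continuous aggregation variables
    'binningConfigY': {}
}
# result, status = run_aggregation_from_spec(_example_aggregation_spec, project_id=1, config=_example_aggregation_config)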
def get_temporal_uniqueness(field_name, field_type, general_type, df, temporal_fields, MAX_TIMES_TO_SAMPLE=5):
    is_unique_by_time = False
    if temporal_fields and (df is not None) and (general_type == GDT.C.value or field_type == DT.INTEGER.value):
        uniqueness_by_time_fields = []
        for temporal_field in temporal_fields:
            temporal_field_name = temporal_field['name']
            df_column_subset = df[[ field_name, temporal_field_name ]]
            unique_times = get_unique(df_column_subset[temporal_field_name])

            final_df = df
            if len(unique_times) > MAX_TIMES_TO_SAMPLE:
                unique_times = unique_times[:MAX_TIMES_TO_SAMPLE]
                final_df = df_column_subset[df_column_subset[temporal_field_name].isin(unique_times)]

            unique_by_time_field = all(final_df.groupby([ temporal_field_name ])[field_name].apply(detect_unique_list))
            uniqueness_by_time_fields.append(unique_by_time_field)

        is_unique_by_time = any(uniqueness_by_time_fields)
    return is_unique_by_time
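# Illustrative sketch of the per-timestamp uniqueness check above: a field counts as
# "unique by time" when its values do not repeat within any single timestamp. Data and
# names are hypothetical, and pandas' Series.is_unique stands in for detect_unique_list.
def _example_uniqueness_by_time():
    import pandas as pd
    example_df = pd.DataFrame({
        'station_id': [1, 2, 3, 1, 2, 3],
        'measured_at': ['2020-01', '2020-01', '2020-01', '2020-02', '2020-02', '2020-02']
    })
    # One group per timestamp; no group repeats a station_id, so every group test passes
    per_time = example_df.groupby('measured_at')['station_id'].apply(lambda s: s.is_unique)
    print(all(per_time))  # True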
def create_contingency_table(df, aggregation_variables, dep_variable, config={}):
    results_dict = {}
    formatted_results_dict = {}
    unique_indep_values = []
    bin_data = {}
    aggregation_mean = False

    for i, variable in enumerate(aggregation_variables):
        binningConfigKey = 'binningConfigX' if (i == 0) else 'binningConfigY'
        name = variable['name']
        general_type = variable['general_type']
        scale = variable['scale']

        if scale in [ Scale.NOMINAL.value, Scale.ORDINAL.value ]:
            unique_indep_values.append(get_unique(df[name], True))
        elif scale in [ Scale.CONTINUOUS.value ]:
            values = df[name].dropna(how='any')
            (binning_edges, bin_names) = get_binning_edges_and_names(values, config[binningConfigKey])
            num_bins = len(binning_edges) - 1
            unique_indep_values.append(bin_names)
            bin_data[name] = {
                'num_bins': num_bins,
                'binning_edges': binning_edges,
                'bin_names': bin_names
            }

    if dep_variable:
        (results_dict, aggregation_mean) = create_contingency_table_with_dependent_variable(df, aggregation_variables, dep_variable, unique_indep_values, config=config, bin_data=bin_data)
    else:
        results_dict = create_contingency_table_with_no_dependent_variable(df, aggregation_variables, unique_indep_values, config=config, bin_data=bin_data)

    if not aggregation_mean:
        formatted_results_dict['column_headers'] = unique_indep_values[0] + [ 'Row Totals' ]
        column_totals = np.zeros(len(unique_indep_values[0]) + 1)
    else:
        formatted_results_dict['column_headers'] = unique_indep_values[0]
    formatted_results_dict['row_headers'] = unique_indep_values[1]
    formatted_results_dict['rows'] = []

    for row in unique_indep_values[1]:
        values = [ results_dict[row][col] for col in unique_indep_values[0] ]
        if not aggregation_mean:
            values.append(sum(values))
            column_totals += values
        formatted_results_dict['rows'].append({
            'field': row,
            'values': values
        })

    if not aggregation_mean:
        formatted_results_dict['column_totals'] = list(column_totals)

    return formatted_results_dict
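# Sketch of the two-dimensional output shape produced above (values are illustrative only),
# for the non-mean case where row totals and column totals are appended:
#
# {
#     'column_headers': ['a', 'b', 'Row Totals'],
#     'row_headers':    ['x', 'y'],
#     'rows': [
#         { 'field': 'x', 'values': [3, 1, 4] },
#         { 'field': 'y', 'values': [2, 2, 4] }
#     ],
#     'column_totals': [5.0, 3.0, 8.0]
# }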
def get_pairwise_comparison_data(df, independent_variables_names, dependent_variables_names, significance_cutoff=0.05, NUM_GROUPS_CUTOFF=15):
    '''
    datasetId
    independentVariables - list names, must be categorical
    dependentVariables - list names, must be numerical
    numBins - number of bins for the independent quantitative variables (if they exist)
    '''
    considered_independent_variable_name = independent_variables_names[0]
    considered_dependent_variable_name = dependent_variables_names[0]

    # Only return pairwise comparison data if number of groups < THRESHOLD
    num_groups = len(get_unique(df[considered_independent_variable_name]))
    if num_groups > NUM_GROUPS_CUTOFF:
        return None

    hsd_result = pairwise_tukeyhsd(df[considered_dependent_variable_name], df[considered_independent_variable_name], alpha=significance_cutoff)
    hsd_raw_data = hsd_result.summary().data[1:]
    st_range = np.abs(hsd_result.meandiffs) / hsd_result.std_pairs
    p_values = psturng(st_range, len(hsd_result.groupsunique), hsd_result.df_total)

    hsd_headers = [
        'Group 1', 'Group 2', 'Group Mean Difference (2 - 1)',
        'Lower Bound', 'Upper Bound', 'p-value',
        'Distinct (p < %s)' % significance_cutoff
    ]
    hsd_data = []

    for i in range(0, len(hsd_raw_data)):
        if isinstance(p_values, float):
            p_value = p_values
        else:
            p_value = p_values[i] if i < len(p_values) else None
        hsd_data_row = [
            hsd_raw_data[i][0],
            hsd_raw_data[i][1],
            hsd_result.meandiffs[i],
            hsd_result.confint[i][0],
            hsd_result.confint[i][1],
            p_value,
            ( 'True' if (p_value <= significance_cutoff) else 'False' )
        ]
        hsd_data.append(hsd_data_row)

    return {
        'column_headers': hsd_headers,
        'rows': hsd_data
    }
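# A minimal, self-contained sketch of the Tukey HSD call driving the table above,
# assuming statsmodels, numpy, and pandas are installed; data and names are hypothetical.
def _example_pairwise_tukeyhsd():
    import numpy as np
    import pandas as pd
    from statsmodels.stats.multicomp import pairwise_tukeyhsd

    example_df = pd.DataFrame({
        'group': ['a'] * 10 + ['b'] * 10 + ['c'] * 10,
        'value': np.concatenate([
            np.random.normal(0.0, 1.0, 10),
            np.random.normal(1.0, 1.0, 10),
            np.random.normal(2.0, 1.0, 10),
        ])
    })
    hsd = pairwise_tukeyhsd(example_df['value'], example_df['group'], alpha=0.05)
    # summary().data[1:] is the same per-pair table sliced in get_pairwise_comparison_data
    print(hsd.summary())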
def create_one_dimensional_contingency_table(df, aggregation_variable, dep_variable, config={}):
    results_dict = {}
    formatted_results_dict = {}
    unique_indep_values = []
    aggregation_mean = False

    general_type = aggregation_variable['general_type']
    scale = aggregation_variable['scale']
    name = aggregation_variable['name']
    bin_data = {}

    if scale in [ Scale.ORDINAL.value, Scale.NOMINAL.value ]:
        unique_indep_values = get_unique(df[name], True)
    elif scale in [ Scale.CONTINUOUS.value ]:
        values = df[name].dropna(how='any')
        (binning_edges, bin_names) = get_binning_edges_and_names(values, config.get('binningConfigX'))  # TODO Update binning function
        num_bins = len(binning_edges) - 1
        bin_data = {
            'num_bins': num_bins,
            'binning_edges': binning_edges,
            'bin_names': bin_names
        }
        unique_indep_values = bin_names

    if dep_variable:
        (results_dict, aggregation_mean) = create_one_dimensional_contingency_table_with_dependent_variable(df, aggregation_variable, dep_variable, unique_indep_values, config=config, bin_data=bin_data)
    else:
        results_dict = create_one_dimensional_contingency_table_with_no_dependent_variable(df, aggregation_variable, unique_indep_values, config=config, bin_data=bin_data)

    formatted_results_dict['column_headers'] = [ 'VARIABLE', 'AGGREGATION' ]
    formatted_results_dict['row_headers'] = unique_indep_values
    formatted_results_dict['rows'] = []

    if not aggregation_mean:
        formatted_results_dict['column_total'] = 0

    for var in unique_indep_values:
        value = results_dict[var]
        if not aggregation_mean:
            formatted_results_dict['column_total'] += value
        formatted_results_dict['rows'].append({
            'field': var,
            'value': value
        })

    return formatted_results_dict
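# Sketch of the one-dimensional output shape produced above (values are illustrative only),
# for the counting case where a running column total is kept:
#
# {
#     'column_headers': ['VARIABLE', 'AGGREGATION'],
#     'row_headers': ['a', 'b', 'c'],
#     'rows': [
#         { 'field': 'a', 'value': 4 },
#         { 'field': 'b', 'value': 2 },
#         { 'field': 'c', 'value': 1 }
#     ],
#     'column_total': 7
# }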
def ttest(df, fields, indep, dep):
    # Ensure single field
    dep_field_name = dep[0]
    indep_field_name = indep[0]

    unique_indep_values = get_unique(df[indep_field_name])

    subsets = {}
    for v in unique_indep_values:
        subsets[v] = np.array(df[df[indep_field_name] == v][dep_field_name])

    result = {}
    for (x, y) in combinations(unique_indep_values, 2):
        (statistic, pvalue) = ttest_ind(subsets[x], subsets[y])
        result[str([ x, y ])] = {
            'statistic': statistic,
            'pvalue': pvalue
        }

    return result
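# Hedged usage sketch for ttest above, assuming scipy's ttest_ind, numpy, itertools.combinations,
# and get_unique are importable at module level as used in the function; the data is hypothetical.
def _example_ttest():
    import pandas as pd
    example_df = pd.DataFrame({
        'treatment': ['a', 'a', 'a', 'b', 'b', 'b'],
        'outcome': [1.0, 1.2, 0.9, 2.1, 2.4, 1.9]
    })
    # One independent two-sample t-test per pair of groups, keyed by the stringified pair
    result = ttest(example_df, fields=None, indep=['treatment'], dep=['outcome'])
    # e.g. {"['a', 'b']": {'statistic': ..., 'pvalue': ...}}
    print(result)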
def return_data_list_categorical(data_column, variable_name):
    '''
    helper function to return visualization data in the right format for categorical variables
    data_column: represents the array of data
    variable_name: represents the name of the variable that is being visualized
    '''
    unique_elements = get_unique(data_column)

    count_dict = {}
    data_array = []
    data_array.append([ variable_name, 'count' ])

    for ele in data_column:
        if count_dict.get(ele):
            count_dict[ele] += 1
        else:
            count_dict[ele] = 1

    for name in unique_elements:
        data_array.append([ name, count_dict[name] ])

    return data_array
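# Usage sketch for return_data_list_categorical; the column values are hypothetical.
# The first row is a header pair, followed by one [value, count] row per unique element.
def _example_return_data_list_categorical():
    data = ['red', 'blue', 'red', 'green', 'red']
    print(return_data_list_categorical(data, 'color'))
    # Expected shape (row order follows get_unique):
    # [['color', 'count'], ['red', 3], ['blue', 1], ['green', 1]]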