def compute_relationships(project_id):
    all_datasets = db_access.get_datasets(project_id)
    relationships = []

    if len(all_datasets) == 1:
        return relationships

    for dataset_a, dataset_b in combinations(all_datasets, 2):
        dataset_a_fields = db_access.get_field_properties(project_id, dataset_a['id'])
        dataset_b_fields = db_access.get_field_properties(project_id, dataset_b['id'])

        for index_a, field_a in enumerate(dataset_a_fields):
            for index_b, field_b in enumerate(dataset_b_fields):
                logger.info('%s:%s - %s:%s', dataset_a['title'], field_a['name'], dataset_b['title'], field_b['name'])

                unique_field_a_values = field_a.get('unique_values')
                unique_field_b_values = field_b.get('unique_values')
                if (not unique_field_a_values) or (not unique_field_b_values):
                    continue

                len_a = len(unique_field_a_values)
                len_b = len(unique_field_b_values)

                d = get_distance(unique_field_a_values, unique_field_b_values)
                logger.info('%s-%s: %s', field_a['name'], field_b['name'], d)

                if d >= THRESHOLD:
                    if len_a == len_b:
                        relationship_type = "11"
                    elif (len_a > len_b):
                        relationship_type = "N1"
                    elif (len_a < len_b):
                        relationship_type = "1N"
                    else:
                        relationship_type = None
                else:
                    continue

                relationship = {
                    'source_dataset_id': dataset_a['id'],
                    'source_field_id': field_a['id'],
                    'target_dataset_id': dataset_b['id'],
                    'target_field_id': field_b['id'],
                    'source_dataset_name': dataset_a['title'],
                    'source_field_name': field_a['name'],
                    'target_dataset_name': dataset_b['title'],
                    'target_field_name': field_b['name'],
                    'distance': d,
                    'type': relationship_type
                }
                relationships.append(relationship)

    return relationships
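# Usage sketch for the helpers compute_relationships depends on. get_distance and
# THRESHOLD are defined elsewhere in the project; the versions below are assumptions,
# shown only to make the "d >= THRESHOLD" classification above concrete. A set-overlap
# (Jaccard) measure is one plausible choice.
def get_distance_sketch(unique_values_a, unique_values_b):
    '''Jaccard overlap between two collections of unique values (illustrative only).'''
    set_a, set_b = set(unique_values_a), set(unique_values_b)
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / float(len(set_a | set_b))

THRESHOLD_SKETCH = 0.25  # Hypothetical cutoff; the real THRESHOLD is configured elsewhere

# get_distance_sketch(['CA', 'NY', 'TX'], ['CA', 'NY']) -> 0.666..., so with the cutoff
# above these two fields would be reported as an "N1" relationship (3 vs. 2 unique values).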
def load_data(dependent_variable_name, independent_variables_names, interaction_term_ids, dataset_id, project_id):
    ''' Load DF and full field documents '''
    # Map variables to field documents
    all_fields = db_access.get_field_properties(project_id, dataset_id)
    interaction_terms = db_access.get_interaction_term_properties(interaction_term_ids)
    dependent_variable = next((f for f in all_fields if f['name'] == dependent_variable_name), None)

    independent_variables = []
    if independent_variables_names:
        independent_variables = get_full_field_documents_from_field_names(all_fields, independent_variables_names)
    else:
        for field in all_fields:
            if (not (field['general_type'] == 'c' and field['is_unique'])
                    and field['name'] != dependent_variable_name):
                independent_variables.append(field)

    # Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    # Drop NAs
    df_subset = df[[dependent_variable_name] + independent_variables_names]
    df_ready = df_subset.dropna(axis=0, how='all')

    return dependent_variable, independent_variables, interaction_terms, df_ready
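# Minimal sketch (an assumption, not the project's implementation) of the
# get_full_field_documents_from_field_names helper used above: it maps selected field
# names back to their full field-property documents, preserving the requested order and
# skipping names with no match.
def get_full_field_documents_from_field_names_sketch(all_fields, field_names):
    fields = []
    for name in field_names:
        matched = next((f for f in all_fields if f['name'] == name), None)
        if matched:
            fields.append(matched)
    return fields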
def save_field_properties(all_properties_result, dataset_id, project_id):
    ''' Upsert all field properties corresponding to a dataset '''
    logger.debug('In save_field_properties for dataset_id %s and project_id %s', dataset_id, project_id)

    all_properties = all_properties_result['result']
    field_properties_with_id = []

    for field_properties in all_properties:
        name = field_properties['name']

        existing_field_properties = db_access.get_field_properties(project_id, dataset_id, name=name)

        if existing_field_properties:
            field_properties = db_access.update_field_properties(project_id, dataset_id, **field_properties)
        else:
            field_properties = db_access.insert_field_properties(project_id, dataset_id, **field_properties)
        field_properties_with_id.append(field_properties)

    return {
        'desc': 'Saved %s field properties' % len(field_properties_with_id),
        'result': {
            'id': dataset_id
        }
    }
def get(self):
    args = fieldPropertiesGetParser.parse_args()
    project_id = args.get('project_id')
    dataset_id = args.get('dataset_id')
    group_by = args.get('group_by')

    has_project_access, auth_message = project_auth(project_id)
    if not has_project_access:
        return auth_message

    field_properties = db_access.get_field_properties(project_id, dataset_id)
    interaction_terms = db_access.get_interaction_terms(project_id, dataset_id)

    if group_by:
        result = {}
        for fp in field_properties:
            fp_group_by = fp[group_by]
            if fp_group_by in result:
                result[fp_group_by].append(fp)
            else:
                result[fp_group_by] = [fp]
    else:
        result = {'field_properties': field_properties}

    result['interactionTerms'] = interaction_terms

    return make_response(jsonify(result))
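# Illustrative sketch of the group_by behaviour above, with hypothetical field-property
# dicts: grouping a flat list by one of its keys (e.g. 'general_type') yields a dict of
# lists keyed by that property.
def group_field_properties_sketch(field_properties, group_by):
    grouped = {}
    for fp in field_properties:
        grouped.setdefault(fp[group_by], []).append(fp)
    return grouped

# group_field_properties_sketch(
#     [{'name': 'price', 'general_type': 'q'}, {'name': 'state', 'general_type': 'c'}],
#     'general_type')
# -> {'q': [{'name': 'price', 'general_type': 'q'}], 'c': [{'name': 'state', 'general_type': 'c'}]}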
def get_initial_regression_model_recommendation(
        project_id, dataset_id, dependent_variable_id=None,
        recommendation_type=MRT.LASSO.value, table_layout=MCT.LEAVE_ONE_OUT.value,
        data_size_cutoff=current_app.config['ANALYSIS_DATA_SIZE_CUTOFF'],
        categorical_value_limit=current_app.config['ANALYSIS_CATEGORICAL_VALUE_LIMIT']):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    if len(df) > data_size_cutoff:
        df = df.sample(data_size_cutoff)
    field_properties = db_access.get_field_properties(project_id, dataset_id)
    quantitative_field_properties = [ fp for fp in field_properties if fp['general_type'] == 'q' ]

    dependent_variable = (
        next((f for f in field_properties if f['id'] == dependent_variable_id), None)
        if dependent_variable_id
        else np.random.choice(quantitative_field_properties, size=1)[0]
    )

    independent_variables = []
    for fp in field_properties:
        if fp['name'] != dependent_variable['name']:
            if (fp['general_type'] == 'c' and (fp['is_unique'] or len(fp['unique_values']) > categorical_value_limit)):
                continue
            independent_variables.append(fp)

    recommendationTypeToFunction = {
        MRT.FORWARD_R2.value: forward_r2,
        MRT.LASSO.value: lasso,
        MRT.RFE.value: recursive_feature_elimination,
        MRT.FORWARD_F.value: f_regression
    }

    result = recommendationTypeToFunction[recommendation_type](df, dependent_variable, independent_variables)

    return {
        'recommended': True,
        'table_layout': table_layout,
        'recommendation_type': recommendation_type,
        'dependent_variable_id': dependent_variable['id'],
        'independent_variables_ids': [ x['id'] for x in result ],
    }
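# Self-contained sketch (with assumed data) of the dependent-variable fallback used
# above: when no dependent_variable_id is supplied, one quantitative field-property
# document is drawn at random via np.random.choice over the list of dicts.
import numpy as np

quantitative_fields_example = [
    {'id': 1, 'name': 'price', 'general_type': 'q'},
    {'id': 2, 'name': 'area', 'general_type': 'q'},
]
random_dependent_variable = np.random.choice(quantitative_fields_example, size=1)[0]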
def load_data(dependent_variable_name, independent_variables_names, interaction_term_ids, dataset_id, project_id): ''' Load DF and full field documents ''' # Map variables to field documents all_fields = db_access.get_field_properties(project_id, dataset_id) interaction_terms = db_access.get_interaction_term_properties(interaction_term_ids) dependent_variable = next((f for f in all_fields if f['name'] == dependent_variable_name), None) independent_variables = [] if independent_variables_names: independent_variables = get_full_field_documents_from_field_names(all_fields, independent_variables_names) else: for field in all_fields: if (not (field['general_type'] == 'c' and field['is_unique']) \ and field['name'] != dependent_variable_name): independent_variables.append(field) # 2) Access dataset df = get_data(project_id=project_id, dataset_id=dataset_id) # Drop NAs df_subset = df[[dependent_variable_name] + independent_variables_names] df_ready = df_subset.dropna(axis=0, how='all') return dependent_variable, independent_variables, interaction_terms, df_ready
def run_aggregation_from_spec(spec, project_id, config={}, conditionals=[]):
    aggregation_variables_names = spec.get('aggregationVariablesNames')
    dataset_id = spec.get('datasetId')
    dependent_variable_name = spec.get('dependentVariableName')
    weight_variable_name = config.get('weightVariableName')
    num_variables = len(aggregation_variables_names)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_field_properties = db_access.get_field_properties(project_id, dataset_id)
    aggregation_variables = [ next((fp for fp in all_field_properties if fp['name'] == n), None) for n in aggregation_variables_names ]
    dependent_variable = next((fp for fp in all_field_properties if fp['name'] == dependent_variable_name), None)

    # Copy so the list held by the spec is not mutated below
    subset_variables = list(aggregation_variables_names)
    if dependent_variable_name and dependent_variable_name != 'count':
        subset_variables += [ dependent_variable_name ]
    if weight_variable_name and weight_variable_name != 'UNIFORM':
        subset_variables += [ weight_variable_name ]
    subset_variables = get_unique(subset_variables, preserve_order=True)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ subset_variables ]
    df_ready = df_subset.dropna(how='all')  # Remove unclean

    result = {}
    if num_variables == 1:
        result['one_dimensional_contingency_table'] = create_one_dimensional_contingency_table(df_ready, aggregation_variables[0], dependent_variable, config=config)
    elif num_variables == 2:
        result['two_dimensional_contingency_table'] = create_contingency_table(df_ready, aggregation_variables, dependent_variable, config=config)

    return result, 200
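# Hypothetical spec and config shapes consumed by run_aggregation_from_spec (the dataset
# id, field names, and weight setting below are assumptions, not taken from the source):
example_aggregation_spec = {
    'datasetId': 42,
    'aggregationVariablesNames': ['state', 'year'],
    'dependentVariableName': 'price',
}
example_aggregation_config = {
    'weightVariableName': 'UNIFORM',
}
# run_aggregation_from_spec(example_aggregation_spec, project_id=1,
#     config=example_aggregation_config)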
def get_conditioned_data(project_id, dataset_id, df, conditional_arg):
    '''
    Given a data frame and a conditional dict ({ and: [{field_id, operation, criteria}], or: [...] }),
    return the conditioned data frame with the same columns as the original.

    TODO Turn this into an argument of the get_data function
    '''
    full_conditional = {}
    and_clause_list = conditional_arg.get('and')
    or_clause_list = conditional_arg.get('or')
    if not (and_clause_list or or_clause_list):
        return df

    desired_keys = ['general_type', 'name', 'id']
    raw_field_properties = db_access.get_field_properties(project_id, dataset_id)
    all_field_properties = [{ k: field[k] for k in desired_keys } for field in raw_field_properties]

    query_strings = {
        'and': '',
        'or': ''
    }
    orig_cols = df.columns.tolist()
    safe_df = df.rename(columns=make_safe_string)

    if and_clause_list:
        for c in and_clause_list:
            field = next((field for field in all_field_properties if c['field_id'] == field['id']), None)
            if field and c['criteria'] is not None:
                clause = _construct_conditional_clause(field, c['operation'], c['criteria'])
                query_strings['and'] = query_strings['and'] + ' & ' + clause

    if or_clause_list:
        for c in or_clause_list:
            field = next((field for field in all_field_properties if c['field_id'] == field['id']), None)
            if field and c['criteria'] is not None:
                clause = _construct_conditional_clause(field, c['operation'], c['criteria'])
                query_strings['or'] = query_strings['or'] + ' | ' + clause

    query_strings['and'] = query_strings['and'].strip(' & ')
    query_strings['or'] = query_strings['or'].strip(' | ')

    # Concatenate
    final_query_string = ''
    if query_strings['and'] and query_strings['or']:
        final_query_string = '%s | %s' % (query_strings['and'], query_strings['or'])
    elif query_strings['and'] and not query_strings['or']:
        final_query_string = query_strings['and']
    elif query_strings['or'] and not query_strings['and']:
        final_query_string = query_strings['or']

    if not final_query_string:
        return df

    conditioned_df = safe_df.query(final_query_string)
    conditioned_df.columns = orig_cols

    return conditioned_df
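# Hypothetical conditional_arg matching the structure described in the docstring above
# (field ids, operations, and criteria values are assumptions):
example_conditionals = {
    'and': [
        {'field_id': 3, 'operation': '>', 'criteria': 1000},
    ],
    'or': [
        {'field_id': 5, 'operation': '==', 'criteria': 'CA'},
    ],
}
# conditioned_df = get_conditioned_data(project_id=1, dataset_id=42, df=df,
#     conditional_arg=example_conditionals)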
def get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config, df=None, precomputed={}, data_formats=['visualize', 'table', 'score']):
    '''
    Returns a dictionary containing data corresponding to spec (in automated-viz
    structure), and all necessary information to interpret data.

    There are three types of formats:
        Score: a dict of lists for scoring
        Visualize: a list of dicts (collection)
        Table: {columns: list, data: matrix}

    Args: spec, dataset_id, project_id, format (list of 'score', 'visualize', or 'table')
    Returns: data specified by spec, in specified format
    '''
    for f in data_formats:
        if f not in [u'score', u'visualize', u'table', u'count']:
            raise ValueError('Passed incorrect data format', f)
    final_data = dict([(f, {}) for f in data_formats])

    gp = spec['generating_procedure']
    args = spec['args']
    dataset_id = spec['dataset_id']

    logger.debug('Generating Procedure: %s', gp)
    logger.debug('Arguments: %s', args)
    start_time = time()

    if df is None:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    id_fields = [ fp for fp in db_access.get_field_properties(project_id, dataset_id) if fp['is_id'] ]

    generating_procedure_to_data_function = {
        GeneratingProcedure.AGG.value: get_agg_data,
        GeneratingProcedure.IND_VAL.value: get_ind_val_data,
        GeneratingProcedure.BIN_AGG.value: get_bin_agg_data,
        GeneratingProcedure.MULTIGROUP_COUNT.value: get_multigroup_count_data,
        GeneratingProcedure.MULTIGROUP_AGG.value: get_multigroup_agg_data,
        GeneratingProcedure.VAL_BOX.value: get_val_box_data,
        GeneratingProcedure.VAL_AGG.value: get_val_agg_data,
        GeneratingProcedure.VAL_VAL.value: get_raw_comparison_data,
        GeneratingProcedure.VAL_COUNT.value: get_val_count_data,
        GeneratingProcedure.AGG_AGG.value: get_agg_agg_data,
    }

    data = generating_procedure_to_data_function[gp](
        df,
        args,
        id_fields=id_fields,
        precomputed=precomputed,
        config=config,
        data_formats=data_formats
    )

    logger.debug('Data for %s: %s', gp, time() - start_time)
    return data
def get_data(project_id=None, dataset_id=None, nrows=None, field_properties=[]):
    if IMD.hasData(dataset_id):
        logger.debug('Accessing from IMD, project_id: %s, dataset_id: %s', project_id, dataset_id)
        df = IMD.getData(dataset_id)
        return df

    dataset = db_access.get_dataset(project_id, dataset_id)
    dialect = dataset['dialect']
    encoding = dataset.get('encoding', 'utf-8')

    if dataset['storage_type'] == 's3':
        if dataset['preloaded']:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="-1/%s" % dataset['file_name']
            )
        else:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="%s/%s" % (str(project_id), dataset['file_name'])
            )
        accessor = file_obj['Body']

    if dataset['storage_type'] == 'file':
        accessor = dataset['path']

    if not field_properties:
        field_properties = db_access.get_field_properties(project_id, dataset_id)

    df = pd.read_table(
        accessor,
        error_bad_lines=False,
        encoding=encoding,
        skiprows=dataset['offset'],
        sep=dialect['delimiter'],
        engine='c',
        # dtype=field_to_type_mapping,
        escapechar=dialect['escapechar'],
        doublequote=dialect['doublequote'],
        quotechar=dialect['quotechar'],
        parse_dates=True,
        nrows=nrows,
        thousands=','
    )

    sanitized_df = sanitize_df(df)
    coerced_df = coerce_types(sanitized_df, field_properties)
    IMD.insertData(dataset_id, coerced_df)
    return coerced_df
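# Sketch of the in-memory cache interface that get_data relies on. IMD is the project's
# cache object; this stand-in only illustrates the hasData/getData/insertData contract
# and is an assumption, not the real implementation.
class InMemoryDataCacheSketch(object):
    def __init__(self):
        self._frames = {}

    def hasData(self, dataset_id):
        return dataset_id in self._frames

    def getData(self, dataset_id):
        return self._frames[dataset_id]

    def insertData(self, dataset_id, df):
        self._frames[dataset_id] = df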
def get_full_fields_for_conditionals(conditionals, dataset_id, project_id):
    conditionals_with_full_docs = {'and': [], 'or': []}
    field_properties = db_access.get_field_properties(project_id, dataset_id)

    for clause, conditional_list in conditionals.items():
        for conditional in conditional_list:
            new_conditional = {
                'operation': conditional['operation'],
                'criteria': conditional['criteria']
            }
            matched_field_doc = next((f for f in field_properties if f['id'] == conditional['field_id']), None)
            new_conditional['field'] = {
                'general_type': matched_field_doc['general_type'],
                'name': matched_field_doc['name']
            }
            conditionals_with_full_docs[clause].append(new_conditional)

    return conditionals_with_full_docs
def run_comparison_from_spec(spec, project_id):
    # 1) Parse and validate arguments
    indep = spec.get('indep', [])
    dep = spec.get('dep', [])
    dataset_id = spec.get('dataset_id')
    test = spec.get('test', 'ttest')
    if not (dataset_id and dep):
        return 'Not passed required parameters', 400

    fields = db_access.get_field_properties(project_id, dataset_id)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = df.dropna()  # Remove unclean

    # 3) Run test based on parameters and arguments
    comparison_result = run_comparison(df, fields, indep, dep, test)
    return {'data': comparison_result}, 200
def run_comparison_from_spec(spec, project_id, conditionals=[]):
    dependent_variables_names = spec.get('dependentVariablesNames', [])
    independent_variables_names = spec.get('independentVariablesNames', [])  # [ iv[1] for iv in independent_variables ]
    dataset_id = spec.get('datasetId')
    significance_cutoff = spec.get('significanceCutoff', 0.05)
    independence = spec.get('independence', True)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_fields = db_access.get_field_properties(project_id, dataset_id)
    dependent_variables = [ f for f in all_fields if f['name'] in dependent_variables_names ]
    independent_variables = [ f for f in all_fields if f['name'] in independent_variables_names ]

    can_run_numerical_comparison_independent = len([ iv for iv in independent_variables if iv['scale'] == 'continuous' ]) >= 2 and len(dependent_variables_names) == 0
    can_run_numerical_comparison_dependent = len([ dv for dv in dependent_variables if dv['scale'] == 'continuous' ]) >= 2 and len(independent_variables_names) == 0
    can_run_numerical_comparison = (can_run_numerical_comparison_dependent or can_run_numerical_comparison_independent)

    can_run_anova = (len(dependent_variables) and len(independent_variables))

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ dependent_variables_names + independent_variables_names ]
    df_ready = df_subset.dropna(how='any')  # Remove unclean

    result = {}
    NUM_GROUPS_CUTOFF = 15
    if can_run_anova:
        anova = run_anova(df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        anova_boxplot_data = get_anova_boxplot_data(project_id, dataset_id, df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        pairwise_comparison_data = get_pairwise_comparison_data(df_ready, independent_variables_names, dependent_variables_names, significance_cutoff=significance_cutoff, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        result.update({
            'anova': anova,
            'anova_boxplot': anova_boxplot_data,
            'pairwise_comparison': pairwise_comparison_data,
        })

    if can_run_numerical_comparison:
        if can_run_numerical_comparison_independent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, independent_variables_names, independence=True)
        if can_run_numerical_comparison_dependent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, dependent_variables_names, independence=False)
        result['numerical_comparison'] = numerical_comparison_data

    return result, 200
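# Hypothetical spec for run_comparison_from_spec above (field names and values are
# assumptions): one dependent and two independent variables triggers the ANOVA branch.
example_comparison_spec = {
    'datasetId': 42,
    'dependentVariablesNames': ['price'],
    'independentVariablesNames': ['state', 'year'],
    'significanceCutoff': 0.05,
    'independence': True,
}
# result, status_code = run_comparison_from_spec(example_comparison_spec, project_id=1)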
def enumerate_viz_specs(project_id, dataset_id, selected_fields, recommendation_types=[], spec_limit=None, expanded_spec_limit=20):
    '''
    TODO Move key filtering to the db query
    TODO Incorporate 0D and 1D data returns
    '''
    specs = []
    num_selected_fields = len(selected_fields)

    # Get field properties
    desired_keys = [
        'is_id', 'is_unique', 'general_type', 'type', 'scale', 'name', 'id', 'contiguous'
    ]
    raw_field_properties = db_access.get_field_properties(project_id, dataset_id, is_id=False)
    field_properties = [{k: field[k] for k in desired_keys} for field in raw_field_properties]

    if selected_fields:
        selected_field_docs, c_fields, c_fields_not_selected, q_fields, q_fields_not_selected, t_fields, t_fields_not_selected = \
            get_selected_fields(field_properties, selected_fields)

        if 'baseline' in recommendation_types:
            baseline_viz_specs = get_baseline_viz_specs(selected_field_docs)
            specs.extend([ dict(s, recommendation_type='baseline') for s in baseline_viz_specs ])

        if 'subset' in recommendation_types:
            subset_viz_specs = get_subset_viz_specs(c_fields, q_fields, t_fields, c_fields_not_selected, q_fields_not_selected, t_fields_not_selected)
            specs.extend([ dict(s, recommendation_type='subset') for s in subset_viz_specs ])

        if 'exact' in recommendation_types:
            exact_viz_specs = get_exact_viz_specs(c_fields, q_fields, t_fields, c_fields_not_selected, q_fields_not_selected, t_fields_not_selected)
            specs.extend([ dict(s, recommendation_type='exact') for s in exact_viz_specs ])

        if 'expanded' in recommendation_types:
            expanded_viz_specs = get_expanded_viz_specs(c_fields, q_fields, t_fields, c_fields_not_selected, q_fields_not_selected, t_fields_not_selected)
            if expanded_spec_limit:
                expanded_viz_specs = expanded_viz_specs[:expanded_spec_limit]
            specs.extend([ dict(s, recommendation_type='expanded') for s in expanded_viz_specs ])

    else:
        if 'exact' in recommendation_types:
            baseline_viz_specs = get_baseline_viz_specs(field_properties)
            specs.extend([ dict(s, recommendation_type='exact') for s in baseline_viz_specs ])

    # Deduplicate
    specs = get_list_of_unique_dicts(specs)

    # Limit number of specs
    if spec_limit:
        specs = specs[:spec_limit]

    # Assign viz_types and dataset_id
    for spec in specs:
        spec['dataset_id'] = dataset_id

    logger.info('Number of unique specs: %s', len(specs))

    return specs
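# Minimal sketch (an assumption, not the project's helper) of the get_list_of_unique_dicts
# deduplication step used above: keep the first occurrence of each spec, comparing dicts
# by a stable serialization since dicts are not hashable.
import json

def get_list_of_unique_dicts_sketch(list_of_dicts):
    seen = set()
    unique = []
    for d in list_of_dicts:
        key = json.dumps(d, sort_keys=True, default=str)
        if key not in seen:
            seen.add(key)
            unique.append(d)
    return unique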