def join_datasets(project_id, left_dataset_id, right_dataset_id, on, left_on, right_on, how, left_suffix, right_suffix, new_dataset_name_prefix):
    left_df = get_data(project_id=project_id, dataset_id=left_dataset_id)
    right_df = get_data(project_id=project_id, dataset_id=right_dataset_id)
    project = db_access.get_project(project_id)
    original_left_dataset = db_access.get_dataset(project_id, left_dataset_id)
    original_right_dataset = db_access.get_dataset(project_id, right_dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_left_dataset_title = original_left_dataset['title']
    original_right_dataset_title = original_right_dataset['title']
    # Fallback title combines truncated left and right titles
    fallback_title = original_left_dataset_title[:20] + original_right_dataset_title[:20]
    original_dataset_title = original_left_dataset_title + original_right_dataset_title
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    left_columns = left_df.columns.values
    right_columns = right_df.columns.values
    # Map join-column indices to column names
    on = list_elements_from_indices(left_columns, on)

    # Not using left_on or right_on for now
    df_joined = left_df.merge(right_df, how=how, on=on, suffixes=[left_suffix, right_suffix])

    return df_joined, new_dataset_title, new_dataset_name, new_dataset_path
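# Illustrative sketch (not part of the module): how pandas resolves overlapping
# column names in the merge above via the `suffixes` argument. The sample frames
# and suffix values are hypothetical.
import pandas as pd

left = pd.DataFrame({'id': [1, 2], 'value': [10, 20]})
right = pd.DataFrame({'id': [1, 2], 'value': [0.1, 0.2]})

# Columns present in both frames (other than the join key) receive the suffixes,
# so the result has 'value_left' and 'value_right'.
joined = left.merge(right, how='inner', on=['id'], suffixes=['_left', '_right'])
print(joined.columns.tolist())  # ['id', 'value_left', 'value_right']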
def get_initial_regression_model_recommendation(project_id, dataset_id, dependent_variable_id=None, recommendation_type=MRT.LASSO.value, table_layout=MCT.LEAVE_ONE_OUT.value, data_size_cutoff=current_app.config['ANALYSIS_DATA_SIZE_CUTOFF'], categorical_value_limit=current_app.config['ANALYSIS_CATEGORICAL_VALUE_LIMIT']):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    if len(df) > data_size_cutoff:
        df = df.sample(data_size_cutoff)
    field_properties = db_access.get_field_properties(project_id, dataset_id)
    quantitative_field_properties = [ fp for fp in field_properties if fp['general_type'] == 'q' ]

    # Use the requested dependent variable, or pick a random quantitative field
    dependent_variable = next((f for f in field_properties if f['id'] == dependent_variable_id), None) \
        if dependent_variable_id \
        else np.random.choice(quantitative_field_properties, size=1)[0]

    independent_variables = []
    for fp in field_properties:
        if fp['name'] != dependent_variable['name']:
            # Skip unique or high-cardinality categorical fields
            if fp['general_type'] == 'c' and (fp['is_unique'] or len(fp['unique_values']) > categorical_value_limit):
                continue
            independent_variables.append(fp)

    recommendationTypeToFunction = {
        MRT.FORWARD_R2.value: forward_r2,
        MRT.LASSO.value: lasso,
        MRT.RFE.value: recursive_feature_elimination,
        MRT.FORWARD_F.value: f_regression
    }

    result = recommendationTypeToFunction[recommendation_type](df, dependent_variable, independent_variables)

    return {
        'recommended': True,
        'table_layout': table_layout,
        'recommendation_type': recommendation_type,
        'dependent_variable_id': dependent_variable['id'],
        'independent_variables_ids': [ x['id'] for x in result ],
    }
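# Illustrative sketch (an assumption, not the module's `lasso` helper): selecting
# predictors whose LASSO coefficients are non-zero with scikit-learn. The helper
# function name and its arguments are hypothetical.
from sklearn.linear_model import LassoCV

def lasso_feature_selection_sketch(df, dependent_name, independent_names):
    # Restrict to the relevant columns and drop incomplete rows before fitting
    subset = df[[dependent_name] + independent_names].dropna()
    X = subset[independent_names].values
    y = subset[dependent_name].values
    model = LassoCV(cv=5).fit(X, y)
    # Variables with non-zero coefficients are the recommended predictors
    return [name for name, coef in zip(independent_names, model.coef_) if coef != 0]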
def run_aggregation_from_spec(spec, project_id, config={}, conditionals=[]):
    aggregation_variables_names = spec.get('aggregationVariablesNames')
    dataset_id = spec.get('datasetId')
    dependent_variable_name = spec.get('dependentVariableName')
    weight_variable_name = config.get('weightVariableName')
    num_variables = len(aggregation_variables_names)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_field_properties = db_access.get_field_properties(project_id, dataset_id)
    aggregation_variables = [ next((fp for fp in all_field_properties if fp['name'] == n), None) for n in aggregation_variables_names ]
    dependent_variable = next((fp for fp in all_field_properties if fp['name'] == dependent_variable_name), None)

    # Copy so that the appends below do not mutate the list held by the spec
    subset_variables = list(aggregation_variables_names)
    if dependent_variable_name and dependent_variable_name != 'count':
        subset_variables += [ dependent_variable_name ]
    if weight_variable_name and weight_variable_name != 'UNIFORM':
        subset_variables += [ weight_variable_name ]
    subset_variables = get_unique(subset_variables, preserve_order=True)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ subset_variables ]
    df_ready = df_subset.dropna(how='all')  # Remove rows that are entirely empty

    result = {}
    if num_variables == 1:
        result['one_dimensional_contingency_table'] = create_one_dimensional_contingency_table(df_ready, aggregation_variables[0], dependent_variable, config=config)
    elif num_variables == 2:
        result['two_dimensional_contingency_table'] = create_contingency_table(df_ready, aggregation_variables, dependent_variable, config=config)

    return result, 200
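# Illustrative sketch (not the module's create_contingency_table helper): a
# two-dimensional contingency table with pandas.crosstab, on hypothetical data.
import pandas as pd

sample = pd.DataFrame({
    'color': ['red', 'red', 'blue', 'blue'],
    'shape': ['circle', 'square', 'circle', 'circle'],
})
# Counts of observations per (color, shape) combination
table = pd.crosstab(sample['color'], sample['shape'])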
def post(self, field_id):
    args = fieldPostParser.parse_args()
    project_id = args.get('project_id')
    dataset_id = args.get('dataset_id')
    field_type = args.get('type')
    field_is_id = args.get('isId')
    field_color = args.get('color')

    field_property_document = None  # Avoid NameError if no update is requested
    if field_type:
        if (field_type not in quantitative_types) \
            and (field_type not in categorical_types) \
            and (field_type not in temporal_types):
            return make_response(jsonify({'status': 'Invalid field type.'}))

        general_type = specific_type_to_general_type[field_type]
        field_property = db_access.get_field_property(project_id, dataset_id, field_id)
        field_name = field_property['name']
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        updated_properties = compute_single_field_property_nontype(field_name, df[field_name], field_type, general_type)
        field_property_document = \
            db_access.update_field_properties_type_by_id(project_id, field_id, field_type, general_type, updated_properties)

    if field_is_id is not None:
        field_property_document = \
            db_access.update_field_properties_is_id_by_id(project_id, field_id, field_is_id)

    if field_color is not None:
        field_property_document = \
            db_access.update_field_properties_color_by_id(project_id, field_id, field_color)

    return make_response(jsonify(field_property_document))
def unpivot_dataset(project_id, dataset_id, pivot_fields, variable_name, value_name, new_dataset_name_prefix):
    ''' Returns unpivoted dataframe '''
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    columns = df.columns.values
    pivot_fields = list_elements_from_indices(columns, pivot_fields)
    preserved_fields = difference_of_lists(columns, pivot_fields)
    df_unpivoted = pd.melt(df, id_vars=preserved_fields, value_vars=pivot_fields, var_name=variable_name, value_name=value_name)

    return df_unpivoted, new_dataset_title, new_dataset_name, new_dataset_path
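# Illustrative sketch (not part of the module): what pd.melt does to a wide
# table. The frame and field names are hypothetical.
import pandas as pd

wide = pd.DataFrame({'country': ['US', 'FR'], '2019': [1, 2], '2020': [3, 4]})
# 'country' is preserved; the year columns are unpivoted into variable/value pairs
long = pd.melt(wide, id_vars=['country'], value_vars=['2019', '2020'],
               var_name='year', value_name='amount')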
def load_data(dependent_variable_name, independent_variables_names, interaction_term_ids, dataset_id, project_id):
    ''' Load DF and full field documents '''
    # Map variables to field documents
    all_fields = db_access.get_field_properties(project_id, dataset_id)
    interaction_terms = db_access.get_interaction_term_properties(interaction_term_ids)
    dependent_variable = next((f for f in all_fields if f['name'] == dependent_variable_name), None)

    independent_variables = []
    if independent_variables_names:
        independent_variables = get_full_field_documents_from_field_names(all_fields, independent_variables_names)
    else:
        for field in all_fields:
            if (not (field['general_type'] == 'c' and field['is_unique']) \
                and field['name'] != dependent_variable_name):
                independent_variables.append(field)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    # Drop NAs; subset on the resolved field documents so the fallback branch
    # above is also covered when no independent variable names were passed
    independent_names = [ iv['name'] for iv in independent_variables ]
    df_subset = df[[dependent_variable_name] + independent_names]
    df_ready = df_subset.dropna(axis=0, how='all')

    return dependent_variable, independent_variables, interaction_terms, df_ready
def get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config, df=None, precomputed={}, data_formats=['visualize', 'table', 'score']):
    '''
    Returns a dictionary containing data corresponding to spec (in automated-viz
    structure), and all necessary information to interpret data.

    There are three types of formats:
        Score: a dict of lists for scoring
        Visualize: a list of dicts (collection)
        Table: {columns: list, data: matrix}

    Args: spec, dataset_id, project_id, format (list of 'score', 'visualize', or 'table')
    Returns: data specified by spec, in specified format
    '''
    for f in data_formats:
        if f not in [u'score', u'visualize', u'table', u'count']:
            raise ValueError('Passed incorrect data format', f)
    final_data = dict([(f, {}) for f in data_formats])

    gp = spec['generating_procedure']
    args = spec['args']
    dataset_id = spec['dataset_id']

    logger.debug('Generating Procedure: %s', gp)
    logger.debug('Arguments: %s', args)
    start_time = time()

    if df is None:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    id_fields = [ fp for fp in db_access.get_field_properties(project_id, dataset_id) if fp['is_id'] ]

    # Dispatch table from generating procedure to its data function
    generating_procedure_to_data_function = {
        GeneratingProcedure.AGG.value: get_agg_data,
        GeneratingProcedure.IND_VAL.value: get_ind_val_data,
        GeneratingProcedure.BIN_AGG.value: get_bin_agg_data,
        GeneratingProcedure.MULTIGROUP_COUNT.value: get_multigroup_count_data,
        GeneratingProcedure.MULTIGROUP_AGG.value: get_multigroup_agg_data,
        GeneratingProcedure.VAL_BOX.value: get_val_box_data,
        GeneratingProcedure.VAL_AGG.value: get_val_agg_data,
        GeneratingProcedure.VAL_VAL.value: get_raw_comparison_data,
        GeneratingProcedure.VAL_COUNT.value: get_val_count_data,
        GeneratingProcedure.AGG_AGG.value: get_agg_agg_data,
    }

    data = generating_procedure_to_data_function[gp](df, args,
        id_fields=id_fields,
        precomputed=precomputed,
        config=config,
        data_formats=data_formats
    )

    logger.debug('Data for %s: %s', gp, time() - start_time)
    return data
def run_correlation_from_spec(spec, project_id, conditionals=[]):
    dataset_id = spec.get("datasetId")
    correlation_variables = spec.get("correlationVariables")
    correlation_variables_names = correlation_variables

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df[ correlation_variables_names ]
    df_ready = df_subset.dropna(how='all')

    correlation_result = run_correlation(df_ready, correlation_variables)
    correlation_scatterplots = get_correlation_scatterplot_data(df_ready, correlation_variables)

    return {
        'table': correlation_result,
        'scatterplots': correlation_scatterplots
    }, 200
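# Illustrative sketch (not the module's run_correlation helper): a pairwise
# Pearson correlation table with pandas, using hypothetical columns.
import pandas as pd

sample = pd.DataFrame({'height': [1.6, 1.7, 1.8], 'weight': [55, 70, 80]})
correlations = sample[['height', 'weight']].corr(method='pearson')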
def attach_data_to_viz_specs(enumerated_viz_specs, dataset_id, project_id, conditionals, config, data_formats=['score', 'table', 'visualize', 'count']):
    ''' Get data corresponding to each viz spec (before filtering and scoring) '''
    viz_specs_with_data = []
    start_time = time()
    TIME_CUTOFF = 10

    # Get dataframe
    if project_id and dataset_id:
        df = get_data(project_id=project_id, dataset_id=dataset_id)
        conditioned_df = get_conditioned_data(project_id, dataset_id, df, conditionals)

    precomputed = {
        'groupby': {}
    }

    for i, spec in enumerate(enumerated_viz_specs):
        if (time() - start_time) > TIME_CUTOFF:
            continue
        viz_spec_with_data = spec

        # TODO Move this into another function
        if spec['args'].get('grouped_field'):
            grouped_field = spec['args']['grouped_field']['name']
            grouped_df = conditioned_df.groupby(grouped_field)
            precomputed['groupby'][grouped_field] = grouped_df

        try:
            data = get_viz_data_from_enumerated_spec(spec, project_id, conditionals, config,
                df=conditioned_df,
                precomputed=precomputed,
                data_formats=data_formats
            )
        except Exception as e:
            logger.error("Error getting viz data %s", e, exc_info=True)
            continue

        if not data:
            logger.info('No data for spec with generating procedure %s', spec['generating_procedure'])
            continue

        viz_spec_with_data['data'] = data
        viz_specs_with_data.append(viz_spec_with_data)

    logger.debug('Attaching data to %s specs took %.3fs', len(viz_specs_with_data), time() - start_time)
    return viz_specs_with_data
def run_comparison_from_spec(spec, project_id):
    # 1) Parse and validate arguments
    indep = spec.get('indep', [])
    dep = spec.get('dep', [])
    dataset_id = spec.get('dataset_id')
    test = spec.get('test', 'ttest')
    if not (dataset_id and dep):
        return 'Not passed required parameters', 400

    fields = db_access.get_field_properties(project_id, dataset_id)

    # 2) Access dataset
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df = df.dropna()  # Remove unclean

    # 3) Run test based on parameters and arguments
    comparison_result = run_comparison(df, fields, indep, dep, test)
    return {
        'data': comparison_result
    }, 200
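# Illustrative sketch (not the module's run_comparison helper): the default
# 'ttest' comparison as an independent two-sample t-test with SciPy, on
# hypothetical groups.
from scipy import stats

group_a = [2.1, 2.5, 2.3, 2.7]
group_b = [1.8, 2.0, 1.9, 2.2]
t_statistic, p_value = stats.ttest_ind(group_a, group_b)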
def compute_dataset_properties(dataset_id, project_id, path=None):
    ''' Compute and return dictionary containing whole-dataset properties '''
    logger.debug('compute_dataset_properties path: %s', path)
    if not path:
        dataset = db_access.get_dataset(project_id, dataset_id)
        path = dataset['path']
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    n_rows, n_cols = df.shape
    field_names = df.columns.values.tolist()

    # field_types = []
    # for (i, field_name) in enumerate(df):
    #     logger.debug('Calculating types for field %s', field_name)
    #     field_values = df[field_name]
    #     field_type, field_type_scores = calculate_field_type(field_name, field_values, i, n_cols)
    #     field_types.append(field_type)

    # Forgoing time series detection for now (expensive)
    # time_series = detect_time_series(df, field_types)
    # if time_series:
    #     time_series = True
    time_series = False

    structure = 'wide' if time_series else 'long'

    properties = {
        'n_rows': n_rows,
        'n_cols': n_cols,
        'field_names': field_names,
        # 'field_types': field_types,
        'field_accessors': [ i for i in range(0, n_cols) ],
        'structure': structure,
        'is_time_series': time_series,
    }

    return {
        'desc': 'Done computing dataset properties',
        'result': properties,
    }
def run_comparison_from_spec(spec, project_id, conditionals=[]):
    dependent_variables_names = spec.get('dependentVariablesNames', [])
    independent_variables_names = spec.get('independentVariablesNames', [])  # [ iv[1] for iv in independent_variables ]
    dataset_id = spec.get('datasetId')
    significance_cutoff = spec.get('significanceCutoff', 0.05)
    independence = spec.get('independence', True)

    if not dataset_id:
        return 'Not passed required parameters', 400

    all_fields = db_access.get_field_properties(project_id, dataset_id)
    dependent_variables = [ f for f in all_fields if f['name'] in dependent_variables_names ]
    independent_variables = [ f for f in all_fields if f['name'] in independent_variables_names ]

    # Numerical comparison requires at least two continuous variables on one side only
    can_run_numerical_comparison_independent = len([ iv for iv in independent_variables if iv['scale'] == 'continuous' ]) >= 2 and len(dependent_variables_names) == 0
    can_run_numerical_comparison_dependent = len([ dv for dv in dependent_variables if dv['scale'] == 'continuous' ]) >= 2 and len(independent_variables_names) == 0
    can_run_numerical_comparison = (can_run_numerical_comparison_dependent or can_run_numerical_comparison_independent)

    can_run_anova = (len(dependent_variables) and len(independent_variables))

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    df_conditioned = get_conditioned_data(project_id, dataset_id, df, conditionals)
    df_subset = df_conditioned[ dependent_variables_names + independent_variables_names ]
    df_ready = df_subset.dropna(how='any')  # Remove unclean

    result = {}
    NUM_GROUPS_CUTOFF = 15
    if can_run_anova:
        anova = run_anova(df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        anova_boxplot_data = get_anova_boxplot_data(project_id, dataset_id, df_ready, independent_variables_names, dependent_variables_names, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        pairwise_comparison_data = get_pairwise_comparison_data(df_ready, independent_variables_names, dependent_variables_names, significance_cutoff=significance_cutoff, NUM_GROUPS_CUTOFF=NUM_GROUPS_CUTOFF)
        result.update({
            'anova': anova,
            'anova_boxplot': anova_boxplot_data,
            'pairwise_comparison': pairwise_comparison_data,
        })

    if can_run_numerical_comparison:
        if can_run_numerical_comparison_independent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, independent_variables_names, independence=True)
        if can_run_numerical_comparison_dependent:
            numerical_comparison_data = run_valid_numerical_comparison_tests(df_ready, dependent_variables_names, independence=False)
        result['numerical_comparison'] = numerical_comparison_data

    return result, 200
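# Illustrative sketch (not the module's run_anova helper): a one-way ANOVA across
# groups of a categorical independent variable, using SciPy and hypothetical data.
from scipy import stats

sample_a = [3.1, 3.4, 2.9]
sample_b = [4.0, 4.2, 3.8]
sample_c = [2.5, 2.7, 2.6]
f_statistic, p_value = stats.f_oneway(sample_a, sample_b, sample_c)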
def reduce_dataset(project_id, dataset_id, column_ids_to_keep, new_dataset_name_prefix):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    # Keep only the columns at the requested integer positions
    df_reduced = df.iloc[:, column_ids_to_keep]

    return df_reduced, new_dataset_title, new_dataset_name, new_dataset_path
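# Illustrative sketch (not part of the module): positional column selection as
# used above -- column_ids_to_keep are integer positions, not names.
import pandas as pd

sample = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
reduced = sample.iloc[:, [0, 2]]  # keeps columns 'a' and 'c'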
def compute_all_field_properties(dataset_id, project_id, should_detect_hierarchical_relationships=True, track_started=True):
    '''
    Compute field properties of a specific dataset
    Currently only getting properties by column

    Arguments: project_id + dataset ids
    Returns a mapping from dataset_ids to properties
    '''
    logger.debug("Computing field properties for dataset_id %s", dataset_id)

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    num_fields = len(df.columns)
    field_properties = [ {} for i in range(num_fields) ]

    palette = total_palette + [ '#007BD7' for i in range(0, num_fields - len(total_palette)) ]
    if num_fields <= len(total_palette):
        palette = sample_with_maximum_distance(total_palette, num_fields, random_start=True)

    # 1) Detect field types
    for (i, field_name) in enumerate(df):
        logger.info('[%s | %s] Detecting type for field %s', project_id, dataset_id, field_name)
        field_values = df[field_name]
        d = field_property_type_object = compute_single_field_property_type(field_name, field_values, field_position=i, num_fields=num_fields)
        field_properties[i].update({
            'index': i,
            'name': field_name,
        })
        field_properties[i].update(d)

    temporal_fields = [ fp for fp in field_properties if (fp['general_type'] == GDT.T.value) ]

    # Necessary to coerce here?
    coerced_df = coerce_types(df, field_properties)
    IMD.insertData(dataset_id, coerced_df)

    # 2) Rest
    for (i, field_name) in enumerate(coerced_df):
        field_values = coerced_df[field_name]
        d = field_properties_nontype_object = compute_single_field_property_nontype(
            field_name,
            field_values,
            field_properties[i]['type'],
            field_properties[i]['general_type'],
            df=coerced_df,
            temporal_fields=temporal_fields
        )
        field_properties[i].update({
            'color': palette[i],
            'children': [],
            'parents': [],
            'one_to_ones': [],
            'manual': {}
        })
        field_properties[i].update(d)

    if should_detect_hierarchical_relationships:
        hierarchical_relationships = detect_hierarchical_relationships(coerced_df, field_properties)
        MAX_UNIQUE_VALUES_THRESHOLD = 100
        for field_a, field_b in hierarchical_relationships:
            # A pair appearing in both directions is a one-to-one relationship
            if [ field_b, field_a ] in hierarchical_relationships:
                field_properties[field_properties.index(field_a)]['one_to_ones'].append(field_b['name'])
            else:
                field_properties[field_properties.index(field_a)]['children'].append(field_b['name'])
                field_properties[field_properties.index(field_b)]['parents'].append(field_a['name'])

    return {
        'desc': 'Done computing field properties for %s fields' % len(field_properties),
        'result': field_properties
    }