def test_3():
    """Check the top-k vs others suggestion for a negative ratio.

    The result table without the k-condition is hardcoded for the query -
    Top-4 Salespersons with maximum Sales.
    Here ratio = topk_sum / others_sum is negative (the top-4 rows sum to
    15 and the remaining rows sum to -13), and the string form of the
    returned suggestion is compared with the expected one.
    """
    # Artificial result table, assembled column by column.
    salespersons = pandas.Series(['A', 'B', 'C', 'D', 'E', 'F', 'G'])
    sales = pandas.Series([5, 4, 3, 3, -1, -2, -10])
    result_table = pandas.DataFrame()
    result_table['Salesperson'] = salespersons
    result_table['Sales'] = sales

    top_k = 4
    metric_column = 'Sales'

    # The data should be already sorted (descending) on the metric.
    assert result_table[metric_column].is_monotonic_decreasing

    suggestions = topk_vs_others.topk_vs_others(
        result_table, top_k, metric_column)
    print(suggestions)

    expected_repr = """{'change_list': {'topKLimit': 7}, 'suggestion': 'The sum of Sales in top-k rows is negative whereas sum of rows not in top-k is positive', 'confidence_score': -1.1538461538461537}"""
    assert str(suggestions) == expected_repr
def test_2():
    """Check that no suggestion is returned for a large positive ratio.

    The result table without the k-condition is hardcoded for the query -
    Top-4 Salespersons with maximum Sales.
    Here ratio = topk_sum / others_sum is positive and is more than the
    threshold, so topk_vs_others is expected to return None.
    """
    # Artificial result table, assembled column by column.
    salespersons = pandas.Series(['A', 'B', 'C', 'D', 'E', 'F', 'G'])
    sales = pandas.Series([5, 4, 3, 3, 3, 2, 1])
    result_table = pandas.DataFrame()
    result_table['Salesperson'] = salespersons
    result_table['Sales'] = sales

    top_k = 4
    metric_column = 'Sales'

    # The data should be already sorted (descending) on the metric.
    assert result_table[metric_column].is_monotonic_decreasing

    suggestions = topk_vs_others.topk_vs_others(
        result_table, top_k, metric_column)
    print(suggestions)

    expected_repr = """None"""
    assert str(suggestions) == expected_repr
def test_1():
    """Check the top-k vs others suggestion on a stored result table.

    The result table is read from
    data/data_for_test_topk_vs_others/result_table.csv and was generated
    for the query - Top-10 Item based on minimum unit costs.
    Here ratio = topk_sum / others_sum is positive and is less than the
    threshold, and the string form of the returned suggestion is compared
    with the expected one.
    """
    csv_path = 'data/data_for_test_topk_vs_others/result_table.csv'
    result_table = pandas.read_csv(csv_path)

    top_k = 10
    metric_column = 'UnitCost'

    # The data should be already sorted (ascending) on the metric.
    assert result_table[metric_column].is_monotonic_increasing

    suggestions = topk_vs_others.topk_vs_others(
        result_table, top_k, metric_column)
    print(suggestions)

    expected_repr = """{'oversight': <Oversights.TOPK_VS_OTHERS: 6>, 'change_list': {'topKLimit': 43}, 'suggestion': 'The rows NOT in the top-k have a much larger sum over UnitCost than the rows in top-k', 'confidence_score': 0.008621645877239863}"""
    assert str(suggestions) == expected_repr
def topk(table, metric, dimensions, is_asc, k, **kwargs):
    """Return the top-k results together with the debiasing suggestions.

    The suggestion list returned by topk_results is extended with the
    oversights detected here and then ranked using
    oversights_order.ORDER_IN_TOPK. If a summary operator is applied, the
    metric column of the result is renamed through
    aspects.update_metric_column_name (expected to give
    "<summary operator> of metric").

    Oversights that may be detected in top-k
    1. Regression to the mean (checked only when no duplicates are found)
    2. Looking at tails to find causes
    3. Duplicates in top-k
    4. More than just top-k
    5. Top-k vs others
    6. Top-k when less than k present

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Name of the column according to which we sort; when grouping
            has to be done, the summary operator is applied on it.
            Metric could be a column containing strings, if the count
            operator is applied on it.
        dimensions: Type-list of str
            Names of the columns we want.
            In query:'top 5 batsman according to runs', dimension is
            'batsman'. When summary_operator is not None, we group by
            dimensions.
        is_asc: Type-Bool
            Denotes the sort order, True for ascending, False for
            descending
        k: Type-int
            It is the number of entries to be taken
        date_range: Type-tuple (keyword, default None)
            Tuple of start_date and end_date
        date_column_name: Type-str (keyword, default 'date')
            It is the name of column which contains date
        day_first: Type-bool (keyword, default False)
            Denotes whether the day occurs before the month in the dates
            of the date column
            Example - '29-02-19', here day_first is true
        slices: Type-List of tuples (keyword, default None)
            Each tuple represents a condition to keep the row.
            (column_name, filter, value)
            column_name - the column that the condition is applied upon.
            filter - Filters enum members, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            (keyword, default None)
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM
            Note-summary_operator is always applied on the metric column
            passed, and only when grouping is done

    Returns:
        The function will return both the results and the suggestions in
        a tuple.
        (results, suggestions)
        results: Type - pandas dataframe, The results of the top-k intent
        suggestions: Type - List of dictionaries (suggestion structure),
            List of suggestions.
    """
    # Optional arguments forwarded unchanged to every helper that
    # re-evaluates the query.
    query_options = {
        'date_column_name': kwargs.get('date_column_name', 'date'),
        'date_range': kwargs.get('date_range', None),
        'day_first': kwargs.get('day_first', False),
        'slices': kwargs.get('slices', None),
        'summary_operator': kwargs.get('summary_operator', None),
    }

    topk_output = topk_results(table, metric, dimensions, is_asc, k,
                               **query_options)
    result_table = topk_output[0]
    suggestions = topk_output[1]

    duplicates_suggestion = duplicates_in_topk(result_table, dimensions)
    if duplicates_suggestion is None:
        # Check for RMT suggestion only when no duplicates present.
        rmt_suggestion = regression_to_mean(table, metric, dimensions,
                                            is_asc, k, **query_options)
        if rmt_suggestion is not None:
            suggestions.append(rmt_suggestion)
    else:
        suggestions.append(duplicates_suggestion)

    # k = -1 is passed to obtain the results without the k-condition.
    results_without_k_condition = topk_results(
        table, metric, dimensions, is_asc, -1, **query_options)[0]

    # These checks share one signature and all work on the results
    # without the k-condition; they are run in the listed order.
    for detect_oversight in (more_than_just_topk,
                             topk_vs_others,
                             looking_at_tails):
        detected = detect_oversight(results_without_k_condition, k, metric)
        if detected is not None:
            suggestions.append(detected)

    less_than_k_suggestion = topk_when_less_than_k_present(result_table, k)
    if less_than_k_suggestion is not None:
        suggestions.append(less_than_k_suggestion)

    ranking_order = oversights_order.ORDER_IN_TOPK
    suggestions = rank_oversights.rank_oversights(suggestions, ranking_order)

    applied_operator = query_options['summary_operator']
    if applied_operator is not None:
        result_table = aspects.update_metric_column_name(
            result_table, applied_operator, metric)

    return (result_table, suggestions)
def topk(table, metric, dimensions, is_asc, k, **kwargs):
    """Return the top-k results together with the debiasing suggestions.

    The oversights checked in this intent are-
    1. Duplicates in top-k
    2. Regression to the mean (checked only when no duplicates are found)
    3. More than just top-k
    4. Top-k vs others
    5. Looking at tails to find causes

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            Name of the column according to which we sort; when grouping
            has to be done, the summary operator is applied on it.
            Metric could be a column containing strings, if the count
            operator is applied on it.
        dimensions: Type-list of str
            Names of the columns we want.
            In query:'top 5 batsman according to runs', dimension is
            'batsman'. When summary_operator is not None, we group by
            dimensions.
        is_asc: Type-Bool
            Denotes the sort order, True for ascending, False for
            descending
        k: Type-int
            It is the number of entries to be taken
        date_range: Type-tuple (keyword, default None)
            Tuple of start_date and end_date
        date_column_name: Type-str (keyword, default 'date')
            It is the name of column which contains date
        date_format: Type-str (keyword, default '%Y-%m-%d')
            It is required by datetime.strptime to parse the date in the
            format
            Format Codes
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples (keyword, default None)
            Each tuple represents a condition to keep the row.
            (column_name, filter, value)
            column_name - the column that the condition is applied upon.
            filter - Filters enum members, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            (keyword, default None)
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM
            Note-summary_operator is always applied on the metric column
            passed, and only when grouping is done

    Returns:
        The function will return both the results and the suggestions in
        a tuple.
        (results, suggestions)
        results: Type - pandas dataframe, The results of the intended
            top-k
        suggestions: Type - List, the non-None suggestions returned by
            the oversight checks, in the order the checks were run.
    """
    # Optional arguments forwarded unchanged to every helper that
    # re-evaluates the query.
    query_options = {
        'date_column_name': kwargs.get('date_column_name', 'date'),
        'date_range': kwargs.get('date_range', None),
        'date_format': kwargs.get('date_format', '%Y-%m-%d'),
        'slices': kwargs.get('slices', None),
        'summary_operator': kwargs.get('summary_operator', None),
    }

    result_table = topk_results(table, metric, dimensions, is_asc, k,
                                **query_options)

    suggestions = []

    duplicates_suggestion = duplicates_in_topk(result_table, dimensions)
    if duplicates_suggestion is None:
        # Check for RMT suggestion only when no duplicates present.
        rmt_suggestion = regression_to_mean(table, metric, dimensions,
                                            is_asc, k, **query_options)
        if rmt_suggestion is not None:
            suggestions.append(rmt_suggestion)
    else:
        suggestions.append(duplicates_suggestion)

    # k = -1 is passed to obtain the results without the k-condition.
    results_without_k_condition = topk_results(
        table, metric, dimensions, is_asc, -1, **query_options)

    # These checks share one signature and all work on the results
    # without the k-condition; they are run in the listed order.
    for detect_oversight in (more_than_just_topk,
                             topk_vs_others,
                             looking_at_tails):
        detected = detect_oversight(results_without_k_condition, k, metric)
        if detected is not None:
            suggestions.append(detected)

    return (result_table, suggestions)