Example #1
0
def test_3():
    """
    Checks topk_vs_others on a hand-written result table (the results
    with the k-condition removed) for the query -
    Top-4 Salespersons with maximum Sales

    The top-4 rows sum to 15 and the remaining rows sum to -13, so the
    ratio = topk_sum / others_sum is negative and a suggestion is
    expected.
    """
    k = 4
    metric = 'Sales'

    # Artificial results table, written out by hand.
    result_table = pandas.DataFrame({
        'Salesperson': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
        'Sales': [5, 4, 3, 3, -1, -2, -10],
    })

    # Precondition: the rows are already sorted by the metric.
    assert result_table[metric].is_monotonic_decreasing

    suggestions = topk_vs_others.topk_vs_others(result_table, k, metric)
    print(suggestions)

    # The suggestion is compared through its string form.
    # NOTE(review): the message says the top-k sum is negative, while in
    # this table it is the non-top-k rows that sum to a negative value -
    # confirm the wording against topk_vs_others.
    expected = "{'change_list': {'topKLimit': 7}, 'suggestion': 'The sum of Sales in top-k rows is negative whereas sum of rows not in top-k is positive', 'confidence_score': -1.1538461538461537}"

    assert str(suggestions) == expected
Example #2
0
def test_2():
    """
    Checks topk_vs_others on a hand-written result table (the results
    with the k-condition removed) for the query -
    Top-4 Salespersons with maximum Sales

    The top-4 rows sum to 15 and the remaining rows sum to 6, so the
    ratio = topk_sum / others_sum is positive (2.5). This is expected to
    be above the threshold, so no suggestion is returned.
    """
    k = 4
    metric = 'Sales'

    # Artificial results table, written out by hand.
    result_table = pandas.DataFrame({
        'Salesperson': ['A', 'B', 'C', 'D', 'E', 'F', 'G'],
        'Sales': [5, 4, 3, 3, 3, 2, 1],
    })

    # Precondition: the rows are already sorted by the metric.
    assert result_table[metric].is_monotonic_decreasing

    suggestions = topk_vs_others.topk_vs_others(result_table, k, metric)
    print(suggestions)

    # No suggestion: the string form of the return value must be 'None'.
    expected = "None"

    assert str(suggestions) == expected
def test_1():
    """
    Checks topk_vs_others on the result table stored in
    data/data_for_test_topk_vs_others/result_table.csv for the query -
    Top-10 Item based on minimum unit costs.

    Here the top-k vs others suggestion is asserted when the
    ratio = topk_sum / others_sum is positive and is less than
    the threshold.
    """
    k = 10
    metric = 'UnitCost'

    table_path = 'data/data_for_test_topk_vs_others/result_table.csv'
    result_table = pandas.read_csv(table_path)

    # Precondition: "minimum unit costs" means the rows are sorted in
    # ascending order of the metric.
    assert result_table[metric].is_monotonic_increasing

    suggestions = topk_vs_others.topk_vs_others(result_table, k, metric)
    print(suggestions)

    # The suggestion is compared through its string form.
    expected = "{'oversight': <Oversights.TOPK_VS_OTHERS: 6>, 'change_list': {'topKLimit': 43}, 'suggestion': 'The rows NOT in the top-k have a much larger sum over UnitCost than the rows in top-k', 'confidence_score': 0.008621645877239863}"

    assert str(suggestions) == expected
Example #4
0
def topk(table, metric, dimensions, is_asc, k, **kwargs):
    """ Runs the top-k intent and collects the debiasing suggestions
    that apply to its result.

    When a summary operator is passed, the metric column of the returned
    table is renamed through aspects.update_metric_column_name
    (to "<summary operator> of metric").

    Oversights that may be detected in top-k
    1. Regression to the mean (checked only when no duplicates found)
    2. Looking at tails to find causes
    3. Duplicates in top-k
    4. More than just top-k
    5. Top-k vs others
    6. Top-k when less than k present

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which we sort,
            and in the case when grouping has to be done,
            summary operator is applied on metric. Metric could be a
            column containing strings, if we are applying count
            operator on it.
        dimensions: Type-list of str
            It is the name of column we want.
            In query:'top 5 batsman according to runs', dimension is 'batsman'.
            When summary_operator is not None, we group by dimensions.
        is_asc: Type-Bool
            Denotes the sort order, True for ascending, False for Descending
        k: Type-int
            It is the number of entries to be taken
        date_range: Type-tuple (keyword, default None)
            Tuple of start_date and end_date
        date_column_name: Type-str (keyword, default 'date')
            It is the name of column which contains date
        day_first: Type-Bool (keyword, default False)
            Denotes whether the day occurs before the month in the
            dates of the date column
            Example - '29-02-19', here day_first is true
        slices: Type-List of tuples (keyword, default None)
            Tuple represents the condition to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the
            condition is applied upon.
            filter - Filters enum members, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            (keyword, default None)
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        The function will return both suggestions and the results in a tuple.
        (results, suggestions)

        results: Type - pandas dataframe, The results of the top-k intent

        suggestions: Type - List of dictionaries(suggestion structure),
            the suggestions after ordering by
            rank_oversights.rank_oversights.
    """
    summary_operator = kwargs.get('summary_operator', None)

    # Keyword options forwarded unchanged to every helper that has to
    # re-run the query.
    query_options = {
        'date_column_name': kwargs.get('date_column_name', 'date'),
        'date_range': kwargs.get('date_range', None),
        'day_first': kwargs.get('day_first', False),
        'slices': kwargs.get('slices', None),
        'summary_operator': summary_operator,
    }

    # topk_results gives back the table at index 0 and its own list of
    # suggestions at index 1; further suggestions are appended to it.
    topk_output = topk_results(table, metric, dimensions, is_asc, k,
                               **query_options)
    result_table = topk_output[0]
    suggestions = topk_output[1]

    duplicates_suggestion = duplicates_in_topk(result_table, dimensions)

    if duplicates_suggestion is None:
        # Check for RMT suggestion only when no duplicates present.
        rmt_suggestion = regression_to_mean(table, metric, dimensions,
                                            is_asc, k, **query_options)
        if rmt_suggestion is not None:
            suggestions.append(rmt_suggestion)
    else:
        suggestions.append(duplicates_suggestion)

    # Same query with -1 passed as k, i.e. without the k-condition.
    results_without_k_condition = topk_results(
        table, metric, dimensions, is_asc, -1, **query_options)[0]

    # These detectors share one signature; they are run in a fixed order
    # on the result without the k-condition.
    for detect in (more_than_just_topk, topk_vs_others, looking_at_tails):
        found = detect(results_without_k_condition, k, metric)
        if found is not None:
            suggestions.append(found)

    less_than_k_suggestion = topk_when_less_than_k_present(result_table, k)
    if less_than_k_suggestion is not None:
        suggestions.append(less_than_k_suggestion)

    suggestions = rank_oversights.rank_oversights(
        suggestions, oversights_order.ORDER_IN_TOPK)

    if summary_operator is not None:
        result_table = aspects.update_metric_column_name(
            result_table, summary_operator, metric)

    return (result_table, suggestions)
Example #5
0
def topk(table, metric, dimensions, is_asc, k, **kwargs):
    """ Runs the top-k intent and collects the debiasing suggestions
    that apply to its result.

    Oversights checked in this intent are-
    1. Duplicates in top-k
    2. Regression to the mean (checked only when no duplicates found)
    3. More than just top-k
    4. Top-k vs others
    5. Looking at tails to find causes

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which we sort,
            and in the case when grouping has to be done,
            summary operator is applied on metric. Metric could be a
            column containing strings, if we are applying count
            operator on it.
        dimensions: Type-list of str
            It is the name of column we want.
            In query:'top 5 batsman according to runs', dimension is 'batsman'.
            When summary_operator is not None, we group by dimensions.
        is_asc: Type-Bool
            Denotes the sort order, True for ascending, False for Descending
        k: Type-int
            It is the number of entries to be taken
        date_range: Type-tuple (keyword, default None)
            Tuple of start_date and end_date
        date_column_name: Type-str (keyword, default 'date')
            It is the name of column which contains date
        date_format: Type-str (keyword, default '%Y-%m-%d')
            It is required by datetime.strptime to parse the date in the
            format. Format Codes -
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples (keyword, default None)
            Tuple represents the condition to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the
            condition is applied upon.
            filter - Filters enum members, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            (keyword, default None)
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Note-summary_operator is always applied on metric column passed,
         and only when grouping is done

    Returns:
        The function will return both suggestions and the results in a tuple.
        (results, suggestions)
        results: Type - pandas dataframe, The results of the intended top-k
        suggestions: Type - List, the non-None values returned by the
            oversight checks, in the order the checks were run.

    """
    # Keyword options forwarded unchanged to every helper that has to
    # re-run the query.
    query_options = {
        'date_column_name': kwargs.get('date_column_name', 'date'),
        'date_range': kwargs.get('date_range', None),
        'date_format': kwargs.get('date_format', '%Y-%m-%d'),
        'slices': kwargs.get('slices', None),
        'summary_operator': kwargs.get('summary_operator', None),
    }

    result_table = topk_results(table, metric, dimensions, is_asc, k,
                                **query_options)

    suggestions = []

    duplicates_suggestion = duplicates_in_topk(result_table, dimensions)

    if duplicates_suggestion is None:
        # Check for RMT suggestion only when no duplicates present.
        rmt_suggestion = regression_to_mean(table, metric, dimensions,
                                            is_asc, k, **query_options)
        if rmt_suggestion is not None:
            suggestions.append(rmt_suggestion)
    else:
        suggestions.append(duplicates_suggestion)

    # Same query with -1 passed as k, i.e. without the k-condition.
    results_without_k_condition = topk_results(
        table, metric, dimensions, is_asc, -1, **query_options)

    # These detectors share one signature; they are run in a fixed order
    # on the result without the k-condition.
    for detect in (more_than_just_topk, topk_vs_others, looking_at_tails):
        found = detect(results_without_k_condition, k, metric)
        if found is not None:
            suggestions.append(found)

    return (result_table, suggestions)