# Example no. 1
def generate_2():
    """
    Generates data required for test_2.
    dataset used - data/data_for_test_aspects/test_1.csv
    (randomly generated in util/test_aspects)
    Stored in - data/data_for_test_looking_at_tails/result_table_for_test_2

    Runs the top-k query (k disabled via -1, descending order on 'Age',
    grouped by 'Name' and 'Gender') and writes the full result table to csv.
    """
    table = pandas.read_csv('data/data_for_test_aspects/test_1.csv')
    result_table = topk.topk_results(table, 'Age', ['Name', 'Gender'], False, -1)
    result_table.to_csv('data/data_for_test_looking_at_tails/result_table_for_test_2',
                        index=False)
# Example no. 2
def generate_1():
    """
    Generates data required for test_1.
    dataset used - https://www.kaggle.com/odartey/top-chess-players
    (read from data/data_for_test_looking_at_tails/fide_historical.csv)
    Stored in - data/data_for_test_looking_at_tails/result_table_for_test_1

    Runs the top-k query (k disabled via -1, descending order on 'rating',
    grouped by 'name', 'birth_year', 'games') and writes the full result
    table to csv.
    """
    table = pandas.read_csv('data/data_for_test_looking_at_tails/fide_historical.csv')
    result_table = topk.topk_results(table, 'rating',
                                     ['name', 'birth_year', 'games'], False, -1)
    result_table.to_csv('data/data_for_test_looking_at_tails/result_table_for_test_1',
                        index=False)
def generate():
    """
    Generates data required for both test_1 & test_2
    dataset used - https://www.contextures.com/xlSampleData01.html

    Top-k query - "Find the top-k Item with maximum UnitCost"

    Runs the query with the k condition disabled (k = -1) and writes the
    complete result table to a csv file.
    """
    input_table = pandas.read_csv(
        'data/data_for_test_more_than_just_topk/sheet1.csv')
    # k = -1 disables the top-k cut-off; False = descending on 'UnitCost'
    full_results = topk.topk_results(input_table, 'UnitCost', ['Item'],
                                     False, -1)
    full_results.to_csv(
        'data/data_for_test_more_than_just_topk/result_table.csv', index=False)
# Example no. 4
def regression_to_mean(table, metric, dimensions, is_asc, k, **kwargs):
    """ This function gives suggestions if the regression to the
    mean oversight is detected in the top-k results.
    It checks the top-k results under the same slicing condition
    in the previous window, and if those results differ a lot it
    returns the debiasing suggestion. It has 2 methods to check
    if the 2 results differ.
    1. Set intersection method.
        Checks if the sets formed by both the results differs a lot.
    2. Similarity in the ranks method.
        Checks if the ranks of the common items in both the
        results differ a lot.

    The cut-off in both the methods is fixed in the util/constants module

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which we sort,
            and in the case when grouping has to be done,
            summary operator is applied on metric. Metric could a column
            containing strings, if we are applying count operator on it.
        dimensions: Type-list of str
            It is the name of column we want.
            In query:'top 5 batsman according to runs', dimension is 'batsman'.
            When summary_operator is not None, we group by dimensions.
        is_asc: Type-Bool
            Denotes the sort order, True for ascending, False for Descending
        k: Type-int
            It is the number of entries to be taken
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            It is the name of column which contains date
        date_format: Type-str
            It is required by datetime.strp_time to parse the date in the format
            Format Codes
https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Tuple represents the conditon to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the
            condition is applied upon.
            filter - Filters enum members, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        suggestion : dictionary with keys 'oversight' and 'suggestion',
            or None when no date_range is given or the oversight is
            not detected.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)

    # Without a time window there is no "previous window" to compare
    # against, so bail out before doing any top-k computation.
    if date_range is None:
        return None

    # top-k in the given time window
    current_topk = topk.topk_results(table,
                                     metric,
                                     dimensions,
                                     is_asc,
                                     k,
                                     date_column_name=date_column_name,
                                     date_range=date_range,
                                     date_format=date_format,
                                     slices=slices,
                                     summary_operator=summary_operator)[0]

    # results of the other time interval may contain duplicates,
    # so setting the summary operator to MAX/MIN
    if summary_operator is None:
        if is_asc:
            summary_operator = SummaryOperators.MIN
        else:
            summary_operator = SummaryOperators.MAX

    # start & end dates of the previous time window
    previous_start, previous_end = time_window.previous(
        date_range[0], date_range[1], date_format)

    # top-k in previous window
    previous_topk = topk.topk_results(table,
                                      metric,
                                      dimensions,
                                      is_asc,
                                      k,
                                      slices=slices,
                                      summary_operator=summary_operator,
                                      date_column_name=date_column_name,
                                      date_format=date_format,
                                      date_range=(previous_start,
                                                  previous_end))[0]

    suggestion = {}
    suggestion['oversight'] = Oversights.REGRESSION_TO_THE_MEAN

    # Method 1: do the two result sets differ too much?
    set_intersect_suggestions = _set_intersect(previous_topk, current_topk,
                                               dimensions)
    if set_intersect_suggestions is not None:
        suggestion['suggestion'] = set_intersect_suggestions
        return suggestion

    # Method 2: do the ranks of the common items differ too much?
    rank_vector_suggestion = _similarity_between_ranks(previous_topk,
                                                       current_topk,
                                                       dimensions)
    if rank_vector_suggestion is not None:
        suggestion['suggestion'] = rank_vector_suggestion
        return suggestion

    # Neither method detected the oversight.
    return None