def generate_2():
    """Generates data required for test_2.

    dataset used - randomly generated in util/test_aspects,
                   read from data/data_for_test_aspects/test_1.csv
    Stored in - data/data_for_test_looking_at_tails/result_table_for_test_2

    The result table is generated without applying the k condition
    (k = -1 keeps every row), so the full ordered result is stored.
    """
    table = pandas.read_csv('data/data_for_test_aspects/test_1.csv')
    # k = -1 disables the top-k cut-off, storing the complete sorted result.
    result_table = topk.topk_results(table, 'Age', ['Name', 'Gender'],
                                     False, -1)
    result_table.to_csv(
        'data/data_for_test_looking_at_tails/result_table_for_test_2',
        index=False)
def generate_1():
    """Generates data required for test_1.

    dataset used - https://www.kaggle.com/odartey/top-chess-players,
                   read from data/data_for_test_looking_at_tails/fide_historical.csv
    Stored in - data/data_for_test_looking_at_tails/result_table_for_test_1

    The result table is generated without applying the k condition
    (k = -1 keeps every row), so the full ordered result is stored.
    """
    table = pandas.read_csv(
        'data/data_for_test_looking_at_tails/fide_historical.csv')
    # k = -1 disables the top-k cut-off, storing the complete sorted result.
    result_table = topk.topk_results(table, 'rating',
                                     ['name', 'birth_year', 'games'],
                                     False, -1)
    result_table.to_csv(
        'data/data_for_test_looking_at_tails/result_table_for_test_1',
        index=False)
def generate():
    """Generates data required for both test_1 & test_2.

    dataset used - https://www.contextures.com/xlSampleData01.html
    Top-k query - "Find the top-k Item with maximum UnitCost"

    Runs the query without the k cut-off (k = -1 keeps every row) and
    writes the full result table to a csv file.
    """
    source_frame = pandas.read_csv(
        'data/data_for_test_more_than_just_topk/sheet1.csv')
    full_results = topk.topk_results(source_frame, 'UnitCost', ['Item'],
                                     False, -1)
    full_results.to_csv(
        'data/data_for_test_more_than_just_topk/result_table.csv',
        index=False)
def regression_to_mean(table, metric, dimensions, is_asc, k, **kwargs):
    """Suggests a fix when the regression-to-the-mean oversight is detected
    in the top-k results.

    It checks the top-k results under the same slicing condition in the
    previous time window, and if those results differ a lot it returns the
    debiasing suggestion. It has 2 methods to check if the 2 results differ.
    1. Set intersection method.
       Checks if the sets formed by both the results differ a lot.
    2. Similarity in the ranks method.
       Checks if the ranks of the common items in both the results differ
       a lot.
    The cut-off in both the methods is fixed in the util/constants module.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file
        metric: Type-string
            It is the name of the column according to which we sort,
            and in the case when grouping has to be done,
            summary operator is applied on metric. Metric could be a column
            containing strings, if we are applying count operator on it.
        dimensions: Type-list of str
            It is the name of column we want.
            In query:'top 5 batsman according to runs', dimension is
            'batsman'. When summary_operator is not None, we group by
            dimensions.
        is_asc: Type-Bool
            Denotes the sort order, True for ascending, False for Descending
        k: Type-int
            It is the number of entries to be taken
        date_range: Type-tuple
            Tuple of start_date and end_date
        date_column_name: Type-str
            It is the name of column which contains date
        date_format: Type-str
            It is required by datetime.strp_time to parse the date in the
            format Format Codes
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Tuple represents the conditon to keep the row.
            (column_name, filter, value)
            column_name - is the value of the column that the condition is
            applied upon.
            filter - Filters enum members, ex. Filters.IN
        summary_operator: Type-summary_operators enum members
            It denotes the summary operator, after grouping by dimensions.
            ex. SummaryOperators.MAX, SummaryOperators.SUM

    Returns:
        suggestion : dictionary with keys 'suggestion', 'oversight',
        or None when no date range is given or no oversight is detected.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)

    # top-k in the given time window
    current_topk = topk.topk_results(table, metric, dimensions, is_asc, k,
                                     date_column_name=date_column_name,
                                     date_range=date_range,
                                     date_format=date_format,
                                     slices=slices,
                                     summary_operator=summary_operator)[0]

    # Without a date range there is no "previous window" to compare against.
    if date_range is None:
        return None

    # results of the other time interval may contain duplicates,
    # so setting the summary operator to MAX/MIN
    if summary_operator is None:
        summary_operator = (SummaryOperators.MIN if is_asc
                            else SummaryOperators.MAX)

    # start & end dates of the previous time window
    previous_start, previous_end = time_window.previous(date_range[0],
                                                        date_range[1],
                                                        date_format)

    # top-k in previous window
    previous_topk = topk.topk_results(table, metric, dimensions, is_asc, k,
                                      slices=slices,
                                      summary_operator=summary_operator,
                                      date_column_name=date_column_name,
                                      date_format=date_format,
                                      date_range=(previous_start,
                                                  previous_end))[0]

    # Method 1: do the two result sets differ beyond the cut-off?
    set_intersect_suggestions = _set_intersect(previous_topk, current_topk,
                                               dimensions)

    suggestion = {'oversight': Oversights.REGRESSION_TO_THE_MEAN}

    if set_intersect_suggestions is not None:
        suggestion['suggestion'] = set_intersect_suggestions
        return suggestion

    # Method 2: do the ranks of the common items differ beyond the cut-off?
    rank_vector_suggestion = _similarity_between_ranks(previous_topk,
                                                       current_topk,
                                                       dimensions)
    if rank_vector_suggestion is not None:
        suggestion['suggestion'] = rank_vector_suggestion
        return suggestion

    return None