def test_6():
    """Test for oversight: Attribution With Hidden Negative.

    Computes the proportion of sum of reading score for each
    race/ethnicity and expects the hidden-negative suggestion.

    Dataset used:
    https://www.kaggle.com/spscientist/students-performance-in-exams
    """
    table = pandas.read_csv(
        'data/data_for_test_aspects/student_performance_updated_to_create_attribution_with_hidden_negative_oversight.csv')

    query_result = aspects.group_by(table, ['race/ethnicity'],
                                    enums.SummaryOperators.PROPORTION_OF_SUM)
    result_table = aspects.crop_other_columns(query_result['table'],
                                              ['race/ethnicity',
                                               'reading score'])
    result_suggestions = query_result['suggestions']

    print(result_table)

    expected_result_table = """  race/ethnicity  reading score
0        group A       0.083434
1        group B       0.185493
2        group C       0.316920
3        group D       0.265955
4        group E       0.148198"""

    expected_suggestions = "[{'suggestion': 'There exists negative values among the values on which proportion is being applied', 'oversight': <Oversights.ATTRIBUTION_WITH_HIDDEN_NEGATIVES: 11>, 'is_row_level_suggestion': True, 'confidence_score': 1, 'row_list': [{'row': 14, 'confidence_score': 1}]}]"

    assert result_table.to_string() == expected_result_table
    assert str(result_suggestions) == expected_suggestions
def test_4():
    """Test for summary operator = PROPORTION_OF_COUNT.

    Computes the proportion of count of gender for each race/ethnicity.

    Dataset used:
    https://www.kaggle.com/spscientist/students-performance-in-exams
    """
    table = pandas.read_csv(
        'data/data_for_test_aspects/student_performance.csv')

    result = aspects.group_by(table, ['race/ethnicity'],
                              enums.SummaryOperators.PROPORTION_OF_COUNT)
    result_table = result['table']
    result_table = aspects.crop_other_columns(result_table,
                                              ['race/ethnicity', 'gender'])
    result_suggestions = result['suggestions']

    # Sum of the proportion column should be (close to) 1.0.
    # Compare after rounding (same technique as test_5) instead of exact
    # float equality, which is brittle for sums of floating-point
    # proportions.
    assert float(format(result_table['gender'].sum(), '.5f')) == 1

    print(result_table)

    expected_result_table = """  race/ethnicity  gender
0        group A   0.089
1        group B   0.190
2        group C   0.319
3        group D   0.262
4        group E   0.140"""

    expected_suggestions = "[]"

    assert expected_result_table == result_table.to_string()
    assert str(result_suggestions) == expected_suggestions
def test_5():
    """Test for summary operator = PROPORTION_OF_SUM.

    Computes the proportion of sum of reading score for each
    race/ethnicity.

    Dataset used:
    https://www.kaggle.com/spscientist/students-performance-in-exams
    """
    table = pandas.read_csv(
        'data/data_for_test_aspects/student_performance.csv')

    query_result = aspects.group_by(table, ['race/ethnicity'],
                                    enums.SummaryOperators.PROPORTION_OF_SUM)
    result_table = aspects.crop_other_columns(query_result['table'],
                                              ['race/ethnicity',
                                               'reading score'])
    result_suggestions = query_result['suggestions']

    # Sum of proportion column should be (close to) 1.0
    assert float(format(result_table['reading score'].sum(), '.5f')) == 1

    print(result_table)

    expected_result_table = """  race/ethnicity  reading score
0        group A       0.083216
1        group B       0.185011
2        group C       0.318698
3        group D       0.265263
4        group E       0.147812"""

    expected_suggestions = "[]"

    assert result_table.to_string() == expected_result_table
    assert str(result_suggestions) == expected_suggestions
def test_3():
    """Tests the median aspect.

    On the randomly generated dataset, calculates the median age
    grouped by gender.
    """
    table = pandas.read_csv('data/data_for_test_aspects/test_1.csv')

    query_result = aspects.group_by(table, ['Gender'],
                                    enums.SummaryOperators.MEDIAN)
    result_table = query_result['table']
    result_suggestions = query_result['suggestions']

    print(result_table)

    expected_result = """   Gender  Age
0  Female   50
1    Male   51"""

    expected_suggestions = "[]"

    assert result_table.to_string() == expected_result
    assert str(result_suggestions) == expected_suggestions
def slice_compare(table, metric, dimensions, slices, slice_compare_column,
                  summary_operator, **kwargs):
    """This function will implement the slice-compare intent.

    Also removes the tuples that do not lie in the given date range.
    The arguments 'table, metric, dimensions, slices,
    slice_compare_column, summary_operator' are not optional, so they
    are passed as is; 'date_range' is passed in kwargs. If some of the
    optional args are None (not passed), it is assumed that we don't
    have to apply them.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Name of the column on which the summary operator is applied.
            Metric could be a column containing strings if we are
            applying the count operator on it.
        dimensions: Type-list of str
            Names of the columns we want; when summary_operator is not
            None, we group by dimensions.
        slices: Type-dictionary (will be changed)
            Contains the key as column name and value as the instance we
            want to slice.
        slice_compare_column: Type-list of string
            First element is the column name by which we compare; the
            rest are the values of that column to compare the slices by.
        summary_operator: Type-summary_operators enum member
            Summary operator applied on the metric after grouping, e.g.
            SummaryOperators.MAX, SummaryOperators.SUM.
        date_range: Type-tuple
            Tuple of start_date and end_date (kwargs).
        date_column_name: Type-str
            Name of the column which contains dates (kwargs).
        date_format: Type-str
            Format string used to parse the date column (kwargs).

    Returns:
        The `table` (a pandas dataframe object) after applying the
        intent on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    # collecting the columns not to be removed
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(metric)
    table = aspects.crop_other_columns(table, required_columns)

    # Work on a copy so the caller's `dimensions` list is not mutated
    # (the original implementation modified it in place).
    grouping_columns = dimensions.copy()
    # slice_compare_column should be the last element of the group so
    # that groupby will show them together for every grouping
    grouping_columns.remove(slice_compare_column[0])
    grouping_columns.append(slice_compare_column[0])

    table = aspects.group_by(table, grouping_columns, summary_operator)
    return table
def top_down_error(table, metric, all_dimensions, slice_compare_column,
                   slice1, slice2, summary_operator, **kwargs):
    """This function will implement the top-down error debiasing.

    For every dimension that was not in the user's grouping list, the
    result table is recomputed with that dimension added, and the two
    result tables are compared to surface debiasing suggestions.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column on which the summary operator is applied. Metric
            could be a column containing strings if we are applying the
            count operator on it.
        all_dimensions: Type-list of str
            It contains the list of all dimensions.
        slice_compare_column: Type-string
            Name of the slice-compare column.
        slice1: Type-string
            The first value of the comparison.
        slice2: Type-string
            The second value of the comparison.
        summary_operator: Type-summary_operators enum member
            Summary operator applied on the metric after grouping, e.g.
            SummaryOperators.MAX, SummaryOperators.SUM.
        date_range: Type-tuple
            Tuple of start_date and end_date (kwargs).
        date_column_name: Type-str
            Name of the column which contains dates (kwargs).
        day_first: Type-str
            Format string used to parse the date column (kwargs).
        slices: Type-list of tuples
            Each tuple is a condition to keep a row:
            (column_name, Filters enum member, value) (kwargs).
        dimensions: Type-list of str or None
            Columns the user grouped by (kwargs).

    Returns:
        A list of dictionaries, each representing a debiasing suggestion
        for a newly added column.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)

    # Restrict the table to the two slice values being compared.
    slice_list = []
    if slices is not None:
        slice_list = slices.copy()
    slice_list.append((slice_compare_column, Filters.IN, [slice1, slice2]))
    table = aspects.slice_table(table, slice_list)

    # removing all metric columns except the one by which we do the
    # group_by operation
    required_columns = all_dimensions.copy()
    required_columns.append(metric)
    table = aspects.crop_other_columns(table, required_columns)

    # operational_dimensions contains the list of all dimensions except
    # slice_compare_column
    operational_dimensions = all_dimensions.copy()
    operational_dimensions.remove(slice_compare_column)

    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns.append(metric)
    query_table = aspects.crop_other_columns(table, required_columns)

    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    # result_table is the result table requested by the user.
    result_table = aspects.group_by(query_table, grouping_columns,
                                    summary_operator)

    # suggestions stores the list of debiasings for this oversight.
    suggestions = []

    dimension_list = []
    if dimensions is not None:
        dimension_list = dimensions.copy()
    dimensions_len = len(dimension_list)

    for column in operational_dimensions:
        # we try to find the debiasing for every column which is not in
        # the grouping list initially
        if column not in dimension_list:
            new_grouping_columns = dimension_list.copy()
            new_grouping_columns.append(column)
            new_grouping_columns.append(slice_compare_column)

            new_required_columns = new_grouping_columns.copy()
            new_required_columns.append(metric)
            new_cropped_table = aspects.crop_other_columns(
                table, new_required_columns)

            # result table after adding the new column to the grouping
            # list
            new_result_table = aspects.group_by(new_cropped_table,
                                                new_grouping_columns,
                                                summary_operator)

            # the debiasing suggestion after comparing the initial
            # result table with the new result table
            new_suggestion = _check_top_down_error(result_table,
                                                   new_result_table,
                                                   column, slice1,
                                                   dimensions_len)

            # Identity comparison with None (was `!= None`).
            if new_suggestion is not None:
                suggestions.append(new_suggestion)

    return suggestions
def topk_results(table, metric, dimensions, is_asc, k, **kwargs):
    """This function will implement the top-k intent.

    Sorts tuples by the metric after applying the optional date-range,
    slice and group-by operations. Options that are None (not passed)
    are skipped. When summary_operator is given, the table is grouped
    by dimensions before sorting.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column by which we sort; when grouping is done, the summary
            operator is applied on it. Could be a string column when
            the count operator is used.
        dimensions: Type-list of str
            Columns we want; when summary_operator is not None, we
            group by dimensions.
        is_asc: Type-bool
            Sort order, True for ascending, False for descending.
        k: Type-int
            Number of entries to be taken; k = -1 means all entries.
        date_range: Type-tuple
            Tuple of start_date and end_date (kwargs).
        date_column_name: Type-str
            Name of the column which contains dates (kwargs).
        day_first: Type-bool
            Whether day occurs before month in the date column (kwargs).
        slices: Type-list of tuples
            (column_name, Filters enum member, value) row conditions
            (kwargs).
        summary_operator: Type-summary_operators enum member
            Summary operator applied on the metric after grouping
            (kwargs).

    Returns:
        A (results, suggestions) tuple:
        results - pandas dataframe with the top-k rows,
        suggestions - list of suggestions from the group-by step.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)
    table = aspects.slice_table(table, slices)

    # columns that survive the crop
    keep_columns = [] if dimensions is None else dimensions.copy()
    keep_columns.append(metric)
    table = aspects.crop_other_columns(table, keep_columns)

    grouped = aspects.group_by(table, dimensions, summary_operator)
    table = grouped['table']
    suggestions = grouped['suggestions']

    # A stable sort ('mergesort') preserves the original relative order
    # of rows whose metric values are equal.
    table = table.sort_values(by=[metric], ascending=is_asc,
                              kind='mergesort')

    # drop=True discards the extra 'index' column reset_index would add
    table = table.reset_index(drop=True)

    # keep only the top-k rows unless all were requested
    if k != -1:
        table = table.head(k)

    return (table, suggestions)
def _time_compare_results(table, metric, time_compare_column, date_range1,
                          date_range2, day_first, summary_operator,
                          **kwargs):
    """Returns the results for the time-compare intent.

    Rows falling inside each of the two date ranges are labelled with
    that range, the labelled tables are concatenated, and a single
    group-by over the combined table produces the comparison.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column on which the summary operator is applied. Could be a
            string column when the count operator is used.
        time_compare_column: Type-string
            Column name by which the comparison is done.
        date_range1: Type-tuple of start_date and end_date
            First date range of the comparison.
        date_range2: Type-tuple of start_date and end_date
            Second date range of the comparison.
        day_first: Type-str
            Whether day occurs before month in the date column.
        summary_operator: Type-summary_operators enum member
            Summary operator applied on the metric after grouping.
        slices: Type-list of tuples
            (column_name, Filters enum member, value) row conditions
            (kwargs).
        dimensions: Type-list of str
            Columns the user grouped by (kwargs).

    Returns:
        A (results, suggestions) tuple:
        results - pandas dataframe of the intended time-compare,
        suggestions - list of suggestions from the group-by step.
    """
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.slice_table(table, slices)

    # columns that survive the crop
    keep_columns = []
    if dimensions is not None:
        keep_columns = dimensions.copy()
    keep_columns.append(time_compare_column)
    keep_columns.append(metric)
    required_table = aspects.crop_other_columns(table, keep_columns)

    # Build one labelled slice per date range; combining them lets a
    # single group-by yield the desired comparison.
    first_slice = aspects.apply_date_range(required_table, date_range1,
                                           time_compare_column, day_first)
    first_slice[time_compare_column] = date_range1[0] + " - " + date_range1[1]

    second_slice = aspects.apply_date_range(required_table, date_range2,
                                            time_compare_column, day_first)
    second_slice[time_compare_column] = date_range2[0] + " - " + date_range2[1]

    # second range first, matching the original output ordering
    combined = pandas.concat([second_slice, first_slice])
    combined = combined.reset_index(drop=True)

    # columns on which the grouping is done
    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(time_compare_column)

    grouped = aspects.group_by(combined, grouping_columns, summary_operator)
    return (grouped['table'], grouped['suggestions'])
def benchmark_set_too_different(table, metric, all_metric,
                                slice_compare_column, slice1,
                                summary_operator, **kwargs):
    """This function can give the benchmark set too different oversight.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column on which the summary operator is applied. Metric
            could be a column containing strings if we are applying the
            count operator on it.
        all_metric: Type-list of str
            It contains the list of all metrics.
        slice_compare_column: Type-list of string
            First element denotes the column name by which we will do
            the comparison; the rest are the values of that column by
            which we will compare the slices.
        slice1: Type-string
            The single slice value compared against the whole table.
        summary_operator: Type-summary_operators enum member
            Summary operator applied on the metric after grouping, e.g.
            SummaryOperators.MAX, SummaryOperators.SUM.
        date_range: Type-tuple
            Tuple of start_date and end_date (kwargs).
        date_column_name: Type-str
            Name of the column which contains dates (kwargs).
        day_first: Type-str
            Format string used to parse the date column (kwargs).
        slices: Type-list of tuples
            Row-keeping conditions (kwargs).
        dimensions: Type-list of str
            Columns the user grouped by (kwargs).

    Returns:
        A list of dictionaries in which every dictionary represents an
        oversight for some column, or None when no oversight is found.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)
    table = aspects.slice_table(table, slices)

    # keep the dimensions, the compare column and ALL metrics — other
    # metrics are needed later to look for the oversight on them
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns = required_columns + all_metric
    table = aspects.crop_other_columns(table, required_columns)

    """ required_table_for_one is a table which has data for the single
    variable given by the user by which we shall do comparision"""
    required_table_for_one = aspects.slice_table(
        table, [(slice_compare_column, Filters.EQUAL_TO, slice1)])

    """ required_table_for_all is a table which has all the data which
    was in the initial table but with the comparision column as '*'
    because we have to compare single value with the whole table"""
    required_table_for_all = table.copy()
    required_table_for_all[slice_compare_column] = '*'

    updated_table = pandas.concat(
        [required_table_for_one, required_table_for_all])
    # NOTE(review): reset_index() is called WITHOUT drop=True here,
    # unlike the other intents in this file, so a surplus 'index'
    # column is added to updated_table — confirm whether
    # aspects.group_by ignores it or whether drop=True was intended.
    updated_table = updated_table.reset_index()

    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    result_table = aspects.group_by(updated_table, grouping_columns,
                                    summary_operator)['table']

    # columns are reordered so that the grouping columns come first,
    # then the user's metric, then the remaining metrics
    other_metrics = all_metric.copy()
    other_metrics.remove(metric)
    columns_order = grouping_columns.copy()
    columns_order.append(metric)
    columns_order = columns_order + other_metrics
    """ We have done the reordering of columns so that all the dimension
    should appear first followed by the metric given by the user and
    and then rest of the columns."""
    result_table = result_table[columns_order]

    num_rows = result_table.shape[0]
    num_columns = result_table.shape[1]
    dimensions_len = 0
    if dimensions is not None:
        dimensions_len = len(dimensions)

    result_table_matrix = result_table.values.tolist()
    suggestion = []

    # We have to iterate through all metrics which were not involved
    # in the computation initially (they start after the grouping
    # columns, the compare column and the user's metric).
    for column_i in range(dimensions_len + 2, num_columns):
        # stores the index of rows on which the oversight might appear
        column_i_suggestion_list = []
        row_i = 0
        while row_i < num_rows:
            # Rows come in adjacent pairs (slice1 row, '*' row) sharing
            # the same dimension values; an unpaired row is flagged
            # directly.
            if row_i == num_rows - 1 or result_table_matrix[
                    row_i][:dimensions_len] != result_table_matrix[
                        row_i + 1][:dimensions_len]:
                column_i_suggestion_list.append({
                    'row': row_i + 1,
                    'confidence_score': 100
                })
            else:
                # Pair found: compare the dissimilarity on this metric
                # and on the user's metric against the threshold.
                if _calculate_relation(
                        result_table_matrix[row_i][column_i],
                        result_table_matrix[row_i + 1]
                        [column_i]) < constants.BSTD_DISIMILARITY_THRESHOLD:
                    row_i = row_i + 1
                elif _calculate_relation(
                        result_table_matrix[row_i][dimensions_len + 1],
                        result_table_matrix[row_i + 1][dimensions_len + 1]
                ) < constants.BSTD_DISIMILARITY_THRESHOLD:
                    row_i = row_i + 1
                else:
                    # both dissimilar: flag both rows of the pair
                    column_i_suggestion_list.append({
                        'row': row_i + 1,
                        'confidence_score': 100
                    })
                    column_i_suggestion_list.append({
                        'row': row_i + 2,
                        'confidence_score': 100
                    })
                    row_i = row_i + 1
            row_i = row_i + 1

        if len(column_i_suggestion_list) > 0:
            """ for every suggestion we form a dictionary and append it
            to the list of dictionary."""
            suggestion_i = {}
            suggestion_i[
                'suggestion'] = slice1 + ' looks different from others on ' + metric + '. You might also want to look at ' + columns_order[
                    column_i] + ' since ' + slice1 + ' also looks different on this.'
            suggestion_i['oversight'] = Oversights.BENCHMARK_SET_TOO_DIFFERENT
            suggestion_i['is_row_level_suggestion'] = True
            suggestion_i['row_list'] = column_i_suggestion_list
            suggestion.append(suggestion_i)

    if len(suggestion) == 0:
        return None
    else:
        return suggestion
def _slice_compare_results_for_all(table, metric, slice_compare_column,
                                   slice1, slice2, summary_operator,
                                   **kwargs):
    """Implements the slice-compare intent against 'ALL'.

    Compares the rows where slice_compare_column equals slice1 against
    the whole table (relabelled 'ALL'), after applying the optional
    date range and slices. Options that are None are skipped.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column on which the summary operator is applied. Could be a
            string column when the count operator is used.
        slice_compare_column: Type-string
            Column name by which the comparison is done.
        slice1: Type-string
            Value of slice_compare_column compared against 'ALL'.
        slice2: Type-string
            Second value of the comparison (unused in the ALL variant).
        summary_operator: Type-summary_operators enum member
            Summary operator applied on the metric after grouping.
        date_range: Type-tuple
            Tuple of start_date and end_date (kwargs).
        date_column_name: Type-str
            Name of the column which contains dates (kwargs).
        day_first: Type-str
            Format string used to parse the date column (kwargs).
        slices: Type-list of tuples
            (column_name, Filters enum member, value) conditions
            (kwargs).
        dimensions: Type-list of str
            Columns the user grouped by (kwargs).

    Returns:
        The result of aspects.group_by on the combined table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)
    table = aspects.slice_table(table, slices)

    # columns that survive the crop
    keep_columns = []
    if dimensions is not None:
        keep_columns = dimensions.copy()
    keep_columns.append(slice_compare_column)
    keep_columns.append(metric)
    table = aspects.crop_other_columns(table, keep_columns)

    # rows for the single value requested by the user
    single_value_table = aspects.slice_table(
        table, [(slice_compare_column, Filters.EQUAL_TO, slice1)])

    # the whole table, relabelled 'ALL', acts as the benchmark set
    benchmark_table = table.copy()
    benchmark_table[slice_compare_column] = 'ALL'

    combined = pandas.concat([benchmark_table, single_value_table])
    combined = combined.reset_index(drop=True)

    # columns on which the grouping is done
    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    return aspects.group_by(combined, grouping_columns, summary_operator)
def trend(table, metric, granularity, summary_operator, **kwargs):
    """This function will implement the trend intent.

    Groups by time: every date is snapped down to the requested
    granularity, then the summary operator compresses rows sharing the
    same bucket, after slicing and date-range filtering. Only slices
    and the date-range parameters (range, column name, format) are
    optional and arrive via kwargs; None options are skipped.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column on which the summary operator is applied. Could be a
            string column when the count operator is used.
        granularity: Type-granularities enum member
            Granularity applied to the dates.
        summary_operator: Type-summary_operators enum member
            The summary operator.
        date_range: Type-tuple
            Tuple of start_date and end_date; None means no cropping
            (kwargs).
        date_column_name: Type-str
            Name of the column which contains dates (kwargs).
        date_format: Type-str
            Format string used by datetime.strptime to parse the dates
            (kwargs).
        slices: Type-dictionary (will be changed)
            Key is the column name, value is the instance to slice by
            (kwargs).

    Returns:
        The `table` (a pandas dataframe object) after applying the
        intent on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')
    slices = kwargs.get('slices', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    # only the date column and the metric survive the crop
    table = aspects.crop_other_columns(table, [date_column_name, metric])

    # Snap every date down to its granularity bucket, in place.
    for row_index in range(table.shape[0]):
        parsed_date = datetime.datetime.strptime(
            table.loc[row_index, date_column_name], date_format)
        bucketed_date = aspects.granular_time(parsed_date, granularity)
        table.loc[row_index, date_column_name] = bucketed_date.strftime(
            date_format)

    table = aspects.group_by(table, [date_column_name], summary_operator)
    table = table.sort_values(by=[date_column_name])
    return table
def show(table, **kwargs):
    """This function will implement the show intent.

    Firstly removes the tuples that do not lie in the given date range,
    then applies slicing, cropping and group-by operations. Only
    `table` is required; everything else arrives via kwargs, and None
    options are skipped. When summary_operator is not None, the table
    is grouped by dimensions.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column on which the summary operator is applied in case of
            grouping; could be a string column with the count operator
            (kwargs).
        dimensions: Type-list of str
            Columns we want; when summary_operator is not None, we
            group by dimensions (kwargs).
        date_range: Type-tuple
            Tuple of start_date and end_date (kwargs).
        date_column_name: Type-str
            Name of the column which contains dates (kwargs).
        date_format: Type-str
            Format string used to parse the date column (kwargs).
        slices: Type-dictionary (will be changed)
            Key is the column name, value is the instance to slice by
            (kwargs).
        summary_operator: Type-summary_operators enum member
            Summary operator applied on the metric after grouping
            (kwargs).

    Returns:
        The `table` (a pandas dataframe object) after applying the
        intent on the given table.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', 'yyyy-mm-dd')
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)
    metric = kwargs.get('metric', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, date_format)
    table = aspects.slice_table(table, slices)

    # columns that survive the crop
    keep_columns = []
    if dimensions is not None:
        keep_columns = dimensions.copy()
    # metric is optional in show
    if metric is not None:
        keep_columns.append(metric)
    table = aspects.crop_other_columns(table, keep_columns)

    return aspects.group_by(table, dimensions, summary_operator)
def topk_results(table, metric, dimensions, is_asc, k, **kwargs):
    """Implement the top-k intent.

    Drops tuples outside the requested date range, applies the slice
    conditions, optionally groups by the dimensions, and finally sorts
    the rows by the metric, keeping only the first k of them.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column according to which we sort; when grouping is done,
            the summary operator is applied on this column.  Metric
            could be a column containing strings if we are applying a
            count operator on it.
        dimensions: Type-list of str
            Names of the columns we want.  In query 'top 5 batsman
            according to runs', dimension is 'batsman'.  When
            summary_operator is not None, we group by dimensions.
        is_asc: Type-Bool
            Sort order: True for ascending, False for descending.
        k: Type-int
            Number of entries to keep; k = -1 means keep all entries.
        date_range: Type-tuple
            Tuple of (start_date, end_date).
        date_column_name: Type-str
            Name of the column which contains dates.
        date_format: Type-str
            strptime format codes used to parse the dates, see
            https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a
            row must satisfy to be kept; filter is a Filters enum
            member, ex. Filters.IN.
        summary_operator: Type-summary_operators enum member
            Summary operator applied after grouping by dimensions,
            ex. SummaryOperators.MAX, SummaryOperators.SUM.
            Note - summary_operator is always applied on the metric
            column passed, and only when grouping is done.

    Returns:
        The `table` (a pandas dataframe object) after applying the
        intent on the given `table` (a pandas dataframe object).
    """
    date_column = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    date_format = kwargs.get('date_format', '%Y-%m-%d')
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column, date_format)
    table = aspects.slice_table(table, slices)

    # Keep only the dimension columns plus the metric column.
    keep = [] if dimensions is None else dimensions.copy()
    keep.append(metric)
    table = aspects.crop_other_columns(table, keep)

    table = aspects.group_by(table, dimensions, summary_operator)

    # Order rows by the metric, then renumber the index from zero
    # (drop=True avoids inserting the old index as an 'index' column).
    table = table.sort_values(by=[metric], ascending=is_asc)
    table = table.reset_index(drop=True)

    # k == -1 is the sentinel for "return everything".
    return table if k == -1 else table.head(k)
def show(table, **kwargs):
    """Implement the show intent.

    Firstly removes the tuples that do not lie in the given date range,
    then applies slicing and, when a summary operator is passed, a
    group-by on the dimensions.  Optional arguments that are None (not
    passed) are simply not applied.  Also, if a summary operator is
    applied, the metric column is renamed to
    "<summary operator> of metric".

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Name of the column on which the summary operator is applied
            in case of grouping.  Metric could be a column containing
            strings, if we are applying a count operator on it.
        dimensions: Type-list of str
            Names of the columns we want.  In query 'show all batsman',
            dimension is 'batsman'.  When summary_operator is not None,
            we group by dimensions.
        date_range: Type-tuple
            Tuple of (start_date, end_date).
        date_column_name: Type-str
            Name of the column which contains dates.
        day_first: Type-bool
            Denotes whether the day occurs before the month in the
            dates of the date column.  Example - '29-02-19', here
            day_first is True.
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a
            row must satisfy to be kept; filter is a Filters enum
            member, ex. Filters.IN.
        summary_operator: Type-summary_operators enum member
            Summary operator applied after grouping by dimensions,
            ex. SummaryOperators.MAX, SummaryOperators.SUM.
            Note - summary_operator is always applied on the metric
            column passed, and only when grouping is done.

    Returns:
        Both the results and the suggestions as a tuple
        (results, suggestions).
        results: Type - pandas dataframe, results of the intended show.
        suggestions: Type - List of dictionaries (suggestion
            structure), list of suggestions.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)
    slices = kwargs.get('slices', None)
    summary_operator = kwargs.get('summary_operator', None)
    metric = kwargs.get('metric', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)

    table = aspects.slice_table(table, slices)

    # Collecting the columns not to be removed.
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    # Metric is optional in show.
    if metric is not None:
        required_columns.append(metric)

    table = aspects.crop_other_columns(table, required_columns)

    # When there is no dimension to group by but a summary_operator
    # exists, we assume the user wants to apply the summary operator on
    # the whole data.
    if (dimensions is None) and (summary_operator is not None):
        # We add a temporary 'Summary Operator' column so group_by can
        # collapse the whole table into a single group.
        _add_temporary_column_of_summary_operator(table, summary_operator)

        after_group_by = aspects.group_by(table, ['Summary Operator'],
                                          summary_operator)

        table = after_group_by['table']

        suggestions = []
        if len(after_group_by['suggestions']) > 0:
            suggestions.extend(after_group_by['suggestions'])

        # Dropping the 'Summary Operator' column which was inserted above.
        table = table.drop(columns=['Summary Operator'])

        table = aspects.update_metric_column_name(table, summary_operator,
                                                  metric)

        return (table, suggestions)

    after_group_by = aspects.group_by(table, dimensions, summary_operator)

    table = after_group_by['table']

    suggestions = []

    different_weight_suggestion = weighted_mean_with_different_weights.\
                     weighted_mean_with_different_weights(table, metric)
    if different_weight_suggestion is not None:
        suggestions.append(different_weight_suggestion)

    if len(after_group_by['suggestions']) > 0:
        suggestions.extend(after_group_by['suggestions'])

    # Fix: previously the suggestions were ranked a second time on the
    # summary_operator-is-None path (the rank/return tail was
    # duplicated); rank once and return once instead.
    order = oversights_order.ORDER_IN_SHOW
    suggestions = rank_oversights.rank_oversights(suggestions, order)

    if summary_operator is not None:
        table = aspects.update_metric_column_name(table, summary_operator,
                                                  metric)

    return (table, suggestions)
def _slice_compare_results(table, metric, slice_compare_column,
                           slice1, slice2, summary_operator, **kwargs):
    """Implement the slice-compare intent.

    Also removes the tuples that do not lie in the given date range.
    The arguments table, metric, slice_compare_column, slice1, slice2
    and summary_operator are mandatory; 'date_range', 'slices' and the
    other optional arguments are passed in kwargs and are simply not
    applied when they are None.

    Args:
        table: Type-pandas.dataframe
            It has the contents of the csv file.
        metric: Type-string
            Column on which the summary operator is applied.  Metric
            could be a column containing strings, if we are applying a
            count operator on it.
        dimensions: Type-list of str
            Names of the columns we want.  In query 'compare batsman A
            and B according to total_runs' dimension is 'batsman'; we
            group by the dimensions.
        date_range: Type-tuple
            Tuple of (start_date, end_date).
        date_column_name: Type-str
            Name of the column which contains dates.
        day_first: Type-bool
            Denotes whether the day occurs before the month in the
            dates of the date column.
        slices: Type-List of tuples
            Each tuple (column_name, filter, value) is a condition a
            row must satisfy to be kept; filter is a Filters enum
            member, ex. Filters.IN.
        slice_compare_column: Type-string
            Name of the column by which we will do the comparison;
            slice1 and slice2 are the two values of this column whose
            slices are compared.
        summary_operator: Type-summary_operators enum member
            Summary operator applied after grouping by dimensions,
            ex. SummaryOperators.MAX, SummaryOperators.SUM.
            Note - summary_operator is always applied on the metric
            column passed, and only when grouping is done.

    Returns:
        Both the results and the suggestions as a tuple
        (results, suggestions).
        results: Type - pandas dataframe, results of the intended
            slice-compare.
        suggestions: Type - List of strings, list of suggestions.
    """
    date_column_name = kwargs.get('date_column_name', 'date')
    date_range = kwargs.get('date_range', None)
    day_first = kwargs.get('day_first', False)
    slices = kwargs.get('slices', None)
    dimensions = kwargs.get('dimensions', None)

    table = aspects.apply_date_range(table, date_range,
                                     date_column_name, day_first)

    # Restrict the rows to the two slices being compared.
    # Fix: use `is None` (identity) instead of `== None`, and build a
    # new list instead of appending in place so the caller's `slices`
    # list is never mutated as a side effect.
    compare_condition = (slice_compare_column, Filters.IN, [slice1, slice2])
    if slices is None:
        slices = [compare_condition]
    else:
        slices = slices + [compare_condition]
    table = aspects.slice_table(table, slices)

    # Collecting the columns not to be removed.
    required_columns = []
    if dimensions is not None:
        required_columns = dimensions.copy()
    required_columns.append(slice_compare_column)
    required_columns.append(metric)

    table = aspects.crop_other_columns(table, required_columns)

    # slice_compare_column should be the last element of the grouping
    # columns so that groupby shows the compared slices together for
    # every grouping.
    grouping_columns = []
    if dimensions is not None:
        grouping_columns = dimensions.copy()
    grouping_columns.append(slice_compare_column)

    after_group_by = aspects.group_by(table, grouping_columns,
                                      summary_operator)

    result_table = after_group_by['table']
    suggestions = after_group_by['suggestions']

    return (result_table, suggestions)