def test_4():
    """An example from the spider dataset question : What are the themes of farm competitions sorted by year in ascending order? """
    table = data.spider_eval.evaluation.get_table('farm', 'farm_competition')
    # Ascending sort on 'Year'; -1 means no top-k limit (keep all rows).
    # No slicing, date filtering, grouping, or aggregation is applied.
    query_result = topk.topk(table, 'Year', ['Theme'], True, -1, slices=None, date_range=None, date_column_name='date', day_first=False, group_columns=None, summary_operator=None)
    print(query_result)
    # Compared against DataFrame.to_string(), so the literal must match the
    # rendered table byte-for-byte.
    expected_result = """ Year Theme 0 2002 Aliens 1 2003 MTV Cube 2 2004 Valentine's Day 3 2005 MTV Asia Aid 4 2006 Codehunters 5 2013 Carnival M is back!"""
    # No oversight suggestions expected for this query.
    expected_suggestions = """[]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_1():
    """An example from the IPL dataset question : top-15 city based on win_by_runs in season = 2017 in the date range 2008-05-08 to 2017-04-12 """
    # NOTE(review): if this module also defines another test_1 later, the
    # later definition shadows this one and pytest will only run one of them.
    table = pandas.read_csv('data/matches.csv')
    # k=40 deliberately exceeds the number of matching rows so the
    # "less than k present" oversight is triggered alongside the
    # duplicates oversight.
    query_result = topk.topk(table, 'win_by_runs', ['city'], False, 40, slices=[('season', enums.Filters.EQUAL_TO, 2017)], date_range=('2008-05-08', '2017-04-12'), date_column_name='date', day_first=False)
    print(query_result)
    expected_result = """ city win_by_runs 0 Pune 97 1 Hyderabad 35 2 Bangalore 15 3 Pune 0 4 Rajkot 0 5 Indore 0 6 Hyderabad 0 7 Mumbai 0 8 Indore 0 9 Mumbai 0"""
    expected_suggestions = """[{'suggestion': 'The results has duplicates', 'oversight': <Oversights.DUPLICATES_IN_TOPK: 1>}, {'suggestion': 'Instead of 40 only 10 rows are present in the results', 'oversight': <Oversights.TOPK_WHEN_LESS_THAN_K_PRESENT: 2>}]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_6():
    """An example from the spider dataset question : What are the dates of publications in descending order of price? """
    table = data.spider_eval.evaluation.get_table('book_2', 'publication')
    # Descending sort on 'Price' with no k-limit (-1 keeps all rows);
    # repeated Publication_Date values should raise the duplicates oversight.
    query_result = topk.topk(table, 'Price', ['Publication_Date'], False, -1, slices=None, date_range=None, date_column_name='date', day_first=False)
    print(query_result)
    # Compared against DataFrame.to_string(); must match byte-for-byte.
    expected_result = """ Publication_Date Price 0 August 2008 15000000.0 1 March 2008 6000000.0 2 June 2006 4100000.0 3 October 2005 3000000.0 4 August 2008 3000000.0 5 March 2007 2000000.0 6 April 2007 2000000.0"""
    expected_suggestions = """[{'suggestion': 'The results has duplicates', 'oversight': <Oversights.DUPLICATES_IN_TOPK: 1>}]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_1():
    """An example from the IPL dataset question : top-15 city based on win_by_runs in season = 2017 in the date range 2008-05-08 to 2017-04-12 """
    # NOTE(review): duplicate of an earlier test_1 in this file but calling
    # topk with date_format instead of day_first — presumably from an older
    # API version; only the last same-named definition is collected by pytest.
    table = pandas.read_csv('data/matches.csv')
    query_result = topk.topk(table, 'win_by_runs', ['city'], False, 40, slices=[('season', enums.Filters.EQUAL_TO, 2017)], date_range=('2008-05-08', '2017-04-12'), date_column_name='date', date_format='%Y-%m-%d')
    print(query_result)
    expected_result = """ city win_by_runs 0 Pune 97 1 Hyderabad 35 2 Bangalore 15 3 Pune 0 4 Rajkot 0 5 Indore 0 6 Hyderabad 0 7 Mumbai 0 8 Indore 0 9 Mumbai 0"""
    # This variant expects the older suggestion shape keyed by
    # 'oversight_name' rather than an Oversights enum member.
    expected_suggestions = """[{'suggestion': 'The results has duplicates', 'oversight_name': 'Duplicates in top-k'}]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_2():
    """An example from the IPL dataset question : top 5 player_of_match based on avg(win_by_runs) in season 2017 in date range 2008-05-08 to 2017-04-12 """
    table = pandas.read_csv('data/matches.csv')
    # MEAN aggregation over win_by_runs grouped by player_of_match;
    # only 4 rows match, so the "less than k present" oversight fires,
    # plus a regression-to-the-mean warning from the previous date window.
    query_result = topk.topk(table, 'win_by_runs', ['player_of_match'], False, 5, slices=[('season', enums.Filters.EQUAL_TO, 2017)], date_range=('2017-05-09', '2017-05-12'), date_column_name='date', day_first=False, summary_operator=enums.SummaryOperators.MEAN)
    print(query_result)
    expected_result = """ player_of_match MEAN of win_by_runs 0 MM Sharma 14 1 KK Nair 7 2 WP Saha 7 3 SS Iyer 0"""
    expected_suggestions = """[{'suggestion': 'Instead of 5 only 4 rows are present in the results', 'oversight': <Oversights.TOPK_WHEN_LESS_THAN_K_PRESENT: 2>}, {'oversight': <Oversights.REGRESSION_TO_THE_MEAN: 4>, 'suggestion': "very few of the top-k in the given date range will be in the previous window's top-k"}]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_6():
    """An example from the spider dataset question : What are the dates of publications in descending order of price? """
    # NOTE(review): duplicate of an earlier test_6 in this file but using
    # k=10000 and date_format, and expecting a plain-string suggestion list —
    # presumably an older API version; pytest only runs the last definition.
    table = data.spider_eval.evaluation.get_table('book_2', 'publication')
    query_result = topk.topk(table, 'Price', ['Publication_Date'], False, 10000, slices=None, date_range=None, date_column_name='date', date_format='%Y-%m-%d')
    print(query_result)
    expected_result = """ Publication_Date Price 0 August 2008 15000000.0 1 March 2008 6000000.0 2 June 2006 4100000.0 3 October 2005 3000000.0 4 August 2008 3000000.0 5 March 2007 2000000.0 6 April 2007 2000000.0"""
    expected_suggestions = """['The results has duplicates, you forgot to apply group by']"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_5():
    """An example from the spider dataset question : For each city list their names in decreasing order by their highest station latitude. """
    table = data.spider_eval.evaluation.get_table('bike_1', 'station')
    # MAX aggregation of 'lat' per city, full result set (-1 = no k-limit),
    # sorted descending; grouping removes duplicates so no oversights fire.
    query_result = topk.topk(table, 'lat', ['city'], False, -1, slices=None, date_range=None, date_column_name='date', day_first=False, summary_operator=enums.SummaryOperators.MAX)
    print(query_result)
    expected_result = """ city MAX of lat 0 San Francisco 37.804770 1 Redwood City 37.491269 2 Palo Alto 37.448598 3 Mountain View 37.406940 4 San Jose 37.352601"""
    expected_suggestions = """[]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def topk_sim(input_vec, k_value):
    """Return the top-k_value training rows ranked by similarity to input_vec.

    Each element of the returned list is a (row, similarity) pair over the
    module-level `train_rows`; `row[3]` holds the feature vector compared
    against `input_vec`.

    Fix: the original ended with a loop that only unpacked each result into
    locals (leftover scaffolding around a commented-out print) — dead code,
    removed along with the commented-out lines.
    """
    # Score every training row against the input vector.
    neighbours = [(row, similarity(input_vec, row[3])) for row in train_rows]
    # True selects from the high-similarity end — presumably most-similar
    # first; TODO confirm against topk()'s ordering flag.
    return topk(neighbours, k_value, True)
def topk_sim(input_vec, k_value):
    """Return the top-k_value training rows ranked by similarity to input_vec.

    Each element of the returned list is a (row, similarity) pair over the
    module-level `train_rows`; `row[3]` holds the feature vector compared
    against `input_vec`.

    Fix: the original ended with a loop that only unpacked each result into
    locals (leftover scaffolding around a commented-out print) — dead code,
    removed along with the commented-out lines.
    """
    # Score every training row against the input vector.
    neighbours = [(row, similarity(input_vec, row[3])) for row in train_rows]
    # True selects from the high-similarity end — presumably most-similar
    # first; TODO confirm against topk()'s ordering flag.
    return topk(neighbours, k_value, True)
def test_7():
    """An example from the spider dataset question : What is the name and salary of all employees in order of salary? """
    table = data.spider_eval.evaluation.get_table('flight_1', 'employee')
    # Ascending sort on 'salary' with no k-limit; repeated names/salaries
    # should trigger the duplicates-in-topk oversight.
    query_result = topk.topk(table, 'salary', ['name'], True, -1, slices=None, date_range=None, date_column_name='date', day_first=False)
    print(query_result)
    # Compared against DataFrame.to_string(); must match byte-for-byte.
    expected_result = """ name salary 0 Milo Brooks 20 1 Donald King 18050 2 Richard Jackson 23980 3 Patricia Jones 24450 4 Linda Davis 27984 5 Elizabeth Taylor 32021 6 Haywood Kelly 32899 7 Chad Stewart 33546 8 David Anderson 43001 9 Barbara Wilson 43723 10 Robert Brown 44740 11 Michael Miller 48090 12 William Moore 48250 13 Jennifer Thomas 54921 14 William Ward 84476 15 Michael Miller 99890 16 Larry West 101745 17 William Jones 105743 18 Eric Cooper 114323 19 James Smith 120433 20 Dorthy Lewis 152013 21 John Williams 153972 22 Mary Johnson 178345 23 Karen Scott 205187 24 Mark Young 205187 25 Lawrence Sperry 212156 26 Angela Martinez 212156 27 Joseph Thompson 212156 28 Betty Adams 227489 29 Lisa Walker 256481 30 George Wright 289950"""
    expected_suggestions = """[{'suggestion': 'The results has duplicates', 'oversight': <Oversights.DUPLICATES_IN_TOPK: 1>}]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_3():
    """An example from the spider dataset question : In which year were most departments established? """
    table = data.spider_eval.evaluation.get_table('department_management', 'department')
    # COUNT of Department_ID per Creation year, top-1 descending — i.e. the
    # year with the most departments. k=1 makes the top-k-vs-others
    # oversight relevant (rows outside the top-1 dominate the sum).
    query_result = topk.topk(table, 'Department_ID', ['Creation'], False, 1, slices=None, date_range=None, date_column_name='date', day_first=False, summary_operator=enums.SummaryOperators.COUNT)
    print(query_result)
    expected_result = """ Creation COUNT of Department_ID 0 1789 2"""
    expected_suggestions = """[{'oversight': <Oversights.TOPK_VS_OTHERS: 6>, 'change_list': {'topKLimit': 14}, 'suggestion': 'The rows NOT in the top-k have a much larger sum over Department_ID than the rows in top-k', 'confidence_score': 0.15384615384615385}]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_3():
    """An example from the spider dataset question : In which year were most departments established? """
    # NOTE(review): duplicate of an earlier test_3 but using date_format and
    # an unqualified SummaryOperators (assumes a direct `from ... import
    # SummaryOperators` elsewhere in this file — verify); presumably from an
    # older API version. pytest only runs the last same-named definition.
    table = data.spider_eval.evaluation.get_table('department_management', 'department')
    query_result = topk.topk(table, 'Department_ID', ['Creation'], False, 1, slices=None, date_range=None, date_column_name='date', date_format='%Y-%m-%d', summary_operator=SummaryOperators.COUNT)
    print(query_result)
    expected_result = """ Creation Department_ID 0 1789 2"""
    expected_suggestions = """['No suggestions as date condition is not there.']"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def test_8(): """This query uses a manually created dataset - to test similarity between rank vectors question : Top-4 rating between 23/05/2010 to 25/05/2011 """ # in this database the ranks reverse if the previous window is considered table = pandas.read_csv('data/rating.csv') query_result = topk.topk(table, 'Rating', ['User Name'], True, 4, slices=None, date_range=('2010-05-23', '2011-05-25'), date_column_name='date', day_first=True) print(query_result) expected_result = """ User Name Rating 0 Benq 3400 1 300iq 4300 2 cba 5200 3 tourist 6100""" expected_suggestions = """[{'oversight': <Oversights.REGRESSION_TO_THE_MEAN: 4>, 'suggestion': "The ranks of the top-k in the date range differs much from the previous window's top-k"}]""" assert (expected_result == query_result[0].to_string()) assert (expected_suggestions == str(query_result[1]))
def test_2():
    """An example from the IPL dataset question : top 5 player_of_match based on avg(win_by_runs) in season 2017 in date range 2008-05-08 to 2017-04-12 """
    # NOTE(review): duplicate of an earlier test_2 but using date_format and
    # expecting 'oversight_name'-keyed suggestions and an un-renamed result
    # column — presumably an older API version; pytest only runs the last
    # same-named definition.
    table = pandas.read_csv('data/matches.csv')
    query_result = topk.topk(table, 'win_by_runs', ['player_of_match'], False, 5, slices=[('season', enums.Filters.EQUAL_TO, 2017)], date_range=('2017-05-09', '2017-05-12'), date_column_name='date', date_format='%Y-%m-%d', summary_operator=enums.SummaryOperators.MEAN)
    print(query_result)
    expected_result = """ player_of_match win_by_runs 0 MM Sharma 14 1 KK Nair 7 2 WP Saha 7 3 SS Iyer 0"""
    expected_suggestions = """[{'oversight_name': 'Regression to the mean', 'suggestion': "very few of the top-k in the given date range will be in the previous window's top-k"}]"""
    assert (expected_result == query_result[0].to_string())
    assert (expected_suggestions == str(query_result[1]))
def hello_http(request):
    """HTTP Cloud Function dispatching a table query by intent.

    Args:
        request (flask.Request): The request object.
        <http://flask.pocoo.org/docs/1.0/api/#flask.Request>
    Returns:
        The response text (a JSON string with 'outputTable' and
        'suggestions'), or any set of values that can be turned into a
        Response object using `make_response`
        <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>.

    Fix: the 'slice_compare' branch referenced an undefined name
    `slice_compare_column_list`; it now passes `slice_compare_column`,
    the value actually extracted from the request. Also removed a
    duplicated `slices` extraction and a redundant `final_table = []`.
    """
    request_json = request.get_json(silent=True)
    request_args = request.args

    # Extracting the intent parameters from the json.
    intent = request_json['intent']
    table = request_json['table']
    row_range = request_json['rowRange']
    metric = request_json['metric']
    dimensions = request_json['dimensions']
    summary_operator = request_json['summaryOperator']
    slices = request_json['slices']
    is_asc = request_json['isAsc']
    k = request_json['k']
    slice_compare_column = request_json['comparisonValue']
    date = request_json['dateRange']
    time_granularity = request_json['timeGranularity']

    row_start = row_range["rowStart"]
    row_end = row_range["rowEnd"]
    row_header = row_range["header"]

    # Converting the list of lists into a pandas dataframe; the header row
    # inside the selected range supplies column names and is skipped as data.
    # rowStart/rowEnd/header are 1-based, hence the -1 adjustments.
    query_table = []
    for row in range(row_start - 1, row_end):
        if row != row_header - 1:
            query_table.append(table[row])
    query_table_dataframe = pandas.DataFrame(query_table,
                                             columns=table[row_header - 1])

    # Converting the variables that denote the date range into the
    # desired format. "null" is the sentinel the client sends for "absent".
    date_column_name = None
    date_range = None
    if date != "null":
        date_column_name = date['dateCol']
        date_range = (date['dateStart'], date['dateEnd'])

    # Converting the slices passed in the json into a
    # list of tuples (col, operator, val).
    slices_list = None
    if slices != "null":
        slices_list = []
        for item in slices:
            val = item['sliceVal']
            col = item['sliceCol']
            operator = _str_to_filter_enum(item['sliceOp'])
            slices_list.append((col, operator, val))

    if dimensions == 'null':
        dimensions = None
    if metric == 'null':
        metric = None

    summary_operator = _str_to_summary_operator_enum(summary_operator)

    suggestions = []
    if intent == 'show':
        query_table_dataframe = show(query_table_dataframe,
                                     slices=slices_list,
                                     metric=metric,
                                     dimensions=dimensions,
                                     summary_operator=summary_operator)
    elif intent == 'topk':
        query_result = topk.topk(query_table_dataframe, metric, dimensions,
                                 is_asc, k,
                                 summary_operator=summary_operator,
                                 date_column_name=date_column_name,
                                 date_range=date_range,
                                 slices=slices_list)
        query_table_dataframe = query_result[0]
        suggestions = query_result[1]
    elif intent == 'slice_compare':
        # BUG FIX: was `slice_compare_column_list` (undefined — NameError).
        query_table_dataframe = slice_compare.slice_compare(
            query_table_dataframe, metric, dimensions, [], [],
            slice_compare_column,
            summary_operator=summary_operator,
            date_column_name=date_column_name,
            date_range=date_range,
            slices=slices_list)
    else:
        raise Exception("Intent name does not match")

    # In updated suggestions, change_list is replaced with the json of
    # the new query.
    updated_suggestions = []
    for suggestion in suggestions:
        updated_suggestion = suggestion
        if 'change_list' in suggestion.keys():
            updated_suggestion['json'] = \
                _convert_change_list_to_new_query_json(
                    request_json, suggestion['change_list'])
        updated_suggestions.append(updated_suggestion)
    suggestions = updated_suggestions

    # Converting into a json object and returning.
    final_table = query_table_dataframe.values.tolist()
    final_table.insert(0, list(query_table_dataframe.columns.values))

    json_ret = {'outputTable': final_table, 'suggestions': suggestions}
    json_string = json.dumps(json_ret)
    return json_string
def hello_http(request):
    """HTTP Cloud Function.
    Args:
        request (flask.Request): The request object.
        <http://flask.pocoo.org/docs/1.0/api/#flask.Request>
    Returns:
        The response text, or any set of values that can be turned into a
        Response object using `make_response`
        <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>.
    """
    request_json = request.get_json(silent=True)
    request_args = request.args

    # extracting the intent parameters from the json
    # (_get_value presumably returns None for missing keys — verify)
    intent = _get_value(request_json, 'intent')
    table = _get_value(request_json, 'table')
    metric = _get_value(request_json, 'metric')
    dimensions = _get_value(request_json, 'dimensions')
    summary_operator = _get_value(request_json, 'summaryOperator')
    slices = _get_value(request_json, 'slices')
    is_asc = _get_value(request_json, 'isAsc')
    k = _get_value(request_json, 'topKLimit')
    # NOTE(review): duplicate fetch of 'slices' — harmless but redundant.
    slices = _get_value(request_json, 'slices')
    slice_comparision_arg = _get_value(request_json, 'comparisonValue')
    time_comparision_arg = _get_value(request_json, 'compareDateRange')
    date = _get_value(request_json, 'dateRange')
    time_granularity = _get_value(request_json, 'timeGranularity')
    correlation_metrics = _get_value(request_json, 'correlationMetrics')
    rangeA1Notation = _get_value(request_json, 'rangeA1Notation')

    # Converting the list of list into a pandas dataframe.
    # Row 0 holds the headers; the remaining rows are data.
    query_table = []
    for row in range(1, len(table)):
        if row != 0:
            query_table.append(table[row])
    query_table_dataframe = pandas.DataFrame(query_table, columns=table[0])

    (all_dimensions,
     all_metrics) = _list_all_dimensions_metrics(query_table_dataframe,
                                                 dimensions, metric)

    # Remove empty columns
    query_table_dataframe = remove_empty_columns(query_table_dataframe)

    # Remove duplicate named columns
    query_table_dataframe = remove_duplicate_named_columns(
        query_table_dataframe)

    # Converting the variables that denote the
    # date range into the desired format.
    date_column_name = None
    date_range = None
    day_first = None
    if date != None:
        date_columns = request_json['dateColumns']
        date_column_name = date['dateCol']
        date_range = (date['dateStart'], date['dateEnd'])
        day_first = date_columns[date_column_name]['day_first']

    # Converting the Slices passed in the json into a
    # list of tuples (col, operator, val)
    slices_list = None
    if slices != None:
        slices_list = []
        for item in slices:
            val = item['sliceVal']
            col = item['sliceCol']
            operator = _str_to_filter_enum(item['sliceOp'])
            slices_list.append((col, operator, val))

    if dimensions == 'null':
        dimensions = None

    # Unpack the slice-compare arguments; these locals are only defined
    # when the corresponding intent argument was sent.
    if slice_comparision_arg is not None:
        slice_compare_column = slice_comparision_arg['comparisonColumn']
        slice1 = slice_comparision_arg['slice1']
        slice2 = slice_comparision_arg['slice2']

    # Unpack the time-compare arguments (two date windows over one column).
    if time_comparision_arg is not None:
        time_compare_column = time_comparision_arg['dateCol']
        date_range1 = (time_comparision_arg['dateStart1'],
                       time_comparision_arg['dateEnd1'])
        date_range2 = (time_comparision_arg['dateStart2'],
                       time_comparision_arg['dateEnd2'])
        day_first = request_json['dateColumns'][time_compare_column][
            'day_first']

    if metric == 'null':
        metric = None

    summary_operator = _str_to_summary_operator_enum(summary_operator)
    time_granularity = _str_to_time_granularity_enum(time_granularity)

    suggestions = []

    # Detect suspicious data points up front; merged into suggestions later.
    wrong_points_suggestion = wrong_points.wrong_points(query_table_dataframe)

    if intent == 'show':
        query_table_dataframe = show(query_table_dataframe,
                                     slices=slices_list,
                                     metric=metric,
                                     dimensions=dimensions,
                                     summary_operator=summary_operator,
                                     date_column_name=date_column_name,
                                     day_first=day_first,
                                     date_range=date_range)
        if summary_operator == enums.SummaryOperators.MEAN:
            suggestions.append(get_hardcoded_mean_vs_median_suggestion())

        # Replace each suggestion's change_list with the json of the new
        # query it proposes.
        updated_suggestions = []
        for suggestion in suggestions:
            updated_suggestion = suggestion
            if 'change_list' in suggestion.keys():
                updated_suggestion['json'] = func(request_json,
                                                  suggestion['change_list'])
            updated_suggestions.append(updated_suggestion)
        suggestions = updated_suggestions
    elif intent == 'topk':
        query_result = topk.topk(query_table_dataframe, metric, dimensions,
                                 is_asc, k,
                                 summary_operator=summary_operator,
                                 slices=slices_list,
                                 date_column_name=date_column_name,
                                 day_first=day_first,
                                 date_range=date_range)
        query_table_dataframe = query_result[0]
        suggestions = query_result[1]
        # Same change_list expansion, plus serializing the Oversights enum
        # member to its name for the JSON response.
        updated_suggestions = []
        for suggestion in suggestions:
            updated_suggestion = suggestion
            if 'change_list' in suggestion.keys():
                updated_suggestion['json'] = func(request_json,
                                                  suggestion['change_list'])
            updated_suggestion['oversight'] = updated_suggestion[
                'oversight'].name
            updated_suggestions.append(updated_suggestion)
        suggestions = updated_suggestions
    elif intent == 'slice_compare':
        query_result = slice_compare.slice_compare(
            query_table_dataframe, metric, all_dimensions, all_metrics,
            slice_compare_column, slice1, slice2, summary_operator,
            date_column_name=date_column_name,
            date_range=date_range,
            day_first=day_first,
            slices=slices_list,
            dimensions=dimensions)
        query_table_dataframe = query_result[0]
        suggestions = query_result[1]
        updated_suggestions = []
        for suggestion in suggestions:
            updated_suggestion = suggestion
            if 'change_list' in suggestion.keys():
                updated_suggestion['json'] = func(request_json,
                                                  suggestion['change_list'])
            updated_suggestion['oversight'] = updated_suggestion[
                'oversight'].name
            updated_suggestions.append(updated_suggestion)
        suggestions = updated_suggestions
    elif intent == 'time_compare':
        query_result = time_compare.time_compare(query_table_dataframe,
                                                 metric, all_dimensions,
                                                 time_compare_column,
                                                 date_range1, date_range2,
                                                 day_first, summary_operator,
                                                 slices=slices_list,
                                                 dimensions=dimensions)
        query_table_dataframe = query_result[0]
        suggestions = query_result[1]
        updated_suggestions = []
        for suggestion in suggestions:
            updated_suggestion = suggestion
            if 'change_list' in suggestion.keys():
                updated_suggestion['json'] = func(request_json,
                                                  suggestion['change_list'])
            updated_suggestion['oversight'] = updated_suggestion[
                'oversight'].name
            updated_suggestions.append(updated_suggestion)
        suggestions = updated_suggestions
    elif intent == 'correlation':
        query_table_dataframe = correlation.correlation(
            query_table_dataframe, correlation_metrics['metric1'],
            correlation_metrics['metric2'],
            slices=slices_list,
            date_column_name=date_column_name,
            day_first=day_first,
            date_range=date_range,
            dimensions=dimensions)
    elif intent == 'trend':
        query_table_dataframe = trend.trend(query_table_dataframe, metric,
                                            time_granularity,
                                            summary_operator,
                                            date_column_name=date_column_name,
                                            day_first=day_first,
                                            date_range=date_range,
                                            slices=slices_list)
    else:
        raise Exception("Intent name does not match")

    # Prepend the wrong-points warning (if any) so it appears first.
    if wrong_points_suggestion is not None:
        wrong_points_suggestion['oversight'] = wrong_points_suggestion[
            'oversight'].name
        suggestions = [wrong_points_suggestion] + suggestions

    final_table = []

    # converting into a json object and returning
    final_table = query_table_dataframe.values.tolist()
    final_table.insert(0, list(query_table_dataframe.columns.values))

    json_ret = {'outputTable': final_table, 'suggestions': suggestions}

    # When the sheet range (A1 notation) is known, also return per-row
    # annotations so the client can highlight cells in place.
    if rangeA1Notation is not None:
        all_row_labels = _get_all_row_labels(rangeA1Notation)
        all_column_labels = _get_all_column_labels(rangeA1Notation)
        cheader_to_clabel = _get_cheader_to_clabel(table, all_column_labels)

        if slices_list is not None:
            json_ret[
                'slicing_passed_list'] = insert_as_column.insert_as_column_show(
                    table, cheader_to_clabel,
                    all_row_labels[0], all_row_labels[-1],
                    all_column_labels[0], all_column_labels[-1],
                    slices=slices_list)

        if intent == 'topk' and summary_operator is None:
            # The marker column goes one past the last used column label.
            filter_column_label_number = _get_number_of_column_label(
                all_column_labels[-1]) + 1
            filter_column_label = _get_label_from_number(
                filter_column_label_number)
            json_ret[
                'list_topk_indices'] = insert_as_column.insert_as_column_topk_column(
                    table, cheader_to_clabel,
                    all_row_labels[0], all_row_labels[-1],
                    all_column_labels[0], all_column_labels[-1],
                    filter_column_label, metric, is_asc, k)

    json_string = json.dumps(json_ret)
    return json_string
# NOTE(review): Python 2 syntax (print statements) — this chunk predates the
# Python 3 code elsewhere in the file. `args`, `n` and `m` are defined
# outside this chunk.
k=int(args.k)
t=int(args.t)
b=int(args.b)
# Require the full argument list (script name + options == 11 argv entries).
if len(sys.argv) != 11:
    print "Incomplete arguments, type 'python run_ranked_insert.py -h' "
    exit()
# Calculate the budget:
budget = b
# Calculate the number of iterations
# (50 * ln(0.51*(n+1)) — presumably a probabilistic bound; TODO confirm)
iterations = 50*math.log(0.51*(n+1))
print "Number of vectors " + str(n)
print "Size of vectors " + str(m)
print "The 'k' in top k " + str(k)
print "Type of data " + str(t)
print "Budget " + str(b)
print "Number of iterations "+str(iterations)
print "** ** ** "
print "Result"
print "** ** ** "
# Initialize sorted vectors
vectors,means = data.getSortedVectors(n,m,10000,t)
tk.topk(vectors,means,k,budget,iterations,"runs")
exit()