def test_join_optimizable_3(optimize=False): """ :return: """ cleanup() print_test_separator("Starting test_optimizable_3, optimize = " + str(optimize)) cat = CSVCatalog.CSVCatalog() cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("nameLast", "text", True)) cds.append(CSVCatalog.ColumnDefinition("nameFirst", column_type="text")) cds.append(CSVCatalog.ColumnDefinition("birthCity", "text")) cds.append(CSVCatalog.ColumnDefinition("birthCountry", "text")) cds.append(CSVCatalog.ColumnDefinition("throws", column_type="text")) t = cat.create_table("people", data_dir + "People.csv", cds) t.define_index("pid_idx", ['playerID'], "INDEX") print("People table metadata = \n", json.dumps(t.describe_table(), indent=2)) cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("H", "number", True)) cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number")) cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True)) cds.append( CSVCatalog.ColumnDefinition("stint", column_type="number", not_null=True)) t = cat.create_table("batting", data_dir + "Batting.csv", cds) print("Batting table metadata = \n", json.dumps(t.describe_table(), indent=2)) t.define_index("pid_idx", ['playerID'], "INDEX") people_tbl = CSVTable.CSVTable("people") batting_tbl = CSVTable.CSVTable("batting") print("Loaded people table = \n", people_tbl) print("Loaded batting table = \n", batting_tbl) start_time = time.time() tmp = {"playerID": "willite01"} join_result = people_tbl.join(batting_tbl, ['playerID'], tmp, optimize=optimize) end_time = time.time() print("Result = \n", join_result) elapsed_time = end_time - start_time print("\n\nElapsed time = ", elapsed_time) print_test_separator("Complete test_join_optimizable_3")
def test_join_optimizable_2_op(optimize=False): ''' I used limit=15 here, because I just want to show the first 15 records and the whole table is so big ''' cleanup() print_test_separator("Starting test_join_optimizable_2_op, optimize = " + str(optimize)) cat = CSVCatalog.CSVCatalog() cds = [] cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("nameLast", "text", True)) cds.append(CSVCatalog.ColumnDefinition("nameFirst", column_type="text")) cds.append(CSVCatalog.ColumnDefinition("birthCity", "text")) cds.append(CSVCatalog.ColumnDefinition("birthCountry", "text")) cds.append(CSVCatalog.ColumnDefinition("throws", column_type="text")) t = cat.create_table( "people", data_dir + "People.csv", cds) print("People table metadata = \n", json.dumps(t.describe_table(), indent=2)) t.define_index("pid_idx", ['playerID'],"INDEX") cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("H", "number", True)) cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number")) cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("stint", column_type="number", not_null=True)) t = cat.create_table( "batting", data_dir + "Batting.csv", cds) print("Batting table metadata = \n", json.dumps(t.describe_table(), indent=2)) t.define_index("pid_idx", ['playerID'],"INDEX") people_tbl = CSVTable.CSVTable("people") batting_tbl = CSVTable.CSVTable("batting") start_time = time.time() tmp = {"playerID": "willite01"} join_result = people_tbl.join(batting_tbl,['playerID'], None, optimize=optimize) end_time = time.time() print('table = \n') print_out(join_result,limit=15) elapsed_time = end_time - start_time print("\n\nElapsed time = ", elapsed_time) print_test_separator("Complete test_join_optimizable_2_op")
def test_join_optimizable_4(optimize=False): """ Calling this with optimize=True turns on optimizations in the JOIN code. :return: """ cleanup() print_test_separator("Starting test_optimizable_4, optimize = " + str(optimize)) cat = CSVCatalog.CSVCatalog() cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("H", "number", True)) cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number")) cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True)) cds.append( CSVCatalog.ColumnDefinition("stint", column_type="number", not_null=True)) t = cat.create_table("batting", data_dir + "Batting.csv", cds) t.define_index("tid_idx", ['teamID'], "INDEX") print("Batting table metadata = \n", json.dumps(t.describe_table(), indent=2)) batting_tbl = CSVTable.CSVTable("batting") print("Loaded batting table = \n", batting_tbl) cds = [] cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("lgID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) p = cat.create_table("appearances", data_dir + "Appearances.csv", cds) print("Appearances table metadata = \n", json.dumps(p.describe_table(), indent=2)) appearances_tbl = CSVTable.CSVTable("appearances") print("Loaded appearances table = \n", appearances_tbl) start_time = time.time() join_result = batting_tbl.join(appearances_tbl, ['playerID', 'teamID'], {"teamID": "SEA"}, optimize=optimize) end_time = time.time() print("Result = \n", join_result) elapsed_time = end_time - start_time print("\n\nElapsed time = ", elapsed_time) print_test_separator("Complete test_join_optimizable_4")
def test_find_by_template_index_batting(): print_test_separator("Starting test_find_by_template_index_batting") cleanup() cat = CSVCatalog.CSVCatalog() cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("yearID", column_type="text", not_null=True)) cds.append(CSVCatalog.ColumnDefinition("stint", column_type="number", not_null=True)) cds.append(CSVCatalog.ColumnDefinition("H", column_type="number", not_null=False)) cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number", not_null=False)) t = cat.create_table("batting", '/Users/apple/Desktop/HW3/src/Data/Batting.csv', cds) t.define_index("team_year_idx", ['teamID', 'yearID'], "INDEX") batting_tbl = CSVTable.CSVTable("batting") start_time = time.time() tmp = {'playerID':'willite01','yearID':'1939','stint': '1','teamID':'BOS'} fields = ['playerID','yearID','stint','teamID'] result = batting_tbl.find_by_template(tmp,fields) print("Result = \n",json.dumps(result,indent=2)) print("table:\n") print_out(result) end_time = time.time() elapsed_time = end_time - start_time print("\nElapsed time to execute queries = ",elapsed_time) print_test_separator("Starting test_find_by_template_index_batting")
def test_find_by_template(): cleanup() print_test_separator("Starting test_find_by_template") cat = CSVCatalog.CSVCatalog() cds = [] cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("nameLast", "text", True)) cds.append(CSVCatalog.ColumnDefinition("nameFirst", column_type="text")) cds.append(CSVCatalog.ColumnDefinition("birthCity", "text")) cds.append(CSVCatalog.ColumnDefinition("birthCountry", "text")) cds.append(CSVCatalog.ColumnDefinition("throws", column_type="text")) t = cat.create_table("people", "../data/People.csv", cds) t.define_index("id_idx", ['nameLast'], "INDEX") print("People table metadata = \n", json.dumps(t.describe_table(), indent=2)) people_tbl = CSVTable.CSVTable("people") print("Loaded people table = \n", people_tbl) tries = 100 start_time = time.time() templ = {"nameLast": "Williams"} print("Starting test on find using indexed field, tmpl = ", json.dumps(templ)) for i in range(0, tries): result = people_tbl.find_by_template( templ, ['playerID', 'nameLast', 'nameFirst']) if i == 0: print("Sample result = ", json.dumps(result)) end_time = time.time() print("Elapsed time for ", tries, "lookups = ", end_time - start_time) tries = 1000 start_time = time.time() templ = {"nameFirst": "Ted"} print("\n\nStarting test on find using NON-indexed field, tmpl = ", json.dumps(templ)) for i in range(0, tries): result = people_tbl.find_by_template( templ, ['playerID', 'nameLast', 'nameFirst']) if i == 0: print("Sample result = ", json.dumps(result)) end_time = time.time() print("Elapsed time for ", tries, "lookups = ", end_time - start_time) print_test_separator("Complete test_find_by_template")
def test_find_by_template(tries): print_test_separator("Starting test_find_by_template") people_tbl = CSVTable.CSVTable("people") #print("rows:",people_tbl.__rows__) start_time = time.time() for i in range(0,tries): template = {"nameLast":"Williams","birthCity":"San Diego"} result = people_tbl.find_by_template(template,fields=["playerID","nameLast","birthCity"]) if i == 0: print("Testing result. Result = \n", json.dumps(result, indent=2)) print_out(result) end_time = time.time() print("\nElapsed time to execute", tries, "queries = ", end_time-start_time) print_test_separator("Finishing test_find_by_template")
def test_find_by_template_index(tries): '''I just tested that with __find_by_template_index__() directly ''' print_test_separator("Starting test_find_by_template_index") people_tbl = CSVTable.CSVTable("people") start_time = time.time() for i in range(0,tries): template = {"nameLast":"Williams","birthCity":"San Diego"} result = people_tbl.__find_by_template_index__(template,'ln_idx',fields=["playerID","nameLast","birthCity"]) if i == 0: print("Testing result. Result = \n", json.dumps(result, indent=2)) print_out(result) end_time = time.time() print("\nElapsed time to execute", tries, "queries = ", end_time-start_time) print_test_separator("Finishing test_find_by_template_index")
def test_select_with_index_3(): """ :return: """ print_test_separator("Starting test_select_with_index_3") cleanup() cat = CSVCatalog.CSVCatalog() cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("H", "number", True)) cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number")) cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("stint", column_type="number", not_null=True)) t = cat.create_table( "batting", data_dir + "Batting.csv", cds) t.define_index("playerID", ['playerID'], 'INDEX') # t.define_index("teamID_yearID", ['teamID', 'yearID'], "INDEX") print("Batting table metadata = \n", json.dumps(t.describe_table(), indent=2)) batting_tbl = CSVTable.CSVTable("batting") print("Loaded batting table = \n", batting_tbl) start_time = time.time() select_result = batting_tbl.find_by_template({'playerID': 'willste01'}) end_time = time.time() print("Result = \n", json.dumps(select_result, indent=2)) elapsed_time = end_time - start_time print("\n\nElapsed time = ", elapsed_time) print_test_separator("Complete test_select_with_index_3")
def test_select_no_index_1(): """ :return: """ print_test_separator("Starting test_select_no_index_1") cleanup() cat = CSVCatalog.CSVCatalog() cds = [] cds = [] cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True)) cds.append(CSVCatalog.ColumnDefinition("nameLast", "text", True)) cds.append(CSVCatalog.ColumnDefinition("nameFirst", column_type="text")) cds.append(CSVCatalog.ColumnDefinition("birthCity", "text")) cds.append(CSVCatalog.ColumnDefinition("birthCountry", "text")) cds.append(CSVCatalog.ColumnDefinition("throws", column_type="text")) t = cat.create_table( "people", data_dir + "People.csv", cds) print("People table metadata = \n", json.dumps(t.describe_table(), indent=2)) people_tbl = CSVTable.CSVTable("people") print("Loaded people table = \n", people_tbl) start_time = time.time() select_result = people_tbl.find_by_template({'nameLast': 'Williams', 'nameFirst': 'Ted'}) end_time = time.time() print("Result = \n", select_result) elapsed_time = end_time - start_time print("\n\nElapsed time = ", elapsed_time) print_test_separator("Complete test_select_no_index_1")