def test_join_optimizable_3(optimize=False):
    """

    :return:
    """
    cleanup()
    print_test_separator("Starting test_optimizable_3, optimize = " +
                         str(optimize))

    cat = CSVCatalog.CSVCatalog()

    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("nameLast", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("nameFirst", column_type="text"))
    cds.append(CSVCatalog.ColumnDefinition("birthCity", "text"))
    cds.append(CSVCatalog.ColumnDefinition("birthCountry", "text"))
    cds.append(CSVCatalog.ColumnDefinition("throws", column_type="text"))

    t = cat.create_table("people", data_dir + "People.csv", cds)
    t.define_index("pid_idx", ['playerID'], "INDEX")
    print("People table metadata = \n", json.dumps(t.describe_table(),
                                                   indent=2))

    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("H", "number", True))
    cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number"))
    cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True))
    cds.append(
        CSVCatalog.ColumnDefinition("stint",
                                    column_type="number",
                                    not_null=True))

    t = cat.create_table("batting", data_dir + "Batting.csv", cds)
    print("Batting table metadata = \n",
          json.dumps(t.describe_table(), indent=2))
    t.define_index("pid_idx", ['playerID'], "INDEX")

    people_tbl = CSVTable.CSVTable("people")
    batting_tbl = CSVTable.CSVTable("batting")

    print("Loaded people table = \n", people_tbl)
    print("Loaded batting table = \n", batting_tbl)

    start_time = time.time()

    tmp = {"playerID": "willite01"}
    join_result = people_tbl.join(batting_tbl, ['playerID'],
                                  tmp,
                                  optimize=optimize)

    end_time = time.time()

    print("Result = \n", join_result)
    elapsed_time = end_time - start_time
    print("\n\nElapsed time = ", elapsed_time)

    print_test_separator("Complete test_join_optimizable_3")
def test_join_optimizable_2_op(optimize=False):
    '''
    I used limit=15 here, because I just want to show the first 15 records and the whole table is so big
    '''
    
    cleanup()
    print_test_separator("Starting test_join_optimizable_2_op, optimize = " + str(optimize))

    cat = CSVCatalog.CSVCatalog()
    cds = []

    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("nameLast", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("nameFirst", column_type="text"))
    cds.append(CSVCatalog.ColumnDefinition("birthCity", "text"))
    cds.append(CSVCatalog.ColumnDefinition("birthCountry", "text"))
    cds.append(CSVCatalog.ColumnDefinition("throws", column_type="text"))

    t = cat.create_table(
        "people",
        data_dir + "People.csv",
        cds)
    
    print("People table metadata = \n", json.dumps(t.describe_table(), indent=2))
    t.define_index("pid_idx", ['playerID'],"INDEX")
    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("H", "number", True))
    cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number"))
    cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("stint", column_type="number", not_null=True))

    t = cat.create_table(
        "batting",
        data_dir + "Batting.csv",
        cds)
    print("Batting table metadata = \n", json.dumps(t.describe_table(), indent=2))
    t.define_index("pid_idx", ['playerID'],"INDEX")

    people_tbl = CSVTable.CSVTable("people")
    batting_tbl = CSVTable.CSVTable("batting")


    start_time = time.time()

    tmp = {"playerID": "willite01"}
    join_result = people_tbl.join(batting_tbl,['playerID'], None, optimize=optimize)

    end_time = time.time()
    
    print('table = \n')
    print_out(join_result,limit=15)
    elapsed_time = end_time - start_time
    print("\n\nElapsed time = ", elapsed_time)

    print_test_separator("Complete test_join_optimizable_2_op")
def test_join_optimizable_4(optimize=False):
    """
    Calling this with optimize=True turns on optimizations in the JOIN code.
    :return:
    """
    cleanup()
    print_test_separator("Starting test_optimizable_4, optimize = " +
                         str(optimize))

    cat = CSVCatalog.CSVCatalog()

    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("H", "number", True))
    cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number"))
    cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True))
    cds.append(
        CSVCatalog.ColumnDefinition("stint",
                                    column_type="number",
                                    not_null=True))
    t = cat.create_table("batting", data_dir + "Batting.csv", cds)
    t.define_index("tid_idx", ['teamID'], "INDEX")
    print("Batting table metadata = \n",
          json.dumps(t.describe_table(), indent=2))
    batting_tbl = CSVTable.CSVTable("batting")
    print("Loaded batting table = \n", batting_tbl)

    cds = []
    cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("lgID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    p = cat.create_table("appearances", data_dir + "Appearances.csv", cds)
    print("Appearances table metadata = \n",
          json.dumps(p.describe_table(), indent=2))
    appearances_tbl = CSVTable.CSVTable("appearances")
    print("Loaded appearances table = \n", appearances_tbl)

    start_time = time.time()
    join_result = batting_tbl.join(appearances_tbl, ['playerID', 'teamID'],
                                   {"teamID": "SEA"},
                                   optimize=optimize)
    end_time = time.time()
    print("Result = \n", join_result)
    elapsed_time = end_time - start_time
    print("\n\nElapsed time = ", elapsed_time)

    print_test_separator("Complete test_join_optimizable_4")
def test_find_by_template_index_batting():
    print_test_separator("Starting test_find_by_template_index_batting")

    cleanup()

    cat = CSVCatalog.CSVCatalog()

    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("yearID", column_type="text", not_null=True))
    cds.append(CSVCatalog.ColumnDefinition("stint", column_type="number", not_null=True))
    cds.append(CSVCatalog.ColumnDefinition("H", column_type="number", not_null=False))
    cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number", not_null=False))

    t = cat.create_table("batting",
                         '/Users/apple/Desktop/HW3/src/Data/Batting.csv',
                         cds)
    t.define_index("team_year_idx", ['teamID', 'yearID'], "INDEX")
    batting_tbl = CSVTable.CSVTable("batting")

    start_time = time.time()

    tmp = {'playerID':'willite01','yearID':'1939','stint': '1','teamID':'BOS'}
    fields = ['playerID','yearID','stint','teamID']

    result = batting_tbl.find_by_template(tmp,fields)
    print("Result = \n",json.dumps(result,indent=2))
    print("table:\n")
    print_out(result)
    end_time = time.time()

    elapsed_time = end_time - start_time
    print("\nElapsed time to execute queries = ",elapsed_time)
    print_test_separator("Starting test_find_by_template_index_batting")
Esempio n. 5
0
def test_find_by_template():

    cleanup()
    print_test_separator("Starting test_find_by_template")

    cat = CSVCatalog.CSVCatalog()
    cds = []

    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("nameLast", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("nameFirst", column_type="text"))
    cds.append(CSVCatalog.ColumnDefinition("birthCity", "text"))
    cds.append(CSVCatalog.ColumnDefinition("birthCountry", "text"))
    cds.append(CSVCatalog.ColumnDefinition("throws", column_type="text"))

    t = cat.create_table("people", "../data/People.csv", cds)
    t.define_index("id_idx", ['nameLast'], "INDEX")
    print("People table metadata = \n", json.dumps(t.describe_table(),
                                                   indent=2))

    people_tbl = CSVTable.CSVTable("people")
    print("Loaded people table = \n", people_tbl)

    tries = 100
    start_time = time.time()
    templ = {"nameLast": "Williams"}
    print("Starting test on find using indexed field, tmpl = ",
          json.dumps(templ))
    for i in range(0, tries):
        result = people_tbl.find_by_template(
            templ, ['playerID', 'nameLast', 'nameFirst'])
        if i == 0:
            print("Sample result = ", json.dumps(result))
    end_time = time.time()
    print("Elapsed time for ", tries, "lookups = ", end_time - start_time)

    tries = 1000
    start_time = time.time()
    templ = {"nameFirst": "Ted"}
    print("\n\nStarting test on find using NON-indexed field, tmpl = ",
          json.dumps(templ))
    for i in range(0, tries):
        result = people_tbl.find_by_template(
            templ, ['playerID', 'nameLast', 'nameFirst'])
        if i == 0:
            print("Sample result = ", json.dumps(result))
    end_time = time.time()
    print("Elapsed time for ", tries, "lookups = ", end_time - start_time)

    print_test_separator("Complete test_find_by_template")
def test_find_by_template(tries):
    print_test_separator("Starting test_find_by_template")
    people_tbl = CSVTable.CSVTable("people")
    #print("rows:",people_tbl.__rows__)
    start_time = time.time()
    for i in range(0,tries):
        template = {"nameLast":"Williams","birthCity":"San Diego"}
        result = people_tbl.find_by_template(template,fields=["playerID","nameLast","birthCity"])
        if i == 0:
            print("Testing result. Result = \n", json.dumps(result, indent=2))
            print_out(result)
    end_time = time.time()
    print("\nElapsed time to execute", tries, "queries = ", end_time-start_time)
    print_test_separator("Finishing test_find_by_template")
def test_find_by_template_index(tries):
    '''I just tested that with __find_by_template_index__() directly '''
    print_test_separator("Starting test_find_by_template_index")
    people_tbl = CSVTable.CSVTable("people")
    start_time = time.time()
    for i in range(0,tries):
        template = {"nameLast":"Williams","birthCity":"San Diego"}
        result = people_tbl.__find_by_template_index__(template,'ln_idx',fields=["playerID","nameLast","birthCity"])
        if i == 0:
            print("Testing result. Result = \n", json.dumps(result, indent=2))
            print_out(result)
    end_time = time.time()
    print("\nElapsed time to execute", tries, "queries = ", end_time-start_time)
    print_test_separator("Finishing test_find_by_template_index")
Esempio n. 8
0
def test_select_with_index_3():
    """

    :return:
    """

    print_test_separator("Starting test_select_with_index_3")

    cleanup()

    cat = CSVCatalog.CSVCatalog()
    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("H", "number", True))
    cds.append(CSVCatalog.ColumnDefinition("AB", column_type="number"))
    cds.append(CSVCatalog.ColumnDefinition("teamID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("yearID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("stint", column_type="number", not_null=True))

    t = cat.create_table(
        "batting",
        data_dir + "Batting.csv",
        cds)
    t.define_index("playerID", ['playerID'], 'INDEX')
#    t.define_index("teamID_yearID", ['teamID', 'yearID'], "INDEX")
    print("Batting table metadata = \n", json.dumps(t.describe_table(), indent=2))



    batting_tbl = CSVTable.CSVTable("batting")

    print("Loaded batting table = \n", batting_tbl)

    start_time = time.time()

    select_result = batting_tbl.find_by_template({'playerID': 'willste01'})

    end_time = time.time()

    print("Result = \n", json.dumps(select_result, indent=2))
    elapsed_time = end_time - start_time
    print("\n\nElapsed time = ", elapsed_time)

    print_test_separator("Complete test_select_with_index_3")
Esempio n. 9
0
def test_select_no_index_1():
    """

    :return:
    """

    print_test_separator("Starting test_select_no_index_1")

    cleanup()

    cat = CSVCatalog.CSVCatalog()
    cds = []

    cds = []
    cds.append(CSVCatalog.ColumnDefinition("playerID", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("nameLast", "text", True))
    cds.append(CSVCatalog.ColumnDefinition("nameFirst", column_type="text"))
    cds.append(CSVCatalog.ColumnDefinition("birthCity", "text"))
    cds.append(CSVCatalog.ColumnDefinition("birthCountry", "text"))
    cds.append(CSVCatalog.ColumnDefinition("throws", column_type="text"))

    t = cat.create_table(
        "people",
        data_dir + "People.csv",
        cds)
    print("People table metadata = \n", json.dumps(t.describe_table(), indent=2))


    people_tbl = CSVTable.CSVTable("people")

    print("Loaded people table = \n", people_tbl)

    start_time = time.time()

    select_result = people_tbl.find_by_template({'nameLast': 'Williams', 'nameFirst': 'Ted'})

    end_time = time.time()

    print("Result = \n", select_result)
    elapsed_time = end_time - start_time
    print("\n\nElapsed time = ", elapsed_time)

    print_test_separator("Complete test_select_no_index_1")