Example #1
0
def test_only_index_in_level():
    """
    A level that declares no columns (index only) must pass the structure
    check and come back unchanged from a write/read round trip.
    """
    rng = np.random.Generator(np.random.MT19937(seed=1337))

    # Level "B" has no columns; it carries nothing but the index.
    structure = {"A": {"height": {"dtype": "<i8"}}, "B": {}}

    level_a = spt.dict_to_recarray({
        spt.IDX: np.arange(10).astype(spt.IDX_DTYPE),
        "height": np.ones(10, dtype='<i8'),
    })
    # Five indices drawn from level "A" (numpy's default: with replacement).
    level_b = spt.dict_to_recarray({
        spt.IDX: rng.choice(level_a[spt.IDX], 5),
    })
    table = {"A": level_a, "B": level_b}

    spt.assert_table_has_structure(table=table, structure=structure)

    with tempfile.TemporaryDirectory(prefix='test_sparse_table') as tmp_dir:
        tar_path = os.path.join(tmp_dir, 'table_with_index_only_level.tar')
        spt.write(path=tar_path, table=table, structure=structure)
        restored = spt.read(path=tar_path, structure=structure)
        spt.assert_tables_are_equal(table, restored)
Example #2
0
def test_write_read_full_table():
    """
    Round-trip the example table through a file, once with an explicit
    structure and once without, and check the table is unchanged.
    """
    rng = np.random.Generator(np.random.MT19937(seed=1337))
    original = _make_example_table(prng=rng, size=1000 * 1000)

    with tempfile.TemporaryDirectory(prefix='test_sparse_table') as tmp_dir:
        # Round trip with the structure given explicitly.
        structured_path = os.path.join(tmp_dir, 'my_table.tar')
        spt.write(
            path=structured_path,
            table=original,
            structure=EXAMPLE_TABLE_STRUCTURE,
        )
        restored = spt.read(
            path=structured_path,
            structure=EXAMPLE_TABLE_STRUCTURE,
        )
        spt.assert_tables_are_equal(original, restored)

        # Round trip without passing a structure; the table read back must
        # still match the example structure.
        bare_path = os.path.join(tmp_dir, 'my_table_no_structure.tar')
        spt.write(path=bare_path, table=original)
        restored_bare = spt.read(path=bare_path)
        spt.assert_tables_are_equal(original, restored_bare)
        spt.assert_table_has_structure(
            table=restored_bare,
            structure=EXAMPLE_TABLE_STRUCTURE,
        )
Example #3
0
def _make_example_table(prng, size, start_index=0):
    """
    Children start in elementary school. 10% progress to high school, and 10%
    of those progress to university.
    At each point in their career statistics are collected that can be put to
    columns, while every child is represented by a line.
    Unfortunately, a typical example of a sparse table.

    Parameters
    ----------
    prng : numpy.random.Generator
        Source of randomness for the column values and the index sampling.
    size : int
        Number of rows in the 'elementary_school' level. The 'high_school'
        level gets ``size // 10`` rows and 'university' ``size // 100``.
    start_index : int, optional
        Offset added to the indices of the 'elementary_school' level.

    Returns
    -------
    dict
        Maps the level names 'elementary_school', 'high_school' and
        'university' to the record arrays built by ``spt.dict_to_recarray``.
    """

    t = {}
    t['elementary_school'] = spt.dict_to_recarray({
        spt.IDX:
        start_index + np.arange(size).astype(spt.IDX_DTYPE),
        'lunchpack_size':
        prng.uniform(size=size).astype('<f8'),
        'num_friends':
        prng.uniform(low=0, high=5, size=size).astype('<i8'),
    })
    high_school_size = size // 10
    t['high_school'] = spt.dict_to_recarray({
        # Sampled without replacement so every index occurs at most once.
        spt.IDX:
        prng.choice(t['elementary_school'][spt.IDX],
                    size=high_school_size,
                    replace=False),
        'time_spent_on_homework':
        100 + 100 * prng.uniform(size=high_school_size).astype('<f8'),
        'num_best_friends':
        prng.uniform(low=0, high=5, size=high_school_size).astype('<i8'),
    })
    university_size = high_school_size // 10
    t['university'] = spt.dict_to_recarray({
        spt.IDX:
        prng.choice(t['high_school'][spt.IDX],
                    size=university_size,
                    replace=False),
        # Scale BEFORE casting to integer. Casting the [0, 1) samples first
        # truncates all of them to 0, which made this column constant zero.
        'num_missed_classes':
        (100 * prng.uniform(size=university_size)).astype('<i8'),
        'num_fellow_students':
        prng.uniform(low=0, high=5, size=university_size).astype('<i8'),
    })
    spt.assert_structure_keys_are_valid(structure=EXAMPLE_TABLE_STRUCTURE)
    spt.assert_table_has_structure(table=t, structure=EXAMPLE_TABLE_STRUCTURE)
    return t
Example #4
0
def test_concatenate_several_tables():
    """
    Write many example tables with disjoint index ranges to separate files,
    concatenate the files, and check the level sizes and index uniqueness
    of the combined table.
    """
    prng = np.random.Generator(np.random.MT19937(seed=1337))

    block_size = 10 * 1000
    num_blocks = 100

    with tempfile.TemporaryDirectory(prefix='test_sparse_table') as tmp:
        paths = []
        for i in range(num_blocks):
            # start_index keeps the index ranges of the blocks disjoint.
            table_i = _make_example_table(prng=prng,
                                          size=block_size,
                                          start_index=i * block_size)
            paths.append(os.path.join(tmp, "{:06d}.tar".format(i)))
            spt.write(path=paths[-1],
                      table=table_i,
                      structure=EXAMPLE_TABLE_STRUCTURE)
        full_table = spt.concatenate_files(
            list_of_table_paths=paths,
            structure=EXAMPLE_TABLE_STRUCTURE,
        )
    spt.assert_table_has_structure(table=full_table,
                                   structure=EXAMPLE_TABLE_STRUCTURE)

    # Expected sizes are derived per block, mirroring how
    # _make_example_table shrinks each level by integer division, so they
    # stay correct when block_size is not a multiple of 100.
    high_school_per_block = block_size // 10
    university_per_block = high_school_per_block // 10
    expected_sizes = {
        'elementary_school': num_blocks * block_size,
        'high_school': num_blocks * high_school_per_block,
        'university': num_blocks * university_per_block,
    }

    assert (full_table['elementary_school'][spt.IDX].shape[0] ==
            expected_sizes['elementary_school'])
    assert (len(set(full_table['elementary_school'][spt.IDX])) ==
            expected_sizes['elementary_school']), "The indices must be uniqe"
    assert (full_table['high_school'][spt.IDX].shape[0] ==
            expected_sizes['high_school'])
    assert (len(set(full_table['high_school'][spt.IDX])) ==
            expected_sizes['high_school'])
    assert (full_table['university'][spt.IDX].shape[0] ==
            expected_sizes['university'])
    assert (len(set(full_table['university'][spt.IDX])) ==
            expected_sizes['university'])
Example #5
0
def test_from_records():
    """
    Build many small tables from lists of records, write each one to its own
    file, concatenate the files, and check the result against the structure.
    """
    prng = np.random.Generator(np.random.MT19937(seed=0))
    rnd = prng.uniform

    # Layout of the table: three levels, all columns are float64.
    structure = {
        "A": {"a": {"dtype": '<f8'}, "b": {"dtype": '<f8'}},
        "B": {"c": {"dtype": '<f8'}, "d": {"dtype": '<f8'}},
        "C": {"e": {"dtype": '<f8'}},
    }

    with tempfile.TemporaryDirectory(prefix='test_sparse_table') as tmp:

        num_jobs = 100
        n = 5
        job_result_paths = []
        for j in range(num_jobs):
            # Each job fills its own range of n consecutive indices.
            i = j * n

            # Level "A": one record for every index of the job.
            records_a = [
                {spt.IDX: i + offset, "a": rnd(), "b": rnd()}
                for offset in range(n)
            ]

            # Level "B": only the first and the fourth index of the job.
            records_b = [
                {spt.IDX: i + offset, "c": rnd(), "d": 5 * rnd()}
                for offset in (0, 3)
            ]

            # Level "C": occasionally a single record, otherwise empty.
            records_c = []
            if rnd() > 0.9:
                records_c.append({spt.IDX: i + 3, "e": -rnd()})

            table_records = {"A": records_a, "B": records_b, "C": records_c}

            table = spt.table_of_records_to_sparse_numeric_table(
                table_records=table_records, structure=structure)

            path = os.path.join(tmp, '{:06d}.tar'.format(j))
            job_result_paths.append(path)
            spt.write(path=path, table=table, structure=structure)

        # Reduce: merge the per-job files into a single table.
        full_table = spt.concatenate_files(
            list_of_table_paths=job_result_paths, structure=structure)

    spt.assert_table_has_structure(table=full_table, structure=structure)