Exemple #1
0
@time_this(lambda x: len(x))
def numba_fast_cusum(values: np.ndarray) -> np.ndarray:
    """
    Timed entry point for the ``_numba_fast_cusum`` implementation.

    The input array is forwarded unchanged to ``_numba_fast_cusum`` and
    whatever that helper produces is returned as-is.
    """
    result = _numba_fast_cusum(values)
    return result


@time_this(lambda x: len(x))
def np_fast_cusum(values: np.ndarray) -> np.ndarray:
    """
    Compute the cumulative sum of ``values`` with NumPy.

    The work is delegated to ``ndarray.cumsum``, which is O(n) and
    implemented in optimized C code.
    """
    running_total = values.cumsum()
    return running_total


# Benchmark driver: runs each cumulative-sum implementation on growing
# prefixes of one shared input, inside a single timed_report() context.
if __name__ == '__main__':

    exp_range = ExponentialRange(0, 8, 1 / 4)
    # One input is generated up front; every call below receives a prefix
    # slice ``values[:i]`` of it, so all implementations see the same data.
    values = random_numeric_list(exp_range.max)

    with timed_report():
        # NOTE(review): the argument 4 presumably caps the sizes used for
        # the slow variants -- confirm against ExponentialRange.iterator.
        for i in exp_range.iterator(4):
            slow_cusum(values[:i])

        for i in exp_range.iterator(4):
            slow_cusum_expanded(values[:i])

        # The remaining variants iterate over the full range.
        for i in exp_range.iterator():
            python_fast_cusum(values[:i])

        # The prefix is wrapped in a Series before being passed in.
        for i in exp_range.iterator():
            pandas_fast_cusum(pd.Series(values[:i]))
    return result_count


# Benchmark driver: runs the occurrence-counting implementations on growing
# prefixes of one shared word list, inside a single timed_report() context.
if __name__ == '__main__':

    # Manual smoke checks on a small fixed input (currently disabled):
    # the_words = ['A', 'A', 'A', 'B', 'B', 'B', 'C'] * 100
    # print(slow_count_occurrences(the_words))
    # print(fast_count_occurrences(the_words))
    # print(defaultdict_fast_count(the_words))
    # print(counter_fast_count(the_words))
    # print(np_fast_count(np.array(the_words)))
    # print(pd_fast_count(pd.Series(the_words)))
    # print(parallel_fast_count(the_words))

    exp_range = ExponentialRange(0, 7, 1 / 4)
    the_words = random_words(exp_range.max)
    # Array and Series versions of the same words are built once, up front.
    the_array = np.array(the_words)
    the_series = pd.Series(the_words)

    with timed_report():
        # NOTE(review): the argument 4 presumably caps the sizes used for
        # the slow variant -- confirm against ExponentialRange.iterator.
        for i in exp_range.iterator(4):
            slow_count_occurrences(the_words[:i])

        for i in exp_range.iterator():
            fast_count_occurrences(the_words[:i])

        for i in exp_range.iterator():
            defaultdict_fast_count(the_words[:i])

        for i in exp_range.iterator():
"""
Sizing information for dictionaries
"""
import sys

from utils.profiler import ExponentialRange
# Sweep over the sizes produced by the range and print, for each one, the
# shallow footprint (sys.getsizeof) of a dictionary with that many int keys.
exp_range = ExponentialRange(0, 7, 1 / 8)

for num_keys in exp_range.iterator():
    squares = {key: key * key for key in range(num_keys)}
    print(f'{len(squares):<8} keys {sys.getsizeof(squares):>12} bytes')

# Example output (exact byte counts depend on the Python implementation and version):
# 1        keys          248 bytes
# 2        keys          248 bytes
# 3        keys          248 bytes
# 4        keys          248 bytes
# 5        keys          248 bytes
# 7        keys          376 bytes
# 10       keys          376 bytes
# 13       keys          656 bytes
# 17       keys          656 bytes
# 23       keys         1192 bytes
# 31       keys         1192 bytes
# 42       keys         1192 bytes
# 56       keys         2288 bytes
# 74       keys         2288 bytes
# 100      keys         4712 bytes
# 133      keys         4712 bytes
# 177      keys         9328 bytes
    n = len(values)
    is_sorted = all(values[i] >= values[i + 1] for i in range(n - 1))
    assert is_sorted, 'values are not sorted.'


def assert_top_k(top_k_values, values):
    """
    Check that ``top_k_values`` holds the k largest items of ``values``.

    ``k`` is taken to be ``len(top_k_values)``. The candidate must be in
    descending order (checked by ``assert_sorted``) and must match, item by
    item, the k largest elements of ``values``.

    Unlike a plain ``count(v >= kth) == k`` test, this stays correct when
    the k-th value is duplicated in ``values`` (ties), and it rejects
    candidates containing items that do not come from ``values``.

    An empty ``top_k_values`` is accepted as a valid top-0 result.

    Raises:
        AssertionError: if the candidate is not sorted in descending order
            or is not the k largest items of ``values``.
    """
    # Local import so the block does not rely on the file-level imports.
    import heapq

    assert_sorted(top_k_values)
    k = len(top_k_values)
    # nlargest yields the k biggest items in descending order, so a correct
    # answer is equal to it element-wise even when values repeat.
    expected = heapq.nlargest(k, values)
    assert list(top_k_values) == expected, \
        'Something went wrong'


# Benchmark driver: runs each top-k implementation on growing prefixes of
# one shared input and validates every result with assert_top_k.
if __name__ == '__main__':

    exp_range = ExponentialRange(2, 7, 1 / 4)
    values = random_numeric_list(exp_range.max)

    with timed_report():
        for i in exp_range.iterator():
            # Each run gets its own copy of the prefix, so the shared input
            # is never handed to an implementation directly.
            _values = values[:i].copy()
            _top_k = naive_find_top_k(_values)
            assert_top_k(_top_k, _values)

        for i in exp_range.iterator():
            _values = values[:i].copy()
            _top_k = heap_find_top_k(_values)
            assert_top_k(_top_k, _values)

        for i in exp_range.iterator():
            _values = values[:i].copy()
Exemple #5
0
Generate sample CSV files for file-reading tests
"""
import numpy as np
import pandas as pd
import os
import string
import itertools

from utils.profiler import ExponentialRange

# Output location: <directory of this file>/../data/big_numeric_csv_files
src_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.join(src_dir, '..', 'data')
target_dir = os.path.join(data_dir, 'big_numeric_csv_files')

# Max size in rows as a power of ten
# NOTE(review): presumably exponents 0..7 in steps of 1/4 -- confirm
# against ExponentialRange.
exp_range = ExponentialRange(0, 7, 1 / 4)
num_cols = 10
# Column names are the first ``num_cols`` uppercase ASCII letters.
col_names = list(string.ascii_uppercase[:num_cols])

# One random table with exp_range.max rows is generated up front.
_data = np.random.random((exp_range.max, num_cols))
data = pd.DataFrame(_data, columns=col_names)

# Two-letter file codes ('AA', 'AB', ...), in itertools.product order.
# At most 26 * 26 = 676 codes exist; the slice keeps at most exp_range.max.
_letters = string.ascii_uppercase
_file_codes = itertools.product(_letters, repeat=2)
_file_codes = list(_file_codes)[:exp_range.max]
file_codes = [''.join(code) for code in _file_codes]

# The j-th value yielded by the range is paired with the j-th file code;
# the file name embeds both the code and the yielded value ``i``.
for j, i in enumerate(exp_range.iterator()):
    code = file_codes[j]
    filename = f'file_{code}_rows_{i}.csv'
    filepath = os.path.join(target_dir, filename)