Example #1
def make_large_ngroups_bmark(ngroups, func_name, func_args=''):
    bmark_name = 'groupby_ngroups_%s_%s' % (ngroups, func_name)
    stmt = _stmt_template % ('%s(%s)' % (func_name, func_args))
    setup = _setup_template % ngroups
    bmark = Benchmark(stmt, setup, start_date=START_DATE)
    # MUST set name
    bmark.name = bmark_name
    return bmark
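vbench collects Benchmark instances through their module-level names, which is why the factory must set bmark.name explicitly. A hedged sketch of how such a factory is typically driven (the parameter grids below are illustrative; _stmt_template, _setup_template and START_DATE are assumed to be defined elsewhere in the module):

# Illustrative driver; templates and START_DATE live elsewhere in the module.
for ngroups in [100, 10000]:
    for func_name in ['sum', 'mean', 'max']:
        bmark = make_large_ngroups_bmark(ngroups, func_name)
        globals()[bmark.name] = bmark  # publish so vbench can discover it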
Example #2
from datetime import datetime

common_setup = """from pandas_vb_common import *
from pandas import read_csv, read_table
"""

setup = common_setup + """
import os
N = 10000
K = 8
df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)))
df.to_csv('test.csv', sep='|')
"""

read_csv_vb = Benchmark("read_csv('test.csv', sep='|')",
                        setup,
                        cleanup="os.remove('test.csv')",
                        start_date=datetime(2012, 5, 7))

setup = common_setup + """
import os
N = 10000
K = 8
format = lambda x: '{:,}'.format(x)
df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)))
df = df.applymap(format)
df.to_csv('test.csv', sep='|')
"""

read_csv_thou_vb = Benchmark("read_csv('test.csv', sep='|', thousands=',')",
                             setup,
                             cleanup="os.remove('test.csv')",
Example #3
"""

#----------------------------------------------------------------------
# lookup

setup = common_setup + """
df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
df['foo'] = 'bar'

row_labels = list(df.index[::10])[:900]
col_labels = list(df.columns) * 100
row_labels_all = np.array(list(df.index) * len(df.columns), dtype='object')
col_labels_all = np.array(list(df.columns) * len(df.index), dtype='object')
"""

frame_fancy_lookup = Benchmark('df.lookup(row_labels, col_labels)', setup,
                               start_date=datetime(2012, 1, 12))

frame_fancy_lookup_all = Benchmark('df.lookup(row_labels_all, col_labels_all)',
                                   setup,
                                   start_date=datetime(2012, 1, 12))

#----------------------------------------------------------------------
# fillna in place

setup = common_setup + """
df = DataFrame(randn(10000, 100))
df.values[::2] = np.nan
"""

frame_fillna_inplace = Benchmark('df.fillna(0, inplace=True)', setup,
                                 start_date=datetime(2012, 4, 4))
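DataFrame.lookup returns one value per (row label, column label) pair as a 1-D ndarray; it was deprecated in pandas 1.2 and removed in 2.0, so this sketch assumes the API of the era:

import numpy as np
from pandas import DataFrame

df = DataFrame(np.arange(12).reshape(4, 3), columns=list('abc'))
vals = df.lookup([0, 2, 3], ['a', 'c', 'b'])
# equivalent to np.array([df.at[0, 'a'], df.at[2, 'c'], df.at[3, 'b']])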
Example #4
from vbench.api import Benchmark
from datetime import datetime

common_setup = """from pandas_vb_common import *
index = MultiIndex.from_arrays([np.arange(100).repeat(100),
                               np.roll(np.tile(np.arange(100), 100), 25)])
df = DataFrame(np.random.randn(10000, 4), index=index)
"""

reshape_unstack_simple = Benchmark('df.unstack(1)',
                                   common_setup,
                                   start_date=datetime(2011, 10, 1))

setup = common_setup + """
udf = df.unstack(1)
"""

reshape_stack_simple = Benchmark('udf.stack()',
                                 setup,
                                 start_date=datetime(2011, 10, 1))

setup = common_setup + """
def unpivot(frame):
    N, K = frame.shape
    data = {'value' : frame.values.ravel('F'),
            'variable' : np.asarray(frame.columns).repeat(N),
            'date' : np.tile(np.asarray(frame.index), K)}
    return DataFrame(data, columns=['date', 'variable', 'value'])
index = date_range('1/1/2000', periods=10000, freq='h')
df = DataFrame(randn(10000, 50), index=index, columns=range(50))
pdf = unpivot(df)
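The unpivot helper above is essentially the built-in melt applied to a reset index (the id column name differs slightly); for illustration:

import numpy as np
from pandas import DataFrame, date_range, melt

index = date_range('1/1/2000', periods=4, freq='h')
df = DataFrame(np.random.randn(4, 2), index=index, columns=['A', 'B'])
pdf = melt(df.reset_index(), id_vars='index')  # columns: index, variable, value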
Example #5
index = date_range('20000101',periods=N,freq='H')
df2 = DataFrame(dict([ ("float{0}".format(i),randn(N)) for i in range(C) ]),
                index=index)
df2['object'] = ['%08x'%randrange(16**8) for _ in range(N)]
remove(f)
"""

#----------------------------------------------------------------------
# msgpack

setup = common_setup + """
df2.to_msgpack(f)
"""

packers_read_pack = Benchmark("pd.read_msgpack(f)",
                              setup,
                              start_date=start_date)

setup = common_setup + """
"""

packers_write_pack = Benchmark("df2.to_msgpack(f)",
                               setup,
                               cleanup="remove(f)",
                               start_date=start_date)

#----------------------------------------------------------------------
# pickle

setup = common_setup + """
df2.to_pickle(f)
Example #6
#----------------------------------------------------------------------
# get from a store

setup1 = common_setup + """
index = tm.makeStringIndex(25000)
df = DataFrame({'float1' : randn(25000),
                'float2' : randn(25000)},
               index=index)
remove(f)
store = HDFStore(f)
store.put('df1',df)
"""

read_store = Benchmark("store.get('df1')",
                       setup1,
                       cleanup="store.close()",
                       start_date=start_date)
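For reference, a minimal HDFStore round-trip covering what read_store and write_store measure (requires the PyTables dependency; the file name is illustrative):

import numpy as np
import pandas as pd

store = pd.HDFStore('example.h5')
store.put('df1', pd.DataFrame({'float1': np.random.randn(10)}))  # write path
df_back = store.get('df1')  # read path timed by read_store
store.close()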

#----------------------------------------------------------------------
# write to a store

setup2 = common_setup + """
index = tm.makeStringIndex(25000)
df = DataFrame({'float1' : randn(25000),
                'float2' : randn(25000)},
               index=index)
remove(f)
store = HDFStore(f)
"""

write_store = Benchmark("store.put('df2',df)",
Example #7
setup = """from pandas_vb_common import *
import pandas as pd
N = 500000
df_int64 = DataFrame(dict(A = np.arange(N,dtype='int64'), B = np.arange(N,dtype='int64')))
df_int32 = DataFrame(dict(A = np.arange(N,dtype='int32'), B = np.arange(N,dtype='int32')))
df_uint32 = DataFrame(dict(A = np.arange(N,dtype='uint32'), B = np.arange(N,dtype='uint32')))
df_float64 = DataFrame(dict(A = np.arange(N,dtype='float64'), B = np.arange(N,dtype='float64')))
df_float32 = DataFrame(dict(A = np.arange(N,dtype='float32'), B = np.arange(N,dtype='float32')))
df_datetime64 = DataFrame(dict(A = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms'),
                               B = pd.to_datetime(np.arange(N,dtype='int64'),unit='ms')))
df_timedelta64 = DataFrame(dict(A = df_datetime64['A']-df_datetime64['B'],
                                B = df_datetime64['B']))
"""

dtype_infer_int64 = Benchmark('df_int64["A"] + df_int64["B"]',
                              setup,
                              start_date=datetime(2014, 1, 1))
dtype_infer_int32 = Benchmark('df_int32["A"] + df_int32["B"]',
                              setup,
                              start_date=datetime(2014, 1, 1))
dtype_infer_uint32 = Benchmark('df_uint32["A"] + df_uint32["B"]',
                               setup,
                               start_date=datetime(2014, 1, 1))
dtype_infer_float64 = Benchmark('df_float64["A"] + df_float64["B"]',
                                setup,
                                start_date=datetime(2014, 1, 1))
dtype_infer_float32 = Benchmark('df_float32["A"] + df_float32["B"]',
                                setup,
                                start_date=datetime(2014, 1, 1))
dtype_infer_datetime64 = Benchmark('df_datetime64["A"] - df_datetime64["B"]',
                                   setup,
Example #8
    return arr

# aggregate multiple columns
df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
                'key2' : get_test_data(ngroups=ngroups),
                'data1' : np.random.randn(N),
                'data2' : np.random.randn(N)})
def f():
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())

simple_series = Series(np.random.randn(N))
key1 = df['key1']
"""

stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())"
groupby_multi_python = Benchmark(stmt1, setup,
                                 start_date=datetime(2011, 7, 1))

stmt3 = "df.groupby(['key1', 'key2']).sum()"
groupby_multi_cython = Benchmark(stmt3, setup,
                                 start_date=datetime(2011, 7, 1))

stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)"
groupby_multi_series_op = Benchmark(stmt, setup,
                                    start_date=datetime(2011, 8, 1))

groupby_series_simple_cython = \
    Benchmark('simple_series.groupby(key1).sum()', setup,
              start_date=datetime(2011, 3, 1))
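The first two statements contrast the per-group Python path (a lambda passed to agg) with the vectorized Cython path (.sum()); both produce the same numbers, as this small check illustrates:

import numpy as np
from pandas import DataFrame

df = DataFrame({'key1': np.random.randint(0, 100, 10000),
                'key2': np.random.randint(0, 100, 10000),
                'data1': np.random.randn(10000)})
g = df.groupby(['key1', 'key2'])['data1']
python_path = g.agg(lambda x: x.values.sum())  # one Python call per group
cython_path = g.sum()                          # single Cython kernel
assert np.allclose(python_path, cython_path)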

#----------------------------------------------------------------------
# 2d grouping, aggregate many columns
Example #9
#----------------------------------------------------------------------
# lookup

setup = common_setup + """
df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
df['foo'] = 'bar'

row_labels = list(df.index[::10])[:900]
col_labels = list(df.columns) * 100
row_labels_all = np.array(list(df.index) * len(df.columns), dtype='object')
col_labels_all = np.array(list(df.columns) * len(df.index), dtype='object')
"""

frame_fancy_lookup = Benchmark('df.lookup(row_labels, col_labels)',
                               setup,
                               start_date=datetime(2012, 1, 12))

frame_fancy_lookup_all = Benchmark('df.lookup(row_labels_all, col_labels_all)',
                                   setup,
                                   start_date=datetime(2012, 1, 12))

#----------------------------------------------------------------------
# fillna in place

setup = common_setup + """
df = DataFrame(randn(10000, 100))
df.values[::2] = np.nan
"""

frame_fillna_inplace = Benchmark('df.fillna(0, inplace=True)',
Example #10

#### test all groupby funcs ####

setup = basic + """

@test_parallel(num_threads=2)
def pg2():
    df.groupby('key')['data'].func()

"""

for f in ['sum','prod','var','count','min','max','mean','last']:

    name = "nogil_groupby_{f}_2".format(f=f)
    bmark = Benchmark('pg2()', setup.replace('func',f), start_date=datetime(2015, 1, 1))
    bmark.name = name
    globals()[name] = bmark

del bmark


#### test take_1d ####
setup = basic + """
from pandas.core import common as com

N = 1e7
df = DataFrame({'int64' : np.arange(N,dtype='int64'),
                'float64' : np.arange(N,dtype='float64')})
indexer = np.arange(100,len(df)-100)
Example #11
#----------------------------------------------------------------------
# read_csv

setup1 = common_setup + """
index = tm.makeStringIndex(10000)
df = DataFrame({'float1' : randn(10000),
                'float2' : randn(10000),
                'string1' : ['foo'] * 10000,
                'bool1' : [True] * 10000,
                'int1' : np.random.randint(0, 100000, size=10000)},
               index=index)
df.to_csv('__test__.csv')
"""

read_csv_standard = Benchmark("read_csv('__test__.csv')",
                              setup1,
                              start_date=datetime(2011, 9, 15))

#----------------------------------
# skiprows

setup1 = common_setup + """
index = tm.makeStringIndex(20000)
df = DataFrame({'float1' : randn(20000),
                'float2' : randn(20000),
                'string1' : ['foo'] * 20000,
                'bool1' : [True] * 20000,
                'int1' : np.random.randint(0, 200000, size=20000)},
               index=index)
df.to_csv('__test__.csv')
"""
Example #12
@test_parallel(num_threads=2)
def pg2():
    f()

@test_parallel(num_threads=4)
def pg4():
    f()

@test_parallel(num_threads=8)
def pg8():
    f()

"""

nogil_groupby_sum_4 = Benchmark(
    'pg4()', setup,
    start_date=datetime(2015, 1, 1))

nogil_groupby_sum_8 = Benchmark(
    'pg8()', setup,
    start_date=datetime(2015, 1, 1))
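pg2/pg4/pg8 are wrapped by the test_parallel decorator from pandas.util.testing, which runs the function simultaneously in N threads so these benchmarks gauge throughput while the GIL is released. A rough stand-in, not the actual pandas implementation:

import threading

def test_parallel(num_threads=2):
    # Run the decorated function in num_threads threads at once
    # and wait for all of them to finish.
    def decorate(func):
        def inner(*args, **kwargs):
            threads = [threading.Thread(target=func, args=args, kwargs=kwargs)
                       for _ in range(num_threads)]
            for t in threads:
                t.start()
            for t in threads:
                t.join()
        return inner
    return decorate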


#### test all groupby funcs ####

setup = basic + """

@test_parallel(num_threads=2)
def pg2():
    df.groupby('key')['data'].func()
Example #13
from vbench.api import Benchmark
from datetime import datetime

common_setup = """from pandas_vb_common import *
from datetime import timedelta
import pandas._tseries as lib
N = 1000000

try:
    rng = date_range('1/1/2000', periods=N, freq='min')
except NameError:
    rng = DateRange('1/1/2000', periods=N, offset=datetools.Minute())
    date_range = DateRange

ts = Series(np.random.randn(N), index=rng)

def replace_slow(ser, old, new):
    lib.slow_replace(ser.values, old, new)
    return ser
"""

replace_fillna = Benchmark('ts.fillna(0., inplace=True)',
                           common_setup,
                           start_date=datetime(2012, 4, 4))
replace_replacena = Benchmark('ts.replace(np.nan, 0., inplace=True)',
                              common_setup,
                              start_date=datetime(2012, 5, 15))
replace_putmask = Benchmark('replace_slow(ts, np.nan, 0.)',
                            common_setup,
                            start_date=datetime(2012, 5, 15))
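All three statements fill missing values with 0.0 through different code paths; fillna and replace agree on the result:

import numpy as np
from pandas import Series

ts = Series([1.0, np.nan, 3.0])
assert ts.fillna(0.).equals(ts.replace(np.nan, 0.))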
Example #14
from datetime import datetime

common_setup = """from pandas_vb_common import *
"""

setup = common_setup + """
s1 = Series(np.random.randn(10000))
s2 = Series(np.random.randint(1, 10, 10000))
s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
values = [1,2]
s4 = s3.astype('object')
"""

series_nlargest1 = Benchmark(
    's1.nlargest(3, take_last=True);'
    's1.nlargest(3, take_last=False)',
    setup,
    start_date=datetime(2014, 1, 25))
series_nlargest2 = Benchmark(
    's2.nlargest(3, take_last=True);'
    's2.nlargest(3, take_last=False)',
    setup,
    start_date=datetime(2014, 1, 25))

series_nsmallest1 = Benchmark(
    's1.nsmallest(3, take_last=True);'
    's1.nsmallest(3, take_last=False)',
    setup,
    start_date=datetime(2014, 1, 25))

series_nsmallest2 = Benchmark(
Example #15
#----------------------------------------------------------------------
# read_csv

setup1 = common_setup + """
index = [rands(10) for _ in xrange(10000)]
df = DataFrame({'float1' : randn(10000),
                'float2' : randn(10000),
                'string1' : ['foo'] * 10000,
                'bool1' : [True] * 10000,
                'int1' : np.random.randint(0, 100000, size=10000)},
               index=index)
df.to_csv('__test__.csv')
"""

read_csv_standard = Benchmark("read_csv('__test__.csv')", setup1,
                              start_date=datetime(2011, 9, 15))


#----------------------------------------------------------------------
# write_csv

setup2 = common_setup + """
index = [rands(10) for _ in xrange(10000)]
df = DataFrame({'float1' : randn(10000),
                'float2' : randn(10000),
                'string1' : ['foo'] * 10000,
                'bool1' : [True] * 10000,
                'int1' : np.random.randint(0, 100000, size=10000)},
               index=index)
"""
Example #16
    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)

    random.shuffle(arr)
    return arr

# aggregate multiple columns
df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
                'key2' : get_test_data(ngroups=ngroups),
                'data1' : np.random.randn(N),
                'data2' : np.random.randn(N)})
def f():
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())
"""

stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())"
groupby_multi_python = Benchmark(stmt1, setup,
                                 name="groupby_multi_python",
                                 start_date=datetime(2011, 7, 1))

stmt3 = "df.groupby(['key1', 'key2']).sum()"
groupby_multi_cython = Benchmark(stmt3, setup,
                                 name="groupby_multi_cython",
                                 start_date=datetime(2011, 7, 1))

stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)"
groupby_multi_series_op = Benchmark(stmt, setup,
                                    name="groupby_multi_series_op",
                                    start_date=datetime(2011, 8, 1))
Example #17
    random.shuffle(arr)
    return arr

# aggregate multiple columns
df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
                'key2' : get_test_data(ngroups=ngroups),
                'data1' : np.random.randn(N),
                'data2' : np.random.randn(N)})
def f():
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())

simple_series = Series(np.random.randn(N))
key1 = df['key1']
"""

stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())"
groupby_multi_python = Benchmark(stmt1, setup, start_date=datetime(2011, 7, 1))

stmt3 = "df.groupby(['key1', 'key2']).sum()"
groupby_multi_cython = Benchmark(stmt3, setup, start_date=datetime(2011, 7, 1))

stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)"
groupby_multi_series_op = Benchmark(stmt,
                                    setup,
                                    start_date=datetime(2011, 8, 1))

groupby_series_simple_cython = \
    Benchmark('simple_series.groupby(key1).sum()', setup,
              start_date=datetime(2011, 3, 1))
Example #18
"""

setup = common_setup + """
import string
import itertools as IT

def make_series(letters, strlen, size):
    return Series(
        [str(x) for x in np.fromiter(IT.cycle(letters), count=size*strlen, dtype='|S1')
        .view('|S{}'.format(strlen))])

many = make_series('matchthis'+string.ascii_uppercase, strlen=19, size=10000) # 31% matches
few = make_series('matchthis'+string.ascii_uppercase*42, strlen=19, size=10000) # 1% matches
"""

strings_cat = Benchmark("many.str.cat(sep=',')", setup)
strings_title = Benchmark("many.str.title()", setup)
strings_count = Benchmark("many.str.count('matchthis')", setup)
strings_contains_many = Benchmark("many.str.contains('matchthis')", setup)
strings_contains_few = Benchmark("few.str.contains('matchthis')", setup)
strings_contains_many_noregex = Benchmark(
    "many.str.contains('matchthis', regex=False)", setup)
strings_contains_few_noregex = Benchmark(
    "few.str.contains('matchthis', regex=False)", setup)
strings_startswith = Benchmark("many.str.startswith('matchthis')", setup)
strings_endswith = Benchmark("many.str.endswith('matchthis')", setup)
strings_lower = Benchmark("many.str.lower()", setup)
strings_upper = Benchmark("many.str.upper()", setup)
strings_replace = Benchmark("many.str.replace(r'(matchthis)', r'\\1\\1')", setup)
strings_repeat = Benchmark(
    "many.str.repeat(list(IT.islice(IT.cycle(range(1,4)),len(many))))", setup)
Example #19

#-------------------------------------------------------------------------------
# to_sql

setup = common_setup + """
index = tm.makeStringIndex(10000)
df = DataFrame({'float1' : randn(10000),
                'float2' : randn(10000),
                'string1' : ['foo'] * 10000,
                'bool1' : [True] * 10000,
                'int1' : np.random.randint(0, 100000, size=10000)},
               index=index)
"""

sql_write_sqlalchemy = Benchmark("df.to_sql('test1', engine, if_exists='replace')",
                                 setup, start_date=sdate)

sql_write_fallback = Benchmark("df.to_sql('test1', con, if_exists='replace')",
                               setup, start_date=sdate)
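engine, con, and sdate come from the elided common_setup; a plausible reconstruction (an assumption, mirroring the in-memory SQLite setup such suites commonly use) would be:

import sqlite3
from datetime import datetime
from sqlalchemy import create_engine

engine = create_engine('sqlite:///:memory:')  # SQLAlchemy path for to_sql
con = sqlite3.connect(':memory:')             # DBAPI fallback path for to_sql
sdate = datetime(2014, 6, 1)                  # illustrative start date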


#-------------------------------------------------------------------------------
# read_sql

setup = common_setup + """
index = tm.makeStringIndex(10000)
df = DataFrame({'float1' : randn(10000),
                'float2' : randn(10000),
                'string1' : ['foo'] * 10000,
                'bool1' : [True] * 10000,
                'int1' : np.random.randint(0, 100000, size=10000)},
Example #20
index = date_range('20000101',periods=50000,freq='H')
df = DataFrame({'float1' : randn(50000),
                'float2' : randn(50000)},
               index=index)
remove(f)
"""

#----------------------------------------------------------------------
# msgpack

setup = common_setup + """
df.to_msgpack(f)
"""

packers_read_pack = Benchmark("pd.read_msgpack(f)",
                              setup,
                              start_date=start_date)

setup = common_setup + """
"""

packers_write_pack = Benchmark("df.to_msgpack(f)",
                               setup,
                               cleanup="remove(f)",
                               start_date=start_date)

#----------------------------------------------------------------------
# pickle

setup = common_setup + """
df.to_pickle(f)
Example #21
#----------------------------------------------------------------------
# read_csv

setup1 = common_setup + """
index = [rands(10) for _ in xrange(10000)]
df = DataFrame({'float1' : randn(10000),
                'float2' : randn(10000),
                'string1' : ['foo'] * 10000,
                'bool1' : [True] * 10000,
                'int1' : np.random.randint(0, 100000, size=10000)},
               index=index)
df.to_csv('__test__.csv')
"""

read_csv_standard = Benchmark("read_csv('__test__.csv')",
                              setup1,
                              start_date=datetime(2011, 9, 15))

#----------------------------------------------------------------------
# write_csv

setup2 = common_setup + """
index = [rands(10) for _ in xrange(10000)]
df = DataFrame({'float1' : randn(10000),
                'float2' : randn(10000),
                'string1' : ['foo'] * 10000,
                'bool1' : [True] * 10000,
                'int1' : np.random.randint(0, 100000, size=10000)},
               index=index)
"""
Example #22
from vbench.api import Benchmark
from datetime import datetime

common_setup = """from .pandas_vb_common import *
from pandas import to_timedelta
"""

#----------------------------------------------------------------------
# conversion

setup = common_setup + """
arr = np.random.randint(0,1000,size=10000)
"""

stmt = "to_timedelta(arr,unit='s')"
timedelta_convert_int = Benchmark(stmt, setup, start_date=datetime(2014, 1, 1))

setup = common_setup + """
arr = np.random.randint(0,1000,size=10000)
arr = [ '{0} days'.format(i) for i in arr ]
"""

stmt = "to_timedelta(arr)"
timedelta_convert_string = Benchmark(stmt,
                                     setup,
                                     start_date=datetime(2014, 1, 1))
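to_timedelta accepts both integer arrays with an explicit unit and already-formatted strings; a quick check of the two conversion paths:

import numpy as np
from pandas import Timedelta, to_timedelta

assert to_timedelta(np.array([90]), unit='s')[0] == Timedelta(seconds=90)
assert to_timedelta(['2 days'])[0] == Timedelta(days=2)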

setup = common_setup + """
arr = np.random.randint(0,60,size=10000)
arr = [ '00:00:{0:02d}'.format(i) for i in arr ]
"""
Example #23
from vbench.api import Benchmark
from datetime import datetime

common_setup = """from pandas_vb_common import *
"""

#----------------------------------------------------------------------
# shift

setup = common_setup + """
index = date_range(start="2000", freq="D", periods=1000)
panel = Panel(np.random.randn(100, len(index), 1000))
"""

panel_shift = Benchmark('panel.shift(1)', setup,
                               start_date=datetime(2012, 1, 12))

panel_shift_minor = Benchmark('panel.shift(1, axis="minor")', setup,
                               start_date=datetime(2012, 1, 12))

panel_pct_change_major = Benchmark('panel.pct_change(1, axis="major")', setup,
                                   start_date=datetime(2014, 4, 19))

panel_pct_change_minor = Benchmark('panel.pct_change(1, axis="minor")', setup,
                                   start_date=datetime(2014, 4, 19))

panel_pct_change_items = Benchmark('panel.pct_change(1, axis="items")', setup,
                                   start_date=datetime(2014, 4, 19))
Example #24
    return arr

# aggregate multiple columns
df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
                'key2' : get_test_data(ngroups=ngroups),
                'data1' : np.random.randn(N),
                'data2' : np.random.randn(N)})
def f():
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())

simple_series = Series(np.random.randn(N))
key1 = df['key1']
"""

stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())"
groupby_multi_python = Benchmark(stmt1, setup, start_date=datetime(2011, 7, 1))

stmt3 = "df.groupby(['key1', 'key2']).sum()"
groupby_multi_cython = Benchmark(stmt3, setup, start_date=datetime(2011, 7, 1))

stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)"
groupby_multi_series_op = Benchmark(stmt,
                                    setup,
                                    start_date=datetime(2011, 8, 1))

groupby_series_simple_cython = \
    Benchmark('simple_series.groupby(key1).sum()', setup,
              start_date=datetime(2011, 3, 1))

#----------------------------------------------------------------------
# 2d grouping, aggregate many columns
Example #25
"""

#----------------------------------------------------------------------
# lookup

setup = common_setup + """
df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh'))
df['foo'] = 'bar'

row_labels = list(df.index[::10])[:900]
col_labels = list(df.columns) * 100
row_labels_all = list(df.index) * len(df.columns)
col_labels_all = list(df.columns) * len(df.index)
"""

frame_fancy_lookup = Benchmark('df.lookup(row_labels, col_labels)', setup,
                               start_date=datetime(2012, 1, 12))

frame_fancy_lookup_all = Benchmark('df.lookup(row_labels_all, col_labels_all)',
                                   setup,
                                   start_date=datetime(2012, 1, 12))

#----------------------------------------------------------------------
# fillna in place

setup = common_setup + """
df = DataFrame(randn(10000, 100))
df.values[::2] = np.nan
"""

frame_fillna_inplace = Benchmark('df.fillna(0, inplace=True)', setup,
                                 start_date=datetime(2012, 4, 4))
Example #26
if hasattr(Series, 'convert'):
    Series.resample = Series.convert

ts = Series(np.random.randn(N), index=rng)
"""

#----------------------------------------------------------------------
# Lookup value in large time series, hash map population

setup = common_setup + """
rng = date_range('1/1/2000', periods=1500000, freq='s')
ts = Series(1, index=rng)
"""

stmt = "ts[ts.index[len(ts) // 2]]; ts.index._cleanup()"
timeseries_large_lookup_value = Benchmark(stmt, setup,
                                          start_date=datetime(2012, 1, 1))

#----------------------------------------------------------------------
# Test slice minutely series

timeseries_slice_minutely = Benchmark('ts[:10000]', common_setup)

#----------------------------------------------------------------------
# Test conversion

setup = common_setup + """

"""

timeseries_1min_5min_ohlc = Benchmark("ts[:10000].resample('5min', how='ohlc')",
                                      common_setup,
Example #27
if hasattr(Series, 'convert'):
    Series.resample = Series.convert

ts = Series(np.random.randn(N), index=rng)
"""

#----------------------------------------------------------------------
# Lookup value in large time series, hash map population

setup = common_setup + """
rng = date_range('1/1/2000', periods=1500000, freq='s')
ts = Series(1, index=rng)
"""

stmt = "ts[ts.index[len(ts) // 2]]; ts.index._cleanup()"
timeseries_large_lookup_value = Benchmark(stmt, setup,
                                          start_date=datetime(2012, 1, 1))

#----------------------------------------------------------------------
# Test slice minutely series

timeseries_slice_minutely = Benchmark('ts[:10000]', common_setup)

#----------------------------------------------------------------------
# Test conversion

setup = common_setup + """

"""

timeseries_1min_5min_ohlc = Benchmark(
    "ts[:10000].resample('5min', how='ohlc')",
Example #28
    return arr

# aggregate multiple columns
df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
                'key2' : get_test_data(ngroups=ngroups),
                'data1' : np.random.randn(N),
                'data2' : np.random.randn(N)})
def f():
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())

simple_series = Series(np.random.randn(N))
key1 = df['key1']
"""

stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())"
groupby_multi_python = Benchmark(stmt1, setup, start_date=datetime(2011, 7, 1))

stmt3 = "df.groupby(['key1', 'key2']).sum()"
groupby_multi_cython = Benchmark(stmt3, setup, start_date=datetime(2011, 7, 1))

stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)"
groupby_multi_series_op = Benchmark(stmt,
                                    setup,
                                    start_date=datetime(2011, 8, 1))

groupby_series_simple_cython = \
    Benchmark('simple_series.groupby(key1).sum()', setup,
              start_date=datetime(2011, 3, 1))

stmt4 = "df.groupby('key1').rank(pct=True)"
groupby_series_simple_rank = Benchmark(stmt4,
Example #29
from vbench.api import Benchmark
from datetime import datetime

common_setup = """from .pandas_vb_common import *
"""

setup = common_setup + """
s1 = Series(np.random.randn(10000))
s2 = Series(np.random.randint(1, 10, 10000))
s3 = Series(np.random.randint(1, 10, 100000)).astype('int64')
values = [1,2]
s4 = s3.astype('object')
"""

series_nlargest1 = Benchmark("s1.nlargest(3, keep='last');"
                             "s1.nlargest(3, keep='first')",
                             setup,
                             start_date=datetime(2014, 1, 25))
series_nlargest2 = Benchmark("s2.nlargest(3, keep='last');"
                             "s2.nlargest(3, keep='first')",
                             setup,
                             start_date=datetime(2014, 1, 25))

series_nsmallest2 = Benchmark("s1.nsmallest(3, keep='last');"
                              "s1.nsmallest(3, keep='first')",
                              setup,
                              start_date=datetime(2014, 1, 25))

series_nsmallest2 = Benchmark("s2.nsmallest(3, keep='last');"
                              "s2.nsmallest(3, keep='first')",
                              setup,
                              start_date=datetime(2014, 1, 25))