Ejemplo n.º 1
0
    def _bcolz(self, tblname, dbname=None, type=None, df=None, blaze=False):
        """Read a table from, or write a table to, the on-disk bcolz store.

        Acts as a getter when ``df`` is None and as a setter otherwise.
        Tables live under ``<cf.options.basedir>/databases`` in a directory
        named ``<type>.<dbname>.<tblname>``.

        Parameters
        ----------
        tblname : str
            Name of the table.
        dbname : str, optional
            Database name; defaults to ``self.name``.
        type : str, optional
            Database type; defaults to ``self.type``.
        df : pandas.DataFrame, optional
            Table to store.  When None the stored table is loaded instead.
        blaze : bool
            When reading, wrap the result with ``blz.data`` instead of
            converting it to a DataFrame.

        Returns
        -------
        The loaded table when reading (None if ``bcz.open`` raises IOError);
        always None when writing.
        """
        # Suppress the warning until the next version
        import warnings
        warnings.simplefilter('ignore', FutureWarning)
        import blaze as blz

        if type is None:
            type = self.type
        if dbname is None:
            dbname = self.name
        # Both the getter and the setter address the same directory.
        path = os.path.expanduser(
            os.path.join(cf.options.basedir, 'databases',
                         "{}.{}.{}".format(type, dbname, tblname)))

        if df is None:
            # Getter: return the table if it exists.
            try:
                df = bcz.open(path)
            except IOError:
                return None
            if len(df) == 0:
                # Empty tables are written as a zero-length placeholder
                # (see the setter below), so hand back an empty DataFrame.
                df = pd.DataFrame()
                if blaze:
                    df = blz.data(df)
            elif blaze:
                df = blz.data(df)
            else:
                df = df.todataframe()
            if not blaze and 'idx' in df.columns.values:
                # Restore the index that the setter saved as a column.
                df.set_index('idx', drop=True, inplace=True)
                df.index.name = None
            return df

        # Setter.  ``Index.dtype_str`` was removed in pandas 1.0;
        # ``str(index.dtype)`` is the documented equivalent.
        if str(df.index.dtype) != 'int64' and not df.empty:
            # Non-integer indexes are preserved in an 'idx' column; work on a
            # copy so the caller's frame is not modified.
            df = df.copy()
            df['idx'] = df.index
        if isinstance(df, pd.DataFrame):
            if df.empty:
                bcz.fromiter((),
                             dtype=np.int32,
                             mode='w',
                             count=0,
                             rootdir=path)
            else:
                bcz.ctable.fromdataframe(df, mode='w', rootdir=path)
        return
Ejemplo n.º 2
0
 def test06(self):
     """Check `fetchwhere` with a datetime64 threshold given via `user_dict`."""
     N = self.N
     query_idx = np.random.randint(0, self.N)
     first_day = np.datetime64('2018-03-01')
     source_rows = ((i, first_day + i) for i in range(N))
     t = bcolz.fromiter(source_rows, dtype="i4,M8[D]", count=N)

     # Use the date stored at a random row as the cut-off.
     threshold = t[query_idx][1]
     result = t.fetchwhere('(f1 > threshold)',
                           user_dict={'threshold': threshold})

     # Rows strictly after `query_idx` are the ones expected to match.
     expected_rows = ((i + query_idx, threshold + i)
                      for i in range(1, N - query_idx))
     t_fin = bcolz.fromiter(expected_rows, dtype="i4,M8[D]", count=N)
     np.testing.assert_array_equal(result[:], t_fin[:])
Ejemplo n.º 3
0
    def _bcolz(self, tblname, dbname=None, type=None, df=None, blaze=False):
        '''
        This is the access point to the bcolz database.

        Acts as a getter when ``df`` is None and as a setter otherwise.
        Tables live under ``<cf.options.basedir>/databases`` in a directory
        named ``<type>.<dbname>.<tblname>``.

        Parameters
        ----------
        tblname : str
            Name of the table.
        dbname : str, optional
            Database name; defaults to ``self._m80_name``.
        type : str, optional
            Database type; defaults to ``self._m80_type``.
        df : pandas.DataFrame, optional
            Table to store.  When None the stored table is loaded instead.
        blaze : bool
            When reading, wrap the result with ``blz.data`` instead of
            converting it to a DataFrame.

        Returns
        -------
        The loaded table when reading (None if ``bcz.open`` raises IOError);
        always None when writing.
        '''
        if type is None:
            type = self._m80_type
        if dbname is None:
            dbname = self._m80_name
        # Both the getter and the setter address the same directory.
        path = os.path.expanduser(
            os.path.join(cf.options.basedir, 'databases',
                         "{}.{}.{}".format(type, dbname, tblname)))

        if df is None:
            # Getter: return the table if it exists.
            try:
                df = bcz.open(path)
            except IOError:
                return None
            if len(df) == 0:
                # Empty tables are written as a zero-length placeholder
                # (see the setter below), so hand back an empty DataFrame.
                df = pd.DataFrame()
                if blaze:
                    df = blz.data(df)
            elif blaze:
                df = blz.data(df)
            else:
                df = df.todataframe()
            if not blaze and 'idx' in df.columns.values:
                # Restore the index that the setter saved as a column.
                df.set_index('idx', drop=True, inplace=True)
                df.index.name = None
            return df

        # Setter.  ``Index.dtype_str`` was removed in pandas 1.0;
        # ``str(index.dtype)`` is the documented equivalent.
        if str(df.index.dtype) != 'int64' and not df.empty:
            # Non-integer indexes are preserved in an 'idx' column; work on a
            # copy so the caller's frame is not modified.
            df = df.copy()
            df['idx'] = df.index
        if isinstance(df, pd.DataFrame):
            if df.empty:
                bcz.fromiter((),
                             dtype=np.int32,
                             mode='w',
                             count=0,
                             rootdir=path)
            else:
                bcz.ctable.fromdataframe(df, mode='w', rootdir=path)
        return
Ejemplo n.º 4
0
def shards(bcolz_dir, taxi_df):
    """Write ``taxi_df`` as one bcolz ctable plus ``NR_SHARDS`` shard ctables.

    Parameters
    ----------
    bcolz_dir :
        Directory-like object whose ``join(name)`` gives the path of a child.
    taxi_df : pandas.DataFrame
        Data to store.

    Yields
    ------
    list of str
        Path of the unsharded ctable followed by the path of every shard.
    """
    single_bcolz = str(bcolz_dir.join('yellow_tripdata_2016-01.bcolz'))
    ct = ctable.fromdataframe(taxi_df, rootdir=single_bcolz)

    total = len(ct)
    step = total // NR_SHARDS
    shards = [single_bcolz]

    # Iterate over shard numbers rather than row offsets.  The previous
    # offset-based loop compared against ``len(ct) * (NR_SHARDS - 1)``, which
    # never matched, so the remainder rows ended up in an extra shard that
    # was declared with ``step`` rows but held fewer.
    for count in range(NR_SHARDS):
        print("Creating shard {}".format(count + 1))

        start = count * step
        # The last shard also takes the rows left over by the division.
        stop = total if count == NR_SHARDS - 1 else start + step

        shard_file = str(bcolz_dir.join('tripdata_2016-01-%s.bcolzs' % count))
        ct_shard = bcolz.fromiter(ct.iter(start, stop),
                                  ct.dtype,
                                  stop - start,
                                  rootdir=shard_file,
                                  mode='w')
        shards.append(shard_file)

        ct_shard.flush()

    yield shards
Ejemplo n.º 5
0
 def getobject(self):
     """Create the on-disk bcolz object selected by ``self.flavor``.

     'carray' gives a 10-element zero carray, 'ctable' a 10-row two-column
     ctable; both are rooted at ``self.rootdir``.
     """
     flavor = self.flavor
     if flavor == 'carray':
         obj = bcolz.zeros(10, dtype="i1", rootdir=self.rootdir)
         assert type(obj) == bcolz.carray
     elif flavor == 'ctable':
         rows = ((i, i*2) for i in range(10))
         obj = bcolz.fromiter(rows, dtype='i2,f4', count=10,
                              rootdir=self.rootdir)
         assert type(obj) == bcolz.ctable
     return obj
Ejemplo n.º 6
0
def on_disk_data_cleaner(generator):
    """Build an on-disk ctable from ``generator``, yield it, then delete it.

    Parameters
    ----------
    generator : iterable
        Source of rows; ``N`` rows with dtype ``'i4,i4'`` are read from it.

    Yields
    ------
    The ctable reopened from disk with ``bq.open``.

    The directory is removed once the consumer is done with the yielded
    table, including when the consumer raises.
    """
    # mkdtemp is used only to obtain a unique path; the directory itself is
    # removed again so that it does not exist when bcolz is given the rootdir.
    rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(rootdir)  # folder should be empty
    ct = bz.fromiter(generator, dtype='i4,i4', count=N, rootdir=rootdir)
    # NOTE(review): the handle returned by bz.fromiter is dropped without an
    # explicit flush before reopening - confirm the data is already on disk.
    ct = bq.open(rootdir)
    ct.flush()
    ct = bq.open(rootdir)

    try:
        yield ct
    finally:
        # Without try/finally an exception thrown in at the yield would skip
        # the cleanup and leave the directory behind.
        shutil.rmtree(rootdir)
Ejemplo n.º 7
0
def on_disk_data_cleaner(generator):
    """Build an on-disk ctable from ``generator``, yield it, then delete it.

    Parameters
    ----------
    generator : iterable
        Source of rows; ``N`` rows with dtype ``'i4,i4'`` are read from it.

    Yields
    ------
    The ctable reopened from disk with ``bq.open``.

    The directory is removed once the consumer is done with the yielded
    table, including when the consumer raises.
    """
    # mkdtemp is used only to obtain a unique path; the directory itself is
    # removed again so that it does not exist when bcolz is given the rootdir.
    rootdir = tempfile.mkdtemp(prefix='bcolz-')
    os.rmdir(rootdir)  # folder should be empty
    ct = bz.fromiter(generator, dtype='i4,i4', count=N, rootdir=rootdir)
    # NOTE(review): the handle returned by bz.fromiter is dropped without an
    # explicit flush before reopening - confirm the data is already on disk.
    ct = bq.open(rootdir)
    ct.flush()
    ct = bq.open(rootdir)

    try:
        yield ct
    finally:
        # Without try/finally an exception thrown in at the yield would skip
        # the cleanup and leave the directory behind.
        shutil.rmtree(rootdir)
Ejemplo n.º 8
0
def test_ctable(clevel):
    """Build a ctable at compression level ``clevel`` and run ``squery`` on it.

    Calls the ``enter`` / ``after_create`` / ``after_query`` hooks around the
    two phases and returns the queried 'f1' and 'f3' columns as a structured
    NumPy array.
    """
    enter()
    nrows = int(NR)
    # Each row is NC random values (the module-level ``mv`` is added and
    # subtracted again).
    rows = (mv + np.random.rand(NC) - mv for _ in xrange(nrows))
    tc = bcolz.fromiter(
        rows,
        dtype=dt,
        cparams=bcolz.cparams(clevel, cname=cname),
        count=nrows)
    after_create()

    matches = tc.where(squery, 'f1,f3')
    out = np.fromiter((row for row in matches), dtype="f8,f8")
    after_query()
    return out
Ejemplo n.º 9
0
def test_ctable(clevel):
    """Build a ctable at compression level ``clevel`` and run ``squery`` on it.

    Calls the ``enter`` / ``after_create`` / ``after_query`` hooks around the
    two phases and returns the queried 'f1' and 'f3' columns as a structured
    NumPy array.
    """
    enter()
    nrows = int(NR)
    # Each row is NC random values (the module-level ``mv`` is added and
    # subtracted again).
    rows = (mv + np.random.rand(NC) - mv for _ in xrange(nrows))
    tc = bcolz.fromiter(
        rows,
        dtype=dt,
        cparams=bcolz.cparams(clevel, cname=cname),
        count=nrows)
    after_create()

    matches = tc.where(squery, 'f1,f3')
    out = np.fromiter((row for row in matches), dtype="f8,f8")
    after_query()
    return out
Ejemplo n.º 10
0
def floats_to_bcolz(input_dir, output_dir, progress=False, **kwargs):
    """Convert MITgcm float data to bcolz format.

    Parameters
    ----------
    input_dir : path
        Where to find the MITgcm output data
    output_dir : path
        Where to put the bcolz data store (equivalent to bcolz rootdir)
    progress : bool
        Passed through as the ``progress`` argument of the data generator.
    kwargs :
        Extra keyword arguments to pass to floater.input_formats.MITgcmFloatData

    Returns
    -------
    The object returned by ``bcolz.fromiter`` for the converted data.
    """
    import bcolz
    rootdir = _maybe_add_suffix(output_dir, '.bcolz')
    mfd = input.MITgcmFloatData(input_dir, cast_to_dtype='f4', **kwargs)
    # Typecasting at this point does NOT work (values get all mangled), so
    # the reader's own output dtype is used unchanged.
    records = mfd.generator(progress=progress)
    record_dtype = mfd.out_dtype
    nrecs = int(mfd.nrecs)
    return bcolz.fromiter(records,
                          dtype=record_dtype,
                          count=nrecs,
                          mode='w',
                          rootdir=rootdir)
Ejemplo n.º 11
0
# Benchmark for evaluate best ways to convert from a pandas dataframe
# (version with a mix of columns of ints and strings)

import bcolz
import pandas as pd
import numpy as np
from time import time

# Number of rows and number of columns of the benchmark dataframe.
NR = int(1e6)
NC = 100

#bcolz.cparams.setdefaults(clevel=0)

print("Creating inputs...")
# One integer column and one fixed-width string column to draw from.
a = bcolz.arange(NR, dtype='i4')
s = bcolz.fromiter(("%d"%i for i in xrange(NR)), dtype='S7', count=NR)
# The first NC//2 columns are copies of `a`, the remaining ones copies of `s`.
df = pd.DataFrame.from_items((
    ('f%d'%i, a[:] if i < (NC//2) else s[:]) for i in range(NC)))

# Payload size in MiB: NC//2 columns of each kind, NR items per column.
dsize = (NR * (NC//2) * (a.dtype.itemsize + s.dtype.itemsize)) / 2. ** 20

print("Performing benchmarks...")
# # Using an iterator (will get objects)
# t0 = time()
# names = list(df.columns.values)
# t = bcolz.ctable([df[key] for key in names], names)
# tt = time() - t0
# print("time with constructor: %.2f (%.2f MB/s)" % (tt, dsize / tt))
# print(repr(t.dtype))

# Using generic implementation
Ejemplo n.º 12
0
# Build the input: a structured array with NC identical integer columns.
a = bcolz.arange(NR, dtype='i4')
#ra = np.rec.fromarrays([a]*NC, names=['f%d'%i for i in range(NC)])
ra = bcolz.ctable((a,)*NC)[:]

# Store the structured array as an HDF5 table and time the write.
t0 = time()
f = tb.open_file(filepath, "w")
f.create_table(f.root, nodepath[1:], ra)
f.close()
tt = time() - t0
print("time for storing the HDF5 table: %.2f (%.2f GB/s)" % (tt, dsize / tt))

# Using an iterator
# Rows are read back one at a time and fed to bcolz.fromiter.
t0 = time()
f = tb.open_file(filepath)
t = f.get_node(nodepath)
t = bcolz.fromiter((r[:] for r in t), dtype=t.dtype, count=len(t))
f.close()
tt = time() - t0
print("time with fromiter: %.2f (%.2f GB/s)" % (tt, dsize / tt))

# Using blocked read
# Start from an empty ctable with the table's column names and append the
# table in slices of one chunk (`bs` rows) each.
t0 = time()
f = tb.open_file(filepath)
t = f.get_node(nodepath)
names = t.colnames
# NOTE(review): assumes t.dtype.fields iterates in the same order as
# t.colnames - confirm.
dtypes = [dt[0] for dt in t.dtype.fields.values()]
cols = [np.zeros(0, dtype=dt) for dt in dtypes]
ct = bcolz.ctable(cols, names)
bs = t._v_chunkshape[0]
for i in xrange(0, len(t), bs):
    ct.append(t[i:i+bs])
Ejemplo n.º 13
0
    def handle_work(self, msg):
        """Run a filter/groupby calculation on a ctable and attach the result.

        ``execute_code`` messages are delegated to ``self.execute_code``.
        Otherwise the message's positional arguments are, in order: the
        ctable directory name (relative to ``self.data_dir``), the groupby
        column list, the aggregation list and the where-terms list.
        Recognised keyword arguments are ``expand_filter_column`` and
        ``aggregate`` (default True).

        Parameters
        ----------
        msg :
            The work message; it is updated in place and returned.

        Returns
        -------
        The same ``msg`` with ``msg['data']`` set to the content of a tar
        archive holding the result ctable, or to ``''`` when the where terms
        cannot match anything in this ctable.

        Raises
        ------
        Exception
            If the requested ctable directory does not exist.
        """
        if msg.isa('execute_code'):
            return self.execute_code(msg)

        args, kwargs = msg.get_args_kwargs()
        self.logger.info('doing calc %s' % args)
        filename = args[0]
        groupby_col_list = args[1]
        aggregation_list = args[2]
        where_terms_list = args[3]
        expand_filter_column = kwargs.get('expand_filter_column')
        aggregate = kwargs.get('aggregate', True)

        # create rootdir
        rootdir = os.path.join(self.data_dir, filename)
        if not os.path.exists(rootdir):
            raise Exception('Path %s does not exist' % rootdir)

        ct = bquery.ctable(rootdir=rootdir, mode='r', auto_cache=True)

        # prepare filter
        if not where_terms_list:
            bool_arr = None
        else:
            # quickly verify the where_terms_list
            if not ct.where_terms_factorization_check(where_terms_list):
                # return an empty result because the where terms do not give
                # a result for this ctable
                msg['data'] = ''
                return msg
            # else create the boolean array
            bool_arr = ct.where_terms(where_terms_list, cache=True)

        # expand filter column check
        if expand_filter_column:
            bool_arr = ct.is_in_ordered_subgroups(
                basket_col=expand_filter_column, bool_arr=bool_arr)

        # The scratch directory and archive file are created only now, after
        # every early exit above, so those exits no longer leave them behind.
        tmp_dir = tempfile.mkdtemp(prefix='result_')
        buf_file_fd, buf_file = tempfile.mkstemp(prefix='tar_')
        os.close(buf_file_fd)

        # retrieve & aggregate if needed
        # mkdtemp only reserved the name; the result is written to a fresh dir
        rm_file_or_dir(tmp_dir)
        if aggregate:
            # aggregate by groupby parameters
            result_ctable = ct.groupby(groupby_col_list, aggregation_list,
                                       bool_arr=bool_arr, rootdir=tmp_dir)
        else:
            # direct result from the ctable
            column_list = groupby_col_list + [x[0] for x in aggregation_list]
            if bool_arr is not None:
                result_ctable = bcolz.fromiter(
                    ct[column_list].where(bool_arr), ct[column_list].dtype,
                    sum(bool_arr), rootdir=tmp_dir, mode='w')
            else:
                result_ctable = bcolz.fromiter(
                    ct[column_list], ct[column_list].dtype, ct.len,
                    rootdir=tmp_dir, mode='w')

        # *** clean up temporary files and memory objects
        # filter
        del bool_arr

        # input
        ct.free_cachemem()
        ct.clean_tmp_rootdir()
        del ct

        # save result to archive
        result_ctable.flush()
        result_ctable.free_cachemem()
        with tarfile.open(buf_file, mode='w') as archive:
            archive.add(tmp_dir, arcname=os.path.basename(tmp_dir))
        del result_ctable
        rm_file_or_dir(tmp_dir)

        # create message
        # The archive is binary data, so it is read in binary mode; text mode
        # can corrupt it or fail to decode it.
        with open(buf_file, 'rb') as archive_file:
            # add result to message
            msg['data'] = archive_file.read()
        rm_file_or_dir(buf_file)

        return msg
Ejemplo n.º 14
0
# Third input sequence; `x` and `y` are defined before this excerpt.
z = xrange(2, N + 2)

print("Starting benchmark now for creating arrays...")
# Create an ndarray (reference result for the comparisons below)
# x = (i for i in xrange(N))    # true iterable
# NOTE(review): `x` is consumed three times below, so it is presumably a
# re-iterable object such as xrange rather than a generator - confirm.
t0 = time()
out = np.fromiter(x, dtype='f8', count=N)
print("Time for array--> %.3f" % (time() - t0,))
print("out-->", len(out))

#bcolz.set_num_threads(bcolz.ncores//2)

# Create a carray with the final size given up front
#x = (i for i in xrange(N))    # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=N, cparams=bcolz.cparams(clevel))
print("Time for carray--> %.3f" % (time() - t0,))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Create a carray (with unknown size, i.e. count=-1)
#x = (i for i in xrange(N))    # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=-1, cparams=bcolz.cparams(clevel))
print("Time for carray (count=-1)--> %.3f" % (time() - t0,))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Retrieve from a structured ndarray
# Rows are (x, y, z) triples produced lazily.
gen = ((i, j, k) for i, j, k in izip(x, y, z))
t0 = time()
Ejemplo n.º 15
0
    def _bcolz(self, tblname, dbname=None, type=None, df=None, blaze=False):
        """Read a table from, or write a table to, the on-disk bcolz store.

        Acts as a getter when ``df`` is None and as a setter otherwise.
        Tables live under ``<cf.options.basedir>/databases`` in a directory
        named ``<type>.<dbname>.<tblname>``.

        Parameters
        ----------
        tblname : str
            Name of the table.
        dbname : str, optional
            Database name; defaults to ``self.name``.
        type : str, optional
            Database type; defaults to ``self.type``.
        df : pandas.DataFrame, optional
            Table to store.  When None the stored table is loaded instead.
        blaze : bool
            When reading, wrap the result with ``blz.data`` instead of
            converting it to a DataFrame.

        Returns
        -------
        The loaded table when reading (None if ``bcz.open`` raises IOError);
        always None when writing.
        """
        # Suppress the warning until the next version
        import warnings
        warnings.simplefilter('ignore', FutureWarning)
        import blaze as blz

        if type is None:
            type = self.type
        if dbname is None:
            dbname = self.name
        # Both the getter and the setter address the same directory.
        path = os.path.expanduser(
            os.path.join(
                cf.options.basedir,
                'databases',
                "{}.{}.{}".format(type, dbname, tblname)
            )
        )

        if df is None:
            # Getter: return the table if it exists.
            try:
                df = bcz.open(path)
            except IOError:
                return None
            if len(df) == 0:
                # Empty tables are written as a zero-length placeholder
                # (see the setter below), so hand back an empty DataFrame.
                df = pd.DataFrame()
                if blaze:
                    df = blz.data(df)
            elif blaze:
                df = blz.data(df)
            else:
                df = df.todataframe()
            if not blaze and 'idx' in df.columns.values:
                # Restore the index that the setter saved as a column.
                df.set_index('idx', drop=True, inplace=True)
                df.index.name = None
            return df

        # Setter.  ``Index.dtype_str`` was removed in pandas 1.0;
        # ``str(index.dtype)`` is the documented equivalent.
        if str(df.index.dtype) != 'int64' and not df.empty:
            # Non-integer indexes are preserved in an 'idx' column; work on a
            # copy so the caller's frame is not modified.
            df = df.copy()
            df['idx'] = df.index
        if isinstance(df, pd.DataFrame):
            if df.empty:
                bcz.fromiter((), dtype=np.int32, mode='w', count=0,
                             rootdir=path)
            else:
                bcz.ctable.fromdataframe(df, mode='w', rootdir=path)
        return
Ejemplo n.º 16
0
from __future__ import print_function
import contextlib, time
import bcolz, numpy

@contextlib.contextmanager
def ctime(label=""):
    """Print the wall-clock time spent inside the ``with`` body.

    When the body finishes normally, prints ``label``, the elapsed seconds
    rounded to three decimals, and the word "sec".
    """
    started = time.time()
    yield
    elapsed = round(time.time() - started, 3)
    print(label, elapsed, "sec")


# Number of rows in the benchmark table.
N = 1000 * 1000

# Three-column table holding i, i**2 and i**3 (columns f0, f1, f2).
ct = bcolz.fromiter(((i, i*i, i*i*i) for i in xrange(N)), dtype='i8,i8,i8', count=N)

# Boolean mask that is True at odd row numbers, as a NumPy array (`b`) and as
# a carray (`c`).
b = numpy.array(numpy.arange(N) % 2, dtype="bool")
c = bcolz.carray(b)

# The same selection expressed as explicit row positions.
sorted_index = range(1, N, 2)
# Reference result: select by position.
with ctime():
    r0 = (ct['f0'][sorted_index]).tolist()

# Select through `where` with the NumPy mask; must match the reference.
with ctime():
    r1 = [x.f0 for x in ct.where(b)]
assert r0 == r1

# Select through `where` with the carray mask; must match the reference.
with ctime():
    r2 = [x.f0 for x in ct.where(c)]
assert r0 == r2
Ejemplo n.º 17
0
import bcolz
import numpy
from .bench_helper import ctime

# Number of rows in the benchmark table.
N = 1000 * 1000

# Three-column table holding i, i**2 and i**3 (columns f0, f1, f2).
ct = bcolz.fromiter(((i, i * i, i * i * i) for i in xrange(N)),
                    dtype='i8,i8,i8',
                    count=N)

# Boolean mask that is True at odd row numbers, as a NumPy array (`b`) and as
# a carray (`c`).
b = numpy.array(numpy.arange(N) % 2, dtype="bool")
c = bcolz.carray(b)

# The same selection expressed as explicit row positions.
sorted_index = range(1, N, 2)


class Suite:
    """Benchmarks that select the odd rows of the module-level table ``ct``.

    Each method expresses the same selection in a different way.
    NOTE(review): the ``time_`` prefix is presumably what the benchmark
    runner uses to discover these methods - confirm.
    """

    def time_tolist(self):
        """Select column 'f0' by the positions in ``sorted_index``."""
        return (ct['f0'][sorted_index]).tolist()

    def time_where_01(self):
        """Iterate ``ct.where`` with the NumPy mask ``b``, collecting f0."""
        return [x.f0 for x in ct.where(b)]

    def time_where_02(self):
        """Iterate ``ct.where`` with the carray mask ``c``, collecting f0."""
        return [x.f0 for x in ct.where(c)]

    def time_where_03(self):
        """Iterate ``where`` on column 'f0' alone with the NumPy mask ``b``."""
        return [x for x in ct['f0'].where(b)]

    def time_where_04(self):
        """Iterate ``where`` on column 'f0' alone with the carray mask ``c``."""
        return [x for x in ct['f0'].where(c)]
Ejemplo n.º 18
0
    def fetchwhere(self, expression, outcols=None, limit=None, skip=0,
                   out_flavor=None, user_dict={}, vm=None, **kwargs):
        """Fetch the rows fulfilling the `expression` condition.

        Parameters
        ----------
        expression : string or carray
            A boolean Numexpr expression or a boolean carray.
        outcols : list of strings or string
            The list of column names that you want to get back in results.
            Alternatively, it can be specified as a string such as 'f0 f1' or
            'f0, f1'.  If None, all the columns are returned.  If the special
            name 'nrow__' is present, the row number will be included in the
            output.
        limit : int
            A maximum number of elements to return.  The default is to return
            everything.
        skip : int
            An initial number of elements to skip.  The default is 0.
        out_flavor : string
            The flavor for the `out` object.  It can be 'bcolz' (or its
            synonym 'carray') or 'numpy'.  If None, the value is taken from
            `bcolz.defaults.out_flavor`.
        user_dict : dict
            A user-provided dictionary where the variables in expression
            can be found by name.
        vm : string
            The virtual machine to be used in computations.  It can be
            'numexpr', 'python' or 'dask'.  The default is to use 'numexpr' if
            it is installed.
        kwargs : list of parameters or dictionary
            Any parameter supported by the carray constructor.

        Returns
        -------
        out : bcolz or numpy object
            The outcome of the expression.  In case out_flavor='bcolz', you
            can adjust the properties of this object by passing any additional
            arguments supported by the carray constructor in `kwargs`.

        Raises
        ------
        ValueError
            If `out_flavor` is not one of the supported flavors.

        See Also
        --------
        whereblocks

        """
        if out_flavor is None:
            out_flavor = bcolz.defaults.out_flavor

        if out_flavor == "numpy":
            # A block length of len(self) makes the first block hold the
            # complete result.
            it = self.whereblocks(expression, len(self), outcols, limit, skip,
                                  user_dict=self._ud(user_dict), vm=vm)
            return next(it)
        elif out_flavor in ("bcolz", "carray"):
            dtype = self._dtype_fromoutcols(outcols)
            it = self.where(expression, outcols, limit, skip,
                            out_flavor=tuple, user_dict=self._ud(user_dict),
                            vm=vm)
            # The number of matching rows is not known in advance.
            ct = bcolz.fromiter(it, dtype, count=-1, **kwargs)
            ct.flush()
            return ct
        else:
            raise ValueError(
                "`out_flavor` can only take 'bcolz' or 'numpy' values")
Ejemplo n.º 19
0
    def _bcolz(self,
               tblname,
               df=None,
               m80name=None,
               m80type=None,
               blaze=False):
        '''
            This is the access point to the bcolz database.

            Acts as a getter when ``df`` is None and as a setter otherwise.
            Tables are stored in a ``tblname`` directory below the path
            returned by ``self._get_dbpath('bcz', create=True)``.

            Parameters
            ----------
            tblname : str
                Name of the table.
            df : pandas.DataFrame, optional
                Table to store.  When None the stored table is loaded.
            m80name : str, optional
                Defaults to ``self._m80_name``; used in the error message.
            m80type : str, optional
                Defaults to ``self._m80_dtype``; used in the error message.
            blaze : bool
                When reading, wrap the result with ``blz.data`` instead of
                converting it to a DataFrame.

            Returns
            -------
            The loaded table when reading, None when writing.

            Raises
            ------
            IOError
                If the table cannot be opened for reading.
        '''
        import warnings
        # from flask.exthook import ExtDeprecationWarning
        # warnings.simplefilter('ignore', ExtDeprecationWarning)
        # The filter must be installed before blaze is imported, otherwise it
        # cannot suppress warnings emitted by the import itself.
        warnings.simplefilter('ignore', FutureWarning)
        try:
            import blaze as blz
        except FutureWarning:  # pragma: no cover
            pass

        # Fill in the defaults if they were not provided
        if m80type is None:
            m80type = self._m80_dtype
        if m80name is None:
            m80name = self._m80_name
        path = self._get_dbpath('bcz', create=True)

        # function is a getter if df is NOT provided
        if df is None:
            # return the dataframe if it exists
            try:
                df = bcz.open(os.path.join(path, tblname))
            except IOError as err:
                raise IOError(
                    f'could not open database for {m80type}:{m80name} '
                ) from err
            if len(df) == 0:
                # Empty tables are written as a zero-length placeholder
                # (see the setter below), so hand back an empty DataFrame.
                df = pd.DataFrame()
                if blaze:
                    df = blz.data(df)
            elif blaze:
                df = blz.data(df)
            else:
                df = df.todataframe()
            if not blaze and f'{tblname}_index' in self._dict:
                # Restore the index whose name was recorded by the setter.
                df.set_index(self._dict[f'{tblname}_index'], inplace=True)
            return df

        # If df is set, then store the table.  Work on a copy so the caller's
        # frame is not modified by reset_index.
        df = df.copy()
        if df.index.name is not None:
            # We need to remember the index so the getter can restore it
            self._dict[tblname + '_index'] = df.index.name
            df.reset_index(inplace=True)
        path = os.path.join(path, tblname)
        if df.empty:
            bcz.fromiter((),
                         dtype=np.int32,
                         mode='w',
                         count=0,
                         rootdir=path)
        else:
            bcz.ctable.fromdataframe(df, mode='w', rootdir=path)
        return
Ejemplo n.º 20
0
def sql2bcolz(sql, dsfilename, con, type_hints={}):
    """
    Run a SQL query and store its result set as an on-disk bcolz ctable.

    Parameters
    ----------
    sql : SQL string with all parameters substituted
    dsfilename : path used as the ``rootdir`` of the resulting ctable
    con : connectable (django connection, or psycopg connection)
    type_hints : dict mapping column name to the dtype to use for that
        column; it takes precedence over the dtype derived from the cursor
        description.  The dict is only read, never modified.

    Returns
    -------
    The ctable created by ``bcolz.fromiter``, or None when the query yields
    no rows.
    """
    import itertools

    # The row count is needed up front for the one-time allocation.
    sql_count = "select count(*) from (%s) s" % sql

    cursor = con.cursor()
    cursor.execute(sql_count)
    count = cursor.fetchone()[0]
    cursor.close()

    if count == 0:
        return None

    # Funny way to reliably get the psycopg connection. We need it to get
    # server-side cursors.
    pgcon = con.cursor().connection

    cursor = pgcon.cursor("serversidecursor", withhold=True)
    try:
        pgcon.commit()
        # Fetch a tenth of the rows per round trip, clamped to [10, 100000].
        cursor.itersize = int(max(min(count / 10, 100000), 10))

        print(str(datetime.datetime.now()), "Start executing query …  ")
        cursor.execute(sql)
        print(str(datetime.datetime.now()), "End executing query …  ")
        # This row is fetched before the description is inspected; it is put
        # back in front of the remaining rows below.
        row0 = cursor.fetchone()
        print(str(datetime.datetime.now()), "End fetch first row …  ")

        dtypes = []
        for col_desc in cursor.description:
            col_name = col_desc[0]
            type_code = col_desc.type_code
            # Columns with an unrecognised type code and no hint keep None.
            dtype = None
            if col_name in type_hints:
                dtype = type_hints[col_name]
            elif type_code == 16:
                dtype = 'bool'
            elif type_code in (25, 1043):
                dtype = 'S16'
            elif type_code == 1082:
                dtype = 'i8'
            elif type_code in [700, 701]:
                dtype = 'f%d' % col_desc.internal_size
            elif type_code == 1016:
                dtype = '(128,)i8'
            elif type_code == psycopg2.NUMBER:
                size_ = col_desc.internal_size
                if size_ < 0:
                    size_ = 8
                dtype = 'i%d' % size_
            dtypes.append((col_name, dtype))

        # Previously row0 was discarded, so the table missed its first row
        # and the iterator was one item short of `count`.
        if row0 is None:
            rows = cursor
        else:
            rows = itertools.chain([row0], cursor)
        ct = bcolz.fromiter(rows,
                            dtype=dtypes,
                            count=count,
                            rootdir=dsfilename)
    finally:
        cursor.close()
    return ct
Ejemplo n.º 21
0
 def time_sum_03(self):
     """Sum the 'f0' values selected by mask ``c`` via ``bcolz.fromiter``.

     The selected values are first collected into a new bcolz object and
     then summed.
     """
     # NOTE(review): `count` is presumably the number of True entries in
     # `c`, obtained by summing what `wheretrue()` returns - confirm.
     return bcolz.fromiter((x for x in ct['f0'].where(c)),
                           dtype=ct['f0'].dtype, count=c.wheretrue().sum()).sum()
Ejemplo n.º 22
0
 def time_sum_03(self):
     """Sum the 'f0' values selected by mask ``c`` via ``bcolz.fromiter``.

     The selected values are first collected into a new bcolz object and
     then summed.
     """
     # NOTE(review): `count` is presumably the number of True entries in
     # `c`, obtained by summing what `wheretrue()` returns - confirm.
     return bcolz.fromiter((x for x in ct['f0'].where(c)),
                           dtype=ct['f0'].dtype,
                           count=c.wheretrue().sum()).sum()
Ejemplo n.º 23
0
# Third input sequence; `x` and `y` are defined before this excerpt.
z = xrange(2, N + 2)

print("Starting benchmark now for creating arrays...")
# Create an ndarray (reference result for the comparisons below)
# x = (i for i in xrange(N))    # true iterable
# NOTE(review): `x` is consumed three times below, so it is presumably a
# re-iterable object such as xrange rather than a generator - confirm.
t0 = time()
out = np.fromiter(x, dtype='f8', count=N)
print("Time for array--> %.3f" % (time() - t0, ))
print("out-->", len(out))

#bcolz.set_num_threads(bcolz.ncores//2)

# Create a carray with the final size given up front
#x = (i for i in xrange(N))    # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=N, cparams=bcolz.cparams(clevel))
print("Time for carray--> %.3f" % (time() - t0, ))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Create a carray (with unknown size, i.e. count=-1)
#x = (i for i in xrange(N))    # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=-1, cparams=bcolz.cparams(clevel))
print("Time for carray (count=-1)--> %.3f" % (time() - t0, ))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Retrieve from a structured ndarray
# Rows are (x, y, z) triples produced lazily.
gen = ((i, j, k) for i, j, k in izip(x, y, z))
t0 = time()
Ejemplo n.º 24
0
# Benchmark for evaluate best ways to convert from a pandas dataframe
# (version with a mix of columns of ints and strings)

import bcolz
import pandas as pd
import numpy as np
from time import time

# Number of rows and number of columns of the benchmark dataframe.
NR = int(1e6)
NC = 100

#bcolz.cparams.setdefaults(clevel=0)

print("Creating inputs...")
# One integer column and one fixed-width string column to draw from.
a = bcolz.arange(NR, dtype='i4')
s = bcolz.fromiter(("%d" % i for i in xrange(NR)), dtype='S7', count=NR)
# The first NC//2 columns are copies of `a`, the remaining ones copies of `s`.
df = pd.DataFrame.from_items(
    (('f%d' % i, a[:] if i < (NC // 2) else s[:]) for i in range(NC)))

# Payload size in MiB: NC//2 columns of each kind, NR items per column.
dsize = (NR * (NC // 2) * (a.dtype.itemsize + s.dtype.itemsize)) / 2.**20

print("Performing benchmarks...")
# # Using an iterator (will get objects)
# t0 = time()
# names = list(df.columns.values)
# t = bcolz.ctable([df[key] for key in names], names)
# tt = time() - t0
# print("time with constructor: %.2f (%.2f MB/s)" % (tt, dsize / tt))
# print(repr(t.dtype))

# Using generic implementation
Ejemplo n.º 25
0
import bcolz
import numpy
from .bench_helper import ctime

# Number of rows in the benchmark table.
N = 1000 * 1000

# Three-column table holding i, i**2 and i**3 (columns f0, f1, f2).
ct = bcolz.fromiter(((i, i * i, i * i * i)
                     for i in xrange(N)), dtype='i8,i8,i8', count=N)

# Boolean mask that is True at odd row numbers, as a NumPy array (`b`) and as
# a carray (`c`).
b = numpy.array(numpy.arange(N) % 2, dtype="bool")
c = bcolz.carray(b)

# The same selection expressed as explicit row positions.
sorted_index = range(1, N, 2)


class Suite:
    """Benchmarks that select the odd rows of the module-level table ``ct``.

    Each method expresses the same selection in a different way.
    NOTE(review): the ``time_`` prefix is presumably what the benchmark
    runner uses to discover these methods - confirm.
    """

    def time_tolist(self):
        """Select column 'f0' by the positions in ``sorted_index``."""
        return (ct['f0'][sorted_index]).tolist()

    def time_where_01(self):
        """Iterate ``ct.where`` with the NumPy mask ``b``, collecting f0."""
        return [x.f0 for x in ct.where(b)]

    def time_where_02(self):
        """Iterate ``ct.where`` with the carray mask ``c``, collecting f0."""
        return [x.f0 for x in ct.where(c)]

    def time_where_03(self):
        """Iterate ``where`` on column 'f0' alone with the NumPy mask ``b``."""
        return [x for x in ct['f0'].where(b)]

    def time_where_04(self):
        """Iterate ``where`` on column 'f0' alone with the carray mask ``c``."""
        return [x for x in ct['f0'].where(c)]