Ejemplo n.º 1
0
def compress_data(original_class):
    """Class decorator that stores ``self.data`` values as bcolz carrays.

    ``original_class.prepare`` is replaced by a wrapper that first runs the
    original ``prepare`` and then replaces every value in ``self.data`` with
    a ``bcolz.carray`` built from it.

    Parameters
    ----------
    original_class : type
        Class providing a ``prepare`` method and a dict-like ``data``
        attribute on its instances.

    Returns
    -------
    type
        The same class object, with ``prepare`` replaced.
    """
    import functools

    import bcolz

    # Building a cparams object on its own changes nothing globally, so the
    # instance is kept and handed to every carray explicitly.
    compression = bcolz.cparams(clevel=4, shuffle=1, cname="blosclz")

    orig_prepare = original_class.prepare

    @functools.wraps(orig_prepare)
    def prepare(self, *args, **kwargs):
        result = orig_prepare(self, *args, **kwargs)
        for key in self.data.keys():
            self.data[key] = bcolz.carray(self.data[key], cparams=compression)
        # Hand back whatever the wrapped prepare() returned.
        return result

    original_class.prepare = prepare  # swap in the compressing wrapper
    return original_class
Ejemplo n.º 2
0
    def setChannelData(self, channelName, data, compression=False):
        """Modifies data of channel

        Parameters
        ----------------
        channelName : str
            channel name
        data : numpy array
            channel data
        compression : bool, int or str
            trigger for data compression. ``True`` compresses with
            ``self._compression_level``, an int is used as the compression
            level itself, and a str stores the data in a ``compressed_data``
            container. Any falsy value stores the data uncompressed.
        """
        if not (compression and CompressionPossible):
            self._setChannel(channelName, data, field=dataField)
            return

        if isinstance(compression, str):
            temp = compressed_data()
            temp.compression(data)
        else:
            # bool is a subclass of int, so it has to be excluded explicitly;
            # otherwise ``compression=True`` would itself be passed on as the
            # compression level instead of the configured default.
            if (isinstance(compression, int)
                    and not isinstance(compression, bool)):
                comp = compression
            else:
                comp = self._compression_level
            temp = carray(data,
                          cparams=cparams(clevel=comp),
                          expectedlen=int(getsizeof(data) / 10))
        self._setChannel(channelName, temp, field=dataField)
Ejemplo n.º 3
0
    def __init__(self, ncols, names, meta_data=None, *args, **kwargs):
        """Store the table description and validate schema and compression.

        Parameters
        ----------
        ncols
            Number of columns; stored as given.
        names
            Column names; stored as given.
        meta_data : dict, optional
            Metadata stored on the instance. A fresh dict is created when
            omitted, so instances never share one default object.
        **kwargs
            ``schema`` must be a ``BcolzSchema`` instance. ``cparams`` may be
            a ``bcolz.cparams`` instance or a mapping of keyword arguments
            for ``bcolz.cparams``; it defaults to ``bcolz.cparams()``.

        Raises
        ------
        ValueError
            If no valid schema is supplied, or if ``cparams`` cannot be
            turned into a ``bcolz.cparams`` object.
        """
        self._type = 'BColz'
        self._ncols = ncols
        self._colnames = names
        self._meta_data = {} if meta_data is None else meta_data
        self._cparams = kwargs.pop("cparams", bcolz.cparams())
        self._schema = kwargs.pop("schema", None)

        if not isinstance(self._schema, BcolzSchema):
            raise ValueError("Illegal or no schema supplied.")

        if not isinstance(self._cparams, bcolz.toplevel.cparams):
            # Anything else is treated as keyword arguments for cparams.
            try:
                self._cparams = bcolz.cparams(**self._cparams)
            except (TypeError, NameError):
                raise ValueError("Illegal compression params supplied.")
Ejemplo n.º 4
0
def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Build an empty ctable whose floating point columns use the quantize filter.

    One zero-length column is created per field of `dtype`. Fields whose type
    string contains 'f' get compression params copied from `cparams` (clevel
    and cname) with `quantize` added; all other fields use `cparams` as given.

    License
        This function is taken from the reflexible package (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'
    """
    field_names = []
    field_columns = []
    for field_name, type_str in dtype.descr:
        field_names.append(field_name)
        if 'f' in type_str:
            # NOTE(review): only clevel and cname are carried over here; the
            # shuffle setting of `cparams` is not — confirm this is intended.
            column_params = bcolz.cparams(clevel=cparams.clevel,
                                          cname=cparams.cname,
                                          quantize=quantize)
        else:
            column_params = cparams
        empty_column = bcolz.zeros(0,
                                   dtype=type_str,
                                   cparams=column_params,
                                   expectedlen=expectedlen)
        field_columns.append(empty_column)
    return bcolz.ctable(columns=field_columns, names=field_names)
Ejemplo n.º 5
0
    def __init__(self, columns=None, names=None, **kwargs):
        """Create a new ctable from `columns`, or open one stored in `rootdir`.

        Parameters
        ----------
        columns : optional
            Column data for a new ctable; passed to ``create_ctable``.
        names : optional
            Column names; passed to ``create_ctable``.
        **kwargs
            ``cparams`` (default ``bcolz.cparams()``), ``rootdir`` (default
            None) and ``mode``. When ``mode`` is missing it becomes 'a' if
            `rootdir` exists on disk and 'w' otherwise. All kwargs are
            forwarded to ``create_ctable``.

        Raises
        ------
        ValueError
            If neither `rootdir` nor `columns` is given, or if the table
            would be created in 'w' mode without `columns`.
        """

        # Important optional params
        self._cparams = kwargs.get("cparams", bcolz.cparams())
        # The directory where this object is saved.
        self.rootdir = kwargs.get("rootdir", None)
        if self.rootdir is None and columns is None:
            raise ValueError("For creating a new ctable you should pass a `columns` param")
        # The mode in which the object is created/opened.
        # os.path.exists() cannot take None, so a table without a rootdir
        # goes straight to the 'w' default.
        if self.rootdir is not None and os.path.exists(self.rootdir):
            self.mode = kwargs.setdefault("mode", "a")
        else:
            self.mode = kwargs.setdefault("mode", "w")

        # Setup the columns accessor
        self.cols = cols(self.rootdir, self.mode)

        # The length counter of this array
        self.len = 0

        # Create a new ctable or open it from disk
        _new = False
        if self.mode in ("r", "a"):
            self.open_ctable()
        elif columns is not None:
            self.create_ctable(columns, names, **kwargs)
            _new = True
        else:
            # Without this branch `_new` stayed unbound and the attrs setup
            # below failed with an unrelated NameError.
            raise ValueError(
                "You cannot open a ctable in 'w'rite mode"
                " without a `columns` param")

        # Attach the attrs to this object
        self.attrs = attrs.attrs(self.rootdir, self.mode, _new=_new)

        # Cache a structured array of len 1 for ctable[int] acceleration
        self._arr1 = np.empty(shape=(1,), dtype=self.dtype)
Ejemplo n.º 6
0
    def setChannelData(self, channelName, data, compression=False):
        """Modifies data of channel

        Parameters
        ----------------
        channelName : str
            channel name
        data : numpy array
            channel data
        compression : bool, int or str
            trigger for data compression. ``True`` compresses with
            ``self._compression_level``, an int is used as the compression
            level itself, and a str stores the data in a ``compressed_data``
            container. Any falsy value stores the data uncompressed.
        """
        if not (compression and CompressionPossible):
            self._setChannel(channelName, data, field=dataField)
            return

        if isinstance(compression, str):
            temp = compressed_data()
            temp.compression(data)
        else:
            # bool is a subclass of int, so it has to be excluded explicitly;
            # otherwise ``compression=True`` would itself be passed on as the
            # compression level instead of the configured default.
            if (isinstance(compression, int)
                    and not isinstance(compression, bool)):
                comp = compression
            else:
                comp = self._compression_level
            temp = carray(data,
                          cparams=cparams(clevel=comp),
                          expectedlen=int(getsizeof(data) / 10))
        self._setChannel(channelName, temp, field=dataField)
Ejemplo n.º 7
0
    def _init_ctable(self, path):
        """
        Create an empty on-disk ctable at the given path.

        The table gets one empty uint32 column per entry of
        ``self._bcolz_fields``, is written in mode 'w' with compression
        level 9, flushed, and then passed through ``self._init_attr``.

        Flushing matters in persistence mode: modifications made through
        ``__setitem__()`` or ``append()`` sit in internal buffers until
        ``flush()`` is called, and part of them may be lost otherwise.

        Parameters
        ----------
        path : string
            The path to rootdir of the new ctable.

        Returns
        -------
        The table returned by ``self._init_attr(table, path)``.
        """
        bcolz_dir = os.path.dirname(path)
        print('bcolz_dir', bcolz_dir)
        # dirname() is '' for a bare relative path; there is nothing to
        # create then, and os.makedirs('') would raise.
        if bcolz_dir and not os.path.exists(bcolz_dir):
            os.makedirs(bcolz_dir)
            print('path', path)
        initial_array = np.empty(0, np.uint32)
        # Configure bcolz
        bcolz.set_nthreads(Num * bcolz.detect_number_of_cores())
        # Print all the versions of packages that bcolz relies on.
        bcolz.print_versions()
        # cparams reference:
        #   clevel : int (0 <= clevel < 10) The compression level.
        #   shuffle : int The shuffle filter to be activated. Allowed values
        #       are bcolz.NOSHUFFLE (0), bcolz.SHUFFLE (1) and
        #       bcolz.BITSHUFFLE (2). The default is bcolz.SHUFFLE.
        #   cname : string ('blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib',
        #       'zstd') Select the compressor to use inside Blosc.
        #   quantize : int (number of significant digits) Quantize data to
        #       improve (lossy) compression. Data is quantized using
        #       np.around(scale*data)/scale, where scale is 2**bits, and bits
        #       is determined from the quantize value. For example, if
        #       quantize=1, bits will be 4. 0 disables quantization.
        #   default : cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        params = bcolz.cparams(clevel=9)
        table = bcolz.ctable(
            rootdir=path,
            # One empty column per field, so the column count always matches
            # the names (this was a hard-coded list of seven before).
            columns=[initial_array] * len(self._bcolz_fields),
            names=self._bcolz_fields,
            mode='w',
            cparams=params
        )
        print('cparams', table.cparams)
        table.flush()
        table = self._init_attr(table, path)
        # table.attrs['metadata'] = self._init_metadata(path)
        return table
Ejemplo n.º 8
0
def save_bcolz(data, rootdir):
    """Write `data` as a uint8 bcolz carray under `rootdir` and flush it.

    The carray is opened in mode "w" and compressed with zlib at level 1.
    """
    compression = bcolz.cparams(clevel=1, cname="zlib")  # lz4hc zlib blosc
    carray_options = {
        "array": data,
        # "chunklen": data.shape[0],
        "dtype": "uint8",
        "cparams": compression,
        "rootdir": rootdir,
        "mode": "w",
    }
    bcolz.carray(**carray_options).flush()
Ejemplo n.º 9
0
def read_releases_v10(pathname):
    """
    Parse the release file in `pathname` and return a ctable with its contents.

    This is only suited for files in Fortran90 namelist format (FP v10).

    Parameters
    ----------
    pathname : str
      Release file name (in Fortran90 namelist format).

    Returns
    -------
    A ctable object from bcolz package.

    Raises
    ------
    ValueError
      If the values extracted from a ``&RELEASE`` group cannot be stored in
      a record; the offending group is printed before re-raising.
    """
    import bcolz

    # Setup the container for the data
    dtype = [('IDATE1', np.int32), ('ITIME1', np.int32), ('IDATE2', np.int32),
             ('ITIME2', np.int32), ('LON1', np.float32), ('LON2', np.float32),
             ('LAT1', np.float32), ('LAT2', np.float32), ('Z1', np.float32),
             ('Z2', np.float32), ('ZKIND', np.int8), ('MASS', np.float32),
             ('PARTS', np.int32), ('COMMENT', 'S32')]
    cparams = bcolz.cparams(cname="lz4", clevel=6, shuffle=1)
    ctable = bcolz.zeros(0, dtype=dtype, cparams=cparams)
    # Records are staged in a numpy buffer of one chunk length and appended
    # to the ctable each time the buffer fills up.
    nrecords = ctable['IDATE1'].chunklen
    releases = np.zeros(nrecords, dtype=dtype)

    # Prepare for reading the input; the context manager closes the handle
    # instead of leaving that to the garbage collector.
    with open(pathname, 'r') as release_file:
        input_str = release_file.read()
    marker = "&RELEASE\n"
    len_marker = len(marker)
    # Compiled once, outside the loop over groups.
    release_re = re.compile(r'\S+=\s+[\"|\s](\S+)[,|\"|\w]')

    # Loop over all the marker groups
    i, n = 0, 0
    while True:
        i = input_str.find(marker, i)
        j = input_str.find(marker, i + 1)
        n += 1
        group_block = input_str[i + len_marker:j]
        i = j
        values = tuple(release_re.findall(group_block))
        try:
            releases[(n - 1) % nrecords] = values
        except ValueError:
            print("Problem at: group: %d, %s" % (n, group_block))
            print("values:", values)
            raise
        if (n % nrecords) == 0:
            ctable.append(releases)
        if (i == -1) or (j == -1):
            break  # marker is not found anymore
    # Remainder
    ctable.append(releases[:n % nrecords])
    ctable.flush()

    return ctable
def save_pred(fpath, pred_arr, meta_dict=None):
    """Store `pred_arr` as a bcolz carray rooted at `fpath` and return it.

    The carray is written in mode 'w' with lz4 compression at level 9. When
    `meta_dict` is given it is saved under the 'meta' key of the carray's
    attrs before flushing.
    """
    compression = bcolz.cparams(clevel=9, cname='lz4')
    stored = bcolz.carray(pred_arr, mode='w', rootdir=fpath,
                          cparams=compression)
    has_meta = meta_dict is not None
    if has_meta:
        stored.attrs['meta'] = meta_dict
    stored.flush()
    return stored
Ejemplo n.º 11
0
def compute_bcolz(sexpr, clevel, vm):
    """Time one bcolz.eval() of `sexpr` and print the elapsed seconds.

    `clevel` is the compression level used for the output and `vm` is
    forwarded to bcolz.eval.
    """
    # Uncomment the next for disabling threading
    # bcolz.set_nthreads(1)
    # bcolz.blosc_set_nthreads(1)
    print("*** bcolz (using compression clevel = %d):" % clevel)
    # NOTE(review): `x` looks unused, but bcolz.eval only receives the
    # expression string, so it presumably resolves `x` by name from this
    # scope — confirm before removing or renaming.
    x = cx  # comment this for using numpy arrays in inputs
    started = time()
    result = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    elapsed = time() - started
    print("Time for bcolz.eval (%s) --> %.3f" % (vm, elapsed))
Ejemplo n.º 12
0
def create_bcolz(arr, dirname):
    """Write `arr` to a bcolz carray rooted at `dirname` and return it.

    The carray is created in mode 'w' with chunklen=1 and lz4 compression at
    level 5, then flushed.
    """
    ca = bcolz.carray(arr,
                      rootdir=dirname,
                      mode='w',
                      cparams=bcolz.cparams(clevel=5, cname='lz4'),
                      chunklen=1)
    ca.flush()
    return ca
Ejemplo n.º 13
0
    def __init__(self, columns=None, names=None, **kwargs):
        """Create a new ctable from `columns`, or open one stored in `rootdir`.

        Recognised kwargs: ``cparams`` (default ``bcolz.cparams()``),
        ``rootdir`` (default None), ``auto_flush`` and ``mode``. When ``mode``
        is missing it becomes 'a' if `rootdir` exists on disk and 'w'
        otherwise. The remaining kwargs are forwarded to ``_create_ctable``.

        Raises ValueError when neither `columns` nor `rootdir` is given, when
        `columns` is passed while appending to an existing rootdir, or when
        'w' mode is requested without `columns`.
        """

        # Important optional params
        self._cparams = kwargs.get('cparams', bcolz.cparams())
        # The directory where this object is saved.
        self.rootdir = kwargs.get('rootdir', None)

        # auto_flush is always removed from kwargs so it doesn't get passed
        # down to the carray; it is only honoured when a rootdir is given.
        requested_auto_flush = kwargs.pop('auto_flush', True)
        if self.rootdir is None:
            self.auto_flush = False
        else:
            self.auto_flush = requested_auto_flush

        if self.rootdir is None and columns is None:
            raise ValueError(
                "You should pass either a `columns` or a `rootdir` param"
                " at very least")

        # The mode in which the object is created/opened
        rootdir_exists = (self.rootdir is not None
                          and os.path.exists(self.rootdir))
        self.mode = kwargs.setdefault('mode', 'a' if rootdir_exists else 'w')
        if rootdir_exists and columns is not None and self.mode == 'a':
            raise ValueError(
                "You cannot pass a `columns` param in 'a'ppend mode.\n"
                "(If you are trying to create a new ctable, perhaps the "
                "directory exists already.)")

        # Setup the columns accessor
        self.cols = cols(self.rootdir, self.mode)

        # The length counter of this array
        self.len = 0

        # Create a new ctable or open it from disk
        if self.mode in ('r', 'a'):
            self._open_ctable()
            _new = False
        elif columns is None:
            raise ValueError(
                "You cannot open a ctable in 'w'rite mode"
                " without a `columns` param")
        else:
            self._create_ctable(columns, names, **kwargs)
            _new = True

        # Attach the attrs to this object
        self.attrs = attrs.attrs(self.rootdir, self.mode, _new=_new)

        # Cache a structured array of len 1 for ctable[int] acceleration
        self._arr1 = np.empty(shape=(1,), dtype=self.dtype)
Ejemplo n.º 14
0
def compute_carray(sexpr, clevel, vm):
    """Time one bcolz.eval() of `sexpr`; print elapsed time and output cratio.

    `clevel` is the compression level used for the output and `vm` is
    forwarded to bcolz.eval.
    """
    # Uncomment the next for disabling threading
    # Maybe due to some contention between Numexpr and Blosc?
    # bcolz.set_nthreads(bcolz.ncores//2)
    print("*** carray (using compression clevel = %d):" % clevel)
    if clevel > 0:
        # NOTE(review): these look unused, but bcolz.eval only receives the
        # expression string, so it presumably resolves x/y/z by name from
        # this scope — confirm before removing or renaming.
        x, y, z = cx, cy, cz
    started = time()
    result = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    elapsed = time() - started
    print("Time for bcolz.eval (%s) --> %.3f" % (vm, elapsed), end="")
    compression_ratio = result.nbytes / float(result.cbytes)
    print(", cratio (out): %.1f" % compression_ratio)
Ejemplo n.º 15
0
def compute_bcolz(sexpr, clevel, vm):
    """Time one bcolz.eval() of `sexpr` and print the elapsed seconds.

    `clevel` is the compression level used for the output and `vm` is
    forwarded to bcolz.eval.
    """
    # Uncomment the next for disabling threading
    # bcolz.set_nthreads(1)
    # bcolz.blosc_set_nthreads(1)
    print("*** bcolz (using compression clevel = %d):" % clevel)
    # NOTE(review): `x` looks unused, but bcolz.eval only receives the
    # expression string, so it presumably resolves `x` by name from this
    # scope — confirm before removing or renaming.
    x = cx  # comment this for using numpy arrays in inputs
    started = time()
    result = bcolz.eval(sexpr, vm=vm, cparams=bcolz.cparams(clevel))
    elapsed = time() - started
    print("Time for bcolz.eval (%s) --> %.3f" % (vm, elapsed))
Ejemplo n.º 16
0
def test_ctable(clevel):
    """Build a ctable from random rows, query it, and return the matches.

    The table is filled from NR generated rows using compression level
    `clevel`; the `squery` filter is then run and the 'f1' and 'f3' values
    of the matching rows are returned as a two-field float64 array.
    """
    enter()
    row_source = (mv + np.random.rand(NC) - mv for _ in xrange(int(NR)))
    table_params = bcolz.cparams(clevel, cname=cname)
    tc = bcolz.fromiter(row_source,
                        dtype=dt,
                        cparams=table_params,
                        count=int(NR))
    after_create()

    matches = tc.where(squery, 'f1,f3')
    out = np.fromiter(matches, dtype="f8,f8")
    after_query()
    return out
Ejemplo n.º 17
0
 def test01a(self):
     """Testing `__setitem()__` method with start,stop (scalar)"""
     sl = slice(100, 400)

     # Reference result built with plain numpy.
     a = np.full((500, 200), 3, dtype="i4")
     a[sl] = 0

     # Same operation on a bcolz array.
     b = bcolz.fill((500, 200), 3, dtype="i4", rootdir=self.rootdir,
                    cparams=bcolz.cparams())
     b[sl] = 0
     if self.open:
         # Write to disk and read back, so the reopened data is compared.
         b.flush()
         b = bcolz.open(rootdir=self.rootdir)

     assert_array_equal(a[sl], b[sl], "Arrays are not equal")
Ejemplo n.º 18
0
def test_ctable(clevel):
    """Build a ctable from random rows, query it, and return the matches.

    The table is filled from NR generated rows using compression level
    `clevel`; the `squery` filter is then run and the 'f1' and 'f3' values
    of the matching rows are returned as a two-field float64 array.
    """
    enter()
    row_source = (mv + np.random.rand(NC) - mv for _ in xrange(int(NR)))
    table_params = bcolz.cparams(clevel, cname=cname)
    tc = bcolz.fromiter(row_source,
                        dtype=dt,
                        cparams=table_params,
                        count=int(NR))
    after_create()

    matches = tc.where(squery, 'f1,f3')
    out = np.fromiter(matches, dtype="f8,f8")
    after_query()
    return out
Ejemplo n.º 19
0
def get_quantized_ctable(dtype, cparams, quantize=None, expectedlen=None):
    """Build an empty ctable whose floating point columns use the quantize filter.

    One zero-length column is created per field of `dtype`. Fields whose type
    string contains 'f' get compression params copied from `cparams` (clevel
    and cname) with `quantize` added; all other fields use `cparams` as given.
    """
    import bcolz

    field_names = []
    field_columns = []
    for field_name, type_str in dtype.descr:
        field_names.append(field_name)
        if 'f' in type_str:
            # NOTE(review): only clevel and cname are carried over here; the
            # shuffle setting of `cparams` is not — confirm this is intended.
            column_params = bcolz.cparams(clevel=cparams.clevel,
                                          cname=cparams.cname,
                                          quantize=quantize)
        else:
            column_params = cparams
        empty_column = bcolz.zeros(0, dtype=type_str, cparams=column_params,
                                   expectedlen=expectedlen)
        field_columns.append(empty_column)
    return bcolz.ctable(columns=field_columns, names=field_names)
Ejemplo n.º 20
0
    def __init__(self,
                 path=None,
                 journal=None,
                 contiguity=None,
                 # bcolz params
                 expectedlen=None,
                 chunklen=1024 ** 2 // 2,  # 500K rows
                 cparams=bcolz.cparams(clevel=5, shuffle=False, cname='lz4hc')):
        """Set up the store and remember the bcolz settings.

        `path`, `journal` and `contiguity` are forwarded to the parent
        initializer; `expectedlen`, `chunklen` and `cparams` are kept on the
        instance, with `cparams` wrapped by ``whatable``.
        """
        super(JaggedByCarray, self).__init__(path,
                                             journal=journal,
                                             contiguity=contiguity)

        # bcolz settings
        self.expectedlen, self.chunklen = expectedlen, chunklen
        self.cparams = whatable(cparams, add_properties=True)

        # Starts out as None.
        self._bcolz = None
Ejemplo n.º 21
0
 def csv_to_carray(self):
     """Convert every CSV found under ``self._srcdir`` into an on-disk carray.

     Each file is read with ``ReadFromCsv``, its 'time' column is mapped
     through ``timestamp_to_unix``, and the resulting array is written to
     ``self._dstdir/<csv basename>`` with ``quantize=1`` and flushed.
     """
     list_csvs = WalkDir(self._srcdir)
     # logging applies %-formatting to its arguments: without a placeholder
     # the count was never shown and a formatting error was reported.
     logger.info("total counts: %d", len(list_csvs))
     for csv_path in list_csvs:
         df = ReadFromCsv(pd_names, csv_path, 1, ',')
         df['time'] = df['time'].map(timestamp_to_unix)
         arr = np.array(df)
         dst_root = os.path.join(self._dstdir, os.path.basename(csv_path))
         carr = bcolz.carray(arr,
                             chunklen=100 * 1024,
                             expectedlen=100 * 1024,
                             rootdir=dst_root,
                             cparams=bcolz.cparams(quantize=1))
         carr.flush()
Ejemplo n.º 22
0
    def df_to_carray(self, df, dir, name):
        '''
        Store the values of a DataFrame as a bcolz carray under ``dir/name``.

        :param df: data
        :param dir: directory
        :param name: name
        :return: carray
        '''
        return bcolz.carray(np.array(df),
                            chunklen=100 * 1024,
                            expectedlen=100 * 1024,
                            rootdir=os.path.join(dir, name),
                            cparams=bcolz.cparams(quantize=1))
Ejemplo n.º 23
0
def test_whatid():
    # Checks the what().id() strings of both jagged stores against their
    # expected configuration strings.

    carray_store = JaggedByCarray(
        chunklen=1000,
        cparams=bcolz.cparams(clevel=3, cname='zlib', shuffle=False),
        expectedlen=None)
    expected_carray_id = (
        "JaggedByCarray(chunklen=1000,"
        "contiguity=None,"
        "cparams=cparams(clevel=3,cname='zlib',quantize=0,shuffle=False),"
        "expectedlen=None)")
    assert expected_carray_id == carray_store.what().id()

    h5py_store = JaggedByH5Py(chunklen=1000,
                              compression='lzf',
                              compression_opts=0,
                              shuffle=True)
    expected_h5py_id = (
        "JaggedByH5Py(checksum=False,"
        "chunklen=1000,"
        "compression='lzf',"
        "compression_opts=0,"
        "contiguity=None,"
        "shuffle=True)")
    assert expected_h5py_id == h5py_store.what().id()
Ejemplo n.º 24
0
    def __init__(self,
                 data_element_shape,
                 dtype,
                 batch_size,
                 save_path,
                 length=None,
                 append=False,
                 kwargs=None):
        """Open or create the file-backed bcolz array used for storage.

        Parameters
        ----------
        data_element_shape : tuple
            Shape of one element; the array is created with shape
            ``(0,) + data_element_shape``.
        dtype
            Element dtype of the array.
        batch_size
            Forwarded to the parent initializer.
        save_path
            Root directory of the bcolz array.
        length : optional
            Forwarded to the parent initializer and used as ``expectedlen``.
        append : bool
            Try to open an existing array in 'a' mode; a new array is created
            if none is found at `save_path`.
        kwargs : dict, optional
            Extra arguments that override the default array arguments
            (``expectedlen``, ``cparams``, ``dtype``, ``rootdir``).
        """
        import bcolz
        super(bcolz_array_writer, self).__init__(None, data_element_shape,
                                                 dtype, batch_size, length)
        self.save_path = save_path
        # A fresh dict per instance: a `{}` default would be one object
        # shared by every writer created without explicit kwargs.
        self.kwargs = {} if kwargs is None else kwargs

        # Set up array kwargs
        self.arr_kwargs = {
            'expectedlen': length,
            'cparams': bcolz.cparams(clevel=5, shuffle=True, cname='blosclz'),
            'dtype': dtype,
            'rootdir': save_path
        }
        self.arr_kwargs.update(self.kwargs)

        # Create the file-backed array, open for writing.
        # (check if the array exists; if not, create it)
        if append:
            try:
                self.storage_array = bcolz.open(self.save_path, mode='a')
                self.storage_array_ptr = len(self.storage_array)
            except FileNotFoundError:
                append = False
        if not append:
            try:
                self.storage_array = bcolz.zeros(shape=(0, ) +
                                                 data_element_shape,
                                                 mode='w',
                                                 **self.arr_kwargs)
                self.storage_array_ptr = 0
            except Exception:
                print("Error: failed to create file-backed bcolz storage "
                      "array.")
                raise
Ejemplo n.º 25
0
    def test_constructor(self):
        # Covers GenotypeCArray construction: rejected inputs, typed data
        # and explicit compression params.

        # missing data arg
        with assert_raises(ValueError):
            # noinspection PyArgumentList
            GenotypeCArray()

        # inputs that must be rejected with TypeError
        rejected_inputs = (
            'foo bar',  # data has wrong dtype
            [4., 5., 3.7],  # data has wrong dtype
            [1, 2, 3],  # data has wrong dimensions
            [[1, 2], [3, 4]],  # wrong dimensions; use HaplotypeCArray instead
        )
        for data in rejected_inputs:
            with assert_raises(TypeError):
                GenotypeCArray(data)

        # diploid data (typed), then polyploid data (typed)
        for genotype_data in (diploid_genotype_data, triploid_genotype_data):
            g = GenotypeCArray(genotype_data, dtype='i1')
            aeq(genotype_data, g)
            eq(np.int8, g.dtype)

        # cparams
        requested_params = bcolz.cparams(clevel=10)
        g = GenotypeCArray(diploid_genotype_data, cparams=requested_params)
        aeq(diploid_genotype_data, g)
        eq(10, g.cparams.clevel)
Ejemplo n.º 26
0
    def test_constructor(self):
        # Covers GenotypeCArray construction: rejected inputs, typed data
        # and explicit compression params.

        # missing data arg
        with assert_raises(ValueError):
            # noinspection PyArgumentList
            GenotypeCArray()

        # inputs that must be rejected with TypeError
        rejected_inputs = (
            'foo bar',  # data has wrong dtype
            [4., 5., 3.7],  # data has wrong dtype
            [1, 2, 3],  # data has wrong dimensions
            [[1, 2], [3, 4]],  # wrong dimensions; use HaplotypeCArray instead
        )
        for data in rejected_inputs:
            with assert_raises(TypeError):
                GenotypeCArray(data)

        # diploid data (typed), then polyploid data (typed)
        for genotype_data in (diploid_genotype_data, triploid_genotype_data):
            g = GenotypeCArray(genotype_data, dtype='i1')
            aeq(genotype_data, g)
            eq(np.int8, g.dtype)

        # cparams
        requested_params = bcolz.cparams(clevel=10)
        g = GenotypeCArray(diploid_genotype_data, cparams=requested_params)
        aeq(diploid_genotype_data, g)
        eq(10, g.cparams.clevel)
Ejemplo n.º 27
0
    def __init__(self, like, blockdivs, path=None, **kwargs):
        """Create the partition directory and one cframe per partition.

        Parameters
        ----------
        like
            Frame used as a template: its columns, dtypes, index name and
            categorical metadata are stored, and its first rows seed every
            partition.
        blockdivs
            Division points; ``len(blockdivs) + 1`` partitions are created.
        path : optional
            Directory to create for the partitions. A temporary directory is
            made when omitted.
        **kwargs
            Forwarded to ``cframe``. When empty, uncompressed cparams are
            used.

        Raises
        ------
        TypeError
            If any column or the index of `like` has object dtype.
        """
        # Create directory
        if path is None:
            path = tempfile.mkdtemp('.pframe')
            self._explicitly_given_path = False
        else:
            # TODO: support loading of existing pframe
            os.mkdir(path)
            self._explicitly_given_path = True
        self.path = path

        self.blockdivs = tuple(blockdivs)

        # Store Metadata
        self.columns = like.columns
        self.dtypes = like.dtypes
        self.index_name = like.index.name

        self.categories = categorical_metadata(like)
        like2 = strip_categories(like.copy()).iloc[:10]

        # str() of an object dtype is 'object', so comparing against 'O'
        # alone never matched; both spellings are accepted here.
        object_names = ('O', 'object')
        if (any(str(dt) in object_names for dt in like.dtypes)
                or str(like.index.dtype) in object_names):
            raise TypeError(
                'Object dtypes not supported, consider categoricals')

        # Compression
        # TODO:    Sane default compression
        if not kwargs:
            cp = bcolz.cparams(clevel=0, shuffle=False, cname=None)
            kwargs['cparams'] = cp

        # Create partitions
        npartitions = len(blockdivs) + 1
        # Zero-pad the partition number to the digits needed for the count.
        logn = int(ceil(log(npartitions, 10)))
        subpath = 'part-%0' + str(logn) + 'd'
        self.partitions = [
            cframe(like2, rootdir=os.path.join(path, subpath % i), **kwargs)
            for i in range(npartitions)
        ]
        self.lock = Lock()
Ejemplo n.º 28
0
Archivo: core.py Proyecto: OspreyX/dask
    def __init__(self, like, divisions, path=None, **kwargs):
        """Create the partition directory and one cframe per partition.

        `like` is the template frame: its columns, dtypes, index name and
        categorical metadata are stored, and its first rows seed every
        partition. `divisions` gives the division points, so
        ``len(divisions) + 1`` partitions are created under `path` (a
        temporary directory when omitted). Remaining kwargs go to ``cframe``;
        when there are none, uncompressed cparams are used.

        Raises TypeError if any column or the index has object dtype.
        """
        # Create directory
        if path is None:
            path = tempfile.mkdtemp('.pframe')
            self._explicitly_given_path = False
        else:
            # TODO: support loading of existing pframe
            os.mkdir(path)
            self._explicitly_given_path = True
        self.path = path

        self.divisions = tuple(divisions)

        # Store Metadata
        self.columns = like.columns
        self.dtypes = like.dtypes
        self.index_name = like.index.name

        self.categories = categorical_metadata(like)
        like2 = strip_categories(like.copy()).iloc[:10]

        object_names = ('O', 'object')
        has_object_column = any(str(dt) in object_names
                                for dt in like.dtypes)
        if has_object_column or str(like.index.dtype) in object_names:
            raise TypeError('Object dtypes not supported, consider categoricals')

        # Compression
        # TODO:    Sane default compression
        if not kwargs:
            kwargs['cparams'] = bcolz.cparams(clevel=0, shuffle=False,
                                              cname=None)

        # Create partitions
        npartitions = len(divisions) + 1
        # Zero-pad the partition number to the digits needed for the count.
        width = int(ceil(log(npartitions, 10)))
        subpath = 'part-%0' + str(width) + 'd'
        partitions = []
        for i in range(npartitions):
            partition_dir = os.path.join(path, subpath % i)
            partitions.append(cframe(like2, rootdir=partition_dir, **kwargs))
        self.partitions = partitions
        self.lock = Lock()
Ejemplo n.º 29
0
def create_dataset(*, source_file, out_dir):
    """Convert the CSV at `source_file` into a bcolz ctable under `out_dir`.

    The CSV is read in chunks of one million rows. Each chunk is passed
    through ``_convert_df_to_32_bit``; the first one creates the table and
    the rest are appended column by column. A summary line is printed once
    the table has been flushed.
    """
    chunk_reader = pd.read_csv(source_file, chunksize=1_000_000)
    first_chunk: pd.DataFrame = next(chunk_reader)
    _convert_df_to_32_bit(first_chunk)
    column_names = first_chunk.columns.tolist()

    # For some reason, higher compression levels are actually performing worse.
    compression = bcolz.cparams(clevel=3, cname="lz4hc", shuffle=1)

    # Note: To work around a bug when `names` is present but `columns` is empty,
    # construct this manually.
    table = bcolz.ctable.fromdataframe(first_chunk,
                                       cparams=compression,
                                       rootdir=str(out_dir))

    for chunk in chunk_reader:
        _convert_df_to_32_bit(chunk)
        chunk_columns = [chunk[name] for name in column_names]
        table.append(cols=chunk_columns)
    table.flush()

    num_rows = table.shape[0]
    size_mb = table.cbytes / (1024.0**2)
    print(f"Created bcolz table with {num_rows} rows, compression settings "
          f"{table.cparams}, final size {size_mb:.1f} MiB")
Ejemplo n.º 30
0
    def test_constructor(self):
        # Covers HaplotypeCArray construction: rejected inputs, typed data
        # and explicit compression params.

        # missing data arg
        with assert_raises(ValueError):
            # noinspection PyArgumentList
            HaplotypeCArray()

        # inputs that must be rejected with TypeError
        rejected_inputs = (
            'foo bar',  # data has wrong dtype
            [4., 5., 3.7],  # data has wrong dtype
            [1, 2, 3],  # data has wrong dimensions
            [[[1, 2], [3, 4]]],  # wrong dimensions; use GenotypeCArray instead
        )
        for data in rejected_inputs:
            with assert_raises(TypeError):
                HaplotypeCArray(data)

        # typed data (typed)
        h = HaplotypeCArray(haplotype_data, dtype='i1')
        aeq(haplotype_data, h)
        eq(np.int8, h.dtype)

        # cparams
        requested_params = bcolz.cparams(clevel=10)
        h = HaplotypeCArray(haplotype_data, cparams=requested_params)
        aeq(haplotype_data, h)
        eq(10, h.cparams.clevel)
Ejemplo n.º 31
0
    def test_constructor(self):
        # Covers AlleleCountsCArray construction: rejected inputs, typed
        # data and explicit compression params.

        # missing data arg
        with assert_raises(ValueError):
            # noinspection PyArgumentList
            AlleleCountsCArray()

        # inputs that must be rejected with TypeError
        rejected_inputs = (
            'foo bar',  # data has wrong dtype
            [4., 5., 3.7],  # data has wrong dtype
            [1, 2, 3],  # data has wrong dimensions
            [[[1, 2], [3, 4]]],  # data has wrong dimensions
        )
        for data in rejected_inputs:
            with assert_raises(TypeError):
                AlleleCountsCArray(data)

        # typed data (typed)
        ac = AlleleCountsCArray(allele_counts_data, dtype='u1')
        aeq(allele_counts_data, ac)
        eq(np.uint8, ac.dtype)

        # cparams
        requested_params = bcolz.cparams(clevel=10)
        ac = AlleleCountsCArray(allele_counts_data, cparams=requested_params)
        aeq(allele_counts_data, ac)
        eq(10, ac.cparams.clevel)
Ejemplo n.º 32
0
    def test_constructor(self):
        # Covers AlleleCountsCArray construction: rejected inputs, typed
        # data and explicit compression params.

        # missing data arg
        with assert_raises(ValueError):
            # noinspection PyArgumentList
            AlleleCountsCArray()

        # inputs that must be rejected with TypeError
        rejected_inputs = (
            'foo bar',  # data has wrong dtype
            [4., 5., 3.7],  # data has wrong dtype
            [1, 2, 3],  # data has wrong dimensions
            [[[1, 2], [3, 4]]],  # data has wrong dimensions
        )
        for data in rejected_inputs:
            with assert_raises(TypeError):
                AlleleCountsCArray(data)

        # typed data (typed)
        ac = AlleleCountsCArray(allele_counts_data, dtype='u1')
        aeq(allele_counts_data, ac)
        eq(np.uint8, ac.dtype)

        # cparams
        requested_params = bcolz.cparams(clevel=10)
        ac = AlleleCountsCArray(allele_counts_data, cparams=requested_params)
        aeq(allele_counts_data, ac)
        eq(10, ac.cparams.clevel)
Ejemplo n.º 33
0
    def test_constructor(self):
        # Covers HaplotypeCArray construction: rejected inputs, typed data
        # and explicit compression params.

        # missing data arg
        with assert_raises(ValueError):
            # noinspection PyArgumentList
            HaplotypeCArray()

        # inputs that must be rejected with TypeError
        rejected_inputs = (
            'foo bar',  # data has wrong dtype
            [4., 5., 3.7],  # data has wrong dtype
            [1, 2, 3],  # data has wrong dimensions
            [[[1, 2], [3, 4]]],  # wrong dimensions; use GenotypeCArray instead
        )
        for data in rejected_inputs:
            with assert_raises(TypeError):
                HaplotypeCArray(data)

        # typed data (typed)
        h = HaplotypeCArray(haplotype_data, dtype='i1')
        aeq(haplotype_data, h)
        eq(np.int8, h.dtype)

        # cparams
        requested_params = bcolz.cparams(clevel=10)
        h = HaplotypeCArray(haplotype_data, cparams=requested_params)
        aeq(haplotype_data, h)
        eq(10, h.cparams.clevel)
Ejemplo n.º 34
0
    #print("cout-->", repr(cout))


# Benchmark driver: times numexpr on a boolean expression over a large
# array and, when `doprofile` is set, profiles compute_bcolz on the same
# expression.
if __name__ == "__main__":

    N = 1e8  # the number of elements in x
    clevel = 3  # the compression level
    # The first expression is immediately overwritten by the second one.
    sexpr = "(x+1)<0"
    sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)<0"
    # sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)"
    doprofile = 0  # set to a truthy value to run the cProfile section below

    print("Creating inputs...")
    x = np.arange(N)
    #x = np.linspace(0,100,N)
    # Compressed copy of x.
    cx = bcolz.carray(x, cparams=bcolz.cparams(clevel))

    print("Evaluating '%s' with 10^%d points" % (sexpr, int(math.log10(N))))

    # numexpr timing
    t0 = time()
    cout = ne.evaluate(sexpr)
    print("Time for numexpr --> %.3f" % (time() - t0,))

    if doprofile:
        import pstats
        import cProfile as prof

        # The profiled statement is a string, so compute_bcolz, sexpr and
        # clevel are looked up by name when it runs; the profile is written
        # to 'eval.prof'.
        prof.run('compute_bcolz(sexpr, clevel=clevel, vm="numexpr")',
                 #prof.run('compute_bcolz(sexpr, clevel=clevel, vm="python")',
                 'eval.prof')
        stats = pstats.Stats('eval.prof')
Ejemplo n.º 35
0
from time import time

import numpy as np

import bcolz

# Benchmark: compare sum() over a boolean array held as a plain NumPy array
# against the same data held in a compressed bcolz carray.

# Number of elements.  This must be an int: np.random.randint() does not
# accept a float `size` such as 1e8.
N = int(1e8)
# a = np.arange(N, dtype='f8')
a = np.random.randint(0, 10, N).astype('bool')

# Baseline: NumPy sum.
t0 = time()
sa = a.sum()
print("Time sum() numpy --> %.3f" % (time() - t0))

# Cost of converting the NumPy array into a carray at compression level 9.
t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(9))
print("Time carray conv --> %.3f" % (time() - t0))
print("ac-->", repr(ac))

# carray sum over the compressed data.
t0 = time()
sac = ac.sum()
#sac = ac.sum(dtype=np.dtype('i8'))
print("Time sum() carray --> %.3f" % (time() - t0))

# t0 = time()
# sac = sum(i for i in ac)
# print("Time sum() carray (iter) --> %.3f" % (time()-t0))

# Both containers must agree on the result.
print("sa, sac-->", sa, sac, type(sa), type(sac))
assert (sa == sac)
Ejemplo n.º 36
0
# Benchmark for iterators

from time import time

import numpy as np

import bcolz

# --- Benchmark configuration -------------------------------------------
N = 1e8  # the number of elements in x
clevel = 5  # the compression level
# Exactly one `sexpr` should be active; the commented lines are alternatives.
sexpr = "(x-1) < 10."  # the expression to compute
# sexpr = "((x-1) % 1000) == 0."  # the expression to compute
#sexpr = "(2*x**3+.3*y**2+z+1)<0"  # the expression to compute

# Compression parameters shared by every carray created below.
cparams = bcolz.cparams(clevel)

print("Creating inputs...")

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
# Build a one-column table unless the expression mentions 'y', in which case
# the 'y' and 'z' columns are created as well.
if 'y' not in sexpr:
    ct = bcolz.ctable((cx, ), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    ct = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

print("Evaluating...", sexpr)
# Start of the timed section.
t0 = time()
Ejemplo n.º 37
0
    cPickle.dump(spacings, f, protocol=cPickle.HIGHEST_PROTOCOL)

# Persist `origins` as a gzip-compressed pickle in OUTPUT_FOLDER.
with gzip.open(OUTPUT_FOLDER + "origins.pkl.gz", "wb") as f:
    cPickle.dump(origins, f, protocol=cPickle.HIGHEST_PROTOCOL)

# STAGE1

# Process the entries of INPUT_FOLDER_STAGE1 in sorted order.
patients = os.listdir(INPUT_FOLDER_STAGE1)
patients.sort()
print len(patients), "patients"

for i, patient in enumerate(patients):
    scan = preptools.load_scan(INPUT_FOLDER_STAGE1 + patient)
    # scan = preptools.load_scan(INPUT_FOLDER_STAGE1 + patient, stop_before_pixels=True)
    pixels = preptools.get_pixels_hu(scan)
    # get_spacing is called on the first element of the scan only.
    spacing, flipped = preptools.get_spacing(scan[0])
    spacings[patient] = tuple(spacing)
    # Reverse the first axis when the scan is reported as flipped.
    if flipped: pixels = pixels[::-1, :, :]
    # if i > -1: preptools.plot_3d(pixels, theshold=-500, spacing=spacing)
    # Write the volume as an on-disk carray rooted at DATA_FOLDER + patient;
    # mode="w" is requested, presumably replacing any existing array - confirm.
    data_bcolz = bcolz.carray(
        array=pixels,
        chunklen=pixels.shape[0],
        dtype="int16",
        cparams=bcolz.cparams(clevel=1, cname="zlib"),  #lz4hc zlib blosc
        rootdir=DATA_FOLDER + patient,
        mode="w")
    data_bcolz.flush()
    # Progress line: "<done>/<total> <patient> <spacing>".
    print "%i/%i" % (i + 1, len(patients)), patient, spacing  #, pixels.shape

# Persist the per-patient spacings collected in the loop above.
with gzip.open(OUTPUT_FOLDER + "spacings.pkl.gz", "wb") as f:
    cPickle.dump(spacings, f, protocol=cPickle.HIGHEST_PROTOCOL)
Ejemplo n.º 38
0
        suffix = kwargs.pop('suffix', '.bcolz')
        prefix = kwargs.pop('prefix', 'scikit_allel_')
        tempdir = kwargs.pop('dir', None)
        rootdir = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=tempdir)
        atexit.register(shutil.rmtree, rootdir)
        kwargs['rootdir'] = rootdir
        kwargs['mode'] = 'w'
        return kwargs


# Ready-made storage instances.  Each one is followed by a bare string that
# describes it.
bcolz_storage = BcolzStorage()
"""bcolz storage with default parameters"""
bcolzmem_storage = BcolzMemStorage()
"""bcolz in-memory storage with default compression"""
bcolztmp_storage = BcolzTmpStorage()
"""bcolz temporary file storage with default compression"""
# One cparams object (zlib codec, compression level 1) is shared by the three
# *_zlib1 storage instances below.
_zlib1 = bcolz.cparams(cname='zlib', clevel=1)
bcolz_zlib1_storage = BcolzStorage(cparams=_zlib1)
"""bcolz storage with zlib level 1 compression"""
bcolzmem_zlib1_storage = BcolzMemStorage(cparams=_zlib1)
"""bcolz in-memory storage with zlib level 1 compression"""
bcolztmp_zlib1_storage = BcolzTmpStorage(cparams=_zlib1)
"""bcolz temporary file storage with zlib level 1 compression"""

# Register every instance in the shared storage registry under a string key.
_util.storage_registry['bcolz'] = bcolz_storage
_util.storage_registry['bcolzmem'] = bcolzmem_storage
_util.storage_registry['bcolztmp'] = bcolztmp_storage
_util.storage_registry['bcolz_zlib1'] = bcolz_zlib1_storage
_util.storage_registry['bcolzmem_zlib1'] = bcolzmem_zlib1_storage
_util.storage_registry['bcolztmp_zlib1'] = bcolztmp_zlib1_storage
Ejemplo n.º 39
0
from time import time

import numpy as np

import bcolz


# Benchmark: time carray creation in memory and on disk, a plain NumPy copy,
# and re-opening the on-disk carray.
N = int(1e7)  # number of elements
CLEVEL = 5  # compression level used for both carrays

a = np.linspace(0, 1, N)

# In-memory carray.
t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(clevel=CLEVEL))
print("time creation (memory) ->", round(time() - t0, 3))
print("data (memory):", repr(ac))

# On-disk carray rooted at the directory 'myarray'; the flush() call is
# inside the timed region.
t0 = time()
b = bcolz.carray(a, cparams=bcolz.cparams(clevel=CLEVEL),
                 rootdir='myarray', mode='w')
b.flush()
print("time creation (disk) ->", round(time() - t0, 3))
# print "meta (disk):", b.read_meta()

# Plain NumPy copy, for comparison.
t0 = time()
an = np.array(a)
print("time creation (numpy) ->", round(time() - t0, 3))

# Open the carray that was just written to 'myarray'.
t0 = time()
c = bcolz.carray(rootdir='myarray')
print("time open (disk) ->", round(time() - t0, 3))
Ejemplo n.º 40
0
import six

import bcolz
from pybedtools import BedTool
import pyBigWig
from pysam import FastaFile

from .util import makedirs
from .util import one_hot_encode_sequence
from .util import nan_to_zero
from .tiledb_array import write_tiledb
from .tiledb_array import load_tiledb

# NOTE(review): presumably the alphabet size of a one-hot encoded sequence
# (A, C, G, T) - confirm against its users.
NUM_SEQ_CHARS = 4

# Compression settings used for every bcolz array written by this module.
_blosc_params = bcolz.cparams(clevel=5, shuffle=bcolz.SHUFFLE, cname="lz4")


def _write_numpy_array(arr, path):
    """Save `arr` at `path` using ``np.save``."""
    return np.save(path, arr)


def _write_bcolz_array(arr, path):
    """Write `arr` as a bcolz carray rooted at `path` and flush it."""
    carr = bcolz.carray(arr, rootdir=path, cparams=_blosc_params, mode="w")
    return carr.flush()


# Maps a storage mode name to a callable taking ``(arr, path)``.
_array_writer = dict(
    numpy=_write_numpy_array,
    bcolz=_write_bcolz_array,
    tiledb=write_tiledb,
)


def extract_fasta_to_file(fasta, output_dir, mode="bcolz", overwrite=False):
    assert mode in _array_writer
Ejemplo n.º 41
0
# `x`, `y`, `N` and `clevel` are defined before this excerpt.
z = xrange(2, N + 2)

print("Starting benchmark now for creating arrays...")
# Create a ndarray
# x = (i for i in xrange(N))    # true iterable
t0 = time()
out = np.fromiter(x, dtype='f8', count=N)
print("Time for array--> %.3f" % (time() - t0,))
print("out-->", len(out))

#bcolz.set_num_threads(bcolz.ncores//2)

# Create a carray
#x = (i for i in xrange(N))    # true iterable
t0 = time()
cout = bcolz.fromiter(x, dtype='f8', count=N, cparams=bcolz.cparams(clevel))
print("Time for carray--> %.3f" % (time() - t0,))
print("cout-->", len(cout))
# The carray must hold the same values as the NumPy reference array.
assert_array_equal(out, cout, "Arrays are not equal")

# Create a carray (with unknown size)
#x = (i for i in xrange(N))    # true iterable
t0 = time()
# count=-1: the number of items is not given up front.
cout = bcolz.fromiter(x, dtype='f8', count=-1, cparams=bcolz.cparams(clevel))
print("Time for carray (count=-1)--> %.3f" % (time() - t0,))
print("cout-->", len(cout))
assert_array_equal(out, cout, "Arrays are not equal")

# Retrieve from a structured ndarray
# Generator of (x, y, z) tuples that feeds the next timed section.
gen = ((i, j, k) for i, j, k in izip(x, y, z))
t0 = time()
Ejemplo n.º 42
0
def append(data, clevel):
    """Merge the arrays in `data` into one compressed carray.

    The first element of `data` seeds a ``bcolz.carray`` created with
    compression level `clevel`; each remaining element is then appended to
    it, in order.  The resulting carray is returned.
    """
    compression = bcolz.cparams(clevel)
    merged = bcolz.carray(data[0], cparams=compression)

    for piece in data[1:]:
        merged.append(piece)
    return merged
Ejemplo n.º 43
0
Archivo: query.py Proyecto: Blosc/bcolz
import bcolz


# --- Benchmark configuration -------------------------------------------
N = int(1e8)  # the number of elements in x
clevel = 9    # the compression level
cname = "blosclz"  # the compressor name
# Exactly one `sexpr` should be active; the commented lines are alternatives.
sexpr = "(x+1)<10"  # small number of items
# sexpr = "(x+1)<1000000"              # large number
# sexpr = "(2*x*x*x+.3*y**2+z+1)<10"  # small number
#sexpr = "(2*x*x*x+.3*y**2+z+1)<1e15"  # medium number
#sexpr = "(2*x*x*x+.3*y**2+z+1)<1e20"  # large number

print("Creating inputs...")

# Compression parameters shared by every carray created below.
cparams = bcolz.cparams(clevel=clevel, cname=cname)

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
# Build a one-column table unless the expression mentions 'y', in which case
# the 'y' and 'z' columns are created as well.
if 'y' not in sexpr:
    t = bcolz.ctable((cx,), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    t = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])
# Full slice of the table.
# NOTE(review): presumably materialises it as a NumPy structured array - confirm.
nt = t[:]

print("Querying '%s' with 10^%d points" % (sexpr, int(math.log10(N))))
Ejemplo n.º 44
0
import numpy as np

import bcolz

# --- Benchmark configuration -------------------------------------------
N = 1e7  # the number of elements in x (note: a float literal)
clevel = 5  # the compression level
cname = "blosclz"  # the compressor name
# Only the last uncommented assignment of `sexpr` takes effect; the other
# lines are alternatives kept for quick switching.
sexpr = "(x+1)<10"  # small number of items
# sexpr = "(x+1)<1000000"              # large number
sexpr = "(2*x*x*x+.3*y**2+z+1)<10"  # small number
#sexpr = "(2*x*x*x+.3*y**2+z+1)<1e15"  # medium number
#sexpr = "(2*x*x*x+.3*y**2+z+1)<1e20"  # large number

print("Creating inputs...")

# Compression parameters shared by every carray created below.
cparams = bcolz.cparams(clevel=clevel, cname=cname)

x = np.arange(N)
cx = bcolz.carray(x, cparams=cparams)
# Build a one-column table unless the expression mentions 'y', in which case
# the 'y' and 'z' columns are created as well.
if 'y' not in sexpr:
    t = bcolz.ctable((cx, ), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    t = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])
# Full slice of the table.
# NOTE(review): presumably materialises it as a NumPy structured array - confirm.
nt = t[:]

print("Querying '%s' with 10^%d points" % (sexpr, int(math.log10(N))))
Ejemplo n.º 45
0
Archivo: sum.py Proyecto: Blosc/bcolz
import numpy as np

import bcolz


# Benchmark: compare sum() over a boolean array held as a plain NumPy array
# against the same data held in a compressed bcolz carray.

# Number of elements.  This must be an int: np.random.randint() does not
# accept a float `size` such as 1e8.
N = int(1e8)
# a = np.arange(N, dtype='f8')
a = np.random.randint(0, 10, N).astype('bool')

# Baseline: NumPy sum.
t0 = time()
sa = a.sum()
print("Time sum() numpy --> %.3f" % (time() - t0))

# Cost of converting the NumPy array into a carray at compression level 9.
t0 = time()
ac = bcolz.carray(a, cparams=bcolz.cparams(9))
print("Time carray conv --> %.3f" % (time() - t0))
print("ac-->", repr(ac))

# carray sum over the compressed data.
t0 = time()
sac = ac.sum()
#sac = ac.sum(dtype=np.dtype('i8'))
print("Time sum() carray --> %.3f" % (time() - t0))

# t0 = time()
# sac = sum(i for i in ac)
# print("Time sum() carray (iter) --> %.3f" % (time()-t0))

# Both containers must agree on the result.
print("sa, sac-->", sa, sac, type(sa), type(sac))
assert (sa == sac)
Ejemplo n.º 46
0
 def setUp(self):
     # Use in-memory bcolz storage with zlib level 1 compression as the
     # default chunked storage for the tests in this class.
     zlib_level_one = bcolz.cparams(cname='zlib', clevel=1)
     default_storage = chunked.BcolzMemStorage(cparams=zlib_level_one)
     chunked.storage_registry['default'] = default_storage
Ejemplo n.º 47
0
def read_partpositions(filename,
                       nspec,
                       ctable=True,
                       clevel=5,
                       cname="lz4",
                       quantize=None):
    """Read the particle positions in `filename`.

    This function strives to use as little memory as possible; for this, a
    bcolz ctable container is used for holding the data.  Besides being
    compressed in-memory, its chunked nature makes it a natural fit for data
    that needs to be appended because it does not need expensive memory
    resize operations.

    NOTE: This code reads directly from an UNFORMATTED SEQUENTIAL data Fortran
    file so care has been taken to skip the record length at the beginning and
    the end of every record.  See:
    http://stackoverflow.com/questions/8751185/fortran-unformatted-file-format

    Parameters
    ----------
    filename : string
        The file name of the particle raw data
    nspec : int
        number of species in particle raw data
    ctable : bool
        Return a bcolz ctable container.  If not, a numpy structured array is returned instead.
    clevel : int
        Compression level for the ctable container
    cname : string
        Codec name for the ctable container.  Can be 'blosclz', 'lz4', 'zlib' or 'zstd'.
    quantize : int or None
        Quantize data to improve (lossy) compression.  Data is quantized using
        np.around(scale*data)/scale, where scale is 2**bits, and bits is
        determined from the quantize value.  For example, if quantize=1, bits
        will be 4.  None (the default) or 0 means that the quantization is
        disabled.

    Returns
    -------
    ctable object OR structured_numpy_array

    Returning a ctable is preferred because it is used internally so it does not require to be
    converted to other formats, so it is faster and uses less memory.

    Note: Passing a `quantize` param > 0 can increase the compression ratio of the ctable
    container, but it may also slow down the reading speed significantly.

    Raises
    ------
    AssertionError
        If the leading `itimein` record is not framed by record lengths of 4.

    License
        This function is taken from the reflexible package (https://github.com/spectraphilic/reflexible/tree/master/reflexible).
        Authored by John F Burkhart <*****@*****.**> with contributions Francesc Alted <*****@*****.**>.
        Licensed under: 'This script follows creative commons usage.'


    """

    CHUNKSIZE = 10 * 1000  # number of records requested per read
    xmass_dtype = [('xmass_%d' % (i + 1), 'f4') for i in range(nspec)]
    # note age is calculated from itramem by adding itimein
    out_fields = [('npoint', 'i4'), ('xtra1', 'f4'), ('ytra1', 'f4'),
                  ('ztra1', 'f4'), ('itramem', 'i4'), ('topo', 'f4'),
                  ('pvi', 'f4'), ('qvi', 'f4'), ('rhoi', 'f4'),
                  ('hmixi', 'f4'), ('tri', 'f4'), ('tti', 'f4')] + xmass_dtype
    # On disk every record is framed by its length, before and after the
    # payload; both markers are read as columns and dropped at the end.
    raw_fields = [('begin_recsize', 'i4')
                  ] + out_fields + [('end_recsize', 'i4')]
    raw_rectype = np.dtype(raw_fields)
    recsize = raw_rectype.itemsize

    cparams = bcolz.cparams(clevel=clevel, cname=cname)
    if quantize is not None and quantize > 0:
        out = get_quantized_ctable(raw_rectype,
                                   cparams=cparams,
                                   quantize=quantize,
                                   expectedlen=int(1e6))
    else:
        out = bcolz.zeros(0,
                          dtype=raw_rectype,
                          cparams=cparams,
                          expectedlen=int(1e6))

    # Use the default buffering: line buffering (buffering=1) is only
    # meaningful for text-mode files and is not supported in binary mode.
    with open(filename, "rb") as f:
        # The timein value is at the beginning of the file, stored as its own
        # record: <record length> <itimein> <record length>.
        reclen = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")[0]
        assert reclen == 4
        # itimein is not used by this function, but its 4 bytes must still be
        # consumed; building the array also fails if they are missing.
        np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")
        reclen = np.ndarray(shape=(1, ), buffer=f.read(4), dtype="i4")[0]
        assert reclen == 4
        while True:
            # Try to read a complete chunk
            data = f.read(CHUNKSIZE * recsize)
            # the actual number of complete records read
            read_records = len(data) // recsize
            if read_records:
                chunk = np.ndarray(shape=(read_records, ),
                                   buffer=data,
                                   dtype=raw_rectype)
                # Add the chunk to the out array
                out.append(chunk)
            if read_records < CHUNKSIZE:
                # We reached the end of the file
                break

    # Truncate at the max length (last row is always a sentinel, so remove it)
    out.trim(1)
    # Remove the first and last columns
    out.delcol("begin_recsize")
    out.delcol("end_recsize")

    # A full slice converts the ctable into a numpy structured array.
    return out if ctable else out[:]
Ejemplo n.º 48
0
import numpy as np
import numexpr as ne

import bcolz


# --- Benchmark configuration -------------------------------------------
N = 1e7  # the number of elements in x (note: a float literal)
clevel = 3  # the compression level
# Exactly one `sexpr` should be active; the commented lines are alternatives.
# sexpr = "(x+1)<0"  # the expression to compute
#sexpr = "(2*x**3+.3*y**2+z+1)<0"  # the expression to compute
sexpr = "((.25*x + .75)*x - 1.5)*x - 2"  # a computer-friendly polynomial
#sexpr = "(((.25*x + .75)*x - 1.5)*x - 2)<0"  # a computer-friendly polynomial

print("Creating inputs...")

# Compression parameters shared by every carray created below.
cparams = bcolz.cparams(clevel)

x = np.arange(N)
#x = np.linspace(0,100,N)
cx = bcolz.carray(x, cparams=cparams)
# Build a one-column table unless the expression mentions 'y', in which case
# the 'y' and 'z' columns are created as well.
if 'y' not in sexpr:
    t = bcolz.ctable((cx,), names=['x'])
else:
    y = np.arange(N)
    z = np.arange(N)
    cy = bcolz.carray(y, cparams=cparams)
    cz = bcolz.carray(z, cparams=cparams)
    t = bcolz.ctable((cx, cy, cz), names=['x', 'y', 'z'])

print("Evaluating '%s' with 10^%d points" % (sexpr, int(math.log10(N))))
Ejemplo n.º 49
0
 def setUp(self):
     # Use in-memory bcolz storage with zlib level 1 compression as the
     # default chunked storage for the tests in this class.
     zlib_level_one = bcolz.cparams(cname='zlib', clevel=1)
     default_storage = chunked.BcolzMemStorage(cparams=zlib_level_one)
     chunked.storage_registry['default'] = default_storage