def test_store_config_map_c(self) -> None:
        sc1 = StoreConfig(index_depth=3, columns_depth=3)
        maps = {'a': StoreConfig(index_depth=2),
                'b': StoreConfig(index_depth=3)}
        sc1m = StoreConfigMap(maps)

        sc2m = StoreConfigMap.from_initializer(sc1)
        self.assertEqual(sc2m['a'].index_depth, 3)

        sc3m = StoreConfigMap.from_initializer(sc1m)
        self.assertEqual(sc3m['a'].index_depth, 2)
        self.assertEqual(sc3m['b'].index_depth, 3)

        sc4m = StoreConfigMap.from_initializer(maps)
        self.assertEqual(sc4m['a'].index_depth, 2)
        self.assertEqual(sc4m['b'].index_depth, 3)
Example #2
    def __init__(self,
            series: Series,
            *,
            store: tp.Optional[Store] = None,
            config: StoreConfigMapInitializer = None
            ):
        '''
        Args:
            config: StoreConfig for handling ``Frame`` construction and exporting from Store.
        '''

        if series.dtype != DTYPE_OBJECT:
            raise ErrorInitBus(
                    f'Series passed to initializer must have dtype object, not {series.dtype}')

        # do a one-time iteration of the series
        def gen() -> tp.Iterator[bool]:
            for label, value in series.items():
                if not isinstance(label, str):
                    raise ErrorInitBus(f'supplied label {label} is not a string.')

                if isinstance(value, Frame):
                    yield True
                elif value is FrameDeferred:
                    yield False
                else:
                    raise ErrorInitBus(f'supplied {value.__class__} is not a Frame or FrameDeferred.')

        self._loaded = np.fromiter(gen(), dtype=DTYPE_BOOL, count=len(series))
        self._loaded_all = self._loaded.all()
        self._series = series
        self._store = store

        # providing None will result in default; providing a StoreConfig or StoreConfigMap will return an appropriate map
        self._config = StoreConfigMap.from_initializer(config)
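
The constructor above builds its `_loaded` Boolean mask in a single pass over the Series by feeding a generator to `np.fromiter`. A minimal sketch of that pattern, with illustrative data in place of the Series:

import numpy as np

def flags(values):
    # yield one bool per element; raising here aborts array construction early
    for v in values:
        yield isinstance(v, str)

data = ['a', 1, 'b']
# passing count lets NumPy preallocate the result instead of growing it
loaded = np.fromiter(flags(data), dtype=bool, count=len(data))
print(loaded, loaded.all())  # [ True False  True] False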
    def test_store_read_many_single_thread_weak_cache(self) -> None:

        f1, f2, f3 = get_test_framesA()

        with temp_file('.zip') as fp:

            st = StoreZipTSV(fp)
            st.write((f.name, f) for f in (f1, f2, f3))

            kwargs = dict(config_map=StoreConfigMap.from_initializer(
                StoreConfig(index_depth=1)),
                          constructor=st._container_type_to_constructor(Frame),
                          container_type=Frame)

            labels = tuple(st.labels(strip_ext=False))
            self.assertEqual(labels, ('foo.txt', 'bar.txt', 'baz.txt'))

            self.assertEqual(0, len(list(st._weak_cache)))

            # Result is not held onto!
            next(st._read_many_single_thread(('foo', ), **kwargs))

            self.assertEqual(0, len(list(st._weak_cache)))

            # Result IS held onto!
            frame = next(st._read_many_single_thread(('foo', ), **kwargs))

            self.assertEqual(1, len(list(st._weak_cache)))

            # Reference in our weak_cache _is_ `frame`
            self.assertIs(frame, st._weak_cache['foo'])
            del frame

            # Reference is gone now!
            self.assertEqual(0, len(list(st._weak_cache)))
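
The assertions above rest on standard weak-reference semantics: an entry in a `WeakValueDictionary` vanishes once the last strong reference to its value is dropped. A minimal standalone sketch of that behavior (independent of the Store classes):

import weakref

class Payload:
    pass

cache = weakref.WeakValueDictionary()
obj = Payload()
cache['foo'] = obj
assert len(list(cache)) == 1  # entry held while `obj` is a strong reference
del obj                       # drop the only strong reference
assert len(list(cache)) == 0  # entry is gone (immediate under CPython refcounting)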
Example #4
    def write(self,
            items: tp.Iterable[tp.Tuple[tp.Optional[str], Frame]],
            *,
            config: StoreConfigMapInitializer = None,
            # store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT,
            ) -> None:

        config_map = StoreConfigMap.from_initializer(config)

        # NOTE: register adapters for NP types:
        # numpy types go in as blobs if they are not individually converted to Python types
        sqlite3.register_adapter(np.int64, int)
        sqlite3.register_adapter(np.int32, int)
        sqlite3.register_adapter(np.int16, int)
        # common python types
        sqlite3.register_adapter(Fraction, str)
        sqlite3.register_adapter(complex, lambda x: f'{x.real}:{x.imag}')


        # hierarchical columns might be stored as tuples
        with sqlite3.connect(self._fp, detect_types=sqlite3.PARSE_DECLTYPES) as conn:
            cursor = conn.cursor()
            for label, frame in items:
                c = config_map[label]

                self._frame_to_table(frame=frame,
                        label=label,
                        cursor=cursor,
                        include_columns=c.include_columns,
                        include_index=c.include_index,
                        # store_filter=store_filter
                        )

            conn.commit()
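
The adapter registrations above are what let NumPy scalars and other non-native types be bound as SQLite parameters; per the comment, unadapted NumPy values would otherwise go in as blobs. A small standalone sketch of `sqlite3.register_adapter`, assuming only the standard library and NumPy:

import sqlite3
import numpy as np

# convert np.int64 to a plain Python int whenever it is bound as a parameter
sqlite3.register_adapter(np.int64, int)

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE t (x INTEGER)')
conn.execute('INSERT INTO t VALUES (?)', (np.int64(3),))
print(conn.execute('SELECT x FROM t').fetchone())  # (3,)
conn.close()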
    def write(self,
              items: tp.Iterable[tp.Tuple[tp.Hashable, Frame]],
              *,
              config: StoreConfigMapInitializer = None) -> None:
        config_map = StoreConfigMap.from_initializer(config)
        multiprocess = (config_map.default.write_max_workers is not None
                        and config_map.default.write_max_workers > 1)

        def gen() -> tp.Iterable[PayloadFrameToBytes]:
            for label, frame in items:
                yield PayloadFrameToBytes(  # pylint: disable=no-value-for-parameter
                    name=label,
                    config=config_map[label].to_store_config_he(),
                    frame=frame,
                    exporter=self.__class__._EXPORTER,
                )

        if multiprocess:

            def label_and_bytes() -> tp.Iterator[LabelAndBytes]:
                with ProcessPoolExecutor(max_workers=config_map.default.
                                         write_max_workers) as executor:
                    yield from executor.map(
                        self._payload_to_bytes,
                        gen(),
                        chunksize=config_map.default.write_chunksize)
        else:
            label_and_bytes = lambda: (self._payload_to_bytes(x)
                                       for x in gen())

        with zipfile.ZipFile(self._fp, 'w', zipfile.ZIP_DEFLATED) as zf:
            for label, frame_bytes in label_and_bytes():
                label_encoded = config_map.default.label_encode(label)
                # this will write it without a container
                zf.writestr(label_encoded + self._EXT_CONTAINED, frame_bytes)
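
In the multiprocess branch above, frame serialization is fanned out to worker processes with `ProcessPoolExecutor.map` while the zip archive itself is written in the parent process. A stripped-down sketch of that split, with a plain function standing in for `_payload_to_bytes` (names here are hypothetical):

from concurrent.futures import ProcessPoolExecutor

def to_bytes(payload: str) -> bytes:
    # stand-in for a per-frame exporter running in a worker process
    return payload.encode('utf-8')

def payloads():
    yield from ('a', 'b', 'c')

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=2) as executor:
        # chunksize controls how many payloads are shipped to a worker at once;
        # results come back in input order, so the parent can write them serially
        for result in executor.map(to_bytes, payloads(), chunksize=2):
            print(result)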
    def read_many(self,
            labels: tp.Iterable[tp.Hashable],
            *,
            config: StoreConfigMapInitializer = None,
            container_type: tp.Type[Frame] = Frame,
            ) -> tp.Iterator[Frame]:
        import tables
        config_map = StoreConfigMap.from_initializer(config)

        with tables.open_file(self._fp, mode='r') as file:
            for label in labels:
                c = config_map[label]
                label_encoded = config_map.default.label_encode(label)

                index_depth = c.index_depth
                index_constructors = c.index_constructors
                columns_depth = c.columns_depth
                columns_constructors = c.columns_constructors
                consolidate_blocks = c.consolidate_blocks
                if c.dtypes:
                    raise NotImplementedError('using config.dtypes on HDF5 not yet supported')

                index_arrays = []
                columns_labels = []

                table = file.get_node(f'/{label_encoded}')
                colnames = table.cols._v_colnames

                def blocks() -> tp.Iterator[np.ndarray]:
                    for col_idx, colname in enumerate(colnames):
                        # can also do: table.read(field=colname)
                        array = table.col(colname)
                        if array.dtype.kind in DTYPE_STR_KINDS:
                            array = array.astype(str)
                        array.flags.writeable = False

                        if col_idx < index_depth:
                            index_arrays.append(array)
                            continue
                        # only store column labels for those yielded
                        columns_labels.append(colname)
                        yield array

                if consolidate_blocks:
                    data = TypeBlocks.from_blocks(TypeBlocks.consolidate_blocks(blocks()))
                else:
                    data = TypeBlocks.from_blocks(blocks())

                # this will own_data in subsequent constructor call
                yield container_type._from_data_index_arrays_column_labels(
                        data=data,
                        index_depth=index_depth,
                        index_arrays=index_arrays,
                        index_constructors=index_constructors,
                        columns_depth=columns_depth,
                        columns_labels=columns_labels,
                        columns_constructors=columns_constructors,
                        name=label,
                        )
Example #7
    def test_store_config_map_get_default_a(self) -> None:
        maps = {
            'a': StoreConfig(index_depth=2),
            'b': StoreConfig(index_depth=3)
        }

        sc1m = StoreConfigMap.from_initializer(maps)
        self.assertTrue(sc1m.default == StoreConfigMap._DEFAULT)
Example #8
    def test_store_config_map_init_a(self) -> None:
        maps = {
            'a': StoreConfig(index_depth=2),
            'b': StoreConfig(index_depth=3, label_encoder=str)
        }

        with self.assertRaises(ErrorInitStoreConfig):
            sc1m = StoreConfigMap.from_initializer(maps)
Example #9
    def _from_store(
            cls,
            store: Store,
            config: StoreConfigMapInitializer = None,
            ) -> 'Batch':
        config_map = StoreConfigMap.from_initializer(config)
        items = ((label, store.read(label, config=config_map[label]))
                 for label in store.labels())
        return cls(items, config=config)
Example #10
    def _from_store(
            cls,
            store: Store,
            config: StoreConfigMapInitializer = None,
            max_persist: tp.Optional[int] = None,  # not used
            ) -> 'Batch':
        config_map = StoreConfigMap.from_initializer(config)
        items = ((label, store.read(label, config=config_map[label]))
                 for label in store.labels())
        return cls(items, config=config)
Example #11
    def __init__(self,
            series: Series,
            *,
            store: tp.Optional[Store] = None,
            config: StoreConfigMapInitializer = None,
            max_persist: tp.Optional[int] = None,
            own_data: bool = False,
            ):
        '''
        Default Bus constructor.

        {args}
        '''
        if series.dtype != DTYPE_OBJECT:
            raise ErrorInitBus(
                    f'Series passed to initializer must have dtype object, not {series.dtype}')

        if max_persist is not None:
            # use an (ordered) dictionary to give us an ordered set, simply pointing to None for all keys
            self._last_accessed: tp.Dict[str, None] = {}

        # do a one-time iteration of the series
        def gen() -> tp.Iterator[bool]:
            for label, value in series.items():
                if isinstance(value, Frame):
                    if max_persist is not None:
                        self._last_accessed[label] = None
                    yield True
                elif value is FrameDeferred:
                    yield False
                else:
                    raise ErrorInitBus(f'supplied {value.__class__} is not a Frame or FrameDeferred.')

        self._loaded = np.fromiter(gen(), dtype=DTYPE_BOOL, count=len(series))
        self._loaded_all = self._loaded.all()

        if own_data:
            self._values_mutable = series.values
            self._values_mutable.flags.writeable = True
        else:
            self._values_mutable = series.values.copy()

        self._index = series._index
        self._name = series._name
        self._store = store

        # Not handling cases of max_persist being greater than the length of the Series (might floor to length)
        if max_persist is not None and max_persist < self._loaded.sum():
            raise ErrorInitBus('max_persist cannot be less than the number of already loaded Frames')
        self._max_persist = max_persist

        # providing None will result in default; providing a StoreConfig or StoreConfigMap will return an appropriate map
        self._config = StoreConfigMap.from_initializer(config)
    def labels(
        self,
        *,
        config: StoreConfigMapInitializer = None,
        strip_ext: bool = True,
    ) -> tp.Iterator[tp.Hashable]:

        config_map = StoreConfigMap.from_initializer(config)

        with zipfile.ZipFile(self._fp) as zf:
            for name in zf.namelist():
                if strip_ext:
                    name = name.replace(self._EXT_CONTAINED, '')
                # always use default decoder
                yield config_map.default.label_decode(name)
    def labels(
        self,
        *,
        config: StoreConfigMapInitializer = None,
        strip_ext: bool = True,
    ) -> tp.Iterator[tp.Hashable]:

        config_map = StoreConfigMap.from_initializer(config)

        wb = self._load_workbook(self._fp)
        labels = tuple(wb.sheetnames)
        wb.close()

        for label in labels:
            yield config_map.default.label_decode(label)
Example #14
    def labels(
        self,
        *,
        config: StoreConfigMapInitializer = None,
        strip_ext: bool = True,
    ) -> tp.Iterator[tp.Hashable]:

        config_map = StoreConfigMap.from_initializer(config)

        with sqlite3.connect(self._fp) as conn:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table';")
            for row in cursor:
                yield config_map.default.label_decode(row[0])
Example #15
    def __init__(self,
            series: Series,
            *,
            store: tp.Optional[Store] = None,
            config: StoreConfigMapInitializer = None,
            max_persist: tp.Optional[int] = None,
            ):
        '''
        Args:
            config: StoreConfig for handling :obj:`Frame` construction and exporting from Store.
            max_persist: When loading :obj:`Frame` from a :obj:`Store`, optionally define the maximum number of :obj:`Frame` to remain in the :obj:`Bus`, regardless of the size of the :obj:`Bus`. If more than ``max_persist`` number of :obj:`Frame` are loaded, least-recently loaded :obj:`Frame` will be replaced by ``FrameDeferred``. A ``max_persist`` of 1, for example, permits reading one :obj:`Frame` at a time without ever holding in memory more than 1 :obj:`Frame`.
        '''

        if series.dtype != DTYPE_OBJECT:
            raise ErrorInitBus(
                    f'Series passed to initializer must have dtype object, not {series.dtype}')

        if max_persist is not None:
            self._last_accessed: tp.Dict[str, None] = {}

        # do a one-time iteration of the series
        def gen() -> tp.Iterator[bool]:
            for label, value in series.items():
                if not isinstance(label, str):
                    raise ErrorInitBus(f'supplied label {label} is not a string.')

                if isinstance(value, Frame):
                    if max_persist is not None:
                        self._last_accessed[label] = None
                    yield True
                elif value is FrameDeferred:
                    yield False
                else:
                    raise ErrorInitBus(f'supplied {value.__class__} is not a Frame or FrameDeferred.')

        self._loaded = np.fromiter(gen(), dtype=DTYPE_BOOL, count=len(series))
        self._loaded_all = self._loaded.all()
        self._series = series
        self._store = store

        # max_persist might be less than the number of Frames already loaded
        if max_persist is not None:
            self._max_persist = max(max_persist, self._loaded.sum())
        else:
            self._max_persist = None

        # providing None will result in default; providing a StoreConfig or StoreConfigMap will return an appropriate map
        self._config = StoreConfigMap.from_initializer(config)
    def labels(self, *,
            config: StoreConfigMapInitializer = None,
            strip_ext: bool = True,
            ) -> tp.Iterator[tp.Hashable]:
        '''
        Iterator of labels.
        '''
        import tables

        config_map = StoreConfigMap.from_initializer(config)

        with tables.open_file(self._fp, mode='r') as file:
            for node in file.iter_nodes(where='/',
                    classname=tables.Table.__name__):
                # NOTE: this is not the complete path
                yield config_map.default.label_decode(node.name)
Example #17
    def write(
        self,
        items: tp.Iterable[tp.Tuple[tp.Hashable, Frame]],
        *,
        config: StoreConfigMapInitializer = None,
        # store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT,
    ) -> None:

        config_map = StoreConfigMap.from_initializer(config)

        # NOTE: register adapters for NP types:
        # numpy types go in as blobs if they are not individually converted to Python types
        sqlite3.register_adapter(np.int64, int)
        sqlite3.register_adapter(np.int32, int)
        sqlite3.register_adapter(np.int16, int)
        sqlite3.register_adapter(np.bool_, bool)
        # common python types
        sqlite3.register_adapter(Fraction, str)
        sqlite3.register_adapter(complex, lambda x: f'{x.real}:{x.imag}')

        # SQLite will naturally try to update, not replace, a DB found at the file path; this is not how all other stores work, so best to remove the file first.
        with suppress(FileNotFoundError):
            os.remove(self._fp)

        # hierarchical columns might be stored as tuples
        with sqlite3.connect(self._fp,
                             detect_types=sqlite3.PARSE_DECLTYPES) as conn:
            cursor = conn.cursor()
            for label, frame in items:
                c = config_map[label]

                # for interface compatibility with StoreXLSX, where label can be None
                if label is STORE_LABEL_DEFAULT:
                    label = 'None'
                else:
                    label = config_map.default.label_encode(label)

                self._frame_to_table(
                    frame=frame,
                    label=label,
                    cursor=cursor,
                    include_columns=c.include_columns,
                    include_index=c.include_index,
                    # store_filter=store_filter
                )

            conn.commit()
Example #18
    def write(self,
            items: tp.Iterable[tp.Tuple[tp.Hashable, Frame]],
            *,
            config: StoreConfigMapInitializer = None,
            # store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT
            ) -> None:

        config_map = StoreConfigMap.from_initializer(config)

        with WarningsSilent():
            import tables
            # silence: DeprecationWarning: `np.typeDict` is a deprecated alias for `np.sctypeDict`.

        with tables.open_file(self._fp, mode='w') as file, WarningsSilent():
            # silence NaturalNameWarning: object name is not a valid Python identifier:

            for label, frame in items:
                c = config_map[label]
                label = config_map.default.label_encode(label)

                # should all tables be under a common group?
                field_names, dtypes = self.get_field_names_and_dtypes(
                        frame=frame,
                        include_index=c.include_index,
                        include_index_name=True,
                        include_columns=c.include_columns,
                        include_columns_name=False,
                        )

                # Must set pos to have stable position
                description = {}
                for i, (k, v) in enumerate(zip(field_names, dtypes)):
                    if v == object:
                        raise RuntimeError('cannot store object dtypes in HDF5')
                    description[k] = tables.Col.from_dtype(v, pos=i)

                table = file.create_table('/', # create off root from string
                        name=label,
                        description=description,
                        expectedrows=len(frame),
                        )

                values = self._get_row_iterator(frame=frame,
                        include_index=c.include_index)
                table.append(tuple(values()))
                table.flush()
Example #19
    def write(self,
              items: tp.Iterable[tp.Tuple[str, Frame]],
              *,
              config: StoreConfigMapInitializer = None) -> None:

        config_map = StoreConfigMap.from_initializer(config)

        with zipfile.ZipFile(self._fp, 'w', zipfile.ZIP_DEFLATED) as zf:
            for label, frame in items:
                c = config_map[label]
                dst = BytesIO()
                # write the Frame to an in-memory parquet buffer
                frame.to_parquet(dst,
                                 include_index=c.include_index,
                                 include_columns=c.include_columns)
                dst.seek(0)
                # this will write it without a container
                zf.writestr(label + self._EXT_CONTAINED, dst.read())
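
The pattern above renders each frame into an in-memory buffer and stores the raw bytes under a per-label name with `ZipFile.writestr`. A minimal standalone sketch of that zip-writing step, with illustrative payloads in place of real parquet output:

import zipfile
from io import BytesIO

payloads = {'f1': b'col\n1\n', 'f2': b'col\n2\n'}  # illustrative frame bytes

with zipfile.ZipFile('frames.zip', 'w', zipfile.ZIP_DEFLATED) as zf:
    for label, data in payloads.items():
        dst = BytesIO(data)
        dst.seek(0)
        # writestr stores bytes under the given archive name, no temp file needed
        zf.writestr(label + '.parquet', dst.read())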
Example #20
    def __init__(
        self,
        items: IteratorFrameItems,
        *,
        name: NameType = None,
        config: StoreConfigMapInitializer = None,
        max_workers: tp.Optional[int] = None,
        chunksize: int = 1,
        use_threads: bool = False,
    ):
        self._items = items  # might be a generator!
        self._name = name

        self._config = StoreConfigMap.from_initializer(config)

        self._max_workers = max_workers
        self._chunksize = chunksize
        self._use_threads = use_threads
Example #21
    def write(
        self,
        items: tp.Iterable[tp.Tuple[tp.Optional[str], Frame]],
        *,
        config: StoreConfigMapInitializer = None
        # include_index: bool = True,
        # include_columns: bool = True,
        # store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT
    ) -> None:

        config_map = StoreConfigMap.from_initializer(config)

        import tables

        with tables.open_file(self._fp, mode='w') as file:
            for label, frame in items:
                c = config_map[label]

                # should all tables be under a common group?
                field_names, dtypes = self.get_field_names_and_dtypes(
                    frame=frame,
                    include_index=c.include_index,
                    include_columns=c.include_columns)

                # Must set pos to have stable position
                description = {}
                for i, (k, v) in enumerate(zip(field_names, dtypes)):
                    if v == object:
                        raise RuntimeError(
                            'cannot store object dtypes in HDF5')
                    description[k] = tables.Col.from_dtype(v, pos=i)

                table = file.create_table(
                    '/',  # create off root from string
                    name=label,
                    description=description,
                    expectedrows=len(frame),
                )

                values = self._get_row_iterator(frame=frame,
                                                include_index=c.include_index)
                table.append(tuple(values()))
                table.flush()
Example #22
    def write(self,
              items: tp.Iterable[tp.Tuple[str, Frame]],
              config: StoreConfigMapInitializer = None) -> None:

        # will create a default from None; will let a map pass through
        config_map = StoreConfigMap.from_initializer(config)

        with zipfile.ZipFile(self._fp, 'w', zipfile.ZIP_DEFLATED) as zf:
            for label, frame in items:
                c = config_map[label]
                dst = StringIO()
                # call from class to explicitly pass self as frame
                self.__class__._EXPORTER(frame,
                                         dst,
                                         include_index=c.include_index,
                                         include_columns=c.include_columns)
                dst.seek(0)
                # this will write it without a container
                zf.writestr(label + self._EXT_CONTAINED, dst.read())
Example #23
    def write(
        self,
        items: tp.Iterable[tp.Tuple[tp.Optional[str], Frame]],
        *,
        config: StoreConfigMapInitializer = None,
        # include_index: bool = True,
        # include_columns: bool = True,
        # format_index: tp.Optional[tp.Dict[str, tp.Any]] = None,
        # format_columns: tp.Optional[tp.Dict[str, tp.Any]] = None,
        # merge_hierarchical_labels: bool = True,
        store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT
    ) -> None:
        '''
        Args:
            store_filter: a dictionary mapping objects to strings, enabling replacement of NaN and None values when writing to XLSX.

        '''
        # format_data: tp.Optional[tp.Dict[tp.Hashable, tp.Dict[str, tp.Any]]]
        # format_data: dictionary of dictionaries, keyed by column label, that contains dictionaries of XlsxWriter format specifications.

        # will create a default from None; will let a map pass through
        config_map = StoreConfigMap.from_initializer(config)

        import xlsxwriter

        wb = xlsxwriter.Workbook(self._fp)

        for label, frame in items:
            c = config_map[label]
            format_columns = self._get_format_or_default(wb, c.format_columns)
            format_index = self._get_format_or_default(wb, c.format_index)

            ws = wb.add_worksheet(label)
            self._frame_to_worksheet(
                frame,
                ws,
                format_columns=format_columns,
                format_index=format_index,
                include_index=c.include_index,
                include_columns=c.include_columns,
                merge_hierarchical_labels=c.merge_hierarchical_labels,
                store_filter=store_filter)
        wb.close()
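
Each frame becomes one worksheet named after its label; the xlsxwriter calls used above reduce to the pattern below (the cell writes are illustrative, since the real work happens in `_frame_to_worksheet`):

import xlsxwriter  # assumes the xlsxwriter package is installed

wb = xlsxwriter.Workbook('frames.xlsx')
bold = wb.add_format({'bold': True})  # formats are created on the workbook
ws = wb.add_worksheet('frame_a')      # one sheet per label
ws.write(0, 0, 'x', bold)             # header cell with a format applied
ws.write(1, 0, 10)                    # data cell
wb.close()                            # close() finalizes and writes the file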
Example #24
    def _from_store(
        cls,
        store: Store,
        *,
        config: StoreConfigMapInitializer = None,
        max_workers: tp.Optional[int] = None,
        chunksize: int = 1,
        use_threads: bool = False,
    ) -> 'Batch':
        config_map = StoreConfigMap.from_initializer(config)

        items = ((label, store.read(label, config=config_map[label]))
                 for label in store.labels(config=config_map))

        return cls(
            items,
            config=config,
            max_workers=max_workers,
            chunksize=chunksize,
            use_threads=use_threads,
        )
Example #25
    def read_many(
        self,
        labels: tp.Iterable[tp.Hashable],
        *,
        config: StoreConfigMapInitializer = None,
        container_type: tp.Type[Frame] = Frame,
    ) -> tp.Iterator[Frame]:

        config_map = StoreConfigMap.from_initializer(config)
        sqlite3.register_converter('BOOLEAN', lambda x: x == self._BYTES_ONE)

        with sqlite3.connect(self._fp,
                             detect_types=sqlite3.PARSE_DECLTYPES) as conn:

            for label in labels:
                c = config_map[label]

                if label is STORE_LABEL_DEFAULT:
                    label_encoded = 'None'
                    name = None
                else:
                    label_encoded = config_map.default.label_encode(label)
                    name = label

                query = f'SELECT * from "{label_encoded}"'

                yield tp.cast(
                    Frame,
                    container_type.from_sql(
                        query=query,
                        connection=conn,
                        index_depth=c.index_depth,
                        index_constructors=c.index_constructors,
                        columns_depth=c.columns_depth,
                        columns_select=c.columns_select,
                        columns_constructors=c.columns_constructors,
                        dtypes=c.dtypes,
                        name=name,
                        consolidate_blocks=c.consolidate_blocks))
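
The `BOOLEAN` converter registered above only takes effect because the connection is opened with `detect_types=sqlite3.PARSE_DECLTYPES`, which routes values from columns with a matching declared type back through the converter. A minimal standalone sketch:

import sqlite3

# converters receive the stored value as bytes and return the Python object
sqlite3.register_converter('BOOLEAN', lambda b: b == b'1')

conn = sqlite3.connect(':memory:', detect_types=sqlite3.PARSE_DECLTYPES)
conn.execute('CREATE TABLE t (flag BOOLEAN)')
conn.execute('INSERT INTO t VALUES (?)', (True,))  # bool stored as integer 1
row = conn.execute('SELECT flag FROM t').fetchone()
print(row[0], type(row[0]))  # True <class 'bool'>
conn.close()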
Example #26
    def write(
            self,
            items: tp.Iterable[tp.Tuple[tp.Optional[str], Frame]],
            *,
            config: StoreConfigMapInitializer = None,
            store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT
    ) -> None:
        '''
        Args:
            store_filter: a dictionary mapping objects to strings, enabling replacement of NaN and None values when writing to XLSX.

        '''
        # format_data: tp.Optional[tp.Dict[tp.Hashable, tp.Dict[str, tp.Any]]]
        # format_data: dictionary of dictionaries, keyed by column label, that contains dictionaries of XlsxWriter format specifications.

        # will create a default from None; will let a map pass through
        config_map = StoreConfigMap.from_initializer(config)

        import xlsxwriter

        # NOTE: can supply second argument: {'default_date_format': 'dd/mm/yy'}
        wb = xlsxwriter.Workbook(self._fp, {'remove_timezone': True})

        for label, frame in items:
            c = config_map[label]

            # NOTE: this must be called here, as we need the workbook before assigning formats, and we need to get a config per label
            format_columns = FormatDefaults.get_format_or_default(
                wb, format_funcs=(FormatDefaults.label, ))
            format_index = FormatDefaults.get_format_or_default(
                wb, format_funcs=(FormatDefaults.label, ))

            format_date = FormatDefaults.get_format_or_default(
                wb, format_funcs=(FormatDefaults.date, ))
            format_datetime = FormatDefaults.get_format_or_default(
                wb, format_funcs=(FormatDefaults.datetime, ))

            format_columns_date = FormatDefaults.get_format_or_default(
                wb, format_funcs=(FormatDefaults.label, FormatDefaults.date))
            format_columns_datetime = FormatDefaults.get_format_or_default(
                wb,
                format_funcs=(
                    FormatDefaults.label,
                    FormatDefaults.datetime,
                ))

            format_index_date = FormatDefaults.get_format_or_default(
                wb, format_funcs=(FormatDefaults.label, FormatDefaults.date))
            format_index_datetime = FormatDefaults.get_format_or_default(
                wb,
                format_funcs=(
                    FormatDefaults.label,
                    FormatDefaults.datetime,
                ))

            ws = wb.add_worksheet(label)
            self._frame_to_worksheet(
                frame,
                ws,
                format_columns=format_columns,
                format_index=format_index,
                format_date=format_date,
                format_datetime=format_datetime,
                format_columns_date=format_columns_date,
                format_columns_datetime=format_columns_datetime,
                format_index_date=format_index_date,
                format_index_datetime=format_index_datetime,
                include_index=c.include_index,
                include_index_name=c.include_index_name,
                include_columns=c.include_columns,
                include_columns_name=c.include_columns_name,
                merge_hierarchical_labels=c.merge_hierarchical_labels,
                store_filter=store_filter)
        wb.close()
Example #27
    def __init__(
        self,
        frames: tp.Optional[tp.Iterable[tp.Union[Frame,
                                                 tp.Type[FrameDeferred]]]],
        *,
        index: IndexInitializer,
        index_constructor: IndexConstructor = None,
        name: NameType = NAME_DEFAULT,
        store: tp.Optional[Store] = None,
        config: StoreConfigMapInitializer = None,
        max_persist: tp.Optional[int] = None,
        own_index: bool = False,
        own_data: bool = False,
    ):
        '''
        Default Bus constructor.

        {args}
        '''
        if max_persist is not None:
            # use an (ordered) dictionary to give us an ordered set, simply pointing to None for all keys
            self._last_accessed: tp.Dict[tp.Hashable, None] = {}

        if own_index:
            self._index = index  #type: ignore
        else:
            self._index = index_from_optional_constructor(
                index,
                default_constructor=Index,
                explicit_constructor=index_constructor)
        count = len(self._index)
        frames_array: np.ndarray

        if frames is None:
            if store is None:
                raise ErrorInitBus(
                    'Cannot initialize a :obj:`Bus` with neither `frames` nor `store`.'
                )
            self._values_mutable = np.full(count,
                                           FrameDeferred,
                                           dtype=DTYPE_OBJECT)
            self._loaded = np.full(count, False, dtype=DTYPE_BOOL)
            self._loaded_all = False
        else:
            if frames.__class__ is np.ndarray:
                if frames.dtype != DTYPE_OBJECT:  #type: ignore
                    raise ErrorInitBus(
                        f'Series passed to initializer must have dtype object, not {frames.dtype}'
                    )  #type: ignore
                frames_array = frames
                load_array = False
            else:
                if own_data:
                    raise ErrorInitBus(
                        'Cannot use `own_data` when not supplying an array.')
                frames_array = np.empty(count, dtype=DTYPE_OBJECT)
                load_array = True

            self._loaded = np.empty(count, dtype=DTYPE_BOOL)
            # do a one-time iteration of the series

            for i, (label, value) in enumerate(
                    zip_longest(
                        index,
                        frames,
                        fillvalue=ZIP_LONGEST_DEFAULT,
                    )):
                if label is ZIP_LONGEST_DEFAULT or value is ZIP_LONGEST_DEFAULT:
                    raise ErrorInitBus(
                        'frames and index are not of equal length')

                if load_array:
                    frames_array[i] = value

                if value is FrameDeferred:
                    self._loaded[i] = False
                elif isinstance(value, Frame):  # permit FrameGO?
                    if max_persist is not None:
                        self._last_accessed[label] = None
                    self._loaded[i] = True
                else:
                    raise ErrorInitBus(
                        f'supplied {value.__class__} is not a Frame or FrameDeferred.'
                    )

            self._loaded_all = self._loaded.all()

            if own_data or load_array:
                self._values_mutable = frames_array
            else:
                self._values_mutable = frames_array.copy()
            self._values_mutable.flags.writeable = True

        # self._index = index
        self._name = None if name is NAME_DEFAULT else name
        self._store = store

        # Not handling cases of max_persist being greater than the length of the Series (might floor to length)
        if max_persist is not None and max_persist < self._loaded.sum():
            raise ErrorInitBus(
                'max_persist cannot be less than the number of already loaded Frames'
            )
        self._max_persist = max_persist

        # providing None will result in default; providing a StoreConfig or StoreConfigMap will return an appropriate map
        self._config = StoreConfigMap.from_initializer(config)
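
The length check in the loop above uses a common `zip_longest` idiom: pad the shorter iterable with a sentinel and treat any appearance of the sentinel as a length mismatch. A minimal sketch with a hypothetical sentinel in place of `ZIP_LONGEST_DEFAULT`:

from itertools import zip_longest

SENTINEL = object()  # stand-in for ZIP_LONGEST_DEFAULT

def paired(index, frames):
    for label, value in zip_longest(index, frames, fillvalue=SENTINEL):
        if label is SENTINEL or value is SENTINEL:
            raise ValueError('frames and index are not of equal length')
        yield label, value

print(list(paired(['a', 'b'], [1, 2])))   # [('a', 1), ('b', 2)]
# list(paired(['a', 'b', 'c'], [1, 2]))   # would raise ValueError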
    def read_many(
        self,
        labels: tp.Iterable[tp.Hashable],
        *,
        config: StoreConfigMapInitializer = None,
        store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT,
        container_type: tp.Type[Frame] = Frame,
    ) -> tp.Iterator[Frame]:

        config_map = StoreConfigMap.from_initializer(config)
        wb = self._load_workbook(self._fp)

        for label in labels:
            c = config_map[label]

            index_depth = c.index_depth
            index_name_depth_level = c.index_name_depth_level
            index_constructors = c.index_constructors
            columns_depth = c.columns_depth
            columns_name_depth_level = c.columns_name_depth_level
            columns_constructors = c.columns_constructors
            trim_nadir = c.trim_nadir
            skip_header = c.skip_header
            skip_footer = c.skip_footer
            dtypes = c.dtypes
            consolidate_blocks = c.consolidate_blocks

            if label is STORE_LABEL_DEFAULT:
                ws = wb[wb.sheetnames[0]]
                name = None  # do not set to default sheet name
            else:
                label_encoded = config_map.default.label_encode(label)
                ws = wb[label_encoded]
                name = label  # set name to the un-encoded hashable

            if ws.max_column <= 1 or ws.max_row <= 1:
                # https://openpyxl.readthedocs.io/en/stable/optimized.html
                # says that some clients might not report correct dimensions
                ws.calculate_dimension()

            max_column = ws.max_column
            max_row = ws.max_row

            # adjust for downward shift for skipping header, then reduce for footer; at this value and beyond we stop
            last_row_count = max_row - skip_header - skip_footer

            index_values: tp.List[tp.Any] = []
            columns_values: tp.List[tp.Any] = []
            data = []
            apex_rows = []

            if trim_nadir:
                mask = np.full((last_row_count, max_column), False)

            for row_count, row in enumerate(ws.iter_rows(max_row=max_row),
                                            start=-skip_header):
                if row_count < 0:
                    continue  # due to skip header; preserves comparison to columns_depth
                if row_count >= last_row_count:
                    break

                if trim_nadir:
                    row_data: tp.Sequence[tp.Any] = []
                    for col_count, cell in enumerate(row):
                        if store_filter is None:
                            value = cell.value
                        else:
                            value = store_filter.to_type_filter_element(
                                cell.value)
                        if value is None:  # NOTE: only checking None, not np.nan
                            mask[row_count, col_count] = True
                        row_data.append(value)  # type: ignore
                    if not row_data:
                        # NOTE: there might be scenarios where there are empty ``row`` iterables that still increment the row_count; we cannot generate these directly for test
                        mask[row_count] = True  #pragma: no cover
                else:
                    if store_filter is None:
                        row_data = tuple(cell.value for cell in row)
                    else:  # only need to filter string values, but probably too expensive to pre-check
                        row_data = tuple(
                            store_filter.to_type_filter_element(cell.value)
                            for cell in row)

                if row_count <= columns_depth - 1:
                    apex_rows.append(row_data[:index_depth])
                    if columns_depth == 1:
                        columns_values.extend(row_data[index_depth:])
                    elif columns_depth > 1:
                        columns_values.append(row_data[index_depth:])
                    continue

                if index_depth == 0:
                    data.append(row_data)
                elif index_depth == 1:
                    index_values.append(row_data[0])
                    data.append(row_data[1:])
                else:
                    index_values.append(row_data[:index_depth])
                    data.append(row_data[index_depth:])

            #-----------------------------------------------------------------------
            # Trim all-empty trailing rows created from style formatting GH#146. As the wb is opened in read-only mode, reverse iterating on the wb is not an option, nor is direct row access by integer
            if trim_nadir:
                # NOTE: `mask` is all data, while `data` is post index/columns extraction; this means that if a non-None label is found, the row/column will not be trimmed.
                row_mask = mask.all(axis=1)
                row_trim_start = array1d_to_last_contiguous_to_edge(
                    row_mask) - columns_depth
                if row_trim_start < len(row_mask) - columns_depth:
                    data = data[:row_trim_start]
                    if index_depth > 0:  # this handles depth 1 and greater
                        index_values = index_values[:row_trim_start]

                col_mask = mask.all(axis=0)
                col_trim_start = array1d_to_last_contiguous_to_edge(
                    col_mask) - index_depth
                if col_trim_start < len(col_mask) - index_depth:
                    data = (r[:col_trim_start] for r in data)  #type: ignore
                    if columns_depth == 1:
                        columns_values = columns_values[:col_trim_start]
                    if columns_depth > 1:
                        columns_values = (r[:col_trim_start]
                                          for r in columns_values
                                          )  #type: ignore

            #-----------------------------------------------------------------------
            # continue with Index and Frame creation
            index_name = None if columns_depth == 0 else apex_to_name(
                rows=apex_rows,
                depth_level=index_name_depth_level,
                axis=0,
                axis_depth=index_depth)

            # index: tp.Optional[IndexBase] = None

            if index_depth <= 1:
                index_default_constructor = partial(Index, name=index_name)
            else:  # > 1
                index_default_constructor = partial(
                    IndexHierarchy.from_labels,
                    name=index_name,
                    continuation_token=None,  # NOTE: needed
                )
            index, own_index = index_from_optional_constructors(
                index_values,
                depth=index_depth,
                default_constructor=index_default_constructor,
                explicit_constructors=index_constructors,  # cannot supply name
            )

            columns_name = None if index_depth == 0 else apex_to_name(
                rows=apex_rows,
                depth_level=columns_name_depth_level,
                axis=1,
                axis_depth=columns_depth)

            # columns: tp.Optional[IndexBase] = None
            # own_columns = False

            if columns_depth <= 1:
                columns_default_constructor = partial(
                    container_type._COLUMNS_CONSTRUCTOR,
                    name=columns_name,
                )
            elif columns_depth > 1:
                columns_default_constructor = partial(
                    container_type._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels,
                    name=columns_name,
                    continuation_token=None,  # NOTE: needed, not the default
                )
                columns_values = zip(*columns_values)  #type: ignore

            columns, own_columns = index_from_optional_constructors(
                columns_values,
                depth=columns_depth,
                default_constructor=columns_default_constructor,
                explicit_constructors=columns_constructors,  # cannot supply name
            )

            yield container_type.from_records(
                data,
                index=index,
                columns=columns,
                dtypes=dtypes,
                own_index=own_index,
                own_columns=own_columns,
                name=name,
                consolidate_blocks=consolidate_blocks)
        wb.close()
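
The reader above walks each worksheet once with `iter_rows` and slices index, columns, and data out of each row by depth. The underlying openpyxl access pattern, separated from the Frame construction, is roughly (assuming openpyxl is installed and the file exists):

from openpyxl import load_workbook

wb = load_workbook('frames.xlsx', read_only=True)  # read-only mode streams rows lazily
ws = wb[wb.sheetnames[0]]

for row in ws.iter_rows(max_row=ws.max_row):
    values = tuple(cell.value for cell in row)  # cell.value is None for empty cells
    print(values)

wb.close()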
    def read_many(
        self,
        labels: tp.Iterable[tp.Hashable],
        *,
        config: StoreConfigMapInitializer = None,
        container_type: tp.Type[Frame] = Frame,
    ) -> tp.Iterator[Frame]:

        config_map = StoreConfigMap.from_initializer(config)
        multiprocess: bool = config_map.default.read_max_workers is not None
        constructor: FrameConstructor = self._container_type_to_constructor(
            container_type)

        if not multiprocess:
            yield from self._read_many_single_thread(
                labels=labels,
                config_map=config_map,
                constructor=constructor,
                container_type=container_type,
            )
            return

        count_cache: int = 0
        if self._weak_cache:
            count_labels: int = 0
            results: tp.Dict[tp.Hashable, tp.Optional[Frame]] = {}
            for label in labels:
                count_labels += 1
                cache_lookup = self._weak_cache.get(label,
                                                    NOT_IN_CACHE_SENTINEL)
                if cache_lookup is not NOT_IN_CACHE_SENTINEL:
                    results[label] = self._set_container_type(
                        cache_lookup, container_type)
                    count_cache += 1
                else:
                    results[label] = None

            def results_items() -> IteratorItemsLabelOptionalFrame:
                yield from results.items()
        else:
            labels = list(labels)

            def results_items() -> IteratorItemsLabelOptionalFrame:
                for label in labels:
                    yield label, None

        # Avoid spinning up a process pool if all requested labels had weakrefs
        if count_cache and count_cache == count_labels:
            for _, frame in results_items():
                assert frame is not None  # mypy
                yield frame
            return

        def gen() -> tp.Iterator[PayloadBytesToFrame]:
            '''
            This method is synchronized with the following `for label in results_items` loop, as they both share the same necessary & initial condition: `if cached_frame is not None`.
            '''
            with zipfile.ZipFile(self._fp) as zf:
                for label, cached_frame in results_items():
                    if cached_frame is not None:
                        continue

                    c: StoreConfig = config_map[label]

                    label_encoded: str = config_map.default.label_encode(label)
                    src: bytes = zf.read(label_encoded + self._EXT_CONTAINED)

                    yield PayloadBytesToFrame(  # pylint: disable=no-value-for-parameter
                        src=src,
                        name=label,
                        config=c.to_store_config_he(),
                        constructor=constructor,
                    )

        chunksize = config_map.default.read_chunksize

        with ProcessPoolExecutor(
                max_workers=config_map.default.read_max_workers) as executor:
            frame_gen = executor.map(self._payload_to_frame,
                                     gen(),
                                     chunksize=chunksize)

            for label, cached_frame in results_items():
                if cached_frame is not None:
                    yield cached_frame
                else:
                    frame = next(frame_gen)

                    # Newly read frame, add it to our weak_cache
                    self._weak_cache[label] = frame
                    yield frame
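
The cache lookup above uses a module-level sentinel object so that a legitimately stored value can never be confused with a missing key. A minimal sketch of that dict idiom, with a hypothetical sentinel in place of `NOT_IN_CACHE_SENTINEL`:

NOT_IN_CACHE = object()  # stand-in for NOT_IN_CACHE_SENTINEL

cache = {'a': None}  # None is a legitimate value here, not a miss marker

hit = cache.get('a', NOT_IN_CACHE)
miss = cache.get('b', NOT_IN_CACHE)

print(hit is NOT_IN_CACHE)   # False: 'a' is present even though its value is None
print(miss is NOT_IN_CACHE)  # True: 'b' is genuinely absent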