def test_store_sqlite_read_many_a(self) -> None:
    '''Round-trip several Frames through StoreSQLite and read them back with read_many.'''
    f1 = Frame.from_dict(
            dict(x=(1, 2, -5, 200), y=(3, 4, -5, -3000)),
            index=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            name='f1')
    f2 = Frame.from_dict(
            dict(a=(1, 2, 3), b=(4, 5, 6)),
            index=('x', 'y', 'z'),
            name='f2')
    f3 = Frame.from_records(
            ((10, 20, 50, 60), (50.0, 60.4, -50, -60)),
            index=('p', 'q'),
            columns=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            name='f3')
    f4 = Frame.from_records((
            (10, 20, 50, False, 10, 20, 50, False),
            (50.0, 60.4, -50, True, 50.0, 60.4, -50, True),
            (234, 44452, 0, False, 234, 44452, 0, False),
            (4, -4, 2000, True, 4, -4, 2000, True),
            (10, 20, 50, False, 10, 20, 50, False),
            (50.0, 60.4, -50, True, 50.0, 60.4, -50, True),
            (234, 44452, 0, False, 234, 44452, 0, False),
            (4, -4, 2000, True, 4, -4, 2000, True),
            ),
            index=IndexHierarchy.from_product(('top', 'bottom'), ('far', 'near'), ('left', 'right')),
            columns=IndexHierarchy.from_product(('I', 'II'), ('a', 'b'), (1, 2)),
            name='f4')

    frames = (f1, f2, f3, f4)
    config_map_write = StoreConfigMap.from_config(
            StoreConfig(include_index=True, include_columns=True))

    with temp_file('.sqlite') as fp:
        st1 = StoreSQLite(fp)
        st1.write(((f.name, f) for f in frames), config=config_map_write)

        # labels are read back from the file, not from memory
        labels = tuple(st1.labels())
        self.assertEqual(tuple(f.name for f in frames), labels)

        # build a per-label read config matching each source Frame's depths
        config_map_read: tp.Dict[tp.Hashable, StoreConfig] = {}
        for i, name in enumerate(labels):
            f_src = frames[i]
            config_map_read[name] = StoreConfig(
                    index_depth=f_src.index.depth,
                    columns_depth=f_src.columns.depth)

        for i, f_loaded in enumerate(st1.read_many(labels, config=config_map_read)):
            self.assertEqualFrames(frames[i], f_loaded, compare_dtype=False)
def test_store_hdf5_write_a(self) -> None:
    '''Write Frames to an HDF5 store, confirm labels round-trip, and smoke-read
    each Frame back, asserting shape equality.
    '''
    f1 = Frame.from_dict(
            dict(x=(1, 2, -5, 200), y=(3, 4, -5, -3000)),
            index=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            name='f1')
    f2 = Frame.from_dict(
            dict(a=(1, 2, 3), b=(4, 5, 6)),
            index=('x', 'y', 'z'),
            name='f2')
    f3 = Frame.from_records(
            ((10, 20, 50, 60), (50.0, 60.4, -50, -60)),
            index=('p', 'q'),
            columns=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            name='f3')
    f4 = Frame.from_records((
            (10, 20, 50, False, 10, 20, 50, False),
            (50.0, 60.4, -50, True, 50.0, 60.4, -50, True),
            (234, 44452, 0, False, 234, 44452, 0, False),
            (4, -4, 2000, True, 4, -4, 2000, True),
            (10, 20, 50, False, 10, 20, 50, False),
            (50.0, 60.4, -50, True, 50.0, 60.4, -50, True),
            (234, 44452, 0, False, 234, 44452, 0, False),
            (4, -4, 2000, True, 4, -4, 2000, True),
            ),
            index=IndexHierarchy.from_product(('top', 'bottom'), ('far', 'near'), ('left', 'right')),
            columns=IndexHierarchy.from_product(('I', 'II'), ('a', 'b'), (1, 2)),
            name='f4')

    frames = (f1, f2, f3, f4)

    with temp_file('.hdf5') as fp:
        st1 = StoreHDF5(fp)
        st1.write((f.name, f) for f in frames)

        # labels are read from the file, not from memory
        sheet_names = tuple(st1.labels())
        self.assertEqual(tuple(f.name for f in frames), sheet_names)

        for i, name in enumerate(sheet_names):
            f_src = frames[i]
            f_loaded = st1.read(name,
                    index_depth=f_src.index.depth,
                    columns_depth=f_src.columns.depth)
            # NOTE(review): full equality (assertEqualFrames) was previously
            # left commented out, presumably due to dtype round-trip behavior
            # in HDF5 — confirm; for now the read must succeed and shapes must
            # match the source Frame.
            self.assertEqual(f_src.shape, f_loaded.shape)
def test_bus_to_hierarchy_b(self) -> None:
    '''bus_to_hierarchy succeeds along the aligned axis and raises the supplied
    exception class along the misaligned axis.
    '''
    class CustomError(Exception):
        pass

    tree1 = dict(a_I=Index((1, 2, 3)), a_II=Index((1, 2, 3)))
    tree2 = dict(b_I=Index((1, 2, 3)), b_II=Index((1, 2, 3)))
    tree3 = dict(c_I=Index((1, 2, 3)), c_II=Index((1, 2, 3)))
    index1 = IndexHierarchy.from_tree(tree1)
    index2 = IndexHierarchy.from_tree(tree2)
    index3 = IndexHierarchy.from_tree(tree3)
    values = np.arange(36).reshape(6, 6)

    def test_assertions(hierarchy: IndexHierarchy, opposite: Index) -> None:
        # the derived hierarchy carries each frame's tree; the opposite axis is shared
        expected_tree = dict(f1=tree1, f2=tree2, f3=tree3)
        self.compare_trees(hierarchy.to_tree(), expected_tree)
        self.assertTrue(index1.equals(opposite))

    # Align all the frames on columns!
    f1 = Frame(values, index=index1, columns=index1, name='f1')
    f2 = Frame(values, index=index2, columns=index1, name='f2')
    f3 = Frame(values, index=index3, columns=index1, name='f3')
    b1 = Bus.from_frames((f1, f2, f3))

    for deepcopy in (False, True):
        test_assertions(*bus_to_hierarchy(b1, axis=0, deepcopy_from_bus=deepcopy, init_exception_cls=CustomError))
        # cannot use axis 1 since the frames do not share the same index
        with self.assertRaises(CustomError):
            bus_to_hierarchy(b1, axis=1, deepcopy_from_bus=deepcopy, init_exception_cls=CustomError)

    # Align all the frames on index!
    f1 = Frame(values, index=index1, columns=index1, name='f1')
    f2 = Frame(values, index=index1, columns=index2, name='f2')
    f3 = Frame(values, index=index1, columns=index3, name='f3')
    b1 = Bus.from_frames((f1, f2, f3))

    for deepcopy in (False, True):
        test_assertions(*bus_to_hierarchy(b1, axis=1, deepcopy_from_bus=deepcopy, init_exception_cls=CustomError))
        # cannot use axis 0 since the frames do not share the same columns
        with self.assertRaises(CustomError):
            bus_to_hierarchy(b1, axis=0, deepcopy_from_bus=deepcopy, init_exception_cls=CustomError)
def test_store_sqlite_write_a(self) -> None:
    '''SQLite round-trip of Frames holding None, +/-inf, and hierarchical axes.'''
    f1 = Frame.from_dict(
            dict(x=(None, -np.inf, np.inf, None), y=(3, 4, -5, -3000)),
            index=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            name='f1')
    f2 = Frame.from_dict(
            dict(a=(1, 2, 3), b=(4, 5, 6)),
            index=('x', 'y', 'z'),
            name='f2')
    f3 = Frame.from_records(
            ((10.4, 20.1, 50, 60), (50.1, 60.4, -50, -60)),
            index=('p', 'q'),
            columns=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            name='f3')
    f4 = Frame.from_records((
            (10, 20, 50, False, 10, 20, 50, False),
            (50.0, 60.4, -50, True, 50.0, 60.4, -50, True),
            (234, 44452, 0, False, 234, 44452, 0, False),
            (4, -4, 2000, True, 4, -4, 2000, True),
            (10, 20, 50, False, 10, 20, 50, False),
            (50.0, 60.4, -50, True, 50.0, 60.4, -50, True),
            (234, 44452, 0, False, 234, 44452, 0, False),
            (4, -4, 2000, True, 4, -4, 2000, True),
            ),
            index=IndexHierarchy.from_product(('top', 'bottom'), ('far', 'near'), ('left', 'right')),
            columns=IndexHierarchy.from_product(('I', 'II'), ('a', 'b'), (1, 2)),
            name='f4')

    frames = (f1, f2, f3, f4)

    with temp_file('.sqlite') as fp:
        st1 = StoreSQLite(fp)
        st1.write((f.name, f) for f in frames)

        # labels are read back from the file, not from memory
        sheet_names = tuple(st1.labels())
        self.assertEqual(tuple(f.name for f in frames), sheet_names)

        for f_src, name in zip(frames, sheet_names):
            config = StoreConfig.from_frame(f_src)
            f_loaded = st1.read(name, config=config)
            self.assertEqualFrames(f_src, f_loaded)
def test_store_xlsx_read_many_d(self) -> None:
    '''trim_nadir drops trailing all-None rows and columns when reading.'''
    records = (
            (2, 2, 'a', False, None),
            (30, 73, 'd', True, None),
            (None, None, None, None, None),
            (None, None, None, None, None),
            )
    columns = IndexHierarchy.from_labels(
            (('a', 1), ('a', 2), ('b', 1), ('b', 2), (None, None)))
    f1 = Frame.from_records(records, columns=columns)

    with temp_file('.xlsx') as fp:
        f1.to_xlsx(fp, label='f1', include_index=False, include_columns=True)

        st1 = StoreXLSX(fp)
        c = StoreConfig(
                index_depth=0,
                columns_depth=2,
                trim_nadir=True,
                )
        f2 = next(st1.read_many(('f1',), config=c))

        # the two all-None rows and the all-None column are trimmed away
        self.assertEqual(f2.shape, (2, 4))
        self.assertEqual(f2.to_pairs(),
                ((('a', 1), ((0, 2), (1, 30))),
                (('a', 2), ((0, 2), (1, 73))),
                (('b', 1), ((0, 'a'), (1, 'd'))),
                (('b', 2), ((0, False), (1, True)))))
def test_store_xlsx_write_b(self) -> None:
    '''XLSX round-trip using the default store label and hierarchical columns.'''
    f1 = Frame.from_records(
            ((None, np.nan, 50, 'a'), (None, -np.inf, -50, 'b'), (None, 60.4, -50, 'c')),
            index=('p', 'q', 'r'),
            columns=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            )
    config_map = StoreConfigMap.from_config(
            StoreConfig(include_index=True, include_columns=True))

    with temp_file('.xlsx') as fp:
        st = StoreXLSX(fp)
        st.write(((STORE_LABEL_DEFAULT, f1),), config=config_map)

        c = StoreConfig(
                index_depth=f1.index.depth,
                columns_depth=f1.columns.depth)
        f2 = st.read(STORE_LABEL_DEFAULT, config=c)

        # sample one hierarchical column, then compare the whole frames
        self.assertEqual(
                f1[HLoc[('II', 'a')]].values.tolist(),
                f2[HLoc[('II', 'a')]].values.tolist())
        self.assertEqualFrames(f1, f2)
def buses_to_hierarchy(
        buses: tp.Iterable[Bus],
        labels: tp.Iterable[tp.Hashable],
        deepcopy_from_bus: bool,
        init_exception_cls: tp.Type[Exception],
        ) -> IndexHierarchy:
    '''
    Given an iterable of named :obj:`Bus`, derive an :obj:`IndexHierarchy`
    keyed on the provided labels, with each Bus's index as the inner level.

    Args:
        buses: iterable of :obj:`Bus` instances.
        labels: one hashable label per Bus; labels must be unique.
        deepcopy_from_bus: if True, deep-copy the index extracted from each Bus.
        init_exception_cls: exception class to raise on invalid input.

    Raises:
        init_exception_cls: if an item is not a Bus or a label is duplicated.
    '''
    extractor = get_extractor(deepcopy_from_bus, is_array=False, memo_active=False)

    tree = {}
    for label, bus in zip(labels, buses):
        if not isinstance(bus, Bus):
            # fixed typo ("interable") and dropped the needless f-prefix
            raise init_exception_cls('Must provide an iterable of Bus.')
        if label in tree:
            raise init_exception_cls(f'Bus names must be unique: {label} duplicated')
        tree[label] = extractor(bus._index)

    return IndexHierarchy.from_tree(tree)
def test_store_sqlite_write_b(self) -> None:
    '''Fraction values come back from the SQLite store as strings.'''
    f1 = Frame.from_dict(
            dict(
                x=(Fraction(3, 2), Fraction(1, 2), Fraction(2, 3), Fraction(3, 7)),
                y=(3, 4, -5, -3000)),
            index=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            name='f1-dash')

    with temp_file('.sqlite') as fp:
        st1 = StoreSQLite(fp)
        st1.write((f.name, f) for f in (f1,))

        config = StoreConfig.from_frame(f1)
        f_loaded = st1.read(f1.name, config=config)

        # for now, Fractions come back as strings
        self.assertEqual(
                f_loaded['x'].to_pairs(),
                ((('I', 'a'), '3/2'), (('I', 'b'), '1/2'), (('II', 'a'), '2/3'), (('II', 'b'), '3/7')))
def test_store_xlsx_read_c(self) -> None:
    '''Read back a frame written with its hierarchical index but without columns.'''
    index = IndexHierarchy.from_product(('left', 'right'), ('up', 'down'))
    columns = IndexHierarchy.from_labels(((100, -5, 20),))
    f1 = Frame([1, 2, 3, 4], index=index, columns=columns)

    with temp_file('.xlsx') as fp:
        st = StoreXLSX(fp)
        st.write(((None, f1),), include_index=True, include_columns=False)

        f2 = st.read(index_depth=f1.index.depth, columns_depth=0)
        self.assertTrue((f1.values == f2.values).all())
        # columns were not stored, so they come back auto-labeled
        self.assertEqual(f2.to_pairs(0),
                ((0, ((('left', 'up'), 1), (('left', 'down'), 2), (('right', 'up'), 3), (('right', 'down'), 4))),))
def test_store_xlsx_read_b(self) -> None:
    '''Read back a frame written with hierarchical columns but without its index.'''
    index = IndexHierarchy.from_product(('left', 'right'), ('up', 'down'))
    columns = IndexHierarchy.from_labels(((100, -5, 20),))
    f1 = Frame.from_elements([1, 2, 3, 4], index=index, columns=columns)
    config_map = StoreConfigMap.from_config(
            StoreConfig(include_index=False, include_columns=True))

    with temp_file('.xlsx') as fp:
        st = StoreXLSX(fp)
        st.write(((None, f1),), config=config_map)

        c = StoreConfig(index_depth=0, columns_depth=f1.columns.depth)
        f2 = st.read(None, config=c)
        self.assertTrue((f1.values == f2.values).all())
        # the index was not stored, so it comes back auto-labeled
        self.assertEqual(f2.to_pairs(0),
                (((100, -5, 20), ((0, 1), (1, 2), (2, 3), (3, 4))),))
def test_store_get_field_names_and_dtypes_d(self) -> None:
    '''Field-name extraction with a named hierarchical columns index.'''
    from static_frame.core.index_hierarchy import IndexHierarchy

    columns = IndexHierarchy.from_labels(
            ((1, 'a'), (1, 'b'), (2, 'c')),
            name=('foo', 'bar'))
    f1 = Frame.from_records((('a', True, None),), index=(('a',)), columns=columns)

    # including the columns name yields a tuple label for the index field
    field_names, dtypes = Store.get_field_names_and_dtypes(
            frame=f1,
            include_index=True,
            include_index_name=False,
            include_columns=True,
            include_columns_name=True,
            )
    self.assertEqual(field_names, [('foo', 'bar'), "[1 'a']", "[1 'b']", "[2 'c']"])
    self.assertTrue(len(field_names) == len(dtypes))

    # force_brackets renders the columns name in the same bracketed style
    field_names, dtypes = Store.get_field_names_and_dtypes(
            frame=f1,
            include_index=True,
            include_index_name=False,
            include_columns=True,
            include_columns_name=True,
            force_brackets=True,
            )
    self.assertEqual(field_names, ["['foo' 'bar']", "[1 'a']", "[1 'b']", "[2 'c']"])

    # requesting neither an index name nor a columns name is a conflict
    with self.assertRaises(StoreParameterConflict):
        field_names, dtypes = Store.get_field_names_and_dtypes(
                frame=f1,
                include_index=True,
                include_index_name=False,
                include_columns=True,
                include_columns_name=False,
                )
    # a columns name without including the index is also a conflict
    with self.assertRaises(StoreParameterConflict):
        field_names, dtypes = Store.get_field_names_and_dtypes(
                frame=f1,
                include_index=False,
                include_index_name=False,
                include_columns=True,
                include_columns_name=True,
                )
def bus_to_hierarchy(
        bus: tp.Union[Bus, 'Yarn'],
        axis: int,
        deepcopy_from_bus: bool,
        init_exception_cls: tp.Type[Exception],
        ) -> tp.Tuple[IndexHierarchy, IndexBase]:
    '''
    Given a :obj:`Bus` and an axis, derive an :obj:`IndexHierarchy` from the
    per-Frame labels along that axis; also return and validate the shared
    :obj:`Index` of the opposite axis.
    '''
    # NOTE: need to extract just axis labels, not the full Frame; need new Store/Bus loaders just for label data
    extractor = get_extractor(deepcopy_from_bus, is_array=False, memo_active=False)

    def tree_extractor(index: IndexBase) -> tp.Union[IndexBase, TreeNodeT]:
        # hierarchical indices contribute trees; flat indices are used as-is
        index = extractor(index)
        return index.to_tree() if isinstance(index, IndexHierarchy) else index

    tree: TreeNodeT = {}
    opposite: tp.Optional[IndexBase] = None

    for label, frame in bus.items():
        if axis == 0:
            primary, secondary = frame.index, frame.columns
        elif axis == 1:
            primary, secondary = frame.columns, frame.index
        else:
            raise AxisInvalid(f'invalid axis {axis}')

        tree[label] = tree_extractor(primary)
        if opposite is None:
            opposite = extractor(secondary)
        elif not opposite.equals(secondary):
            raise init_exception_cls(
                    'opposite axis must have equivalent indices')

    # NOTE: index constructors could be collected from the contained Frames, but it is not clear that would be better than IndexAutoConstructorFactory
    return IndexHierarchy.from_tree(
            tree,
            index_constructors=IndexAutoConstructorFactory,
            ), opposite # type: ignore
def from_pandas(cls,
        value: 'pandas.Index',
        ) -> 'IndexBase':
    '''
    Given a Pandas index, return the appropriate IndexBase derived class.
    '''
    import pandas
    if not isinstance(value, pandas.Index):
        raise ErrorInitIndex(
                f'from_pandas must be called with a Pandas Index object, not: {type(value)}')

    from static_frame import Index
    from static_frame import IndexGO
    from static_frame import IndexHierarchy
    from static_frame import IndexHierarchyGO
    from static_frame import IndexNanosecond
    from static_frame import IndexNanosecondGO
    from static_frame.core.index_datetime import IndexDatetime

    if isinstance(value, pandas.MultiIndex):
        # iterating a hierarchical index iterates over full labels
        name: tp.Optional[tp.Tuple[tp.Hashable, ...]] = tuple(value.names)
        # Pandas returns None for unassigned name components, which would
        # raise when trying to unset the resulting index name
        if all(n is None for n in name): #type: ignore
            name = None
        depth = value.nlevels
        hierarchy_cls = IndexHierarchy if cls.STATIC else IndexHierarchyGO
        return hierarchy_cls.from_labels(value, name=name, depth_reference=depth)

    if isinstance(value, pandas.DatetimeIndex):
        # preserve the caller's IndexDatetime subclass; otherwise fall back
        # to nanosecond resolution
        if issubclass(cls, IndexDatetime):
            return cls(value, name=value.name)
        nanosecond_cls = IndexNanosecond if cls.STATIC else IndexNanosecondGO
        return nanosecond_cls(value, name=value.name)

    flat_cls = Index if cls.STATIC else IndexGO
    return flat_cls(value, name=value.name)
def test_bus_extract_loc_a(self) -> None:
    '''A Series with an IndexHierarchy cannot be used to initialize a Bus.'''
    f1 = Frame.from_dict(dict(a=(1, 2), b=(3, 4)), index=('x', 'y'), name='foo')
    f2 = Frame.from_dict(dict(a=(1, 2, 3), b=(4, 5, 6)), index=('x', 'y', 'z'), name='bar')
    f3 = Frame.from_dict(dict(d=(10, 20), b=(50, 60)), index=('p', 'q'), name='f3')

    ih = IndexHierarchy.from_labels((('a', 1), ('b', 2), ('b', 1)))
    s1 = Series((f1, f2, f3), index=ih, dtype=object)

    # IndexHierarchy is not supported, as labels are tuples, not strings
    with self.assertRaises(ErrorInitBus):
        b1 = Bus(s1)
def test_yarn_rehierarch_a(self) -> None:
    '''rehierarch on a Yarn selection reorders the hierarchical index depths.'''
    f1 = ff.parse('s(4,2)').rename('f1')
    f2 = ff.parse('s(4,5)').rename('f2')
    f3 = ff.parse('s(2,2)').rename('f3')
    f4 = ff.parse('s(2,8)').rename('f4')
    f5 = ff.parse('s(4,4)').rename('f5')
    f6 = ff.parse('s(6,4)').rename('f6')

    b1 = Bus.from_frames((f1, f2, f3))
    b2 = Bus.from_frames((f4,))
    b3 = Bus.from_frames((f5, f6))

    y1 = Yarn((b1, b2, b3),
            index=IndexHierarchy.from_product(('a', 'b'), (1, 2, 3)))

    post = y1.iloc[[0, 2, 4]].rehierarch((1, 0))
    self.assertEqual(post.status['shape'].to_pairs(),
            (((1, 'a'), (4, 2)), ((3, 'a'), (2, 2)), ((2, 'b'), (4, 4))))
def test_yarn_relabel_flat_a(self) -> None:
    '''relabel_flat collapses the hierarchical index to tuple labels.'''
    f1 = ff.parse('s(4,2)').rename('f1')
    f2 = ff.parse('s(4,5)').rename('f2')
    f3 = ff.parse('s(2,2)').rename('f3')
    f4 = ff.parse('s(2,8)').rename('f4')
    f5 = ff.parse('s(4,4)').rename('f5')
    f6 = ff.parse('s(6,4)').rename('f6')

    b1 = Bus.from_frames((f1, f2, f3))
    b2 = Bus.from_frames((f4,))
    b3 = Bus.from_frames((f5, f6))

    y1 = Yarn((b1, b2, b3),
            index=IndexHierarchy.from_product(('a', 'b'), (1, 2, 3)))

    post = y1.relabel_flat()[('a', 3):]
    self.assertEqual(post.status['shape'].to_pairs(),
            ((('a', 3), (2, 2)), (('b', 1), (2, 8)), (('b', 2), (4, 4)), (('b', 3), (6, 4))))
def from_concat(cls,
        containers: tp.Iterable['Series'],
        *,
        name: tp.Hashable = None):
    '''
    Concatenate multiple Series into a new Series, assuming the combination of all Indices result in a unique Index.
    '''
    array_values = []
    array_index = []
    for container in containers:
        array_values.append(container.values)
        array_index.append(container.index.values)

    # concat_resolved returns immutable arrays
    values = concat_resolved(array_values)
    index = concat_resolved(array_index)

    # a 2D index array means the result requires a hierarchical index
    if index.ndim == 2:
        index = IndexHierarchy.from_labels(index)

    return cls(values, index=index, name=name)
def test_store_xlsx_write_b(self) -> None:
    '''XLSX round-trip with default write settings and hierarchical columns.

    NOTE(review): a test with this same name appears earlier in this file;
    if both live in the same class, this definition shadows the first and
    only one will run — confirm.
    '''
    f1 = Frame.from_records(
            ((None, np.nan, 50, 'a'), (None, -np.inf, -50, 'b'), (None, 60.4, -50, 'c')),
            index=('p', 'q', 'r'),
            columns=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            )

    with temp_file('.xlsx') as fp:
        st = StoreXLSX(fp)
        st.write(((None, f1),))

        f2 = st.read(index_depth=f1.index.depth, columns_depth=f1.columns.depth)
        # sample one hierarchical column, then compare the whole frames
        self.assertEqual(
                f1[HLoc[('II', 'a')]].values.tolist(),
                f2[HLoc[('II', 'a')]].values.tolist())
        self.assertEqualFrames(f1, f2)
def test_store_sqlite_write_c(self) -> None:
    '''float16 values round-trip through SQLite at float16 precision.'''
    f1 = Frame.from_dict(
            dict(
                x=np.array([1.2, 4.5, 3.2, 6.5], dtype=np.float16),
                y=(3, 4, -5, -3000)),
            index=IndexHierarchy.from_product(('I', 'II'), ('a', 'b')),
            name='f1')

    with temp_file('.sqlite') as fp:
        st1 = StoreSQLite(fp)
        st1.write((f.name, f) for f in (f1,))

        config = StoreConfig.from_frame(f1)
        f_loaded = st1.read(f1.name, config=config)

        # expected values reflect float16 quantization of the inputs
        self.assertAlmostEqualItems(f_loaded['x'].to_pairs(),
                ((('I', 'a'), 1.2001953125), (('I', 'b'), 4.5), (('II', 'a'), 3.19921875), (('II', 'b'), 6.5)))
def test_assertions(hierarchy: IndexHierarchy, opposite: Index) -> None:
    # NOTE(review): relies on enclosing-scope names (self, tree1, tree2,
    # tree3, index1); only valid as a nested helper inside the owning
    # test method — confirm placement.
    expected_tree = dict(f1=tree1, f2=tree2, f3=tree3)
    self.compare_trees(hierarchy.to_tree(), expected_tree)
    self.assertTrue(index1.equals(opposite))
def read(self,
        label: tp.Optional[str] = None,
        *,
        index_depth: int = 1,
        columns_depth: int = 1,
        dtypes: DtypesSpecifier = None,
        store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT
        ) -> Frame:
    '''
    Read a single sheet from the XLSX store into a :obj:`Frame`.

    Args:
        {dtypes}
    '''
    wb = self._load_workbook(self._fp)

    if label is None:
        ws = wb[wb.sheetnames[0]]
        name = None # do not set to default sheet name
    else:
        ws = wb[label]
        name = ws.title

    if ws.max_column <= 1 or ws.max_row <= 1:
        # https://openpyxl.readthedocs.io/en/stable/optimized.html
        # says that some clients might not report correct dimensions; force
        # recalculation in that case
        ws.calculate_dimension()
    # NOTE: removed unused locals max_column / max_row (never read below)

    index_values: tp.List[tp.Any] = []
    columns_values: tp.List[tp.Any] = []
    data = []

    for row_count, row in enumerate(ws.iter_rows()): # cannot use values_only on 2.5.4
        if store_filter is None:
            row = tuple(c.value for c in row)
        else:
            # only need to filter string values, but probably too expensive to pre-check
            row = tuple(store_filter.to_type_filter_element(c.value) for c in row)

        if row_count <= columns_depth - 1:
            # still within the columns header region
            if columns_depth == 1:
                columns_values.extend(row[index_depth:])
            elif columns_depth > 1:
                # NOTE: this orientation will need to be rotated
                columns_values.append(row[index_depth:])
            continue

        if index_depth == 0:
            data.append(row)
        elif index_depth == 1:
            index_values.append(row[0])
            data.append(row[1:])
        else:
            index_values.append(row[:index_depth])
            data.append(row[index_depth:])

    wb.close()

    index: tp.Optional[IndexBase] = None
    own_index = False
    if index_depth == 1:
        index = Index(index_values)
        own_index = True
    elif index_depth > 1:
        index = IndexHierarchy.from_labels(index_values, continuation_token=None)
        own_index = True

    columns: tp.Optional[IndexBase] = None
    own_columns = False
    if columns_depth == 1:
        columns = Index(columns_values)
        own_columns = True
    elif columns_depth > 1:
        # header rows were collected row-wise; rotate into per-depth labels
        columns = IndexHierarchy.from_labels(
                zip(*columns_values),
                continuation_token=None)
        own_columns = True

    return tp.cast(Frame, Frame.from_records(data,
            index=index,
            columns=columns,
            dtypes=dtypes,
            own_index=own_index,
            own_columns=own_columns,
            name=name))
def from_frame(cls,
        frame: Frame,
        *,
        chunksize: int,
        retain_labels: bool,
        axis: int = 0,
        name: NameType = None,
        label_extractor: tp.Optional[tp.Callable[[IndexBase], tp.Hashable]] = None,
        config: StoreConfigMapInitializer = None,
        deepcopy_from_bus: bool = False,
        ) -> 'Quilt':
    '''
    Given a :obj:`Frame`, create a :obj:`Quilt` by partitioning it along the specified ``axis`` in units of ``chunksize``, where ``axis`` 0 partitions vertically (retaining aligned columns) and 1 partitions horizontally (retaining aligned index).

    Args:
        label_extractor: Function that, given the partitioned index component along the specified axis, returns a string label for that chunk.
    '''
    # axis 0 partitions along the index (rows); axis 1 along the columns
    vector = frame._index if axis == 0 else frame._columns
    vector_len = len(vector)

    # chunk boundaries: starts per chunksize; ends shifted by one chunk
    starts = range(0, vector_len, chunksize)
    if len(starts) == 1:
        ends: tp.Iterable[int] = (vector_len,)
    else:
        ends = range(starts[1], vector_len, chunksize)

    if label_extractor is None:
        # default: label each chunk with its first axis label
        label_extractor = lambda x: x.iloc[0] #type: ignore

    axis_map_components: tp.Dict[tp.Hashable, IndexBase] = {}
    opposite = None

    def values() -> tp.Iterator[Frame]:
        # generator of chunk Frames; populates axis_map_components and
        # opposite as a side effect while being consumed
        nonlocal opposite
        for start, end in zip_longest(starts, ends, fillvalue=vector_len):
            # the final chunk's end is filled in as vector_len
            if axis == 0: # along rows
                f = frame.iloc[start:end]
                label = label_extractor(f.index) #type: ignore
                axis_map_components[label] = f.index
                if opposite is None:
                    opposite = f.columns
            elif axis == 1: # along columns
                f = frame.iloc[:, start:end]
                label = label_extractor(f.columns) #type: ignore
                axis_map_components[label] = f.columns
                if opposite is None:
                    opposite = f.index
            else:
                raise AxisInvalid(f'invalid axis {axis}')
            yield f.rename(label)

    name = name if name else frame.name
    bus = Bus.from_frames(values(), config=config, name=name)

    # NOTE(review): this assumes Bus.from_frames has fully consumed values()
    # so that axis_map_components is populated before from_tree — confirm.
    axis_hierarchy = IndexHierarchy.from_tree(axis_map_components)

    return cls(bus,
            axis=axis,
            axis_hierarchy=axis_hierarchy,
            axis_opposite=opposite,
            retain_labels=retain_labels,
            deepcopy_from_bus=deepcopy_from_bus,
            )
def read(self,
        label: tp.Optional[str] = None,
        *,
        config: tp.Optional[StoreConfig] = None,
        store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT,
        container_type: tp.Type[Frame] = Frame,
        ) -> Frame:
    '''
    Read a single sheet from the XLSX store into a container.

    Args:
        label: Name of sheet to read from XLSX.
        container_type: Type of container to be returned, either Frame or a Frame subclass
    '''
    if config is None:
        config = StoreConfig() # get default

    # unpack config values read in this method
    index_depth = config.index_depth
    index_name_depth_level = config.index_name_depth_level
    columns_depth = config.columns_depth
    columns_name_depth_level = config.columns_name_depth_level
    trim_nadir = config.trim_nadir
    skip_header = config.skip_header
    skip_footer = config.skip_footer

    wb = self._load_workbook(self._fp)

    if label is None:
        ws = wb[wb.sheetnames[0]]
        name = None # do not set to default sheet name
    else:
        ws = wb[label]
        name = ws.title

    if ws.max_column <= 1 or ws.max_row <= 1:
        # https://openpyxl.readthedocs.io/en/stable/optimized.html
        # says that some clients might not report correct dimensions
        ws.calculate_dimension()

    max_column = ws.max_column
    max_row = ws.max_row

    # adjust for downward shift for skipping header, then reduce for footer; at this value and beyond we stop
    last_row_count = max_row - skip_header - skip_footer

    index_values: tp.List[tp.Any] = []
    columns_values: tp.List[tp.Any] = []

    data = [] # pre-size with None?
    apex_rows = []  # upper-left cells shared by index and columns headers

    if trim_nadir:
        # mask marks all-None cells so trailing rows/columns can be trimmed
        mask = np.full((last_row_count, max_column), False)

    # start=-skip_header shifts row_count so that skipped header rows are negative
    for row_count, row in enumerate(ws.iter_rows(max_row=max_row), start=-skip_header):
        if row_count < 0:
            continue # due to skip header; perserves comparison to columns_depth
        if row_count >= last_row_count:
            break

        if trim_nadir:
            row_data: tp.Sequence[tp.Any] = []
            for col_count, c in enumerate(row):
                if store_filter is None:
                    value = c.value
                else:
                    value = store_filter.to_type_filter_element(c.value)
                if value is None: # NOTE: only checking None, not np.nan
                    mask[row_count, col_count] = True
                row_data.append(value) # type: ignore
            if not row_data:
                # a fully empty row masks the entire mask row
                mask[row_count] = True
        else:
            if store_filter is None:
                row_data = tuple(c.value for c in row)
            else:
                # only need to filter string values, but probably too expensive to pre-check
                row_data = tuple(store_filter.to_type_filter_element(c.value) for c in row)

        if row_count <= columns_depth - 1:
            # still within the columns header region; collect apex cells
            apex_rows.append(row_data[:index_depth])
            if columns_depth == 1:
                columns_values.extend(row_data[index_depth:])
            elif columns_depth > 1:
                columns_values.append(row_data[index_depth:])
            continue

        if index_depth == 0:
            data.append(row_data)
        elif index_depth == 1:
            index_values.append(row_data[0])
            data.append(row_data[1:])
        else:
            index_values.append(row_data[:index_depth])
            data.append(row_data[index_depth:])

    wb.close()

    #-----------------------------------------------------------------------
    # Trim all-empty trailing rows created from style formatting GH#146. As the wb is opened in read-only mode, reverse iterating on the wb is not an option, nor is direct row access by integer
    if trim_nadir:
        # NOTE: `mask` is all data, while `data` is post index/columns extraction; this means that if a non-None label is found, the row/column will not be trimmed.
        row_mask = mask.all(axis=1)
        row_trim_start = array1d_to_last_contiguous_to_edge(row_mask) - columns_depth
        if row_trim_start < len(row_mask) - columns_depth:
            data = data[:row_trim_start]
            if index_depth > 0: # this handles depth 1 and greater
                index_values = index_values[:row_trim_start]

        col_mask = mask.all(axis=0)
        col_trim_start = array1d_to_last_contiguous_to_edge(col_mask) - index_depth
        if col_trim_start < len(col_mask) - index_depth:
            data = (r[:col_trim_start] for r in data) #type: ignore
            if columns_depth == 1:
                columns_values = columns_values[:col_trim_start]
            if columns_depth > 1:
                columns_values = (r[:col_trim_start] for r in columns_values) #type: ignore

    #-----------------------------------------------------------------------
    # continue with Index and Frame creation

    # the index name comes from the apex region, unless there is no columns header
    index_name = None if columns_depth == 0 else apex_to_name(
            rows=apex_rows,
            depth_level=index_name_depth_level,
            axis=0,
            axis_depth=index_depth)

    index: tp.Optional[IndexBase] = None
    own_index = False
    if index_depth == 1:
        index = Index(index_values, name=index_name)
        own_index = True
    elif index_depth > 1:
        index = IndexHierarchy.from_labels(
                index_values,
                continuation_token=None,
                name=index_name,
                )
        own_index = True

    # the columns name also comes from the apex, unless there is no index
    columns_name = None if index_depth == 0 else apex_to_name(
            rows=apex_rows,
            depth_level=columns_name_depth_level,
            axis=1,
            axis_depth=columns_depth)

    columns: tp.Optional[IndexBase] = None
    own_columns = False
    if columns_depth == 1:
        columns = container_type._COLUMNS_CONSTRUCTOR(columns_values, name=columns_name)
        own_columns = True
    elif columns_depth > 1:
        # header rows were collected row-wise; rotate into per-depth labels
        columns = container_type._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels(
                zip(*columns_values),
                continuation_token=None,
                name=columns_name,
                )
        own_columns = True

    return container_type.from_records(data, #type: ignore
            index=index,
            columns=columns,
            dtypes=config.dtypes,
            own_index=own_index,
            own_columns=own_columns,
            name=name,
            consolidate_blocks=config.consolidate_blocks)
def read(self,
        label: tp.Optional[str] = None,
        *,
        config: tp.Optional[StoreConfig] = None,
        store_filter: tp.Optional[StoreFilter] = STORE_FILTER_DEFAULT,
        container_type: tp.Type[Frame] = Frame,
        ) -> Frame:
    '''
    Read a single sheet from the XLSX store into a container.

    Args:
        label: Name of sheet to read from XLSX.
        container_type: Type of container to be returned, either Frame or a Frame subclass
    '''
    if config is None:
        config = StoreConfig() # get default
    index_depth = config.index_depth
    columns_depth = config.columns_depth

    wb = self._load_workbook(self._fp)

    if label is None:
        ws = wb[wb.sheetnames[0]]
        name = None # do not set to default sheet name
    else:
        ws = wb[label]
        name = ws.title

    if ws.max_column <= 1 or ws.max_row <= 1:
        # https://openpyxl.readthedocs.io/en/stable/optimized.html
        # says that some clients might not report correct dimensions; force
        # recalculation in that case
        ws.calculate_dimension()

    max_row = ws.max_row
    # NOTE: removed unused local max_column (never read below)

    index_values: tp.List[tp.Any] = []
    columns_values: tp.List[tp.Any] = []
    data = [] # pre-size with None?

    for row_count, row in enumerate(ws.iter_rows(max_row=max_row)):
        if store_filter is None:
            row = tuple(c.value for c in row)
        else:
            # only need to filter string values, but probably too expensive to pre-check
            row = tuple(store_filter.to_type_filter_element(c.value) for c in row)

        if row_count <= columns_depth - 1:
            # still within the columns header region
            if columns_depth == 1:
                columns_values.extend(row[index_depth:])
            elif columns_depth > 1:
                # NOTE: this orientation will need to be rotated
                columns_values.append(row[index_depth:])
            continue

        if index_depth == 0:
            data.append(row)
        elif index_depth == 1:
            index_values.append(row[0])
            data.append(row[1:])
        else:
            index_values.append(row[:index_depth])
            data.append(row[index_depth:])

    wb.close()

    # Trim all-empty trailing rows created from style formatting GH#146. As
    # the wb is opened in read-only mode, reverse iterating on the wb is not
    # an option, nor is direct row access by integer; also, evaluating all
    # rows on forward iteration is expensive. Instead, after collecting all
    # the data in a list and closing the wb, reverse iterate and find rows
    # that are all empty.
    # NOTE: need to handle case where there are valid index values
    empty_token = (None if store_filter is None
            else store_filter.to_type_filter_element(None))

    for row_count in range(len(data) - 1, -2, -1):
        if row_count < 0:
            break
        if any(c != empty_token for c in data[row_count]): # try to break early with any
            break
        if index_depth == 1 and index_values[row_count] != empty_token:
            break
        if index_depth > 1 and any(c != empty_token for c in index_values[row_count]):
            break

    # row_count is set to the first row that has data or index; can be -1
    empty_row_idx = row_count + 1 # index of all-empty row
    if empty_row_idx != len(data):
        # trim data and index_values, if index_depth > 0
        data = data[:empty_row_idx]
        if index_depth > 0:
            index_values = index_values[:empty_row_idx]

    # continue with Index and Frame creation
    index: tp.Optional[IndexBase] = None
    own_index = False
    if index_depth == 1:
        index = Index(index_values)
        own_index = True
    elif index_depth > 1:
        index = IndexHierarchy.from_labels(index_values, continuation_token=None)
        own_index = True

    columns: tp.Optional[IndexBase] = None
    own_columns = False
    if columns_depth == 1:
        columns = container_type._COLUMNS_CONSTRUCTOR(columns_values)
        own_columns = True
    elif columns_depth > 1:
        # header rows were collected row-wise; rotate into per-depth labels
        columns = container_type._COLUMNS_HIERARCHY_CONSTRUCTOR.from_labels(
                zip(*columns_values), continuation_token=None)
        own_columns = True

    # NOTE: this might be a Frame or a FrameGO
    return tp.cast(Frame, container_type.from_records(data,
            index=index,
            columns=columns,
            dtypes=config.dtypes,
            own_index=own_index,
            own_columns=own_columns,
            name=name,
            consolidate_blocks=config.consolidate_blocks))