def _read_parquet(fname, start, end, cols=None):
    """Read the rows ``[start, end)`` of a parquet file, one frame per row group.

    Row groups are walked in file order while a running global row offset is
    kept. Every row group that overlaps the requested range is read; when the
    range starts or ends inside a row group, the frame is trimmed with a
    boolean mask and a warning is logged ("unaligned chunk").

    Args:
        fname: Path handed to ``ParquetFile``.
        start: Global index of the first row to return (inclusive).
        end: Global index one past the last row to return (exclusive), or
            ``None`` to read through the last row group.
        cols: Columns to read; defaults to ``pf.columns`` when ``None``.

    Returns:
        A list with one frame (whatever ``pf.read_row_group`` returns,
        presumably a pandas DataFrame) per overlapping row group, in file
        order. The list is empty when no row group overlaps the range.
    """
    pf = ParquetFile(fname)
    if cols is None:
        cols = pf.columns

    i = 0  # global row index of the first row in the current row group
    df_set = []
    for rg in pf.row_groups:
        last_idx_in_rg = i + rg.num_rows - 1
        if start <= last_idx_in_rg:
            # Close the handle as soon as the row group has been read;
            # previously one handle per row group was opened and never closed.
            with pf.open(pf.fn) as f:
                df = pf.read_row_group(rg, cols, pf.categories, infile=f)

            # NOTE(review): the masks below compare against df.index, so they
            # assume each row-group frame carries a 0-based positional index
            # -- confirm for files written with a custom index.
            filters = []
            if start > i:
                filters.append(df.index >= (start - i))
            # `end` is exclusive, so the row at `end` itself must be dropped
            # too: trim whenever `end` falls on or before the last row of
            # this group (the former `<` kept one row too many when
            # end == last_idx_in_rg).
            if end is not None and end <= last_idx_in_rg:
                filters.append(df.index < (end - i))
            if filters:
                _LOG.warning('unaligned chunk fname:[%s] start:[%s] end:[%s]',
                             fname, start, end)
                df = df[functools.reduce(operator.and_, filters)]
            df_set.append(df)

        i += rg.num_rows
        # Everything up to (but excluding) `end` has been covered.
        if end is not None and i >= end:
            break
    return df_set
def chunk(self):
    """Read and return the first row group of the parquet file at ``self.path``.

    All columns listed in ``pf.columns`` are read, using the file's own
    ``pf.categories``.

    Returns:
        The frame produced by ``pf.read_row_group`` for row group 0.

    Raises:
        IndexError: If the file has no row groups.
    """
    pf = ParquetFile(self.path)
    rg = pf.row_groups[0]
    # Scope the handle with `with` so it is closed once the row group is
    # read; it used to be created inline and never closed.
    with pf.open(pf.fn) as infile:
        return pf.read_row_group(rg, pf.columns, categories=pf.categories,
                                 infile=infile)