def __init__(
    self,
    path: Path,
    freq: str,
    one_dim_target: bool = True,
    cache: bool = False,
    use_timestamp: bool = False,
) -> None:
    """Dataset over one or more JSON-lines files found under ``path``.

    Parameters
    ----------
    path
        Directory (or file) searched by ``self.files()`` for data files.
    freq
        Pandas frequency string; converted to an offset and stored.
    one_dim_target
        Forwarded to ``ProcessDataEntry``.
    cache
        When True, the per-file readers keep parsed lines in memory.
    use_timestamp
        Forwarded to ``ProcessDataEntry``.

    Raises
    ------
    OSError
        If ``self.files()`` finds no valid file under ``path``.
    """
    self.freq = to_offset(freq)
    self.cache = cache
    self.path = path
    self.process = ProcessDataEntry(
        freq, one_dim_target=one_dim_target, use_timestamp=use_timestamp
    )
    # Lazily computed length caches.
    self._len_per_file = None
    # Fix: ``__len__`` reads ``self._len`` but it was never initialized
    # here, which raised AttributeError on the first ``len(...)`` call.
    self._len = None

    if not self.files():
        raise OSError(f"no valid file found in {path}")

    # necessary, in order to preserve the cached datasets, in case caching
    # was enabled
    self._json_line_files = [
        jsonl.JsonLinesFile(path=path, cache=cache) for path in self.files()
    ]
def __iter__(self) -> Iterator[DataEntry]:
    """Yield one processed ``DataEntry`` per line across all files.

    Iterates the ``JsonLinesFile`` readers created in ``__init__`` instead
    of re-opening each path: the original code constructed fresh readers
    without the ``cache`` flag, so enabling ``cache=True`` had no effect.
    Each yielded entry is tagged with its file/line provenance under
    ``data["source"]``.
    """
    for json_line_file in self._json_line_files:
        for line in json_line_file:
            data = self.process(line.content)
            data["source"] = SourceContext(
                source=line.span.path, row=line.span.line
            )
            yield data
def __len__(self):
    """Total number of lines across all data files, cached after first call.

    NOTE(review): this definition is shadowed by a second ``__len__``
    defined later in the same class — only one of the two should be kept.
    """
    # ``getattr`` guards against ``_len`` never having been initialized in
    # ``__init__`` (it sets ``_len_per_file`` instead), which previously
    # raised AttributeError here.
    if getattr(self, "_len", None) is None:
        # No list needed inside sum(); a generator avoids materializing it.
        self._len = sum(
            len(jsonl.JsonLinesFile(path=path)) for path in self.files()
        )
    return self._len
def __len__(self):
    """Total number of lines across all data files.

    Sums the lengths of the ``JsonLinesFile`` readers built in
    ``__init__`` rather than re-opening every path on each call — the
    original re-created readers without the ``cache`` flag, so the work
    was repeated on every ``len(...)`` even with caching enabled.
    """
    return sum(len(reader) for reader in self._json_line_files)