Esempio n. 1
0
    def __init__(
        self,
        path: Path,
        freq: str,
        one_dim_target: bool = True,
        cache: bool = False,
        use_timestamp: bool = False,
    ) -> None:
        """Set up the dataset and open one ``JsonLinesFile`` per discovered file.

        Args:
            path: Stored as ``self.path`` and reported in the error message
                (presumably the location ``self.files()`` searches -- confirm
                against ``files``).
            freq: Frequency string; converted with ``to_offset`` for
                ``self.freq`` and also passed unchanged to
                ``ProcessDataEntry``.
            one_dim_target: Forwarded to ``ProcessDataEntry``.
            cache: Stored as ``self.cache`` and forwarded to every
                ``jsonl.JsonLinesFile`` created here.
            use_timestamp: Forwarded to ``ProcessDataEntry``.

        Raises:
            OSError: If ``self.files()`` returns an empty (falsy) result.
        """
        self.freq = to_offset(freq)
        self.cache = cache
        self.path = path
        self.process = ProcessDataEntry(
            freq,
            one_dim_target=one_dim_target,
            use_timestamp=use_timestamp,
        )
        self._len_per_file = None

        # Discover the files once, so the list that is validated here is the
        # very same list that gets opened below (and the lookup is not
        # repeated).
        found_files = self.files()
        if not found_files:
            raise OSError(f"no valid file found in {path}")

        # Keep the JsonLinesFile objects on the instance: this is necessary
        # in order to preserve the cached datasets when caching is enabled.
        self._json_line_files = [
            jsonl.JsonLinesFile(path=file_path, cache=cache)
            for file_path in found_files
        ]
Esempio n. 2
0
 def __iter__(self) -> Iterator[DataEntry]:
     """Yield one processed entry for each line of each file.

     Every file returned by ``self.files()`` is read through
     ``jsonl.JsonLinesFile``. The content of each line is passed to
     ``self.process`` and the result is tagged under the ``"source"`` key
     with a ``SourceContext`` built from the path and line of the line's
     span.
     """
     for file_path in self.files():
         json_lines = jsonl.JsonLinesFile(file_path)
         for record in json_lines:
             entry = self.process(record.content)
             span = record.span
             entry["source"] = SourceContext(source=span.path, row=span.line)
             yield entry
Esempio n. 3
0
 def __len__(self):
     """Return the summed length of the ``JsonLinesFile`` of every file.

     The sum is computed the first time it is needed, stored in
     ``self._len``, and returned directly on later calls.
     """
     # Already computed: reuse the stored value.
     if self._len is not None:
         return self._len

     total = 0
     for file_path in self.files():
         total += len(jsonl.JsonLinesFile(path=file_path))
     self._len = total
     return self._len
Esempio n. 4
0
 def __len__(self):
     """Return the summed length of the ``JsonLinesFile`` of every file.

     The value is recomputed from ``self.files()`` on every call.
     """
     file_lengths = (
         len(jsonl.JsonLinesFile(file_path)) for file_path in self.files()
     )
     return sum(file_lengths)