Beispiel #1
0
    def __iter__(self) -> Iterator[DataEntry]:
        """Yield the processed entries of ``self.list_data`` for this worker.

        The dataset is split into roughly equally sized segments, one per
        worker; only rows whose index lies in ``[bounds.lower, bounds.upper)``
        are produced. Each selected entry is copied via its ``copy()`` method,
        passed through ``self.process``, and tagged under the ``"source"`` key
        with a ``SourceContext`` recording where it came from.
        """
        segment = util.get_bounds_for_mp_data_loading(len(self))
        for row_number, raw_entry in enumerate(self.list_data):
            if segment.lower <= row_number < segment.upper:
                # Process a copy so the entry stored in list_data is not the
                # object handed to self.process.
                entry = self.process(raw_entry.copy())
                entry["source"] = SourceContext(
                    source="list_data", row=row_number
                )
                yield entry
Beispiel #2
0
    def __iter__(self):
        """Yield one parsed ``Line`` per JSON line in this worker's segment.

        The file is split into roughly equally sized segments, one per worker;
        only lines whose index lies in ``[bounds.lower, bounds.upper)`` are
        read. Every yielded ``Line`` carries a ``Span`` with the file path and
        line number it was parsed from.

        When ``self.cache`` is set, the parsed lines are stored in
        ``self._data_cache`` after a *complete* pass over the file, and later
        iterations replay that cache instead of re-reading the file. An
        iteration that is abandoned part-way leaves the cache untouched, so a
        truncated cache can never be replayed as if it were the full data.

        Raises:
            GluonTSDataError: if a selected line cannot be parsed (the parse
                step raised ``ValueError``); the original error is chained as
                the cause.
        """
        # Basic idea is to split the dataset into roughly equally sized segments
        # with lower and upper bound, where each worker is assigned one segment
        bounds = get_bounds_for_mp_data_loading(len(self))

        if self.cache and self._data_cache:
            yield from self._data_cache
            return

        # Collected locally and only published once the whole segment has been
        # read; a consumer that stops early must not leave a partial cache.
        pending = []
        with self.open(self.path) as jsonl_file:
            for line_number, raw in enumerate(jsonl_file):
                if not bounds.lower <= line_number < bounds.upper:
                    continue

                span = Span(path=self.path, line=line_number)
                # Keep the try body limited to parsing so that exceptions
                # raised at the yield are not misreported as bad JSON.
                try:
                    parsed_line = Line(json.loads(raw), span=span)
                except ValueError as err:
                    raise GluonTSDataError(
                        f"Could not read json line {line_number}, {raw}"
                    ) from err

                if self.cache:
                    pending.append(parsed_line)
                yield parsed_line

        if self.cache:
            self._data_cache.extend(pending)