Example #1
# Assumed imports for these snippets; the exact module paths vary across
# Great Expectations releases, so treat this block as a sketch:
import os

from great_expectations.core.batch import Batch
from great_expectations.datasource import PandasDatasource
from great_expectations.datasource.types import BatchMarkers, PathBatchKwargs
from great_expectations.util import nested_update


def test_read_limit(test_folder_connection_path_csv):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    batch_kwargs = PathBatchKwargs({
        "path": os.path.join(str(test_folder_connection_path_csv), "test.csv"),
        # "reader_options": {"sep": ",", "header": 0, "index_col": 0},
        "reader_options": {"sep": ","},
    })
    nested_update(batch_kwargs, datasource.process_batch_parameters(limit=1))

    batch = datasource.get_batch(batch_kwargs=batch_kwargs)
    assert isinstance(batch, Batch)
    dataset = batch.data
    assert (dataset["col_1"] == [1]).all()
    assert len(dataset) == 1

    # A datasource should always return a batch with typed batch_kwargs and batch_markers
    assert isinstance(batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(batch.batch_markers, BatchMarkers)
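
# Why nested_update above rather than dict.update: process_batch_parameters
# returns nested kwargs (e.g. {"limit": 1}) that must merge into batch_kwargs
# without clobbering nested dicts like "reader_options". A minimal sketch of
# that merge, as a hypothetical stand-in for the real great_expectations helper:
def nested_update_sketch(d, u):
    """Recursively merge mapping u into mapping d, in place."""
    for key, value in u.items():
        if isinstance(value, dict) and isinstance(d.get(key), dict):
            nested_update_sketch(d[key], value)
        else:
            d[key] = value
    return d

merged = nested_update_sketch(
    {"path": "test.csv", "reader_options": {"sep": ","}},
    {"limit": 1},
)
assert merged == {"path": "test.csv", "reader_options": {"sep": ","}, "limit": 1}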
Example #2
    def _build_batch_kwargs_from_path(self,
                                      path,
                                      glob_config,
                                      reader_options=None,
                                      limit=None,
                                      partition_id=None):
        # We could add MD5 (e.g. for smallish files)
        # but currently don't want to assume the extra read is worth it
        # unless it's configurable
        # with open(path,'rb') as f:
        #     md5 = hashlib.md5(f.read()).hexdigest()
        batch_kwargs = PathBatchKwargs({"path": path})
        computed_partition_id = self._partitioner(path, glob_config)
        if partition_id and computed_partition_id:
            if partition_id != computed_partition_id:
                logger.warning(
                    "Provided partition_id does not match computed partition_id; consider explicitly "
                    "defining the asset or updating your partitioner.")
            batch_kwargs["partition_id"] = partition_id
        elif partition_id:
            batch_kwargs["partition_id"] = partition_id
        elif computed_partition_id:
            batch_kwargs["partition_id"] = computed_partition_id

        # Apply globally-configured reader options first; copy them so that
        # locally-specified options do not mutate the shared defaults
        batch_kwargs['reader_options'] = dict(self.reader_options)
        if reader_options:
            # Then layer on any locally-specified reader options
            batch_kwargs['reader_options'].update(reader_options)

        if limit is not None:
            batch_kwargs['limit'] = limit

        return batch_kwargs
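
# The partition_id branches above reduce to "explicit wins, computed is the
# fallback, and a mismatch warns". A self-contained sketch of that rule
# (hypothetical helper, for illustration only):
def resolve_partition_id(explicit_id, computed_id):
    """Prefer an explicitly provided partition_id over a computed one."""
    if explicit_id and computed_id and explicit_id != computed_id:
        print("warning: provided partition_id does not match computed partition_id")
    return explicit_id or computed_id

assert resolve_partition_id("2020-01-01", None) == "2020-01-01"
assert resolve_partition_id(None, "2020-01-02") == "2020-01-02"
assert resolve_partition_id("2020-01-01", "2020-01-02") == "2020-01-01"  # warns first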
Example #3
    def _get_iterator(self, generator_asset, reader_options=None, limit=None):
        logger.debug("Beginning SubdirReaderGenerator _get_iterator for generator_asset: %s" % generator_asset)
        # If the generator_asset is a file, then return the path.
        # Otherwise, use files in a subdir as batches
        if os.path.isdir(os.path.join(self.base_directory, generator_asset)):
            subdir_options = os.listdir(os.path.join(self.base_directory, generator_asset))
            batches = []
            for file_option in subdir_options:
                for extension in self.known_extensions:
                    if file_option.endswith(extension) and not file_option.startswith("."):
                        batches.append(os.path.join(self.base_directory, generator_asset, file_option))

            return self._build_batch_kwargs_path_iter(batches, reader_options=reader_options, limit=limit)
        else:
            for extension in self.known_extensions:
                path = os.path.join(self.base_directory, generator_asset + extension)
                if os.path.isfile(path):
                    return iter([
                        self._build_batch_kwargs_from_path(path, reader_options=reader_options, limit=limit)
                    ])
            # If we haven't returned yet, raise
            raise BatchKwargsError("No valid files found when searching {:s} using configured known_extensions: "
                                   "{:s} ".format(os.path.join(self.base_directory, generator_asset),
                                                  ', '.join(map(str, self.known_extensions))),
                                   batch_kwargs=PathBatchKwargs(
                                       path=os.path.join(self.base_directory, generator_asset))
                                   )
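
# The directory branch above filters subdir contents down to readable,
# non-hidden files. A self-contained sketch of that filter (the extension
# list here is hypothetical; the generator uses its configured
# known_extensions):
def matches_known_extension(filename, known_extensions=(".csv", ".tsv", ".parquet")):
    """A file qualifies if it has a known extension and is not hidden."""
    return not filename.startswith(".") and any(
        filename.endswith(ext) for ext in known_extensions
    )

assert matches_known_extension("test.csv")
assert not matches_known_extension(".hidden.csv")
assert not matches_known_extension("notes.txt")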
Example #4
    def _build_batch_kwargs_from_path(self, path, reader_method=None, reader_options=None, limit=None):
        batch_kwargs = self._datasource.process_batch_parameters(
            reader_method=reader_method or self.reader_method,
            reader_options=reader_options or self.reader_options,
            limit=limit)
        batch_kwargs["path"] = path
        batch_kwargs["datasource"] = self._datasource.name
        return PathBatchKwargs(batch_kwargs)
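
# Note the "or" fallbacks above: locally passed reader_options replace the
# generator-level defaults wholesale rather than merging with them. A quick
# demonstration with plain dicts:
defaults = {"sep": ",", "header": 0}
local = {"sep": "|"}
assert (local or defaults) == {"sep": "|"}  # the header default is dropped
assert (None or defaults) == {"sep": ",", "header": 0}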
Example #5
    def _build_batch_kwargs_from_path(self, path):
        # We could add MD5 (e.g. for smallish files)
        # but currently don't want to assume the extra read is worth it
        # unless it's configurable
        # with open(path, 'rb') as f:
        #     md5 = hashlib.md5(f.read()).hexdigest()
        batch_kwargs = PathBatchKwargs({
            "path": path,
        })
        partition_id = self._partitioner(path)
        if partition_id is not None:
            batch_kwargs.update({"partition_id": partition_id})
        batch_kwargs.update(self.reader_options)
        return batch_kwargs
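
# Unlike Example #2, this variant merges reader options into the top level of
# batch_kwargs instead of nesting them under "reader_options". A quick
# demonstration with plain dicts (the path and options are hypothetical):
kwargs = {"path": "/data/2020-01-01.csv", "partition_id": "2020-01-01"}
kwargs.update({"sep": ","})  # stand-in for self.reader_options
assert kwargs == {"path": "/data/2020-01-01.csv", "partition_id": "2020-01-01", "sep": ","}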
Example #6
def test_standalone_pandas_datasource(test_folder_connection_path_csv):
    datasource = PandasDatasource(
        "PandasCSV",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": test_folder_connection_path_csv,
            }
        },
    )

    assert datasource.get_available_data_asset_names() == {
        "subdir_reader": {
            "names": [("test", "file")],
            "is_complete_list": True
        }
    }
    manual_batch_kwargs = PathBatchKwargs(
        path=os.path.join(str(test_folder_connection_path_csv), "test.csv"))

    generator = datasource.get_batch_kwargs_generator("subdir_reader")
    auto_batch_kwargs = generator.yield_batch_kwargs("test")

    assert manual_batch_kwargs["path"] == auto_batch_kwargs["path"]

    # Include some extra kwargs...
    # auto_batch_kwargs.update(
    #     {"reader_options": {"sep": ",", "header": 0, "index_col": 0}}
    # )
    auto_batch_kwargs.update({"reader_options": {"sep": ","}})
    batch = datasource.get_batch(batch_kwargs=auto_batch_kwargs)
    assert isinstance(batch, Batch)
    dataset = batch.data
    assert (dataset["col_1"] == [1, 2, 3, 4, 5]).all()
    assert len(dataset) == 5

    # A datasource should always return a batch with typed batch_kwargs and batch_markers
    assert isinstance(batch.batch_kwargs, PathBatchKwargs)
    assert isinstance(batch.batch_markers, BatchMarkers)