def test_get_schema_fails(mocker, source, downloaded_schema, expected_error):
    """A bad downloaded schema makes ``reader.get_schema`` raise the expected error."""
    # Stub the network fetch so the test never leaves the process.
    patched_target = "arche.readers.schema.get_schema_from_url"
    mocker.patch(patched_target, autospec=True, return_value=downloaded_schema)
    with pytest.raises(expected_error):
        reader.get_schema(source)
def test_get_schema(mocker, source, downloaded, expected):
    """``reader.get_schema`` resolves each supported source to the expected schema."""
    # Replace the URL fetch with canned data; autospec keeps the signature honest.
    patched_target = "arche.readers.schema.get_schema_from_url"
    mocker.patch(patched_target, autospec=True, return_value=downloaded)
    assert reader.get_schema(source) == expected
def __init__(
    self,
    source: Union[str, pd.DataFrame, RawItems],
    schema: Optional[sr.SchemaSource] = None,
    target: Optional[Union[str, pd.DataFrame]] = None,
    start: int = 0,
    count: Optional[int] = None,
    filters: Optional[api.Filters] = None,
    expand: bool = True,
):
    """
    Args:
        source: a data source to validate, accepts job keys, pandas df, lists
        schema: a JSON schema source used to run validation
        target: a data source to compare with
        start: an item number to start reading from
        count: the amount of items to read from start
        filters: Scrapinghub filtering, see https://python-scrapinghub.readthedocs.io/en/latest/client/apidocs.html#scrapinghub.client.items.Items # noqa
        expand: if True, use flattened data in garbage rules, affects performance
            see flatten_df

    Raises:
        ValueError: if `target` is the same string source as `source`.
    """
    # Comparing a job key against itself is meaningless, so reject it outright.
    # NOTE(review): identical DataFrame sources are not guarded here — only
    # string keys are checked.
    if isinstance(source, str) and target == source:
        raise ValueError(
            "'target' is equal to 'source'. Data to compare should have different sources."
        )
    if isinstance(source, pd.DataFrame):
        logging.warning(
            "Pandas stores `NA` (missing) data differently, "
            "which might affect schema validation. "
            "Should you care, consider passing raw data in array-like types.\n"
            "For more details, see https://pandas.pydata.org/pandas-docs/"
            "stable/user_guide/gotchas.html#nan-integer-na-values-and-na-type-promotions"
        )
    self.source = source
    self._schema = None
    self.schema_source = None
    if schema:
        # Assign the raw source: the `schema` property setter parses it once
        # via `sr.get_schema` and records the original in `schema_source`.
        # Previously the pre-parsed result was assigned, which parsed the
        # schema twice and stored the parsed dict as the "source".
        self.schema = schema
    self.target = target
    self.start = start
    self.count = count
    self.filters = filters
    self.expand = expand
    self._source_items = None
    self._target_items = None
    self.report = Report()
def __init__(
    self,
    source: str,
    schema: Optional[SchemaSource] = None,
    target: Optional[str] = None,
    start: int = 0,
    count: Optional[int] = None,
    filters: Optional[api.Filters] = None,
    expand: bool = True,
):
    """
    Args:
        source: a data source to validate. Supports job or collection keys
        schema: a JSON schema source used to run validation
        target: a data source to compare with
        start: an item number to start reading from
        count: the amount of items to read from start
        filters: Scrapinghub filtering
        expand: if enabled, use flattened data in garbage rules, affects
            performance, see flatten_df
    """
    self.source = source
    # Comparing a source against itself is pointless, so such a target
    # is dropped with a warning rather than kept.
    if target != self.source:
        self.target = target
    else:
        logger.warning("'target' is the same as 'source', and will be ignored")
        self.target = None
    self.start = start
    self.count = count
    self.filters = filters
    self.expand = expand
    self.schema_source = schema
    # Parse eagerly when a schema was given; otherwise leave the cache empty.
    self._schema = get_schema(schema) if schema else None
    self._source_items = None
    self._target_items = None
    self.report = Report()
def schema(self, source):
    """Point the instance at a new schema source and parse it right away.

    The raw source is stored first so `schema_source` reflects the
    assignment even if parsing fails.
    """
    self.schema_source = source
    self._schema = sr.get_schema(source)
def schema(self):
    """Return the parsed schema, fetching it lazily from `schema_source`.

    The parsed result is cached in `_schema`, so a (potentially remote)
    source is only read once.
    """
    # Compare with `is None` rather than truthiness so a legitimately
    # falsy cached schema (e.g. `{}`) is not re-fetched on every access.
    if self._schema is None and self.schema_source:
        self._schema = sr.get_schema(self.schema_source)
    return self._schema
def test_get_schema_fails_on_type():
    """A non-schema value such as a bare int is rejected with a descriptive error."""
    expected_message = (
        '"1" is an unidentified schema source.\n'
        "A dict, a full s3 path or URL is expected"
    )
    with pytest.raises(ValueError) as excinfo:
        reader.get_schema(1)
    assert str(excinfo.value) == expected_message