Esempio n. 1
0
def test_get_schema_fails(mocker, source, downloaded_schema, expected_error):
    """Check that ``reader.get_schema(source)`` raises ``expected_error``.

    ``arche.readers.schema.get_schema_from_url`` is patched (autospec'd) to
    return ``downloaded_schema`` for the duration of the test.
    """
    patch_target = "arche.readers.schema.get_schema_from_url"
    mocker.patch(patch_target, return_value=downloaded_schema, autospec=True)

    with pytest.raises(expected_error):
        reader.get_schema(source)
Esempio n. 2
0
def test_get_schema(mocker, source, downloaded, expected):
    """Check that ``reader.get_schema(source)`` returns ``expected``.

    ``arche.readers.schema.get_schema_from_url`` is patched (autospec'd) to
    return ``downloaded`` for the duration of the test.
    """
    patch_target = "arche.readers.schema.get_schema_from_url"
    mocker.patch(patch_target, return_value=downloaded, autospec=True)

    actual = reader.get_schema(source)

    assert actual == expected
Esempio n. 3
0
    def __init__(
        self,
        source: Union[str, pd.DataFrame, RawItems],
        schema: Optional[sr.SchemaSource] = None,
        target: Optional[Union[str, pd.DataFrame]] = None,
        start: int = 0,
        count: Optional[int] = None,
        filters: Optional[api.Filters] = None,
        expand: bool = True,
    ):
        """Store the validation settings and create an empty report.

        Args:
            source: a data source to validate, accepts job keys, pandas df, lists
            schema: a JSON schema source used to run validation
            target: a data source to compare with
            start: an item number to start reading from
            count: the amount of items to read from start
            filters: Scrapinghub filtering, see
            https://python-scrapinghub.readthedocs.io/en/latest/client/apidocs.html#scrapinghub.client.items.Items # noqa
            expand: if True, use flattened data in garbage rules, affects performance
            see flatten_df

        Raises:
            ValueError: if `source` and `target` are the same string.
        """
        # Only compare when both sides are strings. `DataFrame == str` is an
        # element-wise comparison whose result has no single truth value, so a
        # string source combined with a DataFrame target would otherwise fail
        # here with an unrelated "truth value ... is ambiguous" ValueError.
        if isinstance(source, str) and isinstance(target, str) and target == source:
            raise ValueError(
                "'target' is equal to 'source'. Data to compare should have different sources."
            )
        if isinstance(source, pd.DataFrame):
            # Warn up front: the DataFrame is still accepted as a source.
            logging.warning(
                "Pandas stores `NA` (missing) data differently, "
                "which might affect schema validation. "
                "Should you care, consider passing raw data in array-like types.\n"
                "For more details, see https://pandas.pydata.org/pandas-docs/"
                "stable/user_guide/gotchas.html#nan-integer-na-values-and-na-type-promotions"
            )
        self.source = source
        # Defaults used when no schema is supplied.
        self._schema = None
        self.schema_source = None
        if schema:
            # NOTE(review): assigned through `self.schema`, presumably a
            # property setter defined elsewhere in the class - confirm.
            self.schema = sr.get_schema(schema)
        self.target = target
        self.start = start
        self.count = count
        self.filters = filters
        self.expand = expand
        # Item holders start empty; nothing is read in the constructor.
        self._source_items = None
        self._target_items = None

        self.report = Report()
Esempio n. 4
0
    def __init__(
        self,
        source: str,
        schema: Optional[SchemaSource] = None,
        target: Optional[str] = None,
        start: int = 0,
        count: Optional[int] = None,
        filters: Optional[api.Filters] = None,
        expand: bool = True,
    ):
        """Store the validation settings and create an empty report.

        Args:
            source: a data source to validate. Supports job or collection keys
            schema: a JSON schema source used to run validation
            target: a data source to compare with; dropped (with a warning)
                when it equals `source`
            start: an item number to start reading from
            count: the amount of items to read from start
            filters: Scrapinghub filtering
            expand: if enabled, use flattened data in garbage rules, affects
                performance, see flatten_df
        """
        self.source = source

        # Keep the target unless it merely repeats the source.
        self.target = target
        if target == source:
            logger.warning(
                "'target' is the same as 'source', and will be ignored")
            self.target = None

        self.start, self.count = start, count
        self.filters, self.expand = filters, expand

        # The schema source is remembered as given; it is resolved right away
        # only when it is truthy.
        self.schema_source = schema
        self._schema = get_schema(schema) if schema else None

        self._source_items = self._target_items = None

        self.report = Report()
Esempio n. 5
0
 def schema(self, schema_source):
     """Remember `schema_source` and store the schema resolved from it.

     The source is recorded first, then `sr.get_schema` is called on it and
     the result is kept in `_schema`.
     """
     self.schema_source = schema_source
     resolved = sr.get_schema(schema_source)
     self._schema = resolved
Esempio n. 6
0
 def schema(self):
     """Return the stored schema, resolving it from `schema_source` if needed.

     `sr.get_schema` is called only when `_schema` is falsy and
     `schema_source` is truthy; its result is stored in `_schema`.
     """
     # Nothing to resolve: a schema is already stored, or there is no source.
     if self._schema or not self.schema_source:
         return self._schema
     self._schema = sr.get_schema(self.schema_source)
     return self._schema
Esempio n. 7
0
def test_get_schema_fails_on_type():
    """Check the ValueError message raised by ``reader.get_schema(1)``."""
    expected_message = (
        """"1" is an unidentified schema source.\nA dict, a full s3 path or URL is expected"""
    )

    with pytest.raises(ValueError) as raised:
        reader.get_schema(1)

    assert str(raised.value) == expected_message