Example #1
0
def create_json_schema(source_key: str,
                       item_numbers: List[int] = None) -> dict:
    """Infer a JSON schema from sampled items of a job or collection.

    Args:
        source_key: a key accepted by `helpers.is_collection_key` or
            `helpers.is_job_key`.
        item_numbers: indexes of the items to sample. When empty or None,
            the indexes come from `set_item_no(items_count)`. The list
            passed in is not modified.

    Returns:
        The result of `infer_schema` for the sampled items. An error is
        logged and None is returned when the key is not recognised, the
        source has no items, or an item number is out of range.
    """
    if helpers.is_collection_key(source_key):
        items_count = api.get_collection(source_key).count()
    elif helpers.is_job_key(source_key):
        # A client is only needed to look up jobs, so build it here rather
        # than unconditionally at the top of the function.
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return

    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        # Sort a copy: the caller's list must stay untouched.
        item_numbers = sorted(item_numbers)
        lowest, highest = item_numbers[0], item_numbers[-1]
        if lowest < 0 or highest >= items_count:
            # Report the value that is actually out of range, not always
            # the largest one.
            bad_number = lowest if lowest < 0 else highest
            logger.error(item_n_err.format(bad_number, items_count - 1))
            return
    else:
        item_numbers = set_item_no(items_count)

    # One request per sampled index; each request returns a single item.
    samples = [
        api.get_items(source_key, start_index=n, count=1)[0]
        for n in item_numbers
    ]

    return infer_schema(samples)
Example #2
0
 def data_quality_report(self, bucket: Optional[str] = None):
     """Build a `DataQualityReport` for the current source items.

     The source is checked as a string key first, then the schema; the
     IPython output is cleared right before the report is created.

     Args:
         bucket: passed through unchanged to `DataQualityReport`.

     Raises:
         ValueError: if the source is a collection key or the schema is
             empty.
     """
     source_key = str(self.source)
     if helpers.is_collection_key(source_key):
         raise ValueError("Collections are not supported")

     if not self.schema:
         raise ValueError("Schema is empty")

     IPython.display.clear_output()
     DataQualityReport(self.source_items, self.schema, self.report, bucket)
Example #3
0
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None) -> dict:
    """Infer a JSON schema from sampled items of a job or collection.

    Args:
        source_key: a key accepted by `helpers.is_collection_key` or
            `helpers.is_job_key`.
        item_numbers: indexes of the items to sample. When empty or None,
            the indexes come from `set_item_no(items_count)`. The list
            passed in is not modified.

    Returns:
        The result of `infer_schema` for the sampled items.

    Raises:
        ValueError: if the key is not a job or collection key, the source
            has no items, or an item number is outside
            `0..items_count - 1`.
    """
    if helpers.is_collection_key(source_key):
        items_count = api.get_collection(source_key).count()
    elif helpers.is_job_key(source_key):
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")

    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        # Sort a copy: the caller's list must stay untouched.
        item_numbers = sorted(item_numbers)
        lowest, highest = item_numbers[0], item_numbers[-1]
        if lowest < 0 or highest >= items_count:
            # Report the value that is actually out of range, not always
            # the largest one.
            bad_number = lowest if lowest < 0 else highest
            raise ValueError(
                item_n_err.format(bad_number, items_count - 1))
    else:
        item_numbers = set_item_no(items_count)

    # One request per sampled index; each request returns a single item.
    samples = [
        api.get_items(source_key, start_index=n, count=1, p_bar=None)[0]
        for n in item_numbers
    ]

    return infer_schema(samples)
Example #4
0
 def get_items(
     source: Union[str, pd.DataFrame, RawItems],
     start: int,
     count: Optional[int],
     filters: Optional[api.Filters],
     expand: bool,
 ) -> Union[JobItems, CollectionItems]:
     """Wrap `source` in the items container that matches its kind.

     A DataFrame goes through `Items.from_df`, any other non-string
     iterable through `Items.from_array`, a job key becomes `JobItems`
     and a collection key becomes `CollectionItems`.

     Raises:
         ValueError: if `source` is neither a job nor a collection key,
             or if a truthy `start` is given for a collection key.
     """
     if isinstance(source, pd.DataFrame):
         return Items.from_df(source, expand=expand)

     # Strings are iterable too, but they are keys rather than raw items.
     if not isinstance(source, str) and isinstance(source, Iterable):
         return Items.from_array(source, expand=expand)

     if helpers.is_job_key(source):
         return JobItems(
             key=source, start=start, count=count, filters=filters,
             expand=expand,
         )

     if not helpers.is_collection_key(source):
         raise ValueError(
             f"'{source}' is not a valid job or collection key")

     if start:
         raise ValueError(
             "Collections API does not support 'start' parameter")

     return CollectionItems(
         key=source, count=count, filters=filters, expand=expand
     )
Example #5
0
def create_json_schema(source_key: str,
                       items_numbers: Optional[List[int]] = None) -> RawSchema:
    """Create schema based on sampled `source_key` items.

    Args:
        source_key: a key accepted by `helpers.is_collection_key` or
            `helpers.is_job_key`.
        items_numbers: indexes of the items to sample. When empty or None,
            the indexes come from `set_item_no(items_count)`.

    Returns:
        The result of `infer_schema` for the sampled items, with the
        `_type` and `_key` fields removed from every sample first.

    Raises:
        ValueError: if the key is not a job or collection key, the source
            has no items, or an item number is outside
            `0..items_count - 1`.
    """
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
        # For collections the `start` argument is the bare item number.
        start_mask = ""
    elif helpers.is_job_key(source_key):
        items_count = api.get_items_count(api.get_job(source_key))
        # For jobs the `start` argument is prefixed with the job key.
        start_mask = f"{source_key}/"
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")

    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    items_numbers = items_numbers or set_item_no(items_count)
    if max(items_numbers) >= items_count or min(items_numbers) < 0:
        # The largest valid index is items_count - 1; the message must not
        # advertise items_count itself as acceptable.
        raise ValueError(
            f"Expected values between 0 and {items_count - 1}, "
            f"got '{items_numbers}'"
        )

    samples = []
    for n in items_numbers:
        item = api.get_items(source_key,
                             count=1,
                             start_index=n,
                             start=f"{start_mask}{n}",
                             p_bar=None)[0]
        # Drop the `_type` and `_key` fields before inferring the schema.
        item.pop("_type", None)
        item.pop("_key", None)
        samples.append(item)

    return infer_schema(samples)
Example #6
0
    def data_quality_report(self, bucket: Optional[str] = None):
        """Build a `DataQualityReport` for the current source items.

        When the report has no results yet, a full (`fast=False`) schema
        validation is run and stored via `save_result` first.

        Args:
            bucket: passed through unchanged to `DataQualityReport`.

        Raises:
            ValueError: if the source is a collection key or the schema is
                empty.
        """
        if helpers.is_collection_key(self.source):
            raise ValueError("Collections are not supported")

        if not self.schema:
            raise ValueError("Schema is empty")

        if not self.report.results:
            save = self.save_result
            validation = schema_rules.validate(
                self.schema, items_dicts=self.source_items.dicts, fast=False
            )
            save(validation)

        DataQualityReport(self.source_items, self.schema, self.report, bucket)
Example #7
0
 def get_items(
     source: Union[str, pd.DataFrame, RawItems],
     count: Optional[int],
     start: Optional[str],
     filters: Optional[api.Filters],
 ) -> Items:
     """Wrap `source` in the items container that matches its kind.

     A DataFrame goes through `Items.from_df`, any other non-string
     iterable through `Items.from_array`, a job key becomes `JobItems`
     (with `start` converted to an int, 0 when falsy) and a collection
     key becomes `CollectionItems` (with `start` passed as given).

     Raises:
         ValueError: if `source` is neither a job nor a collection key.
     """
     if isinstance(source, pd.DataFrame):
         return Items.from_df(source)

     # Strings are iterable too, but they are keys rather than raw items.
     if not isinstance(source, str) and isinstance(source, Iterable):
         raw_items = cast(RawItems, source)
         return Items.from_array(raw_items)

     if helpers.is_job_key(source):
         start_index = int(start or 0)
         return JobItems(source, count, start_index, filters)

     if helpers.is_collection_key(source):
         return CollectionItems(source, count, start, filters)

     raise ValueError(
         f"'{source}' is not a valid job or collection key")
Example #8
0
 def get_items(
     source: str,
     start: int,
     count: Optional[int],
     filters: Optional[api.Filters],
     expand: bool,
 ) -> Union[JobItems, CollectionItems]:
     """Create the items container matching the kind of key in `source`.

     A job key produces `JobItems`; a collection key produces
     `CollectionItems`.

     Raises:
         ValueError: if `source` is neither a job nor a collection key,
             or if a truthy `start` is given for a collection key.
     """
     if helpers.is_job_key(source):
         return JobItems(
             key=source,
             start=start,
             count=count,
             filters=filters,
             expand=expand,
         )

     if not helpers.is_collection_key(source):
         raise ValueError(f"'{source}' is not a valid job or collection key")

     if start:
         raise ValueError("Collections API does not support 'start' parameter")

     return CollectionItems(
         key=source,
         count=count,
         filters=filters,
         expand=expand,
     )
Example #9
0
def get_source(source_key):
    """Return the items source behind `source_key`.

    A collection key yields `get_collection(source_key)`; a job key
    yields the `items` attribute of the job fetched through a new
    `ScrapinghubClient`. Any other key yields None.
    """
    if helpers.is_collection_key(source_key):
        return get_collection(source_key)

    if not helpers.is_job_key(source_key):
        return None

    job = ScrapinghubClient().get_job(source_key)
    return job.items
Example #10
0
def test_is_collection_key(c_key, expected):
    """Check that `h.is_collection_key(c_key)` is exactly `expected`.

    NOTE(review): `c_key` and `expected` are presumably supplied by a
    parametrize decorator that is not visible here - confirm.
    """
    result = h.is_collection_key(c_key)
    assert result is expected