def create_json_schema(
    source_key: str, item_numbers: Optional[List[int]] = None
) -> Optional[dict]:
    """Infer a JSON schema from a sample of items in a job or collection.

    Args:
        source_key: a Scrapy Cloud job or collection key.
        item_numbers: 0-based indexes of the items to sample; when omitted,
            a sample is chosen by `set_item_no`.

    Returns:
        The inferred schema, or None when the key is invalid, the source has
        no items, or an item number is out of range (the error is logged).
    """
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        # Only the job branch needs a client, so create it lazily here.
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return None
    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return None
    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        # Sort a copy so the caller's list is not mutated.
        item_numbers = sorted(item_numbers)
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            logger.error(item_n_err.format(item_numbers[-1], items_count - 1))
            return None
    else:
        item_numbers = set_item_no(items_count)
    samples = []
    for n in item_numbers:
        # Fetch one item at a time at each sampled index.
        items = api.get_items(source_key, start_index=n, count=1)
        samples.append(items[0])
    return infer_schema(samples)
def create_json_schema(
    source_key: str, item_numbers: Optional[List[int]] = None
) -> dict:
    """Infer a JSON schema from a sample of items in a job or collection.

    Args:
        source_key: a Scrapy Cloud job or collection key.
        item_numbers: 0-based indexes of the items to sample; when omitted,
            a sample is chosen by `set_item_no`.

    Returns:
        The inferred schema.

    Raises:
        ValueError: if the key is invalid, the source has no items, or an
            item number is out of range.
    """
    if helpers.is_collection_key(source_key):
        items_count = api.get_collection(source_key).count()
    elif helpers.is_job_key(source_key):
        items_count = api.get_items_count(ScrapinghubClient().get_job(source_key))
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")
    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")
    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        # Sort a copy so the caller's list is not mutated.
        item_numbers = sorted(item_numbers)
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            raise ValueError(
                item_n_err.format(item_numbers[-1], items_count - 1))
    else:
        item_numbers = set_item_no(items_count)
    samples = []
    for n in item_numbers:
        # Fetch one item at a time at each sampled index, no progress bar.
        items = api.get_items(source_key, start_index=n, count=1, p_bar=None)
        samples.append(items[0])
    return infer_schema(samples)
def get_items(
    source: Union[str, pd.DataFrame, RawItems],
    start: int,
    count: Optional[int],
    filters: Optional[api.Filters],
    expand: bool,
) -> Union[Items, JobItems, CollectionItems]:
    """Wrap `source` in the appropriate Items container.

    Args:
        source: a DataFrame, a raw iterable of items, or a job/collection key.
        start: item offset; only supported for job keys.
        count: maximum number of items to fetch, or None for all.
        filters: optional API filters applied when fetching.
        expand: whether to expand nested fields.

    Returns:
        Items for in-memory sources, JobItems or CollectionItems for keys.

    Raises:
        ValueError: if `start` is given for a collection, or the key is
            neither a job nor a collection key.
    """
    if isinstance(source, pd.DataFrame):
        return Items.from_df(source, expand=expand)
    # Strings are iterable too, so exclude them explicitly.
    elif isinstance(source, Iterable) and not isinstance(source, str):
        return Items.from_array(source, expand=expand)
    elif helpers.is_job_key(source):
        return JobItems(key=source, start=start, count=count, filters=filters, expand=expand)
    elif helpers.is_collection_key(source):
        if start:
            raise ValueError(
                "Collections API does not support 'start' parameter")
        return CollectionItems(key=source, count=count, filters=filters, expand=expand)
    else:
        raise ValueError(
            f"'{source}' is not a valid job or collection key")
def create_json_schema(
    source_key: str, items_numbers: Optional[List[int]] = None
) -> RawSchema:
    """Create schema based on sampled `source_key` items.

    Args:
        source_key: a Scrapy Cloud job or collection key.
        items_numbers: 0-based indexes of the items to sample; when omitted,
            a sample is chosen by `set_item_no`.

    Returns:
        The inferred raw schema.

    Raises:
        ValueError: if the key is invalid, the source has no items, or an
            item number is out of range.
    """
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
        start_mask = ""
    elif helpers.is_job_key(source_key):
        items_count = api.get_items_count(api.get_job(source_key))
        # Job item keys are prefixed with the job key, e.g. "112358/13/21/0".
        start_mask = f"{source_key}/"
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")
    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")
    items_numbers = items_numbers or set_item_no(items_count)
    # Valid indexes are 0..items_count-1 (items_count itself is out of range).
    if max(items_numbers) >= items_count or min(items_numbers) < 0:
        raise ValueError(
            f"Expected values between 0 and {items_count - 1}, got '{items_numbers}'"
        )
    samples = []
    for n in items_numbers:
        item = api.get_items(
            source_key, count=1, start_index=n, start=f"{start_mask}{n}", p_bar=None
        )[0]
        # Strip storage metadata fields so they don't pollute the schema.
        item.pop("_type", None)
        item.pop("_key", None)
        samples.append(item)
    return infer_schema(samples)
def run_all_rules(self):
    """Execute metadata checks (jobs only) and then all rule groups."""
    source = self.source_items
    # Metadata rules only apply to jobs, not collections or raw items.
    if helpers.is_job_key(source.key):
        self.check_metadata(source.job)
        if self.target_items:
            self.compare_metadata(source.job, self.target_items.job)
    self.run_general_rules()
    self.run_comparison_rules()
    self.run_schema_rules()
def get_items(
    source: Union[str, pd.DataFrame, RawItems],
    count: Optional[int],
    start: Optional[str],
    filters: Optional[api.Filters],
) -> Items:
    """Build an Items instance from a dataframe, a raw iterable, or a key.

    In-memory sources (DataFrame, iterable) are wrapped directly; strings
    are treated as job or collection keys and fetched lazily.
    """
    if isinstance(source, pd.DataFrame):
        return Items.from_df(source)
    # A string is iterable, so rule it out before the generic iterable case.
    if isinstance(source, Iterable) and not isinstance(source, str):
        return Items.from_array(cast(RawItems, source))
    if helpers.is_job_key(source):
        start_index = int(start or 0)
        return JobItems(source, count, start_index, filters)
    if helpers.is_collection_key(source):
        return CollectionItems(source, count, start, filters)
    raise ValueError(
        f"'{source}' is not a valid job or collection key")
def get_items(
    source: str,
    start: int,
    count: Optional[int],
    filters: Optional[api.Filters],
    expand: bool,
) -> Union[JobItems, CollectionItems]:
    """Return a JobItems or CollectionItems wrapper for the given key.

    Raises:
        ValueError: if `start` is supplied for a collection key, or the key
            is neither a job nor a collection key.
    """
    if helpers.is_job_key(source):
        return JobItems(
            key=source, start=start, count=count, filters=filters, expand=expand
        )
    if helpers.is_collection_key(source):
        # Collections are cursor-based; a numeric offset has no meaning there.
        if start:
            raise ValueError("Collections API does not support 'start' parameter")
        return CollectionItems(
            key=source, count=count, filters=filters, expand=expand
        )
    raise ValueError(f"'{source}' is not a valid job or collection key")
def get_source(source_key: str):
    """Return an iterable item store for a collection or job key.

    Args:
        source_key: a Scrapy Cloud job or collection key.

    Returns:
        The collection store for collection keys, or the job's items
        resource for job keys.

    Raises:
        ValueError: if `source_key` is neither a job nor a collection key
            (previously this fell through and returned None silently).
    """
    if helpers.is_collection_key(source_key):
        return get_collection(source_key)
    if helpers.is_job_key(source_key):
        return ScrapinghubClient().get_job(source_key).items
    raise ValueError(f"'{source_key}' is not a valid job or collection key")
def test_is_job_key(job_key, expected):
    """Check that `is_job_key` classifies the parametrized key correctly."""
    assert h.is_job_key(job_key) is expected