def create_json_schema(source_key: str, item_numbers: List[int] = None) -> dict:
    """Infer a JSON schema from a sample of items in a job or collection.

    Args:
        source_key: a Scrapinghub job or collection key.
        item_numbers: 0-based indices of items to sample; when omitted,
            an automatic selection is made via `set_item_no`.

    Returns:
        The inferred schema dict, or None (with an error logged) when the
        key is invalid, the source is empty, or an index is out of range.
    """
    if helpers.is_collection_key(source_key):
        items_count = api.get_collection(source_key).count()
    elif helpers.is_job_key(source_key):
        # Instantiate the client only when a job key actually needs resolving.
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return
    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return
    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        # Sort a copy so the caller's list is not mutated.
        item_numbers = sorted(item_numbers)
        if item_numbers[0] < 0 or item_numbers[-1] >= items_count:
            # Report the actual offending value: the lowest one when it is
            # negative, otherwise the highest out-of-range index (the old
            # code always reported the highest, even for a negative index).
            bad = item_numbers[0] if item_numbers[0] < 0 else item_numbers[-1]
            logger.error(item_n_err.format(bad, items_count - 1))
            return
    else:
        item_numbers = set_item_no(items_count)
    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1)
        samples.append(items[0])
    return infer_schema(samples)
def fetch_data(self) -> np.ndarray:
    """Fetch this source's items, pooling downloads for large unfiltered reads."""
    # The pooled fetcher is only worthwhile for big, unfiltered ranges.
    use_pool = not self.filters and self.count >= 200_000
    if use_pool:
        return api.get_items_with_pool(self.key, self.count, self.start_index)
    return api.get_items(self.key, self.count, self.start_index, self.filters)
def create_json_schema(source_key: str, item_numbers: Optional[List[int]] = None) -> dict:
    """Infer a JSON schema from sampled items of a job or collection.

    Raises ValueError for an invalid key, an empty source, or an item
    number outside [0, items_count - 1].
    """
    if helpers.is_collection_key(source_key):
        items_count = api.get_collection(source_key).count()
    elif helpers.is_job_key(source_key):
        items_count = api.get_items_count(ScrapinghubClient().get_job(source_key))
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")
    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")
    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if not item_numbers:
        item_numbers = set_item_no(items_count)
    else:
        item_numbers.sort()
        lowest, highest = item_numbers[0], item_numbers[-1]
        if highest >= items_count or lowest < 0:
            raise ValueError(
                item_n_err.format(highest, items_count - 1))
    # Fetch each sampled item individually by its start index.
    samples = [
        api.get_items(source_key, start_index=n, count=1, p_bar=None)[0]
        for n in item_numbers
    ]
    return infer_schema(samples)
def create_json_schema(source_key: str, items_numbers: Optional[List[int]] = None) -> RawSchema:
    """Create schema based on sampled `source_key` items.

    Args:
        source_key: a Scrapinghub job or collection key.
        items_numbers: 0-based indices of items to sample; when omitted or
            empty, an automatic selection is made via `set_item_no`.

    Returns:
        The schema inferred from the sampled items.

    Raises:
        ValueError: for an invalid key, an empty source, or indices
            outside [0, items_count - 1].
    """
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
        start_mask = ""
    elif helpers.is_job_key(source_key):
        items_count = api.get_items_count(api.get_job(source_key))
        # Job items are addressed as "<job_key>/<n>".
        start_mask = f"{source_key}/"
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")
    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")
    items_numbers = items_numbers or set_item_no(items_count)
    if max(items_numbers) >= items_count or min(items_numbers) < 0:
        # Valid indices are 0..items_count - 1; the previous message
        # overstated the inclusive upper bound by one.
        raise ValueError(
            f"Expected values between 0 and {items_count - 1}, got '{items_numbers}'"
        )
    samples = []
    for n in items_numbers:
        item = api.get_items(
            source_key, count=1, start_index=n, start=f"{start_mask}{n}", p_bar=None
        )[0]
        # Exclude the internal `_type`/`_key` fields from schema inference.
        item.pop("_type", None)
        item.pop("_key", None)
        samples.append(item)
    return infer_schema(samples)
def test_get_items(mocker, mocked_items, count, start_index, start, filters, expected_items):
    """`api.get_items` should slice the mocked source as requested."""
    # Replace the real source with a stub holding the fixture items.
    mocker.patch(
        "arche.tools.api.get_source",
        return_value=Source(mocked_items),
        autospec=True,
    )
    fetched = api.get_items("source_key", count, start_index, start, filters)
    np.testing.assert_array_equal(fetched, expected_items)
def fetch_data(self) -> np.ndarray:
    """Fetch this source's items, labelling the progress bar with the key's last segment."""
    job_number = self.key.rsplit("/")[-1]
    desc = f"Fetching from '{job_number}'"
    return api.get_items(self.key, self.count, 0, self.start, self.filters, desc=desc)
def test_get_items(mocker, mocked_items, expected_items, start, count, filters):
    """`api.get_items` should return the requested window of the mocked source."""
    # Stub out the real source with the fixture items.
    mocker.patch(
        "arche.tools.api.get_source",
        return_value=Source(mocked_items),
        autospec=True,
    )
    fetched = api.get_items(
        "source_key", start_index=start, count=count, filters=filters
    )
    expected_window = expected_items[start:start + count]
    assert fetched == expected_window
def fetch_data(self) -> np.ndarray:
    # Fetch `self.count` items of `self.key`, starting from index 0.
    # NOTE(review): `self.filters` is passed as the 4th positional
    # argument — verify it lines up with `api.get_items`'s parameter
    # order (other revisions pass a `start` argument in that position).
    return api.get_items(self.key, self.count, 0, self.filters)
def fetch_data(self):
    """Return `self.count` items fetched from `self.key`, starting at index 0."""
    # NOTE(review): no return annotation here; sibling revisions declare
    # np.ndarray — confirm what api.get_items returns in this revision.
    return api.get_items(self.key, self.count, 0, self.filters)