Beispiel #1
0
def create_json_schema(source_key: str,
                       item_numbers: List[int] = None) -> dict:
    """Infer a JSON schema from items sampled out of a job or collection.

    Args:
        source_key: a job or collection key, as recognised by `helpers`.
        item_numbers: indexes of the items to sample. When empty or omitted,
            the indexes are picked by `set_item_no`. The passed list is not
            modified.

    Returns:
        The result of `infer_schema` over the sampled items, or None (after
        logging an error) when the key is neither a job nor a collection key,
        the source has no items, or a requested index is out of range.
    """
    if helpers.is_collection_key(source_key):
        items_count = api.get_collection(source_key).count()
    elif helpers.is_job_key(source_key):
        # The client is only needed for job keys, so build it only here.
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return None

    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return None

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        # Sort a copy so the caller's list keeps its original order.
        item_numbers = sorted(item_numbers)
        lowest, highest = item_numbers[0], item_numbers[-1]
        if highest >= items_count or lowest < 0:
            # Report the value that is actually out of range.
            offender = highest if highest >= items_count else lowest
            logger.error(item_n_err.format(offender, items_count - 1))
            return None
    else:
        item_numbers = set_item_no(items_count)

    # One request per index; each call yields a single-item sequence.
    samples = [
        api.get_items(source_key, start_index=n, count=1)[0]
        for n in item_numbers
    ]

    return infer_schema(samples)
Beispiel #2
0
 def fetch_data(self) -> np.ndarray:
     """Fetch the items for `self.key`, choosing between two api helpers.

     `api.get_items_with_pool` is used only when no filters are set and
     `self.count` is at least 200_000; every other case goes through
     `api.get_items`, which also receives the filters.
     """
     plain_fetch = self.filters or self.count < 200_000
     if not plain_fetch:
         return api.get_items_with_pool(self.key, self.count,
                                        self.start_index)
     return api.get_items(self.key, self.count, self.start_index,
                          self.filters)
Beispiel #3
0
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None) -> dict:
    """Infer a JSON schema from items sampled out of a job or collection.

    Args:
        source_key: a job or collection key, as recognised by `helpers`.
        item_numbers: indexes of the items to sample. When empty or omitted,
            the indexes are picked by `set_item_no`. The passed list is not
            modified.

    Returns:
        The result of `infer_schema` over the sampled items.

    Raises:
        ValueError: if `source_key` is neither a job nor a collection key,
            the source has no items, or a requested index is out of range.
    """
    if helpers.is_collection_key(source_key):
        items_count = api.get_collection(source_key).count()
    elif helpers.is_job_key(source_key):
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")

    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        # Sort a copy so the caller's list keeps its original order.
        item_numbers = sorted(item_numbers)
        lowest, highest = item_numbers[0], item_numbers[-1]
        if highest >= items_count or lowest < 0:
            # Report the value that is actually out of range.
            offender = highest if highest >= items_count else lowest
            raise ValueError(item_n_err.format(offender, items_count - 1))
    else:
        item_numbers = set_item_no(items_count)

    # One request per index; each call yields a single-item sequence.
    samples = [
        api.get_items(source_key, start_index=n, count=1, p_bar=None)[0]
        for n in item_numbers
    ]

    return infer_schema(samples)
Beispiel #4
0
def create_json_schema(source_key: str,
                       items_numbers: Optional[List[int]] = None) -> RawSchema:
    """Create schema based on sampled `source_key` items.

    Args:
        source_key: a job or collection key, as recognised by `helpers`.
        items_numbers: indexes of the items to sample. When empty or omitted,
            the indexes are picked by `set_item_no`.

    Returns:
        The result of `infer_schema` over the sampled items, with the
        `_type` and `_key` fields removed from every sample.

    Raises:
        ValueError: if `source_key` is neither a job nor a collection key,
            the source has no items, or any index falls outside
            `0 .. items_count - 1`.
    """
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
        start_mask = ""
    elif helpers.is_job_key(source_key):
        items_count = api.get_items_count(api.get_job(source_key))
        # For jobs the `start` value is prefixed with the job key.
        start_mask = f"{source_key}/"
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")

    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    items_numbers = items_numbers or set_item_no(items_count)
    if max(items_numbers) >= items_count or min(items_numbers) < 0:
        # The largest valid index is items_count - 1, so report that bound.
        raise ValueError(
            f"Expected values between 0 and {items_count - 1}, "
            f"got '{items_numbers}'"
        )

    samples = []
    for n in items_numbers:
        item = api.get_items(source_key,
                             count=1,
                             start_index=n,
                             start=f"{start_mask}{n}",
                             p_bar=None)[0]
        # Drop the `_type` and `_key` fields so they are not part of the
        # inferred schema.
        item.pop("_type", None)
        item.pop("_key", None)
        samples.append(item)

    return infer_schema(samples)
Beispiel #5
0
def test_get_items(mocker, mocked_items, count, start_index, start, filters,
                   expected_items):
    """Check `api.get_items` against a patched `get_source`.

    `arche.tools.api.get_source` is replaced so that it returns a `Source`
    built from `mocked_items`; the fetched array must equal
    `expected_items`.
    """
    fake_source = Source(mocked_items)
    mocker.patch("arche.tools.api.get_source",
                 autospec=True,
                 return_value=fake_source)

    fetched = api.get_items("source_key", count, start_index, start, filters)

    np.testing.assert_array_equal(fetched, expected_items)
Beispiel #6
0
 def fetch_data(self) -> np.ndarray:
     """Fetch the items for `self.key` through `api.get_items`.

     The `desc` label passed along names the last `/`-separated part of
     the key.
     """
     key_tail = self.key.rsplit('/')[-1]
     label = f"Fetching from '{key_tail}'"
     fetched = api.get_items(self.key, self.count, 0, self.start,
                             self.filters, desc=label)
     return fetched
Beispiel #7
0
def test_get_items(mocker, mocked_items, expected_items, start, count,
                   filters):
    """Check that `api.get_items` returns the expected slice of items.

    `arche.tools.api.get_source` is patched to return a `Source` built from
    `mocked_items`; the result must equal
    `expected_items[start:start + count]`.
    """
    fake_source = Source(mocked_items)
    mocker.patch("arche.tools.api.get_source",
                 autospec=True,
                 return_value=fake_source)

    fetched = api.get_items("source_key",
                            filters=filters,
                            count=count,
                            start_index=start)

    stop = start + count
    assert fetched == expected_items[start:stop]
Beispiel #8
0
 def fetch_data(self) -> np.ndarray:
     """Return what `api.get_items` yields for this object's key, count and filters."""
     fetched = api.get_items(
         self.key,
         self.count,
         0,
         self.filters,
     )
     return fetched
Beispiel #9
0
 def fetch_data(self):
     """Return what `api.get_items` yields for this object's key, count and filters."""
     fetched = api.get_items(
         self.key,
         self.count,
         0,
         self.filters,
     )
     return fetched