def test_stream_results_kwarg(stream_results):
    def paged_results(page_token):
        assert False, "should not be called"

    with pytest.raises(TypeError):
        actual_data = stream_results(paged_results, page_token=42)
        list(actual_data)
def test_stream_results_pagination():
    input_data = [
        {"url": "something"},
        {"url": "something2"},
    ]
    input_data2 = [
        {"url": "something3"},
    ]
    input_data3 = [
        {"url": "something4"},
    ]

    def page_results2(page_token=None) -> TestPagedResult:
        result_per_token = {
            None: TestPagedResult(results=input_data, next_page_token=b"two"),
            b"two": TestPagedResult(results=input_data2, next_page_token=b"three"),
            b"three": TestPagedResult(results=input_data3, next_page_token=None),
        }
        return result_per_token[page_token]

    # multiple calls, one per page, to walk the whole pagination
    actual_data = stream_results(page_results2)
    assert list(actual_data) == input_data + input_data2 + input_data3
def test_stream_results_no_result():
    def paged_results(page_token) -> TestPagedResult:
        return TestPagedResult(results=[], next_page_token=None)

    # only 1 call, no pagination
    actual_data = stream_results(paged_results)
    assert list(actual_data) == []
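Together with test_stream_results_no_pagination further down, these tests pin down the contract under test: stream_results calls the paged function with page_token=None, follows each next_page_token until it is None, and refuses an explicit page_token keyword. The sketch below is inferred from the tests alone, not copied from the library (the real helper is swh.core's stream_results); TestPagedResult stands in for the paged-result type the tests construct.

from dataclasses import dataclass
from typing import Any, Callable, Iterator, List, Optional


@dataclass(frozen=True)
class TestPagedResult:
    """Stand-in for the paged-result type the tests construct."""

    results: List[Any]
    next_page_token: Optional[bytes]


def stream_results(
    f: Callable[..., TestPagedResult], *args, **kwargs
) -> Iterator[Any]:
    """Yield every result of the paginated function ``f``, page after page."""
    if "page_token" in kwargs:
        # Pagination is driven here; callers must not pass a page_token.
        raise TypeError('stream_results has no argument "page_token"')
    page_token = None
    while True:
        page = f(*args, page_token=page_token, **kwargs)
        yield from page.results
        page_token = page.next_page_token
        if page_token is None:
            break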
def projects_last_modified(self) -> ProjectsLastModifiedCache:
    if not self.incremental:
        # No point in loading the previous results if we're doing a full run
        return {}
    if self._project_last_modified is not None:
        return self._project_last_modified
    # We know there will be at least that many origins
    stream = stream_results(
        self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000
    )
    listed_origins = dict()
    # Projects can have slashes in them if they're subprojects, but the
    # mountpoint (last component) cannot.
    url_match = re.compile(
        r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
    )
    for origin in stream:
        url = origin.url
        match = url_match.match(url)
        assert match is not None
        matches = match.groupdict()
        namespace = matches["namespace"]
        project = matches["project"]
        # "Last modified" dates are the same across all VCS (tools, even)
        # within a project or subproject. An assertion here would be overkill.
        last_modified = origin.last_update
        assert last_modified is not None
        listed_origins[(namespace, project)] = last_modified.date()
    self._project_last_modified = listed_origins
    return listed_origins
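For illustration, this is how url_match splits origin URLs; the URLs below are hypothetical, only the pattern comes from the code above. The greedy (?P<project>.+) backtracks to the last slash, so the mountpoint is dropped while subproject slashes stay inside the project name.

import re

url_match = re.compile(r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*")

# Hypothetical URLs, for illustration only.
m = url_match.match("https://git.code.sf.net/p/myproject/git")
assert m is not None and m.groupdict() == {"namespace": "p", "project": "myproject"}

m = url_match.match("https://git.code.sf.net/p/myproject/subproject/git")
assert m is not None and m.groupdict() == {
    "namespace": "p",
    "project": "myproject/subproject",
}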
def test_fill_test_data(swh_scheduler):
    for task_type in TASK_TYPES.values():
        swh_scheduler.create_task_type(task_type)

    simulator.fill_test_data(swh_scheduler, num_origins=NUM_ORIGINS)

    origins = list(stream_results(swh_scheduler.get_listed_origins))
    assert len(origins) == NUM_ORIGINS

    res = swh_scheduler.search_tasks()
    assert len(res) == NUM_ORIGINS
def test_content_get_partition_murmur3_collision(
    self, swh_storage, mocker, sample_data
):
    """The Murmur3 token is used as a link from the index tables to the main
    table; non-matching contents with a colliding murmur3 hash are filtered
    out when reading the main table.

    This test checks that the content_get_partition endpoint returns all
    contents, even the collisions.
    """
    called = 0

    rows: Dict[int, Dict] = {}
    for tok, content in enumerate(sample_data.contents):
        cont = attr.evolve(content, data=None, ctime=now())
        row_d = {**cont.to_dict(), "tok": tok}
        rows[tok] = row_d

    # Whatever the requested range, always return every row (three times each)
    def mock_content_get_token_range(range_start, range_end, limit):
        nonlocal called
        called += 1
        for tok in list(rows.keys()) * 3:  # yield the same tok multiple times
            row_d = dict(rows[tok].items())
            row_d.pop("tok")
            yield (tok, ContentRow(**row_d))

    mocker.patch.object(
        swh_storage._cql_runner,
        "content_get_token_range",
        mock_content_get_token_range,
    )

    actual_results = list(
        stream_results(
            swh_storage.content_get_partition, partition_id=0, nb_partitions=1
        )
    )

    assert called > 0

    # everything is listed, even collisions
    assert len(actual_results) == 3 * len(sample_data.contents)
    # as we duplicated the returned results, dropping duplicates should yield
    # the original length
    assert len(set(actual_results)) == len(sample_data.contents)
def test_stream_results_no_pagination():
    input_data = [
        {"url": "something"},
        {"url": "something2"},
    ]

    def paged_results(page_token) -> TestPagedResult:
        return TestPagedResult(results=input_data, next_page_token=None)

    # only 1 call, no pagination
    actual_data = stream_results(paged_results)
    assert list(actual_data) == input_data
def projects_last_modified(self) -> ProjectsLastModifiedCache:
    if not self.incremental:
        # No point in loading the previous results if we're doing a full run
        return {}
    if self._project_last_modified is not None:
        return self._project_last_modified
    # We know there will be at least that many origins
    stream = stream_results(
        self.scheduler.get_listed_origins, self.lister_obj.id, limit=300_000
    )
    listed_origins = dict()
    # Projects can have slashes in them if they're subprojects, but the
    # mountpoint (last component) cannot.
    url_match = re.compile(
        r".*\.code\.sf\.net/(?P<namespace>[^/]+)/(?P<project>.+)/.*"
    )
    bzr_url_match = re.compile(
        r"http://(?P<project>[^/]+).bzr.sourceforge.net/bzr/([^/]+)"
    )
    cvs_url_match = re.compile(
        r"rsync://a.cvs.sourceforge.net/cvsroot/(?P<project>.+)/([^/]+)"
    )

    for origin in stream:
        url = origin.url
        match = url_match.match(url)
        if match is None:
            # Could be a bzr or cvs special endpoint
            bzr_match = bzr_url_match.match(url)
            cvs_match = cvs_url_match.match(url)
            matches = None
            if bzr_match is not None:
                matches = bzr_match.groupdict()
            elif cvs_match is not None:
                matches = cvs_match.groupdict()
            assert matches
            project = matches["project"]
            namespace = "p"  # no special namespacing for bzr and cvs projects
        else:
            matches = match.groupdict()
            namespace = matches["namespace"]
            project = matches["project"]
        # "Last modified" dates are the same across all VCS (tools, even)
        # within a project or subproject. An assertion here would be overkill.
        last_modified = origin.last_update
        assert last_modified is not None
        listed_origins[(namespace, project)] = last_modified.date()

    self._project_last_modified = listed_origins
    return listed_origins
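The two fallback patterns cover legacy Bazaar and CVS endpoints, which do not follow the *.code.sf.net scheme; both yield only a project name, hence the fixed "p" namespace. Again, the URLs below are hypothetical and only the patterns come from the code above.

import re

bzr_url_match = re.compile(
    r"http://(?P<project>[^/]+).bzr.sourceforge.net/bzr/([^/]+)"
)
cvs_url_match = re.compile(
    r"rsync://a.cvs.sourceforge.net/cvsroot/(?P<project>.+)/([^/]+)"
)

# Hypothetical URLs, for illustration only.
m = bzr_url_match.match("http://myproject.bzr.sourceforge.net/bzr/myproject")
assert m is not None and m.groupdict() == {"project": "myproject"}

m = cvs_url_match.match("rsync://a.cvs.sourceforge.net/cvsroot/myproject/module")
assert m is not None and m.groupdict() == {"project": "myproject"}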
def assert_results_ok(self, partition_id, nb_partitions, actual_results):
    expected_ids = [
        c.sha1
        for c in stream_results(
            self.indexer.storage.content_get_partition,
            partition_id=partition_id,
            nb_partitions=nb_partitions,
        )
    ]

    actual_results = list(actual_results)
    for indexed_data in actual_results:
        _id = indexed_data.id
        assert _id in expected_ids

        _tool_id = indexed_data.indexer_configuration_id
        assert _tool_id == self.indexer.tool["id"]
def indexed_contents_in_partition(
    self,
    partition_id: int,
    nb_partitions: int,
) -> Iterable[Sha1]:
    """Retrieve indexed content ids within partition_id.

    Args:
        partition_id: Index of the partition to fetch
        nb_partitions: Total number of partitions to split into

    """
    return stream_results(
        self.idx_storage.content_mimetype_get_partition,
        self.tool["id"],
        partition_id,
        nb_partitions,
    )
def indexed_contents_in_partition(
    self,
    partition_id: int,
    nb_partitions: int,
    page_token: Optional[str] = None,
) -> Iterable[Sha1]:
    """Retrieve indexed content ids within partition_id.

    Args:
        partition_id: Index of the partition to fetch
        nb_partitions: Total number of partitions to split into
        page_token: opaque token used for pagination; unused here, as
            stream_results drives the pagination itself

    """
    return stream_results(
        self.idx_storage.content_fossology_license_get_partition,
        self.tool["id"],
        partition_id,
        nb_partitions,
    )
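Both indexed_contents_in_partition variants above share the same shape: they wrap a paginated *_get_partition endpoint in stream_results so callers can iterate over content ids without ever touching a page token. A hypothetical call site, where `indexer` is assumed to be an instance of a class defining the method as above:

# Hypothetical usage, for illustration only.
seen = set()
for sha1 in indexer.indexed_contents_in_partition(partition_id=3, nb_partitions=16):
    seen.add(sha1)  # no page_token bookkeeping needed on the caller's side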
def test_task_schedule_origins_with_limit(swh_scheduler, storage):
    """Tests support of extra keyword-arguments."""
    _fill_storage_with_origins(storage, 50)
    limit = 20
    expected_origins = list(islice(stream_results(storage.origin_list), limit))
    nb_origins = len(expected_origins)
    assert nb_origins == limit
    max_task_size = 5
    nb_tasks, remainder = divmod(nb_origins, max_task_size)
    assert remainder == 0  # made the numbers go round

    result = invoke(
        swh_scheduler,
        False,
        [
            "task",
            "schedule_origins",
            "swh-test-ping",
            "--batch-size",
            max_task_size,
            "--limit",
            limit,
        ],
    )

    # Check the output
    expected = rf"""
Scheduled {nb_tasks} tasks \({nb_origins} origins\).
Done.
""".lstrip()
    assert result.exit_code == 0, result.output
    assert re.fullmatch(expected, result.output, re.MULTILINE), repr(result.output)

    tasks = swh_scheduler.search_tasks()
    _assert_origin_tasks_contraints(tasks, max_task_size, nb_origins, expected_origins)