def test_task_lister_gitlab(
    task_name,
    incremental,
    swh_scheduler_celery_app,
    swh_scheduler_celery_worker,
    mocker,
):
    stats = ListerStats(pages=10, origins=200)
    mock_lister = mocker.patch("swh.lister.gitlab.tasks.GitLabLister")
    mock_lister.from_configfile.return_value = mock_lister
    mock_lister.run.return_value = stats

    kwargs = dict(url="https://gitweb.torproject.org/")
    res = swh_scheduler_celery_app.send_task(
        f"swh.lister.gitlab.tasks.{task_name}",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    mock_lister.from_configfile.assert_called_once_with(
        incremental=incremental, **kwargs
    )
    mock_lister.run.assert_called_once_with()
    assert res.result == stats.dict()

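# The test above consumes ``task_name`` and ``incremental`` as parameters, so it
# is presumably driven by a ``pytest.mark.parametrize`` decorator along these
# lines (the exact task names are an assumption based on the naming used
# elsewhere in this section):
#
# @pytest.mark.parametrize(
#     "task_name,incremental",
#     [("IncrementalGitLabLister", True), ("FullGitLabLister", False)],
# )
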
def test_crates_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    # setup the mocked CratesLister
    lister = mocker.patch("swh.lister.crates.tasks.CratesLister")
    lister.from_configfile.return_value = lister
    stats = ListerStats(pages=42, origins=42)
    lister.run.return_value = stats

    res = swh_scheduler_celery_app.send_task("swh.lister.crates.tasks.CratesListerTask")
    assert res
    res.wait()
    assert res.successful()
    assert res.result == stats.dict()

    lister.from_configfile.assert_called_once_with()
    lister.run.assert_called_once_with()

def test_lister_gitlab(datadir, swh_scheduler, requests_mock):
    """Gitlab lister supports full listing"""
    instance = "gitlab.com"
    lister = GitLabLister(swh_scheduler, url=api_url(instance), instance=instance)

    response = gitlab_page_response(datadir, instance, 1)
    requests_mock.get(
        lister.page_url(),
        [{"json": response}],
        additional_matcher=_match_request,
    )

    listed_result = lister.run()
    expected_nb_origins = len(response)
    assert listed_result == ListerStats(pages=1, origins=expected_nb_origins)

    scheduler_origins = lister.scheduler.get_listed_origins(
        lister.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    for listed_origin in scheduler_origins:
        assert listed_origin.visit_type == "git"
        assert listed_origin.url.startswith(f"https://{instance}")
        assert listed_origin.last_update is not None

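# The GitLab lister tests in this section rely on a few module-level helpers
# (``api_url``, ``gitlab_page_response``, ``_match_request``) that are not shown
# here. A minimal sketch of what they could look like follows; the API path, the
# page-file naming, and the User-Agent check are assumptions.

import json
from pathlib import Path
from typing import Any, Dict, List


def api_url(instance: str) -> str:
    # Base URL of the GitLab REST API for a given instance (assumed layout).
    return f"https://{instance}/api/v4/"


def gitlab_page_response(
    datadir, instance: str, id_after: int
) -> List[Dict[str, Any]]:
    # Load one recorded page of the /projects endpoint from the test datadir
    # (file naming is an assumption).
    datapath = Path(datadir, f"https_{instance}", f"api_response_page{id_after}.json")
    return json.loads(datapath.read_text()) if datapath.exists() else []


def _match_request(request) -> bool:
    # Only serve mocked responses to requests issued by the lister itself,
    # identified here (by assumption) through its User-Agent header.
    return "Software Heritage" in request.headers.get("User-Agent", "")
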
def test_lister_gitlab_http_error_500(swh_scheduler, requests_mock, datadir):
    """Gitlab lister should skip buggy URL and move to next page."""
    instance = "gite.lirmm.fr"
    url = api_url(instance)
    lister = GitLabLister(swh_scheduler, url=url, instance=instance)

    url_page1 = lister.page_url()
    response1 = gitlab_page_response(datadir, instance, 1)
    url_page2 = lister.page_url(lister.per_page)
    url_page3 = lister.page_url(2 * lister.per_page)
    response3 = gitlab_page_response(datadir, instance, 3)

    requests_mock.get(
        url_page1,
        [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}],
        additional_matcher=_match_request,
    )
    requests_mock.get(
        url_page2,
        [{"status_code": 500}],
        additional_matcher=_match_request,
    )
    requests_mock.get(
        url_page3,
        [{"json": response3}],
        additional_matcher=_match_request,
    )

    listed_result = lister.run()

    expected_nb_origins = len(response1) + len(response3)
    assert listed_result == ListerStats(pages=2, origins=expected_nb_origins)

def test_ratelimit_once_recovery(
    swh_scheduler,
    caplog,
    requests_ratelimited,
    num_ratelimit,
    monkeypatch_sleep_calls,
    lister_credentials,
):
    """Check that the lister recovers from hitting the rate-limit once"""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)

    res = lister.run()
    # check that we used all the pages
    assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    token_users = []
    for record in caplog.records:
        if "Using authentication token" in record.message:
            token_users.append(record.args[0])

    # check that we used one more token than we saw rate limited requests
    assert len(token_users) == 1 + num_ratelimit

    # check that we slept for one second between our token uses
    assert monkeypatch_sleep_calls == [1]

def test_ratelimit_reset_sleep(
    swh_scheduler,
    caplog,
    requests_ratelimited,
    monkeypatch_sleep_calls,
    num_before_ratelimit,
    ratelimit_reset,
    github_credentials,
    lister_credentials,
):
    """Check that the lister properly handles rate-limiting when provided
    with authentication tokens"""
    caplog.set_level(logging.DEBUG, "swh.core.github.utils")

    lister = GitHubLister(scheduler=swh_scheduler, credentials=lister_credentials)
    res = lister.run()
    assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    # We sleep 1 second every time we change credentials, then we sleep until
    # ratelimit_reset + 1
    expected_sleep_calls = len(github_credentials) * [1] + [ratelimit_reset + 1]
    assert monkeypatch_sleep_calls == expected_sleep_calls

    found_exhaustion_message = False
    for record in caplog.records:
        if record.levelname == "INFO":
            if "Rate limits exhausted for all tokens" in record.message:
                found_exhaustion_message = True
                break

    assert found_exhaustion_message

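# The GitHub rate-limit tests above depend on constants and fixtures defined in
# the test module and its conftest (``NUM_PAGES``, ``ORIGIN_COUNT``,
# ``monkeypatch_sleep_calls``, ``requests_ratelimited``, the credential
# fixtures). A minimal sketch of the simpler pieces follows; the concrete
# numbers and the patched sleep target are assumptions.

import time

import pytest

NUM_PAGES = 10  # assumed number of mocked API pages
ORIGIN_COUNT = GitHubLister.PAGE_SIZE * NUM_PAGES  # one full page of origins per page


@pytest.fixture
def monkeypatch_sleep_calls(monkeypatch) -> list:
    """Record the durations passed to time.sleep() instead of actually sleeping."""
    sleeps: list = []

    def sleep(seconds):
        sleeps.append(seconds)

    monkeypatch.setattr(time, "sleep", sleep)
    return sleeps
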
def test_incremental(swh_scheduler, caplog, requests_mocker) -> None:
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # Number of origins to skip
    skip_origins = 2000
    expected_origins = ORIGIN_COUNT - skip_origins

    # Bump the last_seen_id in the scheduler backend
    set_lister_state(swh_scheduler, {"last_seen_id": skip_origins})

    # Run the lister in incremental mode
    lister = GitHubLister(scheduler=swh_scheduler)
    res = lister.run()

    # add 1 page to the number of full_pages if partial_page_len is not 0
    full_pages, partial_page_len = divmod(expected_origins, GitHubLister.PAGE_SIZE)
    expected_pages = full_pages + bool(partial_page_len)

    assert res == ListerStats(pages=expected_pages, origins=expected_origins)

    listed_origins = swh_scheduler.get_listed_origins(limit=expected_origins + 1)
    assert len(listed_origins.results) == expected_origins
    assert listed_origins.next_page_token is None

    lister_data = get_lister_data(swh_scheduler)
    assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT}

    check_origin_4321(swh_scheduler, lister_data)
    check_origin_5555(swh_scheduler, lister_data)

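# ``set_lister_state`` and ``get_lister_data`` manipulate the lister record
# stored in the scheduler backend. They are not shown in this section; a
# plausible sketch, assuming the lister is registered under the name "github"
# and instance "github", is:

from typing import Any, Dict


def get_lister_data(swh_scheduler):
    # Fetch (or lazily create) the scheduler-side record for the GitHub lister.
    return swh_scheduler.get_or_create_lister(name="github", instance_name="github")


def set_lister_state(swh_scheduler, state: Dict[str, Any]) -> None:
    # Persist an arbitrary incremental state for the GitHub lister.
    lister = swh_scheduler.get_or_create_lister(name="github", instance_name="github")
    lister.current_state = state
    swh_scheduler.update_lister(lister)
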
def test_launchpad_full_listing_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    lister = mocker.patch("swh.lister.launchpad.tasks.LaunchpadLister")
    lister.from_configfile.return_value = lister
    stats = ListerStats(pages=1, origins=28000)
    lister.run.return_value = stats

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.launchpad.tasks.FullLaunchpadLister"
    )
    assert res
    res.wait()
    assert res.successful()
    assert res.result == stats.dict()

    lister.from_configfile.assert_called_once_with()
    lister.run.assert_called_once_with()

def test_full_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    stats = ListerStats(pages=10, origins=900)
    mock_lister = mocker.patch("swh.lister.npm.tasks.NpmLister")
    mock_lister.from_configfile.return_value = mock_lister
    mock_lister.run.return_value = stats

    res = swh_scheduler_celery_app.send_task("swh.lister.npm.tasks.NpmListerTask")
    assert res
    res.wait()
    assert res.successful()

    mock_lister.from_configfile.assert_called_once_with(incremental=False)
    mock_lister.run.assert_called_once_with()
    assert res.result == stats.dict()

def test_incremental_listing(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    stats = ListerStats(pages=1, origins=90)
    mock_lister = mocker.patch(lister_module)
    mock_lister.from_configfile.return_value = mock_lister
    mock_lister.run.return_value = stats

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.sourceforge.tasks.IncrementalSourceForgeLister"
    )
    assert res
    res.wait()
    assert res.successful()

    mock_lister.from_configfile.assert_called_once()
    mock_lister.run.assert_called_once()
    assert res.result == stats.dict()

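# ``lister_module`` above is a module-level constant not shown in this section.
# It is presumably the dotted path of the class patched by the SourceForge task
# tests, along these lines (the exact value is an assumption):
lister_module = "swh.lister.sourceforge.tasks.SourceForgeLister"
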
def test_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    lister = mocker.patch("swh.lister.packagist.tasks.PackagistLister")
    lister.from_configfile.return_value = lister
    stats = ListerStats(pages=1, origins=286500)
    lister.run.return_value = stats

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.packagist.tasks.PackagistListerTask"
    )
    assert res
    res.wait()
    assert res.successful()
    assert res.result == stats.dict()

    lister.from_configfile.assert_called_once_with()
    lister.run.assert_called_once_with()

def test_lister_gitlab_heptapod(datadir, swh_scheduler, requests_mock):
    """Heptapod lister happily lists hg and hg_git repositories as hg origins"""
    name = "heptapod"
    instance = "foss.heptapod.net"
    lister = GitLabLister(
        swh_scheduler, url=api_url(instance), name=name, instance=instance
    )
    assert lister.LISTER_NAME == name

    response = gitlab_page_response(datadir, instance, 1)

    requests_mock.get(
        lister.page_url(),
        [{"json": response}],
        additional_matcher=_match_request,
    )

    listed_result = lister.run()
    expected_nb_origins = len(response)

    for entry in response:
        assert entry["vcs_type"] in ("hg", "hg_git")

    assert listed_result == ListerStats(pages=1, origins=expected_nb_origins)

    scheduler_origins = lister.scheduler.get_listed_origins(
        lister.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    for listed_origin in scheduler_origins:
        assert listed_origin.visit_type == "hg"
        assert listed_origin.url.startswith(f"https://{instance}")
        assert listed_origin.last_update is not None

def test_lister_cgit_with_base_git_url(
    url, base_git_url, expected_nb_origins, requests_mock_datadir, swh_scheduler
):
    """With base git url provided, listed urls should be the computed origin urls"""
    lister_cgit = CGitLister(
        swh_scheduler,
        url=url,
        base_git_url=base_git_url,
    )

    stats = lister_cgit.run()

    assert stats == ListerStats(pages=1, origins=expected_nb_origins)

    # test page parsing
    scheduler_origins = swh_scheduler.get_listed_origins(
        lister_cgit.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    # test listed repositories
    for listed_origin in scheduler_origins:
        assert listed_origin.visit_type == "git"
        assert listed_origin.url.startswith(base_git_url)
        assert not listed_origin.url.startswith(
            url
        ), f"url should be mapped to {base_git_url}"

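# The test above takes ``url``, ``base_git_url`` and ``expected_nb_origins`` as
# parameters, so it is presumably parametrized in the original test module.
# Illustrative shape only; the actual instances and counts are not reproduced
# here:
#
# @pytest.mark.parametrize(
#     "url,base_git_url,expected_nb_origins",
#     [("https://git.example.org/cgit/", "https://git.example.org/git", 16)],
# )
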
def test_lister_gitlab_with_pages(swh_scheduler, requests_mock, datadir):
    """Gitlab lister supports pagination"""
    instance = "gite.lirmm.fr"
    lister = GitLabLister(swh_scheduler, url=api_url(instance))

    response1 = gitlab_page_response(datadir, instance, 1)
    response2 = gitlab_page_response(datadir, instance, 2)

    requests_mock.get(
        lister.page_url(),
        [{"json": response1, "headers": {"Link": f"<{lister.page_url(2)}>; rel=next"}}],
        additional_matcher=_match_request,
    )
    requests_mock.get(
        lister.page_url(2),
        [{"json": response2}],
        additional_matcher=_match_request,
    )

    listed_result = lister.run()

    expected_nb_origins = len(response1) + len(response2)
    assert listed_result == ListerStats(pages=2, origins=expected_nb_origins)

    scheduler_origins = lister.scheduler.get_listed_origins(
        lister.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    for listed_origin in scheduler_origins:
        assert listed_origin.visit_type == "git"
        assert listed_origin.url.startswith(f"https://{instance}")
        assert listed_origin.last_update is not None

def test_lister_cgit_run_with_page(requests_mock_datadir, swh_scheduler):
    """cgit lister supports pagination"""
    url = "https://git.tizen/cgit/"
    lister_cgit = CGitLister(swh_scheduler, url=url)

    stats = lister_cgit.run()

    expected_nb_origins = 16
    assert stats == ListerStats(pages=3, origins=expected_nb_origins)

    # test page parsing
    scheduler_origins = swh_scheduler.get_listed_origins(
        lister_cgit.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    # test listed repositories
    for listed_origin in scheduler_origins:
        assert listed_origin.visit_type == "git"
        assert listed_origin.url.startswith("https://git.tizen")

    # test user agent content
    assert len(requests_mock_datadir.request_history) != 0
    for request in requests_mock_datadir.request_history:
        assert "User-Agent" in request.headers
        user_agent = request.headers["User-Agent"]
        assert "Software Heritage Lister" in user_agent
        assert __version__ in user_agent

def test_lister_cgit_run_populates_last_update(requests_mock_datadir, swh_scheduler):
    """cgit lister returns last updated date"""
    url = "https://git.tizen/cgit"

    urls_without_date = [
        f"https://git.tizen.org/cgit/{suffix_url}"
        for suffix_url in [
            "All-Projects",
            "All-Users",
            "Lock-Projects",
        ]
    ]

    lister_cgit = CGitLister(swh_scheduler, url=url)

    stats = lister_cgit.run()

    expected_nb_origins = 16
    assert stats == ListerStats(pages=3, origins=expected_nb_origins)

    # test page parsing
    scheduler_origins = swh_scheduler.get_listed_origins(
        lister_cgit.lister_obj.id
    ).results
    assert len(scheduler_origins) == expected_nb_origins

    # test listed repositories
    for listed_origin in scheduler_origins:
        if listed_origin.url in urls_without_date:
            assert listed_origin.last_update is None
        else:
            assert listed_origin.last_update is not None

def test_lister_gitlab_rate_limit(swh_scheduler, requests_mock, datadir, mocker):
    """Gitlab lister supports rate-limit"""
    instance = "gite.lirmm.fr"
    url = api_url(instance)
    lister = GitLabLister(swh_scheduler, url=url, instance=instance)

    url_page1 = lister.page_url()
    response1 = gitlab_page_response(datadir, instance, 1)
    url_page2 = lister.page_url(2)
    response2 = gitlab_page_response(datadir, instance, 2)

    requests_mock.get(
        url_page1,
        [{"json": response1, "headers": {"Link": f"<{url_page2}>; rel=next"}}],
        additional_matcher=_match_request,
    )
    requests_mock.get(
        url_page2,
        [
            # rate limited twice
            {"status_code": codes.forbidden, "headers": {"RateLimit-Remaining": "0"}},
            {"status_code": codes.forbidden, "headers": {"RateLimit-Remaining": "0"}},
            # ok
            {"json": response2},
        ],
        additional_matcher=_match_request,
    )

    # To avoid this test being too slow, we mock sleep within the retry behavior
    mock_sleep = mocker.patch.object(lister.get_page_result.retry, "sleep")

    listed_result = lister.run()

    expected_nb_origins = len(response1) + len(response2)
    assert listed_result == ListerStats(pages=2, origins=expected_nb_origins)

    assert_sleep_calls(mocker, mock_sleep, [1, WAIT_EXP_BASE])

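# ``assert_sleep_calls`` and ``WAIT_EXP_BASE`` come from the shared lister test
# utilities and the retry policy respectively; neither is shown in this section.
# A minimal sketch of the helper, under the assumption that it simply checks the
# recorded retry sleeps in order, could be:


def assert_sleep_calls(mocker, mock_sleep, sleep_durations):
    # Check that the mocked retry sleep was called once per expected duration,
    # in the given order.
    mock_sleep.assert_has_calls([mocker.call(duration) for duration in sleep_durations])
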
def test_lister_cgit_get_origin_from_repo_failing(
    requests_mock_datadir_missing_url, swh_scheduler
):
    """cgit lister skips the origin whose repository page is missing"""
    url = "https://git.tizen/cgit/"
    lister_cgit = CGitLister(swh_scheduler, url=url)

    stats = lister_cgit.run()

    # one repository page is missing, so one origin less than the full listing
    expected_nb_origins = 15
    assert stats == ListerStats(pages=3, origins=expected_nb_origins)

def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    # setup the mocked PyPILister
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=1, origins=0)

    res = swh_scheduler_celery_app.send_task("swh.lister.pypi.tasks.PyPIListerTask")
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with()
    lister.run.assert_called_once_with()

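# Several task tests in this section (PyPI, Phabricator, GitHub, Debian,
# BitBucket, Gitea) take a ``lister`` fixture instead of calling mocker.patch
# inline. A plausible sketch of such a fixture, shown here for the PyPI case
# (the patched path is an assumption and differs per test module), is:

import pytest


@pytest.fixture
def lister(mocker):
    # Patch the lister class imported in the Celery task module so that
    # send_task() exercises the task wiring without doing any real listing.
    return mocker.patch("swh.lister.pypi.tasks.PyPILister")
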
def test_phabricator_lister_task(
    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
    # setup the mocked PhabricatorLister
    lister.from_configfile.return_value = lister
    lister_stats = ListerStats(pages=2, origins=200)
    lister.run.return_value = lister_stats

    task_params = {
        "url": "https://forge.softwareheritage.org",
        "instance": "swh",
        "api_token": None,
    }

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.phabricator.tasks.FullPhabricatorLister", kwargs=task_params
    )
    assert res
    res.wait()
    assert res.successful()
    assert res.result == lister_stats.dict()

    lister.from_configfile.assert_called_once_with(**task_params)

def test_incremental(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    # setup the mocked GitHubLister
    lister.from_configfile.return_value = lister
    lister.state = GitHubListerState()
    lister.run.return_value = ListerStats(pages=5, origins=5000)

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.github.tasks.IncrementalGitHubLister"
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with()

def test_anonymous_ratelimit(swh_scheduler, caplog, requests_ratelimited) -> None:
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    lister = GitHubLister(scheduler=swh_scheduler)
    assert lister.anonymous
    assert "using anonymous mode" in caplog.records[-1].message
    caplog.clear()

    res = lister.run()
    assert res == ListerStats(pages=0, origins=0)

    last_log = caplog.records[-1]
    assert last_log.levelname == "WARNING"
    assert "No X-Ratelimit-Reset value found in responses" in last_log.message

def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    # setup the mocked DebianLister
    lister.from_configfile.return_value = lister
    stats = ListerStats(pages=12, origins=35618)
    lister.run.return_value = stats

    kwargs = dict(
        mirror_url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/",
        distribution="Ubuntu",
        suites=["xenial", "bionic", "focal"],
        components=["main", "multiverse", "restricted", "universe"],
    )
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.debian.tasks.DebianListerTask", kwargs=kwargs
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()
    assert res.result == stats.dict()

def test_range(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    # setup the mocked GitHubLister
    lister.return_value = lister
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=5, origins=5000)

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.github.tasks.RangeGitHubLister",
        kwargs=dict(first_id=12, last_id=42),
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(first_id=12, last_id=42)
    lister.run.assert_called_once_with()

def test_relister(swh_scheduler, caplog, requests_mocker) -> None:
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # Only set this state as a canary: in the currently tested mode, the lister
    # should not be touching it.
    set_lister_state(swh_scheduler, {"last_seen_id": 123})

    # Use "relisting" mode to list origins between id 10 and 1011
    lister = GitHubLister(scheduler=swh_scheduler, first_id=10, last_id=1011)
    res = lister.run()

    # Make sure we got two full pages of results
    assert res == ListerStats(pages=2, origins=2000)

    # Check that the relisting mode hasn't touched the stored state.
    lister_data = get_lister_data(swh_scheduler)
    assert lister_data.current_state == {"last_seen_id": 123}

def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=5, origins=5000)

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.bitbucket.tasks.FullBitBucketRelister",
        kwargs=dict(
            page_size=100,
            username="******",
            password="******",
        ),
    )
    assert res
    res.wait()
    assert res.successful()

    lister.run.assert_called_once()

def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    lister = mocker.patch("swh.lister.tuleap.tasks.TuleapLister")
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(url="https://tuleap.net")
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.tuleap.tasks.FullTuleapLister",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()

def test_from_empty_state(
    swh_scheduler, caplog, requests_mocker: requests_mock.Mocker
) -> None:
    caplog.set_level(logging.DEBUG, "swh.lister.github.lister")

    # Run the lister in incremental mode
    lister = GitHubLister(scheduler=swh_scheduler)
    res = lister.run()

    assert res == ListerStats(pages=NUM_PAGES, origins=ORIGIN_COUNT)

    listed_origins = swh_scheduler.get_listed_origins(limit=ORIGIN_COUNT + 1)
    assert len(listed_origins.results) == ORIGIN_COUNT
    assert listed_origins.next_page_token is None

    lister_data = get_lister_data(swh_scheduler)
    assert lister_data.current_state == {"last_seen_id": ORIGIN_COUNT}

    check_origin_4321(swh_scheduler, lister_data)
    check_origin_5555(swh_scheduler, lister_data)

def test_cgit_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    # setup the mocked CGitLister
    lister = mocker.patch("swh.lister.cgit.tasks.CGitLister")
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(url="https://git.kernel.org/", instance="kernel", base_git_url=None)
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.cgit.tasks.CGitListerTask",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()

def test_full_listing_params(
    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = dict(
        url="https://0xacab.org/api/v4",
        instance="0xacab",
        api_token="test",
        page_size=50,
    )
    res = swh_scheduler_celery_app.send_task(
        "swh.lister.gitea.tasks.FullGiteaRelister",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()